plugins/af_lang_detect/languagedetect/LanguageDetect.php

   1 <?php
   2
   3 /**
   4  * Detects the language of a given piece of text.
   5  *
   6  * Attempts to detect the language of a sample of text by correlating ranked
   7  * 3-gram frequencies to a table of 3-gram frequencies of known languages.
   8  *
   9  * Implements a version of a technique originally proposed by Cavnar & Trenkle
  10  * (1994): "N-Gram-Based Text Categorization"
  11  *
  12  * PHP version 5
  13  *
  14  * @category  Text
  15  * @package   Text_LanguageDetect
  16  * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
  17  * @copyright 2005-2006 Nicholas Pisarro
  18  * @license   http://www.debian.org/misc/bsd.license BSD
  19  * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
  20  * @link      http://pear.php.net/package/Text_LanguageDetect/
  21  * @link      http://langdetect.blogspot.com/
  22  */
  23
  24 require_once __DIR__ . '/Text/LanguageDetect/Exception.php';
  25 require_once __DIR__ . '/Text/LanguageDetect/Parser.php';
  26 require_once __DIR__ . '/Text/LanguageDetect/ISO639.php';
  27
  28 /**
  29  * Language detection class
  30  *
  31  * Requires the language model database (lang.dat) that should have
  32  * accompanied this class definition in order to be instantiated.
  33  *
  34  * Example usage:
  35  *
  36  * <code>
  37  * require_once 'Text/LanguageDetect.php';
  38  *
  39  * $l = new Text_LanguageDetect;
  40  *
  41  * $stdin = fopen('php://stdin', 'r');
  42  *
  43  * echo "Supported languages:\n";
  44  *
  45  * try {
  46  *     $langs = $l->getLanguages();
  47  * } catch (Text_LanguageDetect_Exception $e) {
  48  *     die($e->getMessage());
  49  * }
  50  *
  51  * sort($langs);
  52  * echo join(', ', $langs);
  53  *
  54  * while ($line = fgets($stdin)) {
  55  *     print_r($l->detect($line, 4));
  56  * }
  57  * </code>
  58  *
  59  * @category  Text
  60  * @package   Text_LanguageDetect
  61  * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
  62  * @copyright 2005 Nicholas Pisarro
  63  * @license   http://www.debian.org/misc/bsd.license BSD
  64  * @version   Release: @package_version@
  65  * @link      http://pear.php.net/package/Text_LanguageDetect/
  66  * @todo      allow users to generate their own language models
  67  *
  68  * @SuppressWarnings(PHPMD)
  69  */
  70 class Text_LanguageDetect
  71 {
  72     /**
  73      * The filename that stores the trigram data for the detector
  74      *
  75      * If this value starts with a slash (/) or a dot (.) the value of
  76      * $this->_data_dir will be ignored
  77      *
  78      * @var      string
  79      * @access   private
  80      */
  81     var $_db_filename = 'lang.dat';
  82
  83     /**
  84      * The filename that stores the unicode block definitions
  85      *
  86      * If this value starts with a slash (/) or a dot (.) the value of
  87      * $this->_data_dir will be ignored
  88      *
  89      * @var string
  90      * @access private
  91      */
  92     var $_unicode_db_filename = 'unicode_blocks.dat';
  93
  94     /**
  95      * The data directory
  96      *
  97      * Should be set by PEAR installer
  98      *
  99      * @var      string
 100      * @access   private
 101      */
 102     var $_data_dir = '@data_dir@';
 103
 104     /**
 105      * The trigram data for comparison
 106      *
 107      * Will be loaded on start from $this->_db_filename
 108      *
 109      * @var      array
 110      * @access   private
 111      */
 112     var $_lang_db = array();
 113
 114     /**
 115      * stores the map of the trigram data to unicode characters
 116      *
 117      * @access private
 118      * @var array
 119      */
 120     var $_unicode_map;
 121
 122     /**
 123      * The size of the trigram data arrays
 124      *
 125      * @var      int
 126      * @access   private
 127      */
 128     var $_threshold = 300;
 129
 130     /**
 131      * the maximum possible score.
 132      *
 133      * needed for score normalization. Different depending on the
 134      * perl compatibility setting
 135      *
 136      * @access  private
 137      * @var     int
 138      * @see     setPerlCompatible()
 139      */
 140     var $_max_score = 0;
 141
 142     /**
 143      * Whether or not to simulate perl's Language::Guess exactly
 144      *
 145      * @access  private
 146      * @var     bool
 147      * @see     setPerlCompatible()
 148      */
 149     var $_perl_compatible = false;
 150
 151     /**
 152      * Whether to use the unicode block detection to speed up processing
 153      *
 154      * @access private
 155      * @var bool
 156      */
 157     var $_use_unicode_narrowing = true;
 158
 159     /**
 160      * stores the result of the clustering operation
 161      *
 162      * @access  private
 163      * @var     array
 164      * @see     clusterLanguages()
 165      */
 166     var $_clusters;
 167
 168     /**
 169      * Which type of "language names" are accepted and returned:
 170      *
 171      * 0 - language name ("english")
 172      * 2 - 2-letter ISO 639-1 code ("en")
 173      * 3 - 3-letter ISO 639-2 code ("eng")
 174      */
 175     var $_name_mode = 0;
 176
 177     /**
 178      * Constructor
 179      *
 180      * Will attempt to load the language database. If it fails, you will get
 181      * an exception.
 182      */
 183     function __construct()
 184     {
 185         $data = $this->_readdb($this->_db_filename);
 186         $this->_checkTrigram($data['trigram']);
 187         $this->_lang_db = $data['trigram'];
 188
 189         if (isset($data['trigram-unicodemap'])) {
 190             $this->_unicode_map = $data['trigram-unicodemap'];
 191         }
 192
 193         // Not yet implemented:
 194         if (isset($data['trigram-clusters'])) {
 195             $this->_clusters = $data['trigram-clusters'];
 196         }
 197     }
 198
 199     /**
 200      * Returns the path to the location of the database
 201      *
 202      * @param string $fname File name to load
 203      *
 204      * @return string expected path to the language model database
 205      * @access private
 206      */
 207     function _get_data_loc($fname)
 208     {
 209         if ($fname{0} == '/' || $fname{0} == '.') {
 210             // if filename starts with a slash, assume it's an absolute pathname
 211             // and skip whatever is in $this->_data_dir
 212             return $fname;
 213
 214         } elseif ($this->_data_dir != '@' . 'data_dir' . '@') {
 215             // if the data dir was set by the PEAR installer, use that
 216             return $this->_data_dir . '/Text_LanguageDetect/' . $fname;
 217
 218         } else {
 219             // assume this was just unpacked somewhere
 220             // try the local working directory if otherwise
 221             return __DIR__ . '/data/' . $fname;
 222         }
 223     }
 224
 225     /**
 226      * Loads the language trigram database from filename
 227      *
 228      * Trigram database should be a serialize()'d array
 229      *
 230      * @param string $fname the filename where the data is stored
 231      *
 232      * @return array the language model data
 233      * @throws Text_LanguageDetect_Exception
 234      * @access private
 235      */
 236     function _readdb($fname)
 237     {
 238         // finds the correct data dir
 239         $fname = $this->_get_data_loc($fname);
 240
 241         // input check
 242         if (!file_exists($fname)) {
 243             throw new Text_LanguageDetect_Exception(
 244                 'Language database does not exist: ' . $fname,
 245                 Text_LanguageDetect_Exception::DB_NOT_FOUND
 246             );
 247         } elseif (!is_readable($fname)) {
 248             throw new Text_LanguageDetect_Exception(
 249                 'Language database is not readable: ' . $fname,
 250                 Text_LanguageDetect_Exception::DB_NOT_READABLE
 251             );
 252         }
 253
 254         return unserialize(file_get_contents($fname));
 255     }
 256
 257
 258     /**
 259      * Checks if this object is ready to detect languages
 260      *
 261      * @param array $trigram Trigram data from database
 262      *
 263      * @return void
 264      * @access private
 265      */
 266     function _checkTrigram($trigram)
 267     {
 268         if (!is_array($trigram)) {
 269             if (ini_get('magic_quotes_runtime')) {
 270                 throw new Text_LanguageDetect_Exception(
 271                     'Error loading database. Try turning magic_quotes_runtime off.',
 272                     Text_LanguageDetect_Exception::MAGIC_QUOTES
 273                 );
 274             }
 275             throw new Text_LanguageDetect_Exception(
 276                 'Language database is not an array.',
 277                 Text_LanguageDetect_Exception::DB_NOT_ARRAY
 278             );
 279         } elseif (empty($trigram)) {
 280             throw new Text_LanguageDetect_Exception(
 281                 'Language database has no elements.',
 282                 Text_LanguageDetect_Exception::DB_EMPTY
 283             );
 284         }
 285     }
 286
 287     /**
 288      * Omits languages
 289      *
 290      * Pass this function the name of or an array of names of
 291      * languages that you don't want considered
 292      *
 293      * If you're only expecting a limited set of languages, this can greatly
 294      * speed up processing
 295      *
 296      * @param mixed $omit_list    language name or array of names to omit
 297      * @param bool  $include_only if true will include (rather than
 298      *                            exclude) only those in the list
 299      *
 300      * @return int number of languages successfully deleted
 301      * @throws Text_LanguageDetect_Exception
 302      */
 303     public function omitLanguages($omit_list, $include_only = false)
 304     {
 305         $deleted = 0;
 306
 307         $omit_list = $this->_convertFromNameMode($omit_list);
 308
 309         if (!$include_only) {
 310             // deleting the given languages
 311             if (!is_array($omit_list)) {
 312                 $omit_list = strtolower($omit_list); // case desensitize
 313                 if (isset($this->_lang_db[$omit_list])) {
 314                     unset($this->_lang_db[$omit_list]);
 315                     $deleted++;
 316                 }
 317             } else {
 318                 foreach ($omit_list as $omit_lang) {
 319                     if (isset($this->_lang_db[$omit_lang])) {
 320                         unset($this->_lang_db[$omit_lang]);
 321                         $deleted++;
 322                     }
 323                 }
 324             }
 325
 326         } else {
 327             // deleting all except the given languages
 328             if (!is_array($omit_list)) {
 329                 $omit_list = array($omit_list);
 330             }
 331
 332             // case desensitize
 333             foreach ($omit_list as $key => $omit_lang) {
 334                 $omit_list[$key] = strtolower($omit_lang);
 335             }
 336
 337             foreach (array_keys($this->_lang_db) as $lang) {
 338                 if (!in_array($lang, $omit_list)) {
 339                     unset($this->_lang_db[$lang]);
 340                     $deleted++;
 341                 }
 342             }
 343         }
 344
 345         // reset the cluster cache if the number of languages changes
 346         // this will then have to be recalculated
 347         if (isset($this->_clusters) && $deleted > 0) {
 348             $this->_clusters = null;
 349         }
 350
 351         return $deleted;
 352     }
 353
 354
 355     /**
 356      * Returns the number of languages that this object can detect
 357      *
 358      * @access public
 359      * @return int            the number of languages
 360      * @throws   Text_LanguageDetect_Exception
 361      */
 362     function getLanguageCount()
 363     {
 364         return count($this->_lang_db);
 365     }
 366
 367     /**
 368      * Checks if the language with the given name exists in the database
 369      *
 370      * @param mixed $lang Language name or array of language names
 371      *
 372      * @return bool true if language model exists
 373      */
 374     public function languageExists($lang)
 375     {
 376         $lang = $this->_convertFromNameMode($lang);
 377
 378         if (is_string($lang)) {
 379             return isset($this->_lang_db[strtolower($lang)]);
 380
 381         } elseif (is_array($lang)) {
 382             foreach ($lang as $test_lang) {
 383                 if (!isset($this->_lang_db[strtolower($test_lang)])) {
 384                     return false;
 385                 }
 386             }
 387             return true;
 388
 389         } else {
 390             throw new Text_LanguageDetect_Exception(
 391                 'Unsupported parameter type passed to languageExists()',
 392                 Text_LanguageDetect_Exception::PARAM_TYPE
 393             );
 394         }
 395     }
 396
 397     /**
 398      * Returns the list of detectable languages
 399      *
 400      * @access public
 401      * @return array        the names of the languages known to this object
 402      * @throws   Text_LanguageDetect_Exception
 403      */
 404     function getLanguages()
 405     {
 406         return $this->_convertToNameMode(
 407             array_keys($this->_lang_db)
 408         );
 409     }
 410
 411     /**
 412      * Make this object behave like Language::Guess
 413      *
 414      * @param bool $setting false to turn off perl compatibility
 415      *
 416      * @return void
 417      */
 418     public function setPerlCompatible($setting = true)
 419     {
 420         if (is_bool($setting)) { // input check
 421             $this->_perl_compatible = $setting;
 422
 423             if ($setting == true) {
 424                 $this->_max_score = $this->_threshold;
 425             } else {
 426                 $this->_max_score = 0;
 427             }
 428         }
 429
 430     }
 431
 432     /**
 433      * Sets the way how language names are accepted and returned.
 434      *
 435      * @param integer $name_mode One of the following modes:
 436      *                           0 - language name ("english")
 437      *                           2 - 2-letter ISO 639-1 code ("en")
 438      *                           3 - 3-letter ISO 639-2 code ("eng")
 439      *
 440      * @return void
 441      */
 442     function setNameMode($name_mode)
 443     {
 444         $this->_name_mode = $name_mode;
 445     }
 446
 447     /**
 448      * Whether to use unicode block ranges in detection
 449      *
 450      * Should speed up most detections if turned on (default is on). In some
 451      * circumstances it may be slower, such as for large text samples (> 10K)
 452      * in languages that use latin scripts. In other cases it should speed up
 453      * detection noticeably.
 454      *
 455      * @param bool $setting false to turn off
 456      *
 457      * @return void
 458      */
 459     public function useUnicodeBlocks($setting = true)
 460     {
 461         if (is_bool($setting)) {
 462             $this->_use_unicode_narrowing = $setting;
 463         }
 464     }
 465
 466     /**
 467      * Converts a piece of text into trigrams
 468      *
 469      * @param string $text text to convert
 470      *
 471      * @return     array array of trigram frequencies
 472      * @access     private
 473      * @deprecated Superseded by the Text_LanguageDetect_Parser class
 474      */
 475     function _trigram($text)
 476     {
 477         $s = new Text_LanguageDetect_Parser($text);
 478         $s->prepareTrigram();
 479         $s->prepareUnicode(false);
 480         $s->setPadStart(!$this->_perl_compatible);
 481         $s->analyze();
 482         return $s->getTrigramFreqs();
 483     }
 484
 485     /**
 486      * Converts a set of trigrams from frequencies to ranks
 487      *
 488      * Thresholds (cuts off) the list at $this->_threshold
 489      *
 490      * @param array $arr array of trigram
 491      *
 492      * @return array ranks of trigrams
 493      * @access protected
 494      */
 495     function _arr_rank($arr)
 496     {
 497
 498         // sorts alphabetically first as a standard way of breaking rank ties
 499         $this->_bub_sort($arr);
 500
 501         // below might also work, but seemed to introduce errors in testing
 502         //ksort($arr);
 503         //asort($arr);
 504
 505         $rank = array();
 506
 507         $i = 0;
 508         foreach ($arr as $key => $value) {
 509             $rank[$key] = $i++;
 510
 511             // cut off at a standard threshold
 512             if ($i >= $this->_threshold) {
 513                 break;
 514             }
 515         }
 516
 517         return $rank;
 518     }
 519
 520     /**
 521      * Sorts an array by value breaking ties alphabetically
 522      *
 523      * @param array &$arr the array to sort
 524      *
 525      * @return void
 526      * @access private
 527      */
 528     function _bub_sort(&$arr)
 529     {
 530         // should do the same as this perl statement:
 531         // sort { $trigrams{$b} == $trigrams{$a}
 532         //   ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 533
 534         // needs to sort by both key and value at once
 535         // using the key to break ties for the value
 536
 537         // converts array into an array of arrays of each key and value
 538         // may be a better way of doing this
 539         $combined = array();
 540
 541         foreach ($arr as $key => $value) {
 542             $combined[] = array($key, $value);
 543         }
 544
 545         usort($combined, array($this, '_sort_func'));
 546
 547         $replacement = array();
 548         foreach ($combined as $key => $value) {
 549             list($new_key, $new_value) = $value;
 550             $replacement[$new_key] = $new_value;
 551         }
 552
 553         $arr = $replacement;
 554     }
 555
 556     /**
 557      * Sort function used by bubble sort
 558      *
 559      * Callback function for usort().
 560      *
 561      * @param array $a first param passed by usort()
 562      * @param array $b second param passed by usort()
 563      *
 564      * @return int -1 if $a has the larger value, 1 if smaller; strcmp() of the keys on a tie
 565      * @see    _bub_sort()
 566      * @access private
 567      */
 568     function _sort_func($a, $b)
 569     {
 570         // each is actually a key/value pair, so that it can compare using both
 571         list($a_key, $a_value) = $a;
 572         list($b_key, $b_value) = $b;
 573
 574         if ($a_value == $b_value) {
 575             // if the values are the same, break ties using the key
 576             return strcmp($a_key, $b_key);
 577
 578         } else {
 579             // if not, just sort normally
 580             if ($a_value > $b_value) {
 581                 return -1;
 582             } else {
 583                 return 1;
 584             }
 585         }
 586
 587         // 0 should not be possible because keys must be unique
 588     }
 589
 590     /**
 591      * Calculates a linear rank-order distance statistic between two sets of
 592      * ranked trigrams
 593      *
 594      * Sums the differences in rank for each trigram. If the trigram does not
 595      * appear in both, consider it a difference of $this->_threshold.
 596      *
 597      * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
 598      * its simplicity it has been shown to be highly accurate for language
 599      * identification tasks.
 600      *
 601      * @param array $arr1 the reference set of trigram ranks
 602      * @param array $arr2 the target set of trigram ranks
 603      *
 604      * @return int the sum of the differences between the ranks of
 605      *             the two trigram sets
 606      * @access private
 607      */
 608     function _distance($arr1, $arr2)
 609     {
 610         $sumdist = 0;
 611
 612         foreach ($arr2 as $key => $value) {
 613             if (isset($arr1[$key])) {
 614                 $distance = abs($value - $arr1[$key]);
 615             } else {
 616                 // $this->_threshold sets the maximum possible distance value
 617                 // for any one pair of trigrams
 618                 $distance = $this->_threshold;
 619             }
 620             $sumdist += $distance;
 621         }
 622
 623         return $sumdist;
 624
 625         // todo: there are other distance statistics to try, e.g. relative
 626         //       entropy, but they're probably more costly to compute
 627     }
 628
 629     /**
 630      * Normalizes the score returned by _distance()
 631      *
 632      * Different if perl compatible or not
 633      *
 634      * @param int $score      the score from _distance()
 635      * @param int $base_count the number of trigrams being considered
 636      *
 637      * @return float the normalized score
 638      * @see    _distance()
 639      * @access private
 640      */
 641     function _normalize_score($score, $base_count = null)
 642     {
 643         if ($base_count === null) {
 644             $base_count = $this->_threshold;
 645         }
 646
 647         if (!$this->_perl_compatible) {
 648             return 1 - ($score / $base_count / $this->_threshold);
 649         } else {
 650             return floor($score / $base_count);
 651         }
 652     }
 653
 654
 655     /**
 656      * Detects the closeness of a sample of text to the known languages
 657      *
 658      * Calculates the statistical difference between the text and
 659      * the trigrams for each language, normalizes the score then
 660      * returns results for all languages in sorted order
 661      *
 662      * If perl compatible, the score is 300-0, 0 being most similar.
 663      * Otherwise, it's 0-1 with 1 being most similar.
 664      *
 665      * The $sample text should be at least a few sentences in length;
 666      * should be ascii-7 or utf8 encoded, if another and the mbstring extension
 667      * is present it will try to detect and convert. However, experience has
 668      * shown that mb_detect_encoding() *does not work very well* with at least
 669      * some types of encoding.
 670      *
 671      * @param string $sample a sample of text to compare.
 672      * @param int    $limit  if specified, return an array of the most likely
 673      *                       $limit languages and their scores.
 674      *
 675      * @return mixed sorted array of language scores, blank array if no
 676      *               useable text was found
 677      * @see    _distance()
 678      * @throws Text_LanguageDetect_Exception
 679      */
 680     public function detect($sample, $limit = 0)
 681     {
 682         // input check
 683         if (!Text_LanguageDetect_Parser::validateString($sample)) {
 684             return array();
 685         }
 686
 687         // check char encoding
 688         // (only if mbstring extension is compiled and PHP > 4.0.6)
 689         if (function_exists('mb_detect_encoding')
 690             && function_exists('mb_convert_encoding')
 691         ) {
 692             // mb_detect_encoding isn't very reliable, to say the least
 693             // detection should still work with a sufficient sample
 694             //  of ascii characters
 695             $encoding = mb_detect_encoding($sample);
 696
 697             // mb_detect_encoding() will return FALSE if detection fails
 698             // don't attempt conversion if that's the case
 699             if ($encoding != 'ASCII' && $encoding != 'UTF-8'
 700                 && $encoding !== false
 701             ) {
 702                 // verify the encoding exists in mb_list_encodings
 703                 if (in_array($encoding, mb_list_encodings())) {
 704                     $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
 705                 }
 706             }
 707         }
 708
 709         $sample_obj = new Text_LanguageDetect_Parser($sample);
 710         $sample_obj->prepareTrigram();
 711         if ($this->_use_unicode_narrowing) {
 712             $sample_obj->prepareUnicode();
 713         }
 714         $sample_obj->setPadStart(!$this->_perl_compatible);
 715         $sample_obj->analyze();
 716
 717         $trigram_freqs =& $sample_obj->getTrigramRanks();
 718         $trigram_count = count($trigram_freqs);
 719
 720         if ($trigram_count == 0) {
 721             return array();
 722         }
 723
 724         $scores = array();
 725
 726         // use unicode block detection to narrow down the possibilities
 727         if ($this->_use_unicode_narrowing) {
 728             $blocks =& $sample_obj->getUnicodeBlocks();
 729
 730             if (is_array($blocks)) {
 731                 $present_blocks = array_keys($blocks);
 732             } else {
 733                 throw new Text_LanguageDetect_Exception(
 734                     'Error during block detection',
 735                     Text_LanguageDetect_Exception::BLOCK_DETECTION
 736                 );
 737             }
 738
 739             $possible_langs = array();
 740
 741             foreach ($present_blocks as $blockname) {
 742                 if (isset($this->_unicode_map[$blockname])) {
 743
 744                     $possible_langs = array_merge(
 745                         $possible_langs,
 746                         array_keys($this->_unicode_map[$blockname])
 747                     );
 748
 749                     // todo: faster way to do this?
 750                 }
 751             }
 752
 753             // could also try an intersect operation rather than a union
 754             // in other words, choose languages whose trigrams contain
 755             // ALL of the unicode blocks found in this sample
 756             // would improve speed but would be completely thrown off by an
 757             // unexpected character, like an umlaut appearing in english text
 758
 759             $possible_langs = array_intersect(
 760                 array_keys($this->_lang_db),
 761                 array_unique($possible_langs)
 762             );
 763
 764             // needs to intersect it with the keys of _lang_db in case
 765             // languages have been omitted
 766
 767         } else {
 768             // or just try 'em all
 769             $possible_langs = array_keys($this->_lang_db);
 770         }
 771
 772
 773         foreach ($possible_langs as $lang) {
 774             $scores[$lang] = $this->_normalize_score(
 775                 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
 776                 $trigram_count
 777             );
 778         }
 779
 780         unset($sample_obj);
 781
 782         if ($this->_perl_compatible) {
 783             asort($scores);
 784         } else {
 785             arsort($scores);
 786         }
 787
 788         // todo: drop languages with a score of $this->_max_score?
 789
 790         // limit the number of returned scores
 791         if ($limit && is_numeric($limit)) {
 792             $limited_scores = array();
 793
 794             $i = 0;
 795             foreach ($scores as $key => $value) {
 796                 if ($i++ >= $limit) {
 797                     break;
 798                 }
 799
 800                 $limited_scores[$key] = $value;
 801             }
 802
 803             return $this->_convertToNameMode($limited_scores, true);
 804         } else {
 805             return $this->_convertToNameMode($scores, true);
 806         }
 807     }
 808
 809     /**
 810      * Returns only the most similar language to the text sample
 811      *
 812      * Calls $this->detect() and returns only the top result
 813      *
 814      * @param string $sample text to detect the language of
 815      *
 816      * @return string the name of the most likely language
 817      *                or null if no language is similar
 818      * @see    detect()
 819      * @throws Text_LanguageDetect_Exception
 820      */
 821     public function detectSimple($sample)
 822     {
 823         $scores = $this->detect($sample, 1);
 824
 825         // if top language has the maximum possible score,
 826         // then the top score will have been picked at random
 827         if (!is_array($scores) || empty($scores)
 828             || current($scores) == $this->_max_score
 829         ) {
 830             return null;
 831         } else {
 832             return key($scores);
 833         }
 834     }
 835
 836     /**
 837      * Returns an array containing the most similar language and a confidence
 838      * rating
 839      *
 840      * Confidence is a simple measure calculated from the similarity score
 841      * minus the similarity score from the next most similar language
 842      * divided by the highest possible score. Languages that have closely
 843      * related cousins (e.g. Norwegian and Danish) should generally have lower
 844      * confidence scores.
 845      *
 846      * The similarity score answers the question "How likely is the text the
 847      * returned language regardless of the other languages considered?" The
 848      * confidence score is one way of answering the question "how likely is the
 849      * text the detected language relative to the rest of the language model
 850      * set?"
 851      *
 852      * To see how similar languages are a priori, see languageSimilarity()
 853      *
 854      * @param string $sample text for which language will be detected
 855      *
 856      * @return array most similar language, score and confidence rating
 857      *               or null if no language is similar
 858      * @see    detect()
 859      * @throws Text_LanguageDetect_Exception
 860      */
 861     public function detectConfidence($sample)
 862     {
 863         $scores = $this->detect($sample, 2);
 864
 865         // if most similar language has the max score, it
 866         // will have been picked at random
 867         if (!is_array($scores) || empty($scores)
 868             || current($scores) == $this->_max_score
 869         ) {
 870             return null;
 871         }
 872
 873         $arr['language'] = key($scores);
 874         $arr['similarity'] = current($scores);
 875         if (next($scores) !== false) { // if false then no next element
 876             // the goal is to return a higher value if the distance between
 877             // the similarity of the first score and the second score is high
 878
 879             if ($this->_perl_compatible) {
 880                 $arr['confidence'] = (current($scores) - $arr['similarity'])
 881                     / $this->_max_score;
 882
 883             } else {
 884                 $arr['confidence'] = $arr['similarity'] - current($scores);
 885
 886             }
 887
 888         } else {
 889             $arr['confidence'] = null;
 890         }
 891
 892         return $arr;
 893     }
 894
 895     /**
 896      * Returns the distribution of unicode blocks in a given utf8 string
 897      *
 898      * For the block name of a single char, use unicodeBlockName()
 899      *
 900      * @param string $str          input string. Must be ascii or utf8
 901      * @param bool   $skip_symbols if true, skip ascii digits, symbols and
 902      *                             non-printing characters. Includes spaces,
 903      *                             newlines and common punctuation characters.
 904      *
 905      * @return array
 906      * @throws Text_LanguageDetect_Exception
 907      */
 908     public function detectUnicodeBlocks($str, $skip_symbols)
 909     {
 910         $skip_symbols = (bool)$skip_symbols;
 911         $str          = (string)$str;
 912
 913         $sample_obj = new Text_LanguageDetect_Parser($str);
 914         $sample_obj->prepareUnicode();
 915         $sample_obj->prepareTrigram(false);
 916         $sample_obj->setUnicodeSkipSymbols($skip_symbols);
 917         $sample_obj->analyze();
 918         $blocks = $sample_obj->getUnicodeBlocks();
 919         unset($sample_obj);
 920         return $blocks;
 921     }
 922
 923     /**
 924      * Returns the block name for a given unicode value
 925      *
 926      * If passed a string, will assume it is being passed a UTF8-formatted
 927      * character and will automatically convert. Otherwise it will assume it
 928      * is being passed a numeric unicode value.
 929      *
 930      * Make sure input is of the correct type!
 931      *
 932      * @param mixed $unicode unicode value or utf8 char
 933      *
 934      * @return mixed the block name string or false if not found
 935      * @throws Text_LanguageDetect_Exception
 936      */
 937     public function unicodeBlockName($unicode)
 938     {
 939         if (is_string($unicode)) {
 940             // assume it is being passed a utf8 char, so convert it
 941             if (self::utf8strlen($unicode) > 1) {
 942                 throw new Text_LanguageDetect_Exception(
 943                     'Pass a single char only to this method',
 944                     Text_LanguageDetect_Exception::PARAM_TYPE
 945                 );
 946             }
 947             $unicode = $this->_utf8char2unicode($unicode);
 948
 949         } elseif (!is_int($unicode)) {
 950             throw new Text_LanguageDetect_Exception(
 951                 'Input must be of type string or int.',
 952                 Text_LanguageDetect_Exception::PARAM_TYPE
 953             );
 954         }
 955
 956         $blocks = $this->_read_unicode_block_db();
 957
 958         $result = $this->_unicode_block_name($unicode, $blocks);
 959
 960         if ($result == -1) {
 961             return false;
 962         } else {
 963             return $result[2];
 964         }
 965     }
 966
 967     /**
 968      * Searches the unicode block database
 969      *
 970      * Returns the block name for a given unicode value. unicodeBlockName() is
 971      * the public interface for this function, which does input checks which
 972      * this function omits for speed.
 973      *
 974      * @param int   $unicode     the unicode value
 975      * @param array $blocks      the block database
 976      * @param int   $block_count the number of defined blocks in the database
 977      *
 978      * @return mixed Block name, -1 if it failed
 979      * @see    unicodeBlockName()
 980      * @access protected
 981      */
 982     function _unicode_block_name($unicode, $blocks, $block_count = -1)
 983     {
 984         // for a reference, see
 985         // http://www.unicode.org/Public/UNIDATA/Blocks.txt
 986
 987         // assume that ascii characters are the most common
 988         // so try it first for efficiency
 989         if ($unicode <= $blocks[0][1]) {
 990             return $blocks[0];
 991         }
 992
 993         // the optional $block_count param is for efficiency
 994         // so we this function doesn't have to run count() every time
 995         if ($block_count != -1) {
 996             $high = $block_count - 1;
 997         } else {
 998             $high = count($blocks) - 1;
 999         }
1000
1001         $low = 1; // start with 1 because ascii was 0
1002
1003         // your average binary search algorithm
1004         while ($low <= $high) {
1005             $mid = floor(($low + $high) / 2);
1006
1007             if ($unicode < $blocks[$mid][0]) {
1008                 // if it's lower than the lower bound
1009                 $high = $mid - 1;
1010
1011             } elseif ($unicode > $blocks[$mid][1]) {
1012                 // if it's higher than the upper bound
1013                 $low = $mid + 1;
1014
1015             } else {
1016                 // found it
1017                 return $blocks[$mid];
1018             }
1019         }
1020
1021         // failed to find the block
1022         return -1;
1023
1024         // todo: differentiate when it's out of range or when it falls
1025         //       into an unassigned range?
1026     }
1027
1028     /**
1029      * Brings up the unicode block database
1030      *
1031      * @return array the database of unicode block definitions
1032      * @throws Text_LanguageDetect_Exception
1033      * @access protected
1034      */
1035     function _read_unicode_block_db()
1036     {
1037         // since the unicode definitions are always going to be the same,
1038         // might as well share the memory for the db with all other instances
1039         // of this class
1040         static $data;
1041
1042         if (!isset($data)) {
1043             $data = $this->_readdb($this->_unicode_db_filename);
1044         }
1045
1046         return $data;
1047     }
1048
1049     /**
1050      * Calculate the similarities between the language models
1051      *
1052      * Use this function to see how similar languages are to each other.
1053      *
1054      * If passed 2 language names, will return just those languages compared.
1055      * If passed 1 language name, will return that language compared to
1056      * all others.
1057      * If passed none, will return an array of every language model compared
1058      * to every other one.
1059      *
1060      * @param string $lang1 the name of the first language to be compared
1061      * @param string $lang2 the name of the second language to be compared
1062      *
1063      * @return array scores of every language compared
1064      *               or the score of just the provided languages
1065      *               or null if one of the supplied languages does not exist
1066      * @throws Text_LanguageDetect_Exception
1067      */
1068     public function languageSimilarity($lang1 = null, $lang2 = null)
1069     {
1070         $lang1 = $this->_convertFromNameMode($lang1);
1071         $lang2 = $this->_convertFromNameMode($lang2);
1072         if ($lang1 != null) {
1073             $lang1 = strtolower($lang1);
1074
1075             // check if language model exists
1076             if (!isset($this->_lang_db[$lang1])) {
1077                 return null;
1078             }
1079
1080             if ($lang2 != null) {
1081                 if (!isset($this->_lang_db[$lang2])) {
1082                     // check if language model exists
1083                     return null;
1084                 }
1085
1086                 $lang2 = strtolower($lang2);
1087
1088                 // compare just these two languages
1089                 return $this->_normalize_score(
1090                     $this->_distance(
1091                         $this->_lang_db[$lang1],
1092                         $this->_lang_db[$lang2]
1093                     )
1094                 );
1095
1096             } else {
1097                 // compare just $lang1 to all languages
1098                 $return_arr = array();
1099                 foreach ($this->_lang_db as $key => $value) {
1100                     if ($key != $lang1) {
1101                         // don't compare a language to itself
1102                         $return_arr[$key] = $this->_normalize_score(
1103                             $this->_distance($this->_lang_db[$lang1], $value)
1104                         );
1105                     }
1106                 }
1107                 asort($return_arr);
1108
1109                 return $return_arr;
1110             }
1111
1112
1113         } else {
1114             // compare all languages to each other
1115             $return_arr = array();
1116             foreach (array_keys($this->_lang_db) as $lang1) {
1117                 foreach (array_keys($this->_lang_db) as $lang2) {
1118                     // skip comparing languages to themselves
1119                     if ($lang1 != $lang2) {
1120
1121                         if (isset($return_arr[$lang2][$lang1])) {
1122                             // don't re-calculate what's already been done
1123                             $return_arr[$lang1][$lang2]
1124                                 = $return_arr[$lang2][$lang1];
1125
1126                         } else {
1127                             // calculate
1128                             $return_arr[$lang1][$lang2]
1129                                 = $this->_normalize_score(
1130                                     $this->_distance(
1131                                         $this->_lang_db[$lang1],
1132                                         $this->_lang_db[$lang2]
1133                                     )
1134                                 );
1135
1136                         }
1137                     }
1138                 }
1139             }
1140             return $return_arr;
1141         }
1142     }
1143
    /**
     * Cluster known languages according to languageSimilarity()
     *
     * WARNING: this method is EXPERIMENTAL. It is not recommended for common
     * use, and it may disappear or its functionality may change in future
     * releases without notice.
     *
     * Uses a nearest neighbor technique to generate the maximum possible
     * number of dendrograms from the similarity data: starting with one fork
     * per language, the two forks with the highest mutual score are merged
     * into a new fork named "bestfit:otherfit", until at most two forks are
     * left, the best remaining score is 0, or 200 merge steps have been done.
     *
     * The result is stored in $this->_clusters and returned from there on
     * later calls. It is an array with three entries:
     *  - 'open_forks': the forks still open when merging stopped
     *                  (fork name => fork name)
     *  - 'fork_data':  one record per merge, keyed by the new fork name, with
     *                  the keys newkey, count, diff, score, bestfit,
     *                  otherfit and really
     *  - 'name_map':   fork name => name of the single language whose model
     *                  stands in for that fork
     *
     * @access      public
     * @return      array language cluster data
     * @throws      Text_LanguageDetect_Exception
     * @see         languageSimilarity()
     * @deprecated  this function will eventually be removed and placed into
     *              the model generation class
     */
    function clusterLanguages()
    {
        // todo: set the maximum number of clusters
        // return cached result, if any
        if (isset($this->_clusters)) {
            return $this->_clusters;
        }

        $langs = array_keys($this->_lang_db);

        // called without arguments this yields the score of every language
        // against every other one, indexed as $arr[$langA][$langB]
        $arr = $this->languageSimilarity();

        sort($langs);

        // sanity check: every name must resolve to a (non-null) model
        foreach ($langs as $lang) {
            if (!isset($this->_lang_db[$lang])) {
                throw new Text_LanguageDetect_Exception(
                    "missing $lang!",
                    Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
                );
            }
        }

        // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
        // re-key the list so that every name is its own key; forks can then
        // be removed and added by name further down
        foreach ($langs as $old_key => $lang1) {
            $langs[$lang1] = $lang1;
            unset($langs[$old_key]);
        }

        $result_data = $really_map = array();

        // $i counts the merge steps; 200 is a hard upper bound
        $i = 0;
        while (count($langs) > 2 && $i++ < 200) {
            // find the pair of open forks with the highest score
            $highest_score = -1;
            $highest_key1 = '';
            $highest_key2 = '';
            foreach ($langs as $lang1) {
                foreach ($langs as $lang2) {
                    if ($lang1 != $lang2
                        && $arr[$lang1][$lang2] > $highest_score
                    ) {
                        $highest_score = $arr[$lang1][$lang2];
                        $highest_key1 = $lang1;
                        $highest_key2 = $lang2;
                    }
                }
            }

            if (!$highest_key1) {
                // should not ever happen
                throw new Text_LanguageDetect_Exception(
                    "no highest key? (step: $i)",
                    Text_LanguageDetect_Exception::NO_HIGHEST_KEY
                );
            }

            if ($highest_score == 0) {
                // languages are perfectly dissimilar
                break;
            }

            // $highest_key1 and $highest_key2 are most similar
            // sum up each one's scores against the rest of the field
            $sum1 = array_sum($arr[$highest_key1]);
            $sum2 = array_sum($arr[$highest_key2]);

            // use the score for the one that is most similar to the rest of
            // the field as the score for the group
            // todo: could try averaging or "centroid" method instead
            // seems like that might make more sense
            // actually nearest neighbor may be better for binary searching


            // for "Complete Linkage"/"furthest neighbor"
            // sign should be <
            // for "Single Linkage"/"nearest neighbor" method
            // sign should be >
            // results seem to be pretty much the same with either method

            // figure out which to delete and which to replace
            if ($sum1 > $sum2) {
                $replaceme = $highest_key1;
                $deleteme = $highest_key2;
            } else {
                $replaceme = $highest_key2;
                $deleteme = $highest_key1;
            }

            // name of the merged fork, e.g. "bestfit:otherfit"
            $newkey = $replaceme . ':' . $deleteme;

            // $replaceme is most similar to remaining languages
            // replace $replaceme with '$newkey', deleting $deleteme

            // keep a record of which fork is really which language:
            // follow the map until a name is reached that is not itself a
            // merged fork
            $really_lang = $replaceme;
            while (isset($really_map[$really_lang])) {
                $really_lang = $really_map[$really_lang];
            }
            $really_map[$newkey] = $really_lang;


            // replace the best fitting key, delete the other
            // (the loops walk the rows as they were when the loop started,
            // while all writes go to $arr itself)
            foreach ($arr as $key1 => $arr2) {
                foreach ($arr2 as $key2 => $value2) {
                    if ($key2 == $replaceme) {
                        // column rename:
                        // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
                        $arr[$key1][$newkey] = $arr[$key1][$key2];
                        unset($arr[$key1][$key2]);
                    }

                    if ($key1 == $replaceme) {
                        // row rename:
                        // replacing $arr[$key1][$key2] with $arr[$newkey][$key2]
                        $arr[$newkey][$key2] = $arr[$key1][$key2];
                        unset($arr[$key1][$key2]);
                    }

                    if ($key1 == $deleteme || $key2 == $deleteme) {
                        // deleting $arr[$key1][$key2]
                        // NOTE(review): for $key1 == $replaceme and
                        // $key2 == $deleteme the value has just been moved to
                        // $arr[$newkey][$deleteme], which this unset does not
                        // reach; that entry appears to stay in the new row and
                        // to count towards later array_sum() calls - confirm
                        // whether this is intended
                        unset($arr[$key1][$key2]);
                    }
                }
            }


            // the two merged forks are closed, the new one is opened
            unset($langs[$highest_key1]);
            unset($langs[$highest_key2]);
            $langs[$newkey] = $newkey;


            // record of this merge step
            // some of these may be overkill
            $result_data[$newkey] = array(
                                'newkey' => $newkey,
                                'count' => $i,
                                'diff' => abs($sum1 - $sum2),
                                'score' => $highest_score,
                                'bestfit' => $replaceme,
                                'otherfit' => $deleteme,
                                'really' => $really_lang,
                            );
        }

        $return_val = array(
                'open_forks' => $langs,
                        // the top level of clusters
                        // clusters that are mutually exclusive
                        // or specified by a specific maximum

                'fork_data' => $result_data,
                        // data for each split

                'name_map' => $really_map,
                        // which cluster is really which language
                        // using the nearest neighbor technique, the cluster
                        // inherits all of the properties of its most-similar member
                        // this keeps track
            );


        // saves the result in the object
        $this->_clusters = $return_val;

        return $return_val;
    }
1323
1324
1325     /**
1326      * Perform an intelligent detection based on clusterLanguages()
1327      *
1328      * WARNING: this method is EXPERIMENTAL. It is not recommended for common
1329      * use, and it may disappear or its functionality may change in future
1330      * releases without notice.
1331      *
1332      * This compares the sample text to top the top level of clusters. If the
1333      * sample is similar to the cluster it will drop down and compare it to the
1334      * languages in the cluster, and so on until it hits a leaf node.
1335      *
1336      * this should find the language in considerably fewer compares
1337      * (the equivalent of a binary search), however clusterLanguages() is costly
1338      * and the loss of accuracy from this technique is significant.
1339      *
1340      * This method may need to be 'fuzzier' in order to become more accurate.
1341      *
1342      * This function could be more useful if the universe of possible languages
1343      * was very large, however in such cases some method of Bayesian inference
1344      * might be more helpful.
1345      *
1346      * @param string $str input string
1347      *
1348      * @return array language scores (only those compared)
1349      * @throws Text_LanguageDetect_Exception
1350      * @see    clusterLanguages()
1351      */
1352     public function clusteredSearch($str)
1353     {
1354         // input check
1355         if (!Text_LanguageDetect_Parser::validateString($str)) {
1356             return array();
1357         }
1358
1359         // clusterLanguages() will return a cached result if possible
1360         // so it's safe to call it every time
1361         $result = $this->clusterLanguages();
1362
1363         $dendogram_start = $result['open_forks'];
1364         $dendogram_data  = $result['fork_data'];
1365         $dendogram_alias = $result['name_map'];
1366
1367         $sample_obj = new Text_LanguageDetect_Parser($str);
1368         $sample_obj->prepareTrigram();
1369         $sample_obj->setPadStart(!$this->_perl_compatible);
1370         $sample_obj->analyze();
1371         $sample_result = $sample_obj->getTrigramRanks();
1372         $sample_count  = count($sample_result);
1373
1374         // input check
1375         if ($sample_count == 0) {
1376             return array();
1377         }
1378
1379         $i = 0; // counts the number of steps
1380
1381         foreach ($dendogram_start as $lang) {
1382             if (isset($dendogram_alias[$lang])) {
1383                 $lang_key = $dendogram_alias[$lang];
1384             } else {
1385                 $lang_key = $lang;
1386             }
1387
1388             $scores[$lang] = $this->_normalize_score(
1389                 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1390                 $sample_count
1391             );
1392
1393             $i++;
1394         }
1395
1396         if ($this->_perl_compatible) {
1397             asort($scores);
1398         } else {
1399             arsort($scores);
1400         }
1401
1402         $top_score = current($scores);
1403         $top_key = key($scores);
1404
1405         // of starting forks, $top_key is the most similar to the sample
1406
1407         $cur_key = $top_key;
1408         while (isset($dendogram_data[$cur_key])) {
1409             $lang1 = $dendogram_data[$cur_key]['bestfit'];
1410             $lang2 = $dendogram_data[$cur_key]['otherfit'];
1411             foreach (array($lang1, $lang2) as $lang) {
1412                 if (isset($dendogram_alias[$lang])) {
1413                     $lang_key = $dendogram_alias[$lang];
1414                 } else {
1415                     $lang_key = $lang;
1416                 }
1417
1418                 $scores[$lang] = $this->_normalize_score(
1419                     $this->_distance($this->_lang_db[$lang_key], $sample_result),
1420                     $sample_count
1421                 );
1422
1423                 //todo: does not need to do same comparison again
1424             }
1425
1426             $i++;
1427
1428             if ($scores[$lang1] > $scores[$lang2]) {
1429                 $cur_key = $lang1;
1430                 $loser_key = $lang2;
1431             } else {
1432                 $cur_key = $lang2;
1433                 $loser_key = $lang1;
1434             }
1435
1436             $diff = $scores[$cur_key] - $scores[$loser_key];
1437
1438             // $cur_key ({$dendogram_alias[$cur_key]}) wins
1439             // over $loser_key ({$dendogram_alias[$loser_key]})
1440             // with a difference of $diff
1441         }
1442
1443         // found result in $i compares
1444
1445         // rather than sorting the result, preserve it so that you can see
1446         // which paths the algorithm decided to take along the tree
1447
1448         // but sometimes the last item is only the second highest
1449         if (($this->_perl_compatible  && (end($scores) > prev($scores)))
1450             || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1451         ) {
1452             $real_last_score = current($scores);
1453             $real_last_key = key($scores);
1454
1455             // swaps the 2nd-to-last item for the last item
1456             unset($scores[$real_last_key]);
1457             $scores[$real_last_key] = $real_last_score;
1458         }
1459
1460
1461         if (!$this->_perl_compatible) {
1462             $scores = array_reverse($scores, true);
1463             // second param requires php > 4.0.3
1464         }
1465
1466         return $scores;
1467     }
1468
1469     /**
1470      * ut8-safe strlen()
1471      *
1472      * Returns the numbers of characters (not bytes) in a utf8 string
1473      *
1474      * @param string $str string to get the length of
1475      *
1476      * @return int number of chars
1477      */
1478     public static function utf8strlen($str)
1479     {
1480         // utf8_decode() will convert unknown chars to '?', which is actually
1481         // ideal for counting.
1482
1483         return strlen(utf8_decode($str));
1484
1485         // idea stolen from dokuwiki
1486     }
1487
1488     /**
1489      * Returns the unicode value of a utf8 char
1490      *
1491      * @param string $char a utf8 (possibly multi-byte) char
1492      *
1493      * @return int unicode value
1494      * @access protected
1495      * @link   http://en.wikipedia.org/wiki/UTF-8
1496      */
1497     function _utf8char2unicode($char)
1498     {
1499         // strlen() here will actually get the binary length of a single char
1500         switch (strlen($char)) {
1501         case 1:
1502             // normal ASCII-7 byte
1503             // 0xxxxxxx -->  0xxxxxxx
1504             return ord($char{0});
1505
1506         case 2:
1507             // 2 byte unicode
1508             // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
1509             $z = (ord($char{0}) & 0x000001F) << 6;
1510             $x = (ord($char{1}) & 0x0000003F);
1511             return ($z | $x);
1512
1513         case 3:
1514             // 3 byte unicode
1515             // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
1516             $z =  (ord($char{0}) & 0x0000000F) << 12;
1517             $x1 = (ord($char{1}) & 0x0000003F) << 6;
1518             $x2 = (ord($char{2}) & 0x0000003F);
1519             return ($z | $x1 | $x2);
1520
1521         case 4:
1522             // 4 byte unicode
1523             // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
1524             // 000zzzzz xxxxxxxx xxxxxxxx
1525             $z1 = (ord($char{0}) & 0x00000007) << 18;
1526             $z2 = (ord($char{1}) & 0x0000003F) << 12;
1527             $x1 = (ord($char{2}) & 0x0000003F) << 6;
1528             $x2 = (ord($char{3}) & 0x0000003F);
1529             return ($z1 | $z2 | $x1 | $x2);
1530         }
1531     }
1532
1533     /**
1534      * utf8-safe fast character iterator
1535      *
1536      * Will get the next character starting from $counter, which will then be
1537      * incremented. If a multi-byte char the bytes will be concatenated and
1538      * $counter will be incremeted by the number of bytes in the char.
1539      *
1540      * @param string $str             the string being iterated over
1541      * @param int    &$counter        the iterator, will increment by reference
1542      * @param bool   $special_convert whether to do special conversions
1543      *
1544      * @return char the next (possibly multi-byte) char from $counter
1545      * @access private
1546      */
1547     static function _next_char($str, &$counter, $special_convert = false)
1548     {
1549         $char = $str{$counter++};
1550         $ord = ord($char);
1551
1552         // for a description of the utf8 system see
1553         // http://www.phpclasses.org/browse/file/5131.html
1554
1555         // normal ascii one byte char
1556         if ($ord <= 127) {
1557             // special conversions needed for this package
1558             // (that only apply to regular ascii characters)
1559             // lower case, and convert all non-alphanumeric characters
1560             // other than "'" to space
1561             if ($special_convert && $char != ' ' && $char != "'") {
1562                 if ($ord >= 65 && $ord <= 90) { // A-Z
1563                     $char = chr($ord + 32); // lower case
1564                 } elseif ($ord < 97 || $ord > 122) { // NOT a-z
1565                     $char = ' '; // convert to space
1566                 }
1567             }
1568
1569             return $char;
1570
1571         } elseif ($ord >> 5 == 6) { // two-byte char
1572             // multi-byte chars
1573             $nextchar = $str{$counter++}; // get next byte
1574
1575             // lower-casing of non-ascii characters is still incomplete
1576
1577             if ($special_convert) {
1578                 // lower case latin accented characters
1579                 if ($ord == 195) {
1580                     $nextord = ord($nextchar);
1581                     $nextord_adj = $nextord + 64;
1582                     // for a reference, see
1583                     // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1584
1585                     // &Agrave; - &THORN; but not &times;
1586                     if ($nextord_adj >= 192
1587                         && $nextord_adj <= 222
1588                         && $nextord_adj != 215
1589                     ) {
1590                         $nextchar = chr($nextord + 32);
1591                     }
1592
1593                 } elseif ($ord == 208) {
1594                     // lower case cyrillic alphabet
1595                     $nextord = ord($nextchar);
1596                     // if A - Pe
1597                     if ($nextord >= 144 && $nextord <= 159) {
1598                         // lower case
1599                         $nextchar = chr($nextord + 32);
1600
1601                     } elseif ($nextord >= 160 && $nextord <= 175) {
1602                         // if Er - Ya
1603                         // lower case
1604                         $char = chr(209); // == $ord++
1605                         $nextchar = chr($nextord - 32);
1606                     }
1607                 }
1608             }
1609
1610             // tag on next byte
1611             return $char . $nextchar;
1612         } elseif ($ord >> 4  == 14) { // three-byte char
1613
1614             // tag on next 2 bytes
1615             return $char . $str{$counter++} . $str{$counter++};
1616
1617         } elseif ($ord >> 3 == 30) { // four-byte char
1618
1619             // tag on next 3 bytes
1620             return $char . $str{$counter++} . $str{$counter++} . $str{$counter++};
1621
1622         } else {
1623             // error?
1624         }
1625     }
1626
1627     /**
1628      * Converts an $language input parameter from the configured mode
1629      * to the language name that is used internally.
1630      *
1631      * Works for strings and arrays.
1632      *
1633      * @param string|array $lang       A language description ("english"/"en"/"eng")
1634      * @param boolean      $convertKey If $lang is an array, setting $key
1635      *                                 converts the keys to the language name.
1636      *
1637      * @return string|array Language name
1638      */
1639     function _convertFromNameMode($lang, $convertKey = false)
1640     {
1641         if ($this->_name_mode == 0) {
1642             return $lang;
1643         }
1644
1645         if ($this->_name_mode == 2) {
1646             $method = 'code2ToName';
1647         } else {
1648             $method = 'code3ToName';
1649         }
1650
1651         if (is_string($lang)) {
1652             return (string)Text_LanguageDetect_ISO639::$method($lang);
1653         }
1654
1655         $newlang = array();
1656         foreach ($lang as $key => $val) {
1657             if ($convertKey) {
1658                 $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
1659                 $newlang[$newkey] = $val;
1660             } else {
1661                 $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
1662             }
1663         }
1664         return $newlang;
1665     }
1666
1667     /**
1668      * Converts an $language output parameter from the language name that is
1669      * used internally to the configured mode.
1670      *
1671      * Works for strings and arrays.
1672      *
1673      * @param string|array $lang       A language description ("english"/"en"/"eng")
1674      * @param boolean      $convertKey If $lang is an array, setting $key
1675      *                                 converts the keys to the language name.
1676      *
1677      * @return string|array Language name
1678      */
1679     function _convertToNameMode($lang, $convertKey = false)
1680     {
1681         if ($this->_name_mode == 0) {
1682             return $lang;
1683         }
1684
1685         if ($this->_name_mode == 2) {
1686             $method = 'nameToCode2';
1687         } else {
1688             $method = 'nameToCode3';
1689         }
1690
1691         if (is_string($lang)) {
1692             return Text_LanguageDetect_ISO639::$method($lang);
1693         }
1694
1695         $newlang = array();
1696         foreach ($lang as $key => $val) {
1697             if ($convertKey) {
1698                 $newkey = Text_LanguageDetect_ISO639::$method($key);
1699                 $newlang[$newkey] = $val;
1700             } else {
1701                 $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
1702             }
1703         }
1704         return $newlang;
1705     }
1706 }
1707
1708 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */