plugins/af_lang_detect/languagedetect/LanguageDetect.php

   1 <?php
   2
   3 /**
   4  * Detects the language of a given piece of text.
   5  *
   6  * Attempts to detect the language of a sample of text by correlating ranked
   7  * 3-gram frequencies to a table of 3-gram frequencies of known languages.
   8  *
   9  * Implements a version of a technique originally proposed by Cavnar & Trenkle
  10  * (1994): "N-Gram-Based Text Categorization"
  11  *
  12  * PHP version 5
  13  *
  14  * @category  Text
  15  * @package   Text_LanguageDetect
  16  * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
  17  * @copyright 2005-2006 Nicholas Pisarro
  18  * @license   http://www.debian.org/misc/bsd.license BSD
  19  * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
  20  * @link      http://pear.php.net/package/Text_LanguageDetect/
  21  * @link      http://langdetect.blogspot.com/
  22  */
  23
  24 require_once 'lib/languagedetect/Text/LanguageDetect/Exception.php';
  25 require_once 'lib/languagedetect/Text/LanguageDetect/Parser.php';
  26 require_once 'lib/languagedetect/Text/LanguageDetect/ISO639.php';
  27
  28 /**
  29  * Language detection class
  30  *
  31  * Requires the language model database (lang.dat) that should have
  32  * accompanied this class definition in order to be instantiated.
  33  *
  34  * Example usage:
  35  *
  36  * <code>
  37  * require_once 'Text/LanguageDetect.php';
  38  *
  39  * $l = new Text_LanguageDetect;
  40  *
  41  * $stdin = fopen('php://stdin', 'r');
  42  *
  43  * echo "Supported languages:\n";
  44  *
  45  * try {
  46  *     $langs = $l->getLanguages();
  47  * } catch (Text_LanguageDetect_Exception $e) {
  48  *     die($e->getMessage());
  49  * }
  50  *
  51  * sort($langs);
  52  * echo join(', ', $langs);
  53  *
  54  * while ($line = fgets($stdin)) {
  55  *     print_r($l->detect($line, 4));
  56  * }
  57  * </code>
  58  *
  59  * @category  Text
  60  * @package   Text_LanguageDetect
  61  * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
  62  * @copyright 2005 Nicholas Pisarro
  63  * @license   http://www.debian.org/misc/bsd.license BSD
  64  * @version   Release: @package_version@
  65  * @link      http://pear.php.net/package/Text_LanguageDetect/
  66  * @todo      allow users to generate their own language models
  67  */
  68 class Text_LanguageDetect
  69 {
  70     /**
  71      * The filename that stores the trigram data for the detector
  72      *
  73      * If this value starts with a slash (/) or a dot (.) the value of
  74      * $this->_data_dir will be ignored
  75      *
  76      * @var      string
  77      * @access   private
  78      */
  79     var $_db_filename = 'lang.dat';
  80
  81     /**
  82      * The filename that stores the unicode block definitions
  83      *
  84      * If this value starts with a slash (/) or a dot (.) the value of
  85      * $this->_data_dir will be ignored
  86      *
  87      * @var string
  88      * @access private
  89      */
  90     var $_unicode_db_filename = 'unicode_blocks.dat';
  91
  92     /**
  93      * The data directory
  94      *
  95      * Should be set by PEAR installer
  96      *
  97      * @var      string
  98      * @access   private
  99      */
 100     var $_data_dir = '@data_dir@';
 101
 102     /**
 103      * The trigram data for comparison
 104      *
 105      * Will be loaded on start from $this->_db_filename
 106      *
 107      * @var      array
 108      * @access   private
 109      */
 110     var $_lang_db = array();
 111
 112     /**
 113      * stores the map of the trigram data to unicode characters
 114      *
 115      * @access private
 116      * @var array
 117      */
 118     var $_unicode_map;
 119
 120     /**
 121      * The size of the trigram data arrays
 122      *
 123      * @var      int
 124      * @access   private
 125      */
 126     var $_threshold = 300;
 127
 128     /**
 129      * the maximum possible score.
 130      *
 131      * needed for score normalization. Different depending on the
 132      * perl compatibility setting
 133      *
 134      * @access  private
 135      * @var     int
 136      * @see     setPerlCompatible()
 137      */
 138     var $_max_score = 0;
 139
 140     /**
 141      * Whether or not to simulate perl's Language::Guess exactly
 142      *
 143      * @access  private
 144      * @var     bool
 145      * @see     setPerlCompatible()
 146      */
 147     var $_perl_compatible = false;
 148
 149     /**
 150      * Whether to use the unicode block detection to speed up processing
 151      *
 152      * @access private
 153      * @var bool
 154      */
 155     var $_use_unicode_narrowing = true;
 156
 157     /**
 158      * stores the result of the clustering operation
 159      *
 160      * @access  private
 161      * @var     array
 162      * @see     clusterLanguages()
 163      */
 164     var $_clusters;
 165
 166     /**
 167      * Which type of "language names" are accepted and returned:
 168      *
 169      * 0 - language name ("english")
 170      * 2 - 2-letter ISO 639-1 code ("en")
 171      * 3 - 3-letter ISO 639-2 code ("eng")
 172      */
 173     var $_name_mode = 0;
 174
 175     /**
 176      * Constructor
 177      *
 178      * Will attempt to load the language database. If it fails, you will get
 179      * an exception.
 180      */
 181     function __construct()
 182     {
 183         $data = $this->_readdb($this->_db_filename);
 184         $this->_checkTrigram($data['trigram']);
 185         $this->_lang_db = $data['trigram'];
 186
 187         if (isset($data['trigram-unicodemap'])) {
 188             $this->_unicode_map = $data['trigram-unicodemap'];
 189         }
 190
 191         // Not yet implemented:
 192         if (isset($data['trigram-clusters'])) {
 193             $this->_clusters = $data['trigram-clusters'];
 194         }
 195     }
 196
 197     /**
 198      * Returns the path to the location of the database
 199      *
 200      * @param string $fname File name to load
 201      *
 202      * @return string expected path to the language model database
 203      * @access private
 204      */
 205     function _get_data_loc($fname)
 206     {
 207         if ($fname{0} == '/' || $fname{0} == '.') {
 208             // if filename starts with a slash, assume it's an absolute pathname
 209             // and skip whatever is in $this->_data_dir
 210             return $fname;
 211
 212         } elseif ($this->_data_dir != '@' . 'data_dir' . '@') {
 213             // if the data dir was set by the PEAR installer, use that
 214             return $this->_data_dir . '/Text_LanguageDetect/' . $fname;
 215
 216         } else {
 217             // assume this was just unpacked somewhere
 218             // try the local working directory if otherwise
 219             return __DIR__ . '/data/' . $fname;
 220         }
 221     }
 222
 223     /**
 224      * Loads the language trigram database from filename
 225      *
 226      * Trigram datbase should be a serialize()'d array
 227      *
 228      * @param string $fname the filename where the data is stored
 229      *
 230      * @return array the language model data
 231      * @throws Text_LanguageDetect_Exception
 232      * @access private
 233      */
 234     function _readdb($fname)
 235     {
 236         // finds the correct data dir
 237         $fname = $this->_get_data_loc($fname);
 238
 239         // input check
 240         if (!file_exists($fname)) {
 241             throw new Text_LanguageDetect_Exception(
 242                 'Language database does not exist: ' . $fname,
 243                 Text_LanguageDetect_Exception::DB_NOT_FOUND
 244             );
 245         } elseif (!is_readable($fname)) {
 246             throw new Text_LanguageDetect_Exception(
 247                 'Language database is not readable: ' . $fname,
 248                 Text_LanguageDetect_Exception::DB_NOT_READABLE
 249             );
 250         }
 251
 252         return unserialize(file_get_contents($fname));
 253     }
 254
 255
 256     /**
 257      * Checks if this object is ready to detect languages
 258      *
 259      * @param array $trigram Trigram data from database
 260      *
 261      * @return void
 262      * @access private
 263      */
 264     function _checkTrigram($trigram)
 265     {
 266         if (!is_array($trigram)) {
 267             if (ini_get('magic_quotes_runtime')) {
 268                 throw new Text_LanguageDetect_Exception(
 269                     'Error loading database. Try turning magic_quotes_runtime off.',
 270                     Text_LanguageDetect_Exception::MAGIC_QUOTES
 271                 );
 272             }
 273             throw new Text_LanguageDetect_Exception(
 274                 'Language database is not an array.',
 275                 Text_LanguageDetect_Exception::DB_NOT_ARRAY
 276             );
 277         } elseif (empty($trigram)) {
 278             throw new Text_LanguageDetect_Exception(
 279                 'Language database has no elements.',
 280                 Text_LanguageDetect_Exception::DB_EMPTY
 281             );
 282         }
 283     }
 284
 285     /**
 286      * Omits languages
 287      *
 288      * Pass this function the name of or an array of names of
 289      * languages that you don't want considered
 290      *
 291      * If you're only expecting a limited set of languages, this can greatly
 292      * speed up processing
 293      *
 294      * @param mixed $omit_list    language name or array of names to omit
 295      * @param bool  $include_only if true will include (rather than
 296      *                            exclude) only those in the list
 297      *
 298      * @return int number of languages successfully deleted
 299      * @throws Text_LanguageDetect_Exception
 300      */
 301     public function omitLanguages($omit_list, $include_only = false)
 302     {
 303         $deleted = 0;
 304
 305         $omit_list = $this->_convertFromNameMode($omit_list);
 306
 307         if (!$include_only) {
 308             // deleting the given languages
 309             if (!is_array($omit_list)) {
 310                 $omit_list = strtolower($omit_list); // case desensitize
 311                 if (isset($this->_lang_db[$omit_list])) {
 312                     unset($this->_lang_db[$omit_list]);
 313                     $deleted++;
 314                 }
 315             } else {
 316                 foreach ($omit_list as $omit_lang) {
 317                     if (isset($this->_lang_db[$omit_lang])) {
 318                         unset($this->_lang_db[$omit_lang]);
 319                         $deleted++;
 320                     }
 321                 }
 322             }
 323
 324         } else {
 325             // deleting all except the given languages
 326             if (!is_array($omit_list)) {
 327                 $omit_list = array($omit_list);
 328             }
 329
 330             // case desensitize
 331             foreach ($omit_list as $key => $omit_lang) {
 332                 $omit_list[$key] = strtolower($omit_lang);
 333             }
 334
 335             foreach (array_keys($this->_lang_db) as $lang) {
 336                 if (!in_array($lang, $omit_list)) {
 337                     unset($this->_lang_db[$lang]);
 338                     $deleted++;
 339                 }
 340             }
 341         }
 342
 343         // reset the cluster cache if the number of languages changes
 344         // this will then have to be recalculated
 345         if (isset($this->_clusters) && $deleted > 0) {
 346             $this->_clusters = null;
 347         }
 348
 349         return $deleted;
 350     }
 351
 352
 353     /**
 354      * Returns the number of languages that this object can detect
 355      *
 356      * @access public
 357      * @return int            the number of languages
 358      * @throws   Text_LanguageDetect_Exception
 359      */
 360     function getLanguageCount()
 361     {
 362         return count($this->_lang_db);
 363     }
 364
 365     /**
 366      * Checks if the language with the given name exists in the database
 367      *
 368      * @param mixed $lang Language name or array of language names
 369      *
 370      * @return bool true if language model exists
 371      */
 372     public function languageExists($lang)
 373     {
 374         $lang = $this->_convertFromNameMode($lang);
 375
 376         if (is_string($lang)) {
 377             return isset($this->_lang_db[strtolower($lang)]);
 378
 379         } elseif (is_array($lang)) {
 380             foreach ($lang as $test_lang) {
 381                 if (!isset($this->_lang_db[strtolower($test_lang)])) {
 382                     return false;
 383                 }
 384             }
 385             return true;
 386
 387         } else {
 388             throw new Text_LanguageDetect_Exception(
 389                 'Unsupported parameter type passed to languageExists()',
 390                 Text_LanguageDetect_Exception::PARAM_TYPE
 391             );
 392         }
 393     }
 394
 395     /**
 396      * Returns the list of detectable languages
 397      *
 398      * @access public
 399      * @return array        the names of the languages known to this object<<<<<<<
 400      * @throws   Text_LanguageDetect_Exception
 401      */
 402     function getLanguages()
 403     {
 404         return $this->_convertToNameMode(
 405             array_keys($this->_lang_db)
 406         );
 407     }
 408
 409     /**
 410      * Make this object behave like Language::Guess
 411      *
 412      * @param bool $setting false to turn off perl compatibility
 413      *
 414      * @return void
 415      */
 416     public function setPerlCompatible($setting = true)
 417     {
 418         if (is_bool($setting)) { // input check
 419             $this->_perl_compatible = $setting;
 420
 421             if ($setting == true) {
 422                 $this->_max_score = $this->_threshold;
 423             } else {
 424                 $this->_max_score = 0;
 425             }
 426         }
 427
 428     }
 429
 430     /**
 431      * Sets the way how language names are accepted and returned.
 432      *
 433      * @param integer $name_mode One of the following modes:
 434      *                           0 - language name ("english")
 435      *                           2 - 2-letter ISO 639-1 code ("en")
 436      *                           3 - 3-letter ISO 639-2 code ("eng")
 437      *
 438      * @return void
 439      */
 440     function setNameMode($name_mode)
 441     {
 442         $this->_name_mode = $name_mode;
 443     }
 444
 445     /**
 446      * Whether to use unicode block ranges in detection
 447      *
 448      * Should speed up most detections if turned on (detault is on). In some
 449      * circumstances it may be slower, such as for large text samples (> 10K)
 450      * in languages that use latin scripts. In other cases it should speed up
 451      * detection noticeably.
 452      *
 453      * @param bool $setting false to turn off
 454      *
 455      * @return void
 456      */
 457     public function useUnicodeBlocks($setting = true)
 458     {
 459         if (is_bool($setting)) {
 460             $this->_use_unicode_narrowing = $setting;
 461         }
 462     }
 463
 464     /**
 465      * Converts a piece of text into trigrams
 466      *
 467      * @param string $text text to convert
 468      *
 469      * @return     array array of trigram frequencies
 470      * @access     private
 471      * @deprecated Superceded by the Text_LanguageDetect_Parser class
 472      */
 473     function _trigram($text)
 474     {
 475         $s = new Text_LanguageDetect_Parser($text);
 476         $s->prepareTrigram();
 477         $s->prepareUnicode(false);
 478         $s->setPadStart(!$this->_perl_compatible);
 479         $s->analyze();
 480         return $s->getTrigramFreqs();
 481     }
 482
 483     /**
 484      * Converts a set of trigrams from frequencies to ranks
 485      *
 486      * Thresholds (cuts off) the list at $this->_threshold
 487      *
 488      * @param array $arr array of trigram
 489      *
 490      * @return array ranks of trigrams
 491      * @access protected
 492      */
 493     function _arr_rank($arr)
 494     {
 495
 496         // sorts alphabetically first as a standard way of breaking rank ties
 497         $this->_bub_sort($arr);
 498
 499         // below might also work, but seemed to introduce errors in testing
 500         //ksort($arr);
 501         //asort($arr);
 502
 503         $rank = array();
 504
 505         $i = 0;
 506         foreach ($arr as $key => $value) {
 507             $rank[$key] = $i++;
 508
 509             // cut off at a standard threshold
 510             if ($i >= $this->_threshold) {
 511                 break;
 512             }
 513         }
 514
 515         return $rank;
 516     }
 517
 518     /**
 519      * Sorts an array by value breaking ties alphabetically
 520      *
 521      * @param array &$arr the array to sort
 522      *
 523      * @return void
 524      * @access private
 525      */
 526     function _bub_sort(&$arr)
 527     {
 528         // should do the same as this perl statement:
 529         // sort { $trigrams{$b} == $trigrams{$a}
 530         //   ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 531
 532         // needs to sort by both key and value at once
 533         // using the key to break ties for the value
 534
 535         // converts array into an array of arrays of each key and value
 536         // may be a better way of doing this
 537         $combined = array();
 538
 539         foreach ($arr as $key => $value) {
 540             $combined[] = array($key, $value);
 541         }
 542
 543         usort($combined, array($this, '_sort_func'));
 544
 545         $replacement = array();
 546         foreach ($combined as $key => $value) {
 547             list($new_key, $new_value) = $value;
 548             $replacement[$new_key] = $new_value;
 549         }
 550
 551         $arr = $replacement;
 552     }
 553
 554     /**
 555      * Sort function used by bubble sort
 556      *
 557      * Callback function for usort().
 558      *
 559      * @param array $a first param passed by usort()
 560      * @param array $b second param passed by usort()
 561      *
 562      * @return int 1 if $a is greater, -1 if not
 563      * @see    _bub_sort()
 564      * @access private
 565      */
 566     function _sort_func($a, $b)
 567     {
 568         // each is actually a key/value pair, so that it can compare using both
 569         list($a_key, $a_value) = $a;
 570         list($b_key, $b_value) = $b;
 571
 572         if ($a_value == $b_value) {
 573             // if the values are the same, break ties using the key
 574             return strcmp($a_key, $b_key);
 575
 576         } else {
 577             // if not, just sort normally
 578             if ($a_value > $b_value) {
 579                 return -1;
 580             } else {
 581                 return 1;
 582             }
 583         }
 584
 585         // 0 should not be possible because keys must be unique
 586     }
 587
 588     /**
 589      * Calculates a linear rank-order distance statistic between two sets of
 590      * ranked trigrams
 591      *
 592      * Sums the differences in rank for each trigram. If the trigram does not
 593      * appear in both, consider it a difference of $this->_threshold.
 594      *
 595      * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
 596      * its simplicity it has been shown to be highly accurate for language
 597      * identification tasks.
 598      *
 599      * @param array $arr1 the reference set of trigram ranks
 600      * @param array $arr2 the target set of trigram ranks
 601      *
 602      * @return int the sum of the differences between the ranks of
 603      *             the two trigram sets
 604      * @access private
 605      */
 606     function _distance($arr1, $arr2)
 607     {
 608         $sumdist = 0;
 609
 610         foreach ($arr2 as $key => $value) {
 611             if (isset($arr1[$key])) {
 612                 $distance = abs($value - $arr1[$key]);
 613             } else {
 614                 // $this->_threshold sets the maximum possible distance value
 615                 // for any one pair of trigrams
 616                 $distance = $this->_threshold;
 617             }
 618             $sumdist += $distance;
 619         }
 620
 621         return $sumdist;
 622
 623         // todo: there are other distance statistics to try, e.g. relative
 624         //       entropy, but they're probably more costly to compute
 625     }
 626
 627     /**
 628      * Normalizes the score returned by _distance()
 629      *
 630      * Different if perl compatible or not
 631      *
 632      * @param int $score      the score from _distance()
 633      * @param int $base_count the number of trigrams being considered
 634      *
 635      * @return float the normalized score
 636      * @see    _distance()
 637      * @access private
 638      */
 639     function _normalize_score($score, $base_count = null)
 640     {
 641         if ($base_count === null) {
 642             $base_count = $this->_threshold;
 643         }
 644
 645         if (!$this->_perl_compatible) {
 646             return 1 - ($score / $base_count / $this->_threshold);
 647         } else {
 648             return floor($score / $base_count);
 649         }
 650     }
 651
 652
 653     /**
 654      * Detects the closeness of a sample of text to the known languages
 655      *
 656      * Calculates the statistical difference between the text and
 657      * the trigrams for each language, normalizes the score then
 658      * returns results for all languages in sorted order
 659      *
 660      * If perl compatible, the score is 300-0, 0 being most similar.
 661      * Otherwise, it's 0-1 with 1 being most similar.
 662      *
 663      * The $sample text should be at least a few sentences in length;
 664      * should be ascii-7 or utf8 encoded, if another and the mbstring extension
 665      * is present it will try to detect and convert. However, experience has
 666      * shown that mb_detect_encoding() *does not work very well* with at least
 667      * some types of encoding.
 668      *
 669      * @param string $sample a sample of text to compare.
 670      * @param int    $limit  if specified, return an array of the most likely
 671      *                       $limit languages and their scores.
 672      *
 673      * @return mixed sorted array of language scores, blank array if no
 674      *               useable text was found
 675      * @see    _distance()
 676      * @throws Text_LanguageDetect_Exception
 677      */
 678     public function detect($sample, $limit = 0)
 679     {
 680         // input check
 681         if (!Text_LanguageDetect_Parser::validateString($sample)) {
 682             return array();
 683         }
 684
 685         // check char encoding
 686         // (only if mbstring extension is compiled and PHP > 4.0.6)
 687         if (function_exists('mb_detect_encoding')
 688             && function_exists('mb_convert_encoding')
 689         ) {
 690             // mb_detect_encoding isn't very reliable, to say the least
 691             // detection should still work with a sufficient sample
 692             //  of ascii characters
 693             $encoding = mb_detect_encoding($sample);
 694
 695             // mb_detect_encoding() will return FALSE if detection fails
 696             // don't attempt conversion if that's the case
 697             if ($encoding != 'ASCII' && $encoding != 'UTF-8'
 698                 && $encoding !== false
 699             ) {
 700                 // verify the encoding exists in mb_list_encodings
 701                 if (in_array($encoding, mb_list_encodings())) {
 702                     $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
 703                 }
 704             }
 705         }
 706
 707         $sample_obj = new Text_LanguageDetect_Parser($sample);
 708         $sample_obj->prepareTrigram();
 709         if ($this->_use_unicode_narrowing) {
 710             $sample_obj->prepareUnicode();
 711         }
 712         $sample_obj->setPadStart(!$this->_perl_compatible);
 713         $sample_obj->analyze();
 714
 715         $trigram_freqs =& $sample_obj->getTrigramRanks();
 716         $trigram_count = count($trigram_freqs);
 717
 718         if ($trigram_count == 0) {
 719             return array();
 720         }
 721
 722         $scores = array();
 723
 724         // use unicode block detection to narrow down the possibilities
 725         if ($this->_use_unicode_narrowing) {
 726             $blocks =& $sample_obj->getUnicodeBlocks();
 727
 728             if (is_array($blocks)) {
 729                 $present_blocks = array_keys($blocks);
 730             } else {
 731                 throw new Text_LanguageDetect_Exception(
 732                     'Error during block detection',
 733                     Text_LanguageDetect_Exception::BLOCK_DETECTION
 734                 );
 735             }
 736
 737             $possible_langs = array();
 738
 739             foreach ($present_blocks as $blockname) {
 740                 if (isset($this->_unicode_map[$blockname])) {
 741
 742                     $possible_langs = array_merge(
 743                         $possible_langs,
 744                         array_keys($this->_unicode_map[$blockname])
 745                     );
 746
 747                     // todo: faster way to do this?
 748                 }
 749             }
 750
 751             // could also try an intersect operation rather than a union
 752             // in other words, choose languages whose trigrams contain
 753             // ALL of the unicode blocks found in this sample
 754             // would improve speed but would be completely thrown off by an
 755             // unexpected character, like an umlaut appearing in english text
 756
 757             $possible_langs = array_intersect(
 758                 array_keys($this->_lang_db),
 759                 array_unique($possible_langs)
 760             );
 761
 762             // needs to intersect it with the keys of _lang_db in case
 763             // languages have been omitted
 764
 765         } else {
 766             // or just try 'em all
 767             $possible_langs = array_keys($this->_lang_db);
 768         }
 769
 770
 771         foreach ($possible_langs as $lang) {
 772             $scores[$lang] = $this->_normalize_score(
 773                 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
 774                 $trigram_count
 775             );
 776         }
 777
 778         unset($sample_obj);
 779
 780         if ($this->_perl_compatible) {
 781             asort($scores);
 782         } else {
 783             arsort($scores);
 784         }
 785
 786         // todo: drop languages with a score of $this->_max_score?
 787
 788         // limit the number of returned scores
 789         if ($limit && is_numeric($limit)) {
 790             $limited_scores = array();
 791
 792             $i = 0;
 793             foreach ($scores as $key => $value) {
 794                 if ($i++ >= $limit) {
 795                     break;
 796                 }
 797
 798                 $limited_scores[$key] = $value;
 799             }
 800
 801             return $this->_convertToNameMode($limited_scores, true);
 802         } else {
 803             return $this->_convertToNameMode($scores, true);
 804         }
 805     }
 806
 807     /**
 808      * Returns only the most similar language to the text sample
 809      *
 810      * Calls $this->detect() and returns only the top result
 811      *
 812      * @param string $sample text to detect the language of
 813      *
 814      * @return string the name of the most likely language
 815      *                or null if no language is similar
 816      * @see    detect()
 817      * @throws Text_LanguageDetect_Exception
 818      */
 819     public function detectSimple($sample)
 820     {
 821         $scores = $this->detect($sample, 1);
 822
 823         // if top language has the maximum possible score,
 824         // then the top score will have been picked at random
 825         if (!is_array($scores) || empty($scores)
 826             || current($scores) == $this->_max_score
 827         ) {
 828             return null;
 829         } else {
 830             return key($scores);
 831         }
 832     }
 833
 834     /**
 835      * Returns an array containing the most similar language and a confidence
 836      * rating
 837      *
 838      * Confidence is a simple measure calculated from the similarity score
 839      * minus the similarity score from the next most similar language
 840      * divided by the highest possible score. Languages that have closely
 841      * related cousins (e.g. Norwegian and Danish) should generally have lower
 842      * confidence scores.
 843      *
 844      * The similarity score answers the question "How likely is the text the
 845      * returned language regardless of the other languages considered?" The
 846      * confidence score is one way of answering the question "how likely is the
 847      * text the detected language relative to the rest of the language model
 848      * set?"
 849      *
 850      * To see how similar languages are a priori, see languageSimilarity()
 851      *
 852      * @param string $sample text for which language will be detected
 853      *
 854      * @return array most similar language, score and confidence rating
 855      *               or null if no language is similar
 856      * @see    detect()
 857      * @throws Text_LanguageDetect_Exception
 858      */
 859     public function detectConfidence($sample)
 860     {
 861         $scores = $this->detect($sample, 2);
 862
 863         // if most similar language has the max score, it
 864         // will have been picked at random
 865         if (!is_array($scores) || empty($scores)
 866             || current($scores) == $this->_max_score
 867         ) {
 868             return null;
 869         }
 870
 871         $arr['language'] = key($scores);
 872         $arr['similarity'] = current($scores);
 873         if (next($scores) !== false) { // if false then no next element
 874             // the goal is to return a higher value if the distance between
 875             // the similarity of the first score and the second score is high
 876
 877             if ($this->_perl_compatible) {
 878                 $arr['confidence'] = (current($scores) - $arr['similarity'])
 879                     / $this->_max_score;
 880
 881             } else {
 882                 $arr['confidence'] = $arr['similarity'] - current($scores);
 883
 884             }
 885
 886         } else {
 887             $arr['confidence'] = null;
 888         }
 889
 890         return $arr;
 891     }
 892
 893     /**
 894      * Returns the distribution of unicode blocks in a given utf8 string
 895      *
 896      * For the block name of a single char, use unicodeBlockName()
 897      *
 898      * @param string $str          input string. Must be ascii or utf8
 899      * @param bool   $skip_symbols if true, skip ascii digits, symbols and
 900      *                             non-printing characters. Includes spaces,
 901      *                             newlines and common punctutation characters.
 902      *
 903      * @return array
 904      * @throws Text_LanguageDetect_Exception
 905      */
 906     public function detectUnicodeBlocks($str, $skip_symbols)
 907     {
 908         $skip_symbols = (bool)$skip_symbols;
 909         $str          = (string)$str;
 910
 911         $sample_obj = new Text_LanguageDetect_Parser($str);
 912         $sample_obj->prepareUnicode();
 913         $sample_obj->prepareTrigram(false);
 914         $sample_obj->setUnicodeSkipSymbols($skip_symbols);
 915         $sample_obj->analyze();
 916         $blocks = $sample_obj->getUnicodeBlocks();
 917         unset($sample_obj);
 918         return $blocks;
 919     }
 920
 921     /**
 922      * Returns the block name for a given unicode value
 923      *
 924      * If passed a string, will assume it is being passed a UTF8-formatted
 925      * character and will automatically convert. Otherwise it will assume it
 926      * is being passed a numeric unicode value.
 927      *
 928      * Make sure input is of the correct type!
 929      *
 930      * @param mixed $unicode unicode value or utf8 char
 931      *
 932      * @return mixed the block name string or false if not found
 933      * @throws Text_LanguageDetect_Exception
 934      */
 935     public function unicodeBlockName($unicode)
 936     {
 937         if (is_string($unicode)) {
 938             // assume it is being passed a utf8 char, so convert it
 939             if (self::utf8strlen($unicode) > 1) {
 940                 throw new Text_LanguageDetect_Exception(
 941                     'Pass a single char only to this method',
 942                     Text_LanguageDetect_Exception::PARAM_TYPE
 943                 );
 944             }
 945             $unicode = $this->_utf8char2unicode($unicode);
 946
 947         } elseif (!is_int($unicode)) {
 948             throw new Text_LanguageDetect_Exception(
 949                 'Input must be of type string or int.',
 950                 Text_LanguageDetect_Exception::PARAM_TYPE
 951             );
 952         }
 953
 954         $blocks = $this->_read_unicode_block_db();
 955
 956         $result = $this->_unicode_block_name($unicode, $blocks);
 957
 958         if ($result == -1) {
 959             return false;
 960         } else {
 961             return $result[2];
 962         }
 963     }
 964
 965     /**
 966      * Searches the unicode block database
 967      *
 968      * Returns the block name for a given unicode value. unicodeBlockName() is
 969      * the public interface for this function, which does input checks which
 970      * this function omits for speed.
 971      *
 972      * @param int   $unicode     the unicode value
 973      * @param array $blocks      the block database
 974      * @param int   $block_count the number of defined blocks in the database
 975      *
 976      * @return mixed Block name, -1 if it failed
 977      * @see    unicodeBlockName()
 978      * @access protected
 979      */
 980     function _unicode_block_name($unicode, $blocks, $block_count = -1)
 981     {
 982         // for a reference, see
 983         // http://www.unicode.org/Public/UNIDATA/Blocks.txt
 984
 985         // assume that ascii characters are the most common
 986         // so try it first for efficiency
 987         if ($unicode <= $blocks[0][1]) {
 988             return $blocks[0];
 989         }
 990
 991         // the optional $block_count param is for efficiency
 992         // so we this function doesn't have to run count() every time
 993         if ($block_count != -1) {
 994             $high = $block_count - 1;
 995         } else {
 996             $high = count($blocks) - 1;
 997         }
 998
 999         $low = 1; // start with 1 because ascii was 0
1000
1001         // your average binary search algorithm
1002         while ($low <= $high) {
1003             $mid = floor(($low + $high) / 2);
1004
1005             if ($unicode < $blocks[$mid][0]) {
1006                 // if it's lower than the lower bound
1007                 $high = $mid - 1;
1008
1009             } elseif ($unicode > $blocks[$mid][1]) {
1010                 // if it's higher than the upper bound
1011                 $low = $mid + 1;
1012
1013             } else {
1014                 // found it
1015                 return $blocks[$mid];
1016             }
1017         }
1018
1019         // failed to find the block
1020         return -1;
1021
1022         // todo: differentiate when it's out of range or when it falls
1023         //       into an unassigned range?
1024     }
1025
1026     /**
1027      * Brings up the unicode block database
1028      *
1029      * @return array the database of unicode block definitions
1030      * @throws Text_LanguageDetect_Exception
1031      * @access protected
1032      */
1033     function _read_unicode_block_db()
1034     {
1035         // since the unicode definitions are always going to be the same,
1036         // might as well share the memory for the db with all other instances
1037         // of this class
1038         static $data;
1039
1040         if (!isset($data)) {
1041             $data = $this->_readdb($this->_unicode_db_filename);
1042         }
1043
1044         return $data;
1045     }
1046
1047     /**
1048      * Calculate the similarities between the language models
1049      *
1050      * Use this function to see how similar languages are to each other.
1051      *
1052      * If passed 2 language names, will return just those languages compared.
1053      * If passed 1 language name, will return that language compared to
1054      * all others.
1055      * If passed none, will return an array of every language model compared
1056      * to every other one.
1057      *
1058      * @param string $lang1 the name of the first language to be compared
1059      * @param string $lang2 the name of the second language to be compared
1060      *
1061      * @return array scores of every language compared
1062      *               or the score of just the provided languages
1063      *               or null if one of the supplied languages does not exist
1064      * @throws Text_LanguageDetect_Exception
1065      */
1066     public function languageSimilarity($lang1 = null, $lang2 = null)
1067     {
1068         $lang1 = $this->_convertFromNameMode($lang1);
1069         $lang2 = $this->_convertFromNameMode($lang2);
1070         if ($lang1 != null) {
1071             $lang1 = strtolower($lang1);
1072
1073             // check if language model exists
1074             if (!isset($this->_lang_db[$lang1])) {
1075                 return null;
1076             }
1077
1078             if ($lang2 != null) {
1079                 if (!isset($this->_lang_db[$lang2])) {
1080                     // check if language model exists
1081                     return null;
1082                 }
1083
1084                 $lang2 = strtolower($lang2);
1085
1086                 // compare just these two languages
1087                 return $this->_normalize_score(
1088                     $this->_distance(
1089                         $this->_lang_db[$lang1],
1090                         $this->_lang_db[$lang2]
1091                     )
1092                 );
1093
1094             } else {
1095                 // compare just $lang1 to all languages
1096                 $return_arr = array();
1097                 foreach ($this->_lang_db as $key => $value) {
1098                     if ($key != $lang1) {
1099                         // don't compare a language to itself
1100                         $return_arr[$key] = $this->_normalize_score(
1101                             $this->_distance($this->_lang_db[$lang1], $value)
1102                         );
1103                     }
1104                 }
1105                 asort($return_arr);
1106
1107                 return $return_arr;
1108             }
1109
1110
1111         } else {
1112             // compare all languages to each other
1113             $return_arr = array();
1114             foreach (array_keys($this->_lang_db) as $lang1) {
1115                 foreach (array_keys($this->_lang_db) as $lang2) {
1116                     // skip comparing languages to themselves
1117                     if ($lang1 != $lang2) {
1118
1119                         if (isset($return_arr[$lang2][$lang1])) {
1120                             // don't re-calculate what's already been done
1121                             $return_arr[$lang1][$lang2]
1122                                 = $return_arr[$lang2][$lang1];
1123
1124                         } else {
1125                             // calculate
1126                             $return_arr[$lang1][$lang2]
1127                                 = $this->_normalize_score(
1128                                     $this->_distance(
1129                                         $this->_lang_db[$lang1],
1130                                         $this->_lang_db[$lang2]
1131                                     )
1132                                 );
1133
1134                         }
1135                     }
1136                 }
1137             }
1138             return $return_arr;
1139         }
1140     }
1141
    /**
     * Cluster known languages according to languageSimilarity()
     *
     * WARNING: this method is EXPERIMENTAL. It is not recommended for common
     * use, and it may disappear or its functionality may change in future
     * releases without notice.
     *
     * Uses a nearest neighbor technique to generate the maximum possible
     * number of dendrograms from the similarity data.
     *
     * Starting from the all-pairs score matrix of languageSimilarity(), the
     * two open entries with the highest mutual score are merged into a new
     * fork keyed "bestfit:otherfit". This repeats until only two open forks
     * remain, 200 passes have run, or the best remaining score is 0.
     * The result is stored in $this->_clusters and returned from there on
     * later calls.
     *
     * The returned array has three keys:
     *  - 'open_forks': the entries still unmerged when the loop stopped
     *                  (key and value are both the fork/language name)
     *  - 'fork_data':  one record per merge, indexed by the new fork key,
     *                  with the keys newkey, count, diff, score, bestfit,
     *                  otherfit and really
     *  - 'name_map':   fork key => name of the language whose data stands
     *                  in for that fork
     *
     * @access      public
     * @return      array language cluster data
     * @throws      Text_LanguageDetect_Exception
     * @see         languageSimilarity()
     * @deprecated  this function will eventually be removed and placed into
     *              the model generation class
     */
    function clusterLanguages()
    {
        // todo: set the maximum number of clusters
        // return cached result, if any
        if (isset($this->_clusters)) {
            return $this->_clusters;
        }

        $langs = array_keys($this->_lang_db);

        // all-pairs score matrix, indexed as $arr[$langA][$langB]
        $arr = $this->languageSimilarity();

        sort($langs);

        // sanity check: every name must have a model in the language db
        foreach ($langs as $lang) {
            if (!isset($this->_lang_db[$lang])) {
                throw new Text_LanguageDetect_Exception(
                    "missing $lang!",
                    Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
                );
            }
        }

        // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
        // re-key the list so that each name is also its own key; this lets
        // merged entries be removed with unset($langs[$name]) further down
        foreach ($langs as $old_key => $lang1) {
            $langs[$lang1] = $lang1;
            unset($langs[$old_key]);
        }

        $result_data = $really_map = array();

        // $i counts the merge passes; the loop is capped at 200 of them
        $i = 0;
        while (count($langs) > 2 && $i++ < 200) {
            // find the pair of open entries with the highest score;
            // on ties the first pair encountered is kept
            // NOTE(review): this always treats a higher score as "more
            // similar" - confirm that this also holds in Perl-compatible mode
            $highest_score = -1;
            $highest_key1 = '';
            $highest_key2 = '';
            foreach ($langs as $lang1) {
                foreach ($langs as $lang2) {
                    if ($lang1 != $lang2
                        && $arr[$lang1][$lang2] > $highest_score
                    ) {
                        $highest_score = $arr[$lang1][$lang2];
                        $highest_key1 = $lang1;
                        $highest_key2 = $lang2;
                    }
                }
            }

            if (!$highest_key1) {
                // should not ever happen
                throw new Text_LanguageDetect_Exception(
                    "no highest key? (step: $i)",
                    Text_LanguageDetect_Exception::NO_HIGHEST_KEY
                );
            }

            if ($highest_score == 0) {
                // languages are perfectly dissimilar
                break;
            }

            // $highest_key1 and $highest_key2 are most similar;
            // sum up each one's scores against everything else
            $sum1 = array_sum($arr[$highest_key1]);
            $sum2 = array_sum($arr[$highest_key2]);

            // use the score for the one that is most similar to the rest of
            // the field as the score for the group
            // todo: could try averaging or "centroid" method instead
            // seems like that might make more sense
            // actually nearest neighbor may be better for binary searching


            // for "Complete Linkage"/"furthest neighbor"
            // sign should be <
            // for "Single Linkage"/"nearest neighbor" method
            // sign should be >
            // results seem to be pretty much the same with either method

            // figure out which to delete and which to replace:
            // the entry with the larger score sum survives as the fork's
            // representative, the other one is dropped
            if ($sum1 > $sum2) {
                $replaceme = $highest_key1;
                $deleteme = $highest_key2;
            } else {
                $replaceme = $highest_key2;
                $deleteme = $highest_key1;
            }

            $newkey = $replaceme . ':' . $deleteme;

            // $replaceme is most similar to remaining languages
            // replace $replaceme with '$newkey', deleting $deleteme

            // keep a record of which fork is really which language:
            // follow the map until a name that is not itself a fork is found
            $really_lang = $replaceme;
            while (isset($really_map[$really_lang])) {
                $really_lang = $really_map[$really_lang];
            }
            $really_map[$newkey] = $really_lang;


            // replace the best fitting key, delete the other:
            // the row and column of $replaceme are moved to $newkey and the
            // row and column of $deleteme are removed. The statement order
            // inside the inner loop matters, so do not reorder it.
            // NOTE(review): for $key1 == $replaceme and $key2 == $deleteme
            // the value is copied to $arr[$newkey][$deleteme] before the
            // delete branch runs on the old row, so that one column appears
            // to survive in the new row and to feed into later array_sum()
            // calls - confirm whether this is intended.
            foreach ($arr as $key1 => $arr2) {
                foreach ($arr2 as $key2 => $value2) {
                    if ($key2 == $replaceme) {
                        $arr[$key1][$newkey] = $arr[$key1][$key2];
                        unset($arr[$key1][$key2]);
                        // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
                    }

                    if ($key1 == $replaceme) {
                        $arr[$newkey][$key2] = $arr[$key1][$key2];
                        unset($arr[$key1][$key2]);
                        // replacing $arr[$key1][$key2] with $arr[$newkey][$key2]
                    }

                    if ($key1 == $deleteme || $key2 == $deleteme) {
                        // deleting $arr[$key1][$key2]
                        unset($arr[$key1][$key2]);
                    }
                }
            }


            // both merged entries leave the open list, the fork joins it
            unset($langs[$highest_key1]);
            unset($langs[$highest_key2]);
            $langs[$newkey] = $newkey;


            // record of this merge; some of these may be overkill
            $result_data[$newkey] = array(
                                'newkey' => $newkey,
                                'count' => $i,
                                'diff' => abs($sum1 - $sum2),
                                'score' => $highest_score,
                                'bestfit' => $replaceme,
                                'otherfit' => $deleteme,
                                'really' => $really_lang,
                            );
        }

        $return_val = array(
                'open_forks' => $langs,
                        // the top level of clusters
                        // clusters that are mutually exclusive
                        // or specified by a specific maximum

                'fork_data' => $result_data,
                        // data for each split

                'name_map' => $really_map,
                        // which cluster is really which language
                        // using the nearest neighbor technique, the cluster
                        // inherits all of the properties of its most-similar member
                        // this keeps track
            );


        // saves the result in the object
        $this->_clusters = $return_val;

        return $return_val;
    }
1321
1322
1323     /**
1324      * Perform an intelligent detection based on clusterLanguages()
1325      *
1326      * WARNING: this method is EXPERIMENTAL. It is not recommended for common
1327      * use, and it may disappear or its functionality may change in future
1328      * releases without notice.
1329      *
1330      * This compares the sample text to top the top level of clusters. If the
1331      * sample is similar to the cluster it will drop down and compare it to the
1332      * languages in the cluster, and so on until it hits a leaf node.
1333      *
1334      * this should find the language in considerably fewer compares
1335      * (the equivalent of a binary search), however clusterLanguages() is costly
1336      * and the loss of accuracy from this technique is significant.
1337      *
1338      * This method may need to be 'fuzzier' in order to become more accurate.
1339      *
1340      * This function could be more useful if the universe of possible languages
1341      * was very large, however in such cases some method of Bayesian inference
1342      * might be more helpful.
1343      *
1344      * @param string $str input string
1345      *
1346      * @return array language scores (only those compared)
1347      * @throws Text_LanguageDetect_Exception
1348      * @see    clusterLanguages()
1349      */
1350     public function clusteredSearch($str)
1351     {
1352         // input check
1353         if (!Text_LanguageDetect_Parser::validateString($str)) {
1354             return array();
1355         }
1356
1357         // clusterLanguages() will return a cached result if possible
1358         // so it's safe to call it every time
1359         $result = $this->clusterLanguages();
1360
1361         $dendogram_start = $result['open_forks'];
1362         $dendogram_data  = $result['fork_data'];
1363         $dendogram_alias = $result['name_map'];
1364
1365         $sample_obj = new Text_LanguageDetect_Parser($str);
1366         $sample_obj->prepareTrigram();
1367         $sample_obj->setPadStart(!$this->_perl_compatible);
1368         $sample_obj->analyze();
1369         $sample_result = $sample_obj->getTrigramRanks();
1370         $sample_count  = count($sample_result);
1371
1372         // input check
1373         if ($sample_count == 0) {
1374             return array();
1375         }
1376
1377         $i = 0; // counts the number of steps
1378
1379         foreach ($dendogram_start as $lang) {
1380             if (isset($dendogram_alias[$lang])) {
1381                 $lang_key = $dendogram_alias[$lang];
1382             } else {
1383                 $lang_key = $lang;
1384             }
1385
1386             $scores[$lang] = $this->_normalize_score(
1387                 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1388                 $sample_count
1389             );
1390
1391             $i++;
1392         }
1393
1394         if ($this->_perl_compatible) {
1395             asort($scores);
1396         } else {
1397             arsort($scores);
1398         }
1399
1400         $top_score = current($scores);
1401         $top_key = key($scores);
1402
1403         // of starting forks, $top_key is the most similar to the sample
1404
1405         $cur_key = $top_key;
1406         while (isset($dendogram_data[$cur_key])) {
1407             $lang1 = $dendogram_data[$cur_key]['bestfit'];
1408             $lang2 = $dendogram_data[$cur_key]['otherfit'];
1409             foreach (array($lang1, $lang2) as $lang) {
1410                 if (isset($dendogram_alias[$lang])) {
1411                     $lang_key = $dendogram_alias[$lang];
1412                 } else {
1413                     $lang_key = $lang;
1414                 }
1415
1416                 $scores[$lang] = $this->_normalize_score(
1417                     $this->_distance($this->_lang_db[$lang_key], $sample_result),
1418                     $sample_count
1419                 );
1420
1421                 //todo: does not need to do same comparison again
1422             }
1423
1424             $i++;
1425
1426             if ($scores[$lang1] > $scores[$lang2]) {
1427                 $cur_key = $lang1;
1428                 $loser_key = $lang2;
1429             } else {
1430                 $cur_key = $lang2;
1431                 $loser_key = $lang1;
1432             }
1433
1434             $diff = $scores[$cur_key] - $scores[$loser_key];
1435
1436             // $cur_key ({$dendogram_alias[$cur_key]}) wins
1437             // over $loser_key ({$dendogram_alias[$loser_key]})
1438             // with a difference of $diff
1439         }
1440
1441         // found result in $i compares
1442
1443         // rather than sorting the result, preserve it so that you can see
1444         // which paths the algorithm decided to take along the tree
1445
1446         // but sometimes the last item is only the second highest
1447         if (($this->_perl_compatible  && (end($scores) > prev($scores)))
1448             || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1449         ) {
1450             $real_last_score = current($scores);
1451             $real_last_key = key($scores);
1452
1453             // swaps the 2nd-to-last item for the last item
1454             unset($scores[$real_last_key]);
1455             $scores[$real_last_key] = $real_last_score;
1456         }
1457
1458
1459         if (!$this->_perl_compatible) {
1460             $scores = array_reverse($scores, true);
1461             // second param requires php > 4.0.3
1462         }
1463
1464         return $scores;
1465     }
1466
1467     /**
1468      * ut8-safe strlen()
1469      *
1470      * Returns the numbers of characters (not bytes) in a utf8 string
1471      *
1472      * @param string $str string to get the length of
1473      *
1474      * @return int number of chars
1475      */
1476     public static function utf8strlen($str)
1477     {
1478         // utf8_decode() will convert unknown chars to '?', which is actually
1479         // ideal for counting.
1480
1481         return strlen(utf8_decode($str));
1482
1483         // idea stolen from dokuwiki
1484     }
1485
1486     /**
1487      * Returns the unicode value of a utf8 char
1488      *
1489      * @param string $char a utf8 (possibly multi-byte) char
1490      *
1491      * @return int unicode value
1492      * @access protected
1493      * @link   http://en.wikipedia.org/wiki/UTF-8
1494      */
1495     function _utf8char2unicode($char)
1496     {
1497         // strlen() here will actually get the binary length of a single char
1498         switch (strlen($char)) {
1499         case 1:
1500             // normal ASCII-7 byte
1501             // 0xxxxxxx -->  0xxxxxxx
1502             return ord($char{0});
1503
1504         case 2:
1505             // 2 byte unicode
1506             // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
1507             $z = (ord($char{0}) & 0x000001F) << 6;
1508             $x = (ord($char{1}) & 0x0000003F);
1509             return ($z | $x);
1510
1511         case 3:
1512             // 3 byte unicode
1513             // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
1514             $z =  (ord($char{0}) & 0x0000000F) << 12;
1515             $x1 = (ord($char{1}) & 0x0000003F) << 6;
1516             $x2 = (ord($char{2}) & 0x0000003F);
1517             return ($z | $x1 | $x2);
1518
1519         case 4:
1520             // 4 byte unicode
1521             // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
1522             // 000zzzzz xxxxxxxx xxxxxxxx
1523             $z1 = (ord($char{0}) & 0x00000007) << 18;
1524             $z2 = (ord($char{1}) & 0x0000003F) << 12;
1525             $x1 = (ord($char{2}) & 0x0000003F) << 6;
1526             $x2 = (ord($char{3}) & 0x0000003F);
1527             return ($z1 | $z2 | $x1 | $x2);
1528         }
1529     }
1530
1531     /**
1532      * utf8-safe fast character iterator
1533      *
1534      * Will get the next character starting from $counter, which will then be
1535      * incremented. If a multi-byte char the bytes will be concatenated and
1536      * $counter will be incremeted by the number of bytes in the char.
1537      *
1538      * @param string $str             the string being iterated over
1539      * @param int    &$counter        the iterator, will increment by reference
1540      * @param bool   $special_convert whether to do special conversions
1541      *
1542      * @return char the next (possibly multi-byte) char from $counter
1543      * @access private
1544      */
1545     static function _next_char($str, &$counter, $special_convert = false)
1546     {
1547         $char = $str{$counter++};
1548         $ord = ord($char);
1549
1550         // for a description of the utf8 system see
1551         // http://www.phpclasses.org/browse/file/5131.html
1552
1553         // normal ascii one byte char
1554         if ($ord <= 127) {
1555             // special conversions needed for this package
1556             // (that only apply to regular ascii characters)
1557             // lower case, and convert all non-alphanumeric characters
1558             // other than "'" to space
1559             if ($special_convert && $char != ' ' && $char != "'") {
1560                 if ($ord >= 65 && $ord <= 90) { // A-Z
1561                     $char = chr($ord + 32); // lower case
1562                 } elseif ($ord < 97 || $ord > 122) { // NOT a-z
1563                     $char = ' '; // convert to space
1564                 }
1565             }
1566
1567             return $char;
1568
1569         } elseif ($ord >> 5 == 6) { // two-byte char
1570             // multi-byte chars
1571             $nextchar = $str{$counter++}; // get next byte
1572
1573             // lower-casing of non-ascii characters is still incomplete
1574
1575             if ($special_convert) {
1576                 // lower case latin accented characters
1577                 if ($ord == 195) {
1578                     $nextord = ord($nextchar);
1579                     $nextord_adj = $nextord + 64;
1580                     // for a reference, see
1581                     // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1582
1583                     // &Agrave; - &THORN; but not &times;
1584                     if ($nextord_adj >= 192
1585                         && $nextord_adj <= 222
1586                         && $nextord_adj != 215
1587                     ) {
1588                         $nextchar = chr($nextord + 32);
1589                     }
1590
1591                 } elseif ($ord == 208) {
1592                     // lower case cyrillic alphabet
1593                     $nextord = ord($nextchar);
1594                     // if A - Pe
1595                     if ($nextord >= 144 && $nextord <= 159) {
1596                         // lower case
1597                         $nextchar = chr($nextord + 32);
1598
1599                     } elseif ($nextord >= 160 && $nextord <= 175) {
1600                         // if Er - Ya
1601                         // lower case
1602                         $char = chr(209); // == $ord++
1603                         $nextchar = chr($nextord - 32);
1604                     }
1605                 }
1606             }
1607
1608             // tag on next byte
1609             return $char . $nextchar;
1610         } elseif ($ord >> 4  == 14) { // three-byte char
1611
1612             // tag on next 2 bytes
1613             return $char . $str{$counter++} . $str{$counter++};
1614
1615         } elseif ($ord >> 3 == 30) { // four-byte char
1616
1617             // tag on next 3 bytes
1618             return $char . $str{$counter++} . $str{$counter++} . $str{$counter++};
1619
1620         } else {
1621             // error?
1622         }
1623     }
1624
1625     /**
1626      * Converts an $language input parameter from the configured mode
1627      * to the language name that is used internally.
1628      *
1629      * Works for strings and arrays.
1630      *
1631      * @param string|array $lang       A language description ("english"/"en"/"eng")
1632      * @param boolean      $convertKey If $lang is an array, setting $key
1633      *                                 converts the keys to the language name.
1634      *
1635      * @return string|array Language name
1636      */
1637     function _convertFromNameMode($lang, $convertKey = false)
1638     {
1639         if ($this->_name_mode == 0) {
1640             return $lang;
1641         }
1642
1643         if ($this->_name_mode == 2) {
1644             $method = 'code2ToName';
1645         } else {
1646             $method = 'code3ToName';
1647         }
1648
1649         if (is_string($lang)) {
1650             return (string)Text_LanguageDetect_ISO639::$method($lang);
1651         }
1652
1653         $newlang = array();
1654         foreach ($lang as $key => $val) {
1655             if ($convertKey) {
1656                 $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
1657                 $newlang[$newkey] = $val;
1658             } else {
1659                 $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
1660             }
1661         }
1662         return $newlang;
1663     }
1664
1665     /**
1666      * Converts an $language output parameter from the language name that is
1667      * used internally to the configured mode.
1668      *
1669      * Works for strings and arrays.
1670      *
1671      * @param string|array $lang       A language description ("english"/"en"/"eng")
1672      * @param boolean      $convertKey If $lang is an array, setting $key
1673      *                                 converts the keys to the language name.
1674      *
1675      * @return string|array Language name
1676      */
1677     function _convertToNameMode($lang, $convertKey = false)
1678     {
1679         if ($this->_name_mode == 0) {
1680             return $lang;
1681         }
1682
1683         if ($this->_name_mode == 2) {
1684             $method = 'nameToCode2';
1685         } else {
1686             $method = 'nameToCode3';
1687         }
1688
1689         if (is_string($lang)) {
1690             return Text_LanguageDetect_ISO639::$method($lang);
1691         }
1692
1693         $newlang = array();
1694         foreach ($lang as $key => $val) {
1695             if ($convertKey) {
1696                 $newkey = Text_LanguageDetect_ISO639::$method($key);
1697                 $newlang[$newkey] = $val;
1698             } else {
1699                 $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
1700             }
1701         }
1702         return $newlang;
1703     }
1704 }
1705
1706 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
1707
1708 ?>