]>
Commit | Line | Data |
---|---|---|
6b461797 AD |
1 | <?php |
2 | ||
3 | /** | |
4 | * Detects the language of a given piece of text. | |
5 | * | |
6 | * Attempts to detect the language of a sample of text by correlating ranked | |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | |
8 | * | |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | |
10 | * (1994): "N-Gram-Based Text Categorization" | |
11 | * | |
12 | * PHP version 5 | |
13 | * | |
14 | * @category Text | |
15 | * @package Text_LanguageDetect | |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
17 | * @copyright 2005-2006 Nicholas Pisarro | |
18 | * @license http://www.debian.org/misc/bsd.license BSD | |
19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ | |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
21 | * @link http://langdetect.blogspot.com/ | |
22 | */ | |
23 | ||
57243a95 AD |
24 | require_once __DIR__ . '/Text/LanguageDetect/Exception.php'; |
25 | require_once __DIR__ . '/Text/LanguageDetect/Parser.php'; | |
26 | require_once __DIR__ . '/Text/LanguageDetect/ISO639.php'; | |
6b461797 AD |
27 | |
28 | /** | |
29 | * Language detection class | |
30 | * | |
31 | * Requires the language model database (lang.dat) that should have | |
32 | * accompanied this class definition in order to be instantiated. | |
33 | * | |
34 | * Example usage: | |
35 | * | |
36 | * <code> | |
37 | * require_once 'Text/LanguageDetect.php'; | |
38 | * | |
39 | * $l = new Text_LanguageDetect; | |
40 | * | |
41 | * $stdin = fopen('php://stdin', 'r'); | |
42 | * | |
43 | * echo "Supported languages:\n"; | |
44 | * | |
45 | * try { | |
46 | * $langs = $l->getLanguages(); | |
47 | * } catch (Text_LanguageDetect_Exception $e) { | |
48 | * die($e->getMessage()); | |
49 | * } | |
50 | * | |
51 | * sort($langs); | |
52 | * echo join(', ', $langs); | |
53 | * | |
54 | * while ($line = fgets($stdin)) { | |
55 | * print_r($l->detect($line, 4)); | |
56 | * } | |
57 | * </code> | |
58 | * | |
59 | * @category Text | |
60 | * @package Text_LanguageDetect | |
61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
62 | * @copyright 2005 Nicholas Pisarro | |
63 | * @license http://www.debian.org/misc/bsd.license BSD | |
64 | * @version Release: @package_version@ | |
65 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
66 | * @todo allow users to generate their own language models | |
21ce7d9e AD |
67 | * |
68 | * @SuppressWarnings(PHPMD) | |
6b461797 AD |
69 | */ |
70 | class Text_LanguageDetect | |
71 | { | |
72 | /** | |
73 | * The filename that stores the trigram data for the detector | |
74 | * | |
75 | * If this value starts with a slash (/) or a dot (.) the value of | |
76 | * $this->_data_dir will be ignored | |
77 | * | |
78 | * @var string | |
79 | * @access private | |
80 | */ | |
81 | var $_db_filename = 'lang.dat'; | |
82 | ||
83 | /** | |
84 | * The filename that stores the unicode block definitions | |
85 | * | |
86 | * If this value starts with a slash (/) or a dot (.) the value of | |
87 | * $this->_data_dir will be ignored | |
88 | * | |
89 | * @var string | |
90 | * @access private | |
91 | */ | |
92 | var $_unicode_db_filename = 'unicode_blocks.dat'; | |
93 | ||
94 | /** | |
95 | * The data directory | |
96 | * | |
97 | * Should be set by PEAR installer | |
98 | * | |
99 | * @var string | |
100 | * @access private | |
101 | */ | |
102 | var $_data_dir = '@data_dir@'; | |
103 | ||
104 | /** | |
105 | * The trigram data for comparison | |
106 | * | |
107 | * Will be loaded on start from $this->_db_filename | |
108 | * | |
109 | * @var array | |
110 | * @access private | |
111 | */ | |
112 | var $_lang_db = array(); | |
113 | ||
114 | /** | |
115 | * stores the map of the trigram data to unicode characters | |
116 | * | |
117 | * @access private | |
118 | * @var array | |
119 | */ | |
120 | var $_unicode_map; | |
121 | ||
122 | /** | |
123 | * The size of the trigram data arrays | |
124 | * | |
125 | * @var int | |
126 | * @access private | |
127 | */ | |
128 | var $_threshold = 300; | |
129 | ||
130 | /** | |
131 | * the maximum possible score. | |
132 | * | |
133 | * needed for score normalization. Different depending on the | |
134 | * perl compatibility setting | |
135 | * | |
136 | * @access private | |
137 | * @var int | |
138 | * @see setPerlCompatible() | |
139 | */ | |
140 | var $_max_score = 0; | |
141 | ||
142 | /** | |
143 | * Whether or not to simulate perl's Language::Guess exactly | |
144 | * | |
145 | * @access private | |
146 | * @var bool | |
147 | * @see setPerlCompatible() | |
148 | */ | |
149 | var $_perl_compatible = false; | |
150 | ||
151 | /** | |
152 | * Whether to use the unicode block detection to speed up processing | |
153 | * | |
154 | * @access private | |
155 | * @var bool | |
156 | */ | |
157 | var $_use_unicode_narrowing = true; | |
158 | ||
159 | /** | |
160 | * stores the result of the clustering operation | |
161 | * | |
162 | * @access private | |
163 | * @var array | |
164 | * @see clusterLanguages() | |
165 | */ | |
166 | var $_clusters; | |
167 | ||
168 | /** | |
169 | * Which type of "language names" are accepted and returned: | |
170 | * | |
171 | * 0 - language name ("english") | |
172 | * 2 - 2-letter ISO 639-1 code ("en") | |
173 | * 3 - 3-letter ISO 639-2 code ("eng") | |
174 | */ | |
175 | var $_name_mode = 0; | |
176 | ||
177 | /** | |
178 | * Constructor | |
179 | * | |
180 | * Will attempt to load the language database. If it fails, you will get | |
181 | * an exception. | |
182 | */ | |
183 | function __construct() | |
184 | { | |
185 | $data = $this->_readdb($this->_db_filename); | |
186 | $this->_checkTrigram($data['trigram']); | |
187 | $this->_lang_db = $data['trigram']; | |
188 | ||
189 | if (isset($data['trigram-unicodemap'])) { | |
190 | $this->_unicode_map = $data['trigram-unicodemap']; | |
191 | } | |
192 | ||
193 | // Not yet implemented: | |
194 | if (isset($data['trigram-clusters'])) { | |
195 | $this->_clusters = $data['trigram-clusters']; | |
196 | } | |
197 | } | |
198 | ||
199 | /** | |
200 | * Returns the path to the location of the database | |
201 | * | |
202 | * @param string $fname File name to load | |
203 | * | |
204 | * @return string expected path to the language model database | |
205 | * @access private | |
206 | */ | |
207 | function _get_data_loc($fname) | |
208 | { | |
209 | if ($fname{0} == '/' || $fname{0} == '.') { | |
210 | // if filename starts with a slash or a dot, treat it as an explicit path | |
211 | // and skip whatever is in $this->_data_dir | |
212 | return $fname; | |
213 | ||
214 | } elseif ($this->_data_dir != '@' . 'data_dir' . '@') { | |
215 | // if the data dir was set by the PEAR installer, use that | |
216 | return $this->_data_dir . '/Text_LanguageDetect/' . $fname; | |
217 | ||
218 | } else { | |
219 | // assume this was just unpacked somewhere | |
220 | // and fall back to the data/ directory next to this file | |
221 | return __DIR__ . '/data/' . $fname; | |
222 | } | |
223 | } | |
224 | ||
225 | /** | |
226 | * Loads the language trigram database from filename | |
227 | * | |
228 | * Trigram database should be a serialize()'d array | |
229 | * | |
230 | * @param string $fname the filename where the data is stored | |
231 | * | |
232 | * @return array the language model data | |
233 | * @throws Text_LanguageDetect_Exception | |
234 | * @access private | |
235 | */ | |
236 | function _readdb($fname) | |
237 | { | |
238 | // finds the correct data dir | |
239 | $fname = $this->_get_data_loc($fname); | |
240 | ||
241 | // input check | |
242 | if (!file_exists($fname)) { | |
243 | throw new Text_LanguageDetect_Exception( | |
244 | 'Language database does not exist: ' . $fname, | |
245 | Text_LanguageDetect_Exception::DB_NOT_FOUND | |
246 | ); | |
247 | } elseif (!is_readable($fname)) { | |
248 | throw new Text_LanguageDetect_Exception( | |
249 | 'Language database is not readable: ' . $fname, | |
250 | Text_LanguageDetect_Exception::DB_NOT_READABLE | |
251 | ); | |
252 | } | |
253 | ||
254 | return unserialize(file_get_contents($fname)); | |
255 | } | |
256 | ||
257 | ||
258 | /** | |
259 | * Checks if this object is ready to detect languages | |
260 | * | |
261 | * @param array $trigram Trigram data from database | |
262 | * | |
263 | * @return void | |
264 | * @access private | |
265 | */ | |
266 | function _checkTrigram($trigram) | |
267 | { | |
268 | if (!is_array($trigram)) { | |
269 | if (ini_get('magic_quotes_runtime')) { | |
270 | throw new Text_LanguageDetect_Exception( | |
271 | 'Error loading database. Try turning magic_quotes_runtime off.', | |
272 | Text_LanguageDetect_Exception::MAGIC_QUOTES | |
273 | ); | |
274 | } | |
275 | throw new Text_LanguageDetect_Exception( | |
276 | 'Language database is not an array.', | |
277 | Text_LanguageDetect_Exception::DB_NOT_ARRAY | |
278 | ); | |
279 | } elseif (empty($trigram)) { | |
280 | throw new Text_LanguageDetect_Exception( | |
281 | 'Language database has no elements.', | |
282 | Text_LanguageDetect_Exception::DB_EMPTY | |
283 | ); | |
284 | } | |
285 | } | |
286 | ||
287 | /** | |
288 | * Omits languages | |
289 | * | |
290 | * Pass this function the name of or an array of names of | |
291 | * languages that you don't want considered | |
292 | * | |
293 | * If you're only expecting a limited set of languages, this can greatly | |
294 | * speed up processing | |
295 | * | |
296 | * @param mixed $omit_list language name or array of names to omit | |
297 | * @param bool $include_only if true will include (rather than | |
298 | * exclude) only those in the list | |
299 | * | |
300 | * @return int number of languages successfully deleted | |
301 | * @throws Text_LanguageDetect_Exception | |
302 | */ | |
303 | public function omitLanguages($omit_list, $include_only = false) | |
304 | { | |
305 | $deleted = 0; | |
306 | ||
307 | $omit_list = $this->_convertFromNameMode($omit_list); | |
308 | ||
309 | if (!$include_only) { | |
310 | // deleting the given languages | |
311 | if (!is_array($omit_list)) { | |
312 | $omit_list = strtolower($omit_list); // case desensitize | |
313 | if (isset($this->_lang_db[$omit_list])) { | |
314 | unset($this->_lang_db[$omit_list]); | |
315 | $deleted++; | |
316 | } | |
317 | } else { | |
318 | foreach ($omit_list as $omit_lang) { | |
319 | if (isset($this->_lang_db[$omit_lang])) { | |
320 | unset($this->_lang_db[$omit_lang]); | |
321 | $deleted++; | |
322 | } | |
323 | } | |
324 | } | |
325 | ||
326 | } else { | |
327 | // deleting all except the given languages | |
328 | if (!is_array($omit_list)) { | |
329 | $omit_list = array($omit_list); | |
330 | } | |
331 | ||
332 | // case desensitize | |
333 | foreach ($omit_list as $key => $omit_lang) { | |
334 | $omit_list[$key] = strtolower($omit_lang); | |
335 | } | |
336 | ||
337 | foreach (array_keys($this->_lang_db) as $lang) { | |
338 | if (!in_array($lang, $omit_list)) { | |
339 | unset($this->_lang_db[$lang]); | |
340 | $deleted++; | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
345 | // reset the cluster cache if the number of languages changes | |
346 | // this will then have to be recalculated | |
347 | if (isset($this->_clusters) && $deleted > 0) { | |
348 | $this->_clusters = null; | |
349 | } | |
350 | ||
351 | return $deleted; | |
352 | } | |
353 | ||
354 | ||
355 | /** | |
356 | * Returns the number of languages that this object can detect | |
357 | * | |
358 | * @access public | |
359 | * @return int the number of languages | |
360 | * @throws Text_LanguageDetect_Exception | |
361 | */ | |
362 | function getLanguageCount() | |
363 | { | |
364 | return count($this->_lang_db); | |
365 | } | |
366 | ||
367 | /** | |
368 | * Checks if the language with the given name exists in the database | |
369 | * | |
370 | * @param mixed $lang Language name or array of language names | |
371 | * | |
372 | * @return bool true if language model exists | |
373 | */ | |
374 | public function languageExists($lang) | |
375 | { | |
376 | $lang = $this->_convertFromNameMode($lang); | |
377 | ||
378 | if (is_string($lang)) { | |
379 | return isset($this->_lang_db[strtolower($lang)]); | |
380 | ||
381 | } elseif (is_array($lang)) { | |
382 | foreach ($lang as $test_lang) { | |
383 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | |
384 | return false; | |
385 | } | |
386 | } | |
387 | return true; | |
388 | ||
389 | } else { | |
390 | throw new Text_LanguageDetect_Exception( | |
391 | 'Unsupported parameter type passed to languageExists()', | |
392 | Text_LanguageDetect_Exception::PARAM_TYPE | |
393 | ); | |
394 | } | |
395 | } | |
396 | ||
397 | /** | |
398 | * Returns the list of detectable languages | |
399 | * | |
400 | * @access public | |
401 | * @return array the names of the languages known to this object | |
402 | * @throws Text_LanguageDetect_Exception | |
403 | */ | |
404 | function getLanguages() | |
405 | { | |
406 | return $this->_convertToNameMode( | |
407 | array_keys($this->_lang_db) | |
408 | ); | |
409 | } | |
410 | ||
411 | /** | |
412 | * Make this object behave like Language::Guess | |
413 | * | |
414 | * @param bool $setting false to turn off perl compatibility | |
415 | * | |
416 | * @return void | |
417 | */ | |
418 | public function setPerlCompatible($setting = true) | |
419 | { | |
420 | if (is_bool($setting)) { // input check | |
421 | $this->_perl_compatible = $setting; | |
422 | ||
423 | if ($setting == true) { | |
424 | $this->_max_score = $this->_threshold; | |
425 | } else { | |
426 | $this->_max_score = 0; | |
427 | } | |
428 | } | |
429 | ||
430 | } | |
431 | ||
432 | /** | |
433 | * Sets the way how language names are accepted and returned. | |
434 | * | |
435 | * @param integer $name_mode One of the following modes: | |
436 | * 0 - language name ("english") | |
437 | * 2 - 2-letter ISO 639-1 code ("en") | |
438 | * 3 - 3-letter ISO 639-2 code ("eng") | |
439 | * | |
440 | * @return void | |
441 | */ | |
442 | function setNameMode($name_mode) | |
443 | { | |
444 | $this->_name_mode = $name_mode; | |
445 | } | |
446 | ||
447 | /** | |
448 | * Whether to use unicode block ranges in detection | |
449 | * | |
450 | * Should speed up most detections if turned on (default is on). In some | |
451 | * circumstances it may be slower, such as for large text samples (> 10K) | |
452 | * in languages that use latin scripts. In other cases it should speed up | |
453 | * detection noticeably. | |
454 | * | |
455 | * @param bool $setting false to turn off | |
456 | * | |
457 | * @return void | |
458 | */ | |
459 | public function useUnicodeBlocks($setting = true) | |
460 | { | |
461 | if (is_bool($setting)) { | |
462 | $this->_use_unicode_narrowing = $setting; | |
463 | } | |
464 | } | |
465 | ||
466 | /** | |
467 | * Converts a piece of text into trigrams | |
468 | * | |
469 | * @param string $text text to convert | |
470 | * | |
471 | * @return array array of trigram frequencies | |
472 | * @access private | |
473 | * @deprecated Superseded by the Text_LanguageDetect_Parser class | |
474 | */ | |
475 | function _trigram($text) | |
476 | { | |
477 | $s = new Text_LanguageDetect_Parser($text); | |
478 | $s->prepareTrigram(); | |
479 | $s->prepareUnicode(false); | |
480 | $s->setPadStart(!$this->_perl_compatible); | |
481 | $s->analyze(); | |
482 | return $s->getTrigramFreqs(); | |
483 | } | |
484 | ||
485 | /** | |
486 | * Converts a set of trigrams from frequencies to ranks | |
487 | * | |
488 | * Thresholds (cuts off) the list at $this->_threshold | |
489 | * | |
490 | * @param array $arr array of trigram | |
491 | * | |
492 | * @return array ranks of trigrams | |
493 | * @access protected | |
494 | */ | |
495 | function _arr_rank($arr) | |
496 | { | |
497 | ||
498 | // sorts alphabetically first as a standard way of breaking rank ties | |
499 | $this->_bub_sort($arr); | |
500 | ||
501 | // below might also work, but seemed to introduce errors in testing | |
502 | //ksort($arr); | |
503 | //asort($arr); | |
504 | ||
505 | $rank = array(); | |
506 | ||
507 | $i = 0; | |
508 | foreach ($arr as $key => $value) { | |
509 | $rank[$key] = $i++; | |
510 | ||
511 | // cut off at a standard threshold | |
512 | if ($i >= $this->_threshold) { | |
513 | break; | |
514 | } | |
515 | } | |
516 | ||
517 | return $rank; | |
518 | } | |
519 | ||
520 | /** | |
521 | * Sorts an array by value breaking ties alphabetically | |
522 | * | |
523 | * @param array &$arr the array to sort | |
524 | * | |
525 | * @return void | |
526 | * @access private | |
527 | */ | |
528 | function _bub_sort(&$arr) | |
529 | { | |
530 | // should do the same as this perl statement: | |
531 | // sort { $trigrams{$b} == $trigrams{$a} | |
532 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | |
533 | ||
534 | // needs to sort by both key and value at once | |
535 | // using the key to break ties for the value | |
536 | ||
537 | // converts array into an array of arrays of each key and value | |
538 | // may be a better way of doing this | |
539 | $combined = array(); | |
540 | ||
541 | foreach ($arr as $key => $value) { | |
542 | $combined[] = array($key, $value); | |
543 | } | |
544 | ||
545 | usort($combined, array($this, '_sort_func')); | |
546 | ||
547 | $replacement = array(); | |
548 | foreach ($combined as $key => $value) { | |
549 | list($new_key, $new_value) = $value; | |
550 | $replacement[$new_key] = $new_value; | |
551 | } | |
552 | ||
553 | $arr = $replacement; | |
554 | } | |
555 | ||
556 | /** | |
557 | * Sort function used by bubble sort | |
558 | * | |
559 | * Callback function for usort(). | |
560 | * | |
561 | * @param array $a first param passed by usort() | |
562 | * @param array $b second param passed by usort() | |
563 | * | |
564 | * @return int -1 if $a has the greater value, 1 if smaller; on equal values, strcmp() of the keys | |
565 | * @see _bub_sort() | |
566 | * @access private | |
567 | */ | |
568 | function _sort_func($a, $b) | |
569 | { | |
570 | // each is actually a key/value pair, so that it can compare using both | |
571 | list($a_key, $a_value) = $a; | |
572 | list($b_key, $b_value) = $b; | |
573 | ||
574 | if ($a_value == $b_value) { | |
575 | // if the values are the same, break ties using the key | |
576 | return strcmp($a_key, $b_key); | |
577 | ||
578 | } else { | |
579 | // if not, just sort normally | |
580 | if ($a_value > $b_value) { | |
581 | return -1; | |
582 | } else { | |
583 | return 1; | |
584 | } | |
585 | } | |
586 | ||
587 | // 0 should not be possible because keys must be unique | |
588 | } | |
589 | ||
590 | /** | |
591 | * Calculates a linear rank-order distance statistic between two sets of | |
592 | * ranked trigrams | |
593 | * | |
594 | * Sums the differences in rank for each trigram. If the trigram does not | |
595 | * appear in both, consider it a difference of $this->_threshold. | |
596 | * | |
597 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | |
598 | * its simplicity it has been shown to be highly accurate for language | |
599 | * identification tasks. | |
600 | * | |
601 | * @param array $arr1 the reference set of trigram ranks | |
602 | * @param array $arr2 the target set of trigram ranks | |
603 | * | |
604 | * @return int the sum of the differences between the ranks of | |
605 | * the two trigram sets | |
606 | * @access private | |
607 | */ | |
608 | function _distance($arr1, $arr2) | |
609 | { | |
610 | $sumdist = 0; | |
611 | ||
612 | foreach ($arr2 as $key => $value) { | |
613 | if (isset($arr1[$key])) { | |
614 | $distance = abs($value - $arr1[$key]); | |
615 | } else { | |
616 | // $this->_threshold sets the maximum possible distance value | |
617 | // for any one pair of trigrams | |
618 | $distance = $this->_threshold; | |
619 | } | |
620 | $sumdist += $distance; | |
621 | } | |
622 | ||
623 | return $sumdist; | |
624 | ||
625 | // todo: there are other distance statistics to try, e.g. relative | |
626 | // entropy, but they're probably more costly to compute | |
627 | } | |
628 | ||
629 | /** | |
630 | * Normalizes the score returned by _distance() | |
631 | * | |
632 | * Different if perl compatible or not | |
633 | * | |
634 | * @param int $score the score from _distance() | |
635 | * @param int $base_count the number of trigrams being considered | |
636 | * | |
637 | * @return float the normalized score | |
638 | * @see _distance() | |
639 | * @access private | |
640 | */ | |
641 | function _normalize_score($score, $base_count = null) | |
642 | { | |
643 | if ($base_count === null) { | |
644 | $base_count = $this->_threshold; | |
645 | } | |
646 | ||
647 | if (!$this->_perl_compatible) { | |
648 | return 1 - ($score / $base_count / $this->_threshold); | |
649 | } else { | |
650 | return floor($score / $base_count); | |
651 | } | |
652 | } | |
653 | ||
654 | ||
655 | /** | |
656 | * Detects the closeness of a sample of text to the known languages | |
657 | * | |
658 | * Calculates the statistical difference between the text and | |
659 | * the trigrams for each language, normalizes the score then | |
660 | * returns results for all languages in sorted order | |
661 | * | |
662 | * If perl compatible, the score is 300-0, 0 being most similar. | |
663 | * Otherwise, it's 0-1 with 1 being most similar. | |
664 | * | |
665 | * The $sample text should be at least a few sentences in length; | |
666 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | |
667 | * is present it will try to detect and convert. However, experience has | |
668 | * shown that mb_detect_encoding() *does not work very well* with at least | |
669 | * some types of encoding. | |
670 | * | |
671 | * @param string $sample a sample of text to compare. | |
672 | * @param int $limit if specified, return an array of the most likely | |
673 | * $limit languages and their scores. | |
674 | * | |
675 | * @return mixed sorted array of language scores, blank array if no | |
676 | * useable text was found | |
677 | * @see _distance() | |
678 | * @throws Text_LanguageDetect_Exception | |
679 | */ | |
680 | public function detect($sample, $limit = 0) | |
681 | { | |
682 | // input check | |
683 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | |
684 | return array(); | |
685 | } | |
686 | ||
687 | // check char encoding | |
688 | // (only if mbstring extension is compiled and PHP > 4.0.6) | |
689 | if (function_exists('mb_detect_encoding') | |
690 | && function_exists('mb_convert_encoding') | |
691 | ) { | |
692 | // mb_detect_encoding isn't very reliable, to say the least | |
693 | // detection should still work with a sufficient sample | |
694 | // of ascii characters | |
695 | $encoding = mb_detect_encoding($sample); | |
696 | ||
697 | // mb_detect_encoding() will return FALSE if detection fails | |
698 | // don't attempt conversion if that's the case | |
699 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' | |
700 | && $encoding !== false | |
701 | ) { | |
702 | // verify the encoding exists in mb_list_encodings | |
703 | if (in_array($encoding, mb_list_encodings())) { | |
704 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | |
705 | } | |
706 | } | |
707 | } | |
708 | ||
709 | $sample_obj = new Text_LanguageDetect_Parser($sample); | |
710 | $sample_obj->prepareTrigram(); | |
711 | if ($this->_use_unicode_narrowing) { | |
712 | $sample_obj->prepareUnicode(); | |
713 | } | |
714 | $sample_obj->setPadStart(!$this->_perl_compatible); | |
715 | $sample_obj->analyze(); | |
716 | ||
717 | $trigram_freqs =& $sample_obj->getTrigramRanks(); | |
718 | $trigram_count = count($trigram_freqs); | |
719 | ||
720 | if ($trigram_count == 0) { | |
721 | return array(); | |
722 | } | |
723 | ||
724 | $scores = array(); | |
725 | ||
726 | // use unicode block detection to narrow down the possibilities | |
727 | if ($this->_use_unicode_narrowing) { | |
728 | $blocks =& $sample_obj->getUnicodeBlocks(); | |
729 | ||
730 | if (is_array($blocks)) { | |
731 | $present_blocks = array_keys($blocks); | |
732 | } else { | |
733 | throw new Text_LanguageDetect_Exception( | |
734 | 'Error during block detection', | |
735 | Text_LanguageDetect_Exception::BLOCK_DETECTION | |
736 | ); | |
737 | } | |
738 | ||
739 | $possible_langs = array(); | |
740 | ||
741 | foreach ($present_blocks as $blockname) { | |
742 | if (isset($this->_unicode_map[$blockname])) { | |
743 | ||
744 | $possible_langs = array_merge( | |
745 | $possible_langs, | |
746 | array_keys($this->_unicode_map[$blockname]) | |
747 | ); | |
748 | ||
749 | // todo: faster way to do this? | |
750 | } | |
751 | } | |
752 | ||
753 | // could also try an intersect operation rather than a union | |
754 | // in other words, choose languages whose trigrams contain | |
755 | // ALL of the unicode blocks found in this sample | |
756 | // would improve speed but would be completely thrown off by an | |
757 | // unexpected character, like an umlaut appearing in english text | |
758 | ||
759 | $possible_langs = array_intersect( | |
760 | array_keys($this->_lang_db), | |
761 | array_unique($possible_langs) | |
762 | ); | |
763 | ||
764 | // needs to intersect it with the keys of _lang_db in case | |
765 | // languages have been omitted | |
766 | ||
767 | } else { | |
768 | // or just try 'em all | |
769 | $possible_langs = array_keys($this->_lang_db); | |
770 | } | |
771 | ||
772 | ||
773 | foreach ($possible_langs as $lang) { | |
774 | $scores[$lang] = $this->_normalize_score( | |
775 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | |
776 | $trigram_count | |
777 | ); | |
778 | } | |
779 | ||
780 | unset($sample_obj); | |
781 | ||
782 | if ($this->_perl_compatible) { | |
783 | asort($scores); | |
784 | } else { | |
785 | arsort($scores); | |
786 | } | |
787 | ||
788 | // todo: drop languages with a score of $this->_max_score? | |
789 | ||
790 | // limit the number of returned scores | |
791 | if ($limit && is_numeric($limit)) { | |
792 | $limited_scores = array(); | |
793 | ||
794 | $i = 0; | |
795 | foreach ($scores as $key => $value) { | |
796 | if ($i++ >= $limit) { | |
797 | break; | |
798 | } | |
799 | ||
800 | $limited_scores[$key] = $value; | |
801 | } | |
802 | ||
803 | return $this->_convertToNameMode($limited_scores, true); | |
804 | } else { | |
805 | return $this->_convertToNameMode($scores, true); | |
806 | } | |
807 | } | |
808 | ||
809 | /** | |
810 | * Returns only the most similar language to the text sample | |
811 | * | |
812 | * Calls $this->detect() and returns only the top result | |
813 | * | |
814 | * @param string $sample text to detect the language of | |
815 | * | |
816 | * @return string the name of the most likely language | |
817 | * or null if no language is similar | |
818 | * @see detect() | |
819 | * @throws Text_LanguageDetect_Exception | |
820 | */ | |
821 | public function detectSimple($sample) | |
822 | { | |
823 | $scores = $this->detect($sample, 1); | |
824 | ||
825 | // if top language has the maximum possible score, | |
826 | // then the top score will have been picked at random | |
827 | if (!is_array($scores) || empty($scores) | |
828 | || current($scores) == $this->_max_score | |
829 | ) { | |
830 | return null; | |
831 | } else { | |
832 | return key($scores); | |
833 | } | |
834 | } | |
835 | ||
836 | /** | |
837 | * Returns an array containing the most similar language and a confidence | |
838 | * rating | |
839 | * | |
840 | * Confidence is a simple measure calculated from the similarity score | |
841 | * minus the similarity score from the next most similar language | |
842 | * divided by the highest possible score. Languages that have closely | |
843 | * related cousins (e.g. Norwegian and Danish) should generally have lower | |
844 | * confidence scores. | |
845 | * | |
846 | * The similarity score answers the question "How likely is the text the | |
847 | * returned language regardless of the other languages considered?" The | |
848 | * confidence score is one way of answering the question "how likely is the | |
849 | * text the detected language relative to the rest of the language model | |
850 | * set?" | |
851 | * | |
852 | * To see how similar languages are a priori, see languageSimilarity() | |
853 | * | |
854 | * @param string $sample text for which language will be detected | |
855 | * | |
856 | * @return array most similar language, score and confidence rating | |
857 | * or null if no language is similar | |
858 | * @see detect() | |
859 | * @throws Text_LanguageDetect_Exception | |
860 | */ | |
861 | public function detectConfidence($sample) | |
862 | { | |
863 | $scores = $this->detect($sample, 2); | |
864 | ||
865 | // if most similar language has the max score, it | |
866 | // will have been picked at random | |
867 | if (!is_array($scores) || empty($scores) | |
868 | || current($scores) == $this->_max_score | |
869 | ) { | |
870 | return null; | |
871 | } | |
872 | ||
873 | $arr['language'] = key($scores); | |
874 | $arr['similarity'] = current($scores); | |
875 | if (next($scores) !== false) { // if false then no next element | |
876 | // the goal is to return a higher value if the distance between | |
877 | // the similarity of the first score and the second score is high | |
878 | ||
879 | if ($this->_perl_compatible) { | |
880 | $arr['confidence'] = (current($scores) - $arr['similarity']) | |
881 | / $this->_max_score; | |
882 | ||
883 | } else { | |
884 | $arr['confidence'] = $arr['similarity'] - current($scores); | |
885 | ||
886 | } | |
887 | ||
888 | } else { | |
889 | $arr['confidence'] = null; | |
890 | } | |
891 | ||
892 | return $arr; | |
893 | } | |
894 | ||
895 | /** | |
896 | * Returns the distribution of unicode blocks in a given utf8 string | |
897 | * | |
898 | * For the block name of a single char, use unicodeBlockName() | |
899 | * | |
900 | * @param string $str input string. Must be ascii or utf8 | |
901 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | |
902 | * non-printing characters. Includes spaces, | |
903 | * newlines and common punctuation characters. | |
904 | * | |
905 | * @return array | |
906 | * @throws Text_LanguageDetect_Exception | |
907 | */ | |
908 | public function detectUnicodeBlocks($str, $skip_symbols) | |
909 | { | |
910 | $skip_symbols = (bool)$skip_symbols; | |
911 | $str = (string)$str; | |
912 | ||
913 | $sample_obj = new Text_LanguageDetect_Parser($str); | |
914 | $sample_obj->prepareUnicode(); | |
915 | $sample_obj->prepareTrigram(false); | |
916 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | |
917 | $sample_obj->analyze(); | |
918 | $blocks = $sample_obj->getUnicodeBlocks(); | |
919 | unset($sample_obj); | |
920 | return $blocks; | |
921 | } | |
922 | ||
923 | /** | |
924 | * Returns the block name for a given unicode value | |
925 | * | |
926 | * If passed a string, will assume it is being passed a UTF8-formatted | |
927 | * character and will automatically convert. Otherwise it will assume it | |
928 | * is being passed a numeric unicode value. | |
929 | * | |
930 | * Make sure input is of the correct type! | |
931 | * | |
932 | * @param mixed $unicode unicode value or utf8 char | |
933 | * | |
934 | * @return mixed the block name string or false if not found | |
935 | * @throws Text_LanguageDetect_Exception | |
936 | */ | |
public function unicodeBlockName($unicode)
{
    if (is_string($unicode)) {
        // A string is treated as exactly one UTF-8 encoded character.
        // The empty string must be rejected as well: it would otherwise
        // decode to null, and null compares as "<=" the end of the first
        // block, silently yielding that block's name.
        if (self::utf8strlen($unicode) !== 1) {
            throw new Text_LanguageDetect_Exception(
                'Pass a single char only to this method',
                Text_LanguageDetect_Exception::PARAM_TYPE
            );
        }

        $unicode = $this->_utf8char2unicode($unicode);

        // _utf8char2unicode() yields null for byte lengths it cannot
        // decode; that is not a usable single character either.
        if ($unicode === null) {
            throw new Text_LanguageDetect_Exception(
                'Pass a single char only to this method',
                Text_LanguageDetect_Exception::PARAM_TYPE
            );
        }

    } elseif (!is_int($unicode)) {
        throw new Text_LanguageDetect_Exception(
            'Input must be of type string or int.',
            Text_LanguageDetect_Exception::PARAM_TYPE
        );
    }

    $blocks = $this->_read_unicode_block_db();
    $result = $this->_unicode_block_name($unicode, $blocks);

    // -1 is the "not found" sentinel; any other result is a block entry
    // whose third element is the block name.
    if ($result === -1) {
        return false;
    }

    return $result[2];
}
966 | ||
967 | /** | |
968 | * Searches the unicode block database | |
969 | * | |
970 | * Returns the block name for a given unicode value. unicodeBlockName() is | |
971 | * the public interface for this function, which does input checks which | |
972 | * this function omits for speed. | |
973 | * | |
974 | * @param int $unicode the unicode value | |
975 | * @param array $blocks the block database | |
976 | * @param int $block_count the number of defined blocks in the database | |
977 | * | |
978 | * @return mixed The matching block entry (array), -1 if it failed | |
979 | * @see unicodeBlockName() | |
980 | * @access protected | |
981 | */ | |
function _unicode_block_name($unicode, $blocks, $block_count = -1)
{
    // Block layout reference:
    // http://www.unicode.org/Public/UNIDATA/Blocks.txt

    // Fast path: anything up to the end of the first block is answered
    // without searching, on the assumption that ascii input dominates.
    if ($unicode <= $blocks[0][1]) {
        return $blocks[0];
    }

    // Callers may pass the block count so count() is not run per lookup.
    $upper = ($block_count != -1) ? $block_count - 1 : count($blocks) - 1;

    // Index 0 was handled by the fast path above.
    $lower = 1;

    // Binary search over entries of the form array(start, end, ...).
    while ($lower <= $upper) {
        $probe = (int)floor(($lower + $upper) / 2);

        if ($unicode < $blocks[$probe][0]) {
            // value lies before this block
            $upper = $probe - 1;
            continue;
        }

        if ($unicode > $blocks[$probe][1]) {
            // value lies after this block
            $lower = $probe + 1;
            continue;
        }

        // start <= value <= end: this is the block
        return $blocks[$probe];
    }

    // No block contains the value.
    // todo: differentiate when it's out of range or when it falls
    // into an unassigned range?
    return -1;
}
1027 | ||
1028 | /** | |
1029 | * Brings up the unicode block database | |
1030 | * | |
1031 | * @return array the database of unicode block definitions | |
1032 | * @throws Text_LanguageDetect_Exception | |
1033 | * @access protected | |
1034 | */ | |
function _read_unicode_block_db()
{
    // The block definitions never change, so they are kept in a static
    // local and shared by every instance of this class.
    static $cache;

    if (isset($cache)) {
        return $cache;
    }

    // First call (or nothing cached yet): load the database from disk.
    $cache = $this->_readdb($this->_unicode_db_filename);

    return $cache;
}
1048 | ||
1049 | /** | |
1050 | * Calculate the similarities between the language models | |
1051 | * | |
1052 | * Use this function to see how similar languages are to each other. | |
1053 | * | |
1054 | * If passed 2 language names, will return just those languages compared. | |
1055 | * If passed 1 language name, will return that language compared to | |
1056 | * all others. | |
1057 | * If passed none, will return an array of every language model compared | |
1058 | * to every other one. | |
1059 | * | |
1060 | * @param string $lang1 the name of the first language to be compared | |
1061 | * @param string $lang2 the name of the second language to be compared | |
1062 | * | |
1063 | * @return array scores of every language compared | |
1064 | * or the score of just the provided languages | |
1065 | * or null if one of the supplied languages does not exist | |
1066 | * @throws Text_LanguageDetect_Exception | |
1067 | */ | |
public function languageSimilarity($lang1 = null, $lang2 = null)
{
    // Only translate names that were actually supplied; an omitted (null)
    // argument means "no language given" and must not be run through the
    // name-mode conversion.
    if ($lang1 !== null) {
        $lang1 = $this->_convertFromNameMode($lang1);
    }
    if ($lang2 !== null) {
        $lang2 = $this->_convertFromNameMode($lang2);
    }

    if ($lang1 == null) {
        // No language given: compare every model with every other model.
        $matrix = array();
        $names  = array_keys($this->_lang_db);

        foreach ($names as $first) {
            foreach ($names as $second) {
                // skip comparing languages to themselves
                if ($first == $second) {
                    continue;
                }

                if (isset($matrix[$second][$first])) {
                    // the mirrored pair was already calculated, reuse it
                    $matrix[$first][$second] = $matrix[$second][$first];
                    continue;
                }

                $matrix[$first][$second] = $this->_normalize_score(
                    $this->_distance(
                        $this->_lang_db[$first],
                        $this->_lang_db[$second]
                    )
                );
            }
        }

        return $matrix;
    }

    // Model keys are lower case, so normalise before looking anything up.
    $lang1 = strtolower($lang1);
    if (!isset($this->_lang_db[$lang1])) {
        return null;
    }

    if ($lang2 == null) {
        // One language given: compare it to all the others.
        $scores = array();
        foreach ($this->_lang_db as $name => $model) {
            if ($name != $lang1) {
                // don't compare a language to itself
                $scores[$name] = $this->_normalize_score(
                    $this->_distance($this->_lang_db[$lang1], $model)
                );
            }
        }
        asort($scores);

        return $scores;
    }

    // Two languages given. The second name has to be lower-cased *before*
    // the existence check, exactly like the first one; checking first made
    // a mixed-case second name look like an unknown language.
    $lang2 = strtolower($lang2);
    if (!isset($this->_lang_db[$lang2])) {
        return null;
    }

    // compare just these two languages
    return $this->_normalize_score(
        $this->_distance(
            $this->_lang_db[$lang1],
            $this->_lang_db[$lang2]
        )
    );
}
1143 | ||
1144 | /** | |
1145 | * Cluster known languages according to languageSimilarity() | |
1146 | * | |
1147 | * WARNING: this method is EXPERIMENTAL. It is not recommended for common | |
1148 | * use, and it may disappear or its functionality may change in future | |
1149 | * releases without notice. | |
1150 | * | |
1151 | * Uses a nearest neighbor technique to generate the maximum possible | |
1152 | * number of dendrograms from the similarity data. | |
1153 | * | |
1154 | * @access public | |
1155 | * @return array language cluster data | |
1156 | * @throws Text_LanguageDetect_Exception | |
1157 | * @see languageSimilarity() | |
1158 | * @deprecated this function will eventually be removed and placed into | |
1159 | * the model generation class | |
1160 | */ | |
function clusterLanguages()
{
    // todo: set the maximum number of clusters

    // Clustering is cached on the instance; reuse an earlier result.
    if (isset($this->_clusters)) {
        return $this->_clusters;
    }

    $langs = array_keys($this->_lang_db);

    // Pairwise scores: $arr[a][b] is the similarity of model a to model b.
    $arr = $this->languageSimilarity();

    sort($langs);

    // Every listed language must have a model.
    foreach ($langs as $lang) {
        if (isset($this->_lang_db[$lang])) {
            continue;
        }

        throw new Text_LanguageDetect_Exception(
            "missing $lang!",
            Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
        );
    }

    // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
    // Re-key the list by name so forks can be removed and added by key.
    foreach ($langs as $position => $name) {
        $langs[$name] = $name;
        unset($langs[$position]);
    }

    $really_map  = array();
    $result_data = array();

    // Merge the two most similar open forks per pass until only two are
    // left; the pass counter is capped at 200.
    $i = 0;
    while (count($langs) > 2 && $i++ < 200) {
        // Find the pair of open forks with the highest similarity.
        $best_score = -1;
        $best_a     = '';
        $best_b     = '';
        foreach ($langs as $row_lang) {
            foreach ($langs as $col_lang) {
                if ($row_lang == $col_lang) {
                    continue;
                }

                if ($arr[$row_lang][$col_lang] > $best_score) {
                    $best_score = $arr[$row_lang][$col_lang];
                    $best_a     = $row_lang;
                    $best_b     = $col_lang;
                }
            }
        }

        if (!$best_a) {
            // should not ever happen
            throw new Text_LanguageDetect_Exception(
                "no highest key? (step: $i)",
                Text_LanguageDetect_Exception::NO_HIGHEST_KEY
            );
        }

        if ($best_score == 0) {
            // languages are perfectly dissimilar
            break;
        }

        // $best_a and $best_b are the most similar pair. The member whose
        // row sums higher (i.e. is more similar to the rest of the field)
        // lends its scores to the merged group.
        // todo: could try averaging or "centroid" method instead
        // seems like that might make more sense
        // actually nearest neighbor may be better for binary searching
        $sum1 = array_sum($arr[$best_a]);
        $sum2 = array_sum($arr[$best_b]);

        // for "Complete Linkage"/"furthest neighbor"
        // sign should be <
        // for "Single Linkage"/"nearest neighbor" method
        // sign should be >
        // results seem to be pretty much the same with either method
        if ($sum1 > $sum2) {
            $replaceme = $best_a;
            $deleteme  = $best_b;
        } else {
            $replaceme = $best_b;
            $deleteme  = $best_a;
        }

        $newkey = $replaceme . ':' . $deleteme;

        // Follow earlier merges to find the real language that stands in
        // for $replaceme, and record it for the new fork.
        $really_lang = $replaceme;
        while (isset($really_map[$really_lang])) {
            $really_lang = $really_map[$really_lang];
        }
        $really_map[$newkey] = $really_lang;

        // Rewrite the matrix: $replaceme becomes $newkey, $deleteme goes.
        // The foreach walks the rows as they were on entry while $arr
        // itself is modified.
        // NOTE(review): when $row_key is $replaceme and $col_key is
        // $deleteme, the value is copied to $arr[$newkey][$deleteme]
        // before the delete step runs, so that entry stays in the new row
        // and is counted by later array_sum() calls - confirm whether
        // that is intended.
        foreach ($arr as $row_key => $row) {
            foreach ($row as $col_key => $unused) {
                if ($col_key == $replaceme) {
                    // column $replaceme is renamed to $newkey
                    $arr[$row_key][$newkey] = $arr[$row_key][$col_key];
                    unset($arr[$row_key][$col_key]);
                }

                if ($row_key == $replaceme) {
                    // row $replaceme is moved to row $newkey
                    $arr[$newkey][$col_key] = $arr[$row_key][$col_key];
                    unset($arr[$row_key][$col_key]);
                }

                if ($row_key == $deleteme || $col_key == $deleteme) {
                    // anything involving $deleteme is dropped
                    unset($arr[$row_key][$col_key]);
                }
            }
        }

        // The merged pair leaves the open list, the new fork joins it.
        unset($langs[$best_a]);
        unset($langs[$best_b]);
        $langs[$newkey] = $newkey;

        // some of these may be overkill
        $result_data[$newkey] = array(
            'newkey'   => $newkey,
            'count'    => $i,
            'diff'     => abs($sum1 - $sum2),
            'score'    => $best_score,
            'bestfit'  => $replaceme,
            'otherfit' => $deleteme,
            'really'   => $really_lang,
        );
    }

    $return_val = array(
        // the top level of clusters
        // clusters that are mutually exclusive
        // or specified by a specific maximum
        'open_forks' => $langs,

        // data for each split
        'fork_data' => $result_data,

        // which cluster is really which language
        // using the nearest neighbor technique, the cluster
        // inherits all of the properties of its most-similar member
        // this keeps track
        'name_map' => $really_map,
    );

    // saves the result in the object
    $this->_clusters = $return_val;

    return $return_val;
}
1323 | ||
1324 | ||
1325 | /** | |
1326 | * Perform an intelligent detection based on clusterLanguages() | |
1327 | * | |
1328 | * WARNING: this method is EXPERIMENTAL. It is not recommended for common | |
1329 | * use, and it may disappear or its functionality may change in future | |
1330 | * releases without notice. | |
1331 | * | |
1332 | * This compares the sample text to the top level of clusters. If the | |
1333 | * sample is similar to the cluster it will drop down and compare it to the | |
1334 | * languages in the cluster, and so on until it hits a leaf node. | |
1335 | * | |
1336 | * this should find the language in considerably fewer compares | |
1337 | * (the equivalent of a binary search), however clusterLanguages() is costly | |
1338 | * and the loss of accuracy from this technique is significant. | |
1339 | * | |
1340 | * This method may need to be 'fuzzier' in order to become more accurate. | |
1341 | * | |
1342 | * This function could be more useful if the universe of possible languages | |
1343 | * was very large, however in such cases some method of Bayesian inference | |
1344 | * might be more helpful. | |
1345 | * | |
1346 | * @param string $str input string | |
1347 | * | |
1348 | * @return array language scores (only those compared) | |
1349 | * @throws Text_LanguageDetect_Exception | |
1350 | * @see clusterLanguages() | |
1351 | */ | |
public function clusteredSearch($str)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    $result = $this->clusterLanguages();

    $dendogram_start = $result['open_forks'];
    $dendogram_data  = $result['fork_data'];
    $dendogram_alias = $result['name_map'];

    $sample_obj = new Text_LanguageDetect_Parser($str);
    $sample_obj->prepareTrigram();
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();
    $sample_result = $sample_obj->getTrigramRanks();
    $sample_count  = count($sample_result);

    // input check
    if ($sample_count == 0) {
        return array();
    }

    // Score the sample against every top-level fork. A fork that is a
    // merged cluster is scored through the real language recorded for it
    // in the alias map.
    $scores = array();
    foreach ($dendogram_start as $lang) {
        if (isset($dendogram_alias[$lang])) {
            $lang_key = $dendogram_alias[$lang];
        } else {
            $lang_key = $lang;
        }

        $scores[$lang] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang_key], $sample_result),
            $sample_count
        );
    }

    // Nothing to descend into when there are no top-level forks.
    if (count($scores) == 0) {
        return array();
    }

    // Put the best fork first: in perl-compatible mode the lowest score
    // sorts first, otherwise the highest.
    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of starting forks, this one is the most similar to the sample
    $cur_key = key($scores);

    // Walk down the tree: at every fork score both branches and follow
    // the better one until a leaf (a key without fork data) is reached.
    while (isset($dendogram_data[$cur_key])) {
        $lang1 = $dendogram_data[$cur_key]['bestfit'];
        $lang2 = $dendogram_data[$cur_key]['otherfit'];

        foreach (array($lang1, $lang2) as $lang) {
            if (isset($dendogram_alias[$lang])) {
                $lang_key = $dendogram_alias[$lang];
            } else {
                $lang_key = $lang;
            }

            $scores[$lang] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang_key], $sample_result),
                $sample_count
            );

            //todo: does not need to do same comparison again
        }

        // "Better" follows the same direction as the sort above and the
        // reordering below: lower wins in perl-compatible mode, higher
        // wins otherwise. Ties go to the second branch.
        if ($this->_perl_compatible) {
            $first_wins = $scores[$lang1] < $scores[$lang2];
        } else {
            $first_wins = $scores[$lang1] > $scores[$lang2];
        }

        $cur_key = $first_wins ? $lang1 : $lang2;
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree

    // but sometimes the last item is only the second best; if so, move
    // the second-to-last item to the end. This needs at least two entries,
    // otherwise prev() runs off the array.
    if (count($scores) > 1) {
        $last_score = end($scores);
        $prev_score = prev($scores); // pointer now on the second-to-last

        if ($this->_perl_compatible) {
            $misordered = $last_score > $prev_score;
        } else {
            $misordered = $last_score < $prev_score;
        }

        if ($misordered) {
            $real_last_score = current($scores);
            $real_last_key   = key($scores);

            // re-append the second-to-last item so it becomes the last
            unset($scores[$real_last_key]);
            $scores[$real_last_key] = $real_last_score;
        }
    }

    if (!$this->_perl_compatible) {
        // keep the keys while reversing the order
        $scores = array_reverse($scores, true);
    }

    return $scores;
}
1468 | ||
1469 | /** | |
1470 | * utf8-safe strlen() | |
1471 | * | |
1472 | * Returns the numbers of characters (not bytes) in a utf8 string | |
1473 | * | |
1474 | * @param string $str string to get the length of | |
1475 | * | |
1476 | * @return int number of chars | |
1477 | */ | |
public static function utf8strlen($str)
{
    // Every UTF-8 character consists of exactly one lead byte, optionally
    // followed by continuation bytes of the form 10xxxxxx (0x80-0xBF).
    // Dropping the continuation bytes therefore leaves one byte per
    // character. The pattern deliberately has no /u modifier so it works
    // byte-wise.
    //
    // This replaces strlen(utf8_decode($str)), which gives the same count
    // for well-formed input but relies on utf8_decode(), deprecated as of
    // PHP 8.2. Malformed input may be counted differently: a stray
    // continuation byte is not counted here.
    $lead_bytes = preg_replace('/[\x80-\xBF]/', '', (string)$str);

    return strlen((string)$lead_bytes);
}
1487 | ||
1488 | /** | |
1489 | * Returns the unicode value of a utf8 char | |
1490 | * | |
1491 | * @param string $char a utf8 (possibly multi-byte) char | |
1492 | * | |
1493 | * @return int unicode value | |
1494 | * @access protected | |
1495 | * @link http://en.wikipedia.org/wiki/UTF-8 | |
1496 | */ | |
function _utf8char2unicode($char)
{
    // Bytes are read with $char[n]; the older $char{n} form is deprecated
    // in PHP 7.4 and a parse error from PHP 8.0 on.

    // strlen() here will actually get the binary length of a single char
    switch (strlen($char)) {
    case 1:
        // normal ASCII-7 byte
        // 0xxxxxxx --> 0xxxxxxx
        return ord($char[0]);

    case 2:
        // 2 byte unicode
        // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
        $z = (ord($char[0]) & 0x000001F) << 6;
        $x = (ord($char[1]) & 0x0000003F);
        return ($z | $x);

    case 3:
        // 3 byte unicode
        // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
        $z  = (ord($char[0]) & 0x0000000F) << 12;
        $x1 = (ord($char[1]) & 0x0000003F) << 6;
        $x2 = (ord($char[2]) & 0x0000003F);
        return ($z | $x1 | $x2);

    case 4:
        // 4 byte unicode
        // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
        // 000zzzzz xxxxxxxx xxxxxxxx
        $z1 = (ord($char[0]) & 0x00000007) << 18;
        $z2 = (ord($char[1]) & 0x0000003F) << 12;
        $x1 = (ord($char[2]) & 0x0000003F) << 6;
        $x2 = (ord($char[3]) & 0x0000003F);
        return ($z1 | $z2 | $x1 | $x2);
    }

    // Any other byte length (empty string, or more than four bytes) is
    // not decoded; as before, the result is null.
    return null;
}
1532 | ||
1533 | /** | |
1534 | * utf8-safe fast character iterator | |
1535 | * | |
1536 | * Will get the next character starting from $counter, which will then be | |
1537 | * incremented. If a multi-byte char the bytes will be concatenated and | |
1538 | * $counter will be incremented by the number of bytes in the char. | |
1539 | * | |
1540 | * @param string $str the string being iterated over | |
1541 | * @param int &$counter the iterator, will increment by reference | |
1542 | * @param bool $special_convert whether to do special conversions | |
1543 | * | |
1544 | * @return char the next (possibly multi-byte) char from $counter | |
1545 | * @access private | |
1546 | */ | |
static function _next_char($str, &$counter, $special_convert = false)
{
    // Bytes are read with $str[n]; the older $str{n} form is deprecated
    // in PHP 7.4 and a parse error from PHP 8.0 on.
    $char = $str[$counter++];
    $ord  = ord($char);

    // for a description of the utf8 system see
    // http://www.phpclasses.org/browse/file/5131.html

    // normal ascii one byte char
    if ($ord <= 127) {
        // special conversions needed for this package
        // (that only apply to regular ascii characters)
        // lower case, and convert all non-alphanumeric characters
        // other than "'" to space
        if ($special_convert && $char != ' ' && $char != "'") {
            if ($ord >= 65 && $ord <= 90) { // A-Z
                $char = chr($ord + 32); // lower case
            } elseif ($ord < 97 || $ord > 122) { // NOT a-z
                $char = ' '; // convert to space
            }
        }

        return $char;

    } elseif ($ord >> 5 == 6) { // two-byte char (lead byte 110xxxxx)
        $nextchar = $str[$counter++]; // get next byte

        // lower-casing of non-ascii characters is still incomplete

        if ($special_convert) {
            if ($ord == 195) {
                // lower case latin accented characters
                $nextord     = ord($nextchar);
                $nextord_adj = $nextord + 64;
                // for a reference, see
                // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html

                // À - Þ but not ×
                if ($nextord_adj >= 192
                    && $nextord_adj <= 222
                    && $nextord_adj != 215
                ) {
                    $nextchar = chr($nextord + 32);
                }

            } elseif ($ord == 208) {
                // lower case cyrillic alphabet
                $nextord = ord($nextchar);

                if ($nextord >= 144 && $nextord <= 159) {
                    // A - Pe: the lower case form keeps the lead byte
                    $nextchar = chr($nextord + 32);

                } elseif ($nextord >= 160 && $nextord <= 175) {
                    // Er - Ya: the lower case form moves to lead byte 209
                    $char     = chr(209); // == $ord++
                    $nextchar = chr($nextord - 32);
                }
            }
        }

        // tag on next byte
        return $char . $nextchar;

    } elseif ($ord >> 4 == 14) { // three-byte char (lead byte 1110xxxx)

        // tag on next 2 bytes
        return $char . $str[$counter++] . $str[$counter++];

    } elseif ($ord >> 3 == 30) { // four-byte char (lead byte 11110xxx)

        // tag on next 3 bytes
        return $char . $str[$counter++] . $str[$counter++] . $str[$counter++];

    } else {
        // Not a valid lead byte. As before nothing is returned (null);
        // $counter has still been advanced by one.
    }
}
1626 | ||
1627 | /** | |
1628 | * Converts an $language input parameter from the configured mode | |
1629 | * to the language name that is used internally. | |
1630 | * | |
1631 | * Works for strings and arrays. | |
1632 | * | |
1633 | * @param string|array $lang A language description ("english"/"en"/"eng") | |
1634 | * @param boolean $convertKey If $lang is an array, setting $convertKey | |
1635 | * converts the keys to the language name. | |
1636 | * | |
1637 | * @return string|array Language name | |
1638 | */ | |
function _convertFromNameMode($lang, $convertKey = false)
{
    // Name mode 0 means language names are used as-is.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // "No language" (null) has nothing to convert. Callers such as
    // languageSimilarity() pass null for omitted arguments; without this
    // guard null would reach the foreach below and raise a warning.
    if ($lang === null) {
        return null;
    }

    // Mode 2 selects the two-letter code lookup, any other non-zero mode
    // the three-letter one.
    if ($this->_name_mode == 2) {
        $method = 'code2ToName';
    } else {
        $method = 'code3ToName';
    }

    if (is_string($lang)) {
        // The lookup result is cast, so the return value is always a
        // string here.
        return (string)Text_LanguageDetect_ISO639::$method($lang);
    }

    // Array input: convert either the keys or the values, leaving the
    // other side untouched.
    $newlang = array();
    foreach ($lang as $key => $val) {
        if ($convertKey) {
            $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
            $newlang[$newkey] = $val;
        } else {
            $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
        }
    }

    return $newlang;
}
1666 | ||
1667 | /** | |
1668 | * Converts an $language output parameter from the language name that is | |
1669 | * used internally to the configured mode. | |
1670 | * | |
1671 | * Works for strings and arrays. | |
1672 | * | |
1673 | * @param string|array $lang A language description ("english"/"en"/"eng") | |
1674 | * @param boolean $convertKey If $lang is an array, setting $convertKey | |
1675 | * converts the keys instead of the values. | |
1676 | * | |
1677 | * @return string|array Language name | |
1678 | */ | |
function _convertToNameMode($lang, $convertKey = false)
{
    // In name mode 0 the internal language names are the output format.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 selects the two-letter code lookup, any other non-zero mode
    // the three-letter one.
    $method = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    // A single name is converted directly.
    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$method($lang);
    }

    // Otherwise walk the array and convert one side of every entry.
    $converted = array();
    foreach ($lang as $key => $val) {
        if (!$convertKey) {
            // values are names, keys stay as they are
            $converted[$key] = Text_LanguageDetect_ISO639::$method($val);
            continue;
        }

        // keys are names, values stay as they are
        $code = Text_LanguageDetect_ISO639::$method($key);
        $converted[$code] = $val;
    }

    return $converted;
}
1706 | } | |
1707 | ||
1708 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ |