]>
Commit | Line | Data |
---|---|---|
6b461797 AD |
1 | <?php |
2 | ||
3 | /** | |
4 | * Detects the language of a given piece of text. | |
5 | * | |
6 | * Attempts to detect the language of a sample of text by correlating ranked | |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | |
8 | * | |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | |
10 | * (1994): "N-Gram-Based Text Categorization" | |
11 | * | |
12 | * PHP version 5 | |
13 | * | |
14 | * @category Text | |
15 | * @package Text_LanguageDetect | |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
17 | * @copyright 2005-2006 Nicholas Pisarro | |
18 | * @license http://www.debian.org/misc/bsd.license BSD | |
19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ | |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
21 | * @link http://langdetect.blogspot.com/ | |
22 | */ | |
23 | ||
24 | require_once 'lib/languagedetect/Text/LanguageDetect/Exception.php'; | |
25 | require_once 'lib/languagedetect/Text/LanguageDetect/Parser.php'; | |
26 | require_once 'lib/languagedetect/Text/LanguageDetect/ISO639.php'; | |
27 | ||
28 | /** | |
29 | * Language detection class | |
30 | * | |
31 | * Requires the language model database (lang.dat) that should have | |
32 | * accompanied this class definition in order to be instantiated. | |
33 | * | |
34 | * Example usage: | |
35 | * | |
36 | * <code> | |
37 | * require_once 'Text/LanguageDetect.php'; | |
38 | * | |
39 | * $l = new Text_LanguageDetect; | |
40 | * | |
41 | * $stdin = fopen('php://stdin', 'r'); | |
42 | * | |
43 | * echo "Supported languages:\n"; | |
44 | * | |
45 | * try { | |
46 | * $langs = $l->getLanguages(); | |
47 | * } catch (Text_LanguageDetect_Exception $e) { | |
48 | * die($e->getMessage()); | |
49 | * } | |
50 | * | |
51 | * sort($langs); | |
52 | * echo join(', ', $langs); | |
53 | * | |
54 | * while ($line = fgets($stdin)) { | |
55 | * print_r($l->detect($line, 4)); | |
56 | * } | |
57 | * </code> | |
58 | * | |
59 | * @category Text | |
60 | * @package Text_LanguageDetect | |
61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
62 | * @copyright 2005 Nicholas Pisarro | |
63 | * @license http://www.debian.org/misc/bsd.license BSD | |
64 | * @version Release: @package_version@ | |
65 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
66 | * @todo allow users to generate their own language models | |
67 | */ | |
68 | class Text_LanguageDetect | |
69 | { | |
70 | /** | |
71 | * The filename that stores the trigram data for the detector | |
72 | * | |
73 | * If this value starts with a slash (/) or a dot (.) the value of | |
74 | * $this->_data_dir will be ignored | |
75 | * | |
76 | * @var string | |
77 | * @access private | |
78 | */ | |
79 | var $_db_filename = 'lang.dat'; | |
80 | ||
81 | /** | |
82 | * The filename that stores the unicode block definitions | |
83 | * | |
84 | * If this value starts with a slash (/) or a dot (.) the value of | |
85 | * $this->_data_dir will be ignored | |
86 | * | |
87 | * @var string | |
88 | * @access private | |
89 | */ | |
90 | var $_unicode_db_filename = 'unicode_blocks.dat'; | |
91 | ||
92 | /** | |
93 | * The data directory | |
94 | * | |
95 | * Should be set by PEAR installer | |
96 | * | |
97 | * @var string | |
98 | * @access private | |
99 | */ | |
100 | var $_data_dir = '@data_dir@'; | |
101 | ||
102 | /** | |
103 | * The trigram data for comparison | |
104 | * | |
105 | * Will be loaded on start from $this->_db_filename | |
106 | * | |
107 | * @var array | |
108 | * @access private | |
109 | */ | |
110 | var $_lang_db = array(); | |
111 | ||
112 | /** | |
113 | * stores the map of the trigram data to unicode characters | |
114 | * | |
115 | * @access private | |
116 | * @var array | |
117 | */ | |
118 | var $_unicode_map; | |
119 | ||
120 | /** | |
121 | * The size of the trigram data arrays | |
122 | * | |
123 | * @var int | |
124 | * @access private | |
125 | */ | |
126 | var $_threshold = 300; | |
127 | ||
128 | /** | |
129 | * the maximum possible score. | |
130 | * | |
131 | * needed for score normalization. Different depending on the | |
132 | * perl compatibility setting | |
133 | * | |
134 | * @access private | |
135 | * @var int | |
136 | * @see setPerlCompatible() | |
137 | */ | |
138 | var $_max_score = 0; | |
139 | ||
140 | /** | |
141 | * Whether or not to simulate perl's Language::Guess exactly | |
142 | * | |
143 | * @access private | |
144 | * @var bool | |
145 | * @see setPerlCompatible() | |
146 | */ | |
147 | var $_perl_compatible = false; | |
148 | ||
149 | /** | |
150 | * Whether to use the unicode block detection to speed up processing | |
151 | * | |
152 | * @access private | |
153 | * @var bool | |
154 | */ | |
155 | var $_use_unicode_narrowing = true; | |
156 | ||
157 | /** | |
158 | * stores the result of the clustering operation | |
159 | * | |
160 | * @access private | |
161 | * @var array | |
162 | * @see clusterLanguages() | |
163 | */ | |
164 | var $_clusters; | |
165 | ||
166 | /** | |
167 | * Which type of "language names" are accepted and returned: | |
168 | * | |
169 | * 0 - language name ("english") | |
170 | * 2 - 2-letter ISO 639-1 code ("en") | |
171 | * 3 - 3-letter ISO 639-2 code ("eng") | |
172 | */ | |
173 | var $_name_mode = 0; | |
174 | ||
175 | /** | |
176 | * Constructor | |
177 | * | |
178 | * Will attempt to load the language database. If it fails, you will get | |
179 | * an exception. | |
180 | */ | |
function __construct()
{
    // Everything the detector needs comes out of one serialized database.
    $db = $this->_readdb($this->_db_filename);

    // The trigram table is mandatory; _checkTrigram() throws when it is
    // not a non-empty array.
    $this->_checkTrigram($db['trigram']);
    $this->_lang_db = $db['trigram'];

    // Optional sections are copied only when the database contains them.
    // 'trigram-clusters' is stored even though clustering from the
    // database is not yet implemented.
    $optional_sections = array(
        'trigram-unicodemap' => '_unicode_map',
        'trigram-clusters'   => '_clusters',
    );

    foreach ($optional_sections as $section => $property) {
        if (isset($db[$section])) {
            $this->$property = $db[$section];
        }
    }
}
196 | ||
197 | /** | |
198 | * Returns the path to the location of the database | |
199 | * | |
200 | * @param string $fname File name to load | |
201 | * | |
202 | * @return string expected path to the language model database | |
203 | * @access private | |
204 | */ | |
function _get_data_loc($fname)
{
    // Read the first character with [] guarded by isset(): the old {} offset
    // syntax is a parse error on PHP 8, and the guard keeps an empty file
    // name from raising an "uninitialized string offset" error.
    $first_char = isset($fname[0]) ? $fname[0] : '';

    if ($first_char == '/' || $first_char == '.') {
        // a leading slash or dot means the caller supplied its own path,
        // so whatever is in $this->_data_dir is skipped
        return $fname;
    }

    // The placeholder is assembled from pieces so that the installer's
    // search-and-replace cannot rewrite this comparison value as well.
    if ($this->_data_dir != '@' . 'data_dir' . '@') {
        // the data dir was set by the PEAR installer, use that
        return $this->_data_dir . '/Text_LanguageDetect/' . $fname;
    }

    // assume this was just unpacked somewhere: fall back to the data/
    // directory that sits next to this source file
    return __DIR__ . '/data/' . $fname;
}
222 | ||
223 | /** | |
224 | * Loads the language trigram database from filename | |
225 | * | |
226 | * Trigram database should be a serialize()'d array | |
227 | * | |
228 | * @param string $fname the filename where the data is stored | |
229 | * | |
230 | * @return array the language model data | |
231 | * @throws Text_LanguageDetect_Exception | |
232 | * @access private | |
233 | */ | |
function _readdb($fname)
{
    // Resolve the file name to the path where the database is expected.
    $path = $this->_get_data_loc($fname);

    // A missing file and an unreadable file are reported separately so the
    // caller can tell them apart by exception code.
    if (!file_exists($path)) {
        throw new Text_LanguageDetect_Exception(
            'Language database does not exist: ' . $path,
            Text_LanguageDetect_Exception::DB_NOT_FOUND
        );
    }

    if (!is_readable($path)) {
        throw new Text_LanguageDetect_Exception(
            'Language database is not readable: ' . $path,
            Text_LanguageDetect_Exception::DB_NOT_READABLE
        );
    }

    // The database is a serialize()'d array; the result is handed back
    // unvalidated and checked by the caller.
    $serialized = file_get_contents($path);

    return unserialize($serialized);
}
254 | ||
255 | ||
256 | /** | |
257 | * Checks if this object is ready to detect languages | |
258 | * | |
259 | * @param array $trigram Trigram data from database | |
260 | * | |
261 | * @return void | |
262 | * @access private | |
263 | */ | |
function _checkTrigram($trigram)
{
    if (!is_array($trigram)) {
        // With magic_quotes_runtime switched on the failure gets a more
        // specific message and code pointing at that setting.
        if (ini_get('magic_quotes_runtime')) {
            throw new Text_LanguageDetect_Exception(
                'Error loading database. Try turning magic_quotes_runtime off.',
                Text_LanguageDetect_Exception::MAGIC_QUOTES
            );
        }

        throw new Text_LanguageDetect_Exception(
            'Language database is not an array.',
            Text_LanguageDetect_Exception::DB_NOT_ARRAY
        );
    }

    // An array without any language models is just as unusable.
    if (empty($trigram)) {
        throw new Text_LanguageDetect_Exception(
            'Language database has no elements.',
            Text_LanguageDetect_Exception::DB_EMPTY
        );
    }
}
284 | ||
285 | /** | |
286 | * Omits languages | |
287 | * | |
288 | * Pass this function the name of or an array of names of | |
289 | * languages that you don't want considered | |
290 | * | |
291 | * If you're only expecting a limited set of languages, this can greatly | |
292 | * speed up processing | |
293 | * | |
294 | * @param mixed $omit_list language name or array of names to omit | |
295 | * @param bool $include_only if true will include (rather than | |
296 | * exclude) only those in the list | |
297 | * | |
298 | * @return int number of languages successfully deleted | |
299 | * @throws Text_LanguageDetect_Exception | |
300 | */ | |
public function omitLanguages($omit_list, $include_only = false)
{
    // NOTE(review): _convertFromNameMode() is defined outside this view; it
    // presumably maps names given in the active name mode (see
    // setNameMode()) to the internal language names - confirm.
    $omit_list = $this->_convertFromNameMode($omit_list);

    // Accept a single name as well as a list.
    if (!is_array($omit_list)) {
        $omit_list = array($omit_list);
    }

    // Case-desensitize in every mode. Previously names passed as an array
    // in exclude mode were compared as given, so array('English') removed
    // nothing while the plain string 'English' worked.
    $names = array();
    foreach ($omit_list as $name) {
        $names[] = strtolower($name);
    }

    $deleted = 0;

    if (!$include_only) {
        // deleting the given languages
        foreach ($names as $name) {
            if (isset($this->_lang_db[$name])) {
                unset($this->_lang_db[$name]);
                $deleted++;
            }
        }
    } else {
        // deleting all except the given languages; a keyed lookup avoids
        // rescanning the list for every language model
        $keep = array_flip($names);

        foreach (array_keys($this->_lang_db) as $lang) {
            if (!isset($keep[$lang])) {
                unset($this->_lang_db[$lang]);
                $deleted++;
            }
        }
    }

    // reset the cluster cache if the number of languages changed;
    // it will then have to be recalculated
    if (isset($this->_clusters) && $deleted > 0) {
        $this->_clusters = null;
    }

    return $deleted;
}
351 | ||
352 | ||
353 | /** | |
354 | * Returns the number of languages that this object can detect | |
355 | * | |
356 | * @access public | |
357 | * @return int the number of languages | |
358 | * @throws Text_LanguageDetect_Exception | |
359 | */ | |
function getLanguageCount()
{
    // One entry per language model still loaded; languages removed through
    // omitLanguages() are no longer part of this table.
    $models = $this->_lang_db;

    return count($models);
}
364 | ||
365 | /** | |
366 | * Checks if the language with the given name exists in the database | |
367 | * | |
368 | * @param mixed $lang Language name or array of language names | |
369 | * | |
370 | * @return bool true if language model exists | |
371 | */ | |
public function languageExists($lang)
{
    $lang = $this->_convertFromNameMode($lang);

    // Only a single name or a list of names can be checked.
    if (!is_string($lang) && !is_array($lang)) {
        throw new Text_LanguageDetect_Exception(
            'Unsupported parameter type passed to languageExists()',
            Text_LanguageDetect_Exception::PARAM_TYPE
        );
    }

    if (is_string($lang)) {
        $name = strtolower($lang);

        return isset($this->_lang_db[$name]);
    }

    // For a list, every language must be known; the first miss decides.
    foreach ($lang as $candidate) {
        $name = strtolower($candidate);

        if (!isset($this->_lang_db[$name])) {
            return false;
        }
    }

    return true;
}
394 | ||
395 | /** | |
396 | * Returns the list of detectable languages | |
397 | * | |
398 | * @access public | |
399 | * @return array the names of the languages known to this object | |
400 | * @throws Text_LanguageDetect_Exception | |
401 | */ | |
function getLanguages()
{
    // The language table is keyed by the internal language names.
    $internal_names = array_keys($this->_lang_db);

    // Hand the names back in whichever name mode is currently active.
    return $this->_convertToNameMode($internal_names);
}
408 | ||
409 | /** | |
410 | * Make this object behave like Language::Guess | |
411 | * | |
412 | * @param bool $setting false to turn off perl compatibility | |
413 | * | |
414 | * @return void | |
415 | */ | |
public function setPerlCompatible($setting = true)
{
    // input check: anything that is not a real boolean is ignored
    if (!is_bool($setting)) {
        return;
    }

    $this->_perl_compatible = $setting;

    // The reference score kept in _max_score depends on the mode: the
    // threshold in perl-compatible mode, zero otherwise.
    $this->_max_score = $setting ? $this->_threshold : 0;
}
429 | ||
430 | /** | |
431 | * Sets the way how language names are accepted and returned. | |
432 | * | |
433 | * @param integer $name_mode One of the following modes: | |
434 | * 0 - language name ("english") | |
435 | * 2 - 2-letter ISO 639-1 code ("en") | |
436 | * 3 - 3-letter ISO 639-2 code ("eng") | |
437 | * | |
438 | * @return void | |
439 | */ | |
function setNameMode($name_mode)
{
    // Stored as given, without validation: 0 selects language names,
    // 2 the 2-letter ISO 639-1 codes, 3 the 3-letter ISO 639-2 codes.
    $mode = $name_mode;

    $this->_name_mode = $mode;
}
444 | ||
445 | /** | |
446 | * Whether to use unicode block ranges in detection | |
447 | * | |
448 | * Should speed up most detections if turned on (default is on). In some | |
449 | * circumstances it may be slower, such as for large text samples (> 10K) | |
450 | * in languages that use latin scripts. In other cases it should speed up | |
451 | * detection noticeably. | |
452 | * | |
453 | * @param bool $setting false to turn off | |
454 | * | |
455 | * @return void | |
456 | */ | |
public function useUnicodeBlocks($setting = true)
{
    // Non-boolean input leaves the current setting untouched.
    if (!is_bool($setting)) {
        return;
    }

    $this->_use_unicode_narrowing = $setting;
}
463 | ||
464 | /** | |
465 | * Converts a piece of text into trigrams | |
466 | * | |
467 | * @param string $text text to convert | |
468 | * | |
469 | * @return array array of trigram frequencies | |
470 | * @access private | |
471 | * @deprecated Superseded by the Text_LanguageDetect_Parser class | |
472 | */ | |
function _trigram($text)
{
    $parser = new Text_LanguageDetect_Parser($text);

    // Configure the parser for trigram analysis only, then run it.
    $parser->prepareTrigram();
    $parser->prepareUnicode(false);

    // Start padding is switched off in perl-compatible mode.
    $pad_start = !$this->_perl_compatible;
    $parser->setPadStart($pad_start);

    $parser->analyze();

    return $parser->getTrigramFreqs();
}
482 | ||
483 | /** | |
484 | * Converts a set of trigrams from frequencies to ranks | |
485 | * | |
486 | * Thresholds (cuts off) the list at $this->_threshold | |
487 | * | |
488 | * @param array $arr array of trigram | |
489 | * | |
490 | * @return array ranks of trigrams | |
491 | * @access protected | |
492 | */ | |
function _arr_rank($arr)
{
    // Order by frequency, with alphabetical order as the standard way of
    // breaking ties. (A plain ksort()/asort() pair was tried here before
    // and seemed to introduce errors in testing.)
    $this->_bub_sort($arr);

    $ranks    = array();
    $position = 0;

    foreach (array_keys($arr) as $trigram) {
        $ranks[$trigram] = $position;
        $position++;

        // cut off at the standard threshold
        if ($position >= $this->_threshold) {
            break;
        }
    }

    return $ranks;
}
517 | ||
518 | /** | |
519 | * Sorts an array by value breaking ties alphabetically | |
520 | * | |
521 | * @param array &$arr the array to sort | |
522 | * | |
523 | * @return void | |
524 | * @access private | |
525 | */ | |
function _bub_sort(&$arr)
{
    // Mirrors this perl statement:
    // sort { $trigrams{$b} == $trigrams{$a}
    //     ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }

    // The comparison needs key and value at the same time, so every entry
    // is turned into a (key, value) pair first.
    $pairs = array();
    foreach ($arr as $key => $value) {
        $pairs[] = array($key, $value);
    }

    usort($pairs, array($this, '_sort_func'));

    // Rebuild the associative array in sorted order and hand it back
    // through the reference parameter.
    $sorted = array();
    foreach ($pairs as $pair) {
        $sorted[$pair[0]] = $pair[1];
    }

    $arr = $sorted;
}
553 | ||
554 | /** | |
555 | * Sort function used by bubble sort | |
556 | * | |
557 | * Callback function for usort(). | |
558 | * | |
559 | * @param array $a first param passed by usort() | |
560 | * @param array $b second param passed by usort() | |
561 | * | |
562 | * @return int negative if $a sorts first (higher value; ties broken by key via strcmp), positive if $b sorts first | |
563 | * @see _bub_sort() | |
564 | * @access private | |
565 | */ | |
function _sort_func($a, $b)
{
    // Each argument is a (key, value) pair so both parts can take part in
    // the comparison.
    $key_a   = $a[0];
    $value_a = $a[1];
    $key_b   = $b[0];
    $value_b = $b[1];

    // Different values: the larger value sorts first (descending order).
    if ($value_a != $value_b) {
        return ($value_a > $value_b) ? -1 : 1;
    }

    // Same value: break the tie using the keys. A result of 0 should not
    // be possible because keys must be unique.
    return strcmp($key_a, $key_b);
}
587 | ||
588 | /** | |
589 | * Calculates a linear rank-order distance statistic between two sets of | |
590 | * ranked trigrams | |
591 | * | |
592 | * Sums the differences in rank for each trigram. If the trigram does not | |
593 | * appear in both, consider it a difference of $this->_threshold. | |
594 | * | |
595 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | |
596 | * its simplicity it has been shown to be highly accurate for language | |
597 | * identification tasks. | |
598 | * | |
599 | * @param array $arr1 the reference set of trigram ranks | |
600 | * @param array $arr2 the target set of trigram ranks | |
601 | * | |
602 | * @return int the sum of the differences between the ranks of | |
603 | * the two trigram sets | |
604 | * @access private | |
605 | */ | |
function _distance($arr1, $arr2)
{
    $total = 0;

    foreach ($arr2 as $trigram => $rank) {
        // A trigram missing from the reference set costs the maximum
        // possible distance for any one pair, $this->_threshold; otherwise
        // the cost is the absolute difference in rank.
        $total += isset($arr1[$trigram])
            ? abs($rank - $arr1[$trigram])
            : $this->_threshold;
    }

    // todo: there are other distance statistics to try, e.g. relative
    // entropy, but they're probably more costly to compute
    return $total;
}
626 | ||
627 | /** | |
628 | * Normalizes the score returned by _distance() | |
629 | * | |
630 | * Different if perl compatible or not | |
631 | * | |
632 | * @param int $score the score from _distance() | |
633 | * @param int $base_count the number of trigrams being considered | |
634 | * | |
635 | * @return float the normalized score | |
636 | * @see _distance() | |
637 | * @access private | |
638 | */ | |
function _normalize_score($score, $base_count = null)
{
    // Without an explicit trigram count the threshold is used instead.
    $divisor = ($base_count === null) ? $this->_threshold : $base_count;

    if ($this->_perl_compatible) {
        // perl-compatible: the per-trigram distance, rounded down
        return floor($score / $divisor);
    }

    // otherwise: the per-trigram distance scaled by the threshold and
    // inverted, so that a distance of zero gives 1
    return 1 - ($score / $divisor / $this->_threshold);
}
651 | ||
652 | ||
653 | /** | |
654 | * Detects the closeness of a sample of text to the known languages | |
655 | * | |
656 | * Calculates the statistical difference between the text and | |
657 | * the trigrams for each language, normalizes the score then | |
658 | * returns results for all languages in sorted order | |
659 | * | |
660 | * If perl compatible, the score is 300-0, 0 being most similar. | |
661 | * Otherwise, it's 0-1 with 1 being most similar. | |
662 | * | |
663 | * The $sample text should be at least a few sentences in length; | |
664 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | |
665 | * is present it will try to detect and convert. However, experience has | |
666 | * shown that mb_detect_encoding() *does not work very well* with at least | |
667 | * some types of encoding. | |
668 | * | |
669 | * @param string $sample a sample of text to compare. | |
670 | * @param int $limit if specified, return an array of the most likely | |
671 | * $limit languages and their scores. | |
672 | * | |
673 | * @return mixed sorted array of language scores, blank array if no | |
674 | * useable text was found | |
675 | * @see _distance() | |
676 | * @throws Text_LanguageDetect_Exception | |
677 | */ | |
public function detect($sample, $limit = 0)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($sample)) {
        return array();
    }

    // Character encoding check, only possible with the mbstring extension.
    $have_mbstring = function_exists('mb_detect_encoding')
        && function_exists('mb_convert_encoding');

    if ($have_mbstring) {
        // mb_detect_encoding isn't very reliable, to say the least;
        // detection should still work with a sufficient sample of
        // ascii characters
        $encoding = mb_detect_encoding($sample);

        // FALSE means detection failed; ASCII and UTF-8 need no conversion
        $needs_conversion = $encoding !== false
            && $encoding != 'ASCII'
            && $encoding != 'UTF-8';

        // convert only when mbstring itself lists the detected encoding
        if ($needs_conversion && in_array($encoding, mb_list_encodings())) {
            $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
        }
    }

    // Analyze the sample; unicode block counting is only prepared when
    // narrowing is switched on.
    $parser = new Text_LanguageDetect_Parser($sample);
    $parser->prepareTrigram();
    if ($this->_use_unicode_narrowing) {
        $parser->prepareUnicode();
    }
    $parser->setPadStart(!$this->_perl_compatible);
    $parser->analyze();

    $sample_ranks =& $parser->getTrigramRanks();
    $rank_count   = count($sample_ranks);

    // no usable text was found
    if ($rank_count == 0) {
        return array();
    }

    if ($this->_use_unicode_narrowing) {
        // use unicode block detection to narrow down the possibilities
        $blocks =& $parser->getUnicodeBlocks();

        if (!is_array($blocks)) {
            throw new Text_LanguageDetect_Exception(
                'Error during block detection',
                Text_LanguageDetect_Exception::BLOCK_DETECTION
            );
        }

        // Union of the languages mapped to any block present in the sample.
        // An intersection (languages covering ALL blocks) would be faster
        // but would be completely thrown off by one unexpected character,
        // like an umlaut appearing in english text.
        $candidates = array();
        foreach (array_keys($blocks) as $block_name) {
            if (isset($this->_unicode_map[$block_name])) {
                // todo: faster way to do this?
                $candidates = array_merge(
                    $candidates,
                    array_keys($this->_unicode_map[$block_name])
                );
            }
        }

        // intersect with the keys of _lang_db in case languages
        // have been omitted
        $candidates = array_intersect(
            array_keys($this->_lang_db),
            array_unique($candidates)
        );
    } else {
        // or just try 'em all
        $candidates = array_keys($this->_lang_db);
    }

    // Score every candidate: rank distance to its model, then normalized.
    $scores = array();
    foreach ($candidates as $lang) {
        $distance = $this->_distance($this->_lang_db[$lang], $sample_ranks);

        $scores[$lang] = $this->_normalize_score($distance, $rank_count);
    }

    unset($parser);

    // Most similar first: ascending in perl-compatible mode, descending
    // otherwise.
    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // todo: drop languages with a score of $this->_max_score?

    // limit the number of returned scores
    if ($limit && is_numeric($limit)) {
        $top   = array();
        $taken = 0;

        foreach ($scores as $lang => $score) {
            if ($taken >= $limit) {
                break;
            }

            $top[$lang] = $score;
            $taken++;
        }

        $scores = $top;
    }

    return $this->_convertToNameMode($scores, true);
}
806 | ||
807 | /** | |
808 | * Returns only the most similar language to the text sample | |
809 | * | |
810 | * Calls $this->detect() and returns only the top result | |
811 | * | |
812 | * @param string $sample text to detect the language of | |
813 | * | |
814 | * @return string the name of the most likely language | |
815 | * or null if no language is similar | |
816 | * @see detect() | |
817 | * @throws Text_LanguageDetect_Exception | |
818 | */ | |
public function detectSimple($sample)
{
    // Only the single best match is needed.
    $scores = $this->detect($sample, 1);

    // A top score equal to the maximum possible score means the top
    // language will have been picked at random, so it is treated as no
    // result, exactly like an empty result.
    $usable = is_array($scores)
        && !empty($scores)
        && current($scores) != $this->_max_score;

    return $usable ? key($scores) : null;
}
833 | ||
834 | /** | |
835 | * Returns an array containing the most similar language and a confidence | |
836 | * rating | |
837 | * | |
838 | * Confidence is a simple measure calculated from the similarity score | |
839 | * minus the similarity score from the next most similar language | |
840 | * divided by the highest possible score. Languages that have closely | |
841 | * related cousins (e.g. Norwegian and Danish) should generally have lower | |
842 | * confidence scores. | |
843 | * | |
844 | * The similarity score answers the question "How likely is the text the | |
845 | * returned language regardless of the other languages considered?" The | |
846 | * confidence score is one way of answering the question "how likely is the | |
847 | * text the detected language relative to the rest of the language model | |
848 | * set?" | |
849 | * | |
850 | * To see how similar languages are a priori, see languageSimilarity() | |
851 | * | |
852 | * @param string $sample text for which language will be detected | |
853 | * | |
854 | * @return array most similar language, score and confidence rating | |
855 | * or null if no language is similar | |
856 | * @see detect() | |
857 | * @throws Text_LanguageDetect_Exception | |
858 | */ | |
public function detectConfidence($sample)
{
    // The best match plus the runner-up it is compared against.
    $scores = $this->detect($sample, 2);

    // if the most similar language has the max score, it
    // will have been picked at random
    if (!is_array($scores) || empty($scores)
        || current($scores) == $this->_max_score
    ) {
        return null;
    }

    $result = array(
        'language'   => key($scores),
        'similarity' => current($scores),
    );

    // next() returning false means there is no second element
    if (next($scores) === false) {
        $result['confidence'] = null;

        return $result;
    }

    // the goal is to return a higher value if the distance between the
    // similarity of the first score and the second score is high
    $runner_up = current($scores);

    if ($this->_perl_compatible) {
        $result['confidence'] = ($runner_up - $result['similarity'])
            / $this->_max_score;
    } else {
        $result['confidence'] = $result['similarity'] - $runner_up;
    }

    return $result;
}
892 | ||
893 | /** | |
894 | * Returns the distribution of unicode blocks in a given utf8 string | |
895 | * | |
896 | * For the block name of a single char, use unicodeBlockName() | |
897 | * | |
898 | * @param string $str input string. Must be ascii or utf8 | |
899 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | |
900 | * non-printing characters. Includes spaces, | |
901 | * newlines and common punctuation characters. | |
902 | * | |
903 | * @return array | |
904 | * @throws Text_LanguageDetect_Exception | |
905 | */ | |
public function detectUnicodeBlocks($str, $skip_symbols)
{
    // Both arguments are coerced to the types the parser is given.
    $skip = (bool)$skip_symbols;
    $text = (string)$str;

    // Run the parser with unicode block analysis prepared and
    // prepareTrigram() passed false.
    $parser = new Text_LanguageDetect_Parser($text);
    $parser->prepareUnicode();
    $parser->prepareTrigram(false);
    $parser->setUnicodeSkipSymbols($skip);
    $parser->analyze();

    $distribution = $parser->getUnicodeBlocks();
    unset($parser);

    return $distribution;
}
920 | ||
921 | /** | |
922 | * Returns the block name for a given unicode value | |
923 | * | |
924 | * If passed a string, will assume it is being passed a UTF8-formatted | |
925 | * character and will automatically convert. Otherwise it will assume it | |
926 | * is being passed a numeric unicode value. | |
927 | * | |
928 | * Make sure input is of the correct type! | |
929 | * | |
930 | * @param mixed $unicode unicode value or utf8 char | |
931 | * | |
932 | * @return mixed the block name string or false if not found | |
933 | * @throws Text_LanguageDetect_Exception | |
934 | */ | |
public function unicodeBlockName($unicode)
{
    // Only a utf8 character (string) or a numeric code point (int) is
    // accepted.
    if (!is_string($unicode) && !is_int($unicode)) {
        throw new Text_LanguageDetect_Exception(
            'Input must be of type string or int.',
            Text_LanguageDetect_Exception::PARAM_TYPE
        );
    }

    if (is_string($unicode)) {
        // assume it is being passed a single utf8 char, so convert it
        if (self::utf8strlen($unicode) > 1) {
            throw new Text_LanguageDetect_Exception(
                'Pass a single char only to this method',
                Text_LanguageDetect_Exception::PARAM_TYPE
            );
        }

        $unicode = $this->_utf8char2unicode($unicode);
    }

    $blocks = $this->_read_unicode_block_db();
    $match  = $this->_unicode_block_name($unicode, $blocks);

    // -1 signals "no block found"; otherwise the name is element 2 of
    // the matching block entry.
    return ($match == -1) ? false : $match[2];
}
964 | ||
/**
 * Looks up the unicode block containing a code point
 *
 * This is the unchecked work horse behind unicodeBlockName(): it performs
 * no input validation at all and simply binary-searches the block list.
 * Each block row is expected to hold the first code point at index 0 and
 * the last code point at index 1; rows must be sorted by range.
 *
 * @param int   $unicode     the unicode value
 * @param array $blocks      the block database
 * @param int   $block_count the number of defined blocks in the database,
 *                           or -1 to have it counted here
 *
 * @return mixed the matching block row, or -1 if no block contains the value
 * @see    unicodeBlockName()
 * @access protected
 */
function _unicode_block_name($unicode, $blocks, $block_count = -1)
{
    // block list reference:
    // http://www.unicode.org/Public/UNIDATA/Blocks.txt

    // shortcut: anything up to the end of the first block is answered
    // immediately, on the assumption that ascii is the most common input
    if ($unicode <= $blocks[0][1]) {
        return $blocks[0];
    }

    // callers may pass the block count so count() is not re-run each time
    $total = ($block_count != -1) ? $block_count : count($blocks);

    // search window; index 0 was already handled above
    $first = 1;
    $last  = $total - 1;

    while ($first <= $last) {
        $probe = (int)floor(($first + $last) / 2);

        if ($unicode < $blocks[$probe][0]) {
            // value lies before this block: continue in the lower half
            $last = $probe - 1;
            continue;
        }

        if ($unicode > $blocks[$probe][1]) {
            // value lies after this block: continue in the upper half
            $first = $probe + 1;
            continue;
        }

        // inside [start, end] of the probed block
        return $blocks[$probe];
    }

    // window is empty: the value is in no known block
    // todo: differentiate when it's out of range or when it falls
    // into an unassigned range?
    return -1;
}
1025 | ||
/**
 * Loads the unicode block database, reading it at most once
 *
 * The result of _readdb() for the configured unicode db file is kept in
 * a function-level static variable and reused on later calls.
 *
 * @return array the database of unicode block definitions
 * @throws Text_LanguageDetect_Exception
 * @access protected
 */
function _read_unicode_block_db()
{
    // function-static storage: survives between calls instead of being
    // re-read every time
    static $block_db;

    if (isset($block_db)) {
        return $block_db;
    }

    $block_db = $this->_readdb($this->_unicode_db_filename);

    return $block_db;
}
1046 | ||
/**
 * Calculate the similarities between the language models
 *
 * Use this function to see how similar languages are to each other.
 *
 * If passed 2 language names, will return just those languages compared.
 * If passed 1 language name, will return that language compared to
 * all others.
 * If passed none, will return an array of every language model compared
 * to every other one.
 *
 * Language arguments are given in the configured name mode and are matched
 * case-insensitively (both are lower-cased before the model lookup).
 *
 * @param string $lang1 the name of the first language to be compared
 * @param string $lang2 the name of the second language to be compared
 *
 * @return array scores of every language compared
 *               or the score of just the provided languages
 *               or null if one of the supplied languages does not exist
 * @throws Text_LanguageDetect_Exception
 */
public function languageSimilarity($lang1 = null, $lang2 = null)
{
    // Decide which arguments were supplied BEFORE converting them.
    // Converting first had two problems: null was pushed through
    // _convertFromNameMode() (which iterates non-string input), and an
    // unknown code converted to '' was then mistaken for "not supplied".
    $has_lang1 = ($lang1 != null);
    $has_lang2 = ($lang2 != null);

    if (!$has_lang1) {
        // compare all languages to each other
        $names      = array_keys($this->_lang_db);
        $return_arr = array();

        foreach ($names as $name1) {
            foreach ($names as $name2) {
                // skip comparing languages to themselves
                if ($name1 === $name2) {
                    continue;
                }

                if (isset($return_arr[$name2][$name1])) {
                    // the mirrored pair was already calculated: reuse it
                    $return_arr[$name1][$name2] = $return_arr[$name2][$name1];
                    continue;
                }

                $return_arr[$name1][$name2] = $this->_normalize_score(
                    $this->_distance(
                        $this->_lang_db[$name1],
                        $this->_lang_db[$name2]
                    )
                );
            }
        }

        return $return_arr;
    }

    $lang1 = strtolower($this->_convertFromNameMode($lang1));

    // check if language model exists
    if (!isset($this->_lang_db[$lang1])) {
        return null;
    }

    if ($has_lang2) {
        // lower-case BEFORE the lookup, exactly like $lang1, so that
        // e.g. "French" finds the "french" model
        $lang2 = strtolower($this->_convertFromNameMode($lang2));

        if (!isset($this->_lang_db[$lang2])) {
            return null;
        }

        // compare just these two languages
        return $this->_normalize_score(
            $this->_distance(
                $this->_lang_db[$lang1],
                $this->_lang_db[$lang2]
            )
        );
    }

    // compare just $lang1 to all other languages
    $return_arr = array();
    foreach ($this->_lang_db as $key => $value) {
        // don't compare a language to itself
        if ((string)$key === $lang1) {
            continue;
        }
        $return_arr[$key] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang1], $value)
        );
    }
    asort($return_arr);

    return $return_arr;
}
1141 | ||
/**
 * Cluster known languages according to languageSimilarity()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * Repeatedly merges the two most similar open forks into a new fork named
 * "<bestfit>:<otherfit>" until at most two forks remain, the best score
 * drops to 0, or 200 merge steps have been made. The result is cached in
 * the object.
 *
 * @access     public
 * @return     array language cluster data with the keys 'open_forks'
 *                   (forks left unmerged), 'fork_data' (one record per
 *                   merge) and 'name_map' (fork name => language whose
 *                   model represents the fork)
 * @throws     Text_LanguageDetect_Exception
 * @see        languageSimilarity()
 * @deprecated this function will eventually be removed and placed into
 *              the model generation class
 */
function clusterLanguages()
{
    // todo: set the maximum number of clusters
    // return cached result, if any
    if (isset($this->_clusters)) {
        return $this->_clusters;
    }

    $langs = array_keys($this->_lang_db);

    // full similarity matrix: $arr[a][b] is the score of a against b
    $arr = $this->languageSimilarity();

    sort($langs);

    foreach ($langs as $lang) {
        if (isset($this->_lang_db[$lang])) {
            continue;
        }
        throw new Text_LanguageDetect_Exception(
            "missing $lang!",
            Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
        );
    }

    // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
    // index every still-open fork by its own name (sorted order is kept)
    $open_forks = array();
    foreach ($langs as $lang) {
        $open_forks[$lang] = $lang;
    }

    $fork_data = array();
    $name_map  = array();

    $i = 0;
    while (count($open_forks) > 2 && $i++ < 200) {
        // find the pair of open forks with the highest score; on ties the
        // pair met first wins because the comparison is strict
        $highest_score = -1;
        $highest_key1  = '';
        $highest_key2  = '';
        foreach ($open_forks as $cand1) {
            foreach ($open_forks as $cand2) {
                if ($cand1 == $cand2) {
                    continue;
                }
                if ($arr[$cand1][$cand2] > $highest_score) {
                    $highest_score = $arr[$cand1][$cand2];
                    $highest_key1  = $cand1;
                    $highest_key2  = $cand2;
                }
            }
        }

        if (!$highest_key1) {
            // should not ever happen
            throw new Text_LanguageDetect_Exception(
                "no highest key? (step: $i)",
                Text_LanguageDetect_Exception::NO_HIGHEST_KEY
            );
        }

        if ($highest_score == 0) {
            // languages are perfectly dissimilar
            break;
        }

        // $highest_key1 and $highest_key2 are most similar
        $sum1 = array_sum($arr[$highest_key1]);
        $sum2 = array_sum($arr[$highest_key2]);

        // The member with the larger score sum (most similar to the rest
        // of the field) lends its scores to the merged fork; the other
        // member is dropped.
        // todo: could try averaging or "centroid" method instead
        // for "Complete Linkage"/"furthest neighbor" the sign should be <
        // for "Single Linkage"/"nearest neighbor" it should be >
        // results seem to be pretty much the same with either method
        if ($sum1 > $sum2) {
            list($replaceme, $deleteme) = array($highest_key1, $highest_key2);
        } else {
            list($replaceme, $deleteme) = array($highest_key2, $highest_key1);
        }

        $newkey = $replaceme . ':' . $deleteme;

        // keep a record of which fork is really which language: follow
        // the map until a name that is not itself a merged fork is reached
        $really_lang = $replaceme;
        while (isset($name_map[$really_lang])) {
            $really_lang = $name_map[$really_lang];
        }
        $name_map[$newkey] = $really_lang;

        // Rewrite the matrix: $replaceme becomes $newkey, $deleteme goes.
        // The loops walk a snapshot of $arr while the live $arr is
        // modified; the three checks run in this exact order per cell.
        foreach ($arr as $row_key => $row) {
            foreach ($row as $col_key => $unused) {
                if ($col_key == $replaceme) {
                    // move column $replaceme to column $newkey
                    $arr[$row_key][$newkey] = $arr[$row_key][$col_key];
                    unset($arr[$row_key][$col_key]);
                }

                if ($row_key == $replaceme) {
                    // move row $replaceme to row $newkey
                    $arr[$newkey][$col_key] = $arr[$row_key][$col_key];
                    unset($arr[$row_key][$col_key]);
                }

                if ($row_key == $deleteme || $col_key == $deleteme) {
                    // drop everything in the row/column of $deleteme
                    unset($arr[$row_key][$col_key]);
                }
            }
        }

        // the two merged forks are closed, the merged fork is opened
        unset($open_forks[$highest_key1], $open_forks[$highest_key2]);
        $open_forks[$newkey] = $newkey;

        // some of these may be overkill
        $fork_data[$newkey] = array(
            'newkey'   => $newkey,
            'count'    => $i,
            'diff'     => abs($sum1 - $sum2),
            'score'    => $highest_score,
            'bestfit'  => $replaceme,
            'otherfit' => $deleteme,
            'really'   => $really_lang,
        );
    }

    $return_val = array(
        // the top level of clusters: forks that were never merged further
        'open_forks' => $open_forks,

        // data for each split
        'fork_data'  => $fork_data,

        // which cluster is really which language: a cluster is
        // represented by the model of its most-similar member
        'name_map'   => $name_map,
    );

    // saves the result in the object
    $this->_clusters = $return_val;

    return $return_val;
}
1321 | ||
1322 | ||
/**
 * Perform an intelligent detection based on clusterLanguages()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * This compares the sample text to the top level of clusters. It then
 * drops down into the best matching cluster and compares the sample to
 * that cluster's two members, and so on until it hits a leaf node.
 *
 * This should find the language in considerably fewer compares
 * (the equivalent of a binary search), however clusterLanguages() is costly
 * and the loss of accuracy from this technique is significant.
 *
 * This method may need to be 'fuzzier' in order to become more accurate.
 *
 * This function could be more useful if the universe of possible languages
 * was very large, however in such cases some method of Bayesian inference
 * might be more helpful.
 *
 * @param string $str input string
 *
 * @return array language scores (only those compared); an empty array if
 *               the input is invalid, yields no trigrams, or there are no
 *               clusters to compare against
 * @throws Text_LanguageDetect_Exception
 * @see    clusterLanguages()
 */
public function clusteredSearch($str)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    $result = $this->clusterLanguages();

    $dendogram_start = $result['open_forks'];
    $dendogram_data  = $result['fork_data'];
    $dendogram_alias = $result['name_map'];

    $sample_obj = new Text_LanguageDetect_Parser($str);
    $sample_obj->prepareTrigram();
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();
    $sample_result = $sample_obj->getTrigramRanks();
    $sample_count  = count($sample_result);

    // input check
    if ($sample_count == 0) {
        return array();
    }

    // must exist even when there are no forks, otherwise the sort below
    // would be handed an undefined variable
    $scores = array();

    // score the sample against every top-level fork
    foreach ($dendogram_start as $lang) {
        // a merged fork is represented by the model named in the alias map
        $lang_key = isset($dendogram_alias[$lang])
            ? $dendogram_alias[$lang]
            : $lang;

        $scores[$lang] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang_key], $sample_result),
            $sample_count
        );
    }

    if (!$scores) {
        // nothing to compare against
        return array();
    }

    // In perl-compatible mode the best match is the SMALLEST score,
    // otherwise it is the largest; either way it is sorted to the front.
    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of starting forks, this key is the most similar to the sample
    $cur_key = key($scores);

    // walk down the tree until a key that is not a merged fork is reached
    while (isset($dendogram_data[$cur_key])) {
        $lang1 = $dendogram_data[$cur_key]['bestfit'];
        $lang2 = $dendogram_data[$cur_key]['otherfit'];

        foreach (array($lang1, $lang2) as $lang) {
            $lang_key = isset($dendogram_alias[$lang])
                ? $dendogram_alias[$lang]
                : $lang;

            $scores[$lang] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang_key], $sample_result),
                $sample_count
            );

            //todo: does not need to do same comparison again
        }

        // Follow the better of the two members, using the same notion of
        // "better" as the top-level sort above. (Always following the
        // larger score would descend into the worse branch in
        // perl-compatible mode.) Ties go to $lang2.
        if ($this->_perl_compatible) {
            $lang1_wins = ($scores[$lang1] < $scores[$lang2]);
        } else {
            $lang1_wins = ($scores[$lang1] > $scores[$lang2]);
        }

        $cur_key = $lang1_wins ? $lang1 : $lang2;
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree

    // but sometimes the last item is only the second best; note that
    // end() then prev() leave the array pointer on the second-to-last item
    if (($this->_perl_compatible && (end($scores) > prev($scores)))
        || (!$this->_perl_compatible && (end($scores) < prev($scores)))
    ) {
        $real_last_score = current($scores);
        $real_last_key   = key($scores);

        // swaps the 2nd-to-last item for the last item
        unset($scores[$real_last_key]);
        $scores[$real_last_key] = $real_last_score;
    }

    if (!$this->_perl_compatible) {
        // keys are preserved while the order is reversed
        $scores = array_reverse($scores, true);
    }

    return $scores;
}
1466 | ||
/**
 * utf8-safe strlen()
 *
 * Returns the number of characters (not bytes) in a utf8 string.
 *
 * Uses mb_strlen() when the mbstring extension is loaded. Otherwise it
 * falls back to counting the bytes of the utf8_decode()d string, which
 * works because utf8_decode() turns every character it cannot represent
 * into a single '?'. The fallback is only a last resort because
 * utf8_decode() is deprecated as of PHP 8.2.
 *
 * @param string $str string to get the length of
 *
 * @return int number of chars
 */
public static function utf8strlen($str)
{
    $str = (string)$str;

    if (function_exists('mb_strlen')) {
        return mb_strlen($str, 'UTF-8');
    }

    // idea stolen from dokuwiki
    return strlen(utf8_decode($str));
}
1485 | ||
/**
 * Returns the unicode value of a utf8 char
 *
 * The number of bytes in $char selects the decoding rule; the payload
 * bits of the lead byte and of each continuation byte are masked out and
 * shifted into place. The bytes are not validated.
 *
 * @param string $char a utf8 (possibly multi-byte) char
 *
 * @return int unicode value, or null if $char is not 1 to 4 bytes long
 * @access protected
 * @link   http://en.wikipedia.org/wiki/UTF-8
 */
function _utf8char2unicode($char)
{
    // strlen() here will actually get the binary length of a single char
    // (string offsets use [] because the {} form is a parse error in PHP 8)
    switch (strlen($char)) {
    case 1:
        // normal ASCII-7 byte
        // 0xxxxxxx --> 0xxxxxxx
        return ord($char[0]);

    case 2:
        // 2 byte unicode
        // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
        $z = (ord($char[0]) & 0x0000001F) << 6;
        $x = (ord($char[1]) & 0x0000003F);
        return ($z | $x);

    case 3:
        // 3 byte unicode
        // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
        $z  = (ord($char[0]) & 0x0000000F) << 12;
        $x1 = (ord($char[1]) & 0x0000003F) << 6;
        $x2 = (ord($char[2]) & 0x0000003F);
        return ($z | $x1 | $x2);

    case 4:
        // 4 byte unicode
        // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
        // 000zzzzz xxxxxxxx xxxxxxxx
        $z1 = (ord($char[0]) & 0x00000007) << 18;
        $z2 = (ord($char[1]) & 0x0000003F) << 12;
        $x1 = (ord($char[2]) & 0x0000003F) << 6;
        $x2 = (ord($char[3]) & 0x0000003F);
        return ($z1 | $z2 | $x1 | $x2);
    }

    // empty input or more than four bytes: not a single utf8 char
    return null;
}
1530 | ||
/**
 * utf8-safe fast character iterator
 *
 * Will get the next character starting from $counter, which will then be
 * incremented. For a multi-byte char the bytes will be concatenated and
 * $counter will be incremented by the number of bytes in the char. The
 * caller must make sure $counter points inside $str.
 *
 * With $special_convert enabled:
 *  - ascii A-Z is lower-cased; every other ascii byte that is not a-z,
 *    a space or an apostrophe becomes a space
 *  - two-byte chars with lead byte 0xC3 whose Latin-1 value is in the
 *    range 192-222 (except 215, the multiplication sign) are lower-cased
 *  - two-byte chars with lead byte 0xD0 and second byte 144-175
 *    (cyrillic capitals) are lower-cased
 * Three- and four-byte chars are never converted.
 *
 * @param string $str             the string being iterated over
 * @param int    &$counter        the iterator, will increment by reference
 * @param bool   $special_convert whether to do special conversions
 *
 * @return string the next (possibly multi-byte) char from $counter, or
 *                null if the byte at $counter is not a valid lead byte
 * @access private
 */
static function _next_char($str, &$counter, $special_convert = false)
{
    // string offsets use [] because the {} form is a parse error in PHP 8
    $char = $str[$counter++];
    $ord  = ord($char);

    // for a description of the utf8 system see
    // http://www.phpclasses.org/browse/file/5131.html

    // normal ascii one byte char
    if ($ord <= 127) {
        // special conversions needed for this package
        // (that only apply to regular ascii characters)
        if ($special_convert && $char != ' ' && $char != "'") {
            if ($ord >= 65 && $ord <= 90) { // A-Z
                $char = chr($ord + 32); // lower case
            } elseif ($ord < 97 || $ord > 122) { // NOT a-z
                $char = ' '; // convert to space
            }
        }

        return $char;
    }

    if ($ord >> 5 == 6) { // two-byte char
        // substr() instead of a plain offset so that a string which ends
        // in the middle of a char yields '' rather than an offset warning
        $nextchar = (string)substr($str, $counter, 1);
        $counter++;

        // lower-casing of non-ascii characters is still incomplete

        if ($special_convert && $nextchar !== '') {
            $nextord = ord($nextchar);

            if ($ord == 195) {
                // lower case latin accented characters
                // for a reference, see
                // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
                $nextord_adj = $nextord + 64; // the Latin-1 value

                // À - Þ but not ×
                if ($nextord_adj >= 192
                    && $nextord_adj <= 222
                    && $nextord_adj != 215
                ) {
                    $nextchar = chr($nextord + 32);
                }

            } elseif ($ord == 208) {
                // lower case cyrillic alphabet
                if ($nextord >= 144 && $nextord <= 159) {
                    // A - Pe: the lower case form has the same lead byte
                    $nextchar = chr($nextord + 32);

                } elseif ($nextord >= 160 && $nextord <= 175) {
                    // Er - Ya: the lower case form has lead byte 209
                    $char     = chr(209);
                    $nextchar = chr($nextord - 32);
                }
            }
        }

        // tag on next byte
        return $char . $nextchar;
    }

    if ($ord >> 4 == 14) { // three-byte char
        // tag on next 2 bytes (fewer if the string is truncated)
        $tail     = (string)substr($str, $counter, 2);
        $counter += 2;
        return $char . $tail;
    }

    if ($ord >> 3 == 30) { // four-byte char
        // tag on next 3 bytes (fewer if the string is truncated)
        $tail     = (string)substr($str, $counter, 3);
        $counter += 3;
        return $char . $tail;
    }

    // not a valid utf8 lead byte (e.g. a stray continuation byte);
    // only the one byte has been consumed
    return null;
}
1624 | ||
/**
 * Converts a $language input parameter from the configured name mode
 * to the language name that is used internally.
 *
 * In name mode 0 the input is returned untouched. In mode 2 the
 * conversion is done by Text_LanguageDetect_ISO639::code2ToName(), in any
 * other mode by code3ToName(). Converted values are cast to string.
 *
 * Works for strings and arrays.
 *
 * @param string|array $lang       A language description ("english"/"en"/"eng")
 * @param boolean      $convertKey If $lang is an array: true converts the
 *                                 array keys and keeps the values, false
 *                                 converts the values and keeps the keys.
 *
 * @return string|array Language name
 */
function _convertFromNameMode($lang, $convertKey = false)
{
    if ($this->_name_mode == 0) {
        // names are already in the internal form
        return $lang;
    }

    $method = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$method($lang);
    }

    $converted = array();
    foreach ($lang as $key => $val) {
        if (!$convertKey) {
            // value holds the language, key is kept
            $converted[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
            continue;
        }

        // key holds the language, value is kept
        $name = (string)Text_LanguageDetect_ISO639::$method($key);
        $converted[$name] = $val;
    }

    return $converted;
}
1664 | ||
/**
 * Converts a $language output parameter from the language name that is
 * used internally to the configured name mode.
 *
 * In name mode 0 the input is returned untouched. In mode 2 the
 * conversion is done by Text_LanguageDetect_ISO639::nameToCode2(), in any
 * other mode by nameToCode3(). Results are passed on without casting.
 *
 * Works for strings and arrays.
 *
 * @param string|array $lang       A language description ("english"/"en"/"eng")
 * @param boolean      $convertKey If $lang is an array: true converts the
 *                                 array keys and keeps the values, false
 *                                 converts the values and keeps the keys.
 *
 * @return string|array Language name
 */
function _convertToNameMode($lang, $convertKey = false)
{
    if ($this->_name_mode == 0) {
        // internal names are what the caller wants
        return $lang;
    }

    $method = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$method($lang);
    }

    $converted = array();
    foreach ($lang as $key => $val) {
        if (!$convertKey) {
            // value holds the language, key is kept
            $converted[$key] = Text_LanguageDetect_ISO639::$method($val);
            continue;
        }

        // key holds the language, value is kept
        $code = Text_LanguageDetect_ISO639::$method($key);
        $converted[$code] = $val;
    }

    return $converted;
}
1704 | } | |
1705 | ||
1706 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | |
1707 | ||
1708 | ?> |