From 31b40448fd4a08032f562ce171a66c6952370f60 Mon Sep 17 00:00:00 2001 From: tschuettler Date: Thu, 2 Aug 2018 17:23:14 +0200 Subject: [PATCH] Update af_lang_detect plugin with changes from upstream Reapplied downstream changes for phpmd ruleset --- plugins/af_lang_detect/init.php | 4 +- .../{ => Text}/LanguageDetect.php | 174 ++-- .../Text/LanguageDetect/Exception.php | 24 + .../Text/LanguageDetect/ISO639.php | 3 +- .../Text/LanguageDetect/Parser.php | 189 ++-- .../data/build-unicode_blocks.php | 7 + .../languagedetect/data/unicode_blocks.dat | 2 +- .../languagedetect/data/unicode_blocks.php | 873 ++++++++++++++++++ 8 files changed, 1079 insertions(+), 197 deletions(-) rename plugins/af_lang_detect/languagedetect/{ => Text}/LanguageDetect.php (93%) create mode 100644 plugins/af_lang_detect/languagedetect/data/build-unicode_blocks.php create mode 100644 plugins/af_lang_detect/languagedetect/data/unicode_blocks.php diff --git a/plugins/af_lang_detect/init.php b/plugins/af_lang_detect/init.php index e7874131..3ec0023b 100644 --- a/plugins/af_lang_detect/init.php +++ b/plugins/af_lang_detect/init.php @@ -4,7 +4,7 @@ class Af_Lang_Detect extends Plugin { private $lang; function about() { - return array(1.0, + return array(1.1, "Detect article language", "fox"); } @@ -14,7 +14,7 @@ class Af_Lang_Detect extends Plugin { $host->add_hook($host::HOOK_ARTICLE_FILTER, $this); - require_once __DIR__ . "/languagedetect/LanguageDetect.php"; + require_once __DIR__ . "/languagedetect/Text/LanguageDetect.php"; $this->lang = new Text_LanguageDetect(); $this->lang->setNameMode(2); diff --git a/plugins/af_lang_detect/languagedetect/LanguageDetect.php b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect.php similarity index 93% rename from plugins/af_lang_detect/languagedetect/LanguageDetect.php rename to plugins/af_lang_detect/languagedetect/Text/LanguageDetect.php index 6c186350..ba1647d0 100644 --- a/plugins/af_lang_detect/languagedetect/LanguageDetect.php +++ b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect.php @@ -1,13 +1,6 @@ * @copyright 2005-2006 Nicholas Pisarro - * @license http://www.debian.org/misc/bsd.license BSD - * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ + * @license BSD http://www.opensource.org/licenses/bsd-license.php * @link http://pear.php.net/package/Text_LanguageDetect/ - * @link http://langdetect.blogspot.com/ */ -require_once __DIR__ . '/Text/LanguageDetect/Exception.php'; -require_once __DIR__ . '/Text/LanguageDetect/Parser.php'; -require_once __DIR__ . '/Text/LanguageDetect/ISO639.php'; +require_once __DIR__ . '/LanguageDetect/Exception.php'; +require_once __DIR__ . '/LanguageDetect/Parser.php'; +require_once __DIR__ . '/LanguageDetect/ISO639.php'; /** - * Language detection class + * Detects the language of a given piece of text. + * + * Attempts to detect the language of a sample of text by correlating ranked + * 3-gram frequencies to a table of 3-gram frequencies of known languages. + * + * Implements a version of a technique originally proposed by Cavnar & Trenkle + * (1994): "N-Gram-Based Text Categorization" * - * Requires the langauge model database (lang.dat) that should have + * Requires the language model database (lang.dat) that should have * accompanied this class definition in order to be instantiated. * * Example usage: @@ -60,10 +57,9 @@ require_once __DIR__ . '/Text/LanguageDetect/ISO639.php'; * @package Text_LanguageDetect * @author Nicholas Pisarro * @copyright 2005 Nicholas Pisarro - * @license http://www.debian.org/misc/bsd.license BSD + * @license BSD http://www.opensource.org/licenses/bsd-license.php * @version Release: @package_version@ * @link http://pear.php.net/package/Text_LanguageDetect/ - * @todo allow users to generate their own language models * * @SuppressWarnings(PHPMD) */ @@ -75,10 +71,9 @@ class Text_LanguageDetect * If this value starts with a slash (/) or a dot (.) the value of * $this->_data_dir will be ignored * - * @var string - * @access private + * @var string */ - var $_db_filename = 'lang.dat'; + protected $_db_filename = 'lang.dat'; /** * The filename that stores the unicode block definitions @@ -87,83 +82,74 @@ class Text_LanguageDetect * $this->_data_dir will be ignored * * @var string - * @access private */ - var $_unicode_db_filename = 'unicode_blocks.dat'; + protected $_unicode_db_filename = 'unicode_blocks.dat'; /** * The data directory * * Should be set by PEAR installer * - * @var string - * @access private + * @var string */ - var $_data_dir = '@data_dir@'; + protected $_data_dir = '@data_dir@'; /** * The trigram data for comparison * * Will be loaded on start from $this->_db_filename * - * @var array - * @access private + * @var array */ - var $_lang_db = array(); + protected $_lang_db = array(); /** - * stores the map of the trigram data to unicode characters + * Stores the map of the trigram data to unicode characters * - * @access private * @var array */ - var $_unicode_map; + protected $_unicode_map; /** * The size of the trigram data arrays * - * @var int - * @access private + * @var int */ - var $_threshold = 300; + protected $_threshold = 300; /** - * the maximum possible score. + * The maximum possible score. * - * needed for score normalization. Different depending on the + * Needed for score normalization. Different depending on the * perl compatibility setting * - * @access private - * @var int - * @see setPerlCompatible() + * @var int + * @see setPerlCompatible() */ - var $_max_score = 0; + protected $_max_score = 0; /** * Whether or not to simulate perl's Language::Guess exactly * - * @access private - * @var bool - * @see setPerlCompatible() + * @var bool + * @see setPerlCompatible() */ - var $_perl_compatible = false; + protected $_perl_compatible = false; /** * Whether to use the unicode block detection to speed up processing * - * @access private * @var bool */ - var $_use_unicode_narrowing = true; + protected $_use_unicode_narrowing = true; /** - * stores the result of the clustering operation + * Stores the result of the clustering operation * - * @access private - * @var array - * @see clusterLanguages() + * @var array + * @see clusterLanguages() */ - var $_clusters; + protected $_clusters; /** * Which type of "language names" are accepted and returned: @@ -172,7 +158,7 @@ class Text_LanguageDetect * 2 - 2-letter ISO 639-1 code ("en") * 3 - 3-letter ISO 639-2 code ("eng") */ - var $_name_mode = 0; + protected $_name_mode = 0; /** * Constructor @@ -180,7 +166,7 @@ class Text_LanguageDetect * Will attempt to load the language database. If it fails, you will get * an exception. */ - function __construct() + public function __construct() { $data = $this->_readdb($this->_db_filename); $this->_checkTrigram($data['trigram']); @@ -202,9 +188,8 @@ class Text_LanguageDetect * @param string $fname File name to load * * @return string expected path to the language model database - * @access private */ - function _get_data_loc($fname) + protected function _get_data_loc($fname) { if ($fname{0} == '/' || $fname{0} == '.') { // if filename starts with a slash, assume it's an absolute pathname @@ -218,7 +203,7 @@ class Text_LanguageDetect } else { // assume this was just unpacked somewhere // try the local working directory if otherwise - return __DIR__ . '/data/' . $fname; + return __DIR__ . '/../data/' . $fname; } } @@ -231,9 +216,8 @@ class Text_LanguageDetect * * @return array the language model data * @throws Text_LanguageDetect_Exception - * @access private */ - function _readdb($fname) + protected function _readdb($fname) { // finds the correct data dir $fname = $this->_get_data_loc($fname); @@ -261,9 +245,8 @@ class Text_LanguageDetect * @param array $trigram Trigram data from database * * @return void - * @access private */ - function _checkTrigram($trigram) + protected function _checkTrigram($trigram) { if (!is_array($trigram)) { if (ini_get('magic_quotes_runtime')) { @@ -355,11 +338,10 @@ class Text_LanguageDetect /** * Returns the number of languages that this object can detect * - * @access public * @return int the number of languages - * @throws Text_LanguageDetect_Exception + * @throws Text_LanguageDetect_Exception */ - function getLanguageCount() + public function getLanguageCount() { return count($this->_lang_db); } @@ -397,11 +379,10 @@ class Text_LanguageDetect /** * Returns the list of detectable languages * - * @access public * @return array the names of the languages known to this object<<<<<<< - * @throws Text_LanguageDetect_Exception + * @throws Text_LanguageDetect_Exception */ - function getLanguages() + public function getLanguages() { return $this->_convertToNameMode( array_keys($this->_lang_db) @@ -439,7 +420,7 @@ class Text_LanguageDetect * * @return void */ - function setNameMode($name_mode) + public function setNameMode($name_mode) { $this->_name_mode = $name_mode; } @@ -469,10 +450,9 @@ class Text_LanguageDetect * @param string $text text to convert * * @return array array of trigram frequencies - * @access private * @deprecated Superceded by the Text_LanguageDetect_Parser class */ - function _trigram($text) + protected function _trigram($text) { $s = new Text_LanguageDetect_Parser($text); $s->prepareTrigram(); @@ -490,9 +470,8 @@ class Text_LanguageDetect * @param array $arr array of trigram * * @return array ranks of trigrams - * @access protected */ - function _arr_rank($arr) + protected function _arr_rank($arr) { // sorts alphabetically first as a standard way of breaking rank ties @@ -520,12 +499,11 @@ class Text_LanguageDetect /** * Sorts an array by value breaking ties alphabetically * - * @param array &$arr the array to sort + * @param array $arr the array to sort * * @return void - * @access private */ - function _bub_sort(&$arr) + protected function _bub_sort(&$arr) { // should do the same as this perl statement: // sort { $trigrams{$b} == $trigrams{$a} @@ -563,9 +541,8 @@ class Text_LanguageDetect * * @return int 1 if $a is greater, -1 if not * @see _bub_sort() - * @access private */ - function _sort_func($a, $b) + protected function _sort_func($a, $b) { // each is actually a key/value pair, so that it can compare using both list($a_key, $a_value) = $a; @@ -603,9 +580,8 @@ class Text_LanguageDetect * * @return int the sum of the differences between the ranks of * the two trigram sets - * @access private */ - function _distance($arr1, $arr2) + protected function _distance($arr1, $arr2) { $sumdist = 0; @@ -636,9 +612,8 @@ class Text_LanguageDetect * * @return float the normalized score * @see _distance() - * @access private */ - function _normalize_score($score, $base_count = null) + protected function _normalize_score($score, $base_count = null) { if ($base_count === null) { $base_count = $this->_threshold; @@ -714,7 +689,7 @@ class Text_LanguageDetect $sample_obj->setPadStart(!$this->_perl_compatible); $sample_obj->analyze(); - $trigram_freqs =& $sample_obj->getTrigramRanks(); + $trigram_freqs = $sample_obj->getTrigramRanks(); $trigram_count = count($trigram_freqs); if ($trigram_count == 0) { @@ -725,7 +700,7 @@ class Text_LanguageDetect // use unicode block detection to narrow down the possibilities if ($this->_use_unicode_narrowing) { - $blocks =& $sample_obj->getUnicodeBlocks(); + $blocks = $sample_obj->getUnicodeBlocks(); if (is_array($blocks)) { $present_blocks = array_keys($blocks); @@ -977,9 +952,8 @@ class Text_LanguageDetect * * @return mixed Block name, -1 if it failed * @see unicodeBlockName() - * @access protected */ - function _unicode_block_name($unicode, $blocks, $block_count = -1) + protected function _unicode_block_name($unicode, $blocks, $block_count = -1) { // for a reference, see // http://www.unicode.org/Public/UNIDATA/Blocks.txt @@ -1030,9 +1004,8 @@ class Text_LanguageDetect * * @return array the database of unicode block definitions * @throws Text_LanguageDetect_Exception - * @access protected */ - function _read_unicode_block_db() + protected function _read_unicode_block_db() { // since the unicode definitions are always going to be the same, // might as well share the memory for the db with all other instances @@ -1151,14 +1124,13 @@ class Text_LanguageDetect * Uses a nearest neighbor technique to generate the maximum possible * number of dendograms from the similarity data. * - * @access public - * @return array language cluster data - * @throws Text_LanguageDetect_Exception - * @see languageSimilarity() - * @deprecated this function will eventually be removed and placed into + * @return array language cluster data + * @throws Text_LanguageDetect_Exception + * @see languageSimilarity() + * @deprecated this function will eventually be removed and placed into * the model generation class */ - function clusterLanguages() + public function clusterLanguages() { // todo: set the maximum number of clusters // return cached result, if any @@ -1467,7 +1439,7 @@ class Text_LanguageDetect } /** - * ut8-safe strlen() + * UTF8-safe strlen() * * Returns the numbers of characters (not bytes) in a utf8 string * @@ -1491,10 +1463,9 @@ class Text_LanguageDetect * @param string $char a utf8 (possibly multi-byte) char * * @return int unicode value - * @access protected * @link http://en.wikipedia.org/wiki/UTF-8 */ - function _utf8char2unicode($char) + protected function _utf8char2unicode($char) { // strlen() here will actually get the binary length of a single char switch (strlen($char)) { @@ -1531,20 +1502,19 @@ class Text_LanguageDetect } /** - * utf8-safe fast character iterator + * UTF8-safe fast character iterator * * Will get the next character starting from $counter, which will then be * incremented. If a multi-byte char the bytes will be concatenated and * $counter will be incremeted by the number of bytes in the char. * * @param string $str the string being iterated over - * @param int &$counter the iterator, will increment by reference + * @param int $counter the iterator, will increment by reference * @param bool $special_convert whether to do special conversions * * @return char the next (possibly multi-byte) char from $counter - * @access private */ - static function _next_char($str, &$counter, $special_convert = false) + protected static function _next_char($str, &$counter, $special_convert = false) { $char = $str{$counter++}; $ord = ord($char); @@ -1636,7 +1606,7 @@ class Text_LanguageDetect * * @return string|array Language name */ - function _convertFromNameMode($lang, $convertKey = false) + protected function _convertFromNameMode($lang, $convertKey = false) { if ($this->_name_mode == 0) { return $lang; @@ -1676,7 +1646,7 @@ class Text_LanguageDetect * * @return string|array Language name */ - function _convertToNameMode($lang, $convertKey = false) + protected function _convertToNameMode($lang, $convertKey = false) { if ($this->_name_mode == 0) { return $lang; diff --git a/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Exception.php b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Exception.php index 196d994f..cdbfe13b 100644 --- a/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Exception.php +++ b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Exception.php @@ -1,4 +1,28 @@ + * @license BSD http://www.opensource.org/licenses/bsd-license.php + * @link http://pear.php.net/package/Text_LanguageDetect/ + */ + +/** + * Part of the PEAR language detection package + * + * PHP version 5 + * + * @category Text + * @package Text_LanguageDetect + * @author Nicholas Pisarro + * @license BSD http://www.opensource.org/licenses/bsd-license.php + * @link http://pear.php.net/package/Text_LanguageDetect/ + * @link http://langdetect.blogspot.com/ + */ class Text_LanguageDetect_Exception extends Exception { /** diff --git a/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/ISO639.php b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/ISO639.php index 7caa9794..388160c4 100644 --- a/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/ISO639.php +++ b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/ISO639.php @@ -9,7 +9,6 @@ * @author Christian Weiske * @copyright 2011 Christian Weiske * @license http://www.debian.org/misc/bsd.license BSD - * @version SVN: $Id$ * @link http://pear.php.net/package/Text_LanguageDetect/ */ @@ -23,7 +22,7 @@ * @package Text_LanguageDetect * @author Christian Weiske * @copyright 2011 Christian Weiske - * @license http://www.debian.org/misc/bsd.license BSD + * @license BSD http://www.opensource.org/licenses/bsd-license.php * @link http://www.loc.gov/standards/iso639-2/php/code_list.php * * @SuppressWarnings(PHPMD) diff --git a/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Parser.php b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Parser.php index 44eea897..4f1206d0 100644 --- a/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Parser.php +++ b/plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Parser.php @@ -1,16 +1,15 @@ + * @copyright 2006 Nicholas Pisarro + * @license BSD http://www.opensource.org/licenses/bsd-license.php + * @link http://pear.php.net/package/Text_LanguageDetect/ */ /** @@ -20,99 +19,106 @@ * class. After a new profile has been built, the data can be retrieved using * the accessor functions. * - * This class is intended to be used by the Text_LanguageDetect class, not + * This class is intended to be used by the Text_LanguageDetect class, not * end-users. * - * @category Text - * @package Text_LanguageDetect - * @author Nicholas Pisarro - * @copyright 2006 - * @license BSD - * @version release: 0.3.0 + * @category Text + * @package Text_LanguageDetect + * @author Nicholas Pisarro + * @copyright 2006 Nicholas Pisarro + * @license BSD http://www.opensource.org/licenses/bsd-license.php + * @version Release: @package_version@ + * @link http://pear.php.net/package/Text_LanguageDetect/ */ class Text_LanguageDetect_Parser extends Text_LanguageDetect { /** - * the piece of text being parsed + * The piece of text being parsed * - * @access private - * @var string + * @var string */ - var $_string; + protected $_string; /** - * stores the trigram frequencies of the sample + * Stores the trigram frequencies of the sample * - * @access private - * @var string + * @var string */ - var $_trigrams = array(); + protected $_trigrams = array(); /** - * stores the trigram ranks of the sample + * Stores the trigram ranks of the sample * - * @access private - * @var array + * @var array */ - var $_trigram_ranks = array(); + protected $_trigram_ranks = array(); /** - * stores the unicode blocks of the sample + * Stores the unicode blocks of the sample * - * @access private - * @var array + * @var array */ - var $_unicode_blocks = array(); - + protected $_unicode_blocks = array(); + /** * Whether the parser should compile the unicode ranges - * - * @access private - * @var bool + * + * @var bool */ - var $_compile_unicode = false; + protected $_compile_unicode = false; /** * Whether the parser should compile trigrams * - * @access private - * @var bool + * @var bool */ - var $_compile_trigram = false; + protected $_compile_trigram = false; /** * Whether the trigram parser should pad the beginning of the string * - * @access private - * @var bool + * @var bool */ - var $_trigram_pad_start = false; + protected $_trigram_pad_start = false; /** * Whether the unicode parser should skip non-alphabetical ascii chars * - * @access private - * @var bool + * @var bool */ - var $_unicode_skip_symbols = true; + protected $_unicode_skip_symbols = true; /** * Constructor * - * @access private - * @param string $string string to be parsed + * @param string $string string to be parsed */ - function Text_LanguageDetect_Parser($string) { + public function __construct($string) + { $this->_string = $string; } + /** + * PHP 4 constructor for backwards compatibility. + * + * @param string $string string to be parsed + * + * @return void + */ + public function Text_LanguageDetect_Parser($string) + { + self::__construct($string); + } + /** * Returns true if a string is suitable for parsing * - * @param string $str input string to test - * @return bool true if acceptable, false if not + * @param string $str input string to test + * + * @return bool true if acceptable, false if not */ - public static function validateString($str) { + public static function validateString($str) + { if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { return true; } else { @@ -121,34 +127,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect } /** - * turn on/off trigram counting + * Turn on/off trigram counting * - * @access public - * @param bool $bool true for on, false for off + * @param bool $bool true for on, false for off + * + * @return void */ - function prepareTrigram($bool = true) + public function prepareTrigram($bool = true) { $this->_compile_trigram = $bool; } /** - * turn on/off unicode block counting + * Turn on/off unicode block counting + * + * @param bool $bool true for on, false for off * - * @access public - * @param bool $bool true for on, false for off + * @return void */ - function prepareUnicode($bool = true) + public function prepareUnicode($bool = true) { $this->_compile_unicode = $bool; } /** - * turn on/off padding the beginning of the sample string + * Turn on/off padding the beginning of the sample string + * + * @param bool $bool true for on, false for off * - * @access public - * @param bool $bool true for on, false for off + * @return void */ - function setPadStart($bool = true) + public function setPadStart($bool = true) { $this->_trigram_pad_start = $bool; } @@ -156,10 +165,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect /** * Should the unicode block counter skip non-alphabetical ascii chars? * - * @access public - * @param bool $bool true for on, false for off + * @param bool $bool true for on, false for off + * + * @return void */ - function setUnicodeSkipSymbols($bool = true) + public function setUnicodeSkipSymbols($bool = true) { $this->_unicode_skip_symbols = $bool; } @@ -167,10 +177,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect /** * Returns the trigram ranks for the text sample * - * @access public - * @return array trigram ranks in the text sample + * @return array Trigram ranks in the text sample */ - function &getTrigramRanks() + public function getTrigramRanks() { return $this->_trigram_ranks; } @@ -178,39 +187,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect /** * Return the trigram freqency table * - * only used in testing to make sure the parser is working + * Only used in testing to make sure the parser is working * - * @access public - * @return array trigram freqencies in the text sample + * @return array Trigram freqencies in the text sample */ - function &getTrigramFreqs() + public function getTrigramFreqs() { return $this->_trigram; } /** - * returns the array of unicode blocks + * Returns the array of unicode blocks * - * @access public - * @return array unicode blocks in the text sample + * @return array Unicode blocks in the text sample */ - function &getUnicodeBlocks() + public function getUnicodeBlocks() { return $this->_unicode_blocks; } /** * Executes the parsing operation - * - * Be sure to call the set*() functions to set options and the + * + * Be sure to call the set*() functions to set options and the * prepare*() functions first to tell it what kind of data to compute * * Afterwards the get*() functions can be used to access the compiled * information. * - * @access public + * @return void */ - function analyze() + public function analyze() { $len = strlen($this->_string); $byte_counter = 0; @@ -258,9 +265,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect if ($this->_compile_trigram) { if (!($b == ' ' && ($a == ' ' || $char == ' '))) { if (!isset($this->_trigram[$a . $b . $char])) { - $this->_trigram[$a . $b . $char] = 1; + $this->_trigram[$a . $b . $char] = 1; } else { - $this->_trigram[$a . $b . $char]++; + $this->_trigram[$a . $b . $char]++; } } @@ -271,10 +278,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect // unicode block detection if ($this->_compile_unicode) { if ($this->_unicode_skip_symbols - && strlen($char) == 1 - && ($char < 'A' || $char > 'z' - || ($char > 'Z' && $char < 'a')) - && $char != "'") { // does not skip the apostrophe + && strlen($char) == 1 + && ($char < 'A' || $char > 'z' + || ($char > 'Z' && $char < 'a')) + && $char != "'" + ) { // does not skip the apostrophe // since it's included in the language // models @@ -297,7 +305,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect if ($this->_compile_unicode) { foreach ($unicode_chars as $utf8_char => $count) { $search_result = $this->_unicode_block_name( - $this->_utf8char2unicode($utf8_char), $blocks, $block_count); + $this->_utf8char2unicode($utf8_char), $blocks, $block_count + ); if ($search_result != -1) { $block_name = $search_result[2]; diff --git a/plugins/af_lang_detect/languagedetect/data/build-unicode_blocks.php b/plugins/af_lang_detect/languagedetect/data/build-unicode_blocks.php new file mode 100644 index 00000000..128e9ed9 --- /dev/null +++ b/plugins/af_lang_detect/languagedetect/data/build-unicode_blocks.php @@ -0,0 +1,7 @@ + + array ( + 0 => 0x0000, + 1 => 0x007F, + 2 => 'Basic Latin', + ), + 1 => + array ( + 0 => 0x0080, + 1 => 0x00FF, + 2 => 'Latin-1 Supplement', + ), + 2 => + array ( + 0 => 0x0100, + 1 => 0x017F, + 2 => 'Latin Extended-A', + ), + 3 => + array ( + 0 => 0x0180, + 1 => 0x024F, + 2 => 'Latin Extended-B', + ), + 4 => + array ( + 0 => 0x0250, + 1 => 0x02AF, + 2 => 'IPA Extensions', + ), + 5 => + array ( + 0 => 0x02B0, + 1 => 0x02FF, + 2 => 'Spacing Modifier Letters', + ), + 6 => + array ( + 0 => 0x0300, + 1 => 0x036F, + 2 => 'Combining Diacritical Marks', + ), + 7 => + array ( + 0 => 0x0370, + 1 => 0x03FF, + 2 => 'Greek and Coptic', + ), + 8 => + array ( + 0 => 0x0400, + 1 => 0x04FF, + 2 => 'Cyrillic', + ), + 9 => + array ( + 0 => 0x0500, + 1 => 0x052F, + 2 => 'Cyrillic Supplement', + ), + 10 => + array ( + 0 => 0x0530, + 1 => 0x058F, + 2 => 'Armenian', + ), + 11 => + array ( + 0 => 0x0590, + 1 => 0x05FF, + 2 => 'Hebrew', + ), + 12 => + array ( + 0 => 0x0600, + 1 => 0x06FF, + 2 => 'Arabic', + ), + 13 => + array ( + 0 => 0x0700, + 1 => 0x074F, + 2 => 'Syriac', + ), + 14 => + array ( + 0 => 0x0750, + 1 => 0x077F, + 2 => 'Arabic Supplement', + ), + 15 => + array ( + 0 => 0x0780, + 1 => 0x07BF, + 2 => 'Thaana', + ), + 16 => + array ( + 0 => 0x0900, + 1 => 0x097F, + 2 => 'Devanagari', + ), + 17 => + array ( + 0 => 0x0980, + 1 => 0x09FF, + 2 => 'Bengali', + ), + 18 => + array ( + 0 => 0x0A00, + 1 => 0x0A7F, + 2 => 'Gurmukhi', + ), + 19 => + array ( + 0 => 0x0A80, + 1 => 0x0AFF, + 2 => 'Gujarati', + ), + 20 => + array ( + 0 => 0x0B00, + 1 => 0x0B7F, + 2 => 'Oriya', + ), + 21 => + array ( + 0 => 0x0B80, + 1 => 0x0BFF, + 2 => 'Tamil', + ), + 22 => + array ( + 0 => 0x0C00, + 1 => 0x0C7F, + 2 => 'Telugu', + ), + 23 => + array ( + 0 => 0x0C80, + 1 => 0x0CFF, + 2 => 'Kannada', + ), + 24 => + array ( + 0 => 0x0D00, + 1 => 0x0D7F, + 2 => 'Malayalam', + ), + 25 => + array ( + 0 => 0x0D80, + 1 => 0x0DFF, + 2 => 'Sinhala', + ), + 26 => + array ( + 0 => 0x0E00, + 1 => 0x0E7F, + 2 => 'Thai', + ), + 27 => + array ( + 0 => 0x0E80, + 1 => 0x0EFF, + 2 => 'Lao', + ), + 28 => + array ( + 0 => 0x0F00, + 1 => 0x0FFF, + 2 => 'Tibetan', + ), + 29 => + array ( + 0 => 0x1000, + 1 => 0x109F, + 2 => 'Myanmar', + ), + 30 => + array ( + 0 => 0x10A0, + 1 => 0x10FF, + 2 => 'Georgian', + ), + 31 => + array ( + 0 => 0x1100, + 1 => 0x11FF, + 2 => 'Hangul Jamo', + ), + 32 => + array ( + 0 => 0x1200, + 1 => 0x137F, + 2 => 'Ethiopic', + ), + 33 => + array ( + 0 => 0x1380, + 1 => 0x139F, + 2 => 'Ethiopic Supplement', + ), + 34 => + array ( + 0 => 0x13A0, + 1 => 0x13FF, + 2 => 'Cherokee', + ), + 35 => + array ( + 0 => 0x1400, + 1 => 0x167F, + 2 => 'Unified Canadian Aboriginal Syllabics', + ), + 36 => + array ( + 0 => 0x1680, + 1 => 0x169F, + 2 => 'Ogham', + ), + 37 => + array ( + 0 => 0x16A0, + 1 => 0x16FF, + 2 => 'Runic', + ), + 38 => + array ( + 0 => 0x1700, + 1 => 0x171F, + 2 => 'Tagalog', + ), + 39 => + array ( + 0 => 0x1720, + 1 => 0x173F, + 2 => 'Hanunoo', + ), + 40 => + array ( + 0 => 0x1740, + 1 => 0x175F, + 2 => 'Buhid', + ), + 41 => + array ( + 0 => 0x1760, + 1 => 0x177F, + 2 => 'Tagbanwa', + ), + 42 => + array ( + 0 => 0x1780, + 1 => 0x17FF, + 2 => 'Khmer', + ), + 43 => + array ( + 0 => 0x1800, + 1 => 0x18AF, + 2 => 'Mongolian', + ), + 44 => + array ( + 0 => 0x1900, + 1 => 0x194F, + 2 => 'Limbu', + ), + 45 => + array ( + 0 => 0x1950, + 1 => 0x197F, + 2 => 'Tai Le', + ), + 46 => + array ( + 0 => 0x1980, + 1 => 0x19DF, + 2 => 'New Tai Lue', + ), + 47 => + array ( + 0 => 0x19E0, + 1 => 0x19FF, + 2 => 'Khmer Symbols', + ), + 48 => + array ( + 0 => 0x1A00, + 1 => 0x1A1F, + 2 => 'Buginese', + ), + 49 => + array ( + 0 => 0x1D00, + 1 => 0x1D7F, + 2 => 'Phonetic Extensions', + ), + 50 => + array ( + 0 => 0x1D80, + 1 => 0x1DBF, + 2 => 'Phonetic Extensions Supplement', + ), + 51 => + array ( + 0 => 0x1DC0, + 1 => 0x1DFF, + 2 => 'Combining Diacritical Marks Supplement', + ), + 52 => + array ( + 0 => 0x1E00, + 1 => 0x1EFF, + 2 => 'Latin Extended Additional', + ), + 53 => + array ( + 0 => 0x1F00, + 1 => 0x1FFF, + 2 => 'Greek Extended', + ), + 54 => + array ( + 0 => 0x2000, + 1 => 0x206F, + 2 => 'General Punctuation', + ), + 55 => + array ( + 0 => 0x2070, + 1 => 0x209F, + 2 => 'Superscripts and Subscripts', + ), + 56 => + array ( + 0 => 0x20A0, + 1 => 0x20CF, + 2 => 'Currency Symbols', + ), + 57 => + array ( + 0 => 0x20D0, + 1 => 0x20FF, + 2 => 'Combining Diacritical Marks for Symbols', + ), + 58 => + array ( + 0 => 0x2100, + 1 => 0x214F, + 2 => 'Letterlike Symbols', + ), + 59 => + array ( + 0 => 0x2150, + 1 => 0x218F, + 2 => 'Number Forms', + ), + 60 => + array ( + 0 => 0x2190, + 1 => 0x21FF, + 2 => 'Arrows', + ), + 61 => + array ( + 0 => 0x2200, + 1 => 0x22FF, + 2 => 'Mathematical Operators', + ), + 62 => + array ( + 0 => 0x2300, + 1 => 0x23FF, + 2 => 'Miscellaneous Technical', + ), + 63 => + array ( + 0 => 0x2400, + 1 => 0x243F, + 2 => 'Control Pictures', + ), + 64 => + array ( + 0 => 0x2440, + 1 => 0x245F, + 2 => 'Optical Character Recognition', + ), + 65 => + array ( + 0 => 0x2460, + 1 => 0x24FF, + 2 => 'Enclosed Alphanumerics', + ), + 66 => + array ( + 0 => 0x2500, + 1 => 0x257F, + 2 => 'Box Drawing', + ), + 67 => + array ( + 0 => 0x2580, + 1 => 0x259F, + 2 => 'Block Elements', + ), + 68 => + array ( + 0 => 0x25A0, + 1 => 0x25FF, + 2 => 'Geometric Shapes', + ), + 69 => + array ( + 0 => 0x2600, + 1 => 0x26FF, + 2 => 'Miscellaneous Symbols', + ), + 70 => + array ( + 0 => 0x2700, + 1 => 0x27BF, + 2 => 'Dingbats', + ), + 71 => + array ( + 0 => 0x27C0, + 1 => 0x27EF, + 2 => 'Miscellaneous Mathematical Symbols-A', + ), + 72 => + array ( + 0 => 0x27F0, + 1 => 0x27FF, + 2 => 'Supplemental Arrows-A', + ), + 73 => + array ( + 0 => 0x2800, + 1 => 0x28FF, + 2 => 'Braille Patterns', + ), + 74 => + array ( + 0 => 0x2900, + 1 => 0x297F, + 2 => 'Supplemental Arrows-B', + ), + 75 => + array ( + 0 => 0x2980, + 1 => 0x29FF, + 2 => 'Miscellaneous Mathematical Symbols-B', + ), + 76 => + array ( + 0 => 0x2A00, + 1 => 0x2AFF, + 2 => 'Supplemental Mathematical Operators', + ), + 77 => + array ( + 0 => 0x2B00, + 1 => 0x2BFF, + 2 => 'Miscellaneous Symbols and Arrows', + ), + 78 => + array ( + 0 => 0x2C00, + 1 => 0x2C5F, + 2 => 'Glagolitic', + ), + 79 => + array ( + 0 => 0x2C80, + 1 => 0x2CFF, + 2 => 'Coptic', + ), + 80 => + array ( + 0 => 0x2D00, + 1 => 0x2D2F, + 2 => 'Georgian Supplement', + ), + 81 => + array ( + 0 => 0x2D30, + 1 => 0x2D7F, + 2 => 'Tifinagh', + ), + 82 => + array ( + 0 => 0x2D80, + 1 => 0x2DDF, + 2 => 'Ethiopic Extended', + ), + 83 => + array ( + 0 => 0x2E00, + 1 => 0x2E7F, + 2 => 'Supplemental Punctuation', + ), + 84 => + array ( + 0 => 0x2E80, + 1 => 0x2EFF, + 2 => 'CJK Radicals Supplement', + ), + 85 => + array ( + 0 => 0x2F00, + 1 => 0x2FDF, + 2 => 'Kangxi Radicals', + ), + 86 => + array ( + 0 => 0x2FF0, + 1 => 0x2FFF, + 2 => 'Ideographic Description Characters', + ), + 87 => + array ( + 0 => 0x3000, + 1 => 0x303F, + 2 => 'CJK Symbols and Punctuation', + ), + 88 => + array ( + 0 => 0x3040, + 1 => 0x309F, + 2 => 'Hiragana', + ), + 89 => + array ( + 0 => 0x30A0, + 1 => 0x30FF, + 2 => 'Katakana', + ), + 90 => + array ( + 0 => 0x3100, + 1 => 0x312F, + 2 => 'Bopomofo', + ), + 91 => + array ( + 0 => 0x3130, + 1 => 0x318F, + 2 => 'Hangul Compatibility Jamo', + ), + 92 => + array ( + 0 => 0x3190, + 1 => 0x319F, + 2 => 'Kanbun', + ), + 93 => + array ( + 0 => 0x31A0, + 1 => 0x31BF, + 2 => 'Bopomofo Extended', + ), + 94 => + array ( + 0 => 0x31C0, + 1 => 0x31EF, + 2 => 'CJK Strokes', + ), + 95 => + array ( + 0 => 0x31F0, + 1 => 0x31FF, + 2 => 'Katakana Phonetic Extensions', + ), + 96 => + array ( + 0 => 0x3200, + 1 => 0x32FF, + 2 => 'Enclosed CJK Letters and Months', + ), + 97 => + array ( + 0 => 0x3300, + 1 => 0x33FF, + 2 => 'CJK Compatibility', + ), + 98 => + array ( + 0 => 0x3400, + 1 => 0x4DBF, + 2 => 'CJK Unified Ideographs Extension A', + ), + 99 => + array ( + 0 => 0x4DC0, + 1 => 0x4DFF, + 2 => 'Yijing Hexagram Symbols', + ), + 100 => + array ( + 0 => 0x4E00, + 1 => 0x9FFF, + 2 => 'CJK Unified Ideographs', + ), + 101 => + array ( + 0 => 0xA000, + 1 => 0xA48F, + 2 => 'Yi Syllables', + ), + 102 => + array ( + 0 => 0xA490, + 1 => 0xA4CF, + 2 => 'Yi Radicals', + ), + 103 => + array ( + 0 => 0xA700, + 1 => 0xA71F, + 2 => 'Modifier Tone Letters', + ), + 104 => + array ( + 0 => 0xA800, + 1 => 0xA82F, + 2 => 'Syloti Nagri', + ), + 105 => + array ( + 0 => 0xAC00, + 1 => 0xD7AF, + 2 => 'Hangul Syllables', + ), + 106 => + array ( + 0 => 0xD800, + 1 => 0xDB7F, + 2 => 'High Surrogates', + ), + 107 => + array ( + 0 => 0xDB80, + 1 => 0xDBFF, + 2 => 'High Private Use Surrogates', + ), + 108 => + array ( + 0 => 0xDC00, + 1 => 0xDFFF, + 2 => 'Low Surrogates', + ), + 109 => + array ( + 0 => 0xE000, + 1 => 0xF8FF, + 2 => 'Private Use Area', + ), + 110 => + array ( + 0 => 0xF900, + 1 => 0xFAFF, + 2 => 'CJK Compatibility Ideographs', + ), + 111 => + array ( + 0 => 0xFB00, + 1 => 0xFB4F, + 2 => 'Alphabetic Presentation Forms', + ), + 112 => + array ( + 0 => 0xFB50, + 1 => 0xFDFF, + 2 => 'Arabic Presentation Forms-A', + ), + 113 => + array ( + 0 => 0xFE00, + 1 => 0xFE0F, + 2 => 'Variation Selectors', + ), + 114 => + array ( + 0 => 0xFE10, + 1 => 0xFE1F, + 2 => 'Vertical Forms', + ), + 115 => + array ( + 0 => 0xFE20, + 1 => 0xFE2F, + 2 => 'Combining Half Marks', + ), + 116 => + array ( + 0 => 0xFE30, + 1 => 0xFE4F, + 2 => 'CJK Compatibility Forms', + ), + 117 => + array ( + 0 => 0xFE50, + 1 => 0xFE6F, + 2 => 'Small Form Variants', + ), + 118 => + array ( + 0 => 0xFE70, + 1 => 0xFEFF, + 2 => 'Arabic Presentation Forms-B', + ), + 119 => + array ( + 0 => 0xFF00, + 1 => 0xFFEF, + 2 => 'Halfwidth and Fullwidth Forms', + ), + 120 => + array ( + 0 => 0xFFF0, + 1 => 0xFFFF, + 2 => 'Specials', + ), + 121 => + array ( + 0 => 0x10000, + 1 => 0x1007F, + 2 => 'Linear B Syllabary', + ), + 122 => + array ( + 0 => 0x10080, + 1 => 0x100FF, + 2 => 'Linear B Ideograms', + ), + 123 => + array ( + 0 => 0x10100, + 1 => 0x1013F, + 2 => 'Aegean Numbers', + ), + 124 => + array ( + 0 => 0x10140, + 1 => 0x1018F, + 2 => 'Ancient Greek Numbers', + ), + 125 => + array ( + 0 => 0x10300, + 1 => 0x1032F, + 2 => 'Old Italic', + ), + 126 => + array ( + 0 => 0x10330, + 1 => 0x1034F, + 2 => 'Gothic', + ), + 127 => + array ( + 0 => 0x10380, + 1 => 0x1039F, + 2 => 'Ugaritic', + ), + 128 => + array ( + 0 => 0x103A0, + 1 => 0x103DF, + 2 => 'Old Persian', + ), + 129 => + array ( + 0 => 0x10400, + 1 => 0x1044F, + 2 => 'Deseret', + ), + 130 => + array ( + 0 => 0x10450, + 1 => 0x1047F, + 2 => 'Shavian', + ), + 131 => + array ( + 0 => 0x10480, + 1 => 0x104AF, + 2 => 'Osmanya', + ), + 132 => + array ( + 0 => 0x10800, + 1 => 0x1083F, + 2 => 'Cypriot Syllabary', + ), + 133 => + array ( + 0 => 0x10A00, + 1 => 0x10A5F, + 2 => 'Kharoshthi', + ), + 134 => + array ( + 0 => 0x1D000, + 1 => 0x1D0FF, + 2 => 'Byzantine Musical Symbols', + ), + 135 => + array ( + 0 => 0x1D100, + 1 => 0x1D1FF, + 2 => 'Musical Symbols', + ), + 136 => + array ( + 0 => 0x1D200, + 1 => 0x1D24F, + 2 => 'Ancient Greek Musical Notation', + ), + 137 => + array ( + 0 => 0x1D300, + 1 => 0x1D35F, + 2 => 'Tai Xuan Jing Symbols', + ), + 138 => + array ( + 0 => 0x1D400, + 1 => 0x1D7FF, + 2 => 'Mathematical Alphanumeric Symbols', + ), + 139 => + array ( + 0 => 0x20000, + 1 => 0x2A6DF, + 2 => 'CJK Unified Ideographs Extension B', + ), + 140 => + array ( + 0 => 0x2F800, + 1 => 0x2FA1F, + 2 => 'CJK Compatibility Ideographs Supplement', + ), + 141 => + array ( + 0 => 0xE0000, + 1 => 0xE007F, + 2 => 'Tags', + ), + 142 => + array ( + 0 => 0xE0100, + 1 => 0xE01EF, + 2 => 'Variation Selectors Supplement', + ), + 143 => + array ( + 0 => 0xF0000, + 1 => 0xFFFFF, + 2 => 'Supplementary Private Use Area-A', + ), + 144 => + array ( + 0 => 0x100000, + 1 => 0x10FFFF, + 2 => 'Supplementary Private Use Area-B', + ), +); -- 2.39.2