]> git.wh0rd.org - tt-rss.git/blob - plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Parser.php
Update af_lang_detect plugin with changes from upstream
[tt-rss.git] / plugins / af_lang_detect / languagedetect / Text / LanguageDetect / Parser.php
1 <?php
2 /**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
10 * @copyright 2006 Nicholas Pisarro
11 * @license BSD http://www.opensource.org/licenses/bsd-license.php
12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 */
14
15 /**
16 * This class represents a text sample to be parsed.
17 *
18 * This separates the analysis of a text sample from the primary LanguageDetect
19 * class. After a new profile has been built, the data can be retrieved using
20 * the accessor functions.
21 *
22 * This class is intended to be used by the Text_LanguageDetect class, not
23 * end-users.
24 *
25 * @category Text
26 * @package Text_LanguageDetect
27 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
28 * @copyright 2006 Nicholas Pisarro
29 * @license BSD http://www.opensource.org/licenses/bsd-license.php
30 * @version Release: @package_version@
31 * @link http://pear.php.net/package/Text_LanguageDetect/
32 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed
     *
     * @var string
     */
    protected $_string;

    /**
     * Stores the trigram frequencies of the sample
     *
     * Keys are three-character strings, values are occurrence counts.
     * This is the property analyze() and getTrigramFreqs() operate on; it is
     * declared here so that it is never created dynamically and so that
     * getTrigramFreqs() returns an empty array before analyze() has run.
     *
     * @var array
     */
    protected $_trigram = array();

    /**
     * Stores the trigram ranks of the sample
     *
     * @var array
     */
    protected $_trigram_ranks = array();

    /**
     * Stores the unicode blocks of the sample
     *
     * Keys are block names, values are character counts.
     *
     * @var array
     */
    protected $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @var bool
     */
    protected $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @var bool
     */
    protected $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @var bool
     */
    protected $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @var bool
     */
    protected $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * PHP 4 constructor for backwards compatibility.
     *
     * Simply forwards to __construct().
     *
     * @param string $string string to be parsed
     *
     * @return void
     */
    public function Text_LanguageDetect_Parser($string)
    {
        self::__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is longer than three bytes and contains
     * at least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str);
    }

    /**
     * Turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @return array Trigram ranks in the text sample
     */
    public function getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * Only used in testing to make sure the parser is working
     *
     * @return array Trigram frequencies in the text sample
     */
    public function getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks
     *
     * @return array Unicode blocks in the text sample
     */
    public function getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * The helpers _next_char(), _read_unicode_block_db(),
     * _unicode_block_name(), _utf8char2unicode() and _arr_rank() are not
     * defined in this class; presumably they are inherited from
     * Text_LanguageDetect.
     *
     * @return void
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $skipped_count = 0;
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                // peek at the first two characters of the sample
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // rewind so the main loop starts from the beginning again
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip any trigram that would contain two adjacent spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (!isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram] = 1;
                    } else {
                        $this->_trigram[$trigram]++;
                    }
                }

                // slide the two-character window forward
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {  // does not skip the apostrophe
                     // since it's included in the language
                     // models

                    $skipped_count++;
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            // fold the per-character counts into per-block counts
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // the leading trigram may never have been recorded for very short
            // samples, so only adjust it when it is actually present
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
355
356 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */