]> git.wh0rd.org - tt-rss.git/blob - plugins/af_lang_detect/languagedetect/Text/LanguageDetect/Parser.php
44eea897f03674fc20d263b7d0e424262bfbba69
[tt-rss.git] / plugins / af_lang_detect / languagedetect / Text / LanguageDetect / Parser.php
1 <?php
2
3 /**
4 * This class represents a text sample to be parsed.
5 *
6 * @category Text
7 * @package Text_LanguageDetect
8 * @author Nicholas Pisarro
9 * @copyright 2006
10 * @license BSD
11 * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/
14 */
15
16 /**
17 * This class represents a text sample to be parsed.
18 *
19 * This separates the analysis of a text sample from the primary LanguageDetect
20 * class. After a new profile has been built, the data can be retrieved using
21 * the accessor functions.
22 *
23 * This class is intended to be used by the Text_LanguageDetect class, not
24 * end-users.
25 *
26 * @category Text
27 * @package Text_LanguageDetect
28 * @author Nicholas Pisarro
29 * @copyright 2006
30 * @license BSD
31 * @version release: 0.3.0
32 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * @access private
     * @var string
     */
    var $_string;

    /**
     * Unused legacy property.
     *
     * Nothing in this class reads or writes it; the frequency table actually
     * lives in $_trigram. It is kept only so that outside code referencing
     * the old name keeps working.
     *
     * @access private
     * @var array
     */
    var $_trigrams = array();

    /**
     * stores the trigram frequencies of the sample (trigram => count)
     *
     * Declared explicitly so that analyze() and getTrigramFreqs() no longer
     * rely on a dynamically created property.
     *
     * @access private
     * @var array
     */
    var $_trigram = array();

    /**
     * stores the trigram ranks of the sample
     *
     * @access private
     * @var array
     */
    var $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample (block name => char count)
     *
     * @access private
     * @var array
     */
    var $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var bool
     */
    var $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var bool
     */
    var $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var bool
     */
    var $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var bool
     */
    var $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * @access private
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * Legacy PHP4-style constructor name.
     *
     * PHP 8 no longer treats a method named after the class as a
     * constructor, so the real work lives in __construct(). This shim keeps
     * any explicit calls to the old name working.
     *
     * @access private
     * @param string $string string to be parsed
     */
    public function Text_LanguageDetect_Parser($string)
    {
        $this->__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is longer than 3 bytes and contains at
     * least one non-whitespace character.
     *
     * @param string $str input string to test
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * turn on/off trigram counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @access public
     * @return array trigram frequencies in the text sample
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * Counts are added to whatever is already stored in the frequency and
     * block tables; nothing is reset at the start of a call.
     *
     * @access public
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // Trigram that was only produced by the implicit start padding and
        // has to be taken out again after the main loop (null = none).
        $dropone = null;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option
            // is off, remember that trigram so it can be reduced after
            // parsing has finished
            if (!$this->_trigram_pad_start && $len > 0) {
                // _next_char() advances $byte_counter (the main loop below
                // depends on that), so this is a look-ahead that must be
                // rewound afterwards.
                $first = $this->_next_char($this->_string, $byte_counter, true);

                // only look at a second character if the sample has one;
                // otherwise there is no start-padded trigram to drop
                if ($first != ' ' && $byte_counter < $len) {
                    $second = $this->_next_char(
                        $this->_string, $byte_counter, true
                    );
                    $dropone = " $first$second";
                }

                $byte_counter = 0;
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip any trigram that would contain two adjacent spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (!isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram] = 1;
                    } else {
                        $this->_trigram[$trigram]++;
                    }
                }

                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    // does not skip the apostrophe since it's included in
                    // the language models
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                // PHP turns digit-only array keys such as '5' into integers;
                // cast back so the helper receives a string again.
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode((string) $utf8_char),
                    $blocks,
                    $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // The isset() guard keeps a trigram that was never counted from
            // being inserted into the table as a junk entry.
            if ($dropone !== null && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
346
347 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */