<?php
/**
 * Part of Text_LanguageDetect
 *
 * PHP version 5
 *
 * @category  Text
 * @package   Text_LanguageDetect
 * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
 * @copyright 2006 Nicholas Pisarro
 * @license   BSD http://www.opensource.org/licenses/bsd-license.php
 * @link      http://pear.php.net/package/Text_LanguageDetect/
 */

/**
 * This class represents a text sample to be parsed.
 *
 * This separates the analysis of a text sample from the primary LanguageDetect
 * class. After a new profile has been built, the data can be retrieved using
 * the accessor functions.
 *
 * This class is intended to be used by the Text_LanguageDetect class, not
 * end-users.
 *
 * @category  Text
 * @package   Text_LanguageDetect
 * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
 * @copyright 2006 Nicholas Pisarro
 * @license   BSD http://www.opensource.org/licenses/bsd-license.php
 * @version   Release: @package_version@
 * @link      http://pear.php.net/package/Text_LanguageDetect/
 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed
     *
     * @var string
     */
    protected $_string;

    /**
     * Stores the trigram frequencies of the sample
     *
     * Maps each trigram found by analyze() to the number of times it was
     * counted. This is the property every method of the class reads and
     * writes; it must be declared under exactly this name so that
     * getTrigramFreqs() returns an array even before analyze() has run.
     *
     * @var array
     */
    protected $_trigram = array();

    /**
     * Stores the trigram ranks of the sample
     *
     * Filled by analyze() with the result of _arr_rank() applied to the
     * trigram frequency table.
     *
     * @var array
     */
    protected $_trigram_ranks = array();

    /**
     * Stores the unicode blocks of the sample
     *
     * Maps a block name to the number of characters of the sample that
     * were attributed to that block.
     *
     * @var array
     */
    protected $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @var bool
     */
    protected $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @var bool
     */
    protected $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @var bool
     */
    protected $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @var bool
     */
    protected $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * PHP 4 constructor for backwards compatibility.
     *
     * @param string $string string to be parsed
     *
     * @return void
     */
    public function Text_LanguageDetect_Parser($string)
    {
        self::__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is non-empty, longer than three bytes
     * and contains at least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        // "&&" always yields a boolean, so the integer result of
        // preg_match() never leaks to the caller
        return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str);
    }

    /**
     * Turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @return array Trigram ranks in the text sample
     */
    public function getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * Only used in testing to make sure the parser is working
     *
     * @return array Trigram frequencies in the text sample
     */
    public function getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks
     *
     * @return array Unicode blocks in the text sample
     */
    public function getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * The frequency tables are not reset here, so calling this method more
     * than once on the same object adds to the previous counts.
     *
     * @return void
     */
    public function analyze()
    {
        $len          = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks        = $this->_read_unicode_block_db();
            $block_count   = count($blocks);
            $unicode_chars = array();
        }

        // trigram startup
        $dropone = null;
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams that contain two adjacent spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if the sample starts with a valid trigram and the start pad
            // option is off, remember that leading (space-padded) trigram so
            // it can be taken out again after parsing has finished
            if (!$this->_trigram_pad_start && $len > 0) {
                $first = $this->_next_char($this->_string, $byte_counter, true);

                if ($first != ' ') {
                    // never read past the end of the sample: when the first
                    // character is also the last one, the end pad (a space)
                    // takes the place of the second character
                    if ($byte_counter < $len) {
                        $second = $this->_next_char(
                            $this->_string, $byte_counter, true
                        );
                    } else {
                        $second = ' ';
                    }

                    $dropone = " $first$second";
                }

                // rewind so the main loop starts at the first byte again
                $byte_counter = 0;
            }
        }

        while ($byte_counter < $len) {
            // _next_char() advances $byte_counter past the character it read
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // ignore trigrams whose middle character is a space that
                // sits next to another space
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                // skip single-byte characters outside A-Z / a-z, except the
                // apostrophe, which is included in the language models
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            // look every distinct character up once and add its whole count
            // to the block it belongs to
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                $last = "$a$b ";

                if (isset($this->_trigram[$last])) {
                    $this->_trigram[$last]++;
                } else {
                    $this->_trigram[$last] = 1;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // only touch the entry when it really exists, so no empty
            // element is ever inserted into the frequency table
            if ($dropone !== null && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */