]>
Commit | Line | Data |
---|---|---|
853cc128 AD |
1 | <?php |
2 | /* | |
3 | ***** BEGIN LICENSE BLOCK ***** | |
4 | This file is part of PHP Naive Bayesian Filter. | |
5 | ||
6 | The Initial Developer of the Original Code is | |
7 | Loic d'Anterroches [loic_at_xhtml.net]. | |
8 | Portions created by the Initial Developer are Copyright (C) 2003 | |
9 | the Initial Developer. All Rights Reserved. | |
10 | ||
11 | Contributor(s): | |
12 | See the source | |
13 | ||
14 | PHP Naive Bayesian Filter is free software; you can redistribute it | |
15 | and/or modify it under the terms of the GNU General Public License as | |
16 | published by the Free Software Foundation; either version 2 of | |
17 | the License, or (at your option) any later version. | |
18 | ||
19 | PHP Naive Bayesian Filter is distributed in the hope that it will | |
20 | be useful, but WITHOUT ANY WARRANTY; without even the implied | |
21 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
22 | See the GNU General Public License for more details. | |
23 | ||
24 | You should have received a copy of the GNU General Public License | |
25 | along with PHP Naive Bayesian Filter; if not, write to the Free Software | |
26 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
27 | ||
28 | Alternatively, the contents of this file may be used under the terms of | |
29 | the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | |
30 | in which case the provisions of the LGPL are applicable instead | |
31 | of those above. | |
32 | ||
33 | ***** END LICENSE BLOCK ***** | |
34 | */ | |
35 | ||
/** Naive Bayesian text classifier.

    Tokenizes documents and scores them against the categories known to a
    storage backend. All persistence (categories, word counts, trained
    references) is delegated to the storage object given to the constructor;
    this class only calls the methods getCategories(), wordExists(),
    getWord(), updateWord(), removeWord(), getReference(), saveReference(),
    removeReference() and updateProbabilities() on it.
*/
class NaiveBayesian {
    /** min token length for it to be taken into consideration */
    public $min_token_length = 3;
    /** max token length for it to be taken into consideration */
    public $max_token_length = 15;
    /** list of token to ignore; filled from getIgnoreList() on first
        tokenization when left empty
        @see getIgnoreList()
    */
    public $ignore_list = array();
    /** storage object
        @see class NaiveBayesianStorage
    */
    public $nbs = null;

    /** constructor.

        @param object storage backend (see class NaiveBayesianStorage)
    */
    public function __construct($nbs) {
        $this->nbs = $nbs;
    }

    /** legacy PHP 4 style initializer.
        PHP 8 no longer treats a method named after the class as the
        constructor, so the real work lives in __construct(). This method is
        kept so that existing explicit calls such as
        parent::NaiveBayesian($nbs) keep working.

        @return bool always true
        @param object storage backend
    */
    public function NaiveBayesian($nbs) {
        $this->nbs = $nbs;

        return true;
    }

    /** categorize a document.
        Get list of categories in which the document can be categorized
        with a score for each category.

        @return array keys = category ids, values = scores
        @param string document
    */
    public function categorize($document) {
        $scores     = array();
        $categories = $this->nbs->getCategories();
        $tokens     = $this->_getTokens($document);

        // total number of words and number of categories, used below as a
        // per-token scaling factor
        $total_words = 0;
        $ncat        = 0;

        foreach ($categories as $data) {
            $total_words += $data['word_count'];
            $ncat++;
        }

        // calculate the score in each category
        foreach ($categories as $category => $data) {
            $scores[$category] = $data['probability'];

            // small probability for a word not in the category
            // maybe putting 1.0 as a 'no effect' word can also be good
            if ($data['word_count'] > 0) {
                $small_proba = 1.0 / ($data['word_count'] * 2);
            }
            else {
                $small_proba = 0;
            }

            foreach ($tokens as $token => $count) {
                // words never seen in any training document are skipped
                if (!$this->nbs->wordExists($token)) {
                    continue;
                }

                $word = $this->nbs->getWord($token, $category);

                // the word_count guard avoids a division by zero when the
                // stored word count and category total disagree; the
                // fallback then contributes 0, exactly like the division
                // result did on PHP versions that only warned.
                if (!empty($word['count']) && $data['word_count'] > 0) {
                    $proba = $word['count'] / $data['word_count'];
                }
                else {
                    $proba = $small_proba;
                }

                // pow($total_words/$ncat, $count) is here to avoid underflow.
                $scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
            }
        }

        return $this->_rescale($scores);
    }

    /** training against a document.
        Set a document as being in a specific category. The document becomes a reference
        and is saved in the table of references. After a set of training is done
        the updateProbabilities() function must be run.

        @see updateProbabilities()
        @see untrain()
        @return bool success, false when the document id is already trained
        @param string document id, must be unique
        @param string category_id the category id in which the document should be
        @param string content of the document
    */
    public function train($doc_id, $category_id, $content) {
        // a document id that is already referenced is never trained twice
        if ($this->nbs->getReference($doc_id, false)) {
            return false;
        }

        foreach ($this->_getTokens($content) as $token => $count) {
            $this->nbs->updateWord($token, $count, $category_id);
        }

        $this->nbs->saveReference($doc_id, $category_id, $content);

        return true;
    }

    /** untraining of a document.
        To remove just one document from the references. The stored content
        is tokenized again and each token count is removed from the category
        the reference was trained in.

        @see updateProbabilities()
        @see train()
        @return bool success, false when no stored content is found for the id
        @param string document id, must be unique
    */
    public function untrain($doc_id) {
        $ref = $this->nbs->getReference($doc_id);

        if (!isset($ref['content'])) {
            return false;
        }

        foreach ($this->_getTokens($ref['content']) as $token => $count) {
            $this->nbs->removeWord($token, $count, $ref['category_id']);
        }

        $this->nbs->removeReference($doc_id);

        return true;
    }

    /** rescale the results between 0 and 1.
        Each score is replaced by exp(score - max) and the resulting vector
        is divided by its euclidean norm.

        NOTE(review): the exp() treats the incoming scores as log values,
        whereas categorize() hands over plain products. The original
        behaviour is kept as is; confirm the intent before changing it.

        @author Ken Williams, ken@mathforum.org
        @see categorize()
        @return array normalized scores (keys => category, values => scores)
        @param array scores (keys => category, values => scores)
    */
    public function _rescale($scores) {
        // Scale everything back to a reasonable area in
        // logspace (near zero), un-loggify, and normalize
        $total = 0.0;
        $max   = 0.0;

        foreach ($scores as $score) {
            if ($score >= $max) {
                $max = $score;
            }
        }

        // foreach works on a copy of the array, so writing back into
        // $scores while looping is safe
        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) exp($score - $max);
            $total += (float) pow($scores[$cat], 2);
        }

        $total = (float) sqrt($total);

        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) $score / $total;
        }

        return $scores;
    }

    /** update the probabilities of the categories and word count.
        This function must be run after a set of training

        @see train()
        @see untrain()
        @return bool sucess
    */
    public function updateProbabilities() {
        // this function is really only database manipulation
        // that is why all is done in the NaiveBayesianStorage
        return $this->nbs->updateProbabilities();
    }

    /** Get the list of token to ignore.
        @return array ignore list
    */
    public function getIgnoreList() {
        return array('the', 'that', 'you', 'for', 'and');
    }

    /** get the tokens from a string.
        The string is split on parentheses, comma, colon, dot, semicolon and
        whitespace. A token is dropped when it is empty, contains '&', is
        shorter than $min_token_length or longer than $max_token_length,
        consists only of digits, or is part of the ignore list. Tokens are
        kept as written: no case folding and no diacritics cleaning is
        applied (_cleanString() is not called here).

        @author James Seng. [http://james.seng.cc/] (based on his perl version)

        @return array keys = tokens, values = number of occurrences
        @param string the string to get the tokens from
    */
    public function _getTokens($string) {
        $tokens = array();

        // load the default ignore list only when none has been set
        if (empty($this->ignore_list)) {
            $this->ignore_list = $this->getIgnoreList();
        }

        $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", (string) $string, -1, PREG_SPLIT_NO_EMPTY);

        if (!is_array($rawtokens)) {
            // preg_split() failed: nothing to tokenize
            return $tokens;
        }

        // remove some tokens
        foreach ($rawtokens as $token) {
            $token  = trim($token);
            $length = mb_strlen($token);

            if ('' == $token
                || mb_strpos($token, "&") !== FALSE
                || $length < $this->min_token_length
                || $length > $this->max_token_length
                || preg_match('/^[0-9]+$/', $token)
                || in_array($token, $this->ignore_list, true)) {
                continue;
            }

            if (!isset($tokens[$token])) {
                $tokens[$token] = 0;
            }

            $tokens[$token]++;
        }

        return $tokens;
    }

    /** clean a string from the diacritics and lowercase it.
        The translation table is built from single bytes (chr(192) to
        chr(255)) and applied byte by byte, so it only matches input in a
        single-byte encoding where those byte values are the accented
        letters named in the comments below.

        @author Antoine Bajolet [phpdig_at_toiletoine.net]
        @author SPIP [http://uzine.net/spip/]

        @return string clean string
        @param string string with accents
    */
    public function _cleanString($string) {
        $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
                /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
                /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
                /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
                /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
                /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
                /* Cc */ chr(199) . chr(231) .
                /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
                /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
                /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
                /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
                /* yNn */ chr(255) . chr(209) . chr(241);

        return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
    }

}