]>
Commit | Line | Data |
---|---|---|
853cc128 AD |
1 | <?php |
2 | /* | |
3 | ***** BEGIN LICENSE BLOCK ***** | |
4 | This file is part of PHP Naive Bayesian Filter. | |
5 | ||
6 | The Initial Developer of the Original Code is | |
7 | Loic d'Anterroches [loic_at_xhtml.net]. | |
8 | Portions created by the Initial Developer are Copyright (C) 2003 | |
9 | the Initial Developer. All Rights Reserved. | |
10 | ||
11 | Contributor(s): | |
12 | See the source | |
13 | ||
14 | PHP Naive Bayesian Filter is free software; you can redistribute it | |
15 | and/or modify it under the terms of the GNU General Public License as | |
16 | published by the Free Software Foundation; either version 2 of | |
17 | the License, or (at your option) any later version. | |
18 | ||
19 | PHP Naive Bayesian Filter is distributed in the hope that it will | |
20 | be useful, but WITHOUT ANY WARRANTY; without even the implied | |
21 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
22 | See the GNU General Public License for more details. | |
23 | ||
24 | You should have received a copy of the GNU General Public License | |
25 | along with PHP Naive Bayesian Filter; if not, write to the Free Software | |
26 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
27 | ||
28 | Alternatively, the contents of this file may be used under the terms of | |
29 | the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | |
30 | in which case the provisions of the LGPL are applicable instead | |
31 | of those above. | |
32 | ||
33 | ***** END LICENSE BLOCK ***** | |
34 | */ | |
35 | ||
/**
 * Naive Bayesian text classifier.
 *
 * Tokenizes documents, keeps per-category word counts through a storage
 * object, and scores a document against every known category.
 */
class NaiveBayesian {
    /** min token length for it to be taken into consideration */
    public $min_token_length = 3;
    /** max token length for it to be taken into consideration */
    public $max_token_length = 15;
    /** list of token to ignore; when left empty it is filled from getIgnoreList()
        @see getIgnoreList()
    */
    public $ignore_list = array();
    /** storage object
        @see class NaiveBayesianStorage
    */
    public $nbs = null;

    /** constructor.

        @param object storage object used for every read and write
    */
    public function __construct($nbs) {
        $this->nbs = $nbs;
    }

    /** PHP 4 style constructor name.
        Same-named methods are no longer constructors in PHP 8, so the real
        constructor is __construct(); this method is kept so that explicit
        calls such as parent::NaiveBayesian($nbs) keep working.

        @return bool always true
        @param object storage object
    */
    public function NaiveBayesian($nbs) {
        $this->nbs = $nbs;

        return true;
    }

    /** categorize a document.
        Get list of categories in which the document can be categorized
        with a score for each category.

        @return array keys = category ids, values = scores
        @param string document
    */
    public function categorize($document) {
        $scores = array();
        $categories = $this->nbs->getCategories();

        if (empty($categories)) {
            return $scores;
        }

        // total number of words over all categories, and number of categories
        $total_words = 0;
        $ncat = 0;

        foreach ($categories as $data) {
            $total_words += $data['word_count'];
            $ncat++;
        }

        // average category size; multiplying by it keeps the product of
        // many small probabilities from underflowing
        $scale = $total_words / $ncat;

        // whether a token is known does not depend on the category, so it
        // is looked up once per token instead of once per token and category
        $known = array();

        foreach ($this->_getTokens($document) as $token => $count) {
            if ($this->nbs->wordExists($token)) {
                $known[$token] = $count;
            }
        }

        foreach ($categories as $category => $data) {
            // a category without any trained word cannot be scored: every
            // probability below would be a division by zero
            if ($data['word_count'] <= 0) {
                $scores[$category] = 0.0;
                continue;
            }

            $scores[$category] = $data['probability'];
            // small probability for a word not in the category
            // maybe putting 1.0 as a 'no effect' word can also be good
            $small_proba = 1.0 / ($data['word_count'] * 2);

            foreach ($known as $token => $count) {
                $word = $this->nbs->getWord($token, $category);

                if (!empty($word['count'])) {
                    $proba = $word['count'] / $data['word_count'];
                }
                else {
                    $proba = $small_proba;
                }

                $scores[$category] *= pow($proba, $count) * pow($scale, $count);
            }
        }

        return $this->_rescale($scores);
    }

    /** training against a document.
        Set a document as being in a specific category. The document becomes a reference
        and is saved in the table of references. After a set of training is done
        the updateProbabilities() function must be run.

        @see updateProbabilities()
        @see untrain()
        @return bool success, false when the document id is already a reference
        @param string document id, must be unique
        @param string category_id the category id in which the document should be
        @param string content of the document
    */
    public function train($doc_id, $category_id, $content) {
        // a document that is already a reference is not trained a second time
        if ($this->nbs->getReference($doc_id, false)) {
            return false;
        }

        foreach ($this->_getTokens($content) as $token => $count) {
            $this->nbs->updateWord($token, $count, $category_id);
        }

        $this->nbs->saveReference($doc_id, $category_id, $content);

        return true;
    }

    /** untraining of a document.
        To remove just one document from the references.

        @see updateProbabilities()
        @see train()
        @return bool success, false when no stored content is found for the id
        @param string document id, must be unique
    */
    public function untrain($doc_id) {
        $ref = $this->nbs->getReference($doc_id);

        if (!isset($ref['content'])) {
            return false;
        }

        // take back exactly the counts the stored content contributed
        foreach ($this->_getTokens($ref['content']) as $token => $count) {
            $this->nbs->removeWord($token, $count, $ref['category_id']);
        }

        $this->nbs->removeReference($doc_id);

        return true;
    }

    /** rescale the results between 0 and 1.
        Every score is shifted by the highest score (never less than 0),
        passed through exp(), and the result is divided by its euclidean
        norm. Note that categorize() hands in plain products of
        probabilities, not logarithms.

        @author Ken Williams, ken@mathforum.org
        @see categorize()
        @return array normalized scores (keys => category, values => scores)
        @param array scores (keys => category, values => scores)
    */
    public function _rescale($scores) {
        $max = 0.0;

        foreach ($scores as $score) {
            if ($score >= $max) {
                $max = $score;
            }
        }

        $scaled = array();
        $total = 0.0;

        foreach ($scores as $cat => $score) {
            $scaled[$cat] = (float) exp($score - $max);
            $total += (float) pow($scaled[$cat], 2);
        }

        $total = (float) sqrt($total);

        // the norm can only be zero when every exp() underflowed; dividing
        // would fail, so the values are then returned as they are
        if ($total > 0) {
            foreach ($scaled as $cat => $value) {
                $scaled[$cat] = (float) $value / $total;
            }
        }

        return $scaled;
    }

    /** update the probabilities of the categories and word count.
        This function must be run after a set of training

        @see train()
        @see untrain()
        @return bool success
    */
    public function updateProbabilities() {
        // this function is really only database manipulation
        // that is why all is done in the NaiveBayesianStorage
        return $this->nbs->updateProbabilities();
    }

    /** Get the list of token to ignore.
        @return array ignore list
    */
    public function getIgnoreList() {
        return array('the', 'that', 'you', 'for', 'and');
    }

    /** get the tokens from a string
        The string is split on parentheses, comma, colon, dot, semicolon and
        whitespace. A token is dropped when it is empty, contains "&", is
        shorter than min_token_length or longer than max_token_length, is
        made of digits only, or is in the ignore list.

        @author James Seng. [http://james.seng.cc/] (based on his perl version)

        @return array keys = tokens, values = number of occurrences
        @param string the string to get the tokens from
    */
    public function _getTokens($string) {
        $tokens = array();
        //$string = $this->_cleanString($string);

        // fall back to the default list only when no list has been set
        if (count($this->ignore_list) <= 0) {
            $this->ignore_list = $this->getIgnoreList();
        }

        $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", (string) $string, -1, PREG_SPLIT_NO_EMPTY);

        if (false === $rawtokens) {
            // the split failed, there is nothing to count
            return $tokens;
        }

        // remove some tokens
        foreach ($rawtokens as $token) {
            $token = trim($token);
            $length = mb_strlen($token);

            if ('' === $token
                || mb_strpos($token, "&") !== false
                || $length < $this->min_token_length
                || $length > $this->max_token_length
                || preg_match('/^[0-9]+$/', $token)
                || in_array($token, $this->ignore_list, true)) {
                continue;
            }

            if (isset($tokens[$token])) {
                $tokens[$token]++;
            }
            else {
                $tokens[$token] = 1;
            }
        }

        return $tokens;
    }

    /** clean a string from the diacritics
        The replacement works byte by byte on the codes 192 to 255, so it
        fits single-byte Latin-1 text; in a UTF-8 string the same byte
        values are parts of multi-byte characters and would be altered.

        @author Antoine Bajolet [phpdig_at_toiletoine.net]
        @author SPIP [http://uzine.net/spip/]

        @return string clean string, lower case
        @param string string with accents
    */
    public function _cleanString($string) {
        $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
                /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
                /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
                /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
                /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
                /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
                /* Cc */ chr(199) . chr(231) .
                /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
                /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
                /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
                /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
                /* yNn */ chr(255) . chr(209) . chr(241);

        return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
    }

}