]>
Commit | Line | Data |
---|---|---|
853cc128 AD |
1 | <?php |
2 | /* | |
3 | ***** BEGIN LICENSE BLOCK ***** | |
4 | This file is part of PHP Naive Bayesian Filter. | |
5 | ||
6 | The Initial Developer of the Original Code is | |
7 | Loic d'Anterroches [loic_at_xhtml.net]. | |
8 | Portions created by the Initial Developer are Copyright (C) 2003 | |
9 | the Initial Developer. All Rights Reserved. | |
10 | ||
11 | Contributor(s): | |
12 | See the source | |
13 | ||
14 | PHP Naive Bayesian Filter is free software; you can redistribute it | |
15 | and/or modify it under the terms of the GNU General Public License as | |
16 | published by the Free Software Foundation; either version 2 of | |
17 | the License, or (at your option) any later version. | |
18 | ||
19 | PHP Naive Bayesian Filter is distributed in the hope that it will | |
20 | be useful, but WITHOUT ANY WARRANTY; without even the implied | |
21 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
22 | See the GNU General Public License for more details. | |
23 | ||
24 | You should have received a copy of the GNU General Public License | |
25 | along with Foobar; if not, write to the Free Software | |
26 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
27 | ||
28 | Alternatively, the contents of this file may be used under the terms of | |
29 | the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | |
30 | in which case the provisions of the LGPL are applicable instead | |
31 | of those above. | |
32 | ||
33 | ***** END LICENSE BLOCK ***** | |
34 | */ | |
35 | ||
class NaiveBayesian {
    /** min token length for it to be taken into consideration */
    var $min_token_length = 3;
    /** max token length for it to be taken into consideration */
    var $max_token_length = 15;
    /** list of tokens to ignore. When left empty it is filled from
        getIgnoreList() the first time a string is tokenized.
        @see getIgnoreList()
    */
    var $ignore_list = array();
    /** storage object
        @see class NaiveBayesianStorage
    */
    var $nbs = null;

    /** constructor.
        Declared before the PHP 4 style constructor below so that every
        PHP version picks exactly one of the two as the real constructor.

        @param object storage object, kept in $this->nbs
    */
    function __construct($nbs) {
        $this->nbs = $nbs;
    }

    /** PHP 4 style constructor, kept so that existing explicit calls to
        NaiveBayesian() keep working.

        @return bool always true
        @param object storage object, kept in $this->nbs
    */
    function NaiveBayesian($nbs) {
        $this->__construct($nbs);

        return true;
    }

    /** categorize a document.
        Get list of categories in which the document can be categorized
        with a score for each category.

        @return array keys = category ids, values = scores
        @param string document
    */
    function categorize($document) {
        $scores = array();
        $categories = $this->nbs->getCategories();
        $tokens = $this->_getTokens($document);

        // average number of words per category, used as a scaling factor
        $total_words = 0;
        $ncat = 0;

        foreach ($categories as $data) {
            $total_words += $data['word_count'];
            $ncat++;
        }

        $avg_words = ($ncat > 0) ? $total_words / $ncat : 0;

        // wordExists() does not depend on the category, so ask the storage
        // once per token instead of once per token and per category
        $known_tokens = array();

        foreach ($tokens as $token => $count) {
            if ($this->nbs->wordExists($token)) {
                $known_tokens[$token] = $count;
            }
        }

        // calculate the score in each category
        foreach ($categories as $category => $data) {
            $word_count = $data['word_count'];
            $scores[$category] = $data['probability'];

            // small probability for a word not in the category
            // maybe putting 1.0 as a 'no effect' word can also be good.
            // A category without any word gets 0 so that no division by
            // zero happens; its score then drops to 0 as soon as one known
            // token is found, which is what the division by zero used to
            // produce.
            $small_proba = ($word_count > 0) ? 1.0 / ($word_count * 2) : 0.0;

            foreach ($known_tokens as $token => $count) {
                $word = $this->nbs->getWord($token, $category);

                if ($word['count'] && $word_count > 0) {
                    $proba = $word['count'] / $word_count;
                }
                else {
                    $proba = $small_proba;
                }

                // pow($avg_words, $count) is here to avoid underflow.
                $scores[$category] *= pow($proba, $count) * pow($avg_words, $count);
            }
        }

        return $this->_rescale($scores);
    }

    /** training against a document.
        Set a document as being in a specific category. The document becomes a reference
        and is saved in the table of references. After a set of training is done
        the updateProbabilities() function must be run.

        @see updateProbabilities()
        @see untrain()
        @return bool success, false when the document id is already a reference
        @param string document id, must be unique
        @param string category_id the category id in which the document should be
        @param string content of the document
    */
    function train($doc_id, $category_id, $content) {
        // a document which is already a reference is not trained again
        if ($this->nbs->getReference($doc_id)) {
            return false;
        }

        foreach ($this->_getTokens($content) as $token => $count) {
            $this->nbs->updateWord($token, $count, $category_id);
        }

        $this->nbs->saveReference($doc_id, $category_id, $content);

        return true;
    }

    /** untraining of a document.
        To remove just one document from the references.

        @see updateProbabilities()
        @see train()
        @return bool success, false when the document id is not a reference
        @param string document id, must be unique
    */
    function untrain($doc_id) {
        $ref = $this->nbs->getReference($doc_id);

        // same test as in train(): an empty result means unknown document,
        // so there is nothing to remove
        if (!$ref) {
            return false;
        }

        foreach ($this->_getTokens($ref['content']) as $token => $count) {
            $this->nbs->removeWord($token, $count, $ref['category_id']);
        }

        $this->nbs->removeReference($doc_id);

        return true;
    }

    /** rescale the results between 0 and 1.
        Each score is replaced by exp(score - max) and the result is divided
        by the euclidean norm of all the values.

        NOTE(review): the exp() call means the scores are handled as
        logarithms, but categorize() passes plain products of probabilities.
        The behaviour is kept as it is; to be confirmed if it is wanted.

        @author Ken Williams, ken@mathforum.org
        @see categorize()
        @return array normalized scores (keys => category, values => scores)
        @param array scores (keys => category, values => scores)
    */
    function _rescale($scores) {
        // Scale everything back to a reasonable area in
        // logspace (near zero), un-loggify, and normalize
        $total = 0.0;
        $max = 0.0;

        foreach ($scores as $score) {
            if ($score >= $max) {
                $max = $score;
            }
        }

        // foreach works on a copy, $scores can be updated while looping
        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) exp($score - $max);
            $total += (float) pow($scores[$cat], 2);
        }

        $total = (float) sqrt($total);

        // nothing to normalize when there is no score or when the norm is 0
        if ($total > 0) {
            foreach ($scores as $cat => $score) {
                $scores[$cat] = (float) $score / $total;
            }
        }

        return $scores;
    }

    /** update the probabilities of the categories and word count.
        This function must be run after a set of training

        @see train()
        @see untrain()
        @return bool success
    */
    function updateProbabilities() {
        // this function is really only database manipulation
        // that is why all is done in the NaiveBayesianStorage
        return $this->nbs->updateProbabilities();
    }

    /** Get the default list of tokens to ignore.
        @return array ignore list
    */
    function getIgnoreList() {
        return array('the', 'that', 'you', 'for', 'and');
    }

    /** get the tokens from a string.
        The string is cleaned with _cleanString() and split on every run of
        characters other than letters, digits, '-' and '_'. Tokens which are
        too short, too long, made only of digits or in the ignore list are
        dropped.

        @author James Seng. [http://james.seng.cc/] (based on his perl version)

        @return array keys = tokens, values = number of occurrences
        @param string the string to get the tokens from
    */
    function _getTokens($string) {
        $tokens = array();
        $string = $this->_cleanString((string) $string);

        // load the default ignore list only when none has been set
        if (!is_array($this->ignore_list) || 0 == count($this->ignore_list)) {
            $this->ignore_list = $this->getIgnoreList();
        }

        // empty pieces are dropped by the split itself
        $rawtokens = preg_split('/[^-_A-Za-z0-9]+/', $string, -1, PREG_SPLIT_NO_EMPTY);

        // remove some tokens
        foreach ($rawtokens as $token) {
            $length = strlen($token);

            if ($length < $this->min_token_length || $length > $this->max_token_length) {
                continue;
            }

            if (preg_match('/^[0-9]+$/', $token) || in_array($token, $this->ignore_list)) {
                continue;
            }

            if (isset($tokens[$token])) {
                $tokens[$token]++;
            }
            else {
                $tokens[$token] = 1;
            }
        }

        return $tokens;
    }

    /** clean a string from the diacritics and put it in lower case.
        The replacement is done byte per byte; the byte values below are the
        accented letters of ISO-8859-1.

        NOTE(review): a string in a multibyte encoding like UTF-8 would have
        some of its bytes replaced too; the encoding of the documents is to
        be confirmed with the callers.

        @author Antoine Bajolet [phpdig_at_toiletoine.net]
        @author SPIP [http://uzine.net/spip/]

        @return string clean string
        @param string string with accents
    */
    function _cleanString($string) {
        $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
                /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
                /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
                /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
                /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
                /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
                /* Cc */ chr(199) . chr(231) .
                /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
                /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
                /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
                /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
                /* yNn */ chr(255) . chr(209) . chr(241);

        return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
    }

}