]> git.wh0rd.org - tt-rss.git/blame - plugins/af_sort_bayes/lib/class.naivebayesian.php
add placeholder stuff for af_sort_bayes
[tt-rss.git] / plugins / af_sort_bayes / lib / class.naivebayesian.php
CommitLineData
853cc128
AD
1<?php
2 /*
3 ***** BEGIN LICENSE BLOCK *****
4 This file is part of PHP Naive Bayesian Filter.
5
6 The Initial Developer of the Original Code is
7 Loic d'Anterroches [loic_at_xhtml.net].
8 Portions created by the Initial Developer are Copyright (C) 2003
9 the Initial Developer. All Rights Reserved.
10
11 Contributor(s):
12 See the source
13
14 PHP Naive Bayesian Filter is free software; you can redistribute it
15 and/or modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of
17 the License, or (at your option) any later version.
18
19 PHP Naive Bayesian Filter is distributed in the hope that it will
20 be useful, but WITHOUT ANY WARRANTY; without even the implied
21 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 See the GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with Foobar; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27
28 Alternatively, the contents of this file may be used under the terms of
29 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 in which case the provisions of the LGPL are applicable instead
31 of those above.
32
33 ***** END LICENSE BLOCK *****
34 */
35
class NaiveBayesian {
    /** min token length for it to be taken into consideration */
    public $min_token_length = 3;
    /** max token length for it to be taken into consideration */
    public $max_token_length = 15;
    /** list of tokens to ignore; when empty it is filled from getIgnoreList()
        the first time a string is tokenized
        @see getIgnoreList()
    */
    public $ignore_list = array();
    /** storage object
        @see class NaiveBayesianStorage
    */
    public $nbs = null;

    /** build a classifier on top of a storage backend.

        @param object storage object, kept in $this->nbs and used for every
                      category / word / reference lookup
    */
    public function __construct($nbs) {
        $this->nbs = $nbs;
    }

    /** categorize a document.
        Get list of categories in which the document can be categorized
        with a score for each category.

        Only tokens for which the storage reports wordExists() take part in
        the score. A category whose word_count is 0 gets a score of 0 as soon
        as one known token is present (instead of dividing by zero).

        @return array keys = category ids, values = scores
        @param string document
    */
    public function categorize($document) {
        $scores = array();
        $categories = $this->nbs->getCategories();
        $tokens = $this->_getTokens($document);

        // Whether a word is known does not depend on the category, so ask
        // the storage once per token rather than once per category.
        $known = array();
        foreach ($tokens as $token => $count) {
            if ($this->nbs->wordExists($token)) {
                $known[$token] = $count;
            }
        }

        $total_words = 0;
        $ncat = 0;
        foreach ($categories as $data) {
            $total_words += $data['word_count'];
            $ncat++;
        }

        // calculate the score in each category
        foreach ($categories as $category => $data) {
            $word_count = $data['word_count'];
            $score = $data['probability'];

            // small probability for a word not in the category
            // maybe putting 1.0 as a 'no effect' word can also be good
            $small_proba = ($word_count > 0) ? 1.0 / ($word_count * 2) : 0.0;

            foreach ($known as $token => $count) {
                $word = $this->nbs->getWord($token, $category);

                if (!empty($word['count']) && $word_count > 0) {
                    $proba = $word['count'] / $word_count;
                }
                else {
                    $proba = $small_proba;
                }

                // pow($total_words / $ncat, $count) is here to avoid underflow.
                $score *= pow($proba, $count) * pow($total_words / $ncat, $count);
            }

            $scores[$category] = $score;
        }

        return $this->_rescale($scores);
    }

    /** training against a document.
        Set a document as being in a specific category. The document becomes a reference
        and is saved in the table of references. After a set of training is done
        the updateProbabilities() function must be run.

        @see updateProbabilities()
        @see untrain()
        @return bool true when the document was trained, false when a reference
                     with this id already exists (nothing is changed then)
        @param string document id, must be unique
        @param string category_id the category id in which the document should be
        @param string content of the document
    */
    public function train($doc_id, $category_id, $content) {
        // a document id can only be trained once
        if ($this->nbs->getReference($doc_id)) {
            return false;
        }

        foreach ($this->_getTokens($content) as $token => $count) {
            $this->nbs->updateWord($token, $count, $category_id);
        }

        $this->nbs->saveReference($doc_id, $category_id, $content);

        return true;
    }

    /** untraining of a document.
        To remove just one document from the references.

        @see updateProbabilities()
        @see train()
        @return bool true when the reference was found and removed, false when
                     no reference with this id exists
        @param string document id, must be unique
    */
    public function untrain($doc_id) {
        $ref = $this->nbs->getReference($doc_id);

        // same "is it known?" test as train(): nothing stored, nothing to undo
        if (!$ref) {
            return false;
        }

        foreach ($this->_getTokens($ref['content']) as $token => $count) {
            $this->nbs->removeWord($token, $count, $ref['category_id']);
        }

        $this->nbs->removeReference($doc_id);

        return true;
    }

    /** rescale the results between 0 and 1.

        Every score is replaced by exp(score - max) and the result is divided
        by the euclidean norm of the whole set, so the squares of the returned
        values sum to 1.

        NOTE: the original author's comment describes the input as log-space
        scores, while categorize() in this class passes plain products. The
        arithmetic is kept unchanged so existing score thresholds keep their
        meaning.

        @author Ken Williams, ken@mathforum.org
        @see categorize()
        @return array normalized scores (keys => category, values => scores)
        @param array scores (keys => category, values => scores)
    */
    public function _rescale($scores) {
        // largest score, never below 0.0
        $max = 0.0;
        foreach ($scores as $score) {
            if ($score >= $max) {
                $max = $score;
            }
        }

        // shift by the maximum, exponentiate, and accumulate the squared norm
        $total = 0.0;
        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) exp($score - $max);
            $total += (float) pow($scores[$cat], 2);
        }

        $total = (float) sqrt($total);

        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) $score / $total;
        }

        return $scores;
    }

    /** update the probabilities of the categories and word count.
        This function must be run after a set of training

        @see train()
        @see untrain()
        @return bool success
    */
    public function updateProbabilities() {
        // this function is really only database manipulation
        // that is why all is done in the NaiveBayesianStorage
        return $this->nbs->updateProbabilities();
    }

    /** Get the default list of tokens to ignore.
        @return array ignore list
    */
    public function getIgnoreList() {
        return array('the', 'that', 'you', 'for', 'and');
    }

    /** get the tokens from a string

        The string is cleaned with _cleanString(), split on every run of
        characters other than letters, digits, '-' and '_', and each piece is
        kept unless it is shorter than $min_token_length, longer than
        $max_token_length, made only of digits, or present in $ignore_list.

        @author James Seng. [http://james.seng.cc/] (based on his perl version)

        @return array keys = tokens, values = number of occurrences
        @param string the string to get the tokens from
    */
    public function _getTokens($string) {
        $tokens = array();
        $string = $this->_cleanString($string);

        // load the defaults only when the caller did not provide a list
        if (empty($this->ignore_list)) {
            $this->ignore_list = $this->getIgnoreList();
        }

        $rawtokens = preg_split('/[^-_A-Za-z0-9]+/', $string, -1, PREG_SPLIT_NO_EMPTY);

        // remove some tokens
        foreach ($rawtokens as $token) {
            $length = strlen($token);

            if ($length < $this->min_token_length || $length > $this->max_token_length) {
                continue;
            }
            if (preg_match('/^[0-9]+$/', $token)) {
                continue;
            }
            if (in_array($token, $this->ignore_list, true)) {
                continue;
            }

            if (!isset($tokens[$token])) {
                $tokens[$token] = 0;
            }
            $tokens[$token]++;
        }

        return $tokens;
    }

    /** clean a string from the diacritics and lowercase it

        Accented Latin letters in the range 192-255 are replaced by their
        unaccented ASCII letter. Input that is valid UTF-8 is handled as UTF-8
        (two byte sequences); anything else is treated as a single byte
        (ISO-8859-1 style) string, as the original implementation did.

        NOTE: tokens produced from accented UTF-8 text therefore differ from
        the ones the byte-only implementation produced for the same text.

        @author Antoine Bajolet [phpdig_at_toiletoine.net]
        @author SPIP [http://uzine.net/spip/]

        @return string clean string
        @param string string with accents
    */
    public function _cleanString($string) {
        $string = (string) $string;

        $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
            /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
            /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
            /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
            /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
            /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
            /* Cc */ chr(199) . chr(231) .
            /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
            /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
            /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
            /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
            /* yNn */ chr(255) . chr(209) . chr(241);
        $plain = 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn';

        if (preg_match('/[\x80-\xff]/', $string) && preg_match('//u', $string)) {
            // Valid UTF-8 with non-ASCII characters: code points 192-255 are
            // encoded as 0xC3 followed by (code point - 64), so the UTF-8
            // table is derived from the single byte one.
            $map = array();
            for ($i = 0, $n = strlen($diac); $i < $n; $i++) {
                $map["\xC3" . chr(ord($diac[$i]) - 64)] = $plain[$i];
            }
            $string = strtr($string, $map);
        }
        else {
            $string = strtr($string, $diac, $plain);
        }

        return strtolower($string);
    }

}