[tt-rss.git] / plugins / af_sort_bayes / lib / class.naivebayesian.php

<?php
	/*
	 ***** BEGIN LICENSE BLOCK *****
	 This file is part of PHP Naive Bayesian Filter.

	 The Initial Developer of the Original Code is
	 Loic d'Anterroches [loic_at_xhtml.net].
	 Portions created by the Initial Developer are Copyright (C) 2003
	 the Initial Developer. All Rights Reserved.

	 Contributor(s):
	 See the source

	 PHP Naive Bayesian Filter is free software; you can redistribute it
	 and/or modify it under the terms of the GNU General Public License as
	 published by the Free Software Foundation; either version 2 of
	 the License, or (at your option) any later version.

	 PHP Naive Bayesian Filter is distributed in the hope that it will
	 be useful, but WITHOUT ANY WARRANTY; without even the implied
	 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	 See the GNU General Public License for more details.

	 You should have received a copy of the GNU General Public License
	 along with Foobar; if not, write to the Free Software
	 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

	 Alternatively, the contents of this file may be used under the terms of
	 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
	 in which case the provisions of the LGPL are applicable instead
	 of those above.

	 ***** END LICENSE BLOCK *****
	 */

	/**
	 * Naive Bayesian text classifier.
	 *
	 * Tokenizes documents, delegates all persistence (categories, per-category
	 * word counts, trained document references) to the storage object passed
	 * to the constructor, and scores a document against every category the
	 * storage reports.
	 */
	class NaiveBayesian {
		/** min token length for it to be taken into consideration */
		public $min_token_length = 3;
		/** max token length for it to be taken into consideration */
		public $max_token_length = 15;
		/** list of tokens to ignore; filled from getIgnoreList() when left empty
		 @see getIgnoreList()
		 */
		public $ignore_list = array();
		/** storage object
		 @see class NaiveBayesianStorage
		 */
		public $nbs = null;

		/** create a classifier bound to a storage object.

		 @param object nbs storage object providing getCategories(), wordExists(),
		        getWord(), updateWord(), removeWord(), getReference(),
		        saveReference(), removeReference() and updateProbabilities()
		 */
		public function __construct($nbs) {
			$this->nbs = $nbs;
		}

		/** PHP 4 style constructor.
		 Kept as an ordinary method so code that calls it explicitly (for example
		 parent::NaiveBayesian($nbs) in a subclass) keeps working; since PHP 8 a
		 method named after the class is no longer run by "new", which is why
		 __construct() exists.

		 @return bool always true
		 @param object nbs storage object
		 */
		public function NaiveBayesian($nbs) {
			$this->__construct($nbs);

			return true;
		}

		/** categorize a document.
		 Get list of categories in which the document can be categorized
		 with a score for each category. Only tokens for which the storage
		 reports wordExists() contribute to the scores.

		 @return array keys = category ids, values = scores (as produced by _rescale())
		 @param string document
		 */
		public function categorize($document) {
			$scores = array();
			$categories = $this->nbs->getCategories();
			$tokens = $this->_getTokens($document);

			// total number of words over all categories and number of categories
			$total_words = 0;
			$ncat = 0;

			foreach ($categories as $data) {
				$total_words += $data['word_count'];
				$ncat++;
			}

			// average words per category; every token probability is multiplied
			// by it to avoid underflow. Only used when there is at least one
			// category, so the fallback value is never applied to a score.
			$scale = ($ncat > 0) ? $total_words / $ncat : 1.0;

			foreach ($categories as $category => $data) {
				$scores[$category] = $data['probability'];
				$word_count = $data['word_count'];

				// small probability for a word not in the category
				// maybe putting 1.0 as a 'no effect' word can also be good.
				// A category without any word cannot be divided by: it gets 0.
				$small_proba = ($word_count > 0) ? 1.0 / ($word_count * 2) : 0.0;

				foreach ($tokens as $token => $count) {
					if (!$this->nbs->wordExists($token)) {
						continue;
					}

					$word = $this->nbs->getWord($token, $category);

					if (!empty($word['count']) && $word_count > 0) {
						$proba = $word['count'] / $word_count;
					}
					else {
						$proba = $small_proba;
					}

					$scores[$category] *= pow($proba, $count) * pow($scale, $count);
				}
			}

			return $this->_rescale($scores);
		}

		/** training against a document.
		 Set a document as being in a specific category. The document becomes a reference
		 and is saved in the table of references. After a set of training is done
		 the updateProbabilities() function must be run.

		 @see updateProbabilities()
		 @see untrain()
		 @return bool true when trained, false when doc_id already has a reference
		 @param string document id, must be unique
		 @param string category_id the category id in which the document should be
		 @param string content of the document
		 */
		public function train($doc_id, $category_id, $content) {
			// a document that is already a reference is never trained twice
			if ($this->nbs->getReference($doc_id)) {
				return false;
			}

			foreach ($this->_getTokens($content) as $token => $count) {
				$this->nbs->updateWord($token, $count, $category_id);
			}

			$this->nbs->saveReference($doc_id, $category_id, $content);

			return true;
		}

		/** untraining of a document.
		 To remove just one document from the references: the word counts of its
		 tokens are removed from its category and the reference is deleted.

		 @see updateProbabilities()
		 @see train()
		 @return bool true when untrained, false when doc_id has no reference
		 @param string document id, must be unique
		 */
		public function untrain($doc_id) {
			$ref = $this->nbs->getReference($doc_id);

			// nothing was trained under this id, so there is nothing to remove
			if (!$ref) {
				return false;
			}

			foreach ($this->_getTokens($ref['content']) as $token => $count) {
				$this->nbs->removeWord($token, $count, $ref['category_id']);
			}

			$this->nbs->removeReference($doc_id);

			return true;
		}

		/** rescale the results between 0 and 1.
		 Every score is replaced by exp(score - max) and the results are divided
		 by their euclidean norm.

		 NOTE(review): the exp() step reads as if the scores were logarithms, but
		 categorize() passes plain products. The ordering of the categories is
		 preserved either way; confirm before changing the formula.

		 @author Ken Williams, ken@mathforum.org
		 @see categorize()
		 @return array normalized scores (keys => category, values => scores)
		 @param array scores (keys => category, values => scores)
		 */
		public function _rescale($scores) {
			// largest score, never lower than 0
			$max = 0.0;

			foreach ($scores as $score) {
				if ($score >= $max) {
					$max = $score;
				}
			}

			// shift by the maximum, exponentiate and accumulate the squared norm
			$total = 0.0;

			foreach ($scores as $cat => $score) {
				$scores[$cat] = (float) exp($score - $max);
				$total += (float) pow($scores[$cat], 2);
			}

			$total = (float) sqrt($total);

			// a zero norm means every rescaled score is 0: leave them untouched
			// instead of dividing by zero
			if ($total > 0) {
				foreach ($scores as $cat => $score) {
					$scores[$cat] = (float) $score / $total;
				}
			}

			return $scores;
		}

		/** update the probabilities of the categories and word count.
		 This function must be run after a set of training

		 @see train()
		 @see untrain()
		 @return bool success, as reported by the storage object
		 */
		public function updateProbabilities() {
			// this function is really only database manipulation
			// that is why all is done in the NaiveBayesianStorage
			return $this->nbs->updateProbabilities();
		}

		/** Get the default list of tokens to ignore.
		 @return array ignore list
		 */
		public function getIgnoreList() {
			return array('the', 'that', 'you', 'for', 'and');
		}

		/** get the tokens from a string.
		 The string is cleaned with _cleanString(), split on every run of
		 characters other than letters, digits, '-' and '_', and the pieces
		 that are too short, too long, purely numeric or in the ignore list
		 are dropped.

		 @author James Seng. [http://james.seng.cc/] (based on his perl version)

		 @return array keys = tokens, values = number of occurrences
		 @param  string the string to get the tokens from
		 */
		public function _getTokens($string) {
			$tokens = array();
			$string = $this->_cleanString($string);

			// load the default ignore list only when none has been set
			if (count($this->ignore_list) <= 0) {
				$this->ignore_list = $this->getIgnoreList();
			}

			$rawtokens = preg_split('/[^-_A-Za-z0-9]+/', $string, -1, PREG_SPLIT_NO_EMPTY);

			if (false === $rawtokens) {
				return $tokens;
			}

			foreach ($rawtokens as $token) {
				$length = strlen($token);

				if ($length < $this->min_token_length || $length > $this->max_token_length) {
					continue;
				}

				// numbers alone carry no meaning
				if (preg_match('/^[0-9]+$/', $token)) {
					continue;
				}

				if (in_array($token, $this->ignore_list, true)) {
					continue;
				}

				$tokens[$token] = isset($tokens[$token]) ? $tokens[$token] + 1 : 1;
			}

			return $tokens;
		}

		/** clean a string from the diacritics and lowercase it.
		 The replacement works on single bytes: each byte listed below is mapped
		 to the ASCII letter given in the comment next to it.

		 NOTE(review): the byte values look like ISO-8859-1 accented letters. In
		 UTF-8 text the same byte values occur inside multi-byte sequences, so
		 confirm the encoding the callers pass in.

		 @author Antoine Bajolet [phpdig_at_toiletoine.net]
		 @author SPIP [http://uzine.net/spip/]

		 @return string clean string
		 @param  string string with accents
		 */
		public function _cleanString($string) {
			$diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
				/* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
				/* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
				/* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
				/* E */ chr(200) . chr(201) . chr(202) . chr(203) .
				/* e */ chr(232) . chr(233) . chr(234) . chr(235) .
				/* Cc */ chr(199) . chr(231) .
				/* I */ chr(204) . chr(205) . chr(206) . chr(207) .
				/* i */ chr(236) . chr(237) . chr(238) . chr(239) .
				/* U */ chr(217) . chr(218) . chr(219) . chr(220) .
				/* u */ chr(249) . chr(250) . chr(251) . chr(252) .
				/* yNn */ chr(255) . chr(209) . chr(241);

			// the cast keeps a null document from reaching strtr()
			return strtolower(strtr((string) $string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
		}

	}
Commit	Line	Data
853cc128 AD	1	<?php
	2	/*
	3	*** BEGIN LICENSE BLOCK ***
	4	This file is part of PHP Naive Bayesian Filter.
	5
	6	The Initial Developer of the Original Code is
	7	Loic d'Anterroches [loic_at_xhtml.net].
	8	Portions created by the Initial Developer are Copyright (C) 2003
	9	the Initial Developer. All Rights Reserved.
	10
	11	Contributor(s):
	12	See the source
	13
	14	PHP Naive Bayesian Filter is free software; you can redistribute it
	15	and/or modify it under the terms of the GNU General Public License as
	16	published by the Free Software Foundation; either version 2 of
	17	the License, or (at your option) any later version.
	18
	19	PHP Naive Bayesian Filter is distributed in the hope that it will
	20	be useful, but WITHOUT ANY WARRANTY; without even the implied
	21	warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	22	See the GNU General Public License for more details.
	23
	24	You should have received a copy of the GNU General Public License
	25	along with Foobar; if not, write to the Free Software
	26	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	27
	28	Alternatively, the contents of this file may be used under the terms of
	29	the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
	30	in which case the provisions of the LGPL are applicable instead
	31	of those above.
	32
	33	*** END LICENSE BLOCK ***
	34	*/
	35
	36	class NaiveBayesian {
	37	/** min token length for it to be taken into consideration */
	38	var $min_token_length = 3;
	39	/** max token length for it to be taken into consideration */
	40	var $max_token_length = 15;
	41	/** list of token to ignore
	42	@see getIgnoreList()
	43	*/
	44	var $ignore_list = array();
	45	/** storage object
	46	@see class NaiveBayesianStorage
	47	*/
	48	var $nbs = null;
	49
	50	function NaiveBayesian($nbs) {
	51	$this->nbs = $nbs;
	52
	53	return true;
	54	}
	55
	56	/** categorize a document.
	57	Get list of categories in which the document can be categorized
	58	with a score for each category.
	59
	60	@return array keys = category ids, values = scores
	61	@param string document
	62	*/
	63	function categorize($document) {
	64	$scores = array();
65	$categories = $this->nbs->getCategories();
66	$tokens = $this->_getTokens($document);
67
68	// calculate the score in each category
69	$total_words = 0;
70	$ncat = 0;
71
72	while (list($category, $data) = each($categories)) {
73	$total_words += $data['word_count'];
74	$ncat++;
75	}
76
77	reset($categories);
78
79	while (list($category, $data) = each($categories)) {
80	$scores[$category] = $data['probability'];
81	// small probability for a word not in the category
82	// maybe putting 1.0 as a 'no effect' word can also be good
83	$small_proba = 1.0 / ($data['word_count'] * 2);
84
85	reset($tokens);
86
87	while (list($token, $count) = each($tokens)) {
88	if ($this->nbs->wordExists($token)) {
89	$word = $this->nbs->getWord($token, $category);
90
91	if ($word['count']) {
92	$proba = $word['count'] / $data['word_count'];
93	}
94	else {
95	$proba = $small_proba;
96	}
97
98	$scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
99	// pow($total_words/$ncat, $count) is here to avoid underflow.
100
101	}
102	}
103	}
104
105	return $this->_rescale($scores);
106	}
107
108	/** training against a document.
109	Set a document as being in a specific category. The document becomes a reference
110	and is saved in the table of references. After a set of training is done
111	the updateProbabilities() function must be run.
112
113	@see updateProbabilities()
114	@see untrain()
115	@return bool success
116	@param string document id, must be unique
117	@param string category_id the category id in which the document should be
118	@param string content of the document
119	*/
120	function train($doc_id, $category_id, $content) {
121	$ret = false;
122
123	// if this doc_id already trained, no trained
124	if (!$this->nbs->getReference($doc_id)) {
125	$tokens = $this->_getTokens($content);
126
127	while (list($token, $count) = each($tokens)) {
128	$this->nbs->updateWord($token, $count, $category_id);
129	}
130
131	$this->nbs->saveReference($doc_id, $category_id, $content);
132
133	$ret = true;
134	}
135	else {
136	$ret = false;
137	}
138
139	return $ret;
140	}
141
142	/** untraining of a document.
143	To remove just one document from the references.
144
145	@see updateProbabilities()
146	@see untrain()
147	@return bool success
148	@param string document id, must be unique
149	*/
150	function untrain($doc_id) {
151	$ref = $this->nbs->getReference($doc_id);
152	$tokens = $this->_getTokens($ref['content']);
153
154	while (list($token, $count) = each($tokens)) {
155	$this->nbs->removeWord($token, $count, $ref['category_id']);
156	}
157
158	$this->nbs->removeReference($doc_id);
159
160	return true;
161	}
162
163	/** rescale the results between 0 and 1.
164
165	@author Ken Williams, ken@mathforum.org
166	@see categorize()
167	@return array normalized scores (keys => category, values => scores)
168	@param array scores (keys => category, values => scores)
169	*/
170	function _rescale($scores) {
171	// Scale everything back to a reasonable area in
172	// logspace (near zero), un-loggify, and normalize
173	$total = 0.0;
174	$max = 0.0;
175	reset($scores);
176
177	while (list($cat, $score) = each($scores)) {
178	if ($score >= $max)
179	$max = $score;
180	}
181
182	reset($scores);
183	while (list($cat, $score) = each($scores)) {
184	$scores[$cat] = (float) exp($score - $max);
185	$total += (float) pow($scores[$cat], 2);
186	}
187
188	$total = (float) sqrt($total);
189
190	reset($scores);
191	while (list($cat, $score) = each($scores)) {
192	$scores[$cat] = (float) $scores[$cat] / $total;
193	}
194	reset($scores);
195
196	return $scores;
197	}
198
199	/** update the probabilities of the categories and word count.
200	This function must be run after a set of training
201
202	@see train()
203	@see untrain()
204	@return bool sucess
205	*/
206	function updateProbabilities() {
207	// this function is really only database manipulation
208	// that is why all is done in the NaiveBayesianStorage
209	return $this->nbs->updateProbabilities();
210	}
211
212	/** Get the list of token to ignore.
213	@return array ignore list
214	*/
215	function getIgnoreList() {
216	return array('the', 'that', 'you', 'for', 'and');
217	}
218
219	/** get the tokens from a string
220
221	@author James Seng. [http://james.seng.cc/] (based on his perl version)
222
223	@return array tokens
224	@param string the string to get the tokens from
225	*/
226	function _getTokens($string) {
227	$rawtokens = array();
228	$tokens = array();
229	$string = $this->_cleanString($string);
230
231	if (count(0 >= $this->ignore_list)) {
232	$this->ignore_list = $this->getIgnoreList();
233	}
234
235	$rawtokens = split("[^-_A-Za-z0-9]+", $string);
236
237	// remove some tokens
238	while (list(, $token) = each($rawtokens)) {
239	$token = trim($token);
240	if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
241	$tokens[$token]++;
242	}
243	}
244
245	return $tokens;
246	}
247
248	/** clean a string from the diacritics
249
250	@author Antoine Bajolet [phpdig_at_toiletoine.net]
251	@author SPIP [http://uzine.net/spip/]
252
253	@return string clean string
254	@param string string with accents
255	*/
256	function _cleanString($string) {
257	$diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
258	/* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
259	/* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
260	/* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
261	/* E */ chr(200) . chr(201) . chr(202) . chr(203) .
262	/* e */ chr(232) . chr(233) . chr(234) . chr(235) .
263	/* Cc */ chr(199) . chr(231) .
264	/* I */ chr(204) . chr(205) . chr(206) . chr(207) .
265	/* i */ chr(236) . chr(237) . chr(238) . chr(239) .
266	/* U */ chr(217) . chr(218) . chr(219) . chr(220) .
267	/* u */ chr(249) . chr(250) . chr(251) . chr(252) .
268	/* yNn */ chr(255) . chr(209) . chr(241);
269
270	return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
271	}
272
273	}