[tt-rss.git] / plugins / af_sort_bayes / lib / class.naivebayesian.php

<?php
	/*
	 ***** BEGIN LICENSE BLOCK *****
	 This file is part of PHP Naive Bayesian Filter.

	 The Initial Developer of the Original Code is
	 Loic d'Anterroches [loic_at_xhtml.net].
	 Portions created by the Initial Developer are Copyright (C) 2003
	 the Initial Developer. All Rights Reserved.

	 Contributor(s):
	 See the source

	 PHP Naive Bayesian Filter is free software; you can redistribute it
	 and/or modify it under the terms of the GNU General Public License as
	 published by the Free Software Foundation; either version 2 of
	 the License, or (at your option) any later version.

	 PHP Naive Bayesian Filter is distributed in the hope that it will
	 be useful, but WITHOUT ANY WARRANTY; without even the implied
	 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	 See the GNU General Public License for more details.

	 You should have received a copy of the GNU General Public License
	 along with Foobar; if not, write to the Free Software
	 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

	 Alternatively, the contents of this file may be used under the terms of
	 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
	 in which case the provisions of the LGPL are applicable instead
	 of those above.

	 ***** END LICENSE BLOCK *****
	 */

	/**
	 * Naive Bayesian text classifier.
	 *
	 * Tokenizes documents, delegates all persistence (categories, word counts,
	 * trained references) to the storage object passed to the constructor, and
	 * scores a document against every category known to that storage.
	 */
	class NaiveBayesian {
		/** min token length for it to be taken into consideration */
		public $min_token_length = 3;
		/** max token length for it to be taken into consideration */
		public $max_token_length = 15;
		/** list of tokens to ignore; filled from getIgnoreList() when left empty
		 @see getIgnoreList()
		 */
		public $ignore_list = array();
		/** storage object
		 @see class NaiveBayesianStorage
		 */
		public $nbs = null;

		/**
		 * Create a classifier bound to a storage backend.
		 *
		 * @param object $nbs storage object; this class calls getCategories(),
		 *                    wordExists(), getWord(), getReference(), updateWord(),
		 *                    removeWord(), saveReference(), removeReference() and
		 *                    updateProbabilities() on it
		 */
		public function __construct($nbs) {
			$this->nbs = $nbs;
		}

		/**
		 * Legacy PHP4-style constructor name, kept so that existing explicit
		 * calls (e.g. parent::NaiveBayesian($nbs)) keep working. PHP 8 no longer
		 * treats a method named after the class as a constructor, which is why
		 * __construct() above does the real work.
		 *
		 * @param object $nbs storage object
		 * @return bool always true
		 */
		public function NaiveBayesian($nbs) {
			$this->nbs = $nbs;

			return true;
		}

		/**
		 * Categorize a document.
		 *
		 * Computes, for every category returned by the storage, a score built
		 * from the category probability and the per-category frequency of each
		 * known token, then normalizes the scores with _rescale().
		 *
		 * @param string $document text to classify
		 * @return array keys = category ids, values = scores
		 */
		public function categorize($document) {
			$scores = array();
			$categories = $this->nbs->getCategories();

			if (!is_array($categories) || count($categories) == 0) {
				// nothing to score against
				return $scores;
			}

			$tokens = $this->_getTokens($document);

			$total_words = 0;
			$ncat = 0;

			foreach ($categories as $data) {
				$total_words += $data['word_count'];
				$ncat++;
			}

			// wordExists() is called with the token only, so its answer cannot
			// differ between categories: ask the storage once per token instead
			// of once per (category, token) pair.
			$known_tokens = array();

			foreach ($tokens as $token => $count) {
				if ($this->nbs->wordExists($token)) {
					$known_tokens[$token] = $count;
				}
			}

			// average category size, used below to keep the product from underflowing
			$avg_words = $total_words / $ncat;

			foreach ($categories as $category => $data) {
				$scores[$category] = $data['probability'];

				// small probability for a word not in the category
				// maybe putting 1.0 as a 'no effect' word can also be good
				if ($data['word_count'] > 0) {
					$small_proba = 1.0 / ($data['word_count'] * 2);
				} else {
					$small_proba = 0;
				}

				foreach ($known_tokens as $token => $count) {
					$word = $this->nbs->getWord($token, $category);

					// the word_count guard avoids a division by zero if the
					// storage reports a word count for an empty category
					if (!empty($word['count']) && $data['word_count'] > 0) {
						$proba = $word['count'] / $data['word_count'];
					} else {
						$proba = $small_proba;
					}

					// pow($avg_words, $count) is here to avoid underflow.
					$scores[$category] *= pow($proba, $count) * pow($avg_words, $count);
				}
			}

			return $this->_rescale($scores);
		}

		/**
		 * Train against a document.
		 *
		 * Registers every token of the document under the given category and
		 * saves the document as a reference. Nothing is done when the storage
		 * already returns a reference for this document id. After a set of
		 * trainings updateProbabilities() must be run.
		 *
		 * @see updateProbabilities()
		 * @see untrain()
		 * @param string $doc_id      document id, must be unique
		 * @param string $category_id category the document belongs to
		 * @param string $content     content of the document
		 * @return bool true when the document was trained, false when it was already known
		 */
		public function train($doc_id, $category_id, $content) {
			// already trained: do not count the same document twice
			if ($this->nbs->getReference($doc_id, false)) {
				return false;
			}

			foreach ($this->_getTokens($content) as $token => $count) {
				$this->nbs->updateWord($token, $count, $category_id);
			}

			$this->nbs->saveReference($doc_id, $category_id, $content);

			return true;
		}

		/**
		 * Untrain a document.
		 *
		 * Removes the token counts of one previously trained document from its
		 * category and deletes its reference.
		 *
		 * @see updateProbabilities()
		 * @see train()
		 * @param string $doc_id document id, must be unique
		 * @return bool true when a reference with content was found and removed
		 */
		public function untrain($doc_id) {
			$ref = $this->nbs->getReference($doc_id);

			if (!isset($ref['content'])) {
				return false;
			}

			foreach ($this->_getTokens($ref['content']) as $token => $count) {
				$this->nbs->removeWord($token, $count, $ref['category_id']);
			}

			$this->nbs->removeReference($doc_id);

			return true;
		}

		/**
		 * Rescale the results between 0 and 1.
		 *
		 * Each score is replaced by exp(score - max) and the resulting vector is
		 * divided by its Euclidean norm.
		 *
		 * NOTE(review): the original comment describes the input as log-space
		 * scores, but categorize() passes plain probability products. The
		 * historical behaviour is kept unchanged here; confirm whether the
		 * scoring should really be moved to log space.
		 *
		 * @author Ken Williams, ken@mathforum.org
		 * @see categorize()
		 * @param array $scores keys => category, values => scores
		 * @return array normalized scores (keys => category, values => scores)
		 */
		public function _rescale($scores) {
			// the largest score, but never less than 0.0
			$max = 0.0;

			foreach ($scores as $score) {
				if ($score >= $max) {
					$max = $score;
				}
			}

			$total = 0.0;

			foreach ($scores as $cat => $score) {
				$scores[$cat] = (float) exp($score - $max);
				$total += (float) pow($scores[$cat], 2);
			}

			$total = (float) sqrt($total);

			foreach (array_keys($scores) as $cat) {
				$scores[$cat] = (float) $scores[$cat] / $total;
			}

			return $scores;
		}

		/**
		 * Update the probabilities of the categories and word count.
		 * This function must be run after a set of training.
		 *
		 * @see train()
		 * @see untrain()
		 * @return bool success, as reported by the storage object
		 */
		public function updateProbabilities() {
			// this function is really only database manipulation
			// that is why all is done in the NaiveBayesianStorage
			return $this->nbs->updateProbabilities();
		}

		/**
		 * Get the default list of tokens to ignore.
		 *
		 * @return array ignore list
		 */
		public function getIgnoreList() {
			return array('the', 'that', 'you', 'for', 'and');
		}

		/**
		 * Get the tokens from a string.
		 *
		 * Splits on parentheses, comma, colon, dot, semicolon and whitespace,
		 * then drops tokens that are empty, contain "&", are shorter than
		 * $min_token_length or longer than $max_token_length, consist of digits
		 * only, or appear in $ignore_list.
		 *
		 * @author James Seng. [http://james.seng.cc/] (based on his perl version)
		 * @param string $string the string to get the tokens from
		 * @return array keys = tokens, values = number of occurrences
		 */
		public function _getTokens($string) {
			$tokens = array();
			//$string = $this->_cleanString($string);

			// Fall back to the default list only when none was configured.
			// (The original test, count(0 >= $this->ignore_list), counted a
			// boolean and therefore never looked at the list at all.)
			if (empty($this->ignore_list)) {
				$this->ignore_list = $this->getIgnoreList();
			}

			$rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", (string) $string, -1, PREG_SPLIT_NO_EMPTY);

			if ($rawtokens === false) {
				// preg_split() failed; there is nothing to tokenize
				return $tokens;
			}

			// remove some tokens
			foreach ($rawtokens as $token) {
				$token = trim($token);

				if ('' == $token
					|| mb_strpos($token, "&") !== false
					|| mb_strlen($token) < $this->min_token_length
					|| mb_strlen($token) > $this->max_token_length
					|| preg_match('/^[0-9]+$/', $token)
					|| in_array($token, $this->ignore_list, true)) {
					continue;
				}

				// explicit initialisation: incrementing an undefined index
				// raises a notice/warning on every first occurrence
				$tokens[$token] = isset($tokens[$token]) ? $tokens[$token] + 1 : 1;
			}

			return $tokens;
		}

		/**
		 * Clean a string from the diacritics and lowercase it.
		 *
		 * The translation table is built from single bytes via chr(), and
		 * strtr() replaces byte by byte.
		 * NOTE(review): this presumably expects ISO-8859-1 input; on UTF-8 text
		 * it would rewrite bytes inside multibyte sequences. The only call
		 * in this class is commented out in _getTokens().
		 *
		 * @author Antoine Bajolet [phpdig_at_toiletoine.net]
		 * @author SPIP [http://uzine.net/spip/]
		 * @param string $string string with accents
		 * @return string clean string
		 */
		public function _cleanString($string) {
			$diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
				/* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
				/* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
				/* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
				/* E */ chr(200) . chr(201) . chr(202) . chr(203) .
				/* e */ chr(232) . chr(233) . chr(234) . chr(235) .
				/* Cc */ chr(199) . chr(231) .
				/* I */ chr(204) . chr(205) . chr(206) . chr(207) .
				/* i */ chr(236) . chr(237) . chr(238) . chr(239) .
				/* U */ chr(217) . chr(218) . chr(219) . chr(220) .
				/* u */ chr(249) . chr(250) . chr(251) . chr(252) .
				/* yNn */ chr(255) . chr(209) . chr(241);

			return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
		}

	}
Commit	Line	Data
853cc128 AD	1	<?php
	2	/*
	3	*** BEGIN LICENSE BLOCK ***
	4	This file is part of PHP Naive Bayesian Filter.
	5
	6	The Initial Developer of the Original Code is
	7	Loic d'Anterroches [loic_at_xhtml.net].
	8	Portions created by the Initial Developer are Copyright (C) 2003
	9	the Initial Developer. All Rights Reserved.
	10
	11	Contributor(s):
	12	See the source
	13
	14	PHP Naive Bayesian Filter is free software; you can redistribute it
	15	and/or modify it under the terms of the GNU General Public License as
	16	published by the Free Software Foundation; either version 2 of
	17	the License, or (at your option) any later version.
	18
	19	PHP Naive Bayesian Filter is distributed in the hope that it will
	20	be useful, but WITHOUT ANY WARRANTY; without even the implied
	21	warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	22	See the GNU General Public License for more details.
	23
	24	You should have received a copy of the GNU General Public License
	25	along with Foobar; if not, write to the Free Software
	26	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	27
	28	Alternatively, the contents of this file may be used under the terms of
	29	the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
	30	in which case the provisions of the LGPL are applicable instead
	31	of those above.
	32
	33	*** END LICENSE BLOCK ***
	34	*/
	35
	36	class NaiveBayesian {
	37	/** min token length for it to be taken into consideration */
	38	var $min_token_length = 3;
	39	/** max token length for it to be taken into consideration */
	40	var $max_token_length = 15;
	41	/** list of token to ignore
	42	@see getIgnoreList()
	43	*/
	44	var $ignore_list = array();
	45	/** storage object
	46	@see class NaiveBayesianStorage
	47	*/
	48	var $nbs = null;
	49
	50	function NaiveBayesian($nbs) {
	51	$this->nbs = $nbs;
	52
	53	return true;
	54	}
	55
	56	/** categorize a document.
	57	Get list of categories in which the document can be categorized
	58	with a score for each category.
	59
	60	@return array keys = category ids, values = scores
	61	@param string document
	62	*/
	63	function categorize($document) {
	64	$scores = array();
65	$categories = $this->nbs->getCategories();
66	$tokens = $this->_getTokens($document);
67
68	// calculate the score in each category
69	$total_words = 0;
70	$ncat = 0;
71
72	while (list($category, $data) = each($categories)) {
73	$total_words += $data['word_count'];
74	$ncat++;
75	}
76
77	reset($categories);
78
79	while (list($category, $data) = each($categories)) {
80	$scores[$category] = $data['probability'];
81	// small probability for a word not in the category
82	// maybe putting 1.0 as a 'no effect' word can also be good
4da0cb32 AD	83
	84	if ($data['word_count'] > 0)
	85	$small_proba = 1.0 / ($data['word_count'] * 2);
	86	else
	87	$small_proba = 0;
853cc128 AD	88
	89	reset($tokens);
	90
	91	while (list($token, $count) = each($tokens)) {
59e83455	92
853cc128 AD	93	if ($this->nbs->wordExists($token)) {
	94	$word = $this->nbs->getWord($token, $category);
	95
	96	if ($word['count']) {
	97	$proba = $word['count'] / $data['word_count'];
	98	}
	99	else {
	100	$proba = $small_proba;
	101	}
	102
	103	$scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
	104	// pow($total_words/$ncat, $count) is here to avoid underflow.
	105
	106	}
	107	}
	108	}
	109
	110	return $this->_rescale($scores);
	111	}
	112
	113	/** training against a document.
	114	Set a document as being in a specific category. The document becomes a reference
	115	and is saved in the table of references. After a set of training is done
	116	the updateProbabilities() function must be run.
	117
	118	@see updateProbabilities()
	119	@see untrain()
	120	@return bool success
	121	@param string document id, must be unique
	122	@param string category_id the category id in which the document should be
	123	@param string content of the document
	124	*/
	125	function train($doc_id, $category_id, $content) {
	126	$ret = false;
	127
59e83455	128
853cc128	129	// if this doc_id already trained, no trained
59e83455 AD	130	if (!$this->nbs->getReference($doc_id, false)) {
59e83455 AD	131
853cc128 AD	132	$tokens = $this->_getTokens($content);
	133
	134	while (list($token, $count) = each($tokens)) {
	135	$this->nbs->updateWord($token, $count, $category_id);
	136	}
	137
	138	$this->nbs->saveReference($doc_id, $category_id, $content);
	139
	140	$ret = true;
	141	}
	142	else {
	143	$ret = false;
	144	}
	145
	146	return $ret;
	147	}
	148
	149	/** untraining of a document.
	150	To remove just one document from the references.
	151
	152	@see updateProbabilities()
	153	@see untrain()
	154	@return bool success
	155	@param string document id, must be unique
	156	*/
	157	function untrain($doc_id) {
	158	$ref = $this->nbs->getReference($doc_id);
853cc128	159
59e83455	160	if (isset($ref['content'])) {
853cc128	161
59e83455	162	$tokens = $this->_getTokens($ref['content']);
853cc128	163
59e83455 AD	164	while (list($token, $count) = each($tokens)) {
	165	$this->nbs->removeWord($token, $count, $ref['category_id']);
	166	}
	167
	168	$this->nbs->removeReference($doc_id);
	169
	170	return true;
	171	} else {
	172	return false;
	173	}
853cc128 AD	174	}
	175
	176	/** rescale the results between 0 and 1.
	177
	178	@author Ken Williams, ken@mathforum.org
	179	@see categorize()
	180	@return array normalized scores (keys => category, values => scores)
	181	@param array scores (keys => category, values => scores)
	182	*/
	183	function _rescale($scores) {
	184	// Scale everything back to a reasonable area in
	185	// logspace (near zero), un-loggify, and normalize
	186	$total = 0.0;
	187	$max = 0.0;
	188	reset($scores);
	189
	190	while (list($cat, $score) = each($scores)) {
	191	if ($score >= $max)
	192	$max = $score;
	193	}
	194
	195	reset($scores);
	196	while (list($cat, $score) = each($scores)) {
	197	$scores[$cat] = (float) exp($score - $max);
	198	$total += (float) pow($scores[$cat], 2);
	199	}
	200
	201	$total = (float) sqrt($total);
	202
	203	reset($scores);
	204	while (list($cat, $score) = each($scores)) {
	205	$scores[$cat] = (float) $scores[$cat] / $total;
	206	}
	207	reset($scores);
	208
	209	return $scores;
	210	}
	211
	212	/** update the probabilities of the categories and word count.
	213	This function must be run after a set of training
	214
	215	@see train()
	216	@see untrain()
	217	@return bool sucess
	218	*/
	219	function updateProbabilities() {
	220	// this function is really only database manipulation
	221	// that is why all is done in the NaiveBayesianStorage
	222	return $this->nbs->updateProbabilities();
	223	}
	224
	225	/** Get the list of token to ignore.
	226	@return array ignore list
	227	*/
	228	function getIgnoreList() {
	229	return array('the', 'that', 'you', 'for', 'and');
	230	}
	231
	232	/** get the tokens from a string
	233
	234	@author James Seng. [http://james.seng.cc/] (based on his perl version)
	235
	236	@return array tokens
	237	@param string the string to get the tokens from
238	*/
239	function _getTokens($string) {
240	$rawtokens = array();
241	$tokens = array();
59e83455	242	//$string = $this->_cleanString($string);
853cc128 AD	243
	244	if (count(0 >= $this->ignore_list)) {
	245	$this->ignore_list = $this->getIgnoreList();
	246	}
	247
59e83455	248	$rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);
853cc128 AD	249
	250	// remove some tokens
	251	while (list(, $token) = each($rawtokens)) {
	252	$token = trim($token);
59e83455	253	if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
853cc128 AD	254	$tokens[$token]++;
	255	}
	256	}
	257
	258	return $tokens;
	259	}
	260
	261	/** clean a string from the diacritics
	262
	263	@author Antoine Bajolet [phpdig_at_toiletoine.net]
	264	@author SPIP [http://uzine.net/spip/]
	265
	266	@return string clean string
	267	@param string string with accents
	268	*/
	269	function _cleanString($string) {
	270	$diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
	271	/* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
	272	/* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
	273	/* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
	274	/* E */ chr(200) . chr(201) . chr(202) . chr(203) .
	275	/* e */ chr(232) . chr(233) . chr(234) . chr(235) .
	276	/* Cc */ chr(199) . chr(231) .
	277	/* I */ chr(204) . chr(205) . chr(206) . chr(207) .
	278	/* i */ chr(236) . chr(237) . chr(238) . chr(239) .
	279	/* U */ chr(217) . chr(218) . chr(219) . chr(220) .
	280	/* u */ chr(249) . chr(250) . chr(251) . chr(252) .
	281	/* yNn */ chr(255) . chr(209) . chr(241);
	282
	283	return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
	284	}
	285
	286	}