X-Git-Url: https://git.wh0rd.org/?a=blobdiff_plain;f=plugins%2Faf_sort_bayes%2Flib%2Fclass.naivebayesian.php;h=c80c3f2151b27ac71a2156ca08a29ea58a8efc43;hb=59e83455f188e48796383bfe4be99deb81cb1caa;hp=1c2ef463bf068dde9ab6f3d598d959fd5480e357;hpb=853cc128d6e262b4a7a693c6321a406674837d73;p=tt-rss.git diff --git a/plugins/af_sort_bayes/lib/class.naivebayesian.php b/plugins/af_sort_bayes/lib/class.naivebayesian.php index 1c2ef463..c80c3f21 100644 --- a/plugins/af_sort_bayes/lib/class.naivebayesian.php +++ b/plugins/af_sort_bayes/lib/class.naivebayesian.php @@ -85,6 +85,7 @@ reset($tokens); while (list($token, $count) = each($tokens)) { + if ($this->nbs->wordExists($token)) { $word = $this->nbs->getWord($token, $category); @@ -120,8 +121,10 @@ function train($doc_id, $category_id, $content) { $ret = false; + // if this doc_id already trained, no trained - if (!$this->nbs->getReference($doc_id)) { + if (!$this->nbs->getReference($doc_id, false)) { + $tokens = $this->_getTokens($content); while (list($token, $count) = each($tokens)) { @@ -149,15 +152,21 @@ */ function untrain($doc_id) { $ref = $this->nbs->getReference($doc_id); - $tokens = $this->_getTokens($ref['content']); - while (list($token, $count) = each($tokens)) { - $this->nbs->removeWord($token, $count, $ref['category_id']); - } + if (isset($ref['content'])) { - $this->nbs->removeReference($doc_id); + $tokens = $this->_getTokens($ref['content']); - return true; + while (list($token, $count) = each($tokens)) { + $this->nbs->removeWord($token, $count, $ref['category_id']); + } + + $this->nbs->removeReference($doc_id); + + return true; + } else { + return false; + } } /** rescale the results between 0 and 1. @@ -226,18 +235,18 @@ function _getTokens($string) { $rawtokens = array(); $tokens = array(); - $string = $this->_cleanString($string); + //$string = $this->_cleanString($string); if (count(0 >= $this->ignore_list)) { $this->ignore_list = $this->getIgnoreList(); } - $rawtokens = split("[^-_A-Za-z0-9]+", $string); + $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY); // remove some tokens while (list(, $token) = each($rawtokens)) { $token = trim($token); - if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) { + if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) { $tokens[$token]++; } }