]> git.wh0rd.org - tt-rss.git/blobdiff - plugins/af_sort_bayes/lib/class.naivebayesian.php
add some more bayes stuff
[tt-rss.git] / plugins / af_sort_bayes / lib / class.naivebayesian.php
index 1c2ef463bf068dde9ab6f3d598d959fd5480e357..c80c3f2151b27ac71a2156ca08a29ea58a8efc43 100644 (file)
@@ -85,6 +85,7 @@
                                reset($tokens);
 
                                while (list($token, $count) = each($tokens)) {
+
                                        if ($this->nbs->wordExists($token)) {
                                                $word = $this->nbs->getWord($token, $category);
 
                function train($doc_id, $category_id, $content) {
                        $ret = false;
 
+
                        // only train if this doc_id has not already been trained
-                       if (!$this->nbs->getReference($doc_id)) {
+                       if (!$this->nbs->getReference($doc_id, false)) {
+
                                $tokens = $this->_getTokens($content);
 
                                while (list($token, $count) = each($tokens)) {
                 */
                function untrain($doc_id) {
                        $ref = $this->nbs->getReference($doc_id);
-                       $tokens = $this->_getTokens($ref['content']);
 
-                       while (list($token, $count) = each($tokens)) {
-                               $this->nbs->removeWord($token, $count, $ref['category_id']);
-                       }
+                       if (isset($ref['content'])) {
 
-                       $this->nbs->removeReference($doc_id);
+                               $tokens = $this->_getTokens($ref['content']);
 
-                       return true;
+                               while (list($token, $count) = each($tokens)) {
+                                       $this->nbs->removeWord($token, $count, $ref['category_id']);
+                               }
+
+                               $this->nbs->removeReference($doc_id);
+
+                               return true;
+                       } else {
+                               return false;
+                       }
                }
 
                /** rescale the results between 0 and 1.
                function _getTokens($string) {
                        $rawtokens = array();
                        $tokens = array();
-                       $string = $this->_cleanString($string);
+                       //$string = $this->_cleanString($string);
 
                        if (count(0 >= $this->ignore_list)) {
                                $this->ignore_list = $this->getIgnoreList();
                        }
 
-                       $rawtokens = split("[^-_A-Za-z0-9]+", $string);
+                       $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);
 
                        // remove some tokens
                        while (list(, $token) = each($rawtokens)) {
                                $token = trim($token);
-                               if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
+                               if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
                                        $tokens[$token]++;
                                }
                        }