git.wh0rd.org Git - tt-rss.git/commitdiff
add some more bayes stuff
author Andrew Dolgov <noreply@madoka.volgo-balt.ru>
Wed, 17 Jun 2015 12:15:04 +0000 (15:15 +0300)
committer Andrew Dolgov <noreply@madoka.volgo-balt.ru>
Wed, 17 Jun 2015 12:15:04 +0000 (15:15 +0300)
include/rssfuncs.php
plugins/af_sort_bayes/init.php
plugins/af_sort_bayes/lib/class.naivebayesian.php
plugins/af_sort_bayes/lib/class.naivebayesianstorage.php

diff --git a/include/rssfuncs.php b/include/rssfuncs.php
index 17233914ee1b096ff7d7a9d21aa3028c4efce303..4dbb7c18e427b4a46bad7e014fda58611bf07dbf 100644 (file)
 
                                $article = array("owner_uid" => $owner_uid, // read only
                                        "guid" => $entry_guid, // read only
+                                       "guid_hashed" => $entry_guid_hashed, // read only
                                        "title" => $entry_title,
                                        "content" => $entry_content,
                                        "link" => $entry_link,
                                                        lang = '$entry_language'
                                                WHERE id = '$ref_id'");
 
+                                       // update aux data
+                                       db_query("UPDATE ttrss_user_entries
+                                                       SET score = '$score' WHERE ref_id = '$ref_id'");
+
                                        if ($mark_unread_on_update) {
                                                db_query("UPDATE ttrss_user_entries
                                                        SET last_read = null, unread = true WHERE ref_id = '$ref_id'");
diff --git a/plugins/af_sort_bayes/init.php b/plugins/af_sort_bayes/init.php
index 213c6aede3fe7df6fbf586a3dc25072c3936658f..23f38ec25df97778b0ba9f1d8e8b3e5fa9e6ef20 100644 (file)
@@ -5,6 +5,7 @@ class Af_Sort_Bayes extends Plugin {
        private $host;
        private $filters = array();
        private $dbh;
+       private $score_modifier = 50;
 
        function about() {
                return array(1.0,
@@ -31,8 +32,39 @@ class Af_Sort_Bayes extends Plugin {
                $article_id = (int) $_REQUEST["article_id"];
                $train_up = sql_bool_to_bool($_REQUEST["train_up"]);
 
-               print "FIXME: $article_id :: $train_up";
+               $category = $train_up ? "GOOD" : "NEUTRAL";
 
+               $nbs = new NaiveBayesianStorage($_SESSION["uid"]);
+               $nb = new NaiveBayesian($nbs);
+
+               $result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
+                       $article_id . " AND owner_uid = " . $_SESSION["uid"]);
+
+               if ($this->dbh->num_rows($result) != 0) {
+                       $guid = $this->dbh->fetch_result($result, 0, "guid");
+                       $title = $this->dbh->fetch_result($result, 0, "title");
+                       $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
+                       $score = $this->dbh->fetch_result($result, 0, "score");
+
+                       $this->dbh->query("BEGIN");
+
+                       if ($nb->untrain($guid, $content)) {
+                               if ($score >= $this->score_modifier) $score -= $this->score_modifier;
+                       }
+
+                       $nb->train($guid, $nbs->getCategoryByName($category), $content);
+
+                       if ($category == "GOOD") $score += $this->score_modifier;
+
+                       $this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $_SESSION["uid"]);
+
+                       $nb->updateProbabilities();
+
+                       $this->dbh->query("COMMIT");
+
+               }
+
+               print "$article_id :: $category";
        }
 
        function get_js() {
@@ -54,9 +86,11 @@ class Af_Sort_Bayes extends Plugin {
        function init_database() {
                $prefix = "ttrss_plugin_af_sort_bayes";
 
-               /*$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_references", false);
-               $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_categories", false);
-               $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_wordfreqs", false);*/
+               // TODO there probably should be a way for plugins to determine their schema version to upgrade tables
+
+               /*$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_wordfreqs", false);
+               $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_references", false);
+               $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_categories", false);*/
 
                $this->dbh->query("BEGIN");
 
@@ -69,9 +103,9 @@ class Af_Sort_Bayes extends Plugin {
                        owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
                        word_count BIGINT NOT NULL DEFAULT '0')");
 
-               $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_documents (
+               $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_references (
                        id SERIAL NOT NULL PRIMARY KEY,
-                       document varchar(250) NOT NULL DEFAULT '',
+                       document_id VARCHAR(255) NOT NULL,
                        category_id INTEGER NOT NULL REFERENCES ${prefix}_categories(id) ON DELETE CASCADE,
                        owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
                        content text NOT NULL)");
@@ -82,6 +116,17 @@ class Af_Sort_Bayes extends Plugin {
                        owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
                        count BIGINT NOT NULL DEFAULT '0')");
 
+               $owner_uid = @$_SESSION["uid"];
+
+               if ($owner_uid) {
+                       $result = $this->dbh->query("SELECT id FROM ${prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");
+
+                       if ($this->dbh->num_rows($result) == 0) {
+                               $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
+                               $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('NEUTRAL', $owner_uid)");
+                       }
+               }
+
                $this->dbh->query("COMMIT");
        }
 
@@ -98,6 +143,52 @@ class Af_Sort_Bayes extends Plugin {
        function hook_article_filter($article) {
                $owner_uid = $article["owner_uid"];
 
+               $nbs = new NaiveBayesianStorage($owner_uid);
+               $nb = new NaiveBayesian($nbs);
+
+               $categories = $nbs->getCategories();
+
+               if (count($categories) > 0) {
+
+                       $count_neutral = 0;
+                       $count_good = 0;
+                       $id_good = 0;
+                       $id_neutral = 0;
+
+                       foreach ($categories as $id => $cat) {
+                               if ($cat["category"] == "GOOD") {
+                                       $id_good = $id;
+                                       $count_good += $cat["word_count"];
+                               } else if ($cat["category"] == "NEUTRAL") {
+                                       $id_neutral = $id;
+                                       $count_neutral += $cat["word_count"];
+                               }
+                       }
+
+                       $dst_category = $id_neutral;
+
+                       $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));
+
+                       if ($count_neutral >= 3000 && $count_good >= 1000) {
+                               // enable automatic categorization
+
+                               $result = $nb->categorize($bayes_content);
+
+                               if (count($result) == 2) {
+                                       $prob_good = $result[$id_good];
+                                       $prob_neutral = $result[$id_neutral];
+
+                                       if ($prob_good > 0.90 && $prob_good > $prob_neutral) {
+                                               //$dst_category = $id_good; // should we autofile as good or not? idk
+                                               $article["score_modifier"] += $this->score_modifier;
+                                       }
+                               }
+                       }
+
+                       $nb->train($article["guid_hashed"], $dst_category, $bayes_content);
+
+                       $nb->updateProbabilities();
+               }
 
                return $article;
 
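diff --git a/plugins/af_sort_bayes/lib/class.naivebayesian.php b/plugins/af_sort_bayes/lib/class.naivebayesian.php
index 1c2ef463bf068dde9ab6f3d598d959fd5480e357..c80c3f2151b27ac71a2156ca08a29ea58a8efc43 100644 (file)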
@@ -85,6 +85,7 @@
                                reset($tokens);
 
                                while (list($token, $count) = each($tokens)) {
+
                                        if ($this->nbs->wordExists($token)) {
                                                $word = $this->nbs->getWord($token, $category);
 
                function train($doc_id, $category_id, $content) {
                        $ret = false;
 
+
                        // if this doc_id already trained, no trained
-                       if (!$this->nbs->getReference($doc_id)) {
+                       if (!$this->nbs->getReference($doc_id, false)) {
+
                                $tokens = $this->_getTokens($content);
 
                                while (list($token, $count) = each($tokens)) {
                 */
                function untrain($doc_id) {
                        $ref = $this->nbs->getReference($doc_id);
-                       $tokens = $this->_getTokens($ref['content']);
 
-                       while (list($token, $count) = each($tokens)) {
-                               $this->nbs->removeWord($token, $count, $ref['category_id']);
-                       }
+                       if (isset($ref['content'])) {
 
-                       $this->nbs->removeReference($doc_id);
+                               $tokens = $this->_getTokens($ref['content']);
 
-                       return true;
+                               while (list($token, $count) = each($tokens)) {
+                                       $this->nbs->removeWord($token, $count, $ref['category_id']);
+                               }
+
+                               $this->nbs->removeReference($doc_id);
+
+                               return true;
+                       } else {
+                               return false;
+                       }
                }
 
                /** rescale the results between 0 and 1.
                function _getTokens($string) {
                        $rawtokens = array();
                        $tokens = array();
-                       $string = $this->_cleanString($string);
+                       //$string = $this->_cleanString($string);
 
                        if (count(0 >= $this->ignore_list)) {
                                $this->ignore_list = $this->getIgnoreList();
                        }
 
-                       $rawtokens = split("[^-_A-Za-z0-9]+", $string);
+                       $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);
 
                        // remove some tokens
                        while (list(, $token) = each($rawtokens)) {
                                $token = trim($token);
-                               if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
+                               if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
                                        $tokens[$token]++;
                                }
                        }
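diff --git a/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php b/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php
index fccdcaf06eb7c643a1d098e558d5df0240256063..4727705ef95c6b822d1c5e925e0a70e9307bea09 100644 (file)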
                 */
                function getCategories() {
                        $categories = array();
-                       $rs = $this->con->query('SELECT * FROM ttrss_plugin_af_sort_bayes_categories');
+                       $rs = $this->con->query('SELECT * FROM ttrss_plugin_af_sort_bayes_categories WHERE owner_uid = ' . $this->owner_uid);
 
-                       while ($this->con->fetch_assoc($rs)) {
-                               $categories[$rs['category_id']] = array('probability' => $rs['probability'],
-                                       'word_count'  => $rs['word_count']
+                       while ($line = $this->con->fetch_assoc($rs)) {
+                               $categories[$line['id']] = array('probability' => $line['probability'],
+                                       'category' => $line['category'],
+                                       'word_count' => $line['word_count']
                                );
-
-                               
                        }
 
                        return $categories;
                }
 
+               function getCategoryByName($category) {
+                       $rs = $this->con->query("SELECT id FROM ttrss_plugin_af_sort_bayes_categories WHERE category = '" .
+                               $this->con->escape_string($category) . "' AND owner_uid = " . $this->owner_uid);
+
+                       if ($this->con->num_rows($rs) != 0) {
+                               return $this->con->fetch_result($rs, 0, "id");
+                       }
+
+                       return false;
+               }
+
                /** see if the word is an already learnt word.
                 @return bool
                 @param string word
                 */
                function wordExists($word) {
-                       $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "'");
+                       $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND
+                               owner_uid = " . $this->owner_uid);
 
                        return $this->con->num_rows($rs) != 0;
                }
                function getWord($word, $category_id) {
                        $details = array();
 
-                       $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND category_id='" . $this->con->escape_string($category_id) . "'");
+                       $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" .
+                               $this->con->escape_string($word) . "' AND category_id=" . (int)$category_id);
 
                        if ($this->con->num_rows($rs) == 0 ) {
                                $details['count'] = 0;
-                       }
-                       else {
-                               $details['count'] = $rs['count'];
+                       } else {
+                               $details['count'] = $this->con->fetch_result($rs, 0, "count");
                        }
 
                        return $details;
                        $oldword = $this->getWord($word, $category_id);
 
                        if (0 == $oldword['count']) {
-                               return $this->con->execute("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count) VALUES ('" . $this->con->escape_string($word) . "', '" . $this->con->escape_string($category_id) . "', '" . $this->con->escape_string((int) $count) . "')");
+                               return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count, owner_uid)
+                                       VALUES ('" . $this->con->escape_string($word) . "', '" .
+                                       (int)$category_id . "', '" .
+                                       (int)$count . "', '".
+                                       $this->owner_uid . "')");
                        }
                        else {
-                               return $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'");
+                               return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'");
                        }
                }
 
                        $oldword = $this->getWord($word, $category_id);
 
                        if (0 != $oldword['count'] && 0 >= ($oldword['count'] - $count)) {
-                               return $this->con->execute("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND category_id='" . $this->con->escape_string($category_id) . "'");
+                               return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" .
+                                       $this->con->escape_string($word) . "' AND category_id='" .
+                                       $this->con->escape_string($category_id) . "'");
                        }
                        else {
-                               return $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'");
+                               return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " .
+                                       (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "'
+                                       AND word = '" . $this->con->escape_string($word) . "'");
                        }
                }
 
                 */
                function updateProbabilities() {
                        // first update the word count of each category
-                       $rs = $this->con->query("SELECT category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE 1 GROUP BY category_id");
-                       $total_words = 0;
-
-                       while ($this->con->fetch_assoc($rs)) {
-                               $total_words += $rs['total'];
-                               
-                       }
+                       $rs = $this->con->query("SELECT SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE owner_uid = ".$this->owner_uid);
 
-                       $rs->moveStart();
+                       $total_words = $this->con->fetch_result($rs, 0, "total");
 
                        if ($total_words == 0) {
-                               $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE 1");
-
+                               $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE owner_uid = " . $this->owner_uid);
                                return true;
                        }
 
-                       while ($this->con->fetch_assoc($rs)) {
-                               $proba = $rs['total'] / $total_words;
-                               $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=" . (int) $rs['total'] . ", probability=" . $proba . " WHERE category_id = '" . $rs['category_id'] . "'");
-                               
+                       $rs = $this->con->query("SELECT tc.id AS category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_categories AS tc
+                               LEFT JOIN ttrss_plugin_af_sort_bayes_wordfreqs AS tw ON (tc.id = tw.category_id) WHERE tc.owner_uid = ".$this->owner_uid." GROUP BY tc.id");
+
+                       while ($line = $this->con->fetch_assoc($rs)) {
+
+                               $proba = (int)$line['total'] / $total_words;
+                               $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=" . (int) $line['total'] .
+                                       ", probability=" . $proba . " WHERE id = '" . $line['category_id'] . "'");
                        }
 
                        return true;
                 @param  string content of the reference
                 */
                function saveReference($doc_id, $category_id, $content) {
-
-                       return $this->con->execute("INSERT INTO ttrss_plugin_af_sort_bayes_references (id, category_id, content) VALUES ('" . $this->con->escape_string($doc_id) . "', '" . $this->con->escape_string($category_id) . "', '" . $this->con->escape_string($content) . "')");
+                       return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_references (document_id, category_id, owner_uid) VALUES
+                               ('" . $this->con->escape_string($doc_id) . "', '" .
+                                       (int)$category_id . "', " .
+                                       (int)$this->owner_uid . ")");
                }
 
                /** get a reference from the database.
                 @return array  reference( category_id => ...., content => ....)
                 @param  string id
                 */
-               function getReference($doc_id) {
+               function getReference($doc_id, $include_content = true)
+               {
+
                        $ref = array();
-                       $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_references WHERE id='" . $this->con->escape_string($doc_id) . "'");
+                       $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" .
+                               $this->con->escape_string($doc_id) . "' AND owner_uid = " . $this->owner_uid);
 
-                       if ($this->con->num_rows($rs) == 0 ) {
+                       if ($this->con->num_rows($rs) == 0) {
                                return $ref;
                        }
 
-                       $ref['category_id'] = $rs['category_id'];
-                       $ref['content'] = $rs['content'];
-                       $ref['id'] = $rs['id'];
+                       $ref['category_id'] = $this->con->fetch_result($rs, 0, 'category_id');
+                       $ref['id'] = $this->con->fetch_result($rs, 0, 'id');
+                       $ref['document_id'] = $this->con->fetch_result($rs, 0, 'document_id');
+
+                       if ($include_content) {
+                               $rs = $this->con->query("SELECT content, title FROM ttrss_entries WHERE guid = '" .
+                                       $this->con->escape_string($ref['document_id']) . "'");
+
+                               if ($this->con->num_rows($rs) != 0) {
+                                       $ref['content'] = mb_strtolower($this->con->fetch_result($rs, 0, 'title') . ' ' . strip_tags($this->con->fetch_result($rs, 0, 'content')));
+                               }
+                       }
 
                        return $ref;
                }
                 */
                function removeReference($doc_id) {
 
-                       return $this->con->execute("DELETE FROM ttrss_plugin_af_sort_bayes_references WHERE id='" . $this->con->escape_string($doc_id) . "'");
+                       return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" . $this->con->escape_string($doc_id) . "' AND owner_uid = " . $this->owner_uid);
                }
 
        }