]> git.wh0rd.org - tt-rss.git/blobdiff - plugins/af_sort_bayes/init.php
bayes: properly reset score when going good -> ugly
[tt-rss.git] / plugins / af_sort_bayes / init.php
index 79d287158483287d067b092955b1d66b7deb737b..51a6f38bba3ff308899db7e7d6f9a3e5b17006e7 100644 (file)
@@ -6,6 +6,7 @@ class Af_Sort_Bayes extends Plugin {
        private $filters = array();
        private $dbh;
        private $score_modifier = 50;
+       private $sql_prefix = "ttrss_plugin_af_sort_bayes";
 
        function about() {
                return array(1.0,
@@ -15,6 +16,7 @@ class Af_Sort_Bayes extends Plugin {
 
        function init($host) {
                require_once __DIR__ . "/lib/class.naivebayesian.php";
+               require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
                require_once __DIR__ . "/lib/class.naivebayesianstorage.php";
 
                $this->host = $host;
@@ -32,7 +34,8 @@ class Af_Sort_Bayes extends Plugin {
                $article_id = (int) $_REQUEST["article_id"];
                $train_up = sql_bool_to_bool($_REQUEST["train_up"]);
 
-               $category = $train_up ? "GOOD" : "NEUTRAL";
+               //$category = $train_up ? "GOOD" : "UGLY";
+               $dst_category = "UGLY";
 
                $nbs = new NaiveBayesianStorage($_SESSION["uid"]);
                $nb = new NaiveBayesian($nbs);
@@ -48,13 +51,48 @@ class Af_Sort_Bayes extends Plugin {
 
                        $this->dbh->query("BEGIN");
 
-                       if ($nb->untrain($guid, $content)) {
-                               if ($score >= $this->score_modifier) $score -= $this->score_modifier;
+                       $ref = $nbs->getReference($guid, false);
+
+                       if (isset($ref['category_id'])) {
+                               $current_category = $nbs->getCategoryById($ref['category_id']);
+                       } else {
+                               $current_category = "UGLY";
                        }
 
-                       $nb->train($guid, $nbs->getCategoryByName($category), $content);
+                       // set score to fixed value for now
+
+                       if ($train_up) {
+                               switch ($current_category) {
+                                       case "UGLY":
+                                               $dst_category = "GOOD";
+                                               $score = $this->score_modifier;
+                                               break;
+                                       case "BAD":
+                                               $dst_category = "UGLY";
+                                               $score = 0;
+                                               break;
+                                       case "GOOD":
+                                               $dst_category = "GOOD";
+                                               break;
+                               }
+                       } else {
+                               switch ($current_category) {
+                                       case "UGLY":
+                                               $dst_category = "BAD";
+                                               $score = $this->score_modifier;
+                                               break;
+                                       case "BAD":
+                                               $dst_category = "BAD";
+                                               break;
+                                       case "GOOD":
+                                               $dst_category = "UGLY";
+                                               $score = 0;
+                                               break;
+                               }
+                       }
 
-                       if ($category == "GOOD") $score += $this->score_modifier;
+                       $nb->untrain($guid, $content);
+                       $nb->train($guid, $nbs->getCategoryByName($dst_category), $content);
 
                        $this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $_SESSION["uid"]);
 
@@ -64,13 +102,17 @@ class Af_Sort_Bayes extends Plugin {
 
                }
 
-               print "$article_id :: $category";
+               print "$article_id :: $dst_category :: $score";
        }
 
        function get_js() {
                return file_get_contents(__DIR__ . "/init.js");
        }
 
+       /**
+        * Returns the plugin's JavaScript for the preferences side
+        * (presumably requested by the plugin host when the preferences
+        * screen loads -- confirm against the host).
+        *
+        * Reads init.js from this plugin's directory, the same file get_js() reads.
+        *
+        * @return string|false file contents, or false if the file cannot be read
+        */
+       function get_prefs_js() {
+               $script_path = __DIR__ . "/init.js";
+
+               return file_get_contents($script_path);
+       }
+
        function hook_article_button($line) {
                return "<img src=\"plugins/af_sort_bayes/thumb_up.png\"
                        style=\"cursor : pointer\" style=\"cursor : pointer\"
@@ -84,7 +126,7 @@ class Af_Sort_Bayes extends Plugin {
        }
 
        function init_database() {
-               $prefix = "ttrss_plugin_af_sort_bayes";
+               $prefix = $this->sql_prefix;
 
                // TODO there probably should be a way for plugins to determine their schema version to upgrade tables
 
@@ -112,8 +154,7 @@ class Af_Sort_Bayes extends Plugin {
                                category_id INTEGER NOT NULL,
                                FOREIGN KEY (category_id) REFERENCES ${prefix}_categories(id) ON DELETE CASCADE,
                                owner_uid INTEGER NOT NULL,
-                               FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
-                               content text NOT NULL) ENGINE=InnoDB");
+                               FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");
 
                        $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_wordfreqs (
                                word varchar(100) NOT NULL DEFAULT '',
@@ -128,7 +169,7 @@ class Af_Sort_Bayes extends Plugin {
                        $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_categories (
                                id SERIAL NOT NULL PRIMARY KEY,
                                category varchar(100) NOT NULL DEFAULT '',
-                               probability DOUBLE NOT NULL DEFAULT '0',
+                               probability DOUBLE PRECISION NOT NULL DEFAULT '0',
                                owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
                                word_count BIGINT NOT NULL DEFAULT '0')");
 
@@ -136,8 +177,7 @@ class Af_Sort_Bayes extends Plugin {
                                id SERIAL NOT NULL PRIMARY KEY,
                                document_id VARCHAR(255) NOT NULL,
                                category_id INTEGER NOT NULL REFERENCES ${prefix}_categories(id) ON DELETE CASCADE,
-                               owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
-                               content text NOT NULL)");
+                               owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");
 
                        $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_wordfreqs (
                                word varchar(100) NOT NULL DEFAULT '',
@@ -153,19 +193,66 @@ class Af_Sort_Bayes extends Plugin {
 
                        if ($this->dbh->num_rows($result) == 0) {
                                $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
-                               $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('NEUTRAL', $owner_uid)");
+                               $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('BAD', $owner_uid)");
+                               $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('UGLY', $owner_uid)");
                        }
                }
 
                $this->dbh->query("COMMIT");
        }
 
+       /**
+        * Prints the plugin's preferences pane: a per-category statistics table,
+        * the 20 most recent classified articles, and the Refresh / Clear database
+        * buttons.
+        *
+        * Both queries are restricted to the logged-in user ($_SESSION["uid"]);
+        * without that restriction the article list would expose other users'
+        * entries and classifications. Every value read from the database is
+        * HTML-escaped before printing, since titles are stored text and must not
+        * be treated as trusted markup.
+        *
+        * @return void output is written directly with print
+        */
+       function renderPrefsUI() {
+               $prefix = $this->sql_prefix;
+
+               // cast once so the value interpolated into SQL is always an integer
+               $owner_uid = (int) $_SESSION["uid"];
+
+               $result = $this->dbh->query("SELECT category, probability, word_count,
+                       (SELECT COUNT(id) FROM {$prefix}_references WHERE
+                               category_id = {$prefix}_categories.id) as doc_count
+                       FROM {$prefix}_categories WHERE owner_uid = $owner_uid");
+
+               print "<h3>" . __("Statistics") . "</h3>";
+
+               print "<table>";
+               print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";
+
+               while ($line = $this->dbh->fetch_assoc($result)) {
+                       print "<tr>";
+                       foreach ($line as $k => $v) {
+                               if ($k == "probability") $v = sprintf("%.3f", $v);
+
+                               print "<td>" . htmlspecialchars((string) $v, ENT_QUOTES, "UTF-8") . "</td>";
+                       }
+                       print "</tr>";
+               }
+
+               print "</table>";
+
+               print "<h3>" . __("Last matched articles") . "</h3>";
+
+               // tu.owner_uid limits the articles to this user's entries and
+               // tr.owner_uid limits the classifications to this user's training data
+               $result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
+                       FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf,
+                               {$prefix}_references AS tr, {$prefix}_categories AS tc
+                       WHERE tf.id = tu.feed_id AND tu.ref_id = te.id AND
+                               tc.id = tr.category_id AND tr.document_id = te.guid AND
+                               tu.owner_uid = $owner_uid AND tr.owner_uid = $owner_uid
+                       ORDER BY te.id DESC LIMIT 20");
+
+               print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";
+
+               while ($line = $this->dbh->fetch_assoc($result)) {
+                       $category = htmlspecialchars((string) $line["category"], ENT_QUOTES, "UTF-8");
+                       $title = htmlspecialchars((string) $line["title"], ENT_QUOTES, "UTF-8");
+                       $feed_title = htmlspecialchars((string) $line["feed_title"], ENT_QUOTES, "UTF-8");
+
+                       print "<li>" . $category . ": " . $title . " (" . $feed_title . ")</li>";
+               }
+
+               print "</ul>";
+
+               print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
+                       __('Refresh')."</button> ";
+
+               print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
+                       __('Clear database')."</button> ";
+       }
+
        function hook_prefs_tab($args) {
                if ($args != "prefPrefs") return;
 
-               print "<div dojoType=\"dijit.layout.AccordionPane\" title=\"".__('af_sort_bayes')."\">";
+               print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"".__('Bayesian classifier (af_sort_bayes)')."\">";
 
-               //
+               $this->renderPrefsUI();
 
                print "</div>";
        }
@@ -181,38 +268,47 @@ class Af_Sort_Bayes extends Plugin {
                if (count($categories) > 0) {
 
                        $count_neutral = 0;
-                       $count_good = 0;
+
                        $id_good = 0;
-                       $id_neutral = 0;
+                       $id_ugly = 0;
+                       $id_bad = 0;
 
                        foreach ($categories as $id => $cat) {
                                if ($cat["category"] == "GOOD") {
                                        $id_good = $id;
-                                       $count_good += $cat["word_count"];
-                               } else if ($cat["category"] == "NEUTRAL") {
-                                       $id_neutral = $id;
+                               } else if ($cat["category"] == "UGLY") {
+                                       $id_ugly = $id;
                                        $count_neutral += $cat["word_count"];
+                               } else if ($cat["category"] == "BAD") {
+                                       $id_bad = $id;
                                }
                        }
 
-                       $dst_category = $id_neutral;
+                       $dst_category = $id_ugly;
 
                        $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));
 
-                       if ($count_neutral >= 3000 && $count_good >= 1000) {
+                       if ($count_neutral >= 10000) {
                                // enable automatic categorization
 
                                $result = $nb->categorize($bayes_content);
 
-                               if (count($result) == 2) {
+                               //print_r($result);
+
+                               if (count($result) == 3) {
                                        $prob_good = $result[$id_good];
-                                       $prob_neutral = $result[$id_neutral];
+                                       $prob_bad = $result[$id_bad];
 
-                                       if ($prob_good > 0.90 && $prob_good > $prob_neutral) {
-                                               $dst_category = $id_good; // should we autofile as good or not? idk
+                                       if ($prob_good > 0.90) {
+                                               $dst_category = $id_good;
                                                $article["score_modifier"] += $this->score_modifier;
+                                       } else if ($prob_bad > 0.90) {
+                                               $dst_category = $id_bad;
+                                               $article["score_modifier"] -= $this->score_modifier;
                                        }
                                }
+
+                               _debug("bayes, dst category: $dst_category");
                        }
 
                        $nb->train($article["guid_hashed"], $dst_category, $bayes_content);
@@ -224,6 +320,19 @@ class Af_Sort_Bayes extends Plugin {
 
        }
 
+       /**
+        * Wipes the logged-in user's training data.
+        *
+        * Deletes that user's rows from the references and wordfreqs tables
+        * between BEGIN and COMMIT, then calls updateProbabilities() on a
+        * freshly built classifier for the same user.
+        *
+        * @return void
+        */
+       function clearDatabase() {
+               $owner_uid = $_SESSION["uid"];
+
+               $this->dbh->query("BEGIN");
+
+               // same two DELETE statements, in the same order, as one loop
+               foreach (array("references", "wordfreqs") as $table) {
+                       $this->dbh->query("DELETE FROM " . $this->sql_prefix . "_" . $table .
+                               " WHERE owner_uid = " . $owner_uid);
+               }
+
+               $this->dbh->query("COMMIT");
+
+               $classifier = new NaiveBayesian(new NaiveBayesianStorage($owner_uid));
+               $classifier->updateProbabilities();
+       }
+
        function api_version() {
                return 2;
        }