git.wh0rd.org - tt-rss.git/blobdiff - plugins/af_sort_bayes/init.php
limit maximum data length for bayesian processing
[tt-rss.git] / plugins / af_sort_bayes / init.php
index fdb573ecde6de00a773eaca6bd784adc5675a5f3..6599baa0f62187ca9f336ae0176a50fbddfde351 100644 (file)
@@ -8,6 +8,7 @@ class Af_Sort_Bayes extends Plugin {
        private $score_modifier = 50;
        private $sql_prefix = "ttrss_plugin_af_sort_bayes";
        private $auto_categorize_threshold = 10000;
+       private $max_document_length = 3000; // classifier can't rescale output for very long strings apparently
 
        function about() {
                return array(1.0,
@@ -17,7 +18,7 @@ class Af_Sort_Bayes extends Plugin {
 
        function init($host) {
                require_once __DIR__ . "/lib/class.naivebayesian.php";
-               require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
+               //require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
                require_once __DIR__ . "/lib/class.naivebayesianstorage.php";
 
                $this->host = $host;
@@ -47,7 +48,7 @@ class Af_Sort_Bayes extends Plugin {
                if ($this->dbh->num_rows($result) != 0) {
                        $guid = $this->dbh->fetch_result($result, 0, "guid");
                        $title = $this->dbh->fetch_result($result, 0, "title");
-                       $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
+                       $content = mb_substr(mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))), 0, $this->max_document_length);
                        $score = $this->dbh->fetch_result($result, 0, "score");
 
                        $this->dbh->query("BEGIN");
@@ -117,12 +118,16 @@ class Af_Sort_Bayes extends Plugin {
        function hook_article_button($line) { // returns the <img> button markup (bayesTrain up/down, bayesShow chart) for article $line["id"]
                return "<img src=\"plugins/af_sort_bayes/thumb_up.png\"
                        style=\"cursor : pointer\" style=\"cursor : pointer\"
-                       onclick=\"bayesTrain(".$line["id"].", true)\"
+                       onclick=\"bayesTrain(".$line["id"].", true, event)\"
                        class='tagsPic' title='".__('+1')."'>" .
                "<img src=\"plugins/af_sort_bayes/thumb_down.png\"
                        style=\"cursor : pointer\" style=\"cursor : pointer\"
-                       onclick=\"bayesTrain(".$line["id"].", false)\"
-                       class='tagsPic' title='".__('-1')."'>";
+                       onclick=\"bayesTrain(".$line["id"].", false, event)\"
+                       class='tagsPic' title='".__('-1')."'>" .
+               "<img src=\"plugins/af_sort_bayes/chart_bar.png\"
+                       style=\"cursor : pointer\" style=\"cursor : pointer\"
+                       onclick=\"bayesShow(".$line["id"].")\"
+                       class='tagsPic' title='".__('Show classifier info')."'>";
 
        } // NOTE(review): every <img> repeats the style attribute, and $line["id"] goes into onclick uncast - presumably always an integer id; confirm
 
@@ -263,12 +268,17 @@ class Af_Sort_Bayes extends Plugin {
        function hook_article_filter($article) {
                $owner_uid = $article["owner_uid"];
 
-               $nbs = new NaiveBayesianStorage($owner_uid);
-               $nb = new NaiveBayesian($nbs);
+               // guid already includes owner_uid so we don't need to include it
+               $result = $this->dbh->query("SELECT id FROM {$this->sql_prefix}_references WHERE
+                       document_id = '" . $this->dbh->escape_string($article['guid_hashed']) . "'");
 
-               $ref = $nbs->getReference($article["guid"], false);
+               if (db_num_rows($result) != 0) {
+                       _debug("bayes: article already categorized");
+                       return $article;
+               }
 
-               if (isset($ref["category_id"])) return $article; // already categorized
+               $nbs = new NaiveBayesianStorage($owner_uid);
+               $nb = new NaiveBayesian($nbs);
 
                $categories = $nbs->getCategories();
 
@@ -293,7 +303,7 @@ class Af_Sort_Bayes extends Plugin {
 
                        $dst_category = $id_ugly;
 
-                       $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));
+                       $bayes_content = mb_substr(mb_strtolower($article["title"] . " " . strip_tags($article["content"])), 0, $this->max_document_length);
 
                        if ($count_neutral >= $this->auto_categorize_threshold) {
                                // enable automatic categorization
@@ -306,10 +316,10 @@ class Af_Sort_Bayes extends Plugin {
                                        $prob_good = $result[$id_good];
                                        $prob_bad = $result[$id_bad];
 
-                                       if ($prob_good > 0.90) {
+                                       if (!is_nan($prob_good) && $prob_good > 0.90) {
                                                $dst_category = $id_good;
                                                $article["score_modifier"] += $this->score_modifier;
-                                       } else if ($prob_bad > 0.90) {
+                                       } else if (!is_nan($prob_bad) && $prob_bad > 0.90) {
                                                $dst_category = $id_bad;
                                                $article["score_modifier"] -= $this->score_modifier;
                                        }
@@ -340,6 +350,61 @@ class Af_Sort_Bayes extends Plugin {
                $nb->updateProbabilities();
        }
 
+       function showArticleStats() {
+               // Prints the classifier-info dialog body for the article in $_REQUEST["article_id"]: title,
+               // stored category and per-category probabilities. Output is HTML; nothing is returned.
+               $article_id = (int) $_REQUEST["article_id"];
+               $owner_uid = (int) $_SESSION["uid"]; // only integers are concatenated into the query below
+
+               $result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
+                       $article_id . " AND owner_uid = " . $owner_uid);
+
+               if ($this->dbh->num_rows($result) != 0) {
+                       $guid = $this->dbh->fetch_result($result, 0, "guid");
+                       $title = $this->dbh->fetch_result($result, 0, "title");
+
+                       // lower-case, strip tags and cap the length at max_document_length before classifying
+                       $content = mb_substr(mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))), 0, $this->max_document_length);
+
+                       // title and category names are stored text, so escape them before embedding in HTML;
+                       // double_encode is off in case entities are already stored encoded (TODO confirm)
+                       print "<h2>" . htmlspecialchars($title, ENT_QUOTES, "UTF-8", false) . "</h2>";
+
+                       $nbs = new NaiveBayesianStorage($_SESSION["uid"]);
+                       $nb = new NaiveBayesian($nbs);
+                       $categories = $nbs->getCategories();
+                       $ref = $nbs->getReference($guid, false);
+
+                       // "N/A" when there is no reference or its category id is not in $categories
+                       $current_cat = isset($ref["category_id"], $categories[$ref["category_id"]]["category"]) ?
+                               $categories[$ref["category_id"]]["category"] : "N/A";
+
+                       print "<p>" . T_sprintf("Currently stored as: %s", htmlspecialchars($current_cat, ENT_QUOTES, "UTF-8", false)) . "</p>";
+
+                       $probabilities = $nb->categorize($content); // own name instead of reusing the query's $result
+
+                       print "<h3>" . __("Classifier result") . "</h3>";
+                       print "<table>";
+                       print "<tr><th>Category</th><th>Probability</th></tr>";
+
+                       foreach ($probabilities as $k => $v) {
+                               print "<tr>";
+                               print "<td>" . htmlspecialchars($categories[$k]["category"], ENT_QUOTES, "UTF-8", false) . "</td>";
+                               print "<td>" . $v . "</td>";
+                               print "</tr>";
+                       }
+
+                       print "</table>";
+               } else {
+                       print_error("Article not found");
+               }
+
+               print "<div align='center'>";
+               print "<button dojoType=\"dijit.form.Button\" onclick=\"return dijit.byId('bayesShowDlg').hide()\">".
+                       __('Close this window')."</button>";
+               print "</div>";
+       }
+
        function api_version() {
                return 2; // NOTE(review): presumably the plugin API level the host checks - confirm against the plugin host
        }