git.wh0rd.org - tt-rss.git/commitdiff
limit maximum data length for bayesian processing
author: Andrew Dolgov <noreply@fakecake.org>
Thu, 18 Jun 2015 16:02:39 +0000 (19:02 +0300)
committer: Andrew Dolgov <noreply@fakecake.org>
Thu, 18 Jun 2015 16:02:39 +0000 (19:02 +0300)
plugins/af_sort_bayes/init.php
plugins/af_sort_bayes/lib/class.naivebayesianstorage.php

index 51867f08ea8cb7a40d783be82e2ce586b0224a99..6599baa0f62187ca9f336ae0176a50fbddfde351 100644 (file)
@@ -8,6 +8,7 @@ class Af_Sort_Bayes extends Plugin {
        private $score_modifier = 50;
        private $sql_prefix = "ttrss_plugin_af_sort_bayes";
        private $auto_categorize_threshold = 10000;
+       private $max_document_length = 3000; // classifier can't rescale output for very long strings apparently
 
        function about() {
                return array(1.0,
@@ -47,7 +48,7 @@ class Af_Sort_Bayes extends Plugin {
                if ($this->dbh->num_rows($result) != 0) {
                        $guid = $this->dbh->fetch_result($result, 0, "guid");
                        $title = $this->dbh->fetch_result($result, 0, "title");
-                       $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
+                       $content = mb_substr(mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))), 0, $this->max_document_length);
                        $score = $this->dbh->fetch_result($result, 0, "score");
 
                        $this->dbh->query("BEGIN");
@@ -302,7 +303,7 @@ class Af_Sort_Bayes extends Plugin {
 
                        $dst_category = $id_ugly;
 
-                       $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));
+                       $bayes_content = mb_substr(mb_strtolower($article["title"] . " " . strip_tags($article["content"])), 0, $this->max_document_length);
 
                        if ($count_neutral >= $this->auto_categorize_threshold) {
                                // enable automatic categorization
@@ -358,7 +359,8 @@ class Af_Sort_Bayes extends Plugin {
                if ($this->dbh->num_rows($result) != 0) {
                        $guid = $this->dbh->fetch_result($result, 0, "guid");
                        $title = $this->dbh->fetch_result($result, 0, "title");
-                       $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
+
+                       $content = mb_substr(mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))), 0, $this->max_document_length);
 
                        print "<h2>" . $title . "</h2>";
 
index 73c1ee4c6f7964adab67132ec155a988d6ddf46e..99db1fc79f0f2f70939a926b766963d781388ad2 100644 (file)
@@ -47,6 +47,7 @@
        class NaiveBayesianStorage {
                var $con = null;
                var $owner_uid = null;
+               var $max_document_length = 3000; // classifier can't rescale output for very long strings apparently
 
                function NaiveBayesianStorage($owner_uid) {
                        $this->con = Db::get();
                                        $this->con->escape_string($ref['document_id']) . "'");
 
                                if ($this->con->num_rows($rs) != 0) {
-                                       $ref['content'] = mb_strtolower($this->con->fetch_result($rs, 0, 'title') . ' ' . strip_tags($this->con->fetch_result($rs, 0, 'content')));
+                                       $ref['content'] = mb_substr(mb_strtolower($this->con->fetch_result($rs, 0, 'title') . ' ' . strip_tags($this->con->fetch_result($rs, 0, 'content'))), 0,
+                                       $this->max_document_length);
                                }
                        }