]> git.wh0rd.org Git - tt-rss.git/blob - plugins/af_sort_bayes/init.php
3958d27b266849672ef8bc416435509ecc3b078c
[tt-rss.git] / plugins / af_sort_bayes / init.php
1 <?php
2
/**
 * Bayesian article classifier plugin for tt-rss (work in progress).
 *
 * Every user owns three classifier categories, created on demand by
 * init_database():
 *   - GOOD: articles the user voted up,
 *   - BAD:  articles the user voted down,
 *   - UGLY: the neutral category every article starts in.
 *
 * Incoming articles are trained into UGLY until that category holds at least
 * $auto_categorize_threshold words; from then on hook_article_filter() asks
 * the classifier and adjusts the article score when it is confident enough.
 *
 * Method names are kept as-is because the plugin host and the JavaScript in
 * init.js refer to them by name.
 */
class Af_Sort_Bayes extends Plugin {

    /** Plugin host object handed to init(). */
    private $host;

    /** Not referenced anywhere in this class. */
    private $filters = array();

    /** Database handle returned by Db::get(). */
    private $dbh;

    /** Absolute amount added to / subtracted from an article score on a GOOD / BAD verdict. */
    private $score_modifier = 50;

    /** Common name prefix of the three plugin tables (_categories, _references, _wordfreqs). */
    private $sql_prefix = "ttrss_plugin_af_sort_bayes";

    /** Minimum word count of the UGLY category before automatic categorization is enabled. */
    private $auto_categorize_threshold = 10000;

    /**
     * Plugin metadata.
     *
     * @return array numeric version, description and a third string
     *               (presumably the author name - confirm against Plugin)
     */
    function about() {
        return array(1.0,
            "Bayesian classifier for tt-rss (WIP)",
            "fox");
    }

    /**
     * Loads the classifier library, makes sure the plugin tables exist and
     * registers the article filter, prefs tab and article button hooks.
     *
     * @param object $host plugin host providing add_hook() and the HOOK_* constants
     */
    function init($host) {
        require_once __DIR__ . "/lib/class.naivebayesian.php";
        //require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
        require_once __DIR__ . "/lib/class.naivebayesianstorage.php";

        $this->host = $host;
        $this->dbh = Db::get();

        $this->init_database();

        $host->add_hook($host::HOOK_ARTICLE_FILTER, $this);
        $host->add_hook($host::HOOK_PREFS_TAB, $this);
        $host->add_hook($host::HOOK_ARTICLE_BUTTON, $this);
    }

    /**
     * Moves one article a step up or down the BAD <-> UGLY <-> GOOD scale and
     * retrains the classifier accordingly.
     *
     * Reads "article_id" and "train_up" from $_REQUEST and prints
     * "<article id> :: <destination category> :: <score>" as the response.
     * When the article does not belong to the session user nothing is
     * changed; the category is then reported as UGLY and the score is empty.
     */
    function trainArticle() {
        $article_id = (int) $_REQUEST["article_id"];
        $train_up = sql_bool_to_bool($_REQUEST["train_up"]);
        $owner_uid = (int) $_SESSION["uid"];

        //$category = $train_up ? "GOOD" : "UGLY";
        $dst_category = "UGLY";

        // Defined up front so the final print never touches an undefined
        // variable when the article lookup below finds nothing.
        $score = "";

        $nbs = new NaiveBayesianStorage($_SESSION["uid"]);
        $nb = new NaiveBayesian($nbs);

        $result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
            $article_id . " AND owner_uid = " . $owner_uid);

        if ($this->dbh->num_rows($result) != 0) {
            $guid = $this->dbh->fetch_result($result, 0, "guid");
            $title = $this->dbh->fetch_result($result, 0, "title");
            $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
            $score = $this->dbh->fetch_result($result, 0, "score");

            $this->dbh->query("BEGIN");

            // An article without a stored reference counts as neutral.
            $ref = $nbs->getReference($guid, false);

            if (isset($ref['category_id'])) {
                $current_category = $nbs->getCategoryById($ref['category_id']);
            } else {
                $current_category = "UGLY";
            }

            // set score to fixed value for now

            if ($train_up) {
                // One step towards GOOD; already-GOOD articles keep their score.
                switch ($current_category) {
                    case "UGLY":
                        $dst_category = "GOOD";
                        $score = $this->score_modifier;
                        break;
                    case "BAD":
                        $dst_category = "UGLY";
                        $score = 0;
                        break;
                    case "GOOD":
                        $dst_category = "GOOD";
                        break;
                }
            } else {
                // One step towards BAD; already-BAD articles keep their score.
                switch ($current_category) {
                    case "UGLY":
                        $dst_category = "BAD";
                        $score = -$this->score_modifier;
                        break;
                    case "BAD":
                        $dst_category = "BAD";
                        break;
                    case "GOOD":
                        $dst_category = "UGLY";
                        $score = 0;
                        break;
                }
            }

            // Drop the old training data for this document before filing it
            // under the destination category.
            $nb->untrain($guid, $content);
            $nb->train($guid, $nbs->getCategoryByName($dst_category), $content);

            $this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $owner_uid);

            $nb->updateProbabilities();

            $this->dbh->query("COMMIT");
        }

        print "$article_id :: $dst_category :: $score";
    }

    /**
     * @return string|false contents of init.js next to this file
     *                      (false if file_get_contents() fails)
     */
    function get_js() {
        return file_get_contents(__DIR__ . "/init.js");
    }

    /**
     * Same script as get_js(); returned separately for the preferences page.
     *
     * @return string|false contents of init.js next to this file
     */
    function get_prefs_js() {
        return file_get_contents(__DIR__ . "/init.js");
    }

    /**
     * Builds the thumbs up / thumbs down buttons for one article.
     *
     * @param array $line article row; only $line["id"] is used
     * @return string HTML of two <img> elements calling bayesTrain(id, true|false)
     */
    function hook_article_button($line) {
        // The id ends up inside an inline onclick handler, so force it numeric.
        $article_id = (int) $line["id"];

        return "<img src=\"plugins/af_sort_bayes/thumb_up.png\"
            style=\"cursor : pointer\"
            onclick=\"bayesTrain(" . $article_id . ", true)\"
            class='tagsPic' title='" . __('+1') . "'>" .
        "<img src=\"plugins/af_sort_bayes/thumb_down.png\"
            style=\"cursor : pointer\"
            onclick=\"bayesTrain(" . $article_id . ", false)\"
            class='tagsPic' title='" . __('-1') . "'>";
    }

    /**
     * Creates the plugin tables when they are missing and, if a user is
     * logged in and has no categories yet, seeds GOOD, BAD and UGLY for them.
     */
    function init_database() {
        $prefix = $this->sql_prefix;

        // TODO there probably should be a way for plugins to determine their schema version to upgrade tables

        /*$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_wordfreqs", false);
        $this->dbh->query("DROP TABLE IF EXISTS {$prefix}_references", false);
        $this->dbh->query("DROP TABLE IF EXISTS {$prefix}_categories", false);*/

        $this->dbh->query("BEGIN");

        // MySQL and PostgreSQL need different DDL (auto_increment + InnoDB
        // versus SERIAL and inline REFERENCES).

        if (DB_TYPE == "mysql") {

            $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
                id INTEGER NOT NULL PRIMARY KEY auto_increment,
                category varchar(100) NOT NULL DEFAULT '',
                probability DOUBLE NOT NULL DEFAULT '0',
                owner_uid INTEGER NOT NULL,
                FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
                word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

            $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
                id INTEGER NOT NULL PRIMARY KEY auto_increment,
                document_id VARCHAR(255) NOT NULL,
                category_id INTEGER NOT NULL,
                FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
                owner_uid INTEGER NOT NULL,
                FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");

            $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
                word varchar(100) NOT NULL DEFAULT '',
                category_id INTEGER NOT NULL,
                FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
                owner_uid INTEGER NOT NULL,
                FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
                count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

        } else {
            $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
                id SERIAL NOT NULL PRIMARY KEY,
                category varchar(100) NOT NULL DEFAULT '',
                probability DOUBLE PRECISION NOT NULL DEFAULT '0',
                owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
                word_count BIGINT NOT NULL DEFAULT '0')");

            $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
                id SERIAL NOT NULL PRIMARY KEY,
                document_id VARCHAR(255) NOT NULL,
                category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
                owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");

            $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
                word varchar(100) NOT NULL DEFAULT '',
                category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
                owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
                count BIGINT NOT NULL DEFAULT '0')");
        }

        // init() also runs without a logged-in user; skip seeding in that case.
        $owner_uid = isset($_SESSION["uid"]) ? (int) $_SESSION["uid"] : 0;

        if ($owner_uid) {
            $result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

            if ($this->dbh->num_rows($result) == 0) {
                $this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
                $this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('BAD', $owner_uid)");
                $this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('UGLY', $owner_uid)");
            }
        }

        $this->dbh->query("COMMIT");
    }

    /**
     * Prints the body of the preferences pane: per-category statistics, the
     * 20 most recent classified articles of the session user, and the
     * Refresh / Clear database buttons.
     */
    function renderPrefsUI() {
        $owner_uid = (int) $_SESSION["uid"];

        $result = $this->dbh->query("SELECT category, probability, word_count,
            (SELECT COUNT(id) FROM {$this->sql_prefix}_references WHERE
                category_id = {$this->sql_prefix}_categories.id) as doc_count
            FROM {$this->sql_prefix}_categories WHERE owner_uid = " . $owner_uid);

        print "<h3>" . __("Statistics") . "</h3>";

        print "<p>".T_sprintf("Required UGLY word count for automatic matching: %d", $this->auto_categorize_threshold)."</p>";

        print "<table>";
        print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";

        // Columns come out in SELECT order, matching the header row above.
        while ($line = $this->dbh->fetch_assoc($result)) {
            print "<tr>";
            foreach ($line as $k => $v) {
                if ($k == "probability") $v = sprintf("%.3f", $v);

                print "<td>$v</td>";
            }
            print "</tr>";
        }

        print "</table>";

        print "<h3>" . __("Last matched articles") . "</h3>";

        // Restricted to the session user: without the owner_uid conditions
        // this join would list articles and categories of every user.
        $result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
            FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf, {$this->sql_prefix}_references AS tr, {$this->sql_prefix}_categories AS tc
            WHERE tf.id = tu.feed_id AND tu.ref_id = te.id AND tc.id = tr.category_id AND tr.document_id = te.guid
                AND tu.owner_uid = $owner_uid AND tc.owner_uid = $owner_uid
            ORDER BY te.id DESC LIMIT 20");

        print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";

        while ($line = $this->dbh->fetch_assoc($result)) {
            // Titles originate from feeds, so escape them for the HTML context.
            print "<li>" . htmlspecialchars($line["category"], ENT_QUOTES, "UTF-8") . ": " .
                htmlspecialchars($line["title"], ENT_QUOTES, "UTF-8") .
                " (" . htmlspecialchars($line["feed_title"], ENT_QUOTES, "UTF-8") . ")</li>";
        }

        print "</ul>";

        print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
            __('Refresh')."</button> ";

        print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
            __('Clear database')."</button> ";

        //
    }

    /**
     * Prints the plugin's accordion pane, but only for the "prefPrefs" tab.
     *
     * @param string $args identifier of the preferences tab being rendered
     */
    function hook_prefs_tab($args) {
        if ($args != "prefPrefs") return;

        print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"".__('Bayesian classifier (af_sort_bayes)')."\">";

        $this->renderPrefsUI();

        print "</div>";
    }

    /**
     * Trains the classifier with a new article and, once enough neutral
     * training data exists, adjusts the article's score modifier.
     *
     * Articles that already have a stored reference are returned untouched.
     *
     * @param array $article article data; reads owner_uid, guid_hashed, title
     *                       and content, may change score_modifier
     * @return array the (possibly modified) article
     */
    function hook_article_filter($article) {
        $owner_uid = $article["owner_uid"];

        // guid already includes owner_uid so we don't need to include it
        $result = $this->dbh->query("SELECT id FROM {$this->sql_prefix}_references WHERE
            document_id = '" . $this->dbh->escape_string($article['guid_hashed']) . "'");

        if ($this->dbh->num_rows($result) != 0) {
            _debug("bayes: article already categorized");
            return $article;
        }

        $nbs = new NaiveBayesianStorage($owner_uid);
        $nb = new NaiveBayesian($nbs);

        $categories = $nbs->getCategories();

        if (count($categories) > 0) {

            $count_neutral = 0;

            $id_good = 0;
            $id_ugly = 0;
            $id_bad = 0;

            // Resolve the ids of the three categories and the amount of
            // neutral (UGLY) training data collected so far.
            foreach ($categories as $id => $cat) {
                if ($cat["category"] == "GOOD") {
                    $id_good = $id;
                } else if ($cat["category"] == "UGLY") {
                    $id_ugly = $id;
                    $count_neutral += $cat["word_count"];
                } else if ($cat["category"] == "BAD") {
                    $id_bad = $id;
                }
            }

            // Neutral unless the classifier is confident below.
            $dst_category = $id_ugly;

            $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));

            if ($count_neutral >= $this->auto_categorize_threshold) {
                // enable automatic categorization

                $result = $nb->categorize($bayes_content);

                //print_r($result);

                // Only act when a result for all three categories came back.
                if (count($result) == 3) {
                    $prob_good = $result[$id_good];
                    $prob_bad = $result[$id_bad];

                    if ($prob_good > 0.90) {
                        $dst_category = $id_good;
                        $article["score_modifier"] += $this->score_modifier;
                    } else if ($prob_bad > 0.90) {
                        $dst_category = $id_bad;
                        $article["score_modifier"] -= $this->score_modifier;
                    }
                }

                _debug("bayes, dst category: $dst_category");
            }

            $nb->train($article["guid_hashed"], $dst_category, $bayes_content);

            $nb->updateProbabilities();
        }

        return $article;
    }

    /**
     * Deletes all references and word frequencies of the session user and
     * recalculates the category probabilities. The category rows are kept.
     */
    function clearDatabase() {
        $prefix = $this->sql_prefix;
        $owner_uid = (int) $_SESSION["uid"];

        $this->dbh->query("BEGIN");
        $this->dbh->query("DELETE FROM {$prefix}_references WHERE owner_uid = " . $owner_uid);
        $this->dbh->query("DELETE FROM {$prefix}_wordfreqs WHERE owner_uid = " . $owner_uid);
        $this->dbh->query("COMMIT");

        $nbs = new NaiveBayesianStorage($_SESSION["uid"]);
        $nb = new NaiveBayesian($nbs);
        $nb->updateProbabilities();
    }

    /**
     * @return int plugin API version this plugin declares
     */
    function api_version() {
        return 2;
    }

}
353 ?>