]> git.wh0rd.org - tt-rss.git/blame - plugins/af_sort_bayes/init.php
apparently classifier may return NaN
[tt-rss.git] / plugins / af_sort_bayes / init.php
CommitLineData
853cc128
AD
1<?php
2
3class Af_Sort_Bayes extends Plugin {
4
5 private $host;
6 private $filters = array();
7 private $dbh;
59e83455 8 private $score_modifier = 50;
3c43def9 9 private $sql_prefix = "ttrss_plugin_af_sort_bayes";
fa05356a 10 private $auto_categorize_threshold = 10000;
853cc128
AD
11
/**
 * Return the plugin's descriptive metadata.
 *
 * @return array Three-element list; judging by the values: version number,
 *               short description, author name.
 */
function about() {
	return array(1.0,
		"Bayesian classifier for tt-rss (WIP)",
		"fox");
}
17
/**
 * Load the classifier libraries, prepare the plugin tables and register hooks.
 *
 * @param object $host Plugin host; must provide add_hook() and the HOOK_*
 *                     constants referenced below.
 */
function init($host) {
	require_once __DIR__ . "/lib/class.naivebayesian.php";
	//require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
	require_once __DIR__ . "/lib/class.naivebayesianstorage.php";

	$this->host = $host;
	$this->dbh = Db::get();

	// Creates the plugin tables if missing and seeds the default
	// categories for the logged-in user (see init_database()).
	$this->init_database();

	$host->add_hook($host::HOOK_ARTICLE_FILTER, $this);
	$host->add_hook($host::HOOK_PREFS_TAB, $this);
	$host->add_hook($host::HOOK_ARTICLE_BUTTON, $this);
}
33
/**
 * Re-train the classifier for a single article of the current user.
 *
 * Reads "article_id" and "train_up" from $_REQUEST. Depending on the
 * category the article is currently stored under, it is moved one step
 * along BAD <-> UGLY <-> GOOD (an article without a stored reference is
 * treated as UGLY), its score in ttrss_user_entries is set to match, and
 * the category probabilities are recalculated.
 *
 * Prints "<article id> :: <destination category> :: <score>". When the
 * article is not found nothing is changed and the score part is empty.
 */
function trainArticle() {
	$article_id = (int) $_REQUEST["article_id"];
	$train_up = sql_bool_to_bool($_REQUEST["train_up"]);

	// Cast once so the value is safe to concatenate into the SQL below.
	$owner_uid = (int) $_SESSION["uid"];

	//$category = $train_up ? "GOOD" : "UGLY";
	$dst_category = "UGLY";

	// Defined up front so the final print does not touch an undefined
	// variable when the article is not found; prints as an empty string.
	$score = "";

	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);

	$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
		$article_id . " AND owner_uid = " . $owner_uid);

	if ($this->dbh->num_rows($result) != 0) {
		$guid = $this->dbh->fetch_result($result, 0, "guid");
		$title = $this->dbh->fetch_result($result, 0, "title");
		$content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
		$score = $this->dbh->fetch_result($result, 0, "score");

		$this->dbh->query("BEGIN");

		$ref = $nbs->getReference($guid, false);

		if (isset($ref['category_id'])) {
			$current_category = $nbs->getCategoryById($ref['category_id']);
		} else {
			$current_category = "UGLY";
		}

		// set score to fixed value for now

		if ($train_up) {
			switch ($current_category) {
			case "UGLY":
				$dst_category = "GOOD";
				$score = $this->score_modifier;
				break;
			case "BAD":
				$dst_category = "UGLY";
				$score = 0;
				break;
			case "GOOD":
				// Already at the top; keep category and score.
				$dst_category = "GOOD";
				break;
			}
		} else {
			switch ($current_category) {
			case "UGLY":
				$dst_category = "BAD";
				$score = -$this->score_modifier;
				break;
			case "BAD":
				// Already at the bottom; keep category and score.
				$dst_category = "BAD";
				break;
			case "GOOD":
				$dst_category = "UGLY";
				$score = 0;
				break;
			}
		}

		// Drop the old training data for this document before adding it
		// under the destination category.
		$nb->untrain($guid, $content);
		$nb->train($guid, $nbs->getCategoryByName($dst_category), $content);

		// Score is numeric; cast so it can be embedded without quoting.
		$score = (int) $score;

		$this->dbh->query("UPDATE ttrss_user_entries SET score = $score WHERE ref_id = $article_id AND owner_uid = " . $owner_uid);

		$nb->updateProbabilities();

		$this->dbh->query("COMMIT");
	}

	print "$article_id :: $dst_category :: $score";
}
108
/**
 * Return the contents of init.js located next to this file.
 *
 * @return string|false File contents, or false if the file cannot be read
 *                      (file_get_contents() semantics).
 */
function get_js() {
	return file_get_contents(__DIR__ . "/init.js");
}
112
3c43def9
AD
/**
 * Return the contents of init.js located next to this file; the same
 * script file that get_js() reads.
 *
 * @return string|false File contents, or false if the file cannot be read
 *                      (file_get_contents() semantics).
 */
function get_prefs_js() {
	return file_get_contents(__DIR__ . "/init.js");
}
116
853cc128
AD
/**
 * Build the three classifier buttons (train up, train down, show info)
 * for one article.
 *
 * @param array $line Article row; only $line["id"] is used.
 * @return string HTML containing three <img> elements.
 */
function hook_article_button($line) {
	// The id is embedded in inline JavaScript, so force it to an integer.
	$article_id = (int) $line["id"];

	// icon file, onclick handler, tooltip
	$buttons = array(
		array("thumb_up.png", "bayesTrain($article_id, true, event)", __('+1')),
		array("thumb_down.png", "bayesTrain($article_id, false, event)", __('-1')),
		array("chart_bar.png", "bayesShow($article_id)", __('Show classifier info')),
	);

	$html = "";

	foreach ($buttons as $button) {
		list($icon, $onclick, $title) = $button;

		// ENT_QUOTES: the title sits in a single-quoted attribute and a
		// translation may contain an apostrophe.
		$html .= "<img src=\"plugins/af_sort_bayes/$icon\"" .
			" style=\"cursor : pointer\"" .
			" onclick=\"$onclick\"" .
			" class='tagsPic' title='" . htmlspecialchars($title, ENT_QUOTES) . "'>";
	}

	return $html;
}
132
/**
 * Create the plugin's tables if they do not exist yet and, when a user is
 * logged in and has no categories, insert the GOOD, BAD and UGLY
 * categories for that user.
 *
 * Table names are built from $this->sql_prefix. The MySQL branch is used
 * when DB_TYPE is "mysql"; any other value gets the PostgreSQL-style DDL.
 */
function init_database() {
	$prefix = $this->sql_prefix;

	// TODO there probably should be a way for plugins to determine their schema version to upgrade tables

	/*$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_wordfreqs", false);
	$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_references", false);
	$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_categories", false);*/

	$this->dbh->query("BEGIN");

	if (DB_TYPE == "mysql") {

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
			id INTEGER NOT NULL PRIMARY KEY auto_increment,
			category varchar(100) NOT NULL DEFAULT '',
			probability DOUBLE NOT NULL DEFAULT '0',
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
			word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
			id INTEGER NOT NULL PRIMARY KEY auto_increment,
			document_id VARCHAR(255) NOT NULL,
			category_id INTEGER NOT NULL,
			FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
			word varchar(100) NOT NULL DEFAULT '',
			category_id INTEGER NOT NULL,
			FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
			count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

	} else {
		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
			id SERIAL NOT NULL PRIMARY KEY,
			category varchar(100) NOT NULL DEFAULT '',
			probability DOUBLE PRECISION NOT NULL DEFAULT '0',
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
			word_count BIGINT NOT NULL DEFAULT '0')");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
			id SERIAL NOT NULL PRIMARY KEY,
			document_id VARCHAR(255) NOT NULL,
			category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
			word varchar(100) NOT NULL DEFAULT '',
			category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
			count BIGINT NOT NULL DEFAULT '0')");
	}

	// No session uid means nobody is logged in; skip seeding then. The
	// int cast keeps the value safe to embed in the queries below.
	$owner_uid = isset($_SESSION["uid"]) ? (int) $_SESSION["uid"] : 0;

	if ($owner_uid) {
		$result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

		// Seed the default categories only for a user that has none.
		if ($this->dbh->num_rows($result) == 0) {
			foreach (array('GOOD', 'BAD', 'UGLY') as $category) {
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('$category', $owner_uid)");
			}
		}
	}

	$this->dbh->query("COMMIT");
}
208
/**
 * Print the preferences UI: per-category statistics for the current user,
 * the 20 most recently matched articles, and the Refresh / Clear database
 * buttons.
 *
 * Output is written directly with print; nothing is returned.
 */
function renderPrefsUI() {
	$prefix = $this->sql_prefix;

	// Cast once so the value is safe to embed in both queries below.
	$owner_uid = (int) $_SESSION["uid"];

	$result = $this->dbh->query("SELECT category, probability, word_count,
		(SELECT COUNT(id) FROM {$prefix}_references WHERE
			category_id = {$prefix}_categories.id) as doc_count
		FROM {$prefix}_categories WHERE owner_uid = $owner_uid");

	print "<h3>" . __("Statistics") . "</h3>";

	print "<p>".T_sprintf("Required UGLY word count for automatic matching: %d", $this->auto_categorize_threshold)."</p>";

	print "<table>";
	print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";

	while ($line = $this->dbh->fetch_assoc($result)) {
		print "<tr>";
		foreach ($line as $k => $v) {
			if ($k == "probability") $v = sprintf("%.3f", $v);

			print "<td>" . htmlspecialchars((string) $v) . "</td>";
		}
		print "</tr>";
	}

	print "</table>";

	print "<h3>" . __("Last matched articles") . "</h3>";

	// Restrict to the current user's entries and references; without the
	// owner_uid conditions the list would include other users' articles.
	$result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
		FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf, {$prefix}_references AS tr, {$prefix}_categories AS tc
		WHERE tf.id = tu.feed_id AND tu.ref_id = te.id AND tc.id = tr.category_id AND tr.document_id = te.guid
			AND tu.owner_uid = $owner_uid AND tr.owner_uid = $owner_uid
		ORDER BY te.id DESC LIMIT 20");

	print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";

	while ($line = $this->dbh->fetch_assoc($result)) {
		// Titles originate from feeds, so escape them before output.
		// NOTE(review): assumes titles are stored unescaped - confirm.
		print "<li>" . htmlspecialchars((string) $line["category"]) . ": " .
			htmlspecialchars((string) $line["title"]) .
			" (" . htmlspecialchars((string) $line["feed_title"]) . ")</li>";
	}

	print "</ul>";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
		__('Refresh')."</button> ";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
		__('Clear database')."</button> ";
}
256
/**
 * Print the plugin's accordion pane on the "prefPrefs" preferences tab.
 *
 * @param string $args Identifier of the tab being rendered; any value
 *                     other than "prefPrefs" produces no output.
 */
function hook_prefs_tab($args) {
	if ($args != "prefPrefs") return;

	print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"".__('Bayesian classifier (af_sort_bayes)')."\">";

	$this->renderPrefsUI();

	print "</div>";
}
266
/**
 * Classify an incoming article and train the classifier with it.
 *
 * An article that already has a stored reference is returned untouched.
 * Otherwise it is trained as UGLY, unless the UGLY word count has reached
 * $this->auto_categorize_threshold and the classifier reports a
 * probability above 0.90 for GOOD or BAD; in that case the article is
 * trained under that category and $article["score_modifier"] is raised
 * or lowered by $this->score_modifier.
 *
 * @param array $article Article data; reads "owner_uid", "guid_hashed",
 *                       "title", "content" and "score_modifier".
 * @return array The article, possibly with an adjusted "score_modifier".
 */
function hook_article_filter($article) {
	$owner_uid = $article["owner_uid"];

	// guid already includes owner_uid so we don't need to include it
	$result = $this->dbh->query("SELECT id FROM {$this->sql_prefix}_references WHERE
		document_id = '" . $this->dbh->escape_string($article['guid_hashed']) . "'");

	// Same result accessor as the other methods of this class.
	if ($this->dbh->num_rows($result) != 0) {
		_debug("bayes: article already categorized");
		return $article;
	}

	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);

	$categories = $nbs->getCategories();

	if (count($categories) > 0) {

		$count_neutral = 0;

		$id_good = 0;
		$id_ugly = 0;
		$id_bad = 0;

		foreach ($categories as $id => $cat) {
			if ($cat["category"] == "GOOD") {
				$id_good = $id;
			} else if ($cat["category"] == "UGLY") {
				$id_ugly = $id;
				$count_neutral += $cat["word_count"];
			} else if ($cat["category"] == "BAD") {
				$id_bad = $id;
			}
		}

		$dst_category = $id_ugly;

		$bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));

		if ($count_neutral >= $this->auto_categorize_threshold) {
			// enable automatic categorization

			$probabilities = $nb->categorize($bayes_content);

			//print_r($probabilities);

			// Change applied to the article's score modifier; 0 = none.
			$score_delta = 0;

			if (count($probabilities) == 3) {
				$prob_good = $probabilities[$id_good];
				$prob_bad = $probabilities[$id_bad];

				// the classifier may return NaN, which must never match
				if (!is_nan($prob_good) && $prob_good > 0.90) {
					$dst_category = $id_good;
					$score_delta = $this->score_modifier;
				} else if (!is_nan($prob_bad) && $prob_bad > 0.90) {
					$dst_category = $id_bad;
					$score_delta = -$this->score_modifier;
				}
			}

			if ($score_delta != 0) {
				// Treat a missing modifier as 0 instead of raising a notice.
				$current_modifier = isset($article["score_modifier"]) ? $article["score_modifier"] : 0;
				$article["score_modifier"] = $current_modifier + $score_delta;
			}

			_debug("bayes, dst category: $dst_category");
		}

		$nb->train($article["guid_hashed"], $dst_category, $bayes_content);

		$nb->updateProbabilities();
	}

	return $article;
}
338
3c43def9
AD
/**
 * Delete the current user's stored references and word frequencies, then
 * recalculate the category probabilities.
 *
 * The category rows themselves are not deleted here.
 */
function clearDatabase() {
	$prefix = $this->sql_prefix;

	// Cast once so the value is safe to concatenate into the SQL below.
	$owner_uid = (int) $_SESSION["uid"];

	$this->dbh->query("BEGIN");
	$this->dbh->query("DELETE FROM {$prefix}_references WHERE owner_uid = " . $owner_uid);
	$this->dbh->query("DELETE FROM {$prefix}_wordfreqs WHERE owner_uid = " . $owner_uid);
	$this->dbh->query("COMMIT");

	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);
	$nb->updateProbabilities();
}
351
168e32b9
AD
/**
 * Print classifier details for one article of the current user: the
 * category it is currently stored under and the probability the
 * classifier assigns to each category, followed by a close button.
 *
 * Reads "article_id" from $_REQUEST. Prints an error message instead of
 * the details when the article is not found.
 */
function showArticleStats() {
	$article_id = (int) $_REQUEST["article_id"];

	// Cast once so the value is safe to concatenate into the SQL below.
	$owner_uid = (int) $_SESSION["uid"];

	$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
		$article_id . " AND owner_uid = " . $owner_uid);

	if ($this->dbh->num_rows($result) != 0) {
		$guid = $this->dbh->fetch_result($result, 0, "guid");
		$title = $this->dbh->fetch_result($result, 0, "title");
		$content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));

		// The title originates from a feed, so escape it before output.
		// NOTE(review): assumes titles are stored unescaped - confirm.
		print "<h2>" . htmlspecialchars((string) $title) . "</h2>";

		$nbs = new NaiveBayesianStorage($owner_uid);
		$nb = new NaiveBayesian($nbs);

		$categories = $nbs->getCategories();

		$ref = $nbs->getReference($guid, false);

		$current_cat = isset($ref["category_id"]) ? $categories[$ref["category_id"]]["category"] : "N/A";

		print "<p>" . T_sprintf("Currently stored as: %s", $current_cat) . "</p>";

		// Keyed by category id, as used for the $categories lookup below.
		$probabilities = $nb->categorize($content);

		print "<h3>" . __("Classifier result") . "</h3>";

		print "<table>";
		print "<tr><th>Category</th><th>Probability</th></tr>";

		foreach ($probabilities as $category_id => $probability) {
			print "<tr>";
			print "<td>" . $categories[$category_id]["category"] . "</td>";
			print "<td>" . $probability . "</td>";

			print "</tr>";
		}

		print "</table>";

	} else {
		print_error("Article not found");
	}

	print "<div align='center'>";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return dijit.byId('bayesShowDlg').hide()\">".
		__('Close this window')."</button>";

	print "</div>";
}
405
853cc128
AD
/**
 * Return the plugin API version number this plugin declares.
 *
 * @return int Always 2.
 */
function api_version() {
	return 2;
}
409
410}
411?>