]> git.wh0rd.org - tt-rss.git/blob - plugins/af_sort_bayes/init.php
6599baa0f62187ca9f336ae0176a50fbddfde351
[tt-rss.git] / plugins / af_sort_bayes / init.php
1 <?php
2
3 class Af_Sort_Bayes extends Plugin {
4
5 private $host;
6 private $filters = array();
7 private $dbh;
8 private $score_modifier = 50;
9 private $sql_prefix = "ttrss_plugin_af_sort_bayes";
10 private $auto_categorize_threshold = 10000;
11 private $max_document_length = 3000; // classifier can't rescale output for very long strings apparently
12
/**
 * Describe this plugin.
 *
 * @return array three-element list: 1.0, a short description and "fox"
 *               (presumably version, description and author - confirm
 *               against the plugin host's expectations)
 */
function about() {
	$version = 1.0;
	$description = "Bayesian classifier for tt-rss (WIP)";
	$author = "fox";

	return array($version, $description, $author);
}
18
/**
 * Plugin entry point: loads the bundled classifier library, remembers the
 * host and database handles, makes sure the plugin tables exist and
 * registers this object for the hooks it implements.
 *
 * @param object $host plugin host; must provide add_hook() and the
 *                     HOOK_ARTICLE_FILTER / HOOK_PREFS_TAB / HOOK_ARTICLE_BUTTON constants
 */
function init($host) {
	$lib_dir = __DIR__ . "/lib";

	require_once $lib_dir . "/class.naivebayesian.php";
	//require_once $lib_dir . "/class.naivebayesian_ngram.php";
	require_once $lib_dir . "/class.naivebayesianstorage.php";

	$this->host = $host;
	$this->dbh = Db::get();

	// tables must exist before any hook can fire
	$this->init_database();

	$hooks = array(
		$host::HOOK_ARTICLE_FILTER,
		$host::HOOK_PREFS_TAB,
		$host::HOOK_ARTICLE_BUTTON,
	);

	foreach ($hooks as $hook) {
		$host->add_hook($hook, $this);
	}
}
34
/**
 * Train the classifier on a single article and print the outcome.
 *
 * Reads two request parameters:
 *  - article_id: id of the article (cast to int)
 *  - train_up:   flag converted with sql_bool_to_bool(); true moves the
 *                article one step towards GOOD, false one step towards BAD
 *
 * The article's currently stored category (UGLY when it has no stored
 * reference category) decides the destination:
 *   train up:   BAD -> UGLY -> GOOD (GOOD stays GOOD)
 *   train down: GOOD -> UGLY -> BAD (BAD stays BAD)
 *
 * The user's article score is set to +/- score_modifier when it lands on
 * GOOD/BAD from UGLY, to 0 when it lands on UGLY, and is left as is otherwise.
 *
 * Prints "<article_id> :: <destination category> :: <score>". When the
 * article does not belong to the current user nothing is trained and the
 * printed values are the defaults ("UGLY" and 0).
 */
function trainArticle() {
	$article_id = (int) $_REQUEST["article_id"];
	$train_up = sql_bool_to_bool($_REQUEST["train_up"]);

	// only used inside SQL; cast so the session value can never alter the statement
	$owner_uid = (int) $_SESSION["uid"];

	//$category = $train_up ? "GOOD" : "UGLY";
	$dst_category = "UGLY";

	// defined up front so the final print never reads an undefined variable
	// when the article lookup below comes back empty
	$score = 0;

	$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
	$nb = new NaiveBayesian($nbs);

	$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
		$article_id . " AND owner_uid = " . $owner_uid);

	if ($this->dbh->num_rows($result) != 0) {
		$guid = $this->dbh->fetch_result($result, 0, "guid");
		$title = $this->dbh->fetch_result($result, 0, "title");

		// classifier input: lowercased title + tag-stripped content, capped in length
		$content = mb_substr(mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))), 0, $this->max_document_length);
		$score = $this->dbh->fetch_result($result, 0, "score");

		$this->dbh->query("BEGIN");

		$ref = $nbs->getReference($guid, false);

		if (isset($ref['category_id'])) {
			$current_category = $nbs->getCategoryById($ref['category_id']);
		} else {
			$current_category = "UGLY";
		}

		// set score to fixed value for now

		if ($train_up) {
			switch ($current_category) {
			case "UGLY":
				$dst_category = "GOOD";
				$score = $this->score_modifier;
				break;
			case "BAD":
				$dst_category = "UGLY";
				$score = 0;
				break;
			case "GOOD":
				$dst_category = "GOOD";
				break;
			}
		} else {
			switch ($current_category) {
			case "UGLY":
				$dst_category = "BAD";
				$score = -$this->score_modifier;
				break;
			case "BAD":
				$dst_category = "BAD";
				break;
			case "GOOD":
				$dst_category = "UGLY";
				$score = 0;
				break;
			}
		}

		// forget the previous training for this document before filing it
		// under the destination category
		$nb->untrain($guid, $content);
		$nb->train($guid, $nbs->getCategoryByName($dst_category), $content);

		$this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $owner_uid);

		$nb->updateProbabilities();

		$this->dbh->query("COMMIT");
	}

	print "$article_id :: $dst_category :: $score";
}
109
/**
 * Return the contents of init.js, read from the directory this file lives in.
 *
 * @return string|false script source, or false when file_get_contents() fails
 */
function get_js() {
	$script_path = __DIR__ . "/init.js";

	return file_get_contents($script_path);
}
113
/**
 * Return the contents of init.js, read from the directory this file lives in.
 * This reads the same file as get_js().
 *
 * @return string|false script source, or false when file_get_contents() fails
 */
function get_prefs_js() {
	$script_path = __DIR__ . "/init.js";

	return file_get_contents($script_path);
}
117
/**
 * Build the classifier buttons for one article: train up, train down and
 * show classifier info.
 *
 * @param array $line article row; only $line["id"] is read
 * @return string HTML for three <img> buttons, each calling bayesTrain() or
 *                bayesShow() from its onclick handler
 */
function hook_article_button($line) {
	// the id ends up inside inline JavaScript, so force it to be an integer
	$article_id = (int) $line["id"];

	// icon file => array(onclick handler, tooltip)
	$buttons = array(
		"thumb_up.png" => array("bayesTrain($article_id, true, event)", __('+1')),
		"thumb_down.png" => array("bayesTrain($article_id, false, event)", __('-1')),
		"chart_bar.png" => array("bayesShow($article_id)", __('Show classifier info')),
	);

	$html = "";

	foreach ($buttons as $icon => $button) {
		list($onclick, $title) = $button;

		// one style attribute per tag (a repeated attribute is invalid HTML);
		// the tooltip is escaped because a translation may contain quotes
		$html .= "<img src=\"plugins/af_sort_bayes/$icon\" " .
			"style=\"cursor : pointer\" " .
			"onclick=\"$onclick\" " .
			"class='tagsPic' title='" . htmlspecialchars($title, ENT_QUOTES, 'UTF-8') . "'>";
	}

	return $html;
}
133
/**
 * Create the plugin tables when they do not exist yet and, when a user is
 * logged in, seed that user's GOOD / BAD / UGLY categories.
 *
 * Three tables are created, all named after $this->sql_prefix:
 *  - <prefix>_categories: per-user categories with probability and word count
 *  - <prefix>_references: which document is filed under which category
 *  - <prefix>_wordfreqs:  per-category word counts
 *
 * MySQL gets its own DDL (auto_increment, InnoDB, table-level foreign keys);
 * every other DB_TYPE gets the PostgreSQL-style statements.
 * Everything runs between one BEGIN and one COMMIT.
 */
function init_database() {
	$prefix = $this->sql_prefix;

	// TODO there probably should be a way for plugins to determine their schema version to upgrade tables
	// (until then, dropping the _wordfreqs, _references and _categories tables by hand resets the schema)

	$this->dbh->query("BEGIN");

	if (DB_TYPE == "mysql") {

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
			id INTEGER NOT NULL PRIMARY KEY auto_increment,
			category varchar(100) NOT NULL DEFAULT '',
			probability DOUBLE NOT NULL DEFAULT '0',
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
			word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
			id INTEGER NOT NULL PRIMARY KEY auto_increment,
			document_id VARCHAR(255) NOT NULL,
			category_id INTEGER NOT NULL,
			FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
			word varchar(100) NOT NULL DEFAULT '',
			category_id INTEGER NOT NULL,
			FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
			count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

	} else {
		// PostgreSQL flavour: SERIAL keys and inline REFERENCES clauses

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
			id SERIAL NOT NULL PRIMARY KEY,
			category varchar(100) NOT NULL DEFAULT '',
			probability DOUBLE PRECISION NOT NULL DEFAULT '0',
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
			word_count BIGINT NOT NULL DEFAULT '0')");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
			id SERIAL NOT NULL PRIMARY KEY,
			document_id VARCHAR(255) NOT NULL,
			category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
			word varchar(100) NOT NULL DEFAULT '',
			category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
			count BIGINT NOT NULL DEFAULT '0')");
	}

	// there may be no logged-in user when this runs; the integer cast also
	// keeps the session value from altering the SQL below
	$owner_uid = isset($_SESSION["uid"]) ? (int) $_SESSION["uid"] : 0;

	if ($owner_uid) {
		$result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

		// a user with no categories at all gets the three built-in ones
		if ($this->dbh->num_rows($result) == 0) {
			foreach (array('GOOD', 'BAD', 'UGLY') as $category) {
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('$category', $owner_uid)");
			}
		}
	}

	$this->dbh->query("COMMIT");
}
209
/**
 * Print the preferences pane body for the current user:
 *  - a statistics table (category, probability, word count, number of
 *    referenced documents),
 *  - the 20 most recent classified articles of this user,
 *  - "Refresh" and "Clear database" buttons wired to bayesUpdateUI() and
 *    bayesClearDatabase().
 *
 * Output is printed directly; nothing is returned.
 */
function renderPrefsUI() {
	$prefix = $this->sql_prefix;

	// only used inside SQL; cast so the session value can never alter the statement
	$owner_uid = (int) $_SESSION["uid"];

	$result = $this->dbh->query("SELECT category, probability, word_count,
		(SELECT COUNT(id) FROM {$prefix}_references WHERE
			category_id = {$prefix}_categories.id) as doc_count
		FROM {$prefix}_categories WHERE owner_uid = " . $owner_uid);

	print "<h3>" . __("Statistics") . "</h3>";

	print "<p>".T_sprintf("Required UGLY word count for automatic matching: %d", $this->auto_categorize_threshold)."</p>";

	print "<table>";
	print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";

	while ($line = $this->dbh->fetch_assoc($result)) {
		print "<tr>";
		foreach ($line as $k => $v) {
			if ($k == "probability") $v = sprintf("%.3f", $v);

			print "<td>$v</td>";
		}
		print "</tr>";
	}

	print "</table>";

	print "<h3>" . __("Last matched articles") . "</h3>";

	// restricted to the current user on both the article and the reference
	// side; without these conditions other users' articles would be listed
	$result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
		FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf, {$prefix}_references AS tr, {$prefix}_categories AS tc
		WHERE tf.id = tu.feed_id AND tu.ref_id = te.id AND tc.id = tr.category_id AND tr.document_id = te.guid
			AND tu.owner_uid = $owner_uid AND tr.owner_uid = $owner_uid
		ORDER BY te.id DESC LIMIT 20");

	print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";

	while ($line = $this->dbh->fetch_assoc($result)) {
		// titles originate from feeds, so they are escaped before being put into HTML
		// NOTE(review): assumes titles are stored as plain text, not pre-escaped HTML - confirm
		print "<li>" . $line["category"] . ": " .
			htmlspecialchars($line["title"], ENT_QUOTES, 'UTF-8') .
			" (" . htmlspecialchars($line["feed_title"], ENT_QUOTES, 'UTF-8') . ")</li>";
	}

	print "</ul>";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
		__('Refresh')."</button> ";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
		__('Clear database')."</button> ";
}
257
/**
 * Print this plugin's accordion pane, but only when the tab being rendered
 * is "prefPrefs"; any other tab is ignored.
 *
 * @param string $args name of the preferences tab being rendered
 */
function hook_prefs_tab($args) {
	if ($args == "prefPrefs") {
		$pane_title = __('Bayesian classifier (af_sort_bayes)');

		print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"" . $pane_title . "\">";

		$this->renderPrefsUI();

		print "</div>";
	}
}
267
/**
 * Classify an incoming article and record it as a training reference.
 *
 * Articles that already have a stored reference are returned untouched.
 * Otherwise the article is filed under UGLY unless automatic categorization
 * is active (the owner's UGLY word count has reached
 * auto_categorize_threshold) and the classifier gives GOOD or BAD a
 * probability above 0.90; in that case the article goes to that category and
 * its "score_modifier" is raised or lowered by $this->score_modifier.
 *
 * @param array $article article being imported; reads "owner_uid",
 *                       "guid_hashed", "title" and "content", and may
 *                       adjust "score_modifier"
 * @return array the (possibly adjusted) article
 */
function hook_article_filter($article) {
	$owner_uid = $article["owner_uid"];

	// guid already includes owner_uid so we don't need to include it
	$result = $this->dbh->query("SELECT id FROM {$this->sql_prefix}_references WHERE
		document_id = '" . $this->dbh->escape_string($article['guid_hashed']) . "'");

	// same database handle as the query above, as everywhere else in this class
	if ($this->dbh->num_rows($result) != 0) {
		_debug("bayes: article already categorized");
		return $article;
	}

	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);

	$categories = $nbs->getCategories();

	if (count($categories) > 0) {

		$count_neutral = 0;

		$id_good = 0;
		$id_ugly = 0;
		$id_bad = 0;

		// resolve the ids of the three built-in categories
		foreach ($categories as $id => $cat) {
			if ($cat["category"] == "GOOD") {
				$id_good = $id;
			} else if ($cat["category"] == "UGLY") {
				$id_ugly = $id;
				$count_neutral += $cat["word_count"];
			} else if ($cat["category"] == "BAD") {
				$id_bad = $id;
			}
		}

		$dst_category = $id_ugly;

		// classifier input: lowercased title + tag-stripped content, capped in length
		$bayes_content = mb_substr(mb_strtolower($article["title"] . " " . strip_tags($article["content"])), 0, $this->max_document_length);

		if ($count_neutral >= $this->auto_categorize_threshold) {
			// enable automatic categorization

			$result = $nb->categorize($bayes_content);

			//print_r($result);

			// only trust the result when it covers all three categories
			if (count($result) == 3) {
				$prob_good = $result[$id_good];
				$prob_bad = $result[$id_bad];

				// the key may be missing; start from 0 instead of
				// doing arithmetic on an undefined index
				if (!isset($article["score_modifier"])) {
					$article["score_modifier"] = 0;
				}

				if (!is_nan($prob_good) && $prob_good > 0.90) {
					$dst_category = $id_good;
					$article["score_modifier"] += $this->score_modifier;
				} else if (!is_nan($prob_bad) && $prob_bad > 0.90) {
					$dst_category = $id_bad;
					$article["score_modifier"] -= $this->score_modifier;
				}
			}

			_debug("bayes, dst category: $dst_category");
		}

		$nb->train($article["guid_hashed"], $dst_category, $bayes_content);

		$nb->updateProbabilities();
	}

	return $article;
}
339
/**
 * Delete the current user's document references and word frequencies in one
 * transaction, then have the classifier update its probabilities.
 * The user's category rows are not deleted here.
 */
function clearDatabase() {
	$prefix = $this->sql_prefix;

	// only used inside SQL; cast so the session value can never alter the statement
	$owner_uid = (int) $_SESSION["uid"];

	$this->dbh->query("BEGIN");
	$this->dbh->query("DELETE FROM {$prefix}_references WHERE owner_uid = " . $owner_uid);
	$this->dbh->query("DELETE FROM {$prefix}_wordfreqs WHERE owner_uid = " . $owner_uid);
	$this->dbh->query("COMMIT");

	$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
	$nb = new NaiveBayesian($nbs);
	$nb->updateProbabilities();
}
352
/**
 * Print classifier details for one article of the current user: its title,
 * the category it is currently stored under ("N/A" when none) and the
 * probability the classifier assigns to each category.
 *
 * Reads the request parameter article_id (cast to int). Prints an error via
 * print_error() when the article is not found for this user. A "Close this
 * window" button that hides the bayesShowDlg dialog is always printed.
 */
function showArticleStats() {
	$article_id = (int) $_REQUEST["article_id"];

	// only used inside SQL; cast so the session value can never alter the statement
	$owner_uid = (int) $_SESSION["uid"];

	$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
		$article_id . " AND owner_uid = " . $owner_uid);

	if ($this->dbh->num_rows($result) != 0) {
		$guid = $this->dbh->fetch_result($result, 0, "guid");
		$title = $this->dbh->fetch_result($result, 0, "title");

		// classifier input: lowercased title + tag-stripped content, capped in length
		$content = mb_substr(mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))), 0, $this->max_document_length);

		// the title originates from a feed, so escape it before putting it into HTML
		// NOTE(review): assumes titles are stored as plain text, not pre-escaped HTML - confirm
		print "<h2>" . htmlspecialchars($title, ENT_QUOTES, 'UTF-8') . "</h2>";

		$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
		$nb = new NaiveBayesian($nbs);

		$categories = $nbs->getCategories();

		$ref = $nbs->getReference($guid, false);

		// a reference may point at a category id that is not in $categories;
		// report that as "N/A" instead of reading an undefined index
		if (isset($ref["category_id"]) && isset($categories[$ref["category_id"]])) {
			$current_cat = $categories[$ref["category_id"]]["category"];
		} else {
			$current_cat = "N/A";
		}

		print "<p>" . T_sprintf("Currently stored as: %s", $current_cat) . "</p>";

		$result = $nb->categorize($content);

		print "<h3>" . __("Classifier result") . "</h3>";

		print "<table>";
		print "<tr><th>Category</th><th>Probability</th></tr>";

		// $result is keyed by category id
		foreach ($result as $k => $v) {
			print "<tr>";
			print "<td>" . $categories[$k]["category"] . "</td>";
			print "<td>" . $v . "</td>";

			print "</tr>";
		}

		print "</table>";

	} else {
		print_error("Article not found");
	}

	print "<div align='center'>";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return dijit.byId('bayesShowDlg').hide()\">".
		__('Close this window')."</button>";

	print "</div>";
}
407
/**
 * Report the API version as the constant 2.
 * Presumably the plugin API level the host checks - confirm.
 *
 * @return int always 2
 */
function api_version() {
	$supported_api = 2;

	return $supported_api;
}
411
412 }
413 ?>