]> git.wh0rd.org - tt-rss.git/blob - plugins/af_sort_bayes/init.php
d2e659cae31111d118f05b0ff7a2cdf8bdf52411
[tt-rss.git] / plugins / af_sort_bayes / init.php
1 <?php
2
/**
 * Bayesian article classifier plugin for tt-rss (work in progress).
 *
 * Articles are sorted into three per-user categories, "GOOD", "BAD" and
 * "UGLY" (neutral). Users train the classifier with the +1 / -1 article
 * buttons; once enough neutral words have been collected, newly filtered
 * articles are categorized automatically and their score is adjusted.
 */
class Af_Sort_Bayes extends Plugin {

	/* Plugin host handed to init(). */
	private $host;

	/* Currently not referenced anywhere in this class. */
	private $filters = array();

	/* Database handle obtained from Db::get(). */
	private $dbh;

	/* Absolute score applied to (or removed from) GOOD / BAD articles. */
	private $score_modifier = 50;

	/* Common name prefix of the tables owned by this plugin. */
	private $sql_prefix = "ttrss_plugin_af_sort_bayes";

	/* UGLY word count required before automatic categorization is enabled. */
	private $auto_categorize_threshold = 10000;

	/**
	 * Plugin metadata.
	 *
	 * @return array version, description, author
	 */
	function about() {
		return array(1.0,
			"Bayesian classifier for tt-rss (WIP)",
			"fox");
	}

	/**
	 * Loads the classifier library, makes sure the plugin tables exist and
	 * registers the article filter, prefs tab and article button hooks.
	 *
	 * @param object $host plugin host
	 */
	function init($host) {
		require_once __DIR__ . "/lib/class.naivebayesian.php";
		require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
		require_once __DIR__ . "/lib/class.naivebayesianstorage.php";

		$this->host = $host;
		$this->dbh = Db::get();

		$this->init_database();

		$host->add_hook($host::HOOK_ARTICLE_FILTER, $this);
		$host->add_hook($host::HOOK_PREFS_TAB, $this);
		$host->add_hook($host::HOOK_ARTICLE_BUTTON, $this);
	}

	/**
	 * Computes the next training state of an article.
	 *
	 * Training up moves BAD -> UGLY -> GOOD, training down moves
	 * GOOD -> UGLY -> BAD. An article already at the end of the scale keeps
	 * its category and its current score. An unknown category falls back to
	 * UGLY with the score left untouched.
	 *
	 * @param string $current_category category the article is filed under now
	 * @param bool $train_up true for +1, false for -1
	 * @param mixed $score current article score
	 * @return array destination category name, new score
	 */
	private function bayes_next_state($current_category, $train_up, $score) {
		if ($train_up) {
			switch ($current_category) {
			case "UGLY":
				return array("GOOD", $this->score_modifier);
			case "BAD":
				return array("UGLY", 0);
			case "GOOD":
				return array("GOOD", $score);
			}
		} else {
			switch ($current_category) {
			case "UGLY":
				return array("BAD", -$this->score_modifier);
			case "BAD":
				return array("BAD", $score);
			case "GOOD":
				return array("UGLY", 0);
			}
		}

		return array("UGLY", $score);
	}

	/**
	 * Handles a manual +1 / -1 training request for one article.
	 *
	 * Reads "article_id" and "train_up" from the request, retrains the
	 * classifier for the article of the current session user, stores the new
	 * score and prints "<article id> :: <category> :: <score>".
	 */
	function trainArticle() {
		$article_id = isset($_REQUEST["article_id"]) ? (int) $_REQUEST["article_id"] : 0;
		$train_up = isset($_REQUEST["train_up"]) && sql_bool_to_bool($_REQUEST["train_up"]);

		// only ever interpolated into SQL as an integer
		$owner_uid = (int) $_SESSION["uid"];

		// defaults reported when the article cannot be found
		$dst_category = "UGLY";
		$score = 0;

		$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
		$nb = new NaiveBayesianNgram($nbs, 3);

		$result = $this->dbh->query("SELECT score, guid, title, content
			FROM ttrss_entries, ttrss_user_entries
			WHERE ref_id = id AND id = $article_id AND owner_uid = $owner_uid");

		if ($this->dbh->num_rows($result) != 0) {
			$guid = $this->dbh->fetch_result($result, 0, "guid");
			$title = $this->dbh->fetch_result($result, 0, "title");
			$content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
			$score = $this->dbh->fetch_result($result, 0, "score");

			$this->dbh->query("BEGIN");

			$ref = $nbs->getReference($guid, false);

			// articles without a stored reference are treated as neutral
			if (isset($ref['category_id'])) {
				$current_category = $nbs->getCategoryById($ref['category_id']);
			} else {
				$current_category = "UGLY";
			}

			// set score to fixed value for now
			list($dst_category, $score) = $this->bayes_next_state($current_category, $train_up, $score);
			$score = (int) $score;

			// drop the previous training data for this document before refiling it
			$nb->untrain($guid, $content);
			$nb->train($guid, $nbs->getCategoryByName($dst_category), $content);

			$this->dbh->query("UPDATE ttrss_user_entries SET score = $score
				WHERE ref_id = $article_id AND owner_uid = $owner_uid");

			$nb->updateProbabilities();

			$this->dbh->query("COMMIT");
		}

		print "$article_id :: $dst_category :: $score";
	}

	/**
	 * @return string contents of init.js shipped next to this file
	 */
	function get_js() {
		return file_get_contents(__DIR__ . "/init.js");
	}

	/**
	 * @return string contents of init.js shipped next to this file
	 */
	function get_prefs_js() {
		return file_get_contents(__DIR__ . "/init.js");
	}

	/**
	 * Renders the thumbs up / thumbs down training buttons for an article.
	 *
	 * @param array $line article row; only "id" is used
	 * @return string HTML of both buttons
	 */
	function hook_article_button($line) {
		// the id ends up inside inline JS, keep it strictly numeric
		$article_id = (int) $line["id"];

		return "<img src=\"plugins/af_sort_bayes/thumb_up.png\"
			style=\"cursor : pointer\"
			onclick=\"bayesTrain($article_id, true)\"
			class='tagsPic' title='".__('+1')."'>" .
			"<img src=\"plugins/af_sort_bayes/thumb_down.png\"
			style=\"cursor : pointer\"
			onclick=\"bayesTrain($article_id, false)\"
			class='tagsPic' title='".__('-1')."'>";
	}

	/**
	 * Creates the plugin tables if they do not exist yet and, when a user is
	 * logged in and owns no categories, seeds the GOOD, BAD and UGLY ones.
	 */
	function init_database() {
		$prefix = $this->sql_prefix;

		// TODO there probably should be a way for plugins to determine their schema version to upgrade tables

		$this->dbh->query("BEGIN");

		// MySQL needs its own DDL (auto_increment, InnoDB, separate FOREIGN KEY
		// clauses); every other DB_TYPE gets the PostgreSQL flavour

		if (DB_TYPE == "mysql") {

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
				id INTEGER NOT NULL PRIMARY KEY auto_increment,
				category varchar(100) NOT NULL DEFAULT '',
				probability DOUBLE NOT NULL DEFAULT '0',
				owner_uid INTEGER NOT NULL,
				FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
				word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
				id INTEGER NOT NULL PRIMARY KEY auto_increment,
				document_id VARCHAR(255) NOT NULL,
				category_id INTEGER NOT NULL,
				FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL,
				FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
				word varchar(100) NOT NULL DEFAULT '',
				category_id INTEGER NOT NULL,
				FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL,
				FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
				count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

		} else {

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
				id SERIAL NOT NULL PRIMARY KEY,
				category varchar(100) NOT NULL DEFAULT '',
				probability DOUBLE PRECISION NOT NULL DEFAULT '0',
				owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
				word_count BIGINT NOT NULL DEFAULT '0')");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
				id SERIAL NOT NULL PRIMARY KEY,
				document_id VARCHAR(255) NOT NULL,
				category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
				word varchar(100) NOT NULL DEFAULT '',
				category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
				count BIGINT NOT NULL DEFAULT '0')");
		}

		// there may be no session user here, in which case nothing is seeded
		$owner_uid = isset($_SESSION["uid"]) ? (int) $_SESSION["uid"] : 0;

		if ($owner_uid) {
			$result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

			if ($this->dbh->num_rows($result) == 0) {
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('BAD', $owner_uid)");
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('UGLY', $owner_uid)");
			}
		}

		$this->dbh->query("COMMIT");
	}

	/**
	 * Prints the preferences pane content: per-category statistics, the last
	 * 20 classified articles of the current user and the Refresh / Clear
	 * database buttons.
	 */
	function renderPrefsUI() {
		$prefix = $this->sql_prefix;
		$owner_uid = (int) $_SESSION["uid"];

		$result = $this->dbh->query("SELECT category, probability, word_count,
			(SELECT COUNT(id) FROM {$prefix}_references WHERE
				category_id = {$prefix}_categories.id) as doc_count
			FROM {$prefix}_categories WHERE owner_uid = $owner_uid");

		print "<h3>" . __("Statistics") . "</h3>";

		print "<p>".T_sprintf("Required UGLY word count for automatic matching: %d", $this->auto_categorize_threshold)."</p>";

		print "<table>";
		print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";

		while ($line = $this->dbh->fetch_assoc($result)) {
			print "<tr>";
			foreach ($line as $k => $v) {
				if ($k == "probability") $v = sprintf("%.3f", $v);

				print "<td>" . htmlspecialchars($v, ENT_QUOTES, 'UTF-8', false) . "</td>";
			}
			print "</tr>";
		}

		print "</table>";

		print "<h3>" . __("Last matched articles") . "</h3>";

		// restricted to the session user so that other users' articles never show up here
		$result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
			FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf, {$prefix}_references AS tr, {$prefix}_categories AS tc
			WHERE tu.owner_uid = $owner_uid AND tr.owner_uid = $owner_uid AND
				tf.id = tu.feed_id AND tu.ref_id = te.id AND tc.id = tr.category_id AND tr.document_id = te.guid
			ORDER BY te.id DESC LIMIT 20");

		print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";

		while ($line = $this->dbh->fetch_assoc($result)) {
			// titles come from remote feeds: neutralize markup, but do not
			// re-encode entities that may already be present (double_encode = false)
			print "<li>" . htmlspecialchars($line["category"], ENT_QUOTES, 'UTF-8', false) . ": " .
				htmlspecialchars($line["title"], ENT_QUOTES, 'UTF-8', false) .
				" (" . htmlspecialchars($line["feed_title"], ENT_QUOTES, 'UTF-8', false) . ")</li>";
		}

		print "</ul>";

		print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
			__('Refresh')."</button> ";

		print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
			__('Clear database')."</button> ";
	}

	/**
	 * Adds the classifier accordion pane to the "prefPrefs" tab.
	 *
	 * @param string $args name of the preferences tab being rendered
	 */
	function hook_prefs_tab($args) {
		if ($args != "prefPrefs") return;

		print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"".__('Bayesian classifier (af_sort_bayes)')."\">";

		$this->renderPrefsUI();

		print "</div>";
	}

	/**
	 * Article filter hook: files a not yet categorized article.
	 *
	 * The article is trained as UGLY unless automatic categorization is
	 * enabled (enough UGLY words collected) and the classifier reports a
	 * probability above 0.90 for GOOD or BAD, in which case it is filed there
	 * and "score_modifier" is raised or lowered accordingly.
	 *
	 * @param array $article uses "owner_uid", "guid_hashed", "title", "content"
	 *                       and "score_modifier"
	 * @return array the (possibly modified) article
	 */
	function hook_article_filter($article) {
		$owner_uid = $article["owner_uid"];

		$nbs = new NaiveBayesianStorage($owner_uid);
		$nb = new NaiveBayesianNgram($nbs, 3);

		// references are stored under the hashed guid (see train() below), so
		// the lookup has to use the same key or it can never match
		$document_id = $article["guid_hashed"];

		$ref = $nbs->getReference($document_id, false);

		if (isset($ref["category_id"])) return $article; // already categorized

		$categories = $nbs->getCategories();

		if (count($categories) > 0) {

			$count_neutral = 0;

			$id_good = 0;
			$id_ugly = 0;
			$id_bad = 0;

			foreach ($categories as $id => $cat) {
				if ($cat["category"] == "GOOD") {
					$id_good = $id;
				} else if ($cat["category"] == "UGLY") {
					$id_ugly = $id;
					$count_neutral += $cat["word_count"];
				} else if ($cat["category"] == "BAD") {
					$id_bad = $id;
				}
			}

			$dst_category = $id_ugly;

			$bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));

			if ($count_neutral >= $this->auto_categorize_threshold) {
				// enable automatic categorization

				$result = $nb->categorize($bayes_content);

				if (count($result) == 3) {
					$prob_good = $result[$id_good];
					$prob_bad = $result[$id_bad];

					// the key may be missing from the incoming article
					if (!isset($article["score_modifier"])) $article["score_modifier"] = 0;

					if ($prob_good > 0.90) {
						$dst_category = $id_good;
						$article["score_modifier"] += $this->score_modifier;
					} else if ($prob_bad > 0.90) {
						$dst_category = $id_bad;
						$article["score_modifier"] -= $this->score_modifier;
					}
				}

				_debug("bayes, dst category: $dst_category");
			}

			$nb->train($document_id, $dst_category, $bayes_content);

			$nb->updateProbabilities();
		}

		return $article;
	}

	/**
	 * Removes all references and word frequencies of the current session
	 * user and recalculates the category probabilities. The categories
	 * themselves are kept.
	 */
	function clearDatabase() {
		$prefix = $this->sql_prefix;
		$owner_uid = (int) $_SESSION["uid"];

		$this->dbh->query("BEGIN");
		$this->dbh->query("DELETE FROM {$prefix}_references WHERE owner_uid = $owner_uid");
		$this->dbh->query("DELETE FROM {$prefix}_wordfreqs WHERE owner_uid = $owner_uid");
		$this->dbh->query("COMMIT");

		$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
		$nb = new NaiveBayesianNgram($nbs, 3);
		$nb->updateProbabilities();
	}

	/**
	 * @return int plugin API version implemented by this plugin
	 */
	function api_version() {
		return 2;
	}

}
348 ?>