<?php

/**
 * Bayesian article classifier plugin for tt-rss (work in progress).
 *
 * Every user owns three classifier categories: GOOD, BAD and UGLY. UGLY is
 * the neutral bucket: new articles are trained into it until its word count
 * reaches $auto_categorize_threshold, after which incoming articles are
 * classified automatically and their score is raised or lowered by
 * $score_modifier. Users can also train individual articles up or down
 * through the buttons added by hook_article_button().
 *
 * Methods such as trainArticle(), renderPrefsUI(), clearDatabase() and
 * showArticleStats() print their result directly; presumably they are
 * invoked through the plugin request handler by init.js -- verify against
 * the caller.
 */
class Af_Sort_Bayes extends Plugin {

	private $host;
	private $filters = array();
	private $dbh;
	// absolute amount added to / subtracted from an article score
	private $score_modifier = 50;
	// common name prefix of the three plugin tables
	private $sql_prefix = "ttrss_plugin_af_sort_bayes";
	// UGLY word count required before automatic categorization kicks in
	private $auto_categorize_threshold = 10000;
	private $max_document_length = 3000; // classifier can't rescale output for very long strings apparently

	/**
	 * Plugin metadata: version, description, author.
	 *
	 * @return array
	 */
	function about() {
		return array(1.0,
			"Bayesian classifier for tt-rss (WIP)",
			"fox");
	}

	/**
	 * Loads the classifier library, makes sure the plugin tables exist and
	 * registers the article filter, prefs tab and article button hooks.
	 *
	 * @param object $host plugin host used to register the hooks
	 */
	function init($host) {
		require_once __DIR__ . "/lib/class.naivebayesian.php";
		//require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
		require_once __DIR__ . "/lib/class.naivebayesianstorage.php";

		$this->host = $host;
		$this->dbh = Db::get();

		$this->init_database();

		$host->add_hook($host::HOOK_ARTICLE_FILTER, $this);
		$host->add_hook($host::HOOK_PREFS_TAB, $this);
		$host->add_hook($host::HOOK_ARTICLE_BUTTON, $this);
	}

	/**
	 * Builds the text handed to the classifier: title and tag-stripped
	 * content, lowercased and cut to $max_document_length characters.
	 *
	 * @param string $title   article title
	 * @param string $content article content (may contain HTML)
	 * @return string
	 */
	private function make_document($title, $content) {
		return mb_substr(mb_strtolower($title . " " . strip_tags($content)),
			0, $this->max_document_length);
	}

	/**
	 * Moves an article one step up or down the BAD <-> UGLY <-> GOOD scale,
	 * retrains the classifier with it and stores the resulting fixed score.
	 *
	 * Reads "article_id" and "train_up" from $_REQUEST and prints
	 * "<article_id> :: <category> :: <score>". When the article does not
	 * belong to the current user nothing is changed and the score part of
	 * the output is empty.
	 */
	function trainArticle() {
		$article_id = (int) $_REQUEST["article_id"];
		$train_up = sql_bool_to_bool($_REQUEST["train_up"]);
		$owner_uid = (int) $_SESSION["uid"];

		//$category = $train_up ? "GOOD" : "UGLY";
		$dst_category = "UGLY";

		// stays null (printed as an empty string) when the article is not found
		$score = null;

		$nbs = new NaiveBayesianStorage($owner_uid);
		$nb = new NaiveBayesian($nbs);

		$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
			$article_id . " AND owner_uid = " . $owner_uid);

		if ($this->dbh->num_rows($result) != 0) {
			$guid = $this->dbh->fetch_result($result, 0, "guid");
			$title = $this->dbh->fetch_result($result, 0, "title");
			$content = $this->make_document($title, $this->dbh->fetch_result($result, 0, "content"));
			$score = $this->dbh->fetch_result($result, 0, "score");

			$this->dbh->query("BEGIN");

			$ref = $nbs->getReference($guid, false);

			// articles the classifier has not seen yet count as neutral
			if (isset($ref['category_id'])) {
				$current_category = $nbs->getCategoryById($ref['category_id']);
			} else {
				$current_category = "UGLY";
			}

			// set score to fixed value for now

			if ($train_up) {
				switch ($current_category) {
				case "UGLY":
					$dst_category = "GOOD";
					$score = $this->score_modifier;
					break;
				case "BAD":
					$dst_category = "UGLY";
					$score = 0;
					break;
				case "GOOD":
					// already at the top: keep category and stored score
					$dst_category = "GOOD";
					break;
				}
			} else {
				switch ($current_category) {
				case "UGLY":
					$dst_category = "BAD";
					$score = -$this->score_modifier;
					break;
				case "BAD":
					// already at the bottom: keep category and stored score
					$dst_category = "BAD";
					break;
				case "GOOD":
					$dst_category = "UGLY";
					$score = 0;
					break;
				}
			}

			$nb->untrain($guid, $content);
			$nb->train($guid, $nbs->getCategoryByName($dst_category), $content);

			$this->dbh->query("UPDATE ttrss_user_entries SET score = " . (int) $score .
				" WHERE ref_id = $article_id AND owner_uid = " . $owner_uid);

			$nb->updateProbabilities();

			$this->dbh->query("COMMIT");
		}

		print "$article_id :: $dst_category :: $score";
	}

	/**
	 * @return string contents of the plugin's init.js
	 */
	function get_js() {
		return file_get_contents(__DIR__ . "/init.js");
	}

	/**
	 * The preferences screen uses the same script as the main UI.
	 *
	 * @return string contents of the plugin's init.js
	 */
	function get_prefs_js() {
		return $this->get_js();
	}

	/**
	 * Renders the "+1", "-1" and "show classifier info" buttons for an article.
	 *
	 * @param array $line article row; only $line["id"] is used
	 * @return string HTML markup of the three buttons
	 */
	function hook_article_button($line) {
		// the id ends up inside inline JavaScript, so force it to be numeric
		$id = (int) $line["id"];

		return "<img src=\"plugins/af_sort_bayes/thumb_up.png\"
				style=\"cursor : pointer\"
				onclick=\"bayesTrain($id, true, event)\"
				class='tagsPic' title='".__('+1')."'>" .
			"<img src=\"plugins/af_sort_bayes/thumb_down.png\"
				style=\"cursor : pointer\"
				onclick=\"bayesTrain($id, false, event)\"
				class='tagsPic' title='".__('-1')."'>" .
			"<img src=\"plugins/af_sort_bayes/chart_bar.png\"
				style=\"cursor : pointer\"
				onclick=\"bayesShow($id)\"
				class='tagsPic' title='".__('Show classifier info')."'>";
	}

	/**
	 * Creates the categories, references and wordfreqs tables if they are
	 * missing (MySQL or PostgreSQL flavour depending on DB_TYPE) and, for a
	 * logged-in user without any categories, inserts GOOD, BAD and UGLY.
	 */
	function init_database() {
		$prefix = $this->sql_prefix;

		// TODO there probably should be a way for plugins to determine their schema version to upgrade tables

		/*$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_wordfreqs", false);
		$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_references", false);
		$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_categories", false);*/

		$this->dbh->query("BEGIN");

		// MySQL needs its own DDL; every other DB_TYPE gets the PostgreSQL variant

		if (DB_TYPE == "mysql") {

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
				id INTEGER NOT NULL PRIMARY KEY auto_increment,
				category varchar(100) NOT NULL DEFAULT '',
				probability DOUBLE NOT NULL DEFAULT '0',
				owner_uid INTEGER NOT NULL,
				FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
				word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
				id INTEGER NOT NULL PRIMARY KEY auto_increment,
				document_id VARCHAR(255) NOT NULL,
				category_id INTEGER NOT NULL,
				FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL,
				FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
				word varchar(100) NOT NULL DEFAULT '',
				category_id INTEGER NOT NULL,
				FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL,
				FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
				count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

		} else {
			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
				id SERIAL NOT NULL PRIMARY KEY,
				category varchar(100) NOT NULL DEFAULT '',
				probability DOUBLE PRECISION NOT NULL DEFAULT '0',
				owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
				word_count BIGINT NOT NULL DEFAULT '0')");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
				id SERIAL NOT NULL PRIMARY KEY,
				document_id VARCHAR(255) NOT NULL,
				category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");

			$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
				word varchar(100) NOT NULL DEFAULT '',
				category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
				owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
				count BIGINT NOT NULL DEFAULT '0')");
		}

		// no session uid means nobody is logged in: skip the per-user seeding
		$owner_uid = isset($_SESSION["uid"]) ? (int) $_SESSION["uid"] : 0;

		if ($owner_uid) {
			$result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

			if ($this->dbh->num_rows($result) == 0) {
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('BAD', $owner_uid)");
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('UGLY', $owner_uid)");
			}
		}

		$this->dbh->query("COMMIT");
	}

	/**
	 * Prints the body of the preferences pane: per-category statistics, the
	 * 20 most recent classified articles of the current user and the
	 * "Refresh" / "Clear database" buttons.
	 */
	function renderPrefsUI() {
		$owner_uid = (int) $_SESSION["uid"];

		$result = $this->dbh->query("SELECT category, probability, word_count,
			(SELECT COUNT(id) FROM {$this->sql_prefix}_references WHERE
				category_id = {$this->sql_prefix}_categories.id) as doc_count
			FROM {$this->sql_prefix}_categories WHERE owner_uid = " . $owner_uid);

		print "<h3>" . __("Statistics") . "</h3>";

		print "<p>".T_sprintf("Required UGLY word count for automatic matching: %d", $this->auto_categorize_threshold)."</p>";

		print "<table>";
		print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";

		while ($line = $this->dbh->fetch_assoc($result)) {
			print "<tr>";
			foreach ($line as $k => $v) {
				if ($k == "probability") $v = sprintf("%.3f", $v);

				print "<td>$v</td>";
			}
			print "</tr>";
		}

		print "</table>";

		print "<h3>" . __("Last matched articles") . "</h3>";

		// restrict both the user entries and the classifier references to the
		// current user, otherwise other users' articles would be listed as well
		$result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
			FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf, {$this->sql_prefix}_references AS tr, {$this->sql_prefix}_categories AS tc
			WHERE tf.id = tu.feed_id AND tu.ref_id = te.id AND tc.id = tr.category_id AND tr.document_id = te.guid
				AND tu.owner_uid = $owner_uid AND tr.owner_uid = $owner_uid
			ORDER BY te.id DESC LIMIT 20");

		print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";

		while ($line = $this->dbh->fetch_assoc($result)) {
			// titles originate from remote feeds and must not be emitted as raw HTML
			print "<li>" . $line["category"] . ": " .
				htmlspecialchars($line["title"], ENT_QUOTES, "UTF-8") .
				" (" . htmlspecialchars($line["feed_title"], ENT_QUOTES, "UTF-8") . ")</li>";
		}

		print "</ul>";

		print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
			__('Refresh')."</button> ";

		print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
			__('Clear database')."</button> ";

		//
	}

	/**
	 * Adds the classifier accordion pane to the "prefPrefs" preferences tab.
	 *
	 * @param string $args identifier of the tab being rendered
	 */
	function hook_prefs_tab($args) {
		if ($args != "prefPrefs") return;

		print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"".__('Bayesian classifier (af_sort_bayes)')."\">";

		$this->renderPrefsUI();

		print "</div>";
	}

	/**
	 * Classifies and trains a newly seen article.
	 *
	 * Articles that already have a classifier reference are returned
	 * untouched. Otherwise the article is trained as UGLY, unless the UGLY
	 * word count has reached the threshold and the classifier is more than
	 * 90% sure it is GOOD or BAD; in that case it is trained into that
	 * category and "score_modifier" is adjusted by $score_modifier.
	 *
	 * @param array $article article data; reads owner_uid, guid_hashed,
	 *                       title and content, may change score_modifier
	 * @return array the (possibly modified) article
	 */
	function hook_article_filter($article) {
		$owner_uid = $article["owner_uid"];

		// guid already includes owner_uid so we don't need to include it
		$result = $this->dbh->query("SELECT id FROM {$this->sql_prefix}_references WHERE
			document_id = '" . $this->dbh->escape_string($article['guid_hashed']) . "'");

		if ($this->dbh->num_rows($result) != 0) {
			_debug("bayes: article already categorized");
			return $article;
		}

		$nbs = new NaiveBayesianStorage($owner_uid);
		$nb = new NaiveBayesian($nbs);

		$categories = $nbs->getCategories();

		if (count($categories) > 0) {

			$count_neutral = 0;

			$id_good = 0;
			$id_ugly = 0;
			$id_bad = 0;

			foreach ($categories as $id => $cat) {
				if ($cat["category"] == "GOOD") {
					$id_good = $id;
				} else if ($cat["category"] == "UGLY") {
					$id_ugly = $id;
					$count_neutral += $cat["word_count"];
				} else if ($cat["category"] == "BAD") {
					$id_bad = $id;
				}
			}

			$dst_category = $id_ugly;

			$bayes_content = $this->make_document($article["title"], $article["content"]);

			if ($count_neutral >= $this->auto_categorize_threshold) {
				// enable automatic categorization

				$result = $nb->categorize($bayes_content);

				//print_r($result);

				if (count($result) == 3) {
					$prob_good = $result[$id_good];
					$prob_bad = $result[$id_bad];

					// the key may be missing from the incoming article; start from zero then
					$modifier = isset($article["score_modifier"]) ? $article["score_modifier"] : 0;

					if (!is_nan($prob_good) && $prob_good > 0.90) {
						$dst_category = $id_good;
						$article["score_modifier"] = $modifier + $this->score_modifier;
					} else if (!is_nan($prob_bad) && $prob_bad > 0.90) {
						$dst_category = $id_bad;
						$article["score_modifier"] = $modifier - $this->score_modifier;
					}
				}

				_debug("bayes, dst category: $dst_category");
			}

			$nb->train($article["guid_hashed"], $dst_category, $bayes_content);

			$nb->updateProbabilities();
		}

		return $article;
	}

	/**
	 * Deletes all classifier references and word frequencies of the current
	 * user and recalculates the category probabilities.
	 */
	function clearDatabase() {
		$prefix = $this->sql_prefix;
		$owner_uid = (int) $_SESSION["uid"];

		$this->dbh->query("BEGIN");
		$this->dbh->query("DELETE FROM {$prefix}_references WHERE owner_uid = " . $owner_uid);
		$this->dbh->query("DELETE FROM {$prefix}_wordfreqs WHERE owner_uid = " . $owner_uid);
		$this->dbh->query("COMMIT");

		$nbs = new NaiveBayesianStorage($owner_uid);
		$nb = new NaiveBayesian($nbs);
		$nb->updateProbabilities();
	}

	/**
	 * Prints the classifier info dialog for the article given in
	 * $_REQUEST["article_id"]: the category it is stored as and the
	 * probability the classifier currently assigns to each category.
	 */
	function showArticleStats() {
		$article_id = (int) $_REQUEST["article_id"];
		$owner_uid = (int) $_SESSION["uid"];

		$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
			$article_id . " AND owner_uid = " . $owner_uid);

		if ($this->dbh->num_rows($result) != 0) {
			$guid = $this->dbh->fetch_result($result, 0, "guid");
			$title = $this->dbh->fetch_result($result, 0, "title");

			$content = $this->make_document($title, $this->dbh->fetch_result($result, 0, "content"));

			// the title originates from a remote feed and must not be emitted as raw HTML
			print "<h2>" . htmlspecialchars($title, ENT_QUOTES, "UTF-8") . "</h2>";

			$nbs = new NaiveBayesianStorage($owner_uid);
			$nb = new NaiveBayesian($nbs);

			$categories = $nbs->getCategories();

			$ref = $nbs->getReference($guid, false);

			$current_cat = isset($ref["category_id"]) ? $categories[$ref["category_id"]]["category"] : "N/A";

			print "<p>" . T_sprintf("Currently stored as: %s", $current_cat) . "</p>";

			$result = $nb->categorize($content);

			print "<h3>" . __("Classifier result") . "</h3>";

			print "<table>";
			print "<tr><th>Category</th><th>Probability</th></tr>";

			foreach ($result as $k => $v) {
				print "<tr>";
				print "<td>" . $categories[$k]["category"] . "</td>";
				print "<td>" . $v . "</td>";

				print "</tr>";
			}

			print "</table>";

		} else {
			print_error("Article not found");
		}

		print "<div align='center'>";

		print "<button dojoType=\"dijit.form.Button\" onclick=\"return dijit.byId('bayesShowDlg').hide()\">".
			__('Close this window')."</button>";

		print "</div>";
	}

	/**
	 * @return int plugin API version implemented by this plugin
	 */
	function api_version() {
		return 2;
	}

}
?>