]>
Commit | Line | Data |
---|---|---|
1 | <?php | |
2 | ||
3 | class Af_Sort_Bayes extends Plugin { | |
4 | ||
5 | private $host; | |
6 | private $filters = array(); | |
7 | private $dbh; | |
8 | private $score_modifier = 50; | |
9 | private $sql_prefix = "ttrss_plugin_af_sort_bayes"; | |
10 | private $auto_categorize_threshold = 10000; | |
11 | ||
/**
 * Returns the descriptor tuple for this plugin.
 *
 * @return array Three elements: 1.0, a description string and "fox"
 *               (presumably version, description and author — confirm
 *               against the plugin host's expectations).
 */
function about() {
	$version = 1.0;
	$description = "Bayesian classifier for tt-rss (WIP)";
	$author = "fox";

	return array($version, $description, $author);
}
17 | ||
/**
 * Sets the plugin up: loads the bundled classifier classes, keeps the host
 * and database handles, makes sure the plugin tables exist and registers
 * the hooks implemented by this class.
 *
 * @param object $host Plugin host; must provide add_hook() and the HOOK_*
 *                     constants referenced below.
 */
function init($host) {
	$lib_dir = __DIR__ . "/lib";

	require_once $lib_dir . "/class.naivebayesian.php";
	// class.naivebayesian_ngram.php is currently not loaded (disabled).
	require_once $lib_dir . "/class.naivebayesianstorage.php";

	$this->host = $host;
	$this->dbh = Db::get();

	$this->init_database();

	$hooks = array(
		$host::HOOK_ARTICLE_FILTER,
		$host::HOOK_PREFS_TAB,
		$host::HOOK_ARTICLE_BUTTON
	);

	foreach ($hooks as $hook) {
		$host->add_hook($hook, $this);
	}
}
33 | ||
/**
 * Handles a manual "+1" / "-1" training request for a single article.
 *
 * Reads "article_id" and "train_up" from $_REQUEST. If the article exists
 * for the session user, its category is moved one step along
 * BAD <-> UGLY <-> GOOD (an article without a stored reference is treated
 * as UGLY), the classifier is retrained for the article's guid with the new
 * category, and the article score is set to +score_modifier, 0 or
 * -score_modifier accordingly (unchanged when the category does not change).
 *
 * Always prints "<article_id> :: <category> :: <score>"; when the article
 * was not found the category is "UGLY" and the score part is empty.
 */
function trainArticle() {
	$article_id = (int) $_REQUEST["article_id"];
	$train_up = sql_bool_to_bool($_REQUEST["train_up"]);
	$owner_uid = $_SESSION["uid"];

	$dst_category = "UGLY";

	// Defined up front: the final print runs even when no article matched,
	// and previously read an undefined variable in that case. null prints
	// as an empty string, so the output is unchanged.
	$score = null;

	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);

	$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
		$article_id . " AND owner_uid = " . $owner_uid);

	if ($this->dbh->num_rows($result) != 0) {
		$guid = $this->dbh->fetch_result($result, 0, "guid");
		$title = $this->dbh->fetch_result($result, 0, "title");
		// Training text: lower-cased title plus tag-stripped content
		$content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
		$score = $this->dbh->fetch_result($result, 0, "score");

		$this->dbh->query("BEGIN");

		$ref = $nbs->getReference($guid, false);

		if (isset($ref['category_id'])) {
			$current_category = $nbs->getCategoryById($ref['category_id']);
		} else {
			// never trained before: start from the neutral category
			$current_category = "UGLY";
		}

		// set score to fixed value for now

		if ($train_up) {
			switch ($current_category) {
				case "UGLY":
					$dst_category = "GOOD";
					$score = $this->score_modifier;
					break;
				case "BAD":
					$dst_category = "UGLY";
					$score = 0;
					break;
				case "GOOD":
					// already at the top: keep category and stored score
					$dst_category = "GOOD";
					break;
			}
		} else {
			switch ($current_category) {
				case "UGLY":
					$dst_category = "BAD";
					$score = -$this->score_modifier;
					break;
				case "BAD":
					// already at the bottom: keep category and stored score
					$dst_category = "BAD";
					break;
				case "GOOD":
					$dst_category = "UGLY";
					$score = 0;
					break;
			}
		}

		// Drop the previous training for this document before re-adding it
		// under the destination category.
		$nb->untrain($guid, $content);
		$nb->train($guid, $nbs->getCategoryByName($dst_category), $content);

		$this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $owner_uid);

		$nb->updateProbabilities();

		$this->dbh->query("COMMIT");
	}

	print "$article_id :: $dst_category :: $score";
}
108 | ||
/**
 * Returns the contents of the plugin's init.js, located next to this file.
 *
 * @return string|false File contents, or false if the file cannot be read.
 */
function get_js() {
	$script_path = __DIR__ . "/init.js";

	return file_get_contents($script_path);
}
112 | ||
/**
 * Returns the script for the preferences screen; this is the same init.js
 * file that get_js() reads.
 *
 * @return string|false File contents, or false if the file cannot be read.
 */
function get_prefs_js() {
	$script_path = __DIR__ . "/init.js";

	return file_get_contents($script_path);
}
116 | ||
/**
 * Builds the thumbs-up / thumbs-down buttons shown for an article.
 *
 * Each button is an <img> whose onclick calls the client-side
 * bayesTrain(<article id>, true|false).
 *
 * @param array $line Article row; only $line["id"] is used.
 * @return string HTML for both buttons (up first, then down).
 */
function hook_article_button($line) {
	// The id ends up inside inline JavaScript, so force it to an integer.
	$article_id = (int) $line["id"];

	// icon file, JS literal passed as the "train up" flag, tooltip
	$buttons = array(
		array("thumb_up.png", "true", __('+1')),
		array("thumb_down.png", "false", __('-1'))
	);

	$html = "";

	foreach ($buttons as $button) {
		list($icon, $train_up, $title) = $button;

		// The style attribute used to be emitted twice per tag, which is
		// invalid HTML; it is now written once.
		$html .= "<img src=\"plugins/af_sort_bayes/$icon\" " .
			"style=\"cursor : pointer\" " .
			"onclick=\"bayesTrain($article_id, $train_up)\" " .
			"class='tagsPic' title='$title'>";
	}

	return $html;
}
128 | ||
/**
 * Creates the plugin's tables if they do not exist yet and, when a user is
 * logged in, seeds that user's GOOD / BAD / UGLY categories.
 *
 * Three tables are created, all named after $this->sql_prefix:
 * _categories, _references and _wordfreqs. MySQL DDL is used when DB_TYPE
 * is "mysql"; any other DB_TYPE gets the SERIAL / DOUBLE PRECISION variant.
 * Everything is issued between BEGIN and COMMIT.
 */
function init_database() {
	$prefix = $this->sql_prefix;

	// TODO there probably should be a way for plugins to determine their schema version to upgrade tables

	/*$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_wordfreqs", false);
	$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_references", false);
	$this->dbh->query("DROP TABLE IF EXISTS {$prefix}_categories", false);*/

	$this->dbh->query("BEGIN");

	// Schema differs per backend: auto_increment / InnoDB for MySQL,
	// SERIAL and inline REFERENCES otherwise.

	if (DB_TYPE == "mysql") {

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
			id INTEGER NOT NULL PRIMARY KEY auto_increment,
			category varchar(100) NOT NULL DEFAULT '',
			probability DOUBLE NOT NULL DEFAULT '0',
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
			word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
			id INTEGER NOT NULL PRIMARY KEY auto_increment,
			document_id VARCHAR(255) NOT NULL,
			category_id INTEGER NOT NULL,
			FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE) ENGINE=InnoDB");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
			word varchar(100) NOT NULL DEFAULT '',
			category_id INTEGER NOT NULL,
			FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL,
			FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
			count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

	} else {

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
			id SERIAL NOT NULL PRIMARY KEY,
			category varchar(100) NOT NULL DEFAULT '',
			probability DOUBLE PRECISION NOT NULL DEFAULT '0',
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
			word_count BIGINT NOT NULL DEFAULT '0')");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
			id SERIAL NOT NULL PRIMARY KEY,
			document_id VARCHAR(255) NOT NULL,
			category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE)");

		$this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
			word varchar(100) NOT NULL DEFAULT '',
			category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
			owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
			count BIGINT NOT NULL DEFAULT '0')");
	}

	// No session uid may be available here; in that case only the tables
	// are created and seeding is skipped.
	$owner_uid = isset($_SESSION["uid"]) ? $_SESSION["uid"] : null;

	if ($owner_uid) {
		$result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

		// Seed the three fixed categories once per user, in this order.
		if ($this->dbh->num_rows($result) == 0) {
			foreach (array('GOOD', 'BAD', 'UGLY') as $category) {
				$this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('$category', $owner_uid)");
			}
		}
	}

	$this->dbh->query("COMMIT");
}
204 | ||
/**
 * Prints the plugin's preferences pane body for the session user:
 * a per-category statistics table (category, probability, word count,
 * referenced article count), the 20 most recent classified articles and
 * the "Refresh" / "Clear database" buttons.
 *
 * All database-sourced text is HTML-escaped before output, and the article
 * list is restricted to the session user's own entries.
 */
function renderPrefsUI() {
	$prefix = $this->sql_prefix;
	$owner_uid = $_SESSION["uid"];

	$result = $this->dbh->query("SELECT category, probability, word_count,
		(SELECT COUNT(id) FROM {$prefix}_references WHERE
			category_id = {$prefix}_categories.id) as doc_count
		FROM {$prefix}_categories WHERE owner_uid = " . $owner_uid);

	print "<h3>" . __("Statistics") . "</h3>";

	print "<p>".T_sprintf("Required UGLY word count for automatic matching: %d", $this->auto_categorize_threshold)."</p>";

	print "<table>";
	print "<tr><th>Category</th><th>Probability</th><th>Words</th><th>Articles</th></tr>";

	while ($line = $this->dbh->fetch_assoc($result)) {
		print "<tr>";
		foreach ($line as $k => $v) {
			if ($k == "probability") $v = sprintf("%.3f", $v);

			print "<td>" . htmlspecialchars((string) $v, ENT_QUOTES, 'UTF-8') . "</td>";
		}
		print "</tr>";
	}

	print "</table>";

	print "<h3>" . __("Last matched articles") . "</h3>";

	// Without the owner_uid conditions this join returned matching rows of
	// every user, not just the one viewing the preferences.
	$result = $this->dbh->query("SELECT te.title, category, tf.title AS feed_title
		FROM ttrss_entries AS te, ttrss_user_entries AS tu, ttrss_feeds AS tf, {$prefix}_references AS tr, {$prefix}_categories AS tc
		WHERE tf.id = tu.feed_id AND tu.ref_id = te.id AND tc.id = tr.category_id AND tr.document_id = te.guid
			AND tu.owner_uid = $owner_uid AND tr.owner_uid = $owner_uid
		ORDER BY te.id DESC LIMIT 20");

	print "<ul class=\"browseFeedList\" style=\"border-width : 1px\">";

	while ($line = $this->dbh->fetch_assoc($result)) {
		// Titles originate from feeds, so they must not be printed as raw HTML.
		$category = htmlspecialchars((string) $line["category"], ENT_QUOTES, 'UTF-8');
		$title = htmlspecialchars((string) $line["title"], ENT_QUOTES, 'UTF-8');
		$feed_title = htmlspecialchars((string) $line["feed_title"], ENT_QUOTES, 'UTF-8');

		print "<li>" . $category . ": " . $title . " (" . $feed_title . ")</li>";
	}

	print "</ul>";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesUpdateUI()\">".
		__('Refresh')."</button> ";

	print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
		__('Clear database')."</button> ";
}
252 | ||
/**
 * Prints this plugin's accordion pane when the "prefPrefs" tab is rendered;
 * does nothing for any other tab.
 *
 * @param string $args Identifier of the preferences tab being rendered.
 */
function hook_prefs_tab($args) {
	if ($args == "prefPrefs") {
		$pane_title = __('Bayesian classifier (af_sort_bayes)');

		print "<div id=\"af_sort_bayes_prefs\" dojoType=\"dijit.layout.AccordionPane\" title=\"" . $pane_title . "\">";

		$this->renderPrefsUI();

		print "</div>";
	}
}
262 | ||
/**
 * Article filter hook: trains the classifier with the incoming article and,
 * once enough neutral material has been collected, classifies it first.
 *
 * The article text (lower-cased title plus tag-stripped content) is trained
 * into the UGLY category by default. When the UGLY word count has reached
 * $this->auto_categorize_threshold, the text is categorized first: a GOOD
 * probability above 0.90 trains it as GOOD and adds score_modifier to
 * $article["score_modifier"]; otherwise a BAD probability above 0.90 trains
 * it as BAD and subtracts score_modifier.
 *
 * @param array $article Article data; reads "owner_uid", "guid_hashed",
 *                       "title", "content" and updates "score_modifier".
 * @return array The (possibly re-scored) article.
 */
function hook_article_filter($article) {
	$owner_uid = $article["owner_uid"];

	// NOTE(review): a SELECT against the _references table for this
	// article's guid_hashed used to be issued here, but its result was
	// never read (it was overwritten by categorize() below), so the query
	// has been dropped. If the intent was to skip articles that already
	// have a reference, that check needs to be added explicitly — confirm.

	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);

	$categories = $nbs->getCategories();

	if (count($categories) > 0) {

		$count_neutral = 0;

		$id_good = 0;
		$id_ugly = 0;
		$id_bad = 0;

		// Resolve the ids of the three fixed categories for this user
		foreach ($categories as $id => $cat) {
			if ($cat["category"] == "GOOD") {
				$id_good = $id;
			} else if ($cat["category"] == "UGLY") {
				$id_ugly = $id;
				$count_neutral += $cat["word_count"];
			} else if ($cat["category"] == "BAD") {
				$id_bad = $id;
			}
		}

		$dst_category = $id_ugly;

		$bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));

		if ($count_neutral >= $this->auto_categorize_threshold) {
			// enable automatic categorization

			$probabilities = $nb->categorize($bayes_content);

			// Only act on a full three-category answer that actually
			// contains the GOOD and BAD entries.
			if (count($probabilities) == 3 &&
					isset($probabilities[$id_good]) && isset($probabilities[$id_bad])) {

				$prob_good = $probabilities[$id_good];
				$prob_bad = $probabilities[$id_bad];

				if ($prob_good > 0.90) {
					$dst_category = $id_good;
					$article["score_modifier"] += $this->score_modifier;
				} else if ($prob_bad > 0.90) {
					$dst_category = $id_bad;
					$article["score_modifier"] -= $this->score_modifier;
				}
			}

			_debug("bayes, dst category: $dst_category");
		}

		$nb->train($article["guid_hashed"], $dst_category, $bayes_content);

		$nb->updateProbabilities();
	}

	return $article;
}
329 | ||
/**
 * Removes all training data of the session user: deletes their rows from
 * the _references and _wordfreqs tables in one transaction, then has the
 * classifier recompute its probabilities. Category rows are kept.
 */
function clearDatabase() {
	$prefix = $this->sql_prefix;
	$owner_uid = $_SESSION["uid"];

	$this->dbh->query("BEGIN");
	$this->dbh->query("DELETE FROM {$prefix}_references WHERE owner_uid = " . $owner_uid);
	$this->dbh->query("DELETE FROM {$prefix}_wordfreqs WHERE owner_uid = " . $owner_uid);
	$this->dbh->query("COMMIT");

	// Refresh the stored probabilities now that the data is gone
	$nbs = new NaiveBayesianStorage($owner_uid);
	$nb = new NaiveBayesian($nbs);
	$nb->updateProbabilities();
}
342 | ||
/**
 * Reports the plugin API version this plugin declares.
 *
 * @return int Always 2.
 */
function api_version() {
	$declared_version = 2;

	return $declared_version;
}
346 | ||
347 | } | |
348 | ?> |