]> git.wh0rd.org - tt-rss.git/blob - plugins/af_sort_bayes/init.php
use ngram tokens instead of whole words for matching
[tt-rss.git] / plugins / af_sort_bayes / init.php
1 <?php
2
3 class Af_Sort_Bayes extends Plugin {
4
5 private $host;
6 private $filters = array();
7 private $dbh;
8 private $score_modifier = 50;
9 private $sql_prefix = "ttrss_plugin_af_sort_bayes";
10
/**
 * Returns the descriptor tuple for this plugin.
 *
 * The three elements are presumably version, description and author, in
 * that order (confirm against the Plugin base class / plugin host).
 *
 * @return array
 */
function about() {
    $version = 1.0;
    $description = "Bayesian classifier for tt-rss (WIP)";
    $author = "fox";

    return array($version, $description, $author);
}
16
/**
 * Plugin entry point: loads the bundled classifier classes, prepares the
 * plugin tables and registers this object for the hooks it implements.
 *
 * @param object $host Plugin host; must provide add_hook() and the
 *                     HOOK_ARTICLE_FILTER, HOOK_PREFS_TAB and
 *                     HOOK_ARTICLE_BUTTON constants.
 */
function init($host) {
    $libraries = array("naivebayesian", "naivebayesian_ngram", "naivebayesianstorage");

    foreach ($libraries as $library) {
        require_once __DIR__ . "/lib/class." . $library . ".php";
    }

    $this->host = $host;
    $this->dbh = Db::get();

    // Create the plugin tables before any hook can run.
    $this->init_database();

    $hooks = array(
        $host::HOOK_ARTICLE_FILTER,
        $host::HOOK_PREFS_TAB,
        $host::HOOK_ARTICLE_BUTTON
    );

    foreach ($hooks as $hook) {
        $host->add_hook($hook, $this);
    }
}
32
/**
 * Trains the classifier with a single article on behalf of the logged-in user.
 *
 * Reads two values from $_REQUEST:
 *   - article_id: id of the article to train on (cast to int, 0 if missing);
 *   - train_up:   boolean-ish flag; true files the article under "GOOD",
 *                 anything else (or a missing value) under "NEUTRAL".
 *
 * Any previous training of the same article is undone first, the article
 * score is adjusted by $this->score_modifier, and the category probabilities
 * are refreshed. Prints "<article_id> :: <category>" whether or not the
 * article was found.
 */
function trainArticle() {
    // Missing request parameters fall back to defaults instead of raising notices.
    $article_id = isset($_REQUEST["article_id"]) ? (int) $_REQUEST["article_id"] : 0;
    $train_up = isset($_REQUEST["train_up"]) && sql_bool_to_bool($_REQUEST["train_up"]);

    // Only integers are ever spliced into the SQL below.
    $owner_uid = (int) $_SESSION["uid"];

    $category = $train_up ? "GOOD" : "NEUTRAL";

    $nbs = new NaiveBayesianStorage($_SESSION["uid"]);
    $nb = new NaiveBayesianNgram($nbs);

    $result = $this->dbh->query("SELECT score, guid, title, content
        FROM ttrss_entries, ttrss_user_entries
        WHERE ref_id = id AND id = $article_id AND owner_uid = $owner_uid");

    if ($this->dbh->num_rows($result) != 0) {
        $guid = $this->dbh->fetch_result($result, 0, "guid");
        $title = $this->dbh->fetch_result($result, 0, "title");
        $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
        $score = (int) $this->dbh->fetch_result($result, 0, "score");

        $this->dbh->query("BEGIN");

        // NOTE(review): the bonus is removed whenever untrain() returns a truthy
        // value, regardless of which category the article was previously
        // trained as -- confirm this is the intended behaviour.
        if ($nb->untrain($guid, $content)) {
            if ($score >= $this->score_modifier) $score -= $this->score_modifier;
        }

        $nb->train($guid, $nbs->getCategoryByName($category), $content);

        if ($category == "GOOD") $score += $this->score_modifier;

        $this->dbh->query("UPDATE ttrss_user_entries SET score = '$score'
            WHERE ref_id = $article_id AND owner_uid = $owner_uid");

        $nb->updateProbabilities();

        $this->dbh->query("COMMIT");
    }

    print "$article_id :: $category";
}
71
/**
 * Returns the contents of the init.js file that sits next to this plugin file.
 *
 * @return string|false Script source, or false if the file cannot be read.
 */
function get_js() {
    $script_path = __DIR__ . "/init.js";

    return file_get_contents($script_path);
}
75
/**
 * Returns the contents of the init.js file that sits next to this plugin
 * file; the same script that get_js() serves.
 *
 * @return string|false Script source, or false if the file cannot be read.
 */
function get_prefs_js() {
    $script_path = __DIR__ . "/init.js";

    return file_get_contents($script_path);
}
79
/**
 * Builds the thumbs-up / thumbs-down training buttons for one article.
 *
 * Each button calls the client-side bayesTrain(id, flag) function with the
 * article id and true (thumbs up) or false (thumbs down).
 *
 * @param array $line Article row; only $line["id"] is used.
 * @return string HTML for the two <img> buttons.
 */
function hook_article_button($line) {
    // The id is embedded in inline JavaScript, so force it to an integer.
    $article_id = (int) $line["id"];

    // icon file, JS boolean literal passed to bayesTrain(), tooltip text
    $buttons = array(
        array("thumb_up.png", "true", __('+1')),
        array("thumb_down.png", "false", __('-1'))
    );

    $html = "";

    foreach ($buttons as $button) {
        list($icon, $train_up, $title) = $button;

        // A single style attribute: the original emitted it twice per tag,
        // which is invalid HTML. The title lives in a single-quoted
        // attribute, hence ENT_QUOTES.
        $html .= "<img src=\"plugins/af_sort_bayes/$icon\"" .
            " style=\"cursor : pointer\"" .
            " onclick=\"bayesTrain($article_id, $train_up)\"" .
            " class='tagsPic' title='" . htmlspecialchars($title, ENT_QUOTES) . "'>";
    }

    return $html;
}
91
/**
 * Creates the plugin tables if they do not exist yet and, when a user is
 * logged in, seeds that user's "GOOD" and "NEUTRAL" categories.
 *
 * Three tables are managed, all named after $this->sql_prefix:
 *   - <prefix>_categories: one row per user and category;
 *   - <prefix>_references: one row per trained document;
 *   - <prefix>_wordfreqs:  per-category token counts.
 *
 * Separate DDL is issued for MySQL (DB_TYPE == "mysql") and for every other
 * DB_TYPE, which gets PostgreSQL syntax.
 */
function init_database() {
    $prefix = $this->sql_prefix;

    // TODO there probably should be a way for plugins to determine their schema version to upgrade tables

    // For a development reset, drop {$prefix}_wordfreqs, {$prefix}_references
    // and {$prefix}_categories (in that order) before the tables are recreated.

    $this->dbh->query("BEGIN");

    if (DB_TYPE == "mysql") {

        $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
            id INTEGER NOT NULL PRIMARY KEY auto_increment,
            category varchar(100) NOT NULL DEFAULT '',
            probability DOUBLE NOT NULL DEFAULT '0',
            owner_uid INTEGER NOT NULL,
            FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
            word_count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

        $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
            id INTEGER NOT NULL PRIMARY KEY auto_increment,
            document_id VARCHAR(255) NOT NULL,
            category_id INTEGER NOT NULL,
            FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
            owner_uid INTEGER NOT NULL,
            FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
            content text NOT NULL) ENGINE=InnoDB");

        $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
            word varchar(100) NOT NULL DEFAULT '',
            category_id INTEGER NOT NULL,
            FOREIGN KEY (category_id) REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
            owner_uid INTEGER NOT NULL,
            FOREIGN KEY (owner_uid) REFERENCES ttrss_users(id) ON DELETE CASCADE,
            count BIGINT NOT NULL DEFAULT '0') ENGINE=InnoDB");

    } else {

        // PostgreSQL has no bare DOUBLE type; the floating point column must
        // be declared as DOUBLE PRECISION or the CREATE TABLE fails.
        $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_categories (
            id SERIAL NOT NULL PRIMARY KEY,
            category varchar(100) NOT NULL DEFAULT '',
            probability DOUBLE PRECISION NOT NULL DEFAULT '0',
            owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
            word_count BIGINT NOT NULL DEFAULT '0')");

        $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_references (
            id SERIAL NOT NULL PRIMARY KEY,
            document_id VARCHAR(255) NOT NULL,
            category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
            owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
            content text NOT NULL)");

        $this->dbh->query("CREATE TABLE IF NOT EXISTS {$prefix}_wordfreqs (
            word varchar(100) NOT NULL DEFAULT '',
            category_id INTEGER NOT NULL REFERENCES {$prefix}_categories(id) ON DELETE CASCADE,
            owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
            count BIGINT NOT NULL DEFAULT '0')");
    }

    // No session uid means there is nobody to seed categories for; the value
    // is cast to int because it is spliced into the SQL below.
    $owner_uid = isset($_SESSION["uid"]) ? (int) $_SESSION["uid"] : 0;

    if ($owner_uid) {
        $result = $this->dbh->query("SELECT id FROM {$prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");

        // Seed both categories only when the user has none at all.
        if ($this->dbh->num_rows($result) == 0) {
            $this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
            $this->dbh->query("INSERT INTO {$prefix}_categories (category, owner_uid) VALUES ('NEUTRAL', $owner_uid)");
        }
    }

    $this->dbh->query("COMMIT");
}
168
/**
 * Renders the classifier statistics pane in the preferences UI.
 *
 * Does nothing unless $args is "prefPrefs". Otherwise prints an accordion
 * pane with one table row per category of the logged-in user (category,
 * probability, word count, number of trained articles) followed by a
 * "Clear database" button that calls the client-side bayesClearDatabase().
 *
 * @param string $args Identifier of the preferences tab being rendered.
 */
function hook_prefs_tab($args) {
    if ($args != "prefPrefs") return;

    $prefix = $this->sql_prefix;

    // Cast because the value is spliced into the SQL below.
    $owner_uid = (int) $_SESSION["uid"];

    print "<div dojoType=\"dijit.layout.AccordionPane\" title=\"" .
        htmlspecialchars(__('Bayesian classifier (af_sort_bayes)')) . "\">";

    // The column order of this SELECT must match the table header below:
    // the rows are printed column by column in query order.
    $result = $this->dbh->query("SELECT category, probability, word_count,
        (SELECT COUNT(id) FROM {$prefix}_references WHERE
            category_id = {$prefix}_categories.id) as doc_count
        FROM {$prefix}_categories WHERE owner_uid = $owner_uid");

    print "<table>";
    print "<tr><th>Category</th><th>Probability</th><th>Word count</th><th>Article count</th></tr>";

    while ($line = $this->dbh->fetch_assoc($result)) {
        print "<tr>";

        foreach ($line as $k => $v) {
            if ($k == "probability") $v = sprintf("%.3f", $v);

            // Database values are escaped before being written into the page.
            print "<td>" . htmlspecialchars((string) $v) . "</td>";
        }

        print "</tr>";
    }

    print "</table>";

    print "<button dojoType=\"dijit.form.Button\" onclick=\"return bayesClearDatabase()\">".
        __('Clear database')."</button> ";

    print "</div>";
}
201
/**
 * Article filter hook: feeds every incoming article to the classifier and,
 * once enough training data exists, raises the score of articles that are
 * classified as "GOOD".
 *
 * The article is trained as NEUTRAL by default. Automatic categorization is
 * enabled only when both categories have reached their minimum word counts;
 * an article is then filed as GOOD (and its "score_modifier" is raised by
 * $this->score_modifier) when the GOOD probability exceeds the threshold and
 * is greater than the NEUTRAL probability.
 *
 * @param array $article Article data; reads "owner_uid", "title", "content",
 *                       "guid_hashed" and updates "score_modifier".
 * @return array The (possibly modified) article.
 */
function hook_article_filter($article) {
    // Minimum training volume and confidence for automatic categorization.
    $min_words_neutral = 20000;
    $min_words_good = 10000;
    $min_prob_good = 0.90;

    $owner_uid = $article["owner_uid"];

    $nbs = new NaiveBayesianStorage($owner_uid);
    $nb = new NaiveBayesianNgram($nbs);

    $categories = $nbs->getCategories();

    if (empty($categories)) return $article;

    $count_neutral = 0;
    $count_good = 0;
    $id_good = 0;
    $id_neutral = 0;

    foreach ($categories as $id => $cat) {
        if ($cat["category"] == "GOOD") {
            $id_good = $id;
            $count_good += $cat["word_count"];
        } else if ($cat["category"] == "NEUTRAL") {
            $id_neutral = $id;
            $count_neutral += $cat["word_count"];
        }
    }

    // Without a NEUTRAL category the default training target would be id 0,
    // which conflicts with the category_id foreign key on the plugin tables;
    // skip classification rather than train into it.
    if (!$id_neutral) return $article;

    $dst_category = $id_neutral;

    $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));

    if ($count_neutral >= $min_words_neutral && $count_good >= $min_words_good) {
        // enable automatic categorization

        $result = $nb->categorize($bayes_content);

        // Only trust a result that contains a probability for both categories.
        if (is_array($result) && count($result) == 2 &&
                isset($result[$id_good], $result[$id_neutral])) {

            $prob_good = $result[$id_good];
            $prob_neutral = $result[$id_neutral];

            if ($prob_good > $min_prob_good && $prob_good > $prob_neutral) {
                $dst_category = $id_good; // should we autofile as good or not? idk

                if (!isset($article["score_modifier"])) $article["score_modifier"] = 0;

                $article["score_modifier"] += $this->score_modifier;
            }
        }
    }

    $nb->train($article["guid_hashed"], $dst_category, $bayes_content);

    $nb->updateProbabilities();

    return $article;
}
255
/**
 * Deletes all training data of the logged-in user.
 *
 * Removes the user's rows from the references and wordfreqs tables inside
 * one transaction, then asks the classifier to update its probabilities.
 */
function clearDatabase() {
    $uid = $_SESSION["uid"];
    $tables = array("references", "wordfreqs");

    $this->dbh->query("BEGIN");

    foreach ($tables as $table) {
        $this->dbh->query("DELETE FROM " . $this->sql_prefix . "_" . $table .
            " WHERE owner_uid = " . $uid);
    }

    $this->dbh->query("COMMIT");

    $storage = new NaiveBayesianStorage($_SESSION["uid"]);
    $classifier = new NaiveBayesianNgram($storage);
    $classifier->updateProbabilities();
}
268
/**
 * Returns the plugin API version number this plugin declares (2);
 * presumably consulted by the plugin host when loading the plugin.
 *
 * @return int
 */
function api_version() {
    $declared_api_version = 2;

    return $declared_api_version;
}
272
273 }
274 ?>