plugins/af_sort_bayes/lib/class.naivebayesian.php

   1 <?php
   2         /*
   3          ***** BEGIN LICENSE BLOCK *****
   4          This file is part of PHP Naive Bayesian Filter.
   5
   6          The Initial Developer of the Original Code is
   7          Loic d'Anterroches [loic_at_xhtml.net].
   8          Portions created by the Initial Developer are Copyright (C) 2003
   9          the Initial Developer. All Rights Reserved.
  10
  11          Contributor(s):
  12          See the source
  13
  14          PHP Naive Bayesian Filter is free software; you can redistribute it
  15          and/or modify it under the terms of the GNU General Public License as
  16          published by the Free Software Foundation; either version 2 of
  17          the License, or (at your option) any later version.
  18
  19          PHP Naive Bayesian Filter is distributed in the hope that it will
  20          be useful, but WITHOUT ANY WARRANTY; without even the implied
  21          warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  22          See the GNU General Public License for more details.
  23
  24          You should have received a copy of the GNU General Public License
  25          along with Foobar; if not, write to the Free Software
  26          Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  27
  28          Alternatively, the contents of this file may be used under the terms of
  29          the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  30          in which case the provisions of the LGPL are applicable instead
  31          of those above.
  32
  33          ***** END LICENSE BLOCK *****
  34          */
  35
  36         class NaiveBayesian {
  37                 /** min token length for it to be taken into consideration */
  38                 var $min_token_length = 3;
  39                 /** max token length for it to be taken into consideration */
  40                 var $max_token_length = 15;
  41                 /** list of token to ignore
  42                  @see getIgnoreList()
  43                  */
  44                 var $ignore_list = array();
  45                 /** storage object
  46                  @see class NaiveBayesianStorage
  47                  */
  48                 var $nbs = null;
  49
  50                 function NaiveBayesian($nbs) {
  51                         $this->nbs = $nbs;
  52
  53                         return true;
  54                 }
  55
  56                 /** categorize a document.
  57                  Get list of categories in which the document can be categorized
  58                  with a score for each category.
  59
  60                  @return array keys = category ids, values = scores
  61                  @param string document
  62                  */
  63                 function categorize($document) {
  64                         $scores = array();
  65                         $categories = $this->nbs->getCategories();
  66                         $tokens = $this->_getTokens($document);
  67
  68                         // calculate the score in each category
  69                         $total_words = 0;
  70                         $ncat = 0;
  71
  72                         while (list($category, $data) = each($categories)) {
  73                                 $total_words += $data['word_count'];
  74                                 $ncat++;
  75                         }
  76
  77                         reset($categories);
  78
  79                         while (list($category, $data) = each($categories)) {
  80                                 $scores[$category] = $data['probability'];
  81                                 // small probability for a word not in the category
  82                                 // maybe putting 1.0 as a 'no effect' word can also be good
  83
  84                                 if ($data['word_count'] > 0)
  85                                         $small_proba = 1.0 / ($data['word_count'] * 2);
  86                                 else
  87                                         $small_proba = 0;
  88
  89                                 reset($tokens);
  90
  91                                 while (list($token, $count) = each($tokens)) {
  92
  93                                         if ($this->nbs->wordExists($token)) {
  94                                                 $word = $this->nbs->getWord($token, $category);
  95
  96                                                 if ($word['count']) {
  97                                                         $proba = $word['count'] / $data['word_count'];
  98                                                 }
  99                                                 else {
 100                                                         $proba = $small_proba;
 101                                                 }
 102
 103                                                 $scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
 104                                                 // pow($total_words/$ncat, $count) is here to avoid underflow.
 105
 106                                         }
 107                                 }
 108                         }
 109
 110                         return $this->_rescale($scores);
 111                 }
 112
 113                 /** training against a document.
 114                  Set a document as being in a specific category. The document becomes a reference
 115                  and is saved in the table of references. After a set of training is done
 116                  the updateProbabilities() function must be run.
 117
 118                  @see updateProbabilities()
 119                  @see untrain()
 120                  @return bool success
 121                  @param string document id, must be unique
 122                  @param string category_id the category id in which the document should be
 123                  @param string content of the document
 124                  */
 125                 function train($doc_id, $category_id, $content) {
 126                         $ret = false;
 127
 128
 129                         // if this doc_id already trained, no trained
 130                         if (!$this->nbs->getReference($doc_id, false)) {
 131
 132                                 $tokens = $this->_getTokens($content);
 133
 134                                 while (list($token, $count) = each($tokens)) {
 135                                         $this->nbs->updateWord($token, $count, $category_id);
 136                                 }
 137
 138                                 $this->nbs->saveReference($doc_id, $category_id, $content);
 139
 140                                 $ret = true;
 141                         }
 142                         else {
 143                                 $ret = false;
 144                         }
 145
 146                         return $ret;
 147                 }
 148
 149                 /** untraining of a document.
 150                  To remove just one document from the references.
 151
 152                  @see updateProbabilities()
 153                  @see untrain()
 154                  @return bool success
 155                  @param string document id, must be unique
 156                  */
 157                 function untrain($doc_id) {
 158                         $ref = $this->nbs->getReference($doc_id);
 159
 160                         if (isset($ref['content'])) {
 161
 162                                 $tokens = $this->_getTokens($ref['content']);
 163
 164                                 while (list($token, $count) = each($tokens)) {
 165                                         $this->nbs->removeWord($token, $count, $ref['category_id']);
 166                                 }
 167
 168                                 $this->nbs->removeReference($doc_id);
 169
 170                                 return true;
 171                         } else {
 172                                 return false;
 173                         }
 174                 }
 175
 176                 /** rescale the results between 0 and 1.
 177
 178                  @author Ken Williams, ken@mathforum.org
 179                  @see categorize()
 180                  @return array normalized scores (keys => category, values => scores)
 181                  @param array scores (keys => category, values => scores)
 182                  */
 183                 function _rescale($scores) {
 184                         // Scale everything back to a reasonable area in
 185                         // logspace (near zero), un-loggify, and normalize
 186                         $total = 0.0;
 187                         $max = 0.0;
 188                         reset($scores);
 189
 190                         while (list($cat, $score) = each($scores)) {
 191                                 if ($score >= $max)
 192                                         $max = $score;
 193                         }
 194
 195                         reset($scores);
 196                         while (list($cat, $score) = each($scores)) {
 197                                 $scores[$cat] = (float) exp($score - $max);
 198                                 $total += (float) pow($scores[$cat], 2);
 199                         }
 200
 201                         $total = (float) sqrt($total);
 202
 203                         reset($scores);
 204                         while (list($cat, $score) = each($scores)) {
 205                                 $scores[$cat] = (float) $scores[$cat] / $total;
 206                         }
 207                         reset($scores);
 208
 209                         return $scores;
 210                 }
 211
 212                 /** update the probabilities of the categories and word count.
 213                  This function must be run after a set of training
 214
 215                  @see train()
 216                  @see untrain()
 217                  @return bool sucess
 218                  */
 219                 function updateProbabilities() {
 220                         // this function is really only database manipulation
 221                         // that is why all is done in the NaiveBayesianStorage
 222                         return $this->nbs->updateProbabilities();
 223                 }
 224
 225                 /** Get the list of token to ignore.
 226                  @return array ignore list
 227                  */
 228                 function getIgnoreList() {
 229                         //return array('the', 'that', 'you', 'for', 'and');
 230
 231                         // https://en.wikipedia.org/wiki/Most_common_words_in_English
 232                         return array('the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I', 'it', 'for', 'not', 'on', 'with',
 233                                 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her',
 234                                 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up',
 235                                 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time',
 236                                 'no', 'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could',
 237                                 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think',
 238                                 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even',
 239                                 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us', 'read', 'more');
 240
 241                 }
 242
 243                 /** get the tokens from a string
 244
 245                  @author James Seng. [http://james.seng.cc/] (based on his perl version)
 246
 247                  @return array tokens
 248                  @param  string the string to get the tokens from
 249                  */
 250                 function _getTokens($string) {
 251                         $rawtokens = array();
 252                         $tokens = array();
 253                         //$string = $this->_cleanString($string);
 254
 255                         if (count(0 >= $this->ignore_list)) {
 256                                 $this->ignore_list = $this->getIgnoreList();
 257                         }
 258
 259                         $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);
 260
 261                         // remove some tokens
 262                         while (list(, $token) = each($rawtokens)) {
 263                                 $token = trim($token);
 264                                 if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
 265                                         $tokens[$token]++;
 266                                 }
 267                         }
 268
 269                         return $tokens;
 270                 }
 271
 272                 /** clean a string from the diacritics
 273
 274                  @author Antoine Bajolet [phpdig_at_toiletoine.net]
 275                  @author SPIP [http://uzine.net/spip/]
 276
 277                  @return string clean string
 278                  @param  string string with accents
 279                  */
 280                 function _cleanString($string) {
 281                         $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
 282                                 /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
 283                                 /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
 284                                 /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
 285                                 /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
 286                                 /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
 287                                 /* Cc */ chr(199) . chr(231) .
 288                                 /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
 289                                 /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
 290                                 /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
 291                                 /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
 292                                 /* yNn */ chr(255) . chr(209) . chr(241);
 293
 294                         return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
 295                 }
 296
 297         }