plugins/af_sort_bayes/lib/class.naivebayesian.php

   1 <?php
   2         /*
   3          ***** BEGIN LICENSE BLOCK *****
   4          This file is part of PHP Naive Bayesian Filter.
   5
   6          The Initial Developer of the Original Code is
   7          Loic d'Anterroches [loic_at_xhtml.net].
   8          Portions created by the Initial Developer are Copyright (C) 2003
   9          the Initial Developer. All Rights Reserved.
  10
  11          Contributor(s):
  12          See the source
  13
  14          PHP Naive Bayesian Filter is free software; you can redistribute it
  15          and/or modify it under the terms of the GNU General Public License as
  16          published by the Free Software Foundation; either version 2 of
  17          the License, or (at your option) any later version.
  18
  19          PHP Naive Bayesian Filter is distributed in the hope that it will
  20          be useful, but WITHOUT ANY WARRANTY; without even the implied
  21          warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  22          See the GNU General Public License for more details.
  23
  24          You should have received a copy of the GNU General Public License
  25          along with Foobar; if not, write to the Free Software
  26          Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  27
  28          Alternatively, the contents of this file may be used under the terms of
  29          the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  30          in which case the provisions of the LGPL are applicable instead
  31          of those above.
  32
  33          ***** END LICENSE BLOCK *****
  34          */
  35
  /**
   * Naive Bayesian text classifier.
   *
   * Splits documents into tokens, keeps per-category token counts in a
   * storage object and scores new documents against every known category.
   * All persistence is delegated to the storage object handed to the
   * constructor; this class only calls the following methods on it:
   * getCategories(), wordExists(), getWord(), updateWord(), removeWord(),
   * getReference(), saveReference(), removeReference() and
   * updateProbabilities().
   */
  class NaiveBayesian {
      /** min token length for it to be taken into consideration */
      public $min_token_length = 3;
      /** max token length for it to be taken into consideration */
      public $max_token_length = 15;
      /** list of tokens to ignore; filled from getIgnoreList() while it is empty
       @see getIgnoreList()
       */
      public $ignore_list = array();
      /** storage object
       @see class NaiveBayesianStorage
       */
      public $nbs = null;

      /** create a classifier bound to a storage object.

       @param object nbs storage object (see the class description for the methods used)
       */
      public function __construct($nbs) {
          $this->nbs = $nbs;
      }

      /** legacy PHP 4 style constructor.
       PHP 8 no longer treats a method named after the class as a constructor,
       so the real work is done by __construct(). This method is kept only so
       that existing explicit calls (e.g. parent::NaiveBayesian($nbs)) keep
       working.

       @return bool always true
       @param object nbs storage object
       */
      public function NaiveBayesian($nbs) {
          $this->nbs = $nbs;

          return true;
      }

      /** categorize a document.
       Get list of categories in which the document can be categorized
       with a score for each category.

       @return array keys = category ids, values = scores
       @param string document
       */
      public function categorize($document) {
          $scores = array();
          $categories = $this->nbs->getCategories();
          $tokens = $this->_getTokens($document);

          // total number of words and number of categories, used for scaling
          $total_words = 0;
          $ncat = 0;

          foreach ($categories as $data) {
              $total_words += $data['word_count'];
              $ncat++;
          }

          if (0 === $ncat) {
              // nothing to score against
              return $this->_rescale($scores);
          }

          // Whether a token is known to the storage does not depend on the
          // category, so ask once per token instead of once per category.
          $known_tokens = array();

          foreach ($tokens as $token => $count) {
              if ($this->nbs->wordExists($token)) {
                  $known_tokens[$token] = $count;
              }
          }

          // multiplying by this factor for every token is here to avoid underflow
          $scale = $total_words / $ncat;

          foreach ($categories as $category => $data) {
              $scores[$category] = $data['probability'];
              $word_count = $data['word_count'];

              if ($word_count <= 0) {
                  // A category without any trained word cannot be weighted by
                  // tokens (the ratios below would divide by zero); its score
                  // stays at the stored category probability.
                  continue;
              }

              // small probability for a word not in the category
              // maybe putting 1.0 as a 'no effect' word can also be good
              $small_proba = 1.0 / ($word_count * 2);

              foreach ($known_tokens as $token => $count) {
                  $word = $this->nbs->getWord($token, $category);

                  if (!empty($word['count'])) {
                      $proba = $word['count'] / $word_count;
                  } else {
                      $proba = $small_proba;
                  }

                  $scores[$category] *= pow($proba, $count) * pow($scale, $count);
              }
          }

          return $this->_rescale($scores);
      }

      /** training against a document.
       Set a document as being in a specific category. The document becomes a reference
       and is saved in the table of references. After a set of training is done
       the updateProbabilities() function must be run.

       @see updateProbabilities()
       @see untrain()
       @return bool true when the document was trained, false when a reference
               with this id already exists (the document is then left untouched)
       @param string document id, must be unique
       @param string category_id the category id in which the document should be
       @param string content of the document
       */
      public function train($doc_id, $category_id, $content) {
          // a document id that is already referenced is never trained twice
          if ($this->nbs->getReference($doc_id, false)) {
              return false;
          }

          foreach ($this->_getTokens($content) as $token => $count) {
              $this->nbs->updateWord($token, $count, $category_id);
          }

          $this->nbs->saveReference($doc_id, $category_id, $content);

          return true;
      }

      /** untraining of a document.
       To remove just one document from the references. The stored content of
       the reference is tokenized again and every token count is removed from
       the category the reference was trained in.

       @see updateProbabilities()
       @see train()
       @return bool true when the reference was found and removed, false otherwise
       @param string document id, must be unique
       */
      public function untrain($doc_id) {
          $ref = $this->nbs->getReference($doc_id);

          if (!isset($ref['content'])) {
              return false;
          }

          foreach ($this->_getTokens($ref['content']) as $token => $count) {
              $this->nbs->removeWord($token, $count, $ref['category_id']);
          }

          $this->nbs->removeReference($doc_id);

          return true;
      }

      /** rescale the results between 0 and 1.
       Every score is replaced by exp(score - max) and the result is divided
       by the Euclidean norm of all rescaled scores.

       NOTE(review): the original description of this routine speaks of scores
       in log space, while categorize() passes plain products of probabilities.
       The arithmetic is intentionally left unchanged so that the scores
       callers compare against stay the same - confirm before altering it.

       @author Ken Williams, ken@mathforum.org
       @see categorize()
       @return array normalized scores (keys => category, values => scores)
       @param array scores (keys => category, values => scores)
       */
      public function _rescale($scores) {
          // largest score, never below 0.0
          $max = 0.0;

          foreach ($scores as $score) {
              if ($score >= $max) {
                  $max = $score;
              }
          }

          // shift by the maximum, exponentiate and accumulate the squared norm
          $total = 0.0;

          foreach ($scores as $cat => $score) {
              $scores[$cat] = (float) exp($score - $max);
              $total += (float) pow($scores[$cat], 2);
          }

          $total = (float) sqrt($total);

          // the category holding the maximum contributes exp(0) = 1, so the
          // norm is at least 1 whenever the scores are finite
          foreach ($scores as $cat => $score) {
              $scores[$cat] = (float) $score / $total;
          }

          return $scores;
      }

      /** update the probabilities of the categories and word count.
       This function must be run after a set of training

       @see train()
       @see untrain()
       @return bool success
       */
      public function updateProbabilities() {
          // this function is really only database manipulation
          // that is why all is done in the NaiveBayesianStorage
          return $this->nbs->updateProbabilities();
      }

      /** Get the default list of tokens to ignore.
       @return array ignore list
       */
      public function getIgnoreList() {
          return array('the', 'that', 'you', 'for', 'and');
      }

      /** get the tokens from a string.
       The string is split on parentheses, comma, colon, dot, semicolon and
       whitespace. A token is dropped when it is empty, contains '&', is
       shorter than $min_token_length or longer than $max_token_length, is
       made of digits only, or is part of the ignore list. _cleanString() is
       not applied, so tokens keep their case and accents.

       @author James Seng. [http://james.seng.cc/] (based on his perl version)

       @return array keys = tokens, values = number of occurrences
       @param  string the string to get the tokens from
       */
      public function _getTokens($string) {
          $tokens = array();

          // Load the default list only while no list is set, so that a list
          // assigned by the caller is respected.
          if (empty($this->ignore_list)) {
              $this->ignore_list = $this->getIgnoreList();
          }

          $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", (string) $string, -1, PREG_SPLIT_NO_EMPTY);

          if (false === $rawtokens) {
              // the split failed: there is nothing to count
              return $tokens;
          }

          foreach ($rawtokens as $token) {
              $token = trim($token);

              if ('' === $token || mb_strpos($token, "&") !== false) {
                  continue;
              }

              $length = mb_strlen($token);

              if ($length < $this->min_token_length || $length > $this->max_token_length) {
                  continue;
              }

              // tokens are always strings: compare strictly so that numeric
              // looking strings are not matched through type juggling
              if (preg_match('/^[0-9]+$/', $token) || in_array($token, $this->ignore_list, true)) {
                  continue;
              }

              // explicit initialisation: incrementing a missing key raises a warning
              $tokens[$token] = isset($tokens[$token]) ? $tokens[$token] + 1 : 1;
          }

          return $tokens;
      }

      /** clean a string from the diacritics.
       Works byte by byte: every byte listed in $diac is replaced by the ASCII
       letter at the same position of the replacement string, then the result
       is lower-cased with strtolower().
       NOTE(review): the byte values look like ISO-8859-1 accented letters, so
       this presumably expects single-byte input and would not handle UTF-8
       text correctly - confirm before enabling it in _getTokens().

       @author Antoine Bajolet [phpdig_at_toiletoine.net]
       @author SPIP [http://uzine.net/spip/]

       @return string clean string
       @param  string string with accents
       */
      public function _cleanString($string) {
          $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
              /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
              /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
              /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
              /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
              /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
              /* Cc */ chr(199) . chr(231) .
              /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
              /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
              /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
              /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
              /* yNn */ chr(255) . chr(209) . chr(241);

          return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
      }

  }