Friendica Communications Platform (please note that this is a clone of the repository at github, issues are handled there) https://friendi.ca
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

502 lines
12 KiB

9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
  1. <?php
  2. # Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
  3. #
  4. # b8 - A Bayesian spam filter written in PHP 5
  5. #
  6. # This program is free software; you can redistribute it and/or modify it
  7. # under the terms of the GNU Lesser General Public License as published by
  8. # the Free Software Foundation in version 2.1 of the License.
  9. #
  10. # This program is distributed in the hope that it will be useful, but
  11. # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  12. # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  13. # License for more details.
  14. #
  15. # You should have received a copy of the GNU Lesser General Public License
  16. # along with this program; if not, write to the Free Software Foundation,
  17. # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  18. /**
  19. * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
  20. *
  21. * @license LGPL
  22. * @access public
  23. * @package b8
  24. * @author Tobias Leupold
  25. * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
  26. */
  class b8
  {
      /**
       * Runtime configuration, pre-filled with defaults.
       *
       * - min_size, max_size, allow_numbers: handed to the lexer on construction
       * - lexer, degenerator, storage: select the class files that get loaded
       * - use_relevant: how many of the most important tokens classify() looks at
       * - min_dev: minimum distance of a token rating from 0.5 to be counted
       * - rob_s, rob_x: parameters of the Robinson correction; rob_x is also the
       *   rating given to completely unknown tokens
       *
       * @var array
       */
      public $config = array(
          'min_size'      => 3,
          'max_size'      => 30,
          'allow_numbers' => FALSE,
          'lexer'         => 'default',
          'degenerator'   => 'default',
          'storage'       => 'dba',
          'use_relevant'  => 15,
          'min_dev'       => 0.2,
          'rob_s'         => 0.3,
          'rob_x'         => 0.5
      );

      /** @var object|null Lexer instance; stays NULL when startup failed */
      private $_lexer = NULL;

      /** @var object|null Storage backend instance; stays NULL when startup failed */
      private $_database = NULL;

      /** @var array|null Token data fetched from the storage backend by classify() */
      private $_token_data = NULL;

      const SPAM    = 'spam';
      const HAM     = 'ham';
      const LEARN   = 'learn';
      const UNLEARN = 'unlearn';

      const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
      const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
      const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';

      /**
       * Constructs b8: merges the configuration, then loads and instantiates the
       * storage backend and the lexer.
       *
       * If one of the required class files cannot be loaded the constructor
       * returns early; the failure is then reported by validate().
       *
       * The former "= array()" default of $config was dropped: an optional
       * parameter in front of a required one could never be omitted anyway and
       * is deprecated as of PHP 8.0.
       *
       * @access public
       * @param array $config          Overrides for $this->config (unknown keys,
       *                               including 'degenerator', are ignored)
       * @param mixed $database_config Passed through to the storage backend
       */
      public function __construct($config, $database_config)
      {
          # Validate config data (anything that is not iterable is ignored)
          if(is_array($config) || $config instanceof Traversable) {
              foreach($config as $name => $value) {
                  # Cast the key so that integer keys can never loosely match a name
                  switch((string) $name) {
                      case 'min_dev':
                      case 'rob_s':
                      case 'rob_x':
                          $this->config[$name] = (float) $value;
                          break;
                      case 'min_size':
                      case 'max_size':
                      case 'use_relevant':
                          $this->config[$name] = (int) $value;
                          break;
                      case 'allow_numbers':
                          $this->config[$name] = (bool) $value;
                          break;
                      case 'lexer':
                          # Fall back to the default lexer if the requested one does not exist
                          $value = (string) strtolower($value);
                          $this->config[$name] = is_file($this->_component_file('lexer', $value)) === TRUE ? $value : 'default';
                          break;
                      case 'storage':
                          $this->config[$name] = (string) $value;
                          break;
                  }
              }
          }

          # Setup the database backend

          # Get the basic storage class used by all backends
          if($this->load_class('b8_storage_base', $this->_component_file('storage', 'base')) === FALSE)
              return;

          # Get the degenerator we need
          if($this->load_class('b8_degenerator_' . $this->config['degenerator'], $this->_component_file('degenerator', $this->config['degenerator'])) === FALSE)
              return;

          # Get the actual storage backend we need
          $storage_class = 'b8_storage_' . $this->config['storage'];
          if($this->load_class($storage_class, $this->_component_file('storage', $this->config['storage'])) === FALSE)
              return;

          # Setup the backend
          $this->_database = new $storage_class(
              $database_config,
              $this->config['degenerator'],
              date('ymd')
          );

          # Setup the lexer class
          $lexer_class = 'b8_lexer_' . $this->config['lexer'];
          if($this->load_class($lexer_class, $this->_component_file('lexer', $this->config['lexer'])) === FALSE)
              return;

          $this->_lexer = new $lexer_class(
              array(
                  'min_size'      => $this->config['min_size'],
                  'max_size'      => $this->config['max_size'],
                  'allow_numbers' => $this->config['allow_numbers']
              )
          );
      }

      /**
       * Builds the path "<this dir>/<type>/<type>_<name>.php" of a component file.
       *
       * @access private
       * @param string $type Component directory and file prefix (e.g. 'lexer')
       * @param string $name Component name (e.g. 'default')
       * @return string
       */
      private function _component_file($type, $name)
      {
          return dirname(__FILE__) . DIRECTORY_SEPARATOR . $type . DIRECTORY_SEPARATOR . $type . '_' . $name . '.php';
      }

      /**
       * Load a class file if a class has not been defined yet.
       *
       * The file is resolved first, so a missing file yields FALSE instead of
       * the fatal error a bare require_once would raise.
       *
       * @access public
       * @param string $class_name Class expected to be defined by the file
       * @param string $class_file File to include
       * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
       */
      public function load_class($class_name, $class_file)
      {
          if(class_exists($class_name, FALSE) === TRUE)
              return TRUE;

          # Resolves like include/require would (include_path aware)
          if(stream_resolve_include_path($class_file) === FALSE)
              return FALSE;

          $included = require_once $class_file;

          return $included !== FALSE && class_exists($class_name, FALSE) === TRUE;
      }

      /**
       * Validates the class has all it needs to work.
       *
       * @access public
       * @return mixed Returns TRUE if everything is okay, otherwise an error code
       *               (a STARTUP_FAIL_* constant or whatever the storage
       *               backend's connect() returned).
       */
      public function validate()
      {
          if($this->_database === NULL)
              return self::STARTUP_FAIL_DATABASE;

          # Connect the database backend if we aren't connected yet
          if($this->_database->connected === FALSE) {
              $connection = $this->_database->connect();
              if($connection !== TRUE)
                  return $connection;
          }

          if($this->_lexer === NULL)
              return self::STARTUP_FAIL_LEXER;

          return TRUE;
      }

      /**
       * Classifies a text
       *
       * @access public
       * @param mixed  $uid  Passed to the storage backend with every lookup
       * @param string $text
       * @return mixed The rating between 0 (ham) and 1 (spam) as float, or the
       *               error code returned by validate() or by the lexer
       */
      public function classify($uid, $text)
      {
          # Validate the startup
          $started_up = $this->validate();
          if($started_up !== TRUE)
              return $started_up;

          # Get the internal database variables, containing the number of ham and
          # spam texts so the spam probability can be calculated in relation to them
          $internals = $this->_database->get_internals($uid);

          # Get all tokens we want to rate
          $tokens = $this->_lexer->get_tokens($text);

          # Check if the lexer failed
          # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
          if(!is_array($tokens))
              return $tokens;

          # Fetch all available data for the token set from the database
          $this->_token_data = $this->_database->get(array_keys($tokens), $uid);

          # Calculate the spamminess and importance for each token (or a degenerated form of it)
          $rating = array();
          $importance = array();
          foreach($tokens as $word => $count) {
              $rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
              $importance[$word] = abs(0.5 - $rating[$word]);
          }

          # Order by importance, most important first (keys are preserved)
          arsort($importance);

          # Get the most interesting tokens (use all if we have less than the given number).
          # A plain foreach replaces each(), which no longer exists in PHP 8.
          $relevant = array();
          $considered = 0;
          foreach($importance as $word => $deviation) {
              if($considered >= $this->config['use_relevant'])
                  break;
              $considered++;

              # If the token's rating is relevant enough, use it
              if($deviation > $this->config['min_dev']) {
                  # Tokens that appear more than once also count more than once
                  for($x = 0, $l = $tokens[$word]; $x < $l; $x++)
                      $relevant[] = $rating[$word];
              }
          }

          # If no token was good for calculation, we really don't know how
          # to rate this text; so we assume a probability of 0.5
          $n = count($relevant);
          if($n === 0)
              return 0.5;

          # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)
          $ham_product = 1.0;
          $spam_product = 1.0;
          foreach($relevant as $value) {
              $ham_product *= (1.0 - $value);
              $spam_product *= $value;
          }

          # The actual hamminess and spamminess
          $hamminess = 1 - pow($ham_product, (1 / $n));
          $spamminess = 1 - pow($spam_product, (1 / $n));

          # Calculate the combined indicator (between -1 and +1) ...
          $indicator = ($hamminess - $spamminess) / ($hamminess + $spamminess);

          # ... and map it to a value between 0 and 1
          return (1 + $indicator) / 2;
      }

      /**
       * Calculate the spamminess of a single token also considering "degenerated" versions
       *
       * Reads the data classify() stored in $this->_token_data.
       *
       * @access private
       * @param string $word
       * @param int $texts_ham  Number of learned ham texts
       * @param int $texts_spam Number of learned spam texts
       * @return float
       */
      private function _get_probability($word, $texts_ham, $texts_spam)
      {
          # The token was in the database, so we can use its data as-is
          # and calculate the spamminess of this token directly
          if(isset($this->_token_data['tokens'][$word]) === TRUE)
              return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);

          # The token is really unknown, so choose the default rating
          # for completely unknown tokens (the robX parameter)
          if(isset($this->_token_data['degenerates'][$word]) === FALSE)
              return $this->config['rob_x'];

          # We found similar words, so calculate the spamminess for each one
          # and choose the most important one for the further calculation.
          # The default rating is 0.5 simply saying nothing
          $rating = 0.5;
          foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {
              $rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);

              # Is it more important than the rating of another degenerated version?
              if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
                  $rating = $rating_tmp;
          }

          return $rating;
      }

      /**
       * Do the actual spamminess calculation of a single token
       *
       * @access private
       * @param array $data     Token data with the keys 'count_ham' and 'count_spam'
       * @param int $texts_ham  Number of learned ham texts
       * @param int $texts_spam Number of learned spam texts
       * @return float
       */
      private function _calc_probability($data, $texts_ham, $texts_spam)
      {
          # Calculate the basic probability by Mr. Graham
          # But: consider the number of ham and spam texts saved instead of the
          # number of entries where the token appeared to calculate a relative
          # spamminess because we count tokens appearing multiple times not just
          # once but as often as they appear in the learned texts
          $rel_ham = $data['count_ham'];
          $rel_spam = $data['count_spam'];

          if($texts_ham > 0)
              $rel_ham = $data['count_ham'] / $texts_ham;

          if($texts_spam > 0)
              $rel_spam = $data['count_spam'] / $texts_spam;

          # A token without any counts tells us nothing. Robinson's formula
          # reduces to rob_x in that case, so return it directly instead of
          # dividing by zero.
          if(($rel_ham + $rel_spam) == 0)
              return $this->config['rob_x'];

          $rating = $rel_spam / ($rel_ham + $rel_spam);

          # Calculate the better probability proposed by Mr. Robinson
          $all = $data['count_ham'] + $data['count_spam'];

          return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);
      }

      /**
       * Check the validity of the category of a request
       *
       * @access private
       * @param string $category
       * @return boolean TRUE if $category is b8::HAM or b8::SPAM
       */
      private function _check_category($category)
      {
          return $category === self::HAM or $category === self::SPAM;
      }

      /**
       * Learn a reference text
       *
       * @access public
       * @param string $text
       * @param string $category Either b8::SPAM or b8::HAM
       * @param mixed $uid Passed to the storage backend
       * @return mixed Result of the storage backend, or an error code
       */
      public function learn($text, $category, $uid)
      {
          return $this->_process_text($text, $category, self::LEARN, $uid);
      }

      /**
       * Unlearn a reference text
       *
       * @access public
       * @param string $text
       * @param string $category Either b8::SPAM or b8::HAM
       * @param mixed $uid Passed to the storage backend
       * @return mixed Result of the storage backend, or an error code
       */
      public function unlearn($text, $category, $uid)
      {
          return $this->_process_text($text, $category, self::UNLEARN, $uid);
      }

      /**
       * Does the actual interaction with the storage backend for learning or unlearning texts
       *
       * @access private
       * @param string $text
       * @param string $category Either b8::SPAM or b8::HAM
       * @param string $action Either b8::LEARN or b8::UNLEARN
       * @param mixed $uid Passed to the storage backend
       * @return mixed Result of the storage backend's process_text(), or the
       *               error code of validate(), b8::TRAINER_CATEGORY_FAIL or
       *               the lexer's error code
       */
      private function _process_text($text, $category, $action, $uid = 0)
      {
          # Validate the startup
          $started_up = $this->validate();
          if($started_up !== TRUE)
              return $started_up;

          # Look if the request is okay
          if($this->_check_category($category) === FALSE)
              return self::TRAINER_CATEGORY_FAIL;

          # Get all tokens from $text
          $tokens = $this->_lexer->get_tokens($text);

          # Check if the lexer failed
          # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
          if(!is_array($tokens))
              return $tokens;

          # Pass the tokens and what to do with it to the storage backend
          return $this->_database->process_text($tokens, $category, $action, $uid);
      }
  }
  367. ?>