503 lines
		
	
	
		
			No EOL
		
	
	
		
			12 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			503 lines
		
	
	
		
			No EOL
		
	
	
		
			12 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| #   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 | |
| #
 | |
| #   b8 - A Bayesian spam filter written in PHP 5
 | |
| #
 | |
| #   This program is free software; you can redistribute it and/or modify it
 | |
| #   under the terms of the GNU Lesser General Public License as published by
 | |
| #   the Free Software Foundation in version 2.1 of the License.
 | |
| #
 | |
| #   This program is distributed in the hope that it will be useful, but
 | |
| #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 | |
| #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 | |
| #   License for more details.
 | |
| #
 | |
| #   You should have received a copy of the GNU Lesser General Public License
 | |
| #   along with this program; if not, write to the Free Software Foundation,
 | |
| #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 | |
| 
 | |
| /**
 | |
|  * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 | |
|  *
 | |
|  * @license LGPL
 | |
|  * @access public
 | |
|  * @package b8
 | |
|  * @author Tobias Leupold
 | |
|  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
 | |
|  */
 | |
| 
 | |
class b8
{

	/**
	 * Effective configuration: the defaults below, overridden by the
	 * recognised keys passed to the constructor.
	 */
	public $config = array(
		'min_size'      => 3,
		'max_size'      => 30,
		'allow_numbers' => FALSE,
		'lexer'         => 'default',
		'degenerator'   => 'default',
		'storage'       => 'dba',
		'use_relevant'  => 15,
		'min_dev'       => 0.2,
		'rob_s'         => 0.3,
		'rob_x'         => 0.5
	);

	private $_lexer      = NULL;
	private $_database   = NULL;
	private $_token_data = NULL;

	const SPAM    = 'spam';
	const HAM     = 'ham';
	const LEARN   = 'learn';
	const UNLEARN = 'unlearn';

	const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
	const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
	const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';

	/**
	 * Constructs b8: merges the configuration, then loads and instantiates
	 * the storage backend and the lexer.
	 *
	 * If one of the required class files cannot be loaded, the constructor
	 * returns early and leaves the corresponding member NULL; validate()
	 * reports that as STARTUP_FAIL_DATABASE or STARTUP_FAIL_LEXER.
	 *
	 * $config never had a usable default because a required parameter
	 * follows it, so both arguments have always had to be passed.
	 *
	 * @access public
	 * @param array $config Overrides for b8::$config; unknown keys are ignored
	 * @param mixed $database_config Passed unchanged to the storage backend's constructor
	 */

	public function __construct($config, $database_config)
	{

		# Validate config data
		# (anything that is not iterable is treated like an empty config)

		if(is_array($config) or $config instanceof Traversable) {

			foreach($config as $name => $value) {

				# Cast the key so that integer keys can never loosely match
				# one of the option names below
				switch((string) $name) {

					case 'min_dev':
					case 'rob_s':
					case 'rob_x':
						$this->config[$name] = (float) $value;
						break;

					case 'min_size':
					case 'max_size':
					case 'use_relevant':
						$this->config[$name] = (int) $value;
						break;

					case 'allow_numbers':
						$this->config[$name] = (bool) $value;
						break;

					case 'lexer':
						# Fall back to the default lexer if the requested one does not exist
						$value = (string) strtolower($value);
						$this->config[$name] = is_file($this->_component_file('lexer', $value)) === TRUE ? $value : 'default';
						break;

					case 'storage':
						$this->config[$name] = (string) $value;
						break;

				}

			}

		}

		# Setup the database backend

		# Get the basic storage class used by all backends
		if($this->load_class('b8_storage_base', $this->_component_file('storage', 'base')) === FALSE)
			return;

		# Get the degenerator we need
		if($this->load_class('b8_degenerator_' . $this->config['degenerator'], $this->_component_file('degenerator', $this->config['degenerator'])) === FALSE)
			return;

		# Get the actual storage backend we need
		if($this->load_class('b8_storage_' . $this->config['storage'], $this->_component_file('storage', $this->config['storage'])) === FALSE)
			return;

		# Setup the backend
		$class = 'b8_storage_' . $this->config['storage'];
		$this->_database = new $class(
			$database_config,
			$this->config['degenerator'], date('ymd')
		);

		# Setup the lexer class

		if($this->load_class('b8_lexer_' . $this->config['lexer'], $this->_component_file('lexer', $this->config['lexer'])) === FALSE)
			return;

		$class = 'b8_lexer_' . $this->config['lexer'];
		$this->_lexer = new $class(
			array(
				'min_size' => $this->config['min_size'],
				'max_size' => $this->config['max_size'],
				'allow_numbers' => $this->config['allow_numbers']
			)
		);

	}

	/**
	 * Builds the path of a component file below this file's directory,
	 * i.e. "<dir>/<type>/<type>_<name>.php".
	 *
	 * @access private
	 * @param string $type Component directory and file prefix ("storage", "lexer", "degenerator")
	 * @param string $name Component name
	 * @return string
	 */

	private function _component_file($type, $name)
	{
		return dirname(__FILE__) . DIRECTORY_SEPARATOR . $type . DIRECTORY_SEPARATOR . $type . '_' . $name . '.php';
	}

	/**
	 * Load a class file if a class has not been defined yet.
	 *
	 * @access public
	 * @param string $class_name Name of the class the file is expected to define
	 * @param string $class_file File to include if the class is not defined yet
	 * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
	 */

	public function load_class($class_name, $class_file)
	{

		if(class_exists($class_name, FALSE) === TRUE)
			return TRUE;

		# require_once aborts the whole script when the file is missing, so
		# check first to be able to report the failure to the caller instead
		if(is_file($class_file) === FALSE and stream_resolve_include_path($class_file) === FALSE)
			return FALSE;

		$included = require_once $class_file;

		if($included === FALSE or class_exists($class_name, FALSE) === FALSE)
			return FALSE;

		return TRUE;

	}

	/**
	 * Validates the class has all it needs to work.
	 *
	 * Connects the storage backend if it reports not being connected yet.
	 *
	 * @access public
	 * @return mixed Returns TRUE if everything is okay, otherwise an error code
	 *               (a STARTUP_FAIL_* constant or whatever the backend's connect() returned).
	 */

	public function validate()
	{

		if($this->_database === NULL)
			return self::STARTUP_FAIL_DATABASE;

		# Connect the database backend if we aren't connected yet

		if($this->_database->connected === FALSE) {

			$connection = $this->_database->connect();

			if($connection !== TRUE)
				return $connection;

		}

		if($this->_lexer === NULL)
			return self::STARTUP_FAIL_LEXER;

		return TRUE;

	}

	/**
	 * Classifies a text
	 *
	 * @access public
	 * @param mixed $uid Identifier handed to the storage backend's get_internals() and get()
	 *                   (presumably selects a per-user data set -- confirm against the backend)
	 * @param string $text
	 * @return mixed The rating between 0 (ham) and 1 (spam) as float, or an
	 *               error code if the startup validation or the lexer failed
	 */

	public function classify($uid,$text)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# Get the internal database variables, containing the number of ham and
		# spam texts so the spam probability can be calculated in relation to them
		$internals = $this->_database->get_internals($uid);

		# Get all tokens we want to rate

		$tokens = $this->_lexer->get_tokens($text);

		# Check if the lexer failed
		# (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
		if(!is_array($tokens))
			return $tokens;

		# Fetch all available data for the token set from the database
		$this->_token_data = $this->_database->get(array_keys($tokens),$uid);

		# Calculate the spamminess and importance for each token (or a degenerated form of it)

		$rating     = array();
		$importance = array();

		foreach($tokens as $word => $count) {
			$rating[$word]     = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
			$importance[$word] = abs(0.5 - $rating[$word]);
		}

		# Order by importance, most important first
		arsort($importance);

		# Get the most interesting tokens (use all if we have less than the given number)

		$relevant   = array();
		$considered = 0;

		foreach($importance as $word => $deviation) {

			# Only look at the "use_relevant" most important tokens
			if($considered >= $this->config['use_relevant'])
				break;

			$considered++;

			# If the token's rating is relevant enough, use it
			if($deviation > $this->config['min_dev']) {

				# Tokens that appear more than once also count more than once
				for($x = 0, $l = $tokens[$word]; $x < $l; $x++)
					$relevant[] = $rating[$word];

			}

		}

		# Calculate the spamminess of the text (thanks to Mr. Robinson ;-)

		$n = count($relevant);

		if($n === 0) {
			# If no token was good for calculation, we really don't know how
			# to rate this text; so we assume a spam and ham probability of 0.5
			$hamminess  = 0.5;
			$spamminess = 0.5;
			$n = 1;
		}
		else {
			# Multiply up all relevant ratings
			$hamminess  = 1;
			$spamminess = 1;

			foreach($relevant as $value) {
				$hamminess  *= (1.0 - $value);
				$spamminess *= $value;
			}
		}

		# The actual hamminess and spamminess
		$hamminess  = 1 - pow($hamminess,  (1 / $n));
		$spamminess = 1 - pow($spamminess, (1 / $n));

		# Should both indicators vanish there is nothing to compare, so the
		# text is rated as undecided instead of dividing by zero
		if(($hamminess + $spamminess) == 0)
			return 0.5;

		# Calculate the combined indicator
		$probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);

		# We want a value between 0 and 1, not between -1 and +1, so ...
		return (1 + $probability) / 2;

	}

	/**
	 * Calculate the spamminess of a single token also considering "degenerated" versions
	 *
	 * Reads the token data fetched by classify() from $this->_token_data.
	 *
	 * @access private
	 * @param string $word
	 * @param int $texts_ham Number of learned ham texts
	 * @param int $texts_spam Number of learned spam texts
	 * @return float The rating of the token; rob_x if the token is completely unknown
	 */

	private function _get_probability($word, $texts_ham, $texts_spam)
	{

		# Let's see what we have!

		if(isset($this->_token_data['tokens'][$word]) === TRUE) {
			# The token was in the database, so we can use its data as-is
			# and calculate the spamminess of this token directly
			return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
		}

		# The token was not found, so do we have at least similar words?

		if(isset($this->_token_data['degenerates'][$word]) === TRUE) {

			# We found similar words, so calculate the spamminess for each one
			# and choose the most important one for the further calculation

			# The default rating is 0.5 simply saying nothing
			$rating = 0.5;

			foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {

				# Calculate the rating of the current degenerated token
				$rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);

				# Is it more important than the rating of another degenerated version?
				if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
					$rating = $rating_tmp;

			}

			return $rating;

		}

		# The token is really unknown, so choose the default rating
		# for completely unknown tokens. This strips down to the
		# robX parameter so we can skip the math
		return $this->config['rob_x'];

	}

	/**
	 * Do the actual spamminess calculation of a single token
	 *
	 * @access private
	 * @param array $data Token data with the keys "count_ham" and "count_spam"
	 * @param int $texts_ham Number of learned ham texts
	 * @param int $texts_spam Number of learned spam texts
	 * @return float
	 */

	private function _calc_probability($data, $texts_ham, $texts_spam)
	{

		# Calculate the basic probability by Mr. Graham

		# But: consider the number of ham and spam texts saved instead of the
		# number of entries where the token appeared to calculate a relative
		# spamminess because we count tokens appearing multiple times not just
		# once but as often as they appear in the learned texts

		$rel_ham  = $data['count_ham'];
		$rel_spam = $data['count_spam'];

		if($texts_ham > 0)
			$rel_ham = $data['count_ham'] / $texts_ham;

		if($texts_spam > 0)
			$rel_spam = $data['count_spam'] / $texts_spam;

		# A token without any counts carries no information; rate it like an
		# unknown token instead of dividing by zero
		$rel_sum = $rel_ham + $rel_spam;
		$rating  = ($rel_sum != 0) ? $rel_spam / $rel_sum : $this->config['rob_x'];

		# Calculate the better probability proposed by Mr. Robinson
		$all    = $data['count_ham'] + $data['count_spam'];
		$weight = $this->config['rob_s'] + $all;

		if($weight == 0)
			return $this->config['rob_x'];

		return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / $weight;

	}

	/**
	 * Check the validity of the category of a request
	 *
	 * @access private
	 * @param string $category
	 * @return boolean TRUE if $category is b8::HAM or b8::SPAM
	 */

	private function _check_category($category)
	{
		return ($category === self::HAM || $category === self::SPAM);
	}

	/**
	 * Learn a reference text
	 *
	 * @access public
	 * @param string $text
	 * @param const $category Either b8::SPAM or b8::HAM
	 * @param mixed $uid Identifier handed to the storage backend
	 * @return mixed The result of the storage backend's process_text(), or an error code
	 */

	public function learn($text, $category, $uid)
	{
		return $this->_process_text($text, $category, self::LEARN, $uid);
	}

	/**
	 * Unlearn a reference text
	 *
	 * @access public
	 * @param string $text
	 * @param const $category Either b8::SPAM or b8::HAM
	 * @param mixed $uid Identifier handed to the storage backend
	 * @return mixed The result of the storage backend's process_text(), or an error code
	 */

	public function unlearn($text, $category, $uid)
	{
		return $this->_process_text($text, $category, self::UNLEARN, $uid);
	}

	/**
	 * Does the actual interaction with the storage backend for learning or unlearning texts
	 *
	 * @access private
	 * @param string $text
	 * @param const $category Either b8::SPAM or b8::HAM
	 * @param const $action Either b8::LEARN or b8::UNLEARN
	 * @param mixed $uid Identifier handed to the storage backend
	 * @return mixed The result of the storage backend's process_text(), or an error code
	 *               (startup failure, TRAINER_CATEGORY_FAIL or a lexer error code)
	 */

	private function _process_text($text, $category, $action, $uid = 0)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# Look if the request is okay
		if($this->_check_category($category) === FALSE)
			return self::TRAINER_CATEGORY_FAIL;

		# Get all tokens from $text

		$tokens = $this->_lexer->get_tokens($text);

		# Check if the lexer failed
		# (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
		if(!is_array($tokens))
			return $tokens;

		# Pass the tokens and what to do with it to the storage backend
		return $this->_database->process_text($tokens, $category, $action, $uid);

	}

}
 | |
| 
 | |
| ?>
 |