@ -0,0 +1,503 @@ | |||
<?php | |||
# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de> | |||
# | |||
# b8 - A Bayesian spam filter written in PHP 5 | |||
# | |||
# This program is free software; you can redistribute it and/or modify it | |||
# under the terms of the GNU Lesser General Public License as published by | |||
# the Free Software Foundation in version 2.1 of the License. | |||
# | |||
# This program is distributed in the hope that it will be useful, but | |||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |||
# License for more details. | |||
# | |||
# You should have received a copy of the GNU Lesser General Public License | |||
# along with this program; if not, write to the Free Software Foundation, | |||
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | |||
/** | |||
* Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de> | |||
* | |||
* @license LGPL | |||
* @access public | |||
* @package b8 | |||
* @author Tobias Leupold | |||
* @author Oliver Lillie (aka buggedcom) (original PHP 5 port) | |||
*/ | |||
class b8 | |||
{ | |||
public $config = array( | |||
'min_size' => 3, | |||
'max_size' => 30, | |||
'allow_numbers' => FALSE, | |||
'lexer' => 'default', | |||
'degenerator' => 'default', | |||
'storage' => 'dba', | |||
'use_relevant' => 15, | |||
'min_dev' => 0.2, | |||
'rob_s' => 0.3, | |||
'rob_x' => 0.5 | |||
); | |||
private $_lexer = NULL; | |||
private $_database = NULL; | |||
private $_token_data = NULL; | |||
const SPAM = 'spam'; | |||
const HAM = 'ham'; | |||
const LEARN = 'learn'; | |||
const UNLEARN = 'unlearn'; | |||
const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE'; | |||
const STARTUP_FAIL_LEXER = 'STARTUP_FAIL_LEXER'; | |||
const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL'; | |||
/**
 * Constructs b8.
 *
 * Recognised keys of $config are cast to their expected type and copied over
 * the defaults in $this->config; unrecognised keys are ignored. Afterwards the
 * storage backend and the lexer are instantiated. If one of the required
 * classes cannot be loaded, the constructor returns early and leaves the
 * corresponding member NULL, which validate() reports as an error code.
 *
 * @access public
 * @param array $config b8 settings overriding the defaults (optional)
 * @param array $database_config Settings passed on to the storage backend (optional)
 * @return void
 */
public function __construct($config = array(), $database_config = array())
{
    # Both parameters are optional now: an optional parameter in front of a
    # required one is deprecated in newer PHP versions.
    $base_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR;

    # Validate config data
    if(count($config) > 0) {
        foreach($config as $name => $value) {
            switch($name) {
                case 'min_dev':
                case 'rob_s':
                case 'rob_x':
                    $this->config[$name] = (float) $value;
                    break;
                case 'min_size':
                case 'max_size':
                case 'use_relevant':
                    $this->config[$name] = (int) $value;
                    break;
                case 'allow_numbers':
                    $this->config[$name] = (bool) $value;
                    break;
                case 'lexer':
                case 'degenerator':
                    # Only accept a component whose class file exists in the
                    # directory of the same name; otherwise use the default one
                    $value = (string) strtolower($value);
                    $component_file = $base_dir . $name . DIRECTORY_SEPARATOR . $name . '_' . $value . '.php';
                    $this->config[$name] = is_file($component_file) === TRUE ? $value : 'default';
                    break;
                case 'storage':
                    $this->config[$name] = (string) $value;
                    break;
            }
        }
    }

    # Setup the database backend

    # Get the basic storage class used by all backends
    if($this->load_class('b8_storage_base', $base_dir . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)
        return;

    # Get the degenerator we need
    $degenerator = $this->config['degenerator'];
    if($this->load_class('b8_degenerator_' . $degenerator, $base_dir . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $degenerator . '.php') === FALSE)
        return;

    # Get the actual storage backend we need
    $storage_class = 'b8_storage_' . $this->config['storage'];
    if($this->load_class($storage_class, $base_dir . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)
        return;

    # Setup the backend; it receives the degenerator name and today's date (ymd)
    $this->_database = new $storage_class(
        $database_config,
        $degenerator,
        date('ymd')
    );

    # Setup the lexer class
    $lexer_class = 'b8_lexer_' . $this->config['lexer'];
    if($this->load_class($lexer_class, $base_dir . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)
        return;

    $this->_lexer = new $lexer_class(
        array(
            'min_size'      => $this->config['min_size'],
            'max_size'      => $this->config['max_size'],
            'allow_numbers' => $this->config['allow_numbers']
        )
    );
}
/**
 * Load a class file if a class has not been defined yet.
 *
 * @access public
 * @param string $class_name Name of the class the file is expected to define
 * @param string $class_file Path of the file to include
 * @return boolean Returns TRUE if the class is defined afterwards, otherwise FALSE.
 */
public function load_class($class_name, $class_file)
{
    # Nothing to do if the class is already known (autoloading is not triggered)
    if(class_exists($class_name, FALSE) === TRUE)
        return TRUE;

    # require_once aborts with a fatal error when the file is missing instead
    # of returning FALSE, so make sure the file can be found before including it
    if(function_exists('stream_resolve_include_path') === TRUE)
        $resolved = stream_resolve_include_path($class_file);
    else
        $resolved = is_file($class_file);

    if($resolved === FALSE)
        return FALSE;

    $included = require_once $class_file;

    if($included === FALSE)
        return FALSE;

    return class_exists($class_name, FALSE);
}
/**
 * Checks that a storage backend and a lexer are available, connecting the
 * backend first if it is not connected yet.
 *
 * @access public
 * @return mixed Returns TRUE if everything is okay, otherwise an error code
 *               (a STARTUP_FAIL_* constant or whatever the backend's connect() returned).
 */
public function validate()
{
    $backend = $this->_database;

    # Without a backend nothing else can work
    if($backend === NULL)
        return self::STARTUP_FAIL_DATABASE;

    # Connect the database backend if we aren't connected yet
    if($backend->connected === FALSE) {
        $status = $backend->connect();
        if($status !== TRUE)
            return $status;
    }

    return $this->_lexer === NULL ? self::STARTUP_FAIL_LEXER : TRUE;
}
/**
 * Classifies a text
 *
 * @access public
 * @param string $text
 * @return mixed The rating as a float between 0 (ham) and 1 (spam), or an
 *               error code if the startup check, the lexer or the storage
 *               backend failed.
 */
public function classify($text)
{
    # Validate the startup
    $started_up = $this->validate();
    if($started_up !== TRUE)
        return $started_up;

    # Get the internal database variables, containing the number of ham and
    # spam texts so the spam probability can be calculated in relation to them
    $internals = $this->_database->get_internals();

    # Get all tokens we want to rate
    $tokens = $this->_lexer->get_tokens($text);

    # Check if the lexer failed
    # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
    if(!is_array($tokens))
        return $tokens;

    # Fetch all available data for the token set from the database
    $this->_token_data = $this->_database->get(array_keys($tokens));

    # The backend hands back an error code instead of an array if it failed
    if(!is_array($this->_token_data))
        return $this->_token_data;

    # Calculate the spamminess and importance for each token (or a degenerated form of it)
    $rating = array();
    $importance = array();

    foreach($tokens as $word => $count) {
        $rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
        $importance[$word] = abs(0.5 - $rating[$word]);
    }

    # Order by importance, most important first (keys are kept)
    arsort($importance);

    # Look at the most interesting tokens only (all of them if there are fewer
    # than "use_relevant"); a negative setting is treated like 0
    $limit = max(0, (int) $this->config['use_relevant']);
    $relevant = array();

    foreach(array_slice($importance, 0, $limit, TRUE) as $word => $deviation) {
        # Skip tokens whose rating is too close to the neutral 0.5
        if($deviation > $this->config['min_dev']) {
            # Tokens that appear more than once also count more than once
            for($x = 0, $l = $tokens[$word]; $x < $l; $x++)
                $relevant[] = $rating[$word];
        }
    }

    # If no token was good for calculation, we really don't know how
    # to rate this text, so the result is the neutral 0.5
    $n = count($relevant);
    if($n === 0)
        return 0.5;

    # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)
    # Both products start at 1 for the first multiplication
    $hamminess = 1;
    $spamminess = 1;

    foreach($relevant as $value) {
        $hamminess *= (1.0 - $value);
        $spamminess *= $value;
    }

    # The actual hamminess and spamminess
    $hamminess = 1 - pow($hamminess, (1 / $n));
    $spamminess = 1 - pow($spamminess, (1 / $n));

    # Calculate the combined indicator (between -1 and +1) ...
    $probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);

    # ... and map it to a value between 0 and 1
    return (1 + $probability) / 2;
}
/**
 * Rates a single token, falling back to its "degenerated" versions or to the
 * rob_x default if the token itself is not in the fetched token data.
 *
 * @access private
 * @param string $word
 * @param string $texts_ham
 * @param string $texts_spam
 * @return float The rating of the token
 */
private function _get_probability($word, $texts_ham, $texts_spam)
{
    # Best case: the token itself was in the database, so rate it directly
    if(isset($this->_token_data['tokens'][$word]) === TRUE)
        return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);

    # Neither the token nor a similar one is known: use the default
    # rating for completely unknown tokens
    if(isset($this->_token_data['degenerates'][$word]) === FALSE)
        return $this->config['rob_x'];

    # Similar tokens were found: rate each one and keep the rating that is
    # farthest away from the neutral 0.5 (which is also the starting point)
    $best = 0.5;

    foreach($this->_token_data['degenerates'][$word] as $counts) {
        $candidate = $this->_calc_probability($counts, $texts_ham, $texts_spam);
        if(abs(0.5 - $candidate) > abs(0.5 - $best))
            $best = $candidate;
    }

    return $best;
}
/**
 * Do the actual spamminess calculation of a single token
 *
 * @access private
 * @param array $data Token data with the keys 'count_ham' and 'count_spam'
 * @param string $texts_ham Number of learned ham texts
 * @param string $texts_spam Number of learned spam texts
 * @return float The rating of the token
 */
private function _calc_probability($data, $texts_ham, $texts_spam)
{
    # Calculate the basic probability by Mr. Graham
    # But: consider the number of ham and spam texts saved instead of the
    # number of entries where the token appeared to calculate a relative
    # spamminess because we count tokens appearing multiple times not just
    # once but as often as they appear in the learned texts
    $count_ham = $data['count_ham'];
    $count_spam = $data['count_spam'];

    # The raw count is used as-is while no text of that category was learned
    $rel_ham = $texts_ham > 0 ? $count_ham / $texts_ham : $count_ham;
    $rel_spam = $texts_spam > 0 ? $count_spam / $texts_spam : $count_spam;

    # A token without any recorded occurrence would cause a division by zero
    # below. It carries no evidence, so it gets the rating of an unknown
    # token, which is also what Robinson's formula yields for zero occurrences.
    if(($rel_ham + $rel_spam) == 0)
        return $this->config['rob_x'];

    $rating = $rel_spam / ($rel_ham + $rel_spam);

    # Calculate the better probability proposed by Mr. Robinson
    $all = $count_ham + $count_spam;

    return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);
}
/**
 * Tells whether a category is one of the two categories b8 knows
 *
 * @access private
 * @param string $category
 * @return boolean TRUE if $category is identical to b8::HAM or b8::SPAM, otherwise FALSE
 */
private function _check_category($category)
{
    # Strict comparison, so only the exact constants are accepted
    return in_array($category, array(self::HAM, self::SPAM), TRUE);
}
/**
 * Learn a reference text
 *
 * @access public
 * @param string $text
 * @param const $category Either b8::SPAM or b8::HAM
 * @return mixed Whatever _process_text() returns for the LEARN action
 */
public function learn($text, $category)
{
    $outcome = $this->_process_text($text, $category, self::LEARN);

    return $outcome;
}
/**
 * Unlearn a reference text
 *
 * @access public
 * @param string $text
 * @param const $category Either b8::SPAM or b8::HAM
 * @return mixed Whatever _process_text() returns for the UNLEARN action
 */
public function unlearn($text, $category)
{
    $outcome = $this->_process_text($text, $category, self::UNLEARN);

    return $outcome;
}
/**
 * Tokenizes a text and hands the tokens to the storage backend for learning or unlearning
 *
 * @access private
 * @param string $text
 * @param const $category Either b8::SPAM or b8::HAM
 * @param const $action Either b8::LEARN or b8::UNLEARN
 * @return mixed An error code if the startup check, the category check or the
 *               lexer failed, otherwise the return value of the backend's process_text()
 */
private function _process_text($text, $category, $action)
{
    # Nothing works without a connected backend and a lexer
    $status = $this->validate();
    if($status !== TRUE)
        return $status;

    # Refuse anything that is neither ham nor spam
    if(!$this->_check_category($category))
        return self::TRAINER_CATEGORY_FAIL;

    # Split the text into tokens; on failure the lexer returns
    # an error code instead of an array
    $tokens = $this->_lexer->get_tokens($text);
    if(is_array($tokens) === FALSE)
        return $tokens;

    # The storage backend does the actual bookkeeping
    return $this->_database->process_text($tokens, $category, $action);
}
} | |||
?> |
@ -0,0 +1,395 @@ | |||
<?php | |||
# Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de> | |||
# | |||
# This file is part of the b8 package | |||
# | |||
# This program is free software; you can redistribute it and/or modify it | |||
# under the terms of the GNU Lesser General Public License as published by | |||
# the Free Software Foundation in version 2.1 of the License. | |||
# | |||
# This program is distributed in the hope that it will be useful, but | |||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |||
# License for more details. | |||
# | |||
# You should have received a copy of the GNU Lesser General Public License | |||
# along with this program; if not, write to the Free Software Foundation, | |||
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | |||
/** | |||
* Functions used by all storage backends | |||
* Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de> | |||
* | |||
* @license LGPL | |||
* @access public | |||
* @package b8 | |||
* @author Tobias Leupold | |||
*/ | |||
abstract class b8_storage_base | |||
{ | |||
public $connected = FALSE; | |||
protected $_degenerator = NULL; | |||
const INTERNALS_TEXTS_HAM = 'bayes*texts.ham'; | |||
const INTERNALS_TEXTS_SPAM = 'bayes*texts.spam'; | |||
const INTERNALS_DBVERSION = 'bayes*dbversion'; | |||
const BACKEND_NOT_CONNECTED = 'BACKEND_NOT_CONNECTED'; | |||
const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION'; | |||
const DATABASE_NOT_B8 = 'DATABASE_NOT_B8'; | |||
/**
 * Creates the degenerator and checks that the backend is connected.
 *
 * @access protected
 * @return mixed Returns TRUE if the backend is connected, otherwise BACKEND_NOT_CONNECTED.
 */
protected function validate()
{
    # The degenerator is created here rather than in the constructor of each
    # storage backend, so the backends don't have to duplicate this code.
    $degenerator_class = 'b8_degenerator_' . $this->b8_config['degenerator'];
    $this->_degenerator = new $degenerator_class();

    return $this->connected === TRUE ? TRUE : self::BACKEND_NOT_CONNECTED;
}
/**
 * Checks if a b8 database is used and if its version is okay.
 * The backend is marked as not connected if the check fails.
 *
 * @access protected
 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
 */
protected function check_database()
{
    $internals = $this->get_internals();

    # No version information at all: this is not a b8 database
    if(isset($internals['dbversion']) === FALSE) {
        $this->connected = FALSE;
        return self::DATABASE_NOT_B8;
    }

    # Version 2 is the only version accepted here
    if($internals['dbversion'] == "2")
        return TRUE;

    $this->connected = FALSE;
    return self::DATABASE_WRONG_VERSION;
}
/**
 * Parses the "count" data of a token.
 *
 * The data is a space-separated string whose first two fields are the ham
 * and the spam count; any further field is ignored. Missing fields count as 0.
 *
 * @access private
 * @param string $data
 * @return array Returns an array of the parsed data: array('count_ham' => int, 'count_spam' => int).
 */
private function _parse_count($data)
{
    # Pad short records so a malformed entry yields zero counts
    # instead of undefined-offset notices
    $fields = array_pad(explode(' ', (string) $data), 2, 0);

    return array(
        'count_ham'  => (int) $fields[0],
        'count_spam' => (int) $fields[1]
    );
}
/**
 * Get the database's internal variables.
 *
 * @access public
 * @return array Returns array('texts_ham' => int, 'texts_spam' => int, 'dbversion' => int|NULL).
 *               A missing text counter is reported as 0; a missing version
 *               entry is reported as NULL so that it can be told apart from
 *               a stored version number.
 */
public function get_internals()
{
    $stored = $this->_get_query(
        array(
            self::INTERNALS_TEXTS_HAM,
            self::INTERNALS_TEXTS_SPAM,
            self::INTERNALS_DBVERSION
        )
    );

    # Every entry is looked up defensively: a database that lacks one of them
    # must not raise undefined-index notices here
    return array(
        'texts_ham'  => isset($stored[self::INTERNALS_TEXTS_HAM]) ? (int) $stored[self::INTERNALS_TEXTS_HAM] : 0,
        'texts_spam' => isset($stored[self::INTERNALS_TEXTS_SPAM]) ? (int) $stored[self::INTERNALS_TEXTS_SPAM] : 0,
        'dbversion'  => isset($stored[self::INTERNALS_DBVERSION]) ? (int) $stored[self::INTERNALS_DBVERSION] : NULL
    );
}
/**
 * Get all data about a list of tokens from the database.
 *
 * Tokens that are not stored themselves are looked up in their degenerated
 * forms. The "lastseen" field of every entry that is returned is set to today.
 *
 * @access public
 * @param array $tokens
 * @return mixed Returns an error code if the backend is not usable, otherwise an array in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
 */
public function get($tokens)
{
    # Validate the startup
    $started_up = $this->validate();
    if($started_up !== TRUE)
        return $started_up;

    # First we see what we have in the database
    # (there is nothing to ask the backend for if the list is empty)
    $token_data = count($tokens) > 0 ? $this->_get_query($tokens) : array();

    # Check if we have to degenerate some tokens
    $missing_tokens = array();
    foreach($tokens as $token) {
        if(!isset($token_data[$token]))
            $missing_tokens[] = $token;
    }

    if(count($missing_tokens) > 0) {
        # Generate a list of degenerated tokens for the missing tokens ...
        $degenerates_list = array();
        foreach($this->_degenerator->degenerate($missing_tokens) as $token_degenerates) {
            foreach($token_degenerates as $degenerate)
                $degenerates_list[] = $degenerate;
        }

        # ... and look them up. The union operator keeps all keys as they are;
        # array_merge() would renumber purely numeric tokens (which PHP stores
        # as integer keys), so their data could not be found anymore.
        if(count($degenerates_list) > 0)
            $token_data = $this->_get_query($degenerates_list) + $token_data;
    }

    # Here, we have all available data in $token_data.
    $return_data_tokens = array();
    $return_data_degenerates = array();

    foreach($tokens as $token) {
        if(isset($token_data[$token]) === TRUE) {
            # The token was found in the database
            # Add the data ...
            $counts = $this->_parse_count($token_data[$token]);
            $return_data_tokens[$token] = $counts;
            # ... and update its lastseen parameter
            $this->_update($token, "{$counts['count_ham']} {$counts['count_spam']} " . $this->b8_config['today']);
            continue;
        }

        # The token was not found, so we look if we
        # can return data for one of its degenerated forms
        foreach($this->_degenerator->degenerates[$token] as $degenerate) {
            if(isset($token_data[$degenerate]) === TRUE) {
                # A degeneration of the token was found in the database
                # Add the data ...
                $counts = $this->_parse_count($token_data[$degenerate]);
                $return_data_degenerates[$token][$degenerate] = $counts;
                # ... and update its lastseen parameter
                $this->_update($degenerate, "{$counts['count_ham']} {$counts['count_spam']} " . $this->b8_config['today']);
            }
        }
    }

    # Now, all token data directly found in the database is in $return_data_tokens
    # and all data for degenerated versions is in $return_data_degenerates

    # First, we commit the changes to the lastseen parameters
    $this->_commit();

    # Then, we return what we have
    return array(
        'tokens'      => $return_data_tokens,
        'degenerates' => $return_data_degenerates
    );
}
/**
 * Stores or deletes a list of tokens from the given category.
 *
 * Each token counter of the category is raised (learning) or lowered
 * (unlearning) by the number of times the token occurs in the text; counters
 * never drop below 0 and a token whose counters are both 0 is deleted.
 * Finally the counter of learned texts of the category is adjusted by one.
 *
 * @access public
 * @param array $tokens Occurrence counts indexed by token: array(token => count)
 * @param const $category Either b8::HAM or b8::SPAM
 * @param const $action Either b8::LEARN or b8::UNLEARN
 * @return mixed An error code if the backend is not usable, otherwise nothing
 */
public function process_text($tokens, $category, $action)
{
    # Validate the startup
    $started_up = $this->validate();
    if($started_up !== TRUE)
        return $started_up;

    # No matter what we do, we first have to check what data we have.

    # First get the internals, including the ham texts and spam texts counter
    $internals = $this->get_internals();

    # Then, fetch all data for all tokens we have
    $token_data = count($tokens) > 0 ? $this->_get_query(array_keys($tokens)) : array();

    # Work out once in which direction which counter moves
    $learning = ($action === b8::LEARN);
    $direction = $learning ? 1 : ($action === b8::UNLEARN ? -1 : 0);
    $is_ham = ($category === b8::HAM);
    $is_spam = ($category === b8::SPAM);

    # Process all tokens to learn/unlearn
    foreach($tokens as $token => $count) {
        $count = (int) $count;

        if(isset($token_data[$token])) {
            # We already have this token, so update its data
            $fields = array_pad(explode(' ', $token_data[$token]), 2, 0);

            # Increase or decrease the right counter; we don't want negative values
            $count_ham = max(0, (int) $fields[0] + ($is_ham ? $direction * $count : 0));
            $count_spam = max(0, (int) $fields[1] + ($is_spam ? $direction * $count : 0));

            # Now let's see if we have to update or delete the token
            if($count_ham !== 0 or $count_spam !== 0)
                $this->_update($token, "$count_ham $count_spam " . $this->b8_config['today']);
            else
                $this->_del($token);
        }
        elseif($learning and ($is_ham or $is_spam)) {
            # We don't have the token yet. Store it with the number of times it
            # occurs in the text, exactly like the counters of known tokens are
            # raised (and like unlearning the same text lowers them again).
            # When unlearning there is nothing to do for an unknown token.
            $data = $is_ham ? "$count 0 " : "0 $count ";
            $this->_put($token, $data . $this->b8_config['today']);
        }
    }

    # Now, all tokens have been processed, so let's update the counter of
    # learned texts of the right category (it never drops below 0 either)
    if($direction !== 0) {
        if($is_ham) {
            $internals['texts_ham'] = max(0, $internals['texts_ham'] + $direction);
            $this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']);
        }
        elseif($is_spam) {
            $internals['texts_spam'] = max(0, $internals['texts_spam'] + $direction);
            $this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']);
        }
    }

    # We're done and can commit all changes to the database now
    $this->_commit();
}
} | |||
?> |
@ -0,0 +1,351 @@ | |||
<?php | |||
# Copyright (C) 2006-2011 Tobias Leupold <tobias.leupold@web.de> | |||
# | |||
# This file is part of the b8 package | |||
# | |||
# This program is free software; you can redistribute it and/or modify it | |||
# under the terms of the GNU Lesser General Public License as published by | |||
# the Free Software Foundation in version 2.1 of the License. | |||
# | |||
# This program is distributed in the hope that it will be useful, but | |||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |||
# License for more details. | |||
# | |||
# You should have received a copy of the GNU Lesser General Public License | |||
# along with this program; if not, write to the Free Software Foundation, | |||
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | |||
/** | |||
* The MySQL abstraction layer for communicating with the database. | |||
* Copyright (C) 2009 Oliver Lillie (aka buggedcom) | |||
* Copyright (C) 2010-2011 Tobias Leupold <tobias.leupold@web.de> | |||
* | |||
* @license LGPL | |||
* @access public | |||
* @package b8 | |||
* @author Oliver Lillie (aka buggedcom) (original PHP 5 port and optimizations) | |||
* @author Tobias Leupold | |||
*/ | |||
class b8_storage_mysql extends b8_storage_base | |||
{ | |||
public $config = array( | |||
'database' => 'b8_wordlist', | |||
'table_name' => 'b8_wordlist', | |||
'host' => 'localhost', | |||
'user' => FALSE, | |||
'pass' => FALSE, | |||
'connection' => NULL | |||
); | |||
public $b8_config = array( | |||
'degenerator' => NULL, | |||
'today' => NULL | |||
); | |||
private $_connection = NULL; | |||
private $_deletes = array(); | |||
private $_puts = array(); | |||
private $_updates = array(); | |||
const DATABASE_CONNECTION_FAIL = 'DATABASE_CONNECTION_FAIL'; | |||
const DATABASE_CONNECTION_ERROR = 'DATABASE_CONNECTION_ERROR'; | |||
const DATABASE_CONNECTION_BAD_RESOURCE = 'DATABASE_CONNECTION_BAD_RESOURCE'; | |||
const DATABASE_SELECT_ERROR = 'DATABASE_SELECT_ERROR'; | |||
const DATABASE_TABLE_ACCESS_FAIL = 'DATABASE_TABLE_ACCESS_FAIL'; | |||
const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION'; | |||
/**
 * Constructs the database layer.
 *
 * @access public
 * @param array $config Connection settings overriding the defaults in $this->config
 * @param string $degenerator Name of the degenerator, kept in $this->b8_config
 * @param string $today Today's date, kept in $this->b8_config
 */
public function __construct($config, $degenerator, $today)
{
    # Keep the values handed over from the main b8 class
    $this->b8_config['degenerator'] = $degenerator;
    $this->b8_config['today'] = $today;

    # Nothing to validate without config items
    if(count($config) === 0)
        return;

    foreach($config as $name => $value) {
        switch($name) {
            case 'table_name':
            case 'host':
            case 'user':
            case 'pass':
            case 'database':
                $this->config[$name] = (string) $value;
                break;
            case 'connection':
                # NULL means that no existing connection was passed
                if($value === NULL)
                    break;
                # Anything that is not a MySQL link resource is remembered as FALSE
                $is_mysql_link = is_resource($value) === TRUE
                    && in_array(get_resource_type($value), array('mysql link', 'mysql link persistent'), TRUE);
                $this->config['connection'] = $is_mysql_link ? $value : FALSE;
                break;
        }
    }
}
/**
 * Commits pending changes and closes the database connection if b8 opened it itself.
 *
 * @access public
 * @return void
 */
public function __destruct()
{
    if($this->_connection !== NULL) {
        # Commit any changes before closing
        $this->_commit();

        # A link resource that was passed in belongs to the caller, so only
        # a connection that b8 created on its own is closed here
        if($this->config['connection'] === NULL)
            mysql_close($this->_connection);

        $this->connected = FALSE;
    }
}
/**
 * Connect to the database and do some checks.
 *
 * @access public
 * @return mixed Returns TRUE on a successful database connection, otherwise an
 *               error code (for some errors followed by the MySQL error message).
 */
public function connect()
{
    # Are we already connected?
    if($this->connected === TRUE)
        return TRUE;

    # Are we using an existing passed resource?
    if($this->config['connection'] === FALSE) {
        # ... yes we are, but the connection is not a resource, so return an error
        $this->connected = FALSE;
        return self::DATABASE_CONNECTION_BAD_RESOURCE;
    }
    elseif($this->config['connection'] === NULL) {
        # ... no we aren't so we have to connect.
        $link = mysql_connect($this->config['host'], $this->config['user'], $this->config['pass']);

        if(!$link) {
            # Keep the "no connection" marker NULL so that the destructor does
            # not try to commit to or close a link that was never opened
            $this->_connection = NULL;
            $this->connected = FALSE;
            return self::DATABASE_CONNECTION_ERROR;
        }

        $this->_connection = $link;

        if(mysql_select_db($this->config['database'], $this->_connection) === FALSE) {
            $this->connected = FALSE;
            return self::DATABASE_SELECT_ERROR . ": " . mysql_error($this->_connection);
        }
    }
    else {
        # ... yes we are
        $this->_connection = $this->config['connection'];
    }

    # Just in case ...
    if($this->_connection === NULL) {
        $this->connected = FALSE;
        return self::DATABASE_CONNECTION_FAIL;
    }

    # Check to see if the wordlist table exists
    $description = mysql_query('DESCRIBE ' . $this->config['table_name'], $this->_connection);

    if($description === FALSE) {
        $this->connected = FALSE;
        return self::DATABASE_TABLE_ACCESS_FAIL . ": " . mysql_error($this->_connection);
    }

    # Only the success of the query matters, so release its result right away
    if(is_resource($description) === TRUE)
        mysql_free_result($description);

    # Everything is okay and connected
    $this->connected = TRUE;

    # Let's see if this is a b8 database and the version is okay
    return $this->check_database();
}
/**
 * Does the actual interaction with the database when fetching data.
 *
 * @access protected
 * @param array $tokens The tokens to look up (a single token is accepted as well)
 * @return array Returns an array of the returned data in the format array(token => data) or an empty array if there was no data.
 */
protected function _get_query($tokens)
{
    # Escape every token for use in the query
    $escaped = array();
    foreach((array) $tokens as $token)
        $escaped[] = mysql_real_escape_string((string) $token, $this->_connection);

    # Without tokens there is nothing to fetch
    if(count($escaped) === 0)
        return array();

    # Construct the query and fetch the data
    $result = mysql_query('
        SELECT token, count
        FROM ' . $this->config['table_name'] . '
        WHERE token IN ("' . implode('", "', $escaped) . '");
    ', $this->_connection);

    $data = array();

    # Report a failed query instead of passing FALSE on to the fetch functions
    if($result === FALSE) {
        trigger_error('b8: fetching token data failed: ' . mysql_error($this->_connection), E_USER_WARNING);
        return $data;
    }

    while($row = mysql_fetch_array($result, MYSQL_ASSOC))
        $data[$row['token']] = $row['count'];

    mysql_free_result($result);

    return $data;
}
/**
 * Queues a new token for insertion; the query is run by _commit().
 *
 * @access protected
 * @param string $token
 * @param string $count
 * @return void
 */
protected function _put($token, $count)
{
    $safe_token = mysql_real_escape_string($token, $this->_connection);
    $safe_count = mysql_real_escape_string($count, $this->_connection);

    # Stored as a ready-made value tuple for the INSERT statement
    $this->_puts[] = '("' . $safe_token . '", "' . $safe_count . '")';
}
/**
 * Queues new data for a token; the query is run by _commit().
 *
 * @access protected
 * @param string $token
 * @param string $count
 * @return void
 */
protected function _update($token, $count)
{
    # Stored as a ready-made, escaped value tuple for the INSERT statement
    $this->_updates[] = sprintf(
        '("%s", "%s")',
        mysql_real_escape_string($token, $this->_connection),
        mysql_real_escape_string($count, $this->_connection)
    );
}
/**
 * Queues a token for deletion; the query is run by _commit().
 *
 * @access protected
 * @param string $token
 * @return void
 */
protected function _del($token)
{
    # Only the escaped token is stored; _commit() builds the DELETE statement
    $this->_deletes[] = mysql_real_escape_string($token, $this->_connection);
}
/**
 * Commits any modification queries.
 *
 * The queued deletions are executed first, then the insertions, then the
 * updates. Each queue is emptied after its query was sent, whether or not
 * the query succeeded.
 *
 * @access protected
 * @return void
 */
protected function _commit()
{
    $table = $this->config['table_name'];

    if(count($this->_deletes) > 0) {
        $this->_run_write_query('
            DELETE FROM ' . $table . '
            WHERE token IN ("' . implode('", "', $this->_deletes) . '");
        ');
        $this->_deletes = array();
    }

    if(count($this->_puts) > 0) {
        $this->_run_write_query('
            INSERT INTO ' . $table . '(token, count)
            VALUES ' . implode(', ', $this->_puts) . ';');
        $this->_puts = array();
    }

    if(count($this->_updates) > 0) {
        # Existing rows get their count replaced, missing rows are created
        $this->_run_write_query('
            INSERT INTO ' . $table . '(token, count)
            VALUES ' . implode(', ', $this->_updates) . '
            ON DUPLICATE KEY UPDATE ' . $table . '.count = VALUES(count);');
        $this->_updates = array();
    }
}

/**
 * Runs one modifying query and raises a warning if the database rejects it.
 *
 * @access private
 * @param string $sql
 * @return boolean TRUE if the query succeeded, otherwise FALSE
 */
private function _run_write_query($sql)
{
    # Modifying statements yield TRUE or FALSE, never a result set to free
    $result = mysql_query($sql, $this->_connection);

    if($result === FALSE) {
        trigger_error('b8: database write failed: ' . mysql_error($this->_connection), E_USER_WARNING);
        return FALSE;
    }

    return TRUE;
}
} | |||
?> |