395 lines
9.4 KiB
Plaintext
395 lines
9.4 KiB
Plaintext
<?php

# Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * Functions used by all storage backends
 * Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

abstract class b8_storage_base
{

    # This class uses, but does not define, the following members. They have
    # to be supplied by the concrete storage backend extending it:
    #
    #  - $this->b8_config, read here with the keys 'degenerator' and 'today'
    #  - _get_query($tokens), used here as returning an array keyed by token
    #    that only holds the tokens actually found
    #  - _put($token, $data), _update($token, $data), _del($token)
    #  - _commit()
    #
    # A token record is handled here as the string "count_ham count_spam lastseen".

    public $connected = FALSE;

    protected $_degenerator = NULL;

    const INTERNALS_TEXTS_HAM  = 'bayes*texts.ham';
    const INTERNALS_TEXTS_SPAM = 'bayes*texts.spam';
    const INTERNALS_DBVERSION  = 'bayes*dbversion';

    const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
    const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
    const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';

    /**
     * Validates the class has all it needs to work.
     *
     * Also makes sure the degenerator named in $this->b8_config['degenerator']
     * is instantiated.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function validate()
    {

        # We set up the degenerator here, as we would have to duplicate code if it
        # was done in the constructor of the respective storage backend. It is
        # only built once and then reused for all later calls.
        if($this->_degenerator === NULL) {
            $class = 'b8_degenerator_' . $this->b8_config['degenerator'];
            $this->_degenerator = new $class();
        }

        if($this->connected !== TRUE)
            return self::BACKEND_NOT_CONNECTED;

        return TRUE;

    }

    /**
     * Checks if a b8 database is used and if its version is okay.
     *
     * On any failure, $this->connected is set to FALSE.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code
     *               (DATABASE_NOT_B8 if no version entry is stored at all,
     *               DATABASE_WRONG_VERSION if the stored version is not 2).
     */

    protected function check_database()
    {

        # The version entry is queried directly instead of going through
        # get_internals(): that method always returns an integer 'dbversion',
        # so a missing entry could not be told apart from a wrong version.
        $stored = $this->_get_query(array(self::INTERNALS_DBVERSION));

        if(!is_array($stored) or !isset($stored[self::INTERNALS_DBVERSION])) {
            $this->connected = FALSE;
            return self::DATABASE_NOT_B8;
        }

        if((int) $stored[self::INTERNALS_DBVERSION] !== 2) {
            $this->connected = FALSE;
            return self::DATABASE_WRONG_VERSION;
        }

        return TRUE;

    }

    /**
     * Parses the "count" data of a token.
     *
     * The lastseen field of the record is ignored. Missing fields are read as 0.
     *
     * @access private
     * @param string $data A record in the form "count_ham count_spam lastseen"
     * @return array Returns an array of the parsed data: array('count_ham' => int, 'count_spam' => int).
     */

    private function _parse_count($data)
    {

        $fields = explode(' ', $data);

        return array(
            'count_ham'  => isset($fields[0]) ? (int) $fields[0] : 0,
            'count_spam' => isset($fields[1]) ? (int) $fields[1] : 0
        );

    }

    /**
     * Builds the record string stored for a token, stamped with today's date.
     *
     * @access private
     * @param int $count_ham
     * @param int $count_spam
     * @return string Returns "count_ham count_spam " followed by $this->b8_config['today'].
     */

    private function _build_data($count_ham, $count_spam)
    {
        return "$count_ham $count_spam " . $this->b8_config['today'];
    }

    /**
     * Get the database's internal variables.
     *
     * An internal variable the backend does not return is reported as 0.
     *
     * @access public
     * @return array Returns an array of all internals: array('texts_ham' => int, 'texts_spam' => int, 'dbversion' => int).
     */

    public function get_internals()
    {

        $stored = $this->_get_query(
            array(
                self::INTERNALS_TEXTS_HAM,
                self::INTERNALS_TEXTS_SPAM,
                self::INTERNALS_DBVERSION
            )
        );

        $internals = array();

        $mapping = array(
            'texts_ham'  => self::INTERNALS_TEXTS_HAM,
            'texts_spam' => self::INTERNALS_TEXTS_SPAM,
            'dbversion'  => self::INTERNALS_DBVERSION
        );

        # isset() keeps a missing entry from raising an undefined index notice
        foreach($mapping as $name => $key)
            $internals[$name] = isset($stored[$key]) ? (int) $stored[$key] : 0;

        return $internals;

    }

    /**
     * Get all data about a list of tags from the database.
     *
     * Tokens not found directly are degenerated and the degenerated forms are
     * looked up instead. The lastseen parameter of every record returned is
     * set to $this->b8_config['today'] and the changes are committed.
     *
     * @access public
     * @param array $tokens
     * @return mixed Returns an error code if validate() fails, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
     */

    public function get($tokens)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # First we see what we have in the database.
        $token_data = $this->_get_query($tokens);

        # Keep the lookups below well-defined should a backend hand back a non-array
        if(!is_array($token_data))
            $token_data = array();

        # Check if we have to degenerate some tokens

        $missing_tokens = array();

        foreach($tokens as $token) {
            if(!isset($token_data[$token]))
                $missing_tokens[] = $token;
        }

        if(count($missing_tokens) > 0) {

            # Generate a list of degenerated tokens for the missing tokens ...
            $degenerates = $this->_degenerator->degenerate($missing_tokens);

            $degenerates_list = array();

            foreach($degenerates as $token_degenerates)
                $degenerates_list = array_merge($degenerates_list, $token_degenerates);

            # ... and look them up
            $degenerate_data = $this->_get_query($degenerates_list);

            # The two maps are keyed by token. PHP turns purely numeric tokens
            # into integer keys and array_merge() would renumber those, losing
            # the token. The union operator keeps every key as it is; entries
            # of the left operand win, so the degenerate lookup still takes
            # precedence for a key present in both.
            if(is_array($degenerate_data))
                $token_data = $degenerate_data + $token_data;

        }

        # Here, we have all available data in $token_data.

        $return_data_tokens = array();
        $return_data_degenerates = array();

        foreach($tokens as $token) {

            if(isset($token_data[$token])) {

                # The token was found in the database

                # Add the data ...
                $found = $this->_parse_count($token_data[$token]);
                $return_data_tokens[$token] = $found;

                # ... and update its lastseen parameter
                $this->_update($token, $this->_build_data($found['count_ham'], $found['count_spam']));

                continue;

            }

            # The token was not found, so we look if we
            # can return data for degenerated tokens

            # Check all degenerated forms of the token

            foreach($this->_degenerator->degenerates[$token] as $degenerate) {

                if(!isset($token_data[$degenerate]))
                    continue;

                # A degeneration of the token was found in the database

                # Add the data ...
                $found = $this->_parse_count($token_data[$degenerate]);
                $return_data_degenerates[$token][$degenerate] = $found;

                # ... and update its lastseen parameter
                $this->_update($degenerate, $this->_build_data($found['count_ham'], $found['count_spam']));

            }

        }

        # Now, all token data directly found in the database is in $return_data_tokens
        # and all data for degenerated versions is in $return_data_degenerates

        # First, we commit the changes to the lastseen parameters
        $this->_commit();

        # Then, we return what we have
        return array(
            'tokens'      => $return_data_tokens,
            'degenerates' => $return_data_degenerates
        );

    }

    /**
     * Stores or deletes a list of tokens from the given category.
     *
     * @access public
     * @param array $tokens Array in the format array(token => count)
     * @param const $category Either b8::HAM or b8::SPAM
     * @param const $action Either b8::LEARN or b8::UNLEARN
     * @return mixed Returns an error code if validate() fails, otherwise nothing.
     */

    public function process_text($tokens, $category, $action)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # No matter what we do, we first have to check what data we have.

        # First get the internals, including the ham texts and spam texts counter
        $internals = $this->get_internals();

        # Then, fetch all data for all tokens we have
        $token_data = $this->_get_query(array_keys($tokens));

        # Process all tokens to learn/unlearn

        foreach($tokens as $token => $count) {

            if(isset($token_data[$token])) {

                # We already have this token, so update its data

                # Get the existing data
                $existing   = $this->_parse_count($token_data[$token]);
                $count_ham  = $existing['count_ham'];
                $count_spam = $existing['count_spam'];

                # Increase or decrease the right counter

                if($action === b8::LEARN) {
                    if($category === b8::HAM)
                        $count_ham += $count;
                    elseif($category === b8::SPAM)
                        $count_spam += $count;
                }

                elseif($action === b8::UNLEARN) {
                    if($category === b8::HAM)
                        $count_ham -= $count;
                    elseif($category === b8::SPAM)
                        $count_spam -= $count;
                }

                # We don't want to have negative values

                if($count_ham < 0)
                    $count_ham = 0;

                if($count_spam < 0)
                    $count_spam = 0;

                # Now let's see if we have to update or delete the token
                if($count_ham !== 0 or $count_spam !== 0)
                    $this->_update($token, $this->_build_data($count_ham, $count_spam));
                else
                    $this->_del($token);

            }

            elseif($action === b8::LEARN) {

                # We don't have the token. If we unlearn a text, we can't delete it
                # as we don't have it anyway, so just do something if we learn a text.
                # The token is stored with the number of times it was seen, the
                # same amount an existing token would have been increased by (and
                # the amount a later unlearn of the same text takes away again).
                # Nothing is stored for an unknown category.

                if($category === b8::HAM)
                    $this->_put($token, $this->_build_data($count, 0));
                elseif($category === b8::SPAM)
                    $this->_put($token, $this->_build_data(0, $count));

            }

        }

        # Now, all tokens have been processed, so let's update the right text counter

        $counter_name = NULL;

        if($category === b8::HAM) {
            $counter_name = 'texts_ham';
            $counter_key  = self::INTERNALS_TEXTS_HAM;
        }
        elseif($category === b8::SPAM) {
            $counter_name = 'texts_spam';
            $counter_key  = self::INTERNALS_TEXTS_SPAM;
        }

        if($counter_name !== NULL) {

            if($action === b8::LEARN) {
                $internals[$counter_name]++;
                $this->_update($counter_key, $internals[$counter_name]);
            }

            elseif($action === b8::UNLEARN) {

                $internals[$counter_name]--;

                # The counter must not become negative
                if($internals[$counter_name] < 0)
                    $internals[$counter_name] = 0;

                $this->_update($counter_key, $internals[$counter_name]);

            }

        }

        # We're done and can commit all changes to the database now
        $this->_commit();

    }

}
|
|
|
|
?> |