396 lines
		
	
	
		
			No EOL
		
	
	
		
			9.5 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			396 lines
		
	
	
		
			No EOL
		
	
	
		
			9.5 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| #   Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
 | |
| #
 | |
| #   This file is part of the b8 package
 | |
| #
 | |
| #   This program is free software; you can redistribute it and/or modify it
 | |
| #   under the terms of the GNU Lesser General Public License as published by
 | |
| #   the Free Software Foundation in version 2.1 of the License.
 | |
| #
 | |
| #   This program is distributed in the hope that it will be useful, but
 | |
| #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 | |
| #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 | |
| #   License for more details.
 | |
| #
 | |
| #   You should have received a copy of the GNU Lesser General Public License
 | |
| #   along with this program; if not, write to the Free Software Foundation,
 | |
| #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 | |
| 
 | |
| /**
 | |
|  * Functions used by all storage backends
 | |
|  * Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
 | |
|  *
 | |
|  * @license LGPL
 | |
|  * @access public
 | |
|  * @package b8
 | |
|  * @author Tobias Leupold
 | |
|  */
 | |
| 
 | |
abstract class b8_storage_base
{

	# Shared logic for all b8 storage backends.
	#
	# This class relies on members it does not declare itself and which the
	# concrete backend has to provide:
	#
	#   $this->b8_config['degenerator']  suffix of the degenerator class name
	#   $this->b8_config['today']        value written as the "lastseen" field
	#   _get_query($tokens, $uid)        returns array(token => stored data)
	#   _put($token, $data, $uid)        stores a new token
	#   _update($token, $data, $uid)     overwrites an existing token
	#   _del($token, $uid)               removes a token
	#   _commit($uid)                    called once all changes were issued
	#
	# A token record is a string of the form "count_ham count_spam lastseen".

	public $connected            = FALSE;

	protected $_degenerator      = NULL;

	const INTERNALS_TEXTS_HAM    = 'bayes*texts.ham';
	const INTERNALS_TEXTS_SPAM   = 'bayes*texts.spam';
	const INTERNALS_DBVERSION    = 'bayes*dbversion';

	const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
	const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
	const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';

	/**
	 * Validates the class has all it needs to work.
	 *
	 * Also (re-)creates the degenerator object, so that the storage backends
	 * do not have to duplicate this in their constructors.
	 *
	 * @access protected
	 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
	 */

	protected function validate()
	{

		# The class name is built from the configuration, e.g. "b8_degenerator_default"
		$class = 'b8_degenerator_' . $this->b8_config['degenerator'];
		$this->_degenerator = new $class();

		if($this->connected !== TRUE)
			return self::BACKEND_NOT_CONNECTED;

		return TRUE;

	}

	/**
	 * Checks if a b8 database is used and if its version is okay.
	 *
	 * On failure, $this->connected is set to FALSE.
	 *
	 * @access protected
	 * @param mixed $uid Passed through to the backend's query method
	 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
	 */

	protected function check_database($uid)
	{

		# get_internals() casts every value to int, so there a missing version
		# entry cannot be told apart from "version 0". We look at the raw record
		# to be able to distinguish "not a b8 database" from "wrong version".
		$raw = $this->_get_query(array(self::INTERNALS_DBVERSION), $uid);

		if(!isset($raw[self::INTERNALS_DBVERSION])) {
			$this->connected = FALSE;
			return self::DATABASE_NOT_B8;
		}

		if((int) $raw[self::INTERNALS_DBVERSION] !== 2) {
			$this->connected = FALSE;
			return self::DATABASE_WRONG_VERSION;
		}

		return TRUE;

	}

	/**
	 * Parses the "count" data of a token.
	 *
	 * The lastseen field of the record is ignored. Missing fields count as 0.
	 *
	 * @access private
	 * @param string $data A record of the form "count_ham count_spam lastseen"
	 * @return array Returns array('count_ham' => int, 'count_spam' => int).
	 */

	private function _parse_count($data)
	{

		$fields = explode(' ', $data);

		return array(
			'count_ham'  => isset($fields[0]) ? (int) $fields[0] : 0,
			'count_spam' => isset($fields[1]) ? (int) $fields[1] : 0
		);

	}

	/**
	 * Builds a token record carrying today's date as lastseen value.
	 *
	 * @access private
	 * @param int $count_ham
	 * @param int $count_spam
	 * @return string The record "count_ham count_spam today"
	 */

	private function _build_record($count_ham, $count_spam)
	{
		return "$count_ham $count_spam " . $this->b8_config['today'];
	}

	/**
	 * Get the database's internal variables.
	 *
	 * Entries that are not present in the database are reported as 0.
	 *
	 * @access public
	 * @param mixed $uid Passed through to the backend's query method
	 * @return array Returns array('texts_ham' => int, 'texts_spam' => int, 'dbversion' => int).
	 */

	public function get_internals($uid)
	{

		$keys = array(
			'texts_ham'  => self::INTERNALS_TEXTS_HAM,
			'texts_spam' => self::INTERNALS_TEXTS_SPAM,
			'dbversion'  => self::INTERNALS_DBVERSION
		);

		$raw = $this->_get_query(array_values($keys), $uid);

		$internals = array();

		foreach($keys as $name => $key)
			$internals[$name] = isset($raw[$key]) ? (int) $raw[$key] : 0;

		return $internals;

	}

	/**
	 * Get all data about a list of tokens from the database.
	 *
	 * Tokens not found directly are looked up in their degenerated forms.
	 * The lastseen field of every record used is set to today.
	 *
	 * @access public
	 * @param array $tokens
	 * @param mixed $uid Passed through to the backend's methods
	 * @return mixed Returns an error code if the startup validation fails, otherwise array('tokens' => array(token => counts), 'degenerates' => array(token => array(degenerate => counts))), where counts is array('count_ham' => int, 'count_spam' => int).
	 */

	public function get($tokens, $uid)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# First we see what we have in the database.
		$token_data = $this->_get_query($tokens, $uid);

		# Check if we have to degenerate some tokens

		$missing_tokens = array();

		foreach($tokens as $token) {
			if(!isset($token_data[$token]))
				$missing_tokens[] = $token;
		}

		if(count($missing_tokens) > 0) {

			# Generate a list of degenerated tokens for the missing tokens ...
			$degenerates = $this->_degenerator->degenerate($missing_tokens);

			$degenerates_list = array();

			foreach($degenerates as $token_degenerates)
				$degenerates_list = array_merge($degenerates_list, $token_degenerates);

			# ... and look them up. We use array_replace() here, as array_merge()
			# would renumber integer-like keys and by this lose the data of
			# purely numeric tokens.
			$token_data = array_replace($token_data, $this->_get_query($degenerates_list, $uid));

		}

		# Here, we have all available data in $token_data.

		$return_data_tokens = array();
		$return_data_degenerates = array();

		foreach($tokens as $token) {

			if(isset($token_data[$token])) {

				# The token was found in the database: add the data
				# and update its lastseen parameter
				$counts = $this->_parse_count($token_data[$token]);
				$return_data_tokens[$token] = $counts;
				$this->_update($token, $this->_build_record($counts['count_ham'], $counts['count_spam']), $uid);

				continue;

			}

			# The token was not found, so we look if we can
			# return data for one of its degenerated forms

			foreach($this->_degenerator->degenerates[$token] as $degenerate) {

				if(!isset($token_data[$degenerate]))
					continue;

				# A degeneration of the token was found in the database:
				# add the data and update its lastseen parameter
				$counts = $this->_parse_count($token_data[$degenerate]);
				$return_data_degenerates[$token][$degenerate] = $counts;
				$this->_update($degenerate, $this->_build_record($counts['count_ham'], $counts['count_spam']), $uid);

			}

		}

		# Now, all token data directly found in the database is in $return_data_tokens
		# and all data for degenerated versions is in $return_data_degenerates

		# First, we commit the changes to the lastseen parameters
		$this->_commit($uid);

		# Then, we return what we have
		return array(
			'tokens'      => $return_data_tokens,
			'degenerates' => $return_data_degenerates
		);

	}

	/**
	 * Stores or deletes a list of tokens from the given category.
	 *
	 * @access public
	 * @param array $tokens In the format array(token => count)
	 * @param const $category Either b8::HAM or b8::SPAM
	 * @param const $action Either b8::LEARN or b8::UNLEARN
	 * @param mixed $uid Passed through to the backend's methods
	 * @return mixed Returns an error code if the startup validation fails, otherwise nothing.
	 */

	public function process_text($tokens, $category, $action, $uid)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# No matter what we do, we first have to check what data we have.

		# First get the internals, including the ham texts and spam texts counter
		$internals = $this->get_internals($uid);

		# Then, fetch all data for all tokens we have
		$token_data = $this->_get_query(array_keys($tokens), $uid);

		# Process all tokens to learn/unlearn

		foreach($tokens as $token => $count) {

			if(isset($token_data[$token])) {

				# We already have this token, so update its data

				$counts     = $this->_parse_count($token_data[$token]);
				$count_ham  = $counts['count_ham'];
				$count_spam = $counts['count_spam'];

				# Increase or decrease the right counter

				if($action === b8::LEARN) {
					if($category === b8::HAM)
						$count_ham += $count;
					elseif($category === b8::SPAM)
						$count_spam += $count;
				}

				elseif($action == b8::UNLEARN) {
					if($category === b8::HAM)
						$count_ham -= $count;
					elseif($category === b8::SPAM)
						$count_spam -= $count;
				}

				# We don't want to have negative values
				$count_ham  = max(0, $count_ham);
				$count_spam = max(0, $count_spam);

				# Now let's see if we have to update or delete the token
				if($count_ham !== 0 or $count_spam !== 0)
					$this->_update($token, $this->_build_record($count_ham, $count_spam), $uid);
				else
					$this->_del($token, $uid);

			}

			elseif($action === b8::LEARN) {

				# We don't have the token. If we unlearn a text, we can't delete it
				# as we don't have it anyway, so just do something if we learn a text.
				# The token is stored with the number of its occurrences, so that
				# learning and unlearning the same text cancel each other out.

				$count = (int) $count;

				if($category === b8::HAM)
					$this->_put($token, $this->_build_record($count, 0), $uid);
				elseif($category === b8::SPAM)
					$this->_put($token, $this->_build_record(0, $count), $uid);

			}

		}

		# Now, all tokens have been processed, so let's update the right text counter

		$step = 0;

		if($action === b8::LEARN)
			$step = 1;
		elseif($action == b8::UNLEARN)
			$step = -1;

		$counter = NULL;

		if($category === b8::HAM)
			$counter = array('texts_ham', self::INTERNALS_TEXTS_HAM);
		elseif($category === b8::SPAM)
			$counter = array('texts_spam', self::INTERNALS_TEXTS_SPAM);

		if($step !== 0 and $counter !== NULL) {

			list($name, $key) = $counter;

			# The counters must not become negative
			$internals[$name] = max(0, $internals[$name] + $step);

			$this->_update($key, $internals[$name], $uid);

		}

		# We're done and can commit all changes to the database now
		$this->_commit($uid);

	}

}
 | |
| 
 | |
| ?>
 |