add spam engine

2012-01-31 15:54:41 -08:00 · 2012-01-31 15:54:41 -08:00 · c8c062d960
commit c8c062d960
parent 4fc455d195
13 changed files with 4048 additions and 0 deletions
--- a/library/spam/b8/lexer/lexer_default.php
+++ b/library/spam/b8/lexer/lexer_default.php
@ -0,0 +1,205 @@
+<?php
+
+#   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
+#
+#   This file is part of the b8 package
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU Lesser General Public License as published by
+#   the Free Software Foundation in version 2.1 of the License.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+#   License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public License
+#   along with this program; if not, write to the Free Software Foundation,
+#   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+/**
+ * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
+ *
+ * @license LGPL
+ * @access public
+ * @package b8
+ * @author Tobias Leupold
+ * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
+ */
+
+class b8_lexer_default
+{
+
+	# Error codes returned by get_tokens() in place of a token list
+	const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
+	const LEXER_TEXT_EMPTY      = 'LEXER_TEXT_EMPTY';
+
+	# Lexer settings; this class reads the keys 'min_size', 'max_size' and
+	# 'allow_numbers' from it
+	public $config = NULL;
+
+	# The regular expressions we use to split the text to tokens
+
+	public $regexp = array(
+		'ip'        => '/([A-Za-z0-9\_\-\.]+)/',
+		'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
+		'html'      => '/(<.+?>)/',
+		'tagname'   => '/(.+?)\s/',
+		'numbers'   => '/^[0-9]+$/'
+	);
+
+	/**
+	 * Constructs the lexer.
+	 *
+	 * @access public
+	 * @param array $config Lexer settings; has to provide 'min_size', 'max_size' and 'allow_numbers'
+	 * @return void
+	 */
+
+	public function __construct($config)
+	{
+		$this->config = $config;
+	}
+
+	/**
+	 * Generates the tokens required for the bayesian filter.
+	 *
+	 * The text is processed in three passes: dotted words (URLs, host names,
+	 * IP addresses) together with their parts, then the remaining plain words,
+	 * and finally the HTML tags.
+	 *
+	 * @access public
+	 * @param string $text
+	 * @return mixed Returns the list of tokens as an array (token => number of
+	 *               occurrences), or one of the LEXER_TEXT_* error codes if
+	 *               $text is not a string or is an empty string
+	 */
+
+	public function get_tokens($text)
+	{
+
+		# Check that we actually have a string ...
+		if(is_string($text) === FALSE)
+			return self::LEXER_TEXT_NOT_STRING;
+
+		# ... and that it's not empty. We compare against '' explicitly, as
+		# empty() is also TRUE for the text "0", which is not an empty text.
+		if($text === '')
+			return self::LEXER_TEXT_EMPTY;
+
+		# Re-convert the text to the original characters coded in UTF-8, as
+		# they have been coded in html entities during the post process
+		$text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
+
+		$tokens = array();
+
+		# Find URLs and IP addresses
+
+		preg_match_all($this->regexp['ip'], $text, $raw_tokens);
+
+		foreach($raw_tokens[1] as $word) {
+
+			# Check for a dot
+			if(strpos($word, '.') === FALSE)
+				continue;
+
+			# Check that the word is valid, min and max sizes, etc.
+			if($this->_is_valid($word) === FALSE)
+				continue;
+
+			$this->_count_token($tokens, $word);
+
+			# Delete the word from the text so it doesn't get re-added.
+			# Note that str_replace() removes every occurrence at once.
+			$text = str_replace($word, '', $text);
+
+			# Also process the parts of the URLs
+			$url_parts = preg_split($this->regexp['raw_split'], $word);
+
+			# A separate loop variable is used here so that the outer $word
+			# is not overwritten by the inner loop
+			foreach($url_parts as $part) {
+
+				# Again validate the part
+
+				if($this->_is_valid($part) === FALSE)
+					continue;
+
+				$this->_count_token($tokens, $part);
+
+			}
+
+		}
+
+		# Split the remaining text
+
+		$raw_tokens = preg_split($this->regexp['raw_split'], $text);
+
+		foreach($raw_tokens as $word) {
+
+			# Again validate the part
+
+			if($this->_is_valid($word) === FALSE)
+				continue;
+
+			$this->_count_token($tokens, $word);
+
+		}
+
+		# Process the HTML
+
+		preg_match_all($this->regexp['html'], $text, $raw_tokens);
+
+		foreach($raw_tokens[1] as $word) {
+
+			# Again validate the part (the size check applies to the whole
+			# tag, before it is shortened below)
+
+			if($this->_is_valid($word) === FALSE)
+				continue;
+
+			# If the tag has parameters, just use the tag itself. Should the
+			# tag name pattern not match, the whole tag is kept instead of
+			# reading an undefined match.
+
+			if(strpos($word, ' ') !== FALSE) {
+				if(preg_match($this->regexp['tagname'], $word, $tmp) === 1)
+					$word = "{$tmp[1]}...>";
+			}
+
+			$this->_count_token($tokens, $word);
+
+		}
+
+		# Return a list of all found tokens
+		return $tokens;
+
+	}
+
+	/**
+	 * Counts one occurrence of a token in the token list.
+	 *
+	 * @access private
+	 * @param array $tokens The token list (token => number of occurrences), changed in place
+	 * @param string $token The token string.
+	 * @return void
+	 */
+
+	private function _count_token(array &$tokens, $token)
+	{
+
+		if(isset($tokens[$token]) === FALSE)
+			$tokens[$token] = 1;
+		else
+			$tokens[$token] += 1;
+
+	}
+
+	/**
+	 * Validates a token.
+	 *
+	 * @access private
+	 * @param string $token The token string.
+	 * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
+	 */
+
+	private function _is_valid($token)
+	{
+
+		# Validate the size of the token. strlen() counts bytes, so a
+		# multi-byte UTF-8 character counts more than once here.
+
+		$len = strlen($token);
+
+		if($len < $this->config['min_size'] or $len > $this->config['max_size'])
+			return FALSE;
+
+		# We may want to exclude pure numbers
+		if($this->config['allow_numbers'] === FALSE) {
+			if(preg_match($this->regexp['numbers'], $token) > 0)
+				return FALSE;
+		}
+
+		# Token is okay
+		return TRUE;
+
+	}
+
+}
+
+?>