add spam engine
This commit is contained in:
parent
4fc455d195
commit
c8c062d960
13 changed files with 4048 additions and 0 deletions
205
library/spam/b8/lexer/lexer_default.php
Normal file
205
library/spam/b8/lexer/lexer_default.php
Normal file
|
@ -0,0 +1,205 @@
|
|||
<?php
|
||||
|
||||
# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
|
||||
#
|
||||
# This file is part of the b8 package
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it
|
||||
# under the terms of the GNU Lesser General Public License as published by
|
||||
# the Free Software Foundation in version 2.1 of the License.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
# License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public License
|
||||
# along with this program; if not, write to the Free Software Foundation,
|
||||
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
/**
|
||||
* Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
|
||||
*
|
||||
* @license LGPL
|
||||
* @access public
|
||||
* @package b8
|
||||
* @author Tobias Leupold
|
||||
* @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
|
||||
*/
|
||||
|
||||
class b8_lexer_default
|
||||
{
|
||||
|
||||
const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
|
||||
const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY';
|
||||
|
||||
public $config = NULL;
|
||||
|
||||
# The regular expressions we use to split the text to tokens
|
||||
|
||||
public $regexp = array(
|
||||
'ip' => '/([A-Za-z0-9\_\-\.]+)/',
|
||||
'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
|
||||
'html' => '/(<.+?>)/',
|
||||
'tagname' => '/(.+?)\s/',
|
||||
'numbers' => '/^[0-9]+$/'
|
||||
);
|
||||
|
||||
/**
|
||||
* Constructs the lexer.
|
||||
*
|
||||
* @access public
|
||||
* @return void
|
||||
*/
|
||||
|
||||
function __construct($config)
|
||||
{
|
||||
$this->config = $config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the tokens required for the bayesian filter.
|
||||
*
|
||||
* @access public
|
||||
* @param string $text
|
||||
* @return array Returns the list of tokens
|
||||
*/
|
||||
|
||||
public function get_tokens($text)
|
||||
{
|
||||
|
||||
# Check that we actually have a string ...
|
||||
if(is_string($text) === FALSE)
|
||||
return self::LEXER_TEXT_NOT_STRING;
|
||||
|
||||
# ... and that it's not empty
|
||||
if(empty($text) === TRUE)
|
||||
return self::LEXER_TEXT_EMPTY;
|
||||
|
||||
# Re-convert the text to the original characters coded in UTF-8, as
|
||||
# they have been coded in html entities during the post process
|
||||
$text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
|
||||
|
||||
$tokens = array();
|
||||
|
||||
# Find URLs and IP addresses
|
||||
|
||||
preg_match_all($this->regexp['ip'], $text, $raw_tokens);
|
||||
|
||||
foreach($raw_tokens[1] as $word) {
|
||||
|
||||
# Check for a dot
|
||||
if(strpos($word, '.') === FALSE)
|
||||
continue;
|
||||
|
||||
# Check that the word is valid, min and max sizes, etc.
|
||||
if($this->_is_valid($word) === FALSE)
|
||||
continue;
|
||||
|
||||
if(isset($tokens[$word]) === FALSE)
|
||||
$tokens[$word] = 1;
|
||||
else
|
||||
$tokens[$word] += 1;
|
||||
|
||||
# Delete the word from the text so it doesn't get re-added.
|
||||
$text = str_replace($word, '', $text);
|
||||
|
||||
# Also process the parts of the URLs
|
||||
$url_parts = preg_split($this->regexp['raw_split'], $word);
|
||||
|
||||
foreach($url_parts as $word) {
|
||||
|
||||
# Again validate the part
|
||||
|
||||
if($this->_is_valid($word) === FALSE)
|
||||
continue;
|
||||
|
||||
if(isset($tokens[$word]) === FALSE)
|
||||
$tokens[$word] = 1;
|
||||
else
|
||||
$tokens[$word] += 1;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
# Split the remaining text
|
||||
|
||||
$raw_tokens = preg_split($this->regexp['raw_split'], $text);
|
||||
|
||||
foreach($raw_tokens as $word) {
|
||||
|
||||
# Again validate the part
|
||||
|
||||
if($this->_is_valid($word) === FALSE)
|
||||
continue;
|
||||
|
||||
if(isset($tokens[$word]) === FALSE)
|
||||
$tokens[$word] = 1;
|
||||
else
|
||||
$tokens[$word] += 1;
|
||||
|
||||
}
|
||||
|
||||
# Process the HTML
|
||||
|
||||
preg_match_all($this->regexp['html'], $text, $raw_tokens);
|
||||
|
||||
foreach($raw_tokens[1] as $word) {
|
||||
|
||||
# Again validate the part
|
||||
|
||||
if($this->_is_valid($word) === FALSE)
|
||||
continue;
|
||||
|
||||
# If the tag has parameters, just use the tag itself
|
||||
|
||||
if(strpos($word, ' ') !== FALSE) {
|
||||
preg_match($this->regexp['tagname'], $word, $tmp);
|
||||
$word = "{$tmp[1]}...>";
|
||||
}
|
||||
|
||||
if(isset($tokens[$word]) === FALSE)
|
||||
$tokens[$word] = 1;
|
||||
else
|
||||
$tokens[$word] += 1;
|
||||
|
||||
}
|
||||
|
||||
# Return a list of all found tokens
|
||||
return $tokens;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates a token.
|
||||
*
|
||||
* @access private
|
||||
* @param string $token The token string.
|
||||
* @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
|
||||
*/
|
||||
|
||||
private function _is_valid($token)
|
||||
{
|
||||
|
||||
# Validate the size of the token
|
||||
|
||||
$len = strlen($token);
|
||||
|
||||
if($len < $this->config['min_size'] or $len > $this->config['max_size'])
|
||||
return FALSE;
|
||||
|
||||
# We may want to exclude pure numbers
|
||||
if($this->config['allow_numbers'] === FALSE) {
|
||||
if(preg_match($this->regexp['numbers'], $token) > 0)
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
# Token is okay
|
||||
return TRUE;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
Loading…
Add table
Add a link
Reference in a new issue