Config and comments added

This commit is contained in:
Michael Vogel 2019-04-19 09:15:00 +02:00
parent 13522dcc69
commit ba4b6d4d21

View file

@ -16,6 +16,7 @@
use Friendica\Core\Hook;
use Friendica\Core\System;
use Friendica\Core\Logger;
use Friendica\Core\Config;
function blockbots_install()
{
@ -30,11 +31,12 @@ function blockbots_uninstall()
function blockbots_check($a, $b)
{
if (empty($_SERVER['HTTP_USER_AGENT'])) {
exit;
return;
}
$request = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']];
// List of parts of user agent strings of known bots
$agents = ['ArchiveTeam ArchiveBot', 'SEMrushBot', '360Spider', 'Twitterbot', 'ltx71', 'AhrefsBot', 'YoudaoBot',
'Baiduspider', 'MSNBot', 'Googlebot', 'Sosospider', 'JikeSpider', 'BLEXBot', 'picmole', 'LexxeBot',
'NextGenSearchBot', 'spbot', 'SiteBot', 'MJ12bot', 'CrystalSemanticsBot', 'NetSeer crawler',
@ -45,7 +47,7 @@ function blockbots_check($a, $b)
'BDCbot', 'GrapeshotCrawler', 'WeSEE:Search', 'TurnitinBot', 'admantx', 'BUbiNG', 'YisouSpider',
'facebookexternalhit', 'ldspider', 'Researchscan', 'CCBot', 'Qwantify/Bleriot', 'PaperLiBot', 'bingbot',
'AppEngine-Google', 'Datanyze', 'evc-batch', 'HTTP Banner Detection', 'DuckDuckGo'];
// collection@infegy.com
foreach ($agents as $agent) {
if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) {
Logger::info('blocking user-agent', $request);
@ -53,6 +55,12 @@ function blockbots_check($a, $b)
}
}
// Activate to discover unknown bots
if (!Config::get('blockbots', 'discover_bots', false)) {
return;
}
// List of strings of known "good" agents
$agents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)',
'Micro.blog', 'Mastodon', 'hackney', 'GangGo', 'python/federation', 'GNU social', 'winHttp',
'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab',
@ -64,41 +72,36 @@ function blockbots_check($a, $b)
}
}
// Remove all known parts of user agent strings of regular browsers
$agent = blockbots_remove_known_parts($_SERVER['HTTP_USER_AGENT']);
// Empty means that it is no bot
if (empty($agent)) {
return;
}
// When it isn't empty, this is possible a bot
$request = array_merge(['rest' => $agent], $request);
Logger::info('Possible bot', $request);
}
function blockbots_remove_known_parts($agent)
{
$patterns = [
'\(Linux; Android [\d\.]*; [^\)]*\)',
'\(Linux; U; Android [\d\.]*; [^\)]*\)',
'\(iPhone; CPU [^\)]* like Mac OS X\)',
'\(iPad; CPU [^\)]* like Mac OS X\)',
'\(X11; Linux [\d_a-z]*\)',
'\(X11; Linux [\d_a-z]*; rv:[\d\.a-z]*\)',
// Search patterns of known agents
$patterns = ['\(Linux; Android [\d\.]*; [^\)]*\)', '\(Linux; U; Android [\d\.]*; [^\)]*\)',
'\(iPhone; CPU [^\)]* like Mac OS X\)', '\(iPad; CPU [^\)]* like Mac OS X\)',
'\(X11; Linux [\d_a-z]*\)', '\(X11; Linux [\d_a-z]*; rv:[\d\.a-z]*\)',
'\(X11; [a-z]*; Linux [\d_a-z]*; rv:[\d\.a-z]*\)',
'Chrome/[\d\.]*', 'Vivaldi/[\d\.]*',
'Firefox/[\d\.]*', 'rv:[\d\.a-z]*', 'AppleWebKit/[\d\.]*',
'Safari/[\d\.]*', 'Gecko/[\d\.]*', 'Quark/[\d\.]*',
'Chrome/[\d\.]*', 'Vivaldi/[\d\.]*', 'Firefox/[\d\.]*', 'rv:[\d\.a-z]*',
'AppleWebKit/[\d\.]*', 'Safari/[\d\.]*', 'Gecko/[\d\.]*', 'Quark/[\d\.]*',
'Chromium/[\d\.]*', 'Trident/[\d\.]*', 'Edge/[\d\.]*', 'Edg/[\d\.]*',
'Opera/[\d\.]*', 'Ceatles/[\d\.]*',
'UCBrowser/[\d\.]*', 'Navigator/[\d\.a-z]*', 'Mozilla/[\d\.]*',
'Goanna/[\d\.]*', 'PaleMoon/[\d\.]*',
'Windows NT [\d\.]*',
'Intel Mac OS X \d*_\d*_\d*', 'Intel Mac OS X [\d\.]*',
'Presto/[\d\.]*', 'MSIE [\d\.]*', 'Version/[\d\.]*',
'Version/[\d\.]*', '.NET CLR [\d\.]*', 'SLCC2', 'Media Center PC \d*\.\d*',
'Netscape/\d*\.\d*\.\d*', 'CrOS x86_64 [\d\.]*',
'Mobile/[\d\.a-z]*', 'Build/[\d\.a-z]*',
'FxiOS/[\d\.a-z]*', 'OPR/[\d\.]*', 'UBrowser/[\d\.]*'
];
'Opera/[\d\.]*', 'Ceatles/[\d\.]*', 'UCBrowser/[\d\.]*', 'Navigator/[\d\.a-z]*',
'Mozilla/[\d\.]*', 'Goanna/[\d\.]*', 'PaleMoon/[\d\.]*', 'Windows NT [\d\.]*',
'Intel Mac OS X \d*_\d*_\d*', 'Intel Mac OS X [\d\.]*', 'Presto/[\d\.]*',
'MSIE [\d\.]*', 'Version/[\d\.]*', 'Version/[\d\.]*', '.NET CLR [\d\.]*',
'SLCC2', 'Media Center PC \d*\.\d*', 'Netscape/\d*\.\d*\.\d*',
'CrOS x86_64 [\d\.]*', 'Mobile/[\d\.a-z]*', 'Build/[\d\.a-z]*',
'FxiOS/[\d\.a-z]*', 'OPR/[\d\.]*', 'UBrowser/[\d\.]*'];
do {
$oldagent = $agent;
@ -107,12 +110,11 @@ function blockbots_remove_known_parts($agent)
}
} while ($agent != $oldagent);
// Some more known parts that we can remove
$search = ['KHTML', 'like Gecko', 'WOW64', 'x86_64', 'X11', 'Linux', 'compatible',
'Macintosh', 'x64', 'Win64', 'Mobile', 'i686', 'en-US', 'zh-CN', ' de ',
' fr ', ' U ', 'Google Favicon', 'Windows',
'googleweblight',' en-us ',
'Win 9x 4.90', ' SG ', 'Intel Mac OS X x.y',
' wv ', 'PPC Mac OS X Mach-O', ' pre '];
' fr ', ' U ', 'Google Favicon', 'Windows', 'googleweblight',' en-us ',
'Win 9x 4.90', ' SG ', 'Intel Mac OS X x.y', ' wv ', 'PPC Mac OS X Mach-O', ' pre '];
do {
$oldtext = $agent;
$agent = ' ' . trim(str_replace($search, ' ', $agent), ' ();:.,/') . ' ';