Compare commits

...

6 Commits

Author SHA1 Message Date
Michael Vogel bc71ee9ac8 Added external library 2019-04-20 16:40:26 +02:00
Michael Vogel 9e97a74dff Improved detection 2019-04-19 13:25:39 +02:00
Michael Vogel ba4b6d4d21 Config and comments added 2019-04-19 09:15:00 +02:00
Michael Vogel 13522dcc69 Improved client detection 2019-04-19 08:23:01 +02:00
Michael Vogel 5752017287 Separated cleaning function, added some more patterns 2019-04-18 08:20:09 +02:00
Michael Vogel e359013cac Blockbot: Completely block access for crawlers 2019-04-18 07:45:30 +02:00
1 changed files with 135 additions and 0 deletions

135
blockbots/blockbots.php Normal file
View File

@ -0,0 +1,135 @@
<?php
/**
* Name: blockbots
* Description: Block all bots - even the ones who don't respect robots.txt
* Version: 0.1
* Author: Michael Vogel <https://pirati.ca/profile/heluecht>
*
* There are bots that ignore robots.txt, see for example:
* https://www.archiveteam.org/index.php?title=Robots.txt
*
* Additionally, the addon contains an exhaustive list of other known bots.
*
* When a bot is detected, the system quits with error "403 Forbidden"
*/
use Friendica\Core\Hook;
use Friendica\Core\System;
use Friendica\Core\Logger;
use Friendica\Core\Config;
use Jaybizzle\CrawlerDetect\CrawlerDetect;
/**
 * Registers blockbots_check() from this addon file on the 'head' hook.
 *
 * NOTE(review): presumably invoked by the addon loader on installation - confirm.
 */
function blockbots_install()
{
    $addonFile = 'addon/blockbots/blockbots.php';

    Hook::register('head', $addonFile, 'blockbots_check');
}
/**
 * Removes the 'head' hook registration of blockbots_check() for this addon file.
 *
 * NOTE(review): presumably invoked by the addon loader on removal - confirm.
 */
function blockbots_uninstall()
{
    $addonFile = 'addon/blockbots/blockbots.php';

    Hook::unregister('head', $addonFile, 'blockbots_check');
}
/**
 * Callback registered on the 'head' hook: rejects requests from known crawlers.
 *
 * Requests without a user agent are ignored. A request whose user agent is
 * recognised by CrawlerDetect, or that contains one of the listed bot
 * fragments, is logged and answered through System::httpExit(403).
 *
 * When the config value blockbots/discover_bots is enabled, any remaining
 * user agent that is neither a known "good" agent nor a plain browser
 * signature is logged as a possible bot (it is not blocked).
 *
 * @param mixed $a Unused
 * @param mixed $b Unused
 */
function blockbots_check($a, $b)
{
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return;
    }

    $userAgent = $_SERVER['HTTP_USER_AGENT'];

    // Context that is attached to every log entry
    $request = ['agent' => $userAgent, 'uri' => $_SERVER['REQUEST_URI']];

    // First line of defence: the external crawler detection library
    $detector = new CrawlerDetect();
    if ($detector->isCrawler($userAgent)) {
        Logger::info('blocking crawler', $request);
        System::httpExit(403);
    }

    // Fragments of user agent strings of known bots
    $blockedFragments = ['ArchiveTeam ArchiveBot', 'SEMrushBot', '360Spider', 'Twitterbot', 'ltx71', 'AhrefsBot', 'YoudaoBot',
        'Baiduspider', 'MSNBot', 'Googlebot', 'Sosospider', 'JikeSpider', 'BLEXBot', 'picmole', 'LexxeBot',
        'NextGenSearchBot', 'spbot', 'SiteBot', 'MJ12bot', 'CrystalSemanticsBot', 'NetSeer crawler',
        'trovitBot', 'DotBot', 'Ezooms', 'discobot', 'Jyxobot', 'sogou', 'sistrix', 'heritrix', 'GarlikCrawler',
        'NerdByNature.Bot', 'DTS Agent', 'psbot', 'WBSearchBot', 'AddThis.com', 'ia_archiver', 'proximic',
        'discoverybot', 'bl.uk_lddc_bot', 'IstellaBot', 'seokicks', 'UnisterBot', 'Bender', 'wotbox',
        'Yasni', 'netEstate NE Crawler', 'Exabot', 'Pixray-Seeker', 'Linguee', 'integromedb', 'SearchmetricsBot',
        'BDCbot', 'GrapeshotCrawler', 'WeSEE:Search', 'TurnitinBot', 'admantx', 'BUbiNG', 'YisouSpider',
        'facebookexternalhit', 'ldspider', 'Researchscan', 'CCBot', 'Qwantify/Bleriot', 'PaperLiBot', 'bingbot',
        'AppEngine-Google', 'Datanyze', 'evc-batch', 'HTTP Banner Detection', 'DuckDuckGo', 'QwantBrowser',
        'Hatena-Favicon', 'Dispatch/', 'Scoop.it'];

    foreach ($blockedFragments as $fragment) {
        // Case-insensitive substring match
        if (stripos($userAgent, $fragment) !== false) {
            Logger::info('additional blocking', $request);
            System::httpExit(403);
        }
    }

    // Everything below only runs when bot discovery is switched on
    if (!Config::get('blockbots', 'discover_bots', false)) {
        return;
    }

    // Fragments of known "good" agents that must never be reported
    $trustedFragments = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)',
        'Micro.blog', 'Mastodon', 'hackney', 'GangGo', 'python/federation', 'GNU social', 'winHttp',
        'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab',
        'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink'];

    foreach ($trustedFragments as $fragment) {
        if (stripos($userAgent, $fragment) !== false) {
            return;
        }
    }

    // Strip everything that belongs to a regular browser signature
    $remainder = blockbots_remove_known_parts($userAgent);

    // Something is left over, so this is possibly a bot: log it for review
    if (!empty($remainder)) {
        Logger::info('Possible bot', ['rest' => $remainder] + $request);
    }
}
/**
 * Strips every known browser and platform token from a user agent string.
 *
 * The text that survives is not part of a regular browser signature and can
 * therefore be inspected as a hint for an unknown bot.
 *
 * @param string $agent Raw user agent string
 * @return string The unrecognised remainder, '' when every part was recognised
 */
function blockbots_remove_known_parts($agent)
{
    $agent = (string) $agent;

    // Each pattern matches exactly ONE token (name plus its version/value).
    // The quantifier sits on the character class ("[\d\.]*"); writing
    // "[\d\.].*" instead would swallow the rest of the string and hide any
    // bot name that follows a known token such as "Mozilla/5.0".
    $patterns = ['\(Linux; Android [\d\.]*; [^\)]*\)', '\(Linux; U; Android [\d\.]*; [^\)]*\)',
        '\(iPhone; CPU [^\)]* like Mac OS X\)', '\(iPad; CPU [^\)]* like Mac OS X\)',
        '\(X11; Linux [\d_a-z]*\)', '\(X11; Linux [\d_a-z]*; rv:[\d\.a-z]*\)',
        '\(X11; [a-z]*; Linux [\d_a-z]*; rv:[\d\.a-z]*\)',
        'Chrome/[\d\.]*', 'Vivaldi/[\d\.]*', 'Firefox/[\d\.]*', 'rv:[\d\.a-z]*',
        'AppleWebKit/[\d\.]*', 'Safari/[\d\.]*', 'Gecko/[\d\.]*', 'Quark/[\d\.]*',
        'Chromium/[\d\.]*', 'Trident/[\d\.]*', 'Edge/[\d\.]*', 'Edg/[\d\.]*',
        'Opera/[\d\.]*', 'Ceatles/[\d\.]*', 'UCBrowser/[\d\.]*', 'Navigator/[\d\.a-z]*',
        'Mozilla/[\d\.]*', 'Goanna/[\d\.]*', 'PaleMoon/[\d\.]*', 'Windows NT [\d\.]*',
        'Intel Mac OS X \d*_\d*_\d*', 'Intel Mac OS X [\d\.]*', 'Presto/[\d\.]*',
        'MSIE [\d\.]*', 'Version/[\d\.]*', '\.NET CLR [\d\.]*',
        'SLCC2', 'Media Center PC \d*\.\d*', 'Netscape/\d*\.\d*\.\d*',
        'CrOS x86_64 [\d\.]*', 'Mobile/[\d\.a-z]*', 'Build/[\d\.a-z]*',
        'FxiOS/[\d\.a-z]*', 'OPR/[\d\.]*', 'baidubrowser/[\d\.]*', 'UBrowser/[\d\.]*',
        'Android [\d\.]*'];

    // Repeat until stable: removing one token can expose another match
    do {
        $previous = $agent;
        foreach ($patterns as $pattern) {
            $stripped = preg_replace('=' . $pattern . '=i', '', $agent);
            // preg_replace() returns null on a PCRE error; keep the text then
            if ($stripped !== null) {
                $agent = $stripped;
            }
        }
    } while ($agent !== $previous);

    // Some more known parts that we can remove (plain, case-sensitive strings)
    $search = ['KHTML', 'like Gecko', 'WOW64', 'x86_64', 'X11', 'Linux', 'compatible',
        'Macintosh', 'x64', 'Win64', 'Mobile', 'i686', 'en-US', 'zh-CN', ' de ',
        ' fr ', ' U ', 'Google Favicon', 'Windows', 'googleweblight', ' en-us ',
        'Win 9x 4.90', ' SG ', 'Intel Mac OS X x.y', ' wv ', 'PPC Mac OS X Mach-O', ' pre ',
        '(Baidu; P1 5.1)', 'T7/7.5'];

    // The text is padded with blanks so that the space-delimited entries
    // (' de ', ' U ', ...) also match at the very start and end. Leftover
    // punctuation is trimmed from both ends on every pass.
    do {
        $previous = $agent;
        $agent = ' ' . trim(str_replace($search, ' ', $agent), ' ();:.,/') . ' ';
    } while ($previous !== $agent);

    return trim($agent);
}