forked from friendica/friendica-addons
Compare commits
6 Commits
develop
...
blockbot-l
Author | SHA1 | Date |
---|---|---|
Michael Vogel | bc71ee9ac8 | |
Michael Vogel | 9e97a74dff | |
Michael Vogel | ba4b6d4d21 | |
Michael Vogel | 13522dcc69 | |
Michael Vogel | 5752017287 | |
Michael Vogel | e359013cac |
|
@ -0,0 +1,135 @@
|
|||
<?php
|
||||
/**
|
||||
* Name: blockbots
|
||||
* Description: Block all bots - even the ones who don't respect robots.txt
|
||||
* Version: 0.1
|
||||
* Author: Michael Vogel <https://pirati.ca/profile/heluecht>
|
||||
*
|
||||
* There are bots that ignore robots.txt, see for example:
|
||||
* https://www.archiveteam.org/index.php?title=Robots.txt
|
||||
*
|
||||
* Additionally the list contains an exhausting list of other known bots.
|
||||
*
|
||||
* When a bot is detected, the system quits with error "403 Forbidden"
|
||||
*/
|
||||
|
||||
use Friendica\Core\Hook;
|
||||
use Friendica\Core\System;
|
||||
use Friendica\Core\Logger;
|
||||
use Friendica\Core\Config;
|
||||
use Jaybizzle\CrawlerDetect\CrawlerDetect;
|
||||
|
||||
function blockbots_install()
|
||||
{
|
||||
Hook::register('head', 'addon/blockbots/blockbots.php', 'blockbots_check');
|
||||
}
|
||||
|
||||
function blockbots_uninstall()
|
||||
{
|
||||
Hook::unregister('head', 'addon/blockbots/blockbots.php', 'blockbots_check');
|
||||
}
|
||||
|
||||
function blockbots_check($a, $b)
|
||||
{
|
||||
if (empty($_SERVER['HTTP_USER_AGENT'])) {
|
||||
return;
|
||||
}
|
||||
|
||||
$request = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']];
|
||||
|
||||
$CrawlerDetect = new CrawlerDetect;
|
||||
|
||||
if ($CrawlerDetect->isCrawler($_SERVER['HTTP_USER_AGENT'])) {
|
||||
Logger::info('blocking crawler', $request);
|
||||
System::httpExit(403);
|
||||
}
|
||||
|
||||
// List of parts of user agent strings of known bots
|
||||
$agents = ['ArchiveTeam ArchiveBot', 'SEMrushBot', '360Spider', 'Twitterbot', 'ltx71', 'AhrefsBot', 'YoudaoBot',
|
||||
'Baiduspider', 'MSNBot', 'Googlebot', 'Sosospider', 'JikeSpider', 'BLEXBot', 'picmole', 'LexxeBot',
|
||||
'NextGenSearchBot', 'spbot', 'SiteBot', 'MJ12bot', 'CrystalSemanticsBot', 'NetSeer crawler',
|
||||
'trovitBot', 'DotBot', 'Ezooms', 'discobot', 'Jyxobot', 'sogou', 'sistrix', 'heritrix', 'GarlikCrawler',
|
||||
'NerdByNature.Bot', 'DTS Agent', 'psbot', 'WBSearchBot', 'AddThis.com', 'ia_archiver', 'proximic',
|
||||
'discoverybot', 'bl.uk_lddc_bot', 'IstellaBot', 'seokicks', 'UnisterBot', 'Bender', 'wotbox',
|
||||
'Yasni', 'netEstate NE Crawler', 'Exabot', 'Pixray-Seeker', 'Linguee', 'integromedb', 'SearchmetricsBot',
|
||||
'BDCbot', 'GrapeshotCrawler', 'WeSEE:Search', 'TurnitinBot', 'admantx', 'BUbiNG', 'YisouSpider',
|
||||
'facebookexternalhit', 'ldspider', 'Researchscan', 'CCBot', 'Qwantify/Bleriot', 'PaperLiBot', 'bingbot',
|
||||
'AppEngine-Google', 'Datanyze', 'evc-batch', 'HTTP Banner Detection', 'DuckDuckGo', 'QwantBrowser',
|
||||
'Hatena-Favicon', 'Dispatch/', 'Scoop.it'];
|
||||
|
||||
foreach ($agents as $agent) {
|
||||
if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) {
|
||||
Logger::info('additional blocking', $request);
|
||||
System::httpExit(403);
|
||||
}
|
||||
}
|
||||
|
||||
// Activate to discover unknown bots
|
||||
if (!Config::get('blockbots', 'discover_bots', false)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// List of strings of known "good" agents
|
||||
$agents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)',
|
||||
'Micro.blog', 'Mastodon', 'hackney', 'GangGo', 'python/federation', 'GNU social', 'winHttp',
|
||||
'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab',
|
||||
'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink'];
|
||||
|
||||
foreach ($agents as $agent) {
|
||||
if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Remove all known parts of user agent strings of regular browsers
|
||||
$agent = blockbots_remove_known_parts($_SERVER['HTTP_USER_AGENT']);
|
||||
|
||||
// Empty means that it is no bot
|
||||
if (empty($agent)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// When it isn't empty, this is possible a bot
|
||||
$request = array_merge(['rest' => $agent], $request);
|
||||
Logger::info('Possible bot', $request);
|
||||
}
|
||||
|
||||
function blockbots_remove_known_parts($agent)
|
||||
{
|
||||
// Search patterns of known agents
|
||||
$patterns = ['\(Linux; Android [\d\.].*; [^\)].*\)', '\(Linux; U; Android [\d\.].*; [^\)].*\)',
|
||||
'\(iPhone; CPU [^\)].* like Mac OS X\)', '\(iPad; CPU [^\)].* like Mac OS X\)',
|
||||
'\(X11; Linux [\d_a-z].*\)', '\(X11; Linux [\d_a-z].*; rv:[\d\.a-z].*\)',
|
||||
'\(X11; [a-z].*; Linux [\d_a-z].*; rv:[\d\.a-z].*\)',
|
||||
'Chrome/[\d\.].*', 'Vivaldi/[\d\.].*', 'Firefox/[\d\.].*', 'rv:[\d\.a-z].*',
|
||||
'AppleWebKit/[\d\.].*', 'Safari/[\d\.].*', 'Gecko/[\d\.].*', 'Quark/[\d\.].*',
|
||||
'Chromium/[\d\.].*', 'Trident/[\d\.].*', 'Edge/[\d\.].*', 'Edg/[\d\.].*',
|
||||
'Opera/[\d\.].*', 'Ceatles/[\d\.].*', 'UCBrowser/[\d\.].*', 'Navigator/[\d\.a-z].*',
|
||||
'Mozilla/[\d\.].*', 'Goanna/[\d\.].*', 'PaleMoon/[\d\.].*', 'Windows NT [\d\.].*',
|
||||
'Intel Mac OS X \d*_\d*_\d*', 'Intel Mac OS X [\d\.].*', 'Presto/[\d\.].*',
|
||||
'MSIE [\d\.].*', 'Version/[\d\.].*', 'Version/[\d\.].*', '.NET CLR [\d\.].*',
|
||||
'SLCC2', 'Media Center PC \d*\.\d*', 'Netscape/\d*\.\d*\.\d*',
|
||||
'CrOS x86_64 [\d\.].*', 'Mobile/[\d\.a-z].*', 'Build/[\d\.a-z].*',
|
||||
'FxiOS/[\d\.a-z].*', 'OPR/[\d\.].*', 'baidubrowser/[\d\.].*', 'UBrowser/[\d\.].*',
|
||||
'Android [\d\.].*'];
|
||||
|
||||
do {
|
||||
$oldagent = $agent;
|
||||
foreach ($patterns as $pattern) {
|
||||
$agent = preg_replace('=(.*?)' . $pattern . '(.*?)=i', '$1$2', $agent);
|
||||
}
|
||||
} while ($agent != $oldagent);
|
||||
|
||||
// Some more known parts that we can remove
|
||||
$search = ['KHTML', 'like Gecko', 'WOW64', 'x86_64', 'X11', 'Linux', 'compatible',
|
||||
'Macintosh', 'x64', 'Win64', 'Mobile', 'i686', 'en-US', 'zh-CN', ' de ',
|
||||
' fr ', ' U ', 'Google Favicon', 'Windows', 'googleweblight',' en-us ',
|
||||
'Win 9x 4.90', ' SG ', 'Intel Mac OS X x.y', ' wv ', 'PPC Mac OS X Mach-O', ' pre ',
|
||||
'(Baidu; P1 5.1)', 'T7/7.5'];
|
||||
do {
|
||||
$oldtext = $agent;
|
||||
$agent = ' ' . trim(str_replace($search, ' ', $agent), ' ();:.,/') . ' ';
|
||||
} while ($oldtext != $agent);
|
||||
|
||||
return trim($agent);
|
||||
}
|
Loading…
Reference in New Issue