2019-04-20 14:15:45 +02:00
< ? php
/**
 * Name: blockbot
 * Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
 * Version: 0.2
 * Author: Philipp Holzer <admin@philipp.info>
 * Author: Michael Vogel <https://pirati.ca/profile/heluecht>
 *
 */
use Friendica\Core\Hook ;
2020-01-18 22:07:06 +01:00
use Friendica\DI ;
2019-04-20 14:15:45 +02:00
use Jaybizzle\CrawlerDetect\CrawlerDetect ;
2019-04-27 13:51:44 +02:00
use Friendica\Core\Logger ;
2019-07-28 09:49:30 +02:00
use Friendica\Core\Renderer ;
2021-11-04 21:32:16 +01:00
use Friendica\Network\HTTPException\ForbiddenException ;
2019-04-20 14:15:45 +02:00
2019-04-21 12:35:33 +02:00
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php' ;
2022-06-23 07:16:22 +02:00
/**
 * Installs the addon by registering its hook handler.
 *
 * Binds the 'init_1' hook to blockbot_init_1() defined in this file.
 */
function blockbot_install()
{
    Hook::register('init_1', __FILE__, 'blockbot_init_1');
}
2023-01-14 03:16:09 +01:00
/**
 * Builds the addon's admin settings form.
 *
 * Loads the 'admin.tpl' template from 'addon/blockbot/' and fills it with one
 * entry per setting. Each entry is an array of: field name, translated label,
 * current value from the 'blockbot' config category, translated help text.
 *
 * @param string $o Overwritten with the result of Renderer::replaceMacros().
 */
function blockbot_addon_admin(string &$o)
{
    $t = Renderer::getMarkupTemplate('admin.tpl', 'addon/blockbot/');

    $o = Renderer::replaceMacros($t, [
        '$submit' => DI::l10n()->t('Save Settings'),
        '$good_crawlers' => [
            'good_crawlers',
            DI::l10n()->t('Allow "good" crawlers'),
            DI::config()->get('blockbot', 'good_crawlers'),
            DI::l10n()->t("Don't block fediverse crawlers, relay servers and other bots with good purposes."),
        ],
        '$socialmedia_agents' => [
            'socialmedia_agents',
            DI::l10n()->t('Allow preview agents'),
            DI::config()->get('blockbot', 'socialmedia_agents'),
            DI::l10n()->t("Don't block agents from social media systems that want to generate preview data for links that had been set by their users."),
        ],
        '$http_libraries' => [
            'http_libraries',
            DI::l10n()->t('Allow generic HTTP libraries'),
            DI::config()->get('blockbot', 'http_libraries'),
            DI::l10n()->t("Don't block agents from generic HTTP libraries that could be used for good or for bad and that currently can't be traced back to any known Fediverse project."),
        ],
        '$block_gab' => [
            'block_gab',
            DI::l10n()->t('Block GabSocial'),
            DI::config()->get('blockbot', 'block_gab'),
            DI::l10n()->t('Block the software GabSocial. This will block every access for that software. You can block dedicated gab instances in the blocklist settings in the admin section.'),
        ],
        '$training' => [
            'training',
            DI::l10n()->t('Training mode'),
            DI::config()->get('blockbot', 'training'),
            DI::l10n()->t("Activates the training mode. This is only meant for developing purposes. Don't activate this on a production machine. This can cut communication with some systems."),
        ],
    ]);
}
2023-01-14 03:16:09 +01:00
/**
 * Persists the admin settings form.
 *
 * For every known setting, stores the submitted $_POST value in the
 * 'blockbot' config category, or false when the field was not submitted.
 */
function blockbot_addon_admin_post()
{
    // Same keys, same order as the fields rendered by blockbot_addon_admin().
    $settings = ['good_crawlers', 'socialmedia_agents', 'http_libraries', 'block_gab', 'training'];

    foreach ($settings as $setting) {
        DI::config()->set('blockbot', $setting, $_POST[$setting] ?? false);
    }
}
2023-01-14 03:16:09 +01:00
/**
 * Hook handler for 'init_1': rejects requests from unwanted user agents.
 *
 * Flow:
 *  1. Requests without a user agent header are let through.
 *  2. Three optional groups (good crawlers, social media preview agents,
 *     generic HTTP libraries) are either added to the block list or, when the
 *     matching config switch is enabled, let through immediately.
 *  3. Any user agent matching the resulting block list is rejected.
 *  4. In training mode only, CrawlerDetect is consulted as well and anything
 *     it flags that is not on the known-harmless list is logged and rejected.
 *
 * @throws ForbiddenException When the user agent is considered a blocked bot.
 */
function blockbot_init_1()
{
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return;
    }

    // REQUEST_URI is only used for logging; fall back to '' when it is not set.
    $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI'] ?? ''];

    // List of known unwanted crawlers.
    $agents = [
        'SemrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/',
        'Diffbot/', 'YisouSpider', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/',
        'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider',
        'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/',
        'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider',
        'AwarioRssBot/', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/',
        'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot',
        'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/',
        'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/',
        'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
        'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/',
        'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/',
        'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot',
        'ArchiveTeam ArchiveBot/', 'yacybot', 'https://developers.google.com/+/web/snippet/',
        'Scrapy/', 'MJ12bot/', 'DotBot/', 'Pinterestbot/', 'Jooblebot/',
        'Cliqzbot/', 'YaK/', 'Mediatoolkitbot', 'Snacktory', 'FunWebProducts', 'oBot/',
        '7Siters/', 'KOCMOHABT', 'Google-SearchByImage', 'FemtosearchBot/',
        'HubSpot Crawler', 'DomainStatsBot/', 'Re-re Studio', 'AwarioSmartBot/',
        'DNSResearchBot/', 'PetalBot;', 'Nmap Scripting Engine;',
        'Google-Apps-Script; beanserver;', 'woorankreview/', 'Seekport Crawler;', 'AHC/',
        'Semanticbot/', 'XoviOnpageCrawler;', 'Pinterest/',
        'GetHPinfo.com-Bot/', 'BoardReader Favicon Fetcher', 'Google-Adwords-Instant', 'newspaper/',
        'YurichevBot/', 'Crawling at Home Project', 'InfoTigerBot/', 'AdIdxBot/',
        'MicrosoftPreview/', 'masscan/', 'Timpibot/', 'everyfeed-spider/', 'AndroidDownloadManager/',
        'WebZIP/', 'WDG_Validator/', 'Screaming Frog SEO Spider/', ' Bytespider;', 'ISSCyberRiskCrawler/',
        'BitSightBot/', 'ev-crawler/', 'CensysInspect/1.1', 'Protopage/', 'Gaisbot/', 'WellKnownBot/',
        'SuperBot/', 'Googlebot-Mobile/', 'GPTBot/', 'GenomeCrawlerd/', '2ip bot/', 'Ocarinabot',
        'Yahoo! Slurp;', 'AdsBot-Google', 'Gregarius/', 'FAST-WebCrawler/', 'Xenu Link Sleuth/',
        'Ask Jeeves'
    ];

    if (DI::config()->get('blockbot', 'block_gab')) {
        $agents[] = 'GabSocial/';
    }

    // List of "good" crawlers, mostly from the fediverse.
    $good_agents = [
        'fediverse.space crawler', 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0',
        'Social-Relay/', 'Test Certificate Info', 'Uptimebot/', 'GNUSocialBot', 'UptimeRobot/',
        'PTST/', 'Zabbix', 'Poduptime/', 'FediFetcher', 'lemmy-stats-crawler',
        'FedditLemmyverseCrawler/', 'lemmy-explorer-crawler/', 'URIports Validator',
        'rss-is-dead.lol web bot;', 'fedistatsCrawler/', 'W3C_CSS_Validator_JFouffa/',
        'IABot/', 'Slackbot 1',
    ];

    // Either block the group as well, or let a matching agent through right away.
    if (!DI::config()->get('blockbot', 'good_crawlers')) {
        $agents = array_merge($agents, $good_agents);
    } elseif (blockbot_match($good_agents)) {
        return;
    }

    // List of agents from social media systems that fetch preview data via Open Graph or Twitter cards.
    $socialmedia_agents = [
        'Twitterbot', 'facebookexternalhit/', 'SkypeUriPreview Preview/',
        'TelegramBot', 'WhatsApp/', 'github-camo', 'Bluesky Cardyb/', 'XING-contenttabreceiver/',
        'LinkedInBot/', 'Instagram ', 'Synapse (bot; ', 'Discordbot/', 'SummalyBot/',
        'Slackbot-LinkExpanding', 'Slack-ImgProxy',
    ];

    if (!DI::config()->get('blockbot', 'socialmedia_agents')) {
        $agents = array_merge($agents, $socialmedia_agents);
    } elseif (blockbot_match($socialmedia_agents)) {
        return;
    }

    // Generic HTTP libraries.
    $http_libraries = [
        'ReactorNetty/', 'GuzzleHttp/', 'Embed PHP library', 'python-urllib3/',
        'EventMachine HttpClient', 'HTMLParser/'
    ];

    if (!DI::config()->get('blockbot', 'http_libraries')) {
        $agents = array_merge($agents, $http_libraries);
    } elseif (blockbot_match($http_libraries)) {
        return;
    }

    if (blockbot_match($agents)) {
        throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at https://github.com/friendica/friendica');
    }

    // This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
    if (!DI::config()->get('blockbot', 'training')) {
        return;
    }

    $crawlerDetect = new CrawlerDetect();

    if (!$crawlerDetect->isCrawler()) {
        Logger::debug('Good user agent detected', $logdata);
        return;
    }

    // List of known "good" agents, mostly used by Fediverse systems, feed readers, ...
    // CrawlerDetect flagged the request; a match here is logged as a false positive and let through.
    $known_good_agents = [
        'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/',
        'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/',
        'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
        'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma',
        'Dispatch/', 'Ruby', 'Java/', 'libwww-perl/', 'Mastodon/', 'FeedlyApp/',
        'lua-resty-http/', 'Tiny Tiny RSS/', 'Wget/', 'PostmanRuntime/',
        'W3C_Validator/', 'NetNewsWire', 'FeedValidator/', 'theoldreader.com', 'axios/',
        'Paw/', 'PeerTube/', 'fedi.inex.dev', 'FediDB/', 'index.community crawler',
        'Slackbot-LinkExpanding', 'Firefish/', 'Takahe/', 'Akkoma ', 'Misskey/', 'Lynx/',
        'camo-rs asset proxy', 'gotosocial/', 'incestoma ', 'SpaceCowboys Android RSS Reader',
        'NewsBlur Feed Finder', 'Lemmy/', 'enby-town/', 'rss2tg bot;', '; HTTrack ',
        'MbinBot', 'kbinBot'
    ];

    if (blockbot_match($known_good_agents)) {
        Logger::info('False positive', $logdata);
        return;
    }

    Logger::notice('Blocked bot', $logdata);
    throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at https://github.com/friendica/friendica');
}
/**
 * Checks whether the current request's user agent contains any of the given patterns.
 *
 * The comparison is a case-insensitive substring search against
 * $_SERVER['HTTP_USER_AGENT']. A missing header is treated as an empty string,
 * and empty patterns are skipped so they can never match every request.
 *
 * @param array $agents List of user agent fragments to look for.
 *
 * @return bool True on the first fragment found in the user agent, false otherwise.
 */
function blockbot_match(array $agents): bool
{
    $userAgent = $_SERVER['HTTP_USER_AGENT'] ?? '';

    foreach ($agents as $agent) {
        if ($agent === '') {
            continue;
        }

        // Compare against false explicitly: a truthiness test on stristr()
        // misses a match whose returned tail is the string "0".
        if (stripos($userAgent, $agent) !== false) {
            return true;
        }
    }

    return false;
}