From 741c142a0570dbf420eb0e6534e9d6acbd552c92 Mon Sep 17 00:00:00 2001
From: Michael
Date: Tue, 16 Apr 2024 05:00:23 +0000
Subject: [PATCH] Blockbot: New user agents, storage for possible bot user agents

---
 blockbot/blockbot.php | 49 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php
index e11f23f7..19065fce 100644
--- a/blockbot/blockbot.php
+++ b/blockbot/blockbot.php
@@ -13,6 +13,7 @@ use Friendica\DI;
 use Jaybizzle\CrawlerDetect\CrawlerDetect;
 use Friendica\Core\Logger;
 use Friendica\Core\Renderer;
+use Friendica\Core\System;
 use Friendica\Network\HTTPException\ForbiddenException;
 
 require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
@@ -87,7 +88,11 @@ function blockbot_init_1()
 		'Facebot', 'Googlebot-Video/', 'msnbot/', 'Offline Explorer/', 'YandexNews/', 'msnbot-media/',
 		'EmailWolf', 'Download Demon/', 'FeedFetcher-Google;', 'WebCopier', '+ONB_Bot_Btrix',
 		'scoopit-crawler/', 'ia_archiver', 'Quora-Bot/', 'WebwikiBot/', 'FullStoryBot/',
-		'wpbot/', 'SearchExpress', 'DuckDuckBot/', 'Google Web Preview',
+		'wpbot/', 'SearchExpress', 'DuckDuckBot/', 'Google Web Preview', 'Amazonbot/',
+		'ImagesiftBot;', 'webtech/', 'Bloglines/', 'Netcraft Web Server Survey', 'Spawning-AI',
+		'NLUX_IAHarvester/', 'bots.retroverse.social', 'RSSingBot', 'Chrome-Lighthouse',
+		't3versionsBot/', 'scaninfo@paloaltonetworks.com', 'intelx.io_bot', 'Google-Read-Aloud',
+		'bot Mozilla',
 	];
 
 	if (DI::config()->get('blockbot', 'block_gab')) {
@@ -102,7 +107,7 @@ function blockbot_init_1()
 		'FedditLemmyverseCrawler/', 'lemmy-explorer-crawler/', 'URIports Validator',
 		'rss-is-dead.lol web bot;', 'fedistatsCrawler/', 'W3C_CSS_Validator_JFouffa/', 'IABot/',
 		'Slackbot 1', 'BeeperBot/', 'Matrix-Media-Repo/', 'P3P Validator',
-		'KeybaseBot;',
+		'KeybaseBot;', 'Observatory/', 'CSSCheck/', 'FeedBurner/', 'rss-is-dead.lol feed bot;'
 	];
 
 	if (!DI::config()->get('blockbot', 'good_crawlers')) {
@@ -126,7 +131,8 @@ function blockbot_init_1()
 
 	// HTTP Libraries
 	$http_libraries = ['ReactorNetty/', 'GuzzleHttp/', 'Embed PHP library', 'python-urllib3/',
-		'EventMachine HttpClient', 'HTMLParser/'
+		'EventMachine HttpClient', 'HTMLParser/', 'node-fetch', 'fasthttp', 'python-httpx/',
+		'Fuzz Faster U Fool', 'gvfs/', 'Embarcadero URI Client/', 'grub-client'
 	];
 
 	if (!DI::config()->get('blockbot', 'http_libraries')) {
@@ -165,6 +171,7 @@ function blockbot_init_1()
 		'camo-rs asset proxy', 'gotosocial/', 'incestoma ', 'SpaceCowboys Android RSS Reader',
 		'NewsBlur Feed Finder', 'Lemmy/', 'enby-town/', 'rss2tg bot;', '; HTTrack ',
 		'MbinBot', 'kbinBot', 'Pixelfed/', 'NewsBlur Feed Fetcher', 'NewsBlur Page Fetcher',
+		'facebookexternalua', 'FreshRSS/', 'BookWyrm/', 'Reeder/', 'microblogpub/',
 	];
 
 	if (blockbot_match($agents)) {
@@ -172,10 +179,46 @@ function blockbot_init_1()
 		return;
 	}
 
+	blockbot_save('blocked-bot', $_SERVER['HTTP_USER_AGENT']);
+
 	logger::notice('Blocked bot', $logdata);
 	throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at https://github.com/friendica/friendica');
 }
 
+/**
+ * Increments the hit counter of a user agent in a DBA file below the system temp path.
+ *
+ * The counter is pure bookkeeping: when the "dba" extension is missing or the
+ * database cannot be opened, the call is silently skipped so that it never
+ * interferes with the bot blocking of the caller.
+ *
+ * @param string $database  File name of the DBA database inside the temp path
+ * @param string $userAgent User agent string, used as the key of the counter
+ * @return void
+ */
+function blockbot_save($database, $userAgent)
+{
+	if (!function_exists('dba_open')) {
+		return;
+	}
+
+	// "c": read/write, create if missing; "l": lock via a separate .lck file
+	$resource = dba_open(System::getTempPath() . '/' . $database, 'cl');
+	if ($resource === false) {
+		// dba_open() returns false on failure; handing that to dba_fetch() would raise an error
+		return;
+	}
+
+	$result = dba_fetch($userAgent, $resource);
+	if ($result === false) {
+		dba_insert($userAgent, '1', $resource);
+	} else {
+		// Stored values are strings, so convert explicitly before and after counting
+		dba_replace($userAgent, (string)((int)$result + 1), $resource);
+	}
+	dba_close($resource);
+}
+
 function blockbot_match(array $agents)
 {
 	foreach ($agents as $agent) {
-- 
2.52.0