Blockbot: reworked user agent parser

This commit is contained in:
Michael 2024-04-22 01:38:08 +00:00
parent 741c142a05
commit 2c064a559e

* Name: blockbot
* Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
* Version: 0.2
* Version: 1.0
* Author: Philipp Holzer <>
* Author: Michael Vogel <>
use Friendica\Core\Renderer;
use Friendica\Core\System;
use Friendica\Network\HTTPException\ForbiddenException;
use phpseclib3\System\SSH\Agent;
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
DI::config()->set('blockbot', 'training', $_POST['training'] ?? false);
/**
 * Rejects the current request with a "forbidden" HTTP exception.
 *
 * NOTE(review): the message ends with "create an issue at" without naming a
 * target - confirm whether a URL is missing from the text.
 *
 * @return void This function never returns normally.
 * @throws ForbiddenException Always.
 */
function blockbot_reject()
{
	throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at');
}
/**
 * Inspects the user agent of the current request and classifies it with the
 * blockbot_is_*() helpers and the CrawlerDetect library.
 *
 * NOTE(review): in this view the block delimiters and the bodies of most
 * branches are not visible, so the comments below describe only the visible
 * statements and do not state what each branch finally does.
 */
function blockbot_init_1()
// Requests without a user agent header are checked first (branch body not visible here).
if (empty($_SERVER['HTTP_USER_AGENT'])) {
$logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']];
// Ask CrawlerDetect whether it classifies the current request as a crawler.
$crawlerDetect = new CrawlerDetect();
$isCrawler = $crawlerDetect->isCrawler();
// Reduce the raw user agent to the part that is left after removing known browser/platform tokens.
$cleaned = blockbot_remove_known_parts($_SERVER['HTTP_USER_AGENT']);
// From here on the log context also carries the crawler verdict and the cleaned agent.
$logdata = ['isCrawler' => $isCrawler, 'cleaned' => $cleaned, 'agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']];
// Nothing left after cleaning and not flagged by CrawlerDetect.
if (empty($cleaned) && !$isCrawler) {
// Check against the list of known unwanted crawlers.
if (blockbot_is_bot($_SERVER['HTTP_USER_AGENT'])) {
// Each of the following categories is paired with a config switch of the same topic;
// presumably the switch decides whether that category is let through - TODO confirm against the full file.
if (blockbot_is_socialmedia_agents($cleaned, $_SERVER['HTTP_USER_AGENT'])) {
if (!DI::config()->get('blockbot', 'socialmedia_agents')) {
if (blockbot_is_library($cleaned, $_SERVER['HTTP_USER_AGENT'])) {
if (!DI::config()->get('blockbot', 'http_libraries')) {
if (blockbot_is_good_agent($cleaned, $_SERVER['HTTP_USER_AGENT'])) {
if (!DI::config()->get('blockbot', 'good_crawlers')) {
// This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
if (!DI::config()->get('blockbot', 'training')) {
if (blockbot_is_wanted_agent($cleaned, $_SERVER['HTTP_USER_AGENT'])) {
if (!$isCrawler) {
// A non-empty cleaned agent is logged and stored in the "legacy-good" database.
if (!empty($cleaned)) {
logger::debug('Legacy good user agent detected', $logdata);
blockbot_save('legacy-good', $_SERVER['HTTP_USER_AGENT'], $cleaned);
} elseif (empty($cleaned)) {
// The agent is logged as blocked and stored in the "blocked-bot" database.
logger::notice('Blocked bot', $logdata);
blockbot_save('blocked-bot', $_SERVER['HTTP_USER_AGENT'], $cleaned);
/**
 * Remembers a user agent in a DBA database inside the system temp path.
 *
 * The raw user agent is the key and the cleaned agent is the value. An entry
 * that exists already is left untouched. Nothing happens when the DBA
 * functions are unavailable or the database cannot be opened.
 *
 * @param string $database  File name of the database below the temp path
 * @param string $userAgent Raw user agent, used as key
 * @param string $cleaned   Cleaned user agent, stored as value
 * @return void
 */
function blockbot_save($database, $userAgent, $cleaned)
{
	if (!function_exists('dba_open')) {
		return;
	}

	$resource = dba_open(System::getTempPath() . '/' . $database, 'cl');
	if ($resource === false) {
		// Without a valid handle dba_fetch() and dba_insert() cannot be used.
		return;
	}

	try {
		if (dba_fetch($userAgent, $resource) === false) {
			dba_insert($userAgent, $cleaned, $resource);
		}
	} finally {
		// Always release the handle (and the lock requested by the "l" mode flag).
		dba_close($resource);
	}
}
/**
 * Strips known browser, platform and version tokens from a user agent string
 * and returns what is left, trimmed.
 *
 * The steps run in a fixed order and each regex step uses "=" as delimiter with
 * the case-insensitive flag. Most steps are repeated until the string no longer
 * changes.
 *
 * NOTE(review): in this view the closing lines of the arrays and loops are not
 * visible, so elements may be missing from the lists shown here.
 *
 * @param string $agent Raw user agent
 * @return string Remaining part of the user agent
 */
function blockbot_remove_known_parts($agent)
// Collapse a doubled closing parenthesis into a single one.
$agent = str_replace('))', ')', $agent);
// Step 1: replace complete, known user agent formats with a short fixed name (pattern => replacement).
$patterns = [
"Friendica '.*' \S+; https?://\S+" => 'Friendica',
'FriendicaDirectory/\S+ GuzzleHttp/\S+ curl/\S+ PHP/\S+' => 'FriendicaDirectory',
'.*\(golang net/http; Activity-Relay \S+; \S+\)' => 'Activity-Relay',
'Nextcloud Social \S+' => 'Nextcloud Social',
'\S+ - Mobilizon \S+' => 'Mobilizon',
'\S+ - Mobilizon' => 'Mobilizon',
'GNU social/\S+ \(.*\)' => 'GNU social',
'MastodonAndroid/\S+' => 'MastodonAndroid',
'Mozilla/5.0 \(compatible; zot\)' => 'zot',
'Camo Asset Proxy \S+' => 'Camo Asset Proxy',
'AodeRelay \(ap-relay/\S+; \+https?://\S+\)' => 'AodeRelay',
'Wget/\S+ \S+ \(Red Hat modified\)' => 'Wget',
'PixelFedBot/\S+ \(Pixelfed/\S+; \+https?://\S+\)' => 'Pixelfed',
'\(Pixelfed/\S+; \+https?://\S+\)' => 'Pixelfed',
'Boost/\S+ \(Linux;Android \S+\)' => 'Boost',
'Apache-HttpClient/\S+ \(Java/\S+\)' => 'Apache-HttpClient',
'python/federation/\S+' => 'federation',
'undefined/undefined \(\+\)' => '',
'Poduptime/Production from https?://\S+' => 'Poduptime',
'GNUsocialBot \S+ - https?://\S+' => 'GNUsocialBot'
$oldagent = $agent;
foreach ($patterns as $pattern => $replacement) {
$agent = preg_replace('=' . $pattern . '=i', $replacement, $agent);
// Step 2: remove the Dalvik (Android runtime) token including its parenthesised device details.
$patterns = [
'Dalvik/\S+ \(Linux; U; Android \S+; [^;]* Build/\S+\)',
do {
$oldagent = $agent;
foreach ($patterns as $pattern) {
$agent = preg_replace('=' . $pattern . '=i', '', $agent);
} while ($agent != $oldagent);
// Step 3: remove leading product tokens that are followed by a parenthesised platform section.
$patterns = ['Mozilla\S+ Slackware\S+ \([^)]*\)', 'Mozilla/\S+ \([^)]*\)', 'Opera/\S+ \([^)]*\)', 'iTunes/\S+ \([^)]*\)',
'Links \([^)]*\)',
do {
$oldagent = $agent;
foreach ($patterns as $pattern) {
$agent = preg_replace('=' . $pattern . '=i', '', $agent);
} while ($agent != $oldagent);
// Search patterns of known agents
$patterns = ['KHTML/\S+ \(like Gecko\)', '\(KHTML, like Gecko\)',
'\(KHTML like Gecko\)', 'like iPhone OS \S+ Mac OS X',
'Gecko/\S+', 'Presto/\S+', 'SamsungBrowser/\S+',
'KHTML/\S+ \(like Gecko\)', 'Whale/\S+', 'YaBrowser/\S+', 'Yowser/\S+',
'Mobile/\S+', 'CriOS/\S+', 'SA/\S+', 'honksnonk/\S+; \S+',
'libwww-FM/\S+', 'SSL-MM/\S+'
// Step 4: these tokens are only removed when a non-word character stands on both sides;
// the two surrounding characters are kept ($1$2), so a token at the very start or end of the string is not matched.
do {
$oldagent = $agent;
foreach ($patterns as $pattern) {
$agent = preg_replace('=(\W)' . $pattern . '(\W)=i', '$1$2', $agent);
} while ($agent != $oldagent);
// '(\S+)/\S+'
// Step 5: reduce "name/version (+url)" style tokens to the captured name (the single capture group of each pattern).
$patterns = ['python-httpx/\S+ \((\S+)/\S+; \+https?://\S+\)',
'http.rb/\S+ \(\S+/\S+ (\S+)/\S+; \+https?://\S+\) Bot',
'http.rb/\S+ \((\S+)/\S+; \+https?://\S+\) Bot',
'http.rb/\S+ \((\S+)/\S+; \+https?://\S+\)',
'http.rb/\S+ \((\S+)/\S+; https?://\S+\)',
'http.rb/\S+ \((\S+)/\S+\)',
'python-requests \((\S+)/\S+; \+https?://\S+\)',
'python-httpx/\S+ \(\S+; (\S+)/\S+; \+https?://\S+\)',
'(\S+) \(\S+; \+https?://\S+\)',
'(\S+)/\S+ CFNetwork/\S+ Darwin/\S+',
'(\S+)/\S+ Android/\S+ OkHttp/\S+',
'http.fetch \((\S+); \+https?://\S+ <\S+>; Timeline enrichment\); Bot',
'http.fetch \((\S+); \+https?://\S+ <\S+>; Timeline enrichment\)',
'(\S+) at \S+ \(\+https?://\S+\)',
'\S+ \(\+https?://\S+\) (\S+)/\S+ \S+', '\S+ \(\+https?://\S+\) (\S+)/\S+',
'(\S+)/\S* \(\+https?://\S+\)', '(\S+) \(\+https?://\S+\)',
'(\S+)/\S+ \(https?://\S+\)', '(\S+) \(https?://\S+\)', '(\S+)/\S* https?://\S+',
'(\S+) \S+; https?://\S+ <\S+>; Bot', '(\S+) \S+; https?://\S+ <\S+>',
'(\S+)/\S+; \+https?://\S+', '(\S+)/; \+https?://\S+', '(\S+)/[\d_.]+\S*'
]; // '(\S+)/[\d_.]+\S*'
do {
$oldagent = $agent;
foreach ($patterns as $pattern) {
// echo $agent . "\t" . $pattern . "\n";
$agent = preg_replace('=' . $pattern . '=i', '$1', $agent);
} while ($agent != $oldagent);
// 'Pre',
// Some more known parts that we can remove
$search = ['Waterfox', 'Minefield', 'Mobile Safari', 'Safari', 'Firefox',
'Version', 'AppleWebKit', 'Core', 'QQBrowser', 'OPR', 'EdgA', 'Edge',
'Edg', 'Maxthon', 'Iceweasel', 'Fennec', 'webOSBrowser', 'KHTML (like Gecko)',
'like Gecko', 'Quark', 'QtWebEngine', 'Brighteon', 'FxiOS', '(Unsupported)',
'Konqueror (WebEnginePart)', '(Edition Yx)', 'Kindle', 'Gecko',
'Observatory', 'SeaMonkey', 'Chromium', 'Mammoth', 'Puffin', 'Fedora',
'Debian', 'Amethyst', 'FirePHP', 'CherryPick', 'Avant Browser', 'Chrome',
'PieFed', 'AtContent', 'OpenSSL', 'Lynx', 'NokiaBrowser', '3gpp-gba',
'Kubuntu', 'Ubuntu', 'Synapse', 'UNTRUSTED', 'Vivaldi',
// Step 6: replace every listed word (case-insensitive, plain substring) with a space until nothing changes.
do {
$oldtext = $agent;
$agent = str_ireplace($search, ' ', $agent);
} while ($oldtext != $agent);
// Drop leftover separators at both ends, then any remaining surrounding whitespace.
$agent = trim($agent, ',./ ');
return trim($agent);
/**
 * Checks whether a user agent is on the list of wanted agents.
 *
 * The cleaned agent is first compared with the current list (exact match).
 * After that the legacy list is tried, first as exact match and then, together
 * with the current list, as case-insensitive substring match. Every legacy hit
 * is logged and stored via blockbot_save() so that the lists can be maintained.
 *
 * NOTE(review): both lists contain empty-string entries, so an empty cleaned
 * agent counts as a direct match - confirm that this is intended.
 *
 * @param string $cleaned User agent after blockbot_remove_known_parts()
 * @param string $agent   Raw user agent, only used as key when storing legacy hits
 * @return bool
 */
function blockbot_is_wanted_agent(string $cleaned, string $agent): bool
{
	$agents = [
		'Friendica', 'AodeRelay', 'Pixelfed', 'zot', 'Mobilizon', 'federation',
		'Mastodon', 'Akkoma', 'Pleroma', 'Misskey', 'Firefish', 'WordPress',
		'Iceshrimp', 'PeerTube', 'FoundKey', 'Calckey', 'gotosocial', 'Wget',
		'microblogpub', 'Takahe', 'DiasporaFederation', 'MbinBot', 'snac', 'funkwhale',
		'kbinBot', 'Mbin', 'lotide', 'lemmy-stats-crawler', 'Lemmy', 'MastodonInstances',
		'lemmy-explorer-crawler', 'MisskeyMediaProxy', 'FedditLemmyverseCrawler',
		'facebookexternalua', 'ActivityRelay', 'Pachli', 'go-camo', 'OpenGraphReader',
		'buzzrelay', 'TootDeck-Worker', 'diaspora-connection-tester', 'Fedilab',
		'ActivityPub', 'camo-rs asset proxy', 'Fedibird', 'FediFetcher', 'FediList Agent',
		'CSSCheck', 'Tiny Tiny RSS', 'Tusky', 'FediDB', 'Plume', "fediverse's stats",
		'', 'MastodonAndroid', 'Activity-Relay', 'GNU social', '',
	];

	if (in_array($cleaned, $agents, true)) {
		return true;
	}

	$legacy_agents = [
		'curl', 'zgrab', 'Go-http-client', 'curb', '', 'reqwest', 'Feedly/',
		'Python-urllib/', 'Liferea/', 'aiohttp/', ' Reader', 'hackney/',
		'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
		'WordPress/', 'http.rb/', 'Apache-HttpClient/', ';', 'Pleroma',
		'Dispatch/', 'Ruby', 'Java/', 'libwww-perl/', 'Mastodon/', 'FeedlyApp/',
		'lua-resty-http/', 'Tiny Tiny RSS/', 'Wget/', 'PostmanRuntime/',
		'W3C_Validator/', 'NetNewsWire', 'FeedValidator/', '', 'axios/',
		'Paw/', 'PeerTube/', '', 'FediDB/', ' crawler',
		'Slackbot-LinkExpanding', 'Firefish/', 'Takahe/', 'Akkoma ', 'Misskey/', 'Lynx/',
		'camo-rs asset proxy', 'gotosocial/', 'incestoma ', 'SpaceCowboys Android RSS Reader',
		'NewsBlur Feed Finder', 'Lemmy/', 'enby-town/', 'rss2tg bot;', '; HTTrack ',
		'MbinBot', 'kbinBot', 'Pixelfed/', 'NewsBlur Feed Fetcher', 'NewsBlur Page Fetcher',
		'facebookexternalua', 'FreshRSS/', 'BookWyrm/', 'Reeder/', 'microblogpub/',
	];

	if (in_array($cleaned, $legacy_agents, true)) {
		Logger::debug('Legacy wanted direct match', ['agent' => $cleaned]);
		blockbot_save('legacy-wanted-direct', $agent, $cleaned);
		return true;
	}

	// Last resort: substring match of any list entry inside the cleaned agent.
	$match = blockbot_match(array_merge($agents, $legacy_agents), $cleaned);
	if ($match) {
		Logger::debug('Legacy wanted partial match', ['agent' => $cleaned]);
		blockbot_save('legacy-wanted-partial', $agent, $cleaned);
	}

	return $match;
}
/**
 * Checks whether a user agent is on the list of "good" crawlers.
 *
 * The cleaned agent is first compared with the current list (exact match).
 * After that the legacy list is tried, first as exact match and then, together
 * with the current list, as case-insensitive substring match. Every legacy hit
 * is logged and stored via blockbot_save().
 *
 * NOTE(review): the current list contains an empty-string entry, so an empty
 * cleaned agent counts as a direct match - confirm that this is intended.
 *
 * @param string $cleaned User agent after blockbot_remove_known_parts()
 * @param string $agent   Raw user agent, only used as key when storing legacy hits
 * @return bool
 */
function blockbot_is_good_agent(string $cleaned, string $agent): bool
{
	$agents = [
		'Zabbix', '', 'node', 'FeedBurner',
		'lemmy-stats-crawler', 'Poduptime', 'GNUsocialBot',
	];

	if (in_array($cleaned, $agents, true)) {
		return true;
	}

	$legacy_agents = [
		' crawler', ' crawler', 'Active_Pods_CheckBot_3.0',
		'Social-Relay/', 'Test Certificate Info', 'Uptimebot/', 'UptimeRobot/', 'PTST/', 'FediFetcher',
		'FedditLemmyverseCrawler/', 'lemmy-explorer-crawler/', 'URIports Validator',
		' web bot;', 'fedistatsCrawler/', 'W3C_CSS_Validator_JFouffa/',
		'IABot/', 'Slackbot 1', 'BeeperBot/', 'Matrix-Media-Repo/', 'P3P Validator',
		'KeybaseBot;', 'Observatory/', 'CSSCheck/', ' feed bot;',
	];

	if (in_array($cleaned, $legacy_agents, true)) {
		Logger::debug('Legacy good direct match', ['agent' => $cleaned]);
		blockbot_save('legacy-good-direct', $agent, $cleaned);
		return true;
	}

	// Last resort: substring match of any list entry inside the cleaned agent.
	$match = blockbot_match(array_merge($agents, $legacy_agents), $cleaned);
	if ($match) {
		Logger::debug('Legacy good partial match', ['agent' => $cleaned]);
		blockbot_save('legacy-good-partial', $agent, $cleaned);
	}

	return $match;
}
/**
 * Checks whether a user agent is on the list of social media agents.
 *
 * The cleaned agent is first compared with the current list (exact match).
 * After that the legacy list is tried, first as exact match and then, together
 * with the current list, as case-insensitive substring match. Every legacy hit
 * is logged and stored via blockbot_save().
 *
 * @param string $cleaned User agent after blockbot_remove_known_parts()
 * @param string $agent   Raw user agent, only used as key when storing legacy hits
 * @return bool
 */
function blockbot_is_socialmedia_agents(string $cleaned, string $agent): bool
{
	$agents = [
		'TelegramBot (like TwitterBot)', 'Twitterbot', 'Slack-ImgProxy', 'Slackbot-LinkExpanding',
		'WhatsApp', 'facebookexternalhit', 'SkypeUriPreview Preview', 'SummalyBot', 'Iframely',
		'Tumblr', 'Summalybot',
	];

	if (in_array($cleaned, $agents, true)) {
		return true;
	}

	$legacy_agents = [
		'Twitterbot', 'facebookexternalhit/', 'SkypeUriPreview Preview/',
		'TelegramBot', 'WhatsApp/', 'github-camo', 'Bluesky Cardyb/', 'XING-contenttabreceiver/',
		'LinkedInBot/', 'Instagram ', 'Synapse (bot; ', 'Discordbot/', 'SummalyBot/',
		'Slack-ImgProxy', 'Iframely/',
	];

	if (in_array($cleaned, $legacy_agents, true)) {
		Logger::debug('Legacy social media direct match', ['agent' => $cleaned]);
		blockbot_save('legacy-social-media-direct', $agent, $cleaned);
		return true;
	}

	// Last resort: substring match of any list entry inside the cleaned agent.
	$match = blockbot_match(array_merge($agents, $legacy_agents), $cleaned);
	if ($match) {
		Logger::debug('Legacy social media partial match', ['agent' => $cleaned]);
		blockbot_save('legacy-social-media-partial', $agent, $cleaned);
	}

	return $match;
}
/**
 * Checks whether a user agent is on the list of HTTP libraries.
 *
 * The cleaned agent is first compared with the current list (exact match).
 * After that the legacy list is tried, first as exact match and then, together
 * with the current list, as case-insensitive substring match. Every legacy hit
 * is logged and stored via blockbot_save().
 *
 * Legacy hits use their own log text and database names ("library") so that
 * they are not mixed up with the hits recorded by blockbot_is_good_agent().
 *
 * @param string $cleaned User agent after blockbot_remove_known_parts()
 * @param string $agent   Raw user agent, only used as key when storing legacy hits
 * @return bool
 */
function blockbot_is_library(string $cleaned, string $agent): bool
{
	$agents = [
		'curl', 'Ruby', 'Go-http-client', 'python-httpx', 'undici',
		'Python-urllib', 'okhttp', 'python-requests', 'python-asks', 'caveman-sieve',
		'ReactorNetty', 'GuzzleHttp', 'Embed PHP library', 'python-urllib3',
		'EventMachine HttpClient', 'HTMLParser', 'node-fetch', 'fasthttp',
		'Fuzz Faster U Fool', 'gvfs', 'Embarcadero URI Client', 'grub-client',
		'Deno', 'mint', 'axios', 'cutycapt', 'Java', 'Apache-HttpClient',
	];

	if (in_array($cleaned, $agents, true)) {
		return true;
	}

	$legacy_agents = [
		'ReactorNetty/', 'GuzzleHttp/', 'Embed PHP library', 'python-urllib3/',
		'EventMachine HttpClient', 'HTMLParser/', 'node-fetch', 'fasthttp', 'python-httpx/',
		'Fuzz Faster U Fool', 'gvfs/', 'Embarcadero URI Client/', 'grub-client',
	];

	if (in_array($cleaned, $legacy_agents, true)) {
		Logger::debug('Legacy library direct match', ['agent' => $cleaned]);
		blockbot_save('legacy-library-direct', $agent, $cleaned);
		return true;
	}

	// Last resort: substring match of any list entry inside the cleaned agent.
	$match = blockbot_match(array_merge($agents, $legacy_agents), $cleaned);
	if ($match) {
		Logger::debug('Legacy library partial match', ['agent' => $cleaned]);
		blockbot_save('legacy-library-partial', $agent, $cleaned);
	}

	return $match;
}
/**
 * Checks a raw user agent against the list of known unwanted crawlers.
 *
 * The visible final statement returns the result of a case-insensitive
 * substring match of $agent against the collected $agents list.
 *
 * NOTE(review): in this view the block also shows statements that extend the
 * list depending on configuration, throw a ForbiddenException and use $logdata,
 * which is not defined in the visible lines. Block delimiters and several
 * branch bodies are not visible, so the comments below describe single
 * statements only.
 *
 * @param string $agent Raw user agent
 * @return bool
 */
function blockbot_is_bot(string $agent): bool
// List of known unwanted crawlers.
$agents = [
'SemrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/',
'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider', 'datagnionbot',
'', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/',
'NetcraftSurveyAgent/', '', 'SMTBot/', 'Nimbostratus-Bot/',
'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
'DuckDuckGo/', 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/',
'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/',
'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot',
'ImagesiftBot;', 'webtech/', 'Bloglines/', 'Netcraft Web Server Survey', 'Spawning-AI',
'NLUX_IAHarvester/', '', 'RSSingBot', 'Chrome-Lighthouse',
't3versionsBot/', '', 'intelx.io_bot', 'Google-Read-Aloud',
'bot Mozilla',
'bot Mozilla', 'rayven/',
// Gab clients are only added to the list when the "block_gab" setting is enabled.
if (DI::config()->get('blockbot', 'block_gab')) {
$agents[] = 'GabSocial/';
// List of "good" crawlers, mostly from the fediverse.
$good_agents = [
' crawler', ' crawler', 'Active_Pods_CheckBot_3.0',
'Social-Relay/', 'Test Certificate Info', 'Uptimebot/', 'GNUSocialBot', 'UptimeRobot/',
'PTST/', 'Zabbix', 'Poduptime/', 'FediFetcher', 'lemmy-stats-crawler',
'FedditLemmyverseCrawler/', 'lemmy-explorer-crawler/', 'URIports Validator',
' web bot;', 'fedistatsCrawler/', 'W3C_CSS_Validator_JFouffa/',
'IABot/', 'Slackbot 1', 'BeeperBot/', 'Matrix-Media-Repo/', 'P3P Validator',
'KeybaseBot;', 'Observatory/', 'CSSCheck/', 'FeedBurner/', ' feed bot;'
// With the "good_crawlers" setting disabled these agents are added to the list that is matched below.
if (!DI::config()->get('blockbot', 'good_crawlers')) {
$agents = array_merge($agents, $good_agents);
} elseif (blockbot_match($good_agents)) {
// List of agents from social media systems that fetch preview data via Open Graph or Twitter cards.
$socialmedia_agents = ['Twitterbot', 'facebookexternalhit/', 'SkypeUriPreview Preview/',
'TelegramBot', 'WhatsApp/', 'github-camo', 'Bluesky Cardyb/', 'XING-contenttabreceiver/',
'LinkedInBot/', 'Instagram ', 'Synapse (bot; ', 'Discordbot/', 'SummalyBot/',
'Slackbot-LinkExpanding', 'Slack-ImgProxy', 'Iframely/',
// With the "socialmedia_agents" setting disabled these agents are added to the list that is matched below.
if (!DI::config()->get('blockbot', 'socialmedia_agents')) {
$agents = array_merge($agents, $socialmedia_agents);
} elseif (blockbot_match($socialmedia_agents)) {
// HTTP Libraries
$http_libraries = ['ReactorNetty/', 'GuzzleHttp/', 'Embed PHP library', 'python-urllib3/',
'EventMachine HttpClient', 'HTMLParser/', 'node-fetch', 'fasthttp', 'python-httpx/',
'Fuzz Faster U Fool', 'gvfs/', 'Embarcadero URI Client/', 'grub-client'
// With the "http_libraries" setting disabled these agents are added to the list that is matched below.
if (!DI::config()->get('blockbot', 'http_libraries')) {
$agents = array_merge($agents, $http_libraries);
} elseif (blockbot_match($http_libraries)) {
// A hit in the collected list ends the request with a "forbidden" exception.
if (blockbot_match($agents)) {
throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at');
// This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
if (!DI::config()->get('blockbot', 'training')) {
$crawlerDetect = new CrawlerDetect();
if (!$crawlerDetect->isCrawler()) {
logger::debug('Good user agent detected', $logdata);
// List of known "good" agents, mostly used by Fediverse systems, feed readers, ...
$agents = [
'curl', 'zgrab', 'Go-http-client', 'curb', '', 'reqwest', 'Feedly/',
'Python-urllib/', 'Liferea/', 'aiohttp/', ' Reader', 'hackney/',
'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
'WordPress/', 'http.rb/', 'Apache-HttpClient/', ';', 'Pleroma',
'Dispatch/', 'Ruby', 'Java/', 'libwww-perl/', 'Mastodon/', 'FeedlyApp/',
'lua-resty-http/', 'Tiny Tiny RSS/', 'Wget/', 'PostmanRuntime/',
'W3C_Validator/', 'NetNewsWire', 'FeedValidator/', '', 'axios/',
'Paw/', 'PeerTube/', '', 'FediDB/', ' crawler',
'Slackbot-LinkExpanding', 'Firefish/', 'Takahe/', 'Akkoma ', 'Misskey/', 'Lynx/',
'camo-rs asset proxy', 'gotosocial/', 'incestoma ', 'SpaceCowboys Android RSS Reader',
'NewsBlur Feed Finder', 'Lemmy/', 'enby-town/', 'rss2tg bot;', '; HTTrack ',
'MbinBot', 'kbinBot', 'Pixelfed/', 'NewsBlur Feed Fetcher', 'NewsBlur Page Fetcher',
'facebookexternalua', 'FreshRSS/', 'BookWyrm/', 'Reeder/', 'microblogpub/',
// An agent flagged by CrawlerDetect that is on the list above is logged as a false positive.
if (blockbot_match($agents)) {
logger::info('False positive', $logdata);
// NOTE(review): this call passes two arguments - check it against the blockbot_save() signature that is in effect.
blockbot_save('blocked-bot', $_SERVER['HTTP_USER_AGENT']);
logger::notice('Blocked bot', $logdata);
throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at');
// Case-insensitive substring match of the given agent against the list.
return blockbot_match($agents, $agent);
/**
 * Counts how often a user agent was seen, using a DBA database in the system temp path.
 *
 * A new key is stored with the value 1, an existing key gets its stored value incremented.
 *
 * NOTE(review): a function with the same name and a third parameter is declared
 * earlier in this file; both declarations cannot coexist - presumably this is the
 * superseded variant, confirm before keeping it. The closing lines of this block
 * are not visible in this view.
 *
 * @param string $database  File name of the database below the temp path
 * @param string $userAgent Raw user agent, used as key
 */
function blockbot_save($database, $userAgent)
// Guard for installations without the DBA functions (branch body not visible here).
if (!function_exists('dba_open')) {
// Mode "cl": the "c" and "l" flags are passed to dba_open() for this file.
$ressource = dba_open(System::getTempPath() . '/' . $database, 'cl');
$result = dba_fetch($userAgent, $ressource);
if ($result === false) {
// First sighting of this user agent.
dba_insert($userAgent, 1, $ressource);
} else {
// Seen before: store the incremented counter.
dba_replace($userAgent, ++$result, $ressource);
/**
 * Checks whether one of the given agent fragments occurs in a user agent string.
 *
 * The comparison is a case-insensitive substring search. Empty fragments are
 * skipped: an empty needle is contained in every string, so it would turn the
 * whole list into a match-all.
 *
 * @param array       $agents        List of agent fragments to look for
 * @param string|null $request_agent User agent to search in. When omitted, the
 *                                   user agent header of the current request is used.
 * @return bool true when at least one fragment is found
 */
function blockbot_match(array $agents, ?string $request_agent = null): bool
{
	$haystack = $request_agent ?? ($_SERVER['HTTP_USER_AGENT'] ?? '');
	if ($haystack === '') {
		return false;
	}

	foreach ($agents as $agent) {
		$needle = (string) $agent;
		if ($needle === '') {
			continue;
		}

		// Compare the position with false: a truthiness test on the matched tail would miss a tail of "0".
		if (stripos($haystack, $needle) !== false) {
			return true;
		}
	}

	return false;
}