Blockbot: reworked user agent parser

This commit is contained in:
Michael 2024-04-22 01:38:08 +00:00
parent 741c142a05
commit 2c064a559e
1 changed file with 348 additions and 107 deletions

View File

@ -2,7 +2,7 @@
/**
* Name: blockbot
* Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
* Version: 0.2
* Version: 1.0
* Author: Philipp Holzer <admin@philipp.info>
* Author: Michael Vogel <https://pirati.ca/profile/heluecht>
*
@ -15,6 +15,7 @@ use Friendica\Core\Logger;
use Friendica\Core\Renderer;
use Friendica\Core\System;
use Friendica\Network\HTTPException\ForbiddenException;
use phpseclib3\System\SSH\Agent;
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
@ -46,14 +47,354 @@ function blockbot_addon_admin_post()
DI::config()->set('blockbot', 'training', $_POST['training'] ?? false);
}
/**
 * Aborts the current request by raising a "403 Forbidden" HTTP exception.
 *
 * @throws ForbiddenException Always thrown.
 */
function blockbot_reject()
{
    $message = 'Bots are not allowed. If you consider this a mistake, create an issue at https://github.com/friendica/friendica';

    throw new ForbiddenException($message);
}
/**
 * Inspects the user agent of the current request and rejects unwanted bots.
 *
 * The agent is first checked against the list of known unwanted crawlers,
 * then against the social media, HTTP library and "good crawler" lists. A hit
 * in one of the latter three is only rejected when the matching `blockbot`
 * config switch is disabled. Everything after that only runs in training mode.
 *
 * Requests without a user agent are never blocked here.
 *
 * @throws ForbiddenException Raised through blockbot_reject() when the request is blocked.
 */
function blockbot_init_1()
{
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return;
    }

    $agent = $_SERVER['HTTP_USER_AGENT'];

    $crawlerDetect = new CrawlerDetect();
    $isCrawler     = $crawlerDetect->isCrawler();
    $cleaned       = blockbot_remove_known_parts($agent);

    // REQUEST_URI is not guaranteed to be set by every SAPI, so fall back to an empty string.
    $logdata = ['isCrawler' => $isCrawler, 'cleaned' => $cleaned, 'agent' => $agent, 'uri' => $_SERVER['REQUEST_URI'] ?? ''];

    // Nothing is left after removing the known parts and the detector does not object either.
    if (empty($cleaned) && !$isCrawler) {
        return;
    }

    if (blockbot_is_bot($agent)) {
        blockbot_reject();
    }

    if (blockbot_is_socialmedia_agents($cleaned, $agent)) {
        if (!DI::config()->get('blockbot', 'socialmedia_agents')) {
            blockbot_reject();
        }
        return;
    }

    if (blockbot_is_library($cleaned, $agent)) {
        if (!DI::config()->get('blockbot', 'http_libraries')) {
            blockbot_reject();
        }
        return;
    }

    if (blockbot_is_good_agent($cleaned, $agent)) {
        if (!DI::config()->get('blockbot', 'good_crawlers')) {
            blockbot_reject();
        }
        return;
    }

    // This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
    if (!DI::config()->get('blockbot', 'training')) {
        return;
    }

    if (blockbot_is_wanted_agent($cleaned, $agent)) {
        return;
    }

    if (!$isCrawler) {
        // Not flagged by the detector: only record agents that still carry unknown parts.
        if (!empty($cleaned)) {
            Logger::debug('Legacy good user agent detected', $logdata);
            blockbot_save('legacy-good', $agent, $cleaned);
        }
        return;
    }

    // Flagged by the detector, but nothing unknown is left in the agent string.
    if (empty($cleaned)) {
        return;
    }

    Logger::notice('Blocked bot', $logdata);
    blockbot_save('blocked-bot', $agent, $cleaned);
    blockbot_reject();
}
/**
 * Stores a user agent together with its cleaned form in a DBA key/value file.
 *
 * The file lives in the system temp path. Nothing happens when the DBA
 * extension is missing or the file cannot be opened. An agent that is already
 * stored keeps its existing value.
 *
 * @param string $database  File name of the database inside the temp path
 * @param string $userAgent Raw user agent, used as the key
 * @param string $cleaned   Cleaned user agent, stored as the value
 */
function blockbot_save($database, $userAgent, $cleaned)
{
    if (!function_exists('dba_open')) {
        return;
    }

    $resource = dba_open(System::getTempPath() . '/' . $database, 'cl');
    if ($resource === false) {
        // dba_open() signals failure with false; passing that on to dba_fetch() would raise a TypeError on PHP 8.
        Logger::debug('Unable to open blockbot database', ['database' => $database]);
        return;
    }

    // dba_fetch() returns false when the key does not exist yet.
    if (dba_fetch($userAgent, $resource) === false) {
        dba_insert($userAgent, $cleaned, $resource);
    }

    dba_close($resource);
}
/**
 * Removes all parts of a user agent string that belong to known browsers,
 * platforms and federation software and returns what is left.
 *
 * The steps run in a fixed order:
 *  1. complete signatures of known software are collapsed to a short name,
 *  2. platform prefixes (Dalvik, Mozilla, Opera, ...) are dropped,
 *  3. known browser engine tokens are dropped,
 *  4. "name/version (+url)" style constructs are reduced to the bare name,
 *  5. remaining well-known product names are blanked out.
 *
 * @param string $agent Raw user agent string
 * @return string Remainder after trimming; empty when only known parts were present
 */
function blockbot_remove_known_parts($agent): string
{
    // Applies every pattern over and over until one complete pass changes nothing.
    // preg_replace() returns null on a PCRE error; in that case the previous value is kept.
    $replaceUntilStable = static function (array $patterns, string $replacement, string $subject, string $prefix = '', string $suffix = ''): string {
        do {
            $previous = $subject;
            foreach ($patterns as $pattern) {
                $subject = preg_replace('=' . $prefix . $pattern . $suffix . '=i', $replacement, $subject) ?? $subject;
            }
            // Strict comparison: "!=" would compare two numeric strings by value.
        } while ($subject !== $previous);

        return $subject;
    };

    $agent = str_replace('))', ')', (string)$agent);

    // Complete signatures of known software, collapsed to a short name (single pass).
    $patterns = [
        "Friendica '.*' \S+; https?://\S+" => 'Friendica',
        'FriendicaDirectory/\S+ GuzzleHttp/\S+ curl/\S+ PHP/\S+' => 'FriendicaDirectory',
        '.*\(golang net/http; Activity-Relay \S+; \S+\)' => 'Activity-Relay',
        'Nextcloud Social \S+' => 'Nextcloud Social',
        '\S+ - Mobilizon \S+' => 'Mobilizon',
        '\S+ - Mobilizon' => 'Mobilizon',
        'GNU social/\S+ \(.*\)' => 'GNU social',
        'MastodonAndroid/\S+' => 'MastodonAndroid',
        'Mozilla/5.0 \(compatible; zot\)' => 'zot',
        'Camo Asset Proxy \S+' => 'Camo Asset Proxy',
        'AodeRelay \(ap-relay/\S+; \+https?://\S+\)' => 'AodeRelay',
        'Wget/\S+ \S+ \(Red Hat modified\)' => 'Wget',
        'PixelFedBot/\S+ \(Pixelfed/\S+; \+https?://\S+\)' => 'Pixelfed',
        '\(Pixelfed/\S+; \+https?://\S+\)' => 'Pixelfed',
        'Boost/\S+ \(Linux;Android \S+\)' => 'Boost',
        'Apache-HttpClient/\S+ \(Java/\S+\)' => 'Apache-HttpClient',
        'python/federation/\S+' => 'federation',
        'undefined/undefined \(\+http://a.gup.pe\)' => 'gup.pe',
        'Poduptime/Production from https?://\S+' => 'Poduptime',
        'GNUsocialBot \S+ - https?://\S+' => 'GNUsocialBot'
    ];

    foreach ($patterns as $pattern => $replacement) {
        $agent = preg_replace('=' . $pattern . '=i', $replacement, $agent) ?? $agent;
    }

    // Android runtime prefix
    $patterns = [
        'Dalvik/\S+ \(Linux; U; Android \S+; [^;]* Build/\S+\)',
    ];

    $agent = $replaceUntilStable($patterns, '', $agent);

    // Browser prefixes followed by their platform details in parentheses
    $patterns = ['Mozilla\S+ Slackware\S+ \([^)]*\)', 'Mozilla/\S+ \([^)]*\)', 'Opera/\S+ \([^)]*\)', 'iTunes/\S+ \([^)]*\)',
        'Links \([^)]*\)',
    ];

    $agent = $replaceUntilStable($patterns, '', $agent);

    // Search patterns of known agents. They only match when surrounded by non-word characters, which are kept.
    $patterns = ['KHTML/\S+ \(like Gecko\)', '\(KHTML, like Gecko\)',
        '\(KHTML like Gecko\)', 'like iPhone OS \S+ Mac OS X',
        'Gecko/\S+', 'Presto/\S+', 'SamsungBrowser/\S+',
        'KHTML/\S+ \(like Gecko\)', 'Whale/\S+', 'YaBrowser/\S+', 'Yowser/\S+',
        'Mobile/\S+', 'CriOS/\S+', 'SA/\S+', 'honksnonk/\S+; \S+',
        'libwww-FM/\S+', 'SSL-MM/\S+'
    ];

    $agent = $replaceUntilStable($patterns, '$1$2', $agent, '(\W)', '(\W)');

    // Constructs that carry a name plus version and/or URL; only the captured name is kept.
    $patterns = ['python-httpx/\S+ \((\S+)/\S+; \+https?://\S+\)',
        'http.rb/\S+ \(\S+/\S+ (\S+)/\S+; \+https?://\S+\) Bot',
        'http.rb/\S+ \((\S+)/\S+; \+https?://\S+\) Bot',
        'http.rb/\S+ \((\S+)/\S+; \+https?://\S+\)',
        'http.rb/\S+ \((\S+)/\S+; https?://\S+\)',
        'http.rb/\S+ \((\S+)/\S+\)',
        'python-requests \((\S+)/\S+; \+https?://\S+\)',
        'python-httpx/\S+ \(\S+; (\S+)/\S+; \+https?://\S+\)',
        '(\S+) \(\S+; \+https?://\S+\)',
        '(\S+)/\S+ CFNetwork/\S+ Darwin/\S+',
        '(\S+)/\S+ Android/\S+ OkHttp/\S+',
        'http.fetch \((\S+); \+https?://\S+ <\S+>; Timeline enrichment\); Bot',
        'http.fetch \((\S+); \+https?://\S+ <\S+>; Timeline enrichment\)',
        '(\S+) at \S+ \(\+https?://\S+\)',
        '\S+ \(\+https?://\S+\) (\S+)/\S+ \S+', '\S+ \(\+https?://\S+\) (\S+)/\S+',
        '(\S+)/\S* \(\+https?://\S+\)', '(\S+) \(\+https?://\S+\)',
        '(\S+)/\S+ \(https?://\S+\)', '(\S+) \(https?://\S+\)', '(\S+)/\S* https?://\S+',
        '(\S+) \S+; https?://\S+ <\S+>; Bot', '(\S+) \S+; https?://\S+ <\S+>',
        '(\S+)/\S+; \+https?://\S+', '(\S+)/; \+https?://\S+', '(\S+)/[\d_.]+\S*'
    ];

    $agent = $replaceUntilStable($patterns, '$1', $agent);

    // Some more known parts that we can remove
    $search = ['Waterfox', 'Minefield', 'Mobile Safari', 'Safari', 'Firefox',
        'Version', 'AppleWebKit', 'Core', 'QQBrowser', 'OPR', 'EdgA', 'Edge',
        'Edg', 'Maxthon', 'Iceweasel', 'Fennec', 'webOSBrowser', 'KHTML (like Gecko)',
        'like Gecko', 'Quark', 'QtWebEngine', 'Brighteon', 'FxiOS', '(Unsupported)',
        'Konqueror (WebEnginePart)', '(Edition Yx)', 'Kindle', 'Gecko',
        'Observatory', 'SeaMonkey', 'Chromium', 'Mammoth', 'Puffin', 'Fedora',
        'Debian', 'Amethyst', 'FirePHP', 'CherryPick', 'Avant Browser', 'Chrome',
        'PieFed', 'AtContent', 'OpenSSL', 'Lynx', 'NokiaBrowser', '3gpp-gba',
        'Kubuntu', 'Ubuntu', 'Synapse', 'UNTRUSTED', 'Vivaldi',
    ];

    do {
        $oldtext = $agent;
        $agent   = str_ireplace($search, ' ', $agent);
    } while ($oldtext !== $agent);

    $agent = trim($agent, ',./ ');

    return trim($agent);
}
/**
 * Checks whether an agent belongs to the software that is always wanted.
 *
 * An exact hit in the current list returns true right away. An exact hit in
 * the legacy list, or a case-insensitive substring hit of any list entry in
 * the cleaned agent, is logged and saved before being reported.
 *
 * @param string $cleaned User agent after removal of the known parts
 * @param string $agent   Raw user agent, only used when saving a legacy hit
 * @return bool
 */
function blockbot_is_wanted_agent(string $cleaned, string $agent): bool
{
    $known = [
        'Friendica', 'AodeRelay', 'Pixelfed', 'zot', 'Mobilizon', 'federation',
        'Mastodon', 'Akkoma', 'Pleroma', 'Misskey', 'Firefish', 'WordPress',
        'Iceshrimp', 'PeerTube', 'FoundKey', 'Calckey', 'gotosocial', 'Wget',
        'microblogpub', 'Takahe', 'DiasporaFederation', 'MbinBot', 'snac', 'funkwhale',
        'kbinBot', 'Mbin', 'lotide', 'lemmy-stats-crawler', 'Lemmy', 'MastodonInstances',
        'lemmy-explorer-crawler', 'MisskeyMediaProxy', 'FedditLemmyverseCrawler',
        'facebookexternalua', 'ActivityRelay', 'Pachli', 'go-camo', 'OpenGraphReader',
        'buzzrelay', 'TootDeck-Worker', 'diaspora-connection-tester', 'Fedilab',
        'ActivityPub', 'camo-rs asset proxy', 'Fedibird', 'FediFetcher', 'FediList Agent',
        'CSSCheck', 'Tiny Tiny RSS', 'Tusky', 'FediDB', 'Plume', "fediverse's stats",
        'gup.pe', 'MastodonAndroid', 'Activity-Relay', 'GNU social', 'app.wafrn.net'
    ];

    $legacy = [
        'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/',
        'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/',
        'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
        'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma',
        'Dispatch/', 'Ruby', 'Java/', 'libwww-perl/', 'Mastodon/', 'FeedlyApp/',
        'lua-resty-http/', 'Tiny Tiny RSS/', 'Wget/', 'PostmanRuntime/',
        'W3C_Validator/', 'NetNewsWire', 'FeedValidator/', 'theoldreader.com', 'axios/',
        'Paw/', 'PeerTube/', 'fedi.inex.dev', 'FediDB/', 'index.community crawler',
        'Slackbot-LinkExpanding', 'Firefish/', 'Takahe/', 'Akkoma ', 'Misskey/', 'Lynx/',
        'camo-rs asset proxy', 'gotosocial/', 'incestoma ', 'SpaceCowboys Android RSS Reader',
        'NewsBlur Feed Finder', 'Lemmy/', 'enby-town/', 'rss2tg bot;', '; HTTrack ',
        'MbinBot', 'kbinBot', 'Pixelfed/', 'NewsBlur Feed Fetcher', 'NewsBlur Page Fetcher',
        'facebookexternalua', 'FreshRSS/', 'BookWyrm/', 'Reeder/', 'microblogpub/',
    ];

    // Exact hit in the current list: nothing to record.
    if (in_array($cleaned, $known)) {
        return true;
    }

    // Exact hit in the legacy list.
    if (in_array($cleaned, $legacy)) {
        Logger::debug('Legacy wanted direct match', ['agent' => $cleaned]);
        blockbot_save('legacy-wanted-direct', $agent, $cleaned);
        return true;
    }

    // Substring hit of any entry of both lists.
    $partial = blockbot_match(array_merge($known, $legacy), $cleaned);
    if (!$partial) {
        return $partial;
    }

    Logger::debug('Legacy wanted partial match', ['agent' => $cleaned]);
    blockbot_save('legacy-wanted-partial', $agent, $cleaned);

    return $partial;
}
/**
 * Checks whether an agent belongs to the list of "good" crawlers.
 *
 * An exact hit in the current list returns true right away. An exact hit in
 * the legacy list, or a case-insensitive substring hit of any list entry in
 * the cleaned agent, is logged and saved before being reported.
 *
 * @param string $cleaned User agent after removal of the known parts
 * @param string $agent   Raw user agent, only used when saving a legacy hit
 * @return bool
 */
function blockbot_is_good_agent(string $cleaned, string $agent): bool
{
    $known = [
        'Zabbix', 'fediblock.manalejandro.com', 'node', 'FeedBurner',
        'lemmy-stats-crawler', 'Poduptime', 'GNUsocialBot',
    ];

    $legacy = [
        'fediverse.space crawler', 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0',
        'Social-Relay/', 'Test Certificate Info', 'Uptimebot/', 'UptimeRobot/', 'PTST/', 'FediFetcher',
        'FedditLemmyverseCrawler/', 'lemmy-explorer-crawler/', 'URIports Validator',
        'rss-is-dead.lol web bot;', 'fedistatsCrawler/', 'W3C_CSS_Validator_JFouffa/',
        'IABot/', 'Slackbot 1', 'BeeperBot/', 'Matrix-Media-Repo/', 'P3P Validator',
        'KeybaseBot;', 'Observatory/', 'CSSCheck/', 'rss-is-dead.lol feed bot;'
    ];

    // Exact hit in the current list: nothing to record.
    if (in_array($cleaned, $known)) {
        return true;
    }

    // Exact hit in the legacy list.
    if (in_array($cleaned, $legacy)) {
        Logger::debug('Legacy good direct match', ['agent' => $cleaned]);
        blockbot_save('legacy-good-direct', $agent, $cleaned);
        return true;
    }

    // Substring hit of any entry of both lists.
    $partial = blockbot_match(array_merge($known, $legacy), $cleaned);
    if (!$partial) {
        return $partial;
    }

    Logger::debug('Legacy good partial match', ['agent' => $cleaned]);
    blockbot_save('legacy-good-partial', $agent, $cleaned);

    return $partial;
}
/**
 * Checks whether an agent belongs to the list of social media agents.
 *
 * An exact hit in the current list returns true right away. An exact hit in
 * the legacy list, or a case-insensitive substring hit of any list entry in
 * the cleaned agent, is logged and saved before being reported.
 *
 * @param string $cleaned User agent after removal of the known parts
 * @param string $agent   Raw user agent, only used when saving a legacy hit
 * @return bool
 */
function blockbot_is_socialmedia_agents(string $cleaned, string $agent): bool
{
    $known = [
        'TelegramBot (like TwitterBot)', 'Twitterbot', 'Slack-ImgProxy', 'Slackbot-LinkExpanding',
        'WhatsApp', 'facebookexternalhit', 'SkypeUriPreview Preview', 'SummalyBot', 'Iframely',
        'Tumblr', 'Summalybot'
    ];

    $legacy = [
        'Twitterbot', 'facebookexternalhit/', 'SkypeUriPreview Preview/',
        'TelegramBot', 'WhatsApp/', 'github-camo', 'Bluesky Cardyb/', 'XING-contenttabreceiver/',
        'LinkedInBot/', 'Instagram ', 'Synapse (bot; ', 'Discordbot/', 'SummalyBot/',
        'Slack-ImgProxy', 'Iframely/',
    ];

    // Exact hit in the current list: nothing to record.
    if (in_array($cleaned, $known)) {
        return true;
    }

    // Exact hit in the legacy list.
    if (in_array($cleaned, $legacy)) {
        Logger::debug('Legacy social media direct match', ['agent' => $cleaned]);
        blockbot_save('legacy-social-media-direct', $agent, $cleaned);
        return true;
    }

    // Substring hit of any entry of both lists.
    $partial = blockbot_match(array_merge($known, $legacy), $cleaned);
    if (!$partial) {
        return $partial;
    }

    Logger::debug('Legacy social media partial match', ['agent' => $cleaned]);
    blockbot_save('legacy-social-media-partial', $agent, $cleaned);

    return $partial;
}
/**
 * Checks whether an agent belongs to the list of HTTP libraries.
 *
 * An exact hit in the current list returns true right away. An exact hit in
 * the legacy list, or a case-insensitive substring hit of any list entry in
 * the cleaned agent, is logged and saved before being reported.
 *
 * Legacy hits get their own "library" log messages and databases, so they no
 * longer end up among the entries written by blockbot_is_good_agent().
 *
 * @param string $cleaned User agent after removal of the known parts
 * @param string $agent   Raw user agent, only used when saving a legacy hit
 * @return bool
 */
function blockbot_is_library(string $cleaned, string $agent): bool
{
    $agents = [
        'curl', 'Ruby', 'Go-http-client', 'python-httpx', 'undici',
        'Python-urllib', 'okhttp', 'python-requests', 'python-asks', 'caveman-sieve',
        'ReactorNetty', 'GuzzleHttp', 'Embed PHP library', 'python-urllib3',
        'EventMachine HttpClient', 'HTMLParser', 'node-fetch', 'fasthttp',
        'Fuzz Faster U Fool', 'gvfs', 'Embarcadero URI Client', 'grub-client',
        'Deno', 'mint', 'axios', 'cutycapt', 'Java', 'Apache-HttpClient',
        'Crystal',
    ];

    if (in_array($cleaned, $agents, true)) {
        return true;
    }

    $legacy_agents = [
        'ReactorNetty/', 'GuzzleHttp/', 'Embed PHP library', 'python-urllib3/',
        'EventMachine HttpClient', 'HTMLParser/', 'node-fetch', 'fasthttp', 'python-httpx/',
        'Fuzz Faster U Fool', 'gvfs/', 'Embarcadero URI Client/', 'grub-client'
    ];

    if (in_array($cleaned, $legacy_agents, true)) {
        Logger::debug('Legacy library direct match', ['agent' => $cleaned]);
        blockbot_save('legacy-library-direct', $agent, $cleaned);
        return true;
    }

    $match = blockbot_match(array_merge($agents, $legacy_agents), $cleaned);
    if ($match) {
        Logger::debug('Legacy library partial match', ['agent' => $cleaned]);
        blockbot_save('legacy-library-partial', $agent, $cleaned);
    }

    return $match;
}
function blockbot_is_bot(string $agent): bool
{
// List of known unwanted crawlers.
$agents = [
'SemrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/',
@ -65,7 +406,7 @@ function blockbot_init_1()
'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider', 'datagnionbot',
'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/',
'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/',
'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
'DuckDuckGo/', 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/',
'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/',
'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot',
@ -92,119 +433,19 @@ function blockbot_init_1()
'ImagesiftBot;', 'webtech/', 'Bloglines/', 'Netcraft Web Server Survey', 'Spawning-AI',
'NLUX_IAHarvester/', 'bots.retroverse.social', 'RSSingBot', 'Chrome-Lighthouse',
't3versionsBot/', 'scaninfo@paloaltonetworks.com', 'intelx.io_bot', 'Google-Read-Aloud',
'bot Mozilla',
'bot Mozilla', 'rayven/',
];
if (DI::config()->get('blockbot', 'block_gab')) {
$agents[] = 'GabSocial/';
}
// List of "good" crawlers, mostly from the fediverse.
$good_agents = [
'fediverse.space crawler', 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0',
'Social-Relay/', 'Test Certificate Info', 'Uptimebot/', 'GNUSocialBot', 'UptimeRobot/',
'PTST/', 'Zabbix', 'Poduptime/', 'FediFetcher', 'lemmy-stats-crawler',
'FedditLemmyverseCrawler/', 'lemmy-explorer-crawler/', 'URIports Validator',
'rss-is-dead.lol web bot;', 'fedistatsCrawler/', 'W3C_CSS_Validator_JFouffa/',
'IABot/', 'Slackbot 1', 'BeeperBot/', 'Matrix-Media-Repo/', 'P3P Validator',
'KeybaseBot;', 'Observatory/', 'CSSCheck/', 'FeedBurner/', 'rss-is-dead.lol feed bot;'
];
if (!DI::config()->get('blockbot', 'good_crawlers')) {
$agents = array_merge($agents, $good_agents);
} elseif (blockbot_match($good_agents)) {
return;
}
// List of agents from social media systems that fetch preview data via opem graph or twitter cards.
$socialmedia_agents = ['Twitterbot', 'facebookexternalhit/', 'SkypeUriPreview Preview/',
'TelegramBot', 'WhatsApp/', 'github-camo', 'Bluesky Cardyb/', 'XING-contenttabreceiver/',
'LinkedInBot/', 'Instagram ', 'Synapse (bot; ', 'Discordbot/', 'SummalyBot/',
'Slackbot-LinkExpanding', 'Slack-ImgProxy', 'Iframely/',
];
if (!DI::config()->get('blockbot', 'socialmedia_agents')) {
$agents = array_merge($agents, $socialmedia_agents);
} elseif (blockbot_match($socialmedia_agents)) {
return;
}
// HTTP Libraries
$http_libraries = ['ReactorNetty/', 'GuzzleHttp/', 'Embed PHP library', 'python-urllib3/',
'EventMachine HttpClient', 'HTMLParser/', 'node-fetch', 'fasthttp', 'python-httpx/',
'Fuzz Faster U Fool', 'gvfs/', 'Embarcadero URI Client/', 'grub-client'
];
if (!DI::config()->get('blockbot', 'http_libraries')) {
$agents = array_merge($agents, $http_libraries);
} elseif (blockbot_match($http_libraries)) {
return;
}
if (blockbot_match($agents)) {
throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at https://github.com/friendica/friendica');
}
// This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
if (!DI::config()->get('blockbot', 'training')) {
return;
}
$crawlerDetect = new CrawlerDetect();
if (!$crawlerDetect->isCrawler()) {
logger::debug('Good user agent detected', $logdata);
return;
}
// List of known "good" agents, mostly used by Fediverse systems, feed readers, ...
$agents = [
'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/',
'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/',
'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma',
'Dispatch/', 'Ruby', 'Java/', 'libwww-perl/', 'Mastodon/', 'FeedlyApp/',
'lua-resty-http/', 'Tiny Tiny RSS/', 'Wget/', 'PostmanRuntime/',
'W3C_Validator/', 'NetNewsWire', 'FeedValidator/', 'theoldreader.com', 'axios/',
'Paw/', 'PeerTube/', 'fedi.inex.dev', 'FediDB/', 'index.community crawler',
'Slackbot-LinkExpanding', 'Firefish/', 'Takahe/', 'Akkoma ', 'Misskey/', 'Lynx/',
'camo-rs asset proxy', 'gotosocial/', 'incestoma ', 'SpaceCowboys Android RSS Reader',
'NewsBlur Feed Finder', 'Lemmy/', 'enby-town/', 'rss2tg bot;', '; HTTrack ',
'MbinBot', 'kbinBot', 'Pixelfed/', 'NewsBlur Feed Fetcher', 'NewsBlur Page Fetcher',
'facebookexternalua', 'FreshRSS/', 'BookWyrm/', 'Reeder/', 'microblogpub/',
];
if (blockbot_match($agents)) {
logger::info('False positive', $logdata);
return;
}
blockbot_save('blocked-bot', $_SERVER['HTTP_USER_AGENT']);
logger::notice('Blocked bot', $logdata);
throw new ForbiddenException('Bots are not allowed. If you consider this a mistake, create an issue at https://github.com/friendica/friendica');
return blockbot_match($agents, $agent);
}
function blockbot_save($database, $userAgent)
{
if (!function_exists('dba_open')) {
return;
}
$ressource = dba_open(System::getTempPath() . '/' . $database, 'cl');
$result = dba_fetch($userAgent, $ressource);
if ($result === false) {
dba_insert($userAgent, 1, $ressource);
} else {
dba_replace($userAgent, ++$result, $ressource);
}
dba_close($ressource);
}
function blockbot_match(array $agents)
function blockbot_match(array $agents, string $request_agent)
{
foreach ($agents as $agent) {
if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) {
if (stristr($request_agent, $agent)) {
return true;
}
}