From 5e18b276af2aa17856e56bc782bebaeaba8b1c52 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sat, 27 Apr 2019 13:51:44 +0200 Subject: [PATCH 01/31] Blockbot: Avoid false positives --- blockbot/blockbot.php | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 30ecc3a6e..5d2242ef3 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -11,6 +11,7 @@ use Friendica\App; use Friendica\Core\Hook; use Friendica\Core\System; use Jaybizzle\CrawlerDetect\CrawlerDetect; +use Friendica\Core\Logger; require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php'; @@ -26,7 +27,23 @@ function blockbot_uninstall() { function blockbot_init_1(App $a) { $crawlerDetect = new CrawlerDetect(); + // List of strings of known "good" agents + $agents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)', + 'Micro.blog', 'Mastodon', 'hackney', 'GangGo', 'python/federation', 'GNU social', 'winHttp', + 'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab', + 'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink']; + if ($crawlerDetect->isCrawler()) { + foreach ($agents as $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { + // @ToDo: Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/326 + logger::notice('False positive', ['agent' => $_SERVER['HTTP_USER_AGENT']]); + return; + } + } + logger::info('Blocked bot', ['agent' => $_SERVER['HTTP_USER_AGENT']]); System::httpExit(403, 'Bots are not allowed'); + } else { + logger::debug('Good user agent detected', ['agent' => $_SERVER['HTTP_USER_AGENT']]); } } From 89b9baf392bd86ec093e77c50f7b7fac6ba2d24d Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sat, 27 Apr 2019 15:33:06 +0200 Subject: [PATCH 02/31] There are in fact many false positives ... 
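For context, the check introduced in the previous patch boils down to three steps: ask CrawlerDetect whether the User-Agent looks automated, let agents from the known "good" list pass anyway, and answer 403 for the rest. (The lowercase `logger::notice()` calls still resolve to the imported `Friendica\Core\Logger`, since PHP class names are case-insensitive.) The helper below is only an illustrative sketch of that order; the function name is invented and it is not part of the addon:

    use Jaybizzle\CrawlerDetect\CrawlerDetect;

    // Returns true when the request should be answered with 403.
    function blockbot_sketch_is_blocked(string $userAgent, array $goodAgents): bool
    {
        $crawlerDetect = new CrawlerDetect();
        if (!$crawlerDetect->isCrawler($userAgent)) {
            return false;             // not flagged at all
        }
        foreach ($goodAgents as $agent) {
            if (stristr($userAgent, $agent) !== false) {
                return false;         // flagged, but on the allowlist
            }
        }
        return true;                  // flagged and unknown: block
    }
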
--- blockbot/blockbot.php | 54 ++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 5d2242ef3..587bc9d13 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -27,23 +27,51 @@ function blockbot_uninstall() { function blockbot_init_1(App $a) { $crawlerDetect = new CrawlerDetect(); + $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; + + if (!$crawlerDetect->isCrawler()) { + logger::debug('Good user agent detected', $logdata); + return; + } + + // List of strings of reported false positives + $agents = ['Mastodon', 'hackney', 'Faraday', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', + 'WordPress']; + foreach ($agents as $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { + // The agents had been reported to https://github.com/JayBizzle/Crawler-Detect/issues/ + logger::notice('Already reported wrong detection', $logdata); + return; + } + } + // List of strings of known "good" agents $agents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)', - 'Micro.blog', 'Mastodon', 'hackney', 'GangGo', 'python/federation', 'GNU social', 'winHttp', + 'Micro.blog', 'GangGo', 'python/federation', 'GNU social', 'winHttp', 'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab', - 'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink']; + 'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink', + 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay']; - if ($crawlerDetect->isCrawler()) { - foreach ($agents as $agent) { - if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - // @ToDo: Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/326 - logger::notice('False positive', ['agent' => $_SERVER['HTTP_USER_AGENT']]); - return; - } + foreach ($agents as $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { + // Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/ + // After report move it into the array above + logger::notice('False positive', $logdata); + return; } - logger::info('Blocked bot', ['agent' => $_SERVER['HTTP_USER_AGENT']]); - System::httpExit(403, 'Bots are not allowed'); - } else { - logger::debug('Good user agent detected', ['agent' => $_SERVER['HTTP_USER_AGENT']]); } + + // List of known crawlers. They are added here to avoid having them logged at the end of the function. + // This helps to detect false positives + $agents = ['Mozilla/5.0 (compatible; SemrushBot/3~bl; +http://www.semrush.com/bot.html)', 'SEMrushBot', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~feedly-nikon3)']; + + foreach ($agents as $agent) { + if ($_SERVER['HTTP_USER_AGENT'] == $agent) { + System::httpExit(403, 'Bots are not allowed'); + } + } + + logger::info('Blocked bot', $logdata); + System::httpExit(403, 'Bots are not allowed'); } From 939d17e1eded0636dd36dd65910a88d1bf289466 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sat, 27 Apr 2019 15:34:51 +0200 Subject: [PATCH 03/31] There are in fact many false positives ... 
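After this rewrite the hook walks through the User-Agent in a fixed order: bail out early when CrawlerDetect sees nothing, allow anything on the two false-positive allowlists, hard-block the handful of crawlers listed by their complete User-Agent string, and finally log and block whatever is left. The sketch below condenses that order; the function name and the 'allow'/'block' return values are invented for illustration:

    // Order of checks after this rewrite (logging omitted). Note that the
    // known-crawler list is still compared with an exact match at this stage.
    function blockbot_sketch_classify(string $ua, array $reported, array $good, array $knownCrawlers): string
    {
        foreach (array_merge($reported, $good) as $agent) {
            if (stristr($ua, $agent) !== false) {
                return 'allow';           // reported or known-good agent
            }
        }
        if (in_array($ua, $knownCrawlers, true)) {
            return 'block';               // listed crawler, blocked without logging
        }
        return 'block-and-log';           // unknown crawler: logged for review
    }
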
--- blockbot/blockbot.php | 54 ++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 5d2242ef3..587bc9d13 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -27,23 +27,51 @@ function blockbot_uninstall() { function blockbot_init_1(App $a) { $crawlerDetect = new CrawlerDetect(); + $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; + + if (!$crawlerDetect->isCrawler()) { + logger::debug('Good user agent detected', $logdata); + return; + } + + // List of strings of reported false positives + $agents = ['Mastodon', 'hackney', 'Faraday', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', + 'WordPress']; + foreach ($agents as $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { + // The agents had been reported to https://github.com/JayBizzle/Crawler-Detect/issues/ + logger::notice('Already reported wrong detection', $logdata); + return; + } + } + // List of strings of known "good" agents $agents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)', - 'Micro.blog', 'Mastodon', 'hackney', 'GangGo', 'python/federation', 'GNU social', 'winHttp', + 'Micro.blog', 'GangGo', 'python/federation', 'GNU social', 'winHttp', 'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab', - 'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink']; + 'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink', + 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay']; - if ($crawlerDetect->isCrawler()) { - foreach ($agents as $agent) { - if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - // @ToDo: Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/326 - logger::notice('False positive', ['agent' => $_SERVER['HTTP_USER_AGENT']]); - return; - } + foreach ($agents as $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { + // Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/ + // After report move it into the array above + logger::notice('False positive', $logdata); + return; } - logger::info('Blocked bot', ['agent' => $_SERVER['HTTP_USER_AGENT']]); - System::httpExit(403, 'Bots are not allowed'); - } else { - logger::debug('Good user agent detected', ['agent' => $_SERVER['HTTP_USER_AGENT']]); } + + // List of known crawlers. They are added here to avoid having them logged at the end of the function. 
+ // This helps to detect false positives + $agents = ['Mozilla/5.0 (compatible; SemrushBot/3~bl; +http://www.semrush.com/bot.html)', 'SEMrushBot', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~feedly-nikon3)']; + + foreach ($agents as $agent) { + if ($_SERVER['HTTP_USER_AGENT'] == $agent) { + System::httpExit(403, 'Bots are not allowed'); + } + } + + logger::info('Blocked bot', $logdata); + System::httpExit(403, 'Bots are not allowed'); } From 77acddba49c6e53f0d398ba780e3dce9c58ef575 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sat, 27 Apr 2019 15:50:25 +0200 Subject: [PATCH 04/31] And some more added "good" agent --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 587bc9d13..1792593fd 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -36,7 +36,7 @@ function blockbot_init_1(App $a) { // List of strings of reported false positives $agents = ['Mastodon', 'hackney', 'Faraday', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', - 'WordPress']; + 'WordPress', 'http.rb']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { // The agents had been reported to https://github.com/JayBizzle/Crawler-Detect/issues/ From 758a36dfc6989afb2b959e51c63bc87db8b4db05 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sat, 27 Apr 2019 17:17:11 +0200 Subject: [PATCH 05/31] Rearranged user-agent list, tested against log data --- blockbot/blockbot.php | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 1792593fd..dd84df372 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -35,39 +35,34 @@ function blockbot_init_1(App $a) { } // List of strings of reported false positives - $agents = ['Mastodon', 'hackney', 'Faraday', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', - 'WordPress', 'http.rb']; + $agents = ['hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', + 'WordPress/', 'http.rb/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { // The agents had been reported to https://github.com/JayBizzle/Crawler-Detect/issues/ - logger::notice('Already reported wrong detection', $logdata); + logger::notice('Reported false positive', $logdata); return; } } - // List of strings of known "good" agents - $agents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)', - 'Micro.blog', 'GangGo', 'python/federation', 'GNU social', 'winHttp', - 'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab', - 'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink', - 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay']; + // List of false positives' strings of known "good" agents we haven't reported (yet) + $agents = ['fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay/', + 'curl', 'zgrab', 'Go-http-client', 'curb']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - // Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/ - // After report move it into the array above - logger::notice('False positive', $logdata); + logger::notice('Unreported falsely 
detected agent', $logdata); return; } } // List of known crawlers. They are added here to avoid having them logged at the end of the function. - // This helps to detect false positives - $agents = ['Mozilla/5.0 (compatible; SemrushBot/3~bl; +http://www.semrush.com/bot.html)', 'SEMrushBot', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~feedly-nikon3)']; + // This helps to detect false positives. + $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', + 'Diffbot/']; foreach ($agents as $agent) { - if ($_SERVER['HTTP_USER_AGENT'] == $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { System::httpExit(403, 'Bots are not allowed'); } } From 18f77b94e18f0def8a1ec8292f78097b20e8f273 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sat, 27 Apr 2019 20:55:58 +0200 Subject: [PATCH 06/31] And some more false positives and known bots --- blockbot/blockbot.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index dd84df372..c4a6fda38 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -47,7 +47,7 @@ function blockbot_init_1(App $a) { // List of false positives' strings of known "good" agents we haven't reported (yet) $agents = ['fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay/', - 'curl', 'zgrab', 'Go-http-client', 'curb']; + 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -59,7 +59,8 @@ function blockbot_init_1(App $a) { // List of known crawlers. They are added here to avoid having them logged at the end of the function. // This helps to detect false positives. $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', - 'Diffbot/']; + 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', + 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From a3ad6e42e04fe79b37fc043743fe47ce99f81bf7 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sun, 28 Apr 2019 08:44:56 +0200 Subject: [PATCH 07/31] More bots, more false positives --- blockbot/blockbot.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index c4a6fda38..424ffb719 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -47,7 +47,8 @@ function blockbot_init_1(App $a) { // List of false positives' strings of known "good" agents we haven't reported (yet) $agents = ['fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay/', - 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest']; + 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/', + 'Python-urllib/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -60,7 +61,8 @@ function blockbot_init_1(App $a) { // This helps to detect false positives. 
$agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', - 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon']; + 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', + 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 934df28d0cf9ab0612528efd58498636fad0e30d Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sun, 28 Apr 2019 23:01:24 +0200 Subject: [PATCH 08/31] Some more bots and false positives --- blockbot/blockbot.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 424ffb719..81369df4f 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -48,7 +48,7 @@ function blockbot_init_1(App $a) { // List of false positives' strings of known "good" agents we haven't reported (yet) $agents = ['fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay/', 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/', - 'Python-urllib/']; + 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -62,7 +62,8 @@ function blockbot_init_1(App $a) { $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', - 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/']; + 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', + 'facebookexternalhit/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From ccebe7ebf79e5b6474c5f26c6d3e8fe98fb396e1 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Mon, 29 Apr 2019 17:46:04 +0200 Subject: [PATCH 09/31] And some more ... --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 81369df4f..30f0b73cc 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -63,7 +63,7 @@ function blockbot_init_1(App $a) { 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', - 'facebookexternalhit/']; + 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From e6164536e8b897a6c5041bb4a555079171529848 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Mon, 29 Apr 2019 22:21:42 +0200 Subject: [PATCH 10/31] The "good bots" lists had been unified. 
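Since patch 05 every list in this hook (reported false positives, good agents and known crawlers alike) is matched with the same case-insensitive substring test, which is why short needles such as 'curl' or 'Googlebot/' cover every version string a client may send. The tiny helper below is hypothetical, not part of the addon; it only restates the semantics of the loops used above:

    // Case-insensitive substring match against a list of needles.
    // stristr() returns the matched tail or false, hence the explicit check.
    function blockbot_sketch_match_any(string $userAgent, array $needles): bool
    {
        foreach ($needles as $needle) {
            if (stristr($userAgent, $needle) !== false) {
                return true;
            }
        }
        return false;
    }

    // Example: both calls return true, because a needle is contained in the agent string:
    //   blockbot_sketch_match_any('curl/7.64.1', ['curl', 'zgrab']);
    //   blockbot_sketch_match_any('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', ['Googlebot/']);
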
--- blockbot/blockbot.php | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 30f0b73cc..9e590a99e 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -34,25 +34,16 @@ function blockbot_init_1(App $a) { return; } - // List of strings of reported false positives - $agents = ['hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', - 'WordPress/', 'http.rb/']; - foreach ($agents as $agent) { - if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - // The agents had been reported to https://github.com/JayBizzle/Crawler-Detect/issues/ - logger::notice('Reported false positive', $logdata); - return; - } - } - - // List of false positives' strings of known "good" agents we haven't reported (yet) + // List of false positives' strings of known "good" agents. $agents = ['fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay/', 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/', - 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader']; + 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', + 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', + 'WordPress/', 'http.rb/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - logger::notice('Unreported falsely detected agent', $logdata); + logger::notice('False positive', $logdata); return; } } From 08c890d65101f559ff3e8459089460acb488562b Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Fri, 3 May 2019 12:25:13 +0200 Subject: [PATCH 11/31] Avoid warnings --- blockbot/blockbot.php | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 9e590a99e..41f31827a 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -27,6 +27,10 @@ function blockbot_uninstall() { function blockbot_init_1(App $a) { $crawlerDetect = new CrawlerDetect(); + if (empty($_SERVER['HTTP_USER_AGENT'])) { + return; + } + $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; if (!$crawlerDetect->isCrawler()) { @@ -39,7 +43,7 @@ function blockbot_init_1(App $a) { 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/', 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', - 'WordPress/', 'http.rb/']; + 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -58,10 +62,10 @@ function blockbot_init_1(App $a) { foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - System::httpExit(403, 'Bots are not allowed'); + System::httpExit(403, ['title' => 'Bots are not allowed']); } } logger::info('Blocked bot', $logdata); - System::httpExit(403, 'Bots are not allowed'); + System::httpExit(403, ['title' => 'Bots are not allowed']); } From 9e8ab6ccf064928423965006ccecb938dce14a50 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 29 May 2019 18:51:07 +0000 Subject: [PATCH 12/31] Added training mode --- blockbot/blockbot.php | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 41f31827a..f0eb8eb72 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ 
-8,6 +8,7 @@ */ use Friendica\App; +use Friendica\Core\Config; use Friendica\Core\Hook; use Friendica\Core\System; use Jaybizzle\CrawlerDetect\CrawlerDetect; @@ -25,14 +26,33 @@ function blockbot_uninstall() { } function blockbot_init_1(App $a) { - $crawlerDetect = new CrawlerDetect(); - if (empty($_SERVER['HTTP_USER_AGENT'])) { return; } $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; + // List of known crawlers. They are added here to avoid having them logged at the end of the function. + // This helps to detect false positives. + $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', + 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', + 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', + 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', + 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider']; + + foreach ($agents as $agent) { + if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { + System::httpExit(403, 'Bots are not allowed'); + } + } + + // This switch here is just meant for developers who want to add more bots to the list above + if (!Config::get('blockbot', 'training')) { + return; + } + + $crawlerDetect = new CrawlerDetect(); + if (!$crawlerDetect->isCrawler()) { logger::debug('Good user agent detected', $logdata); return; @@ -52,20 +72,6 @@ function blockbot_init_1(App $a) { } } - // List of known crawlers. They are added here to avoid having them logged at the end of the function. - // This helps to detect false positives. - $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', - 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', - 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', - 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', - 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider']; - - foreach ($agents as $agent) { - if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { - System::httpExit(403, ['title' => 'Bots are not allowed']); - } - } - logger::info('Blocked bot', $logdata); - System::httpExit(403, ['title' => 'Bots are not allowed']); + System::httpExit(403, 'Bots are not allowed'); } From efaefc4e1b717240aacb6b33223ac01f7ede74be Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 29 May 2019 19:13:15 +0000 Subject: [PATCH 13/31] Just some more bots --- blockbot/blockbot.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index f0eb8eb72..60fbca7b4 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -38,7 +38,8 @@ function blockbot_init_1(App $a) { 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', - 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider']; + 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', + 'AwarioRssBot/', 'Zabbix']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 501e58696884c31b95315dbd872e280221e85b15 Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 30 May 2019 04:17:35 
+0000 Subject: [PATCH 14/31] Added false positive --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 60fbca7b4..2f4de3f96 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -64,7 +64,7 @@ function blockbot_init_1(App $a) { 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/', 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', - 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;']; + 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 6f4dd86ff0905d4cf7868e6eca33da016f106829 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Thu, 30 May 2019 06:45:20 +0200 Subject: [PATCH 15/31] Clarified the comments --- blockbot/blockbot.php | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 2f4de3f96..03c8f1629 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -32,8 +32,7 @@ function blockbot_init_1(App $a) { $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; - // List of known crawlers. They are added here to avoid having them logged at the end of the function. - // This helps to detect false positives. + // List of known crawlers. $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', @@ -47,7 +46,7 @@ function blockbot_init_1(App $a) { } } - // This switch here is just meant for developers who want to add more bots to the list above + // This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production. 
if (!Config::get('blockbot', 'training')) { return; } From 305b814c8eb30bebbe2ae9b9536c37bbc7eb8e6e Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 30 May 2019 10:32:01 +0000 Subject: [PATCH 16/31] Bot added, false positive added --- blockbot/blockbot.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 03c8f1629..a7989b94d 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -38,7 +38,7 @@ function blockbot_init_1(App $a) { 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', - 'AwarioRssBot/', 'Zabbix']; + 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -63,7 +63,8 @@ function blockbot_init_1(App $a) { 'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/', 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', - 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma']; + 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', + 'Dispatch/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 5af7f2a6b63ab061025fc92f632ac57e15682754 Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 30 May 2019 11:52:55 +0000 Subject: [PATCH 17/31] Crawler added --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index a7989b94d..d4eb54a73 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -38,7 +38,7 @@ function blockbot_init_1(App $a) { 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', - 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/']; + 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From cf98d23af1eaa8e3505c293f28d293ce4966ad38 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Thu, 30 May 2019 15:11:33 +0200 Subject: [PATCH 18/31] And some more bot headed --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 03c8f1629..ce296b5a7 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -38,7 +38,7 @@ function blockbot_init_1(App $a) { 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', - 'AwarioRssBot/', 'Zabbix']; + 'AwarioRssBot/', 'Zabbix', 'PhantomJS/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 975155afbf1163d787013d713a0fcd3a8f2f50c1 Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 31 May 2019 15:32:22 +0000 Subject: [PATCH 19/31] And another bot added --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 
81531f32b..b651586b6 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -38,7 +38,7 @@ function blockbot_init_1(App $a) { 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', - 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/']; + 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 4519f9bb1868d8ffecd1386d1f58335d1e6235c6 Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 31 May 2019 15:37:33 +0000 Subject: [PATCH 20/31] And again --- blockbot/blockbot.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index b651586b6..4473bbec5 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -38,7 +38,8 @@ function blockbot_init_1(App $a) { 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', - 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/']; + 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/', + 'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 3ade520803bc48072eedd90762b4c8479534b95d Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 1 Jun 2019 04:51:01 +0000 Subject: [PATCH 21/31] three more crawler --- blockbot/blockbot.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 4473bbec5..11c95ff61 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -39,7 +39,8 @@ function blockbot_init_1(App $a) { 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/', - 'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot']; + 'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot', + 'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From b8ad479acca786d7a3049451ba952efde0d878bc Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 6 Jun 2019 20:31:16 +0000 Subject: [PATCH 22/31] One false positive, many bots added --- blockbot/blockbot.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 11c95ff61..69f9fc016 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -40,7 +40,9 @@ function blockbot_init_1(App $a) { 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', 'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/', 'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot', - 'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/']; + 'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/', + 'NetcraftSurveyAgent/', 
'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/', + 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -66,7 +68,7 @@ function blockbot_init_1(App $a) { 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', - 'Dispatch/']; + 'Dispatch/', 'Ruby']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 0aba97ed84684bc2d92f7ddb2fc0ef065b320c48 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 10 Jun 2019 14:33:42 +0000 Subject: [PATCH 23/31] Two more bad bots, a single good one --- blockbot/blockbot.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 69f9fc016..a807c6e8a 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -42,7 +42,8 @@ function blockbot_init_1(App $a) { 'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot', 'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/', 'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/', - 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler']; + 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler', + 'AhrefsBot/', 'YandexBot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -68,7 +69,7 @@ function blockbot_init_1(App $a) { 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', - 'Dispatch/', 'Ruby']; + 'Dispatch/', 'Ruby', 'Uptimebot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 6646d0f963b7dcec93f1903b54238bf76fb21440 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 11 Jun 2019 10:12:39 +0000 Subject: [PATCH 24/31] One good, many bad ... 
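By this point in the series the hook has settled into the two-stage shape introduced with the empty-User-Agent guard (patch 11) and the training switch (patch 12): the hand-maintained crawler list always blocks, while CrawlerDetect is only consulted when a developer enables training mode to hunt for new bots. The condensed paraphrase below omits the logging and the false-positive allowlist; it is a sketch of the control flow, not a drop-in replacement for the code above:

    use Friendica\Core\Config;
    use Friendica\Core\System;
    use Jaybizzle\CrawlerDetect\CrawlerDetect;

    function blockbot_sketch_hook(array $knownCrawlers)
    {
        $ua = $_SERVER['HTTP_USER_AGENT'] ?? '';
        if ($ua === '') {
            return;                       // patch 11: no warnings on requests without a User-Agent
        }

        foreach ($knownCrawlers as $agent) {
            if (stristr($ua, $agent) !== false) {
                System::httpExit(403, 'Bots are not allowed');   // always enforced
            }
        }

        if (!Config::get('blockbot', 'training')) {
            return;                       // production: unknown agents pass through
        }

        // Training mode only: see what CrawlerDetect would additionally flag.
        if ((new CrawlerDetect())->isCrawler($ua)) {
            System::httpExit(403, 'Bots are not allowed');
        }
    }
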
--- blockbot/blockbot.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index a807c6e8a..e6ca68406 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -43,7 +43,8 @@ function blockbot_init_1(App $a) { 'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/', 'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/', 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler', - 'AhrefsBot/', 'YandexBot/']; + 'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', + 'WhatsApp/', 'TelegramBot']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -69,7 +70,7 @@ function blockbot_init_1(App $a) { 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', - 'Dispatch/', 'Ruby', 'Uptimebot/']; + 'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From e52e6f7be2c79410ca59378277506f9fe839794c Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 12 Jun 2019 09:07:30 +0000 Subject: [PATCH 25/31] And another good library --- blockbot/blockbot.php | 2 +- twitter/twitter.php | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index e6ca68406..0a73b6e1f 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -70,7 +70,7 @@ function blockbot_init_1(App $a) { 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', - 'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/']; + 'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/', 'libwww-perl/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { diff --git a/twitter/twitter.php b/twitter/twitter.php index 406569182..33ee3e1ee 100644 --- a/twitter/twitter.php +++ b/twitter/twitter.php @@ -575,7 +575,7 @@ function twitter_post_hook(App $a, array &$b) return; } - Logger::log('twitter post invoked'); + Logger::notice('twitter post invoked', ['id' => $b['id'], 'guid' => $b['guid']]); PConfig::load($b['uid'], 'twitter'); @@ -610,6 +610,7 @@ function twitter_post_hook(App $a, array &$b) $b['body'] = twitter_update_mentions($b['body']); $msgarr = ItemContent::getPlaintextPost($b, $max_char, true, 8); + Logger::info('Got plaintext', $msgarr); $msg = $msgarr["text"]; if (($msg == "") && isset($msgarr["title"])) { From 24fd0658a34024befdd7f58d06aaea6cd9d23ae1 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 12 Jun 2019 09:11:54 +0000 Subject: [PATCH 26/31] Three bad bots added --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 0a73b6e1f..789e78cb3 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -44,7 +44,7 @@ function blockbot_init_1(App $a) { 'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/', 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler', 'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', - 'WhatsApp/', 'TelegramBot']; + 'WhatsApp/', 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 
'SabsimBot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 056b32cc6df319937e6d03d146b255b286dfd2c0 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 12 Jun 2019 16:27:15 +0000 Subject: [PATCH 27/31] Another one ... --- blockbot/blockbot.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 789e78cb3..e46d98a88 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -44,7 +44,8 @@ function blockbot_init_1(App $a) { 'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/', 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler', 'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', - 'WhatsApp/', 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/']; + 'WhatsApp/', 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', + 'CCBot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 654783c27192ac86eb0566161cf644be7c9ae529 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 17 Jun 2019 14:07:37 +0000 Subject: [PATCH 28/31] One good, one bad ... --- blockbot/blockbot.php | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index e46d98a88..50d790d69 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -43,9 +43,9 @@ function blockbot_init_1(App $a) { 'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/', 'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/', 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler', - 'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', - 'WhatsApp/', 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', - 'CCBot/']; + 'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', 'WhatsApp/', + 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/', + 'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -71,7 +71,8 @@ function blockbot_init_1(App $a) { 'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/', 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', - 'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/', 'libwww-perl/']; + 'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/', 'libwww-perl/', 'Mastodon/', + 'lua-resty-http/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From d784c602cfef6e0f9c9038a142283dff8619fcef Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 20 Jun 2019 05:31:53 +0000 Subject: [PATCH 29/31] Six more bots added --- blockbot/blockbot.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index 50d790d69..c47e77056 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -45,7 +45,9 @@ function blockbot_init_1(App $a) { 'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler', 'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', 'WhatsApp/', 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/', - 'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot']; + 'DuckDuckBot-Https/', 
'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot', + 'ArchiveTeam ArchiveBot/', 'yacybot', 'https://developers.google.com/+/web/snippet/', + 'Scrapy/', 'github-camo', 'MJ12bot/']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { @@ -72,7 +74,7 @@ function blockbot_init_1(App $a) { 'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests', 'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma', 'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/', 'libwww-perl/', 'Mastodon/', - 'lua-resty-http/']; + 'lua-resty-http/', 'Test Certificate Info']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From af09a922cfd596a9b91c64f4c9f752c1906896f4 Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 20 Jun 2019 16:03:13 +0000 Subject: [PATCH 30/31] And some more bots that I just detected --- blockbot/blockbot.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index c47e77056..c8cf7902c 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -33,8 +33,8 @@ function blockbot_init_1(App $a) { $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; // List of known crawlers. - $agents = ['SEMrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', - 'Diffbot/', 'Twitterbot/', 'YisouSpider/', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', + $agents = ['SemrushBot/', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', + 'Diffbot/', 'Twitterbot/', 'YisouSpider', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/', 'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider', @@ -47,7 +47,8 @@ function blockbot_init_1(App $a) { 'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/', 'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot', 'ArchiveTeam ArchiveBot/', 'yacybot', 'https://developers.google.com/+/web/snippet/', - 'Scrapy/', 'github-camo', 'MJ12bot/']; + 'Scrapy/', 'github-camo', 'MJ12bot/', 'DotBot/', 'Pinterestbot/', 'Jooblebot/', + 'Cliqzbot/', 'YaK/', 'Mediatoolkitbot']; foreach ($agents as $agent) { if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) { From 8446ffa42180d3042a196d46af0950ba4a567507 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 23 Jun 2019 09:13:59 +0000 Subject: [PATCH 31/31] semrush uses two different names --- blockbot/blockbot.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockbot/blockbot.php b/blockbot/blockbot.php index c8cf7902c..c16489d80 100644 --- a/blockbot/blockbot.php +++ b/blockbot/blockbot.php @@ -33,7 +33,7 @@ function blockbot_init_1(App $a) { $logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']]; // List of known crawlers. - $agents = ['SemrushBot/', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', + $agents = ['SemrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/', 'Diffbot/', 'Twitterbot/', 'YisouSpider', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/', 'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider', 'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/',