2019-04-20 14:15:45 +02:00
< ? php
/**
 * Name: blockbot
 * Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
 * Version: 0.1
 * Author: Philipp Holzer <admin@philipp.info>
 *
 */
use Friendica\App ;
use Friendica\Core\Hook ;
use Friendica\Core\System ;
use Jaybizzle\CrawlerDetect\CrawlerDetect ;
2019-04-27 13:51:44 +02:00
use Friendica\Core\Logger ;
2019-04-20 14:15:45 +02:00
2019-04-21 12:35:33 +02:00
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php' ;
2019-04-20 20:38:32 +02:00
/**
 * Registers blockbot_init_1() from this file as a handler for the 'init_1' hook.
 */
function blockbot_install()
{
	$handler = 'blockbot_init_1';

	Hook::register('init_1', __FILE__, $handler);
}
2019-04-20 20:38:32 +02:00
/**
 * Removes the blockbot_init_1() handler of this file from the 'init_1' hook.
 */
function blockbot_uninstall()
{
	$handler = 'blockbot_init_1';

	Hook::unregister('init_1', __FILE__, $handler);
}
2019-04-20 20:38:32 +02:00
/**
 * Handler registered on the 'init_1' hook: rejects requests that CrawlerDetect
 * classifies as a crawler, unless the user agent is on one of the allow lists below.
 *
 * Flow:
 *  1. Not classified as a crawler          -> debug log, request continues.
 *  2. Agent contains an allow-listed name  -> notice log, request continues.
 *  3. Agent equals a known crawler string  -> HTTP 403 without logging.
 *  4. Anything else                        -> info log, then HTTP 403.
 *
 * @param App $a Application object handed in by the hook; not used in this function.
 */
function blockbot_init_1(App $a) {
	// Both headers are client-controlled/optional, so they may be missing from $_SERVER.
	// Normalising them once avoids undefined-index notices and null arguments further down.
	$userAgent  = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
	$requestUri = isset($_SERVER['REQUEST_URI']) ? (string) $_SERVER['REQUEST_URI'] : '';

	$logdata = ['agent' => $userAgent, 'uri' => $requestUri];

	$crawlerDetect = new CrawlerDetect();

	// Called without an argument, so the library evaluates the current request itself.
	if (!$crawlerDetect->isCrawler()) {
		Logger::debug('Good user agent detected', $logdata);
		return;
	}

	// List of strings of reported false positives.
	// The agents had been reported to https://github.com/JayBizzle/Crawler-Detect/issues/
	$reportedFalsePositives = ['Mastodon', 'hackney', 'Faraday', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
		'WordPress', 'http.rb'];

	foreach ($reportedFalsePositives as $agent) {
		// Case-insensitive substring match; an empty user agent never matches.
		if (stripos($userAgent, $agent) !== false) {
			Logger::notice('Already reported wrong detection', $logdata);
			return;
		}
	}

	// List of strings of known "good" agents.
	// Report every false positive here: https://github.com/JayBizzle/Crawler-Detect/issues/
	// After report move it into the array above.
	$knownGoodAgents = ['diaspora-connection-tester', 'DiasporaFederation', 'Friendica', '(compatible; zot)',
		'Micro.blog', 'GangGo', 'python/federation', 'GNU social', 'winHttp',
		'Go-http-client', 'Mr.4x3 Powered', 'Test Certificate Info', 'WordPress.com', 'zgrab',
		'curl/', 'StatusNet', 'OpenGraphReader/', 'Uptimebot/', 'python-opengraph-jaywink',
		'fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay'];

	foreach ($knownGoodAgents as $agent) {
		if (stripos($userAgent, $agent) !== false) {
			Logger::notice('False positive', $logdata);
			return;
		}
	}

	// List of known crawlers. They are added here to avoid having them logged at the end of the function.
	// This helps to detect false positives.
	$knownCrawlers = ['Mozilla/5.0 (compatible; SemrushBot/3~bl; +http://www.semrush.com/bot.html)', 'SEMrushBot',
		'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 AppEngine-Google; (+http://code.google.com/appengine; appid: s~feedly-nikon3)'];

	// Exact, type-strict comparison against the full user agent string.
	if (in_array($userAgent, $knownCrawlers, true)) {
		System::httpExit(403, 'Bots are not allowed');
	}

	Logger::info('Blocked bot', $logdata);
	System::httpExit(403, 'Bots are not allowed');
}