Merge pull request #846 from nupplaphil/features/6948-bot_detection

New Addon Bot detection
This commit is contained in:
Hypolite Petovan 2019-04-22 07:49:18 -04:00 committed by GitHub
commit cf741fe3e9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
27 changed files with 3793 additions and 0 deletions

32
blockbot/blockbot.php Normal file
View file

@ -0,0 +1,32 @@
<?php
/**
* Name: blockbot
* Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
* Version: 0.1
* Author: Philipp Holzer <admin@philipp.info>
*
*/
use Friendica\App;
use Friendica\Core\Hook;
use Friendica\Core\System;
use Jaybizzle\CrawlerDetect\CrawlerDetect;
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
function blockbot_install() {
Hook::register('init_1', __FILE__, 'blockbot_init_1');
}
function blockbot_uninstall() {
Hook::unregister('init_1', __FILE__, 'blockbot_init_1');
}
function blockbot_init_1(App $a) {
$crawlerDetect = new CrawlerDetect();
if ($crawlerDetect->isCrawler()) {
System::httpExit(403, 'Bots are not allowed');
}
}

24
blockbot/composer.json Normal file
View file

@ -0,0 +1,24 @@
{
"name": "friendica-addons/blockbot",
"description": "Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.",
"type": "friendica-addon",
"authors": [
{
"name": "Philipp Holzer",
"email": "admin@philipp.info",
"homepage": "https://friendica.philipp.info/profile/nupplaphil",
"role": "Developer"
}
],
"require": {
"php": ">=5.6.0",
"jaybizzle/crawler-detect": "1.*"
},
"license": "3-clause BSD license",
"minimum-stability": "stable",
"config": {
"optimize-autoloader": true,
"autoloader-suffix": "BlockBotAddon",
"preferred-install": "dist"
}
}

69
blockbot/composer.lock generated Normal file
View file

@ -0,0 +1,69 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "814fd867d00e99f84d12304e8e244aae",
"packages": [
{
"name": "jaybizzle/crawler-detect",
"version": "v1.2.80",
"source": {
"type": "git",
"url": "https://github.com/JayBizzle/Crawler-Detect.git",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/JayBizzle/Crawler-Detect/zipball/af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5",
"satooshi/php-coveralls": "1.*"
},
"type": "library",
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"keywords": [
"crawler",
"crawler detect",
"crawler detector",
"crawlerdetect",
"php crawler detect"
],
"time": "2019-04-05T19:52:02+00:00"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": {
"php": ">=5.6.0"
},
"platform-dev": []
}

7
blockbot/vendor/autoload.php vendored Normal file
View file

@ -0,0 +1,7 @@
<?php
// autoload.php @generated by Composer
require_once __DIR__ . '/composer/autoload_real.php';
return ComposerAutoloaderInitBlockBotAddon::getLoader();

445
blockbot/vendor/composer/ClassLoader.php vendored Normal file
View file

@ -0,0 +1,445 @@
<?php
/*
* This file is part of Composer.
*
* (c) Nils Adermann <naderman@naderman.de>
* Jordi Boggiano <j.boggiano@seld.be>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Composer\Autoload;
/**
* ClassLoader implements a PSR-0, PSR-4 and classmap class loader.
*
* $loader = new \Composer\Autoload\ClassLoader();
*
* // register classes with namespaces
* $loader->add('Symfony\Component', __DIR__.'/component');
* $loader->add('Symfony', __DIR__.'/framework');
*
* // activate the autoloader
* $loader->register();
*
* // to enable searching the include path (eg. for PEAR packages)
* $loader->setUseIncludePath(true);
*
* In this example, if you try to use a class in the Symfony\Component
* namespace or one of its children (Symfony\Component\Console for instance),
* the autoloader will first look for the class under the component/
* directory, and it will then fallback to the framework/ directory if not
* found before giving up.
*
* This class is loosely based on the Symfony UniversalClassLoader.
*
* @author Fabien Potencier <fabien@symfony.com>
* @author Jordi Boggiano <j.boggiano@seld.be>
* @see http://www.php-fig.org/psr/psr-0/
* @see http://www.php-fig.org/psr/psr-4/
*/
class ClassLoader
{
// PSR-4
private $prefixLengthsPsr4 = array();
private $prefixDirsPsr4 = array();
private $fallbackDirsPsr4 = array();
// PSR-0
private $prefixesPsr0 = array();
private $fallbackDirsPsr0 = array();
private $useIncludePath = false;
private $classMap = array();
private $classMapAuthoritative = false;
private $missingClasses = array();
private $apcuPrefix;
public function getPrefixes()
{
if (!empty($this->prefixesPsr0)) {
return call_user_func_array('array_merge', $this->prefixesPsr0);
}
return array();
}
public function getPrefixesPsr4()
{
return $this->prefixDirsPsr4;
}
public function getFallbackDirs()
{
return $this->fallbackDirsPsr0;
}
public function getFallbackDirsPsr4()
{
return $this->fallbackDirsPsr4;
}
public function getClassMap()
{
return $this->classMap;
}
/**
* @param array $classMap Class to filename map
*/
public function addClassMap(array $classMap)
{
if ($this->classMap) {
$this->classMap = array_merge($this->classMap, $classMap);
} else {
$this->classMap = $classMap;
}
}
/**
* Registers a set of PSR-0 directories for a given prefix, either
* appending or prepending to the ones previously set for this prefix.
*
* @param string $prefix The prefix
* @param array|string $paths The PSR-0 root directories
* @param bool $prepend Whether to prepend the directories
*/
public function add($prefix, $paths, $prepend = false)
{
if (!$prefix) {
if ($prepend) {
$this->fallbackDirsPsr0 = array_merge(
(array) $paths,
$this->fallbackDirsPsr0
);
} else {
$this->fallbackDirsPsr0 = array_merge(
$this->fallbackDirsPsr0,
(array) $paths
);
}
return;
}
$first = $prefix[0];
if (!isset($this->prefixesPsr0[$first][$prefix])) {
$this->prefixesPsr0[$first][$prefix] = (array) $paths;
return;
}
if ($prepend) {
$this->prefixesPsr0[$first][$prefix] = array_merge(
(array) $paths,
$this->prefixesPsr0[$first][$prefix]
);
} else {
$this->prefixesPsr0[$first][$prefix] = array_merge(
$this->prefixesPsr0[$first][$prefix],
(array) $paths
);
}
}
/**
* Registers a set of PSR-4 directories for a given namespace, either
* appending or prepending to the ones previously set for this namespace.
*
* @param string $prefix The prefix/namespace, with trailing '\\'
* @param array|string $paths The PSR-4 base directories
* @param bool $prepend Whether to prepend the directories
*
* @throws \InvalidArgumentException
*/
public function addPsr4($prefix, $paths, $prepend = false)
{
if (!$prefix) {
// Register directories for the root namespace.
if ($prepend) {
$this->fallbackDirsPsr4 = array_merge(
(array) $paths,
$this->fallbackDirsPsr4
);
} else {
$this->fallbackDirsPsr4 = array_merge(
$this->fallbackDirsPsr4,
(array) $paths
);
}
} elseif (!isset($this->prefixDirsPsr4[$prefix])) {
// Register directories for a new namespace.
$length = strlen($prefix);
if ('\\' !== $prefix[$length - 1]) {
throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
}
$this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
$this->prefixDirsPsr4[$prefix] = (array) $paths;
} elseif ($prepend) {
// Prepend directories for an already registered namespace.
$this->prefixDirsPsr4[$prefix] = array_merge(
(array) $paths,
$this->prefixDirsPsr4[$prefix]
);
} else {
// Append directories for an already registered namespace.
$this->prefixDirsPsr4[$prefix] = array_merge(
$this->prefixDirsPsr4[$prefix],
(array) $paths
);
}
}
/**
* Registers a set of PSR-0 directories for a given prefix,
* replacing any others previously set for this prefix.
*
* @param string $prefix The prefix
* @param array|string $paths The PSR-0 base directories
*/
public function set($prefix, $paths)
{
if (!$prefix) {
$this->fallbackDirsPsr0 = (array) $paths;
} else {
$this->prefixesPsr0[$prefix[0]][$prefix] = (array) $paths;
}
}
/**
* Registers a set of PSR-4 directories for a given namespace,
* replacing any others previously set for this namespace.
*
* @param string $prefix The prefix/namespace, with trailing '\\'
* @param array|string $paths The PSR-4 base directories
*
* @throws \InvalidArgumentException
*/
public function setPsr4($prefix, $paths)
{
if (!$prefix) {
$this->fallbackDirsPsr4 = (array) $paths;
} else {
$length = strlen($prefix);
if ('\\' !== $prefix[$length - 1]) {
throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
}
$this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
$this->prefixDirsPsr4[$prefix] = (array) $paths;
}
}
/**
* Turns on searching the include path for class files.
*
* @param bool $useIncludePath
*/
public function setUseIncludePath($useIncludePath)
{
$this->useIncludePath = $useIncludePath;
}
/**
* Can be used to check if the autoloader uses the include path to check
* for classes.
*
* @return bool
*/
public function getUseIncludePath()
{
return $this->useIncludePath;
}
/**
* Turns off searching the prefix and fallback directories for classes
* that have not been registered with the class map.
*
* @param bool $classMapAuthoritative
*/
public function setClassMapAuthoritative($classMapAuthoritative)
{
$this->classMapAuthoritative = $classMapAuthoritative;
}
/**
* Should class lookup fail if not found in the current class map?
*
* @return bool
*/
public function isClassMapAuthoritative()
{
return $this->classMapAuthoritative;
}
/**
* APCu prefix to use to cache found/not-found classes, if the extension is enabled.
*
* @param string|null $apcuPrefix
*/
public function setApcuPrefix($apcuPrefix)
{
$this->apcuPrefix = function_exists('apcu_fetch') && ini_get('apc.enabled') ? $apcuPrefix : null;
}
/**
* The APCu prefix in use, or null if APCu caching is not enabled.
*
* @return string|null
*/
public function getApcuPrefix()
{
return $this->apcuPrefix;
}
/**
* Registers this instance as an autoloader.
*
* @param bool $prepend Whether to prepend the autoloader or not
*/
public function register($prepend = false)
{
spl_autoload_register(array($this, 'loadClass'), true, $prepend);
}
/**
* Unregisters this instance as an autoloader.
*/
public function unregister()
{
spl_autoload_unregister(array($this, 'loadClass'));
}
/**
* Loads the given class or interface.
*
* @param string $class The name of the class
* @return bool|null True if loaded, null otherwise
*/
public function loadClass($class)
{
if ($file = $this->findFile($class)) {
includeFile($file);
return true;
}
}
/**
* Finds the path to the file where the class is defined.
*
* @param string $class The name of the class
*
* @return string|false The path if found, false otherwise
*/
public function findFile($class)
{
// class map lookup
if (isset($this->classMap[$class])) {
return $this->classMap[$class];
}
if ($this->classMapAuthoritative || isset($this->missingClasses[$class])) {
return false;
}
if (null !== $this->apcuPrefix) {
$file = apcu_fetch($this->apcuPrefix.$class, $hit);
if ($hit) {
return $file;
}
}
$file = $this->findFileWithExtension($class, '.php');
// Search for Hack files if we are running on HHVM
if (false === $file && defined('HHVM_VERSION')) {
$file = $this->findFileWithExtension($class, '.hh');
}
if (null !== $this->apcuPrefix) {
apcu_add($this->apcuPrefix.$class, $file);
}
if (false === $file) {
// Remember that this class does not exist.
$this->missingClasses[$class] = true;
}
return $file;
}
private function findFileWithExtension($class, $ext)
{
// PSR-4 lookup
$logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext;
$first = $class[0];
if (isset($this->prefixLengthsPsr4[$first])) {
$subPath = $class;
while (false !== $lastPos = strrpos($subPath, '\\')) {
$subPath = substr($subPath, 0, $lastPos);
$search = $subPath . '\\';
if (isset($this->prefixDirsPsr4[$search])) {
$pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1);
foreach ($this->prefixDirsPsr4[$search] as $dir) {
if (file_exists($file = $dir . $pathEnd)) {
return $file;
}
}
}
}
}
// PSR-4 fallback dirs
foreach ($this->fallbackDirsPsr4 as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr4)) {
return $file;
}
}
// PSR-0 lookup
if (false !== $pos = strrpos($class, '\\')) {
// namespaced class name
$logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1)
. strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR);
} else {
// PEAR-like class name
$logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext;
}
if (isset($this->prefixesPsr0[$first])) {
foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) {
if (0 === strpos($class, $prefix)) {
foreach ($dirs as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
return $file;
}
}
}
}
}
// PSR-0 fallback dirs
foreach ($this->fallbackDirsPsr0 as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
return $file;
}
}
// PSR-0 include paths.
if ($this->useIncludePath && $file = stream_resolve_include_path($logicalPathPsr0)) {
return $file;
}
return false;
}
}
/**
* Scope isolated include.
*
* Prevents access to $this/self from included files.
*/
function includeFile($file)
{
include $file;
}

21
blockbot/vendor/composer/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
Copyright (c) Nils Adermann, Jordi Boggiano
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -0,0 +1,14 @@
<?php
// autoload_classmap.php @generated by Composer
$vendorDir = dirname(dirname(__FILE__));
$baseDir = dirname($vendorDir);
return array(
'Jaybizzle\\CrawlerDetect\\CrawlerDetect' => $vendorDir . '/jaybizzle/crawler-detect/src/CrawlerDetect.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\AbstractProvider' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/AbstractProvider.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Crawlers' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/Crawlers.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Exclusions' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/Exclusions.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Headers' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/Headers.php',
);

View file

@ -0,0 +1,9 @@
<?php
// autoload_namespaces.php @generated by Composer
$vendorDir = dirname(dirname(__FILE__));
$baseDir = dirname($vendorDir);
return array(
);

View file

@ -0,0 +1,10 @@
<?php
// autoload_psr4.php @generated by Composer
$vendorDir = dirname(dirname(__FILE__));
$baseDir = dirname($vendorDir);
return array(
'Jaybizzle\\CrawlerDetect\\' => array($vendorDir . '/jaybizzle/crawler-detect/src'),
);

View file

@ -0,0 +1,52 @@
<?php
// autoload_real.php @generated by Composer
class ComposerAutoloaderInitBlockBotAddon
{
private static $loader;
public static function loadClassLoader($class)
{
if ('Composer\Autoload\ClassLoader' === $class) {
require __DIR__ . '/ClassLoader.php';
}
}
public static function getLoader()
{
if (null !== self::$loader) {
return self::$loader;
}
spl_autoload_register(array('ComposerAutoloaderInitBlockBotAddon', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
spl_autoload_unregister(array('ComposerAutoloaderInitBlockBotAddon', 'loadClassLoader'));
$useStaticLoader = PHP_VERSION_ID >= 50600 && !defined('HHVM_VERSION') && (!function_exists('zend_loader_file_encoded') || !zend_loader_file_encoded());
if ($useStaticLoader) {
require_once __DIR__ . '/autoload_static.php';
call_user_func(\Composer\Autoload\ComposerStaticInitBlockBotAddon::getInitializer($loader));
} else {
$map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) {
$loader->set($namespace, $path);
}
$map = require __DIR__ . '/autoload_psr4.php';
foreach ($map as $namespace => $path) {
$loader->setPsr4($namespace, $path);
}
$classMap = require __DIR__ . '/autoload_classmap.php';
if ($classMap) {
$loader->addClassMap($classMap);
}
}
$loader->register(true);
return $loader;
}
}

View file

@ -0,0 +1,40 @@
<?php
// autoload_static.php @generated by Composer
namespace Composer\Autoload;
class ComposerStaticInitBlockBotAddon
{
public static $prefixLengthsPsr4 = array (
'J' =>
array (
'Jaybizzle\\CrawlerDetect\\' => 24,
),
);
public static $prefixDirsPsr4 = array (
'Jaybizzle\\CrawlerDetect\\' =>
array (
0 => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src',
),
);
public static $classMap = array (
'Jaybizzle\\CrawlerDetect\\CrawlerDetect' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/CrawlerDetect.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\AbstractProvider' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/AbstractProvider.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Crawlers' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/Crawlers.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Exclusions' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/Exclusions.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Headers' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/Headers.php',
);
public static function getInitializer(ClassLoader $loader)
{
return \Closure::bind(function () use ($loader) {
$loader->prefixLengthsPsr4 = ComposerStaticInitBlockBotAddon::$prefixLengthsPsr4;
$loader->prefixDirsPsr4 = ComposerStaticInitBlockBotAddon::$prefixDirsPsr4;
$loader->classMap = ComposerStaticInitBlockBotAddon::$classMap;
}, null, ClassLoader::class);
}
}

53
blockbot/vendor/composer/installed.json vendored Normal file
View file

@ -0,0 +1,53 @@
[
{
"name": "jaybizzle/crawler-detect",
"version": "v1.2.80",
"version_normalized": "1.2.80.0",
"source": {
"type": "git",
"url": "https://github.com/JayBizzle/Crawler-Detect.git",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/JayBizzle/Crawler-Detect/zipball/af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5",
"satooshi/php-coveralls": "1.*"
},
"time": "2019-04-05T19:52:02+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"keywords": [
"crawler",
"crawler detect",
"crawler detector",
"crawlerdetect",
"php crawler detect"
]
}
]

View file

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2015-2018 Mark Beech
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,72 @@
<p align="center"><a href="http://crawlerdetect.io/" target="_blank"><img src="https://cloud.githubusercontent.com/assets/340752/23082173/1bd1a396-f550-11e6-8aba-4d3c75edea2f.png" width="321" height="219" /></a><br><br>
<a href="http://crawlerdetect.io/" target="_blank">crawlerdetect.io</a>
<br><br>
</p>
<p align="center">
<a href="https://travis-ci.org/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/travis/JayBizzle/Crawler-Detect/master.svg?style=flat-square" /></a>
<a href="https://packagist.org/packages/jaybizzle/crawler-detect"><img src="https://img.shields.io/packagist/dm/JayBizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://scrutinizer-ci.com/g/JayBizzle/Crawler-Detect/?branch=master"><img src="https://img.shields.io/scrutinizer/g/JayBizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://github.com/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/badge/license-MIT-ff69b4.svg?style=flat-square" /></a>
<a href="https://packagist.org/packages/jaybizzle/crawler-detect"><img src="https://img.shields.io/packagist/v/jaybizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://styleci.io/repos/32755917"><img src="https://styleci.io/repos/32755917/shield" /></a>
<a href="https://coveralls.io/github/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/coveralls/JayBizzle/Crawler-Detect/master.svg?style=flat-square" /></a>
</p>
## About CrawlerDetect
CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent and http_from header. Currently able to detect 1,000's of bots/spiders/crawlers.
### Installation
Run `composer require jaybizzle/crawler-detect 1.*` or add `"jaybizzle/crawler-detect" :"1.*"` to your `composer.json`.
### Usage
```PHP
use Jaybizzle\CrawlerDetect\CrawlerDetect;
$CrawlerDetect = new CrawlerDetect;
// Check the user agent of the current 'visitor'
if($CrawlerDetect->isCrawler()) {
// true if crawler user agent detected
}
// Pass a user agent as a string
if($CrawlerDetect->isCrawler('Mozilla/5.0 (compatible; Sosospider/2.0; +http://help.soso.com/webspider.htm)')) {
// true if crawler user agent detected
}
// Output the name of the bot that matched (if any)
echo $CrawlerDetect->getMatches();
```
### Contributing
If you find a bot/spider/crawler user agent that CrawlerDetect fails to detect, please submit a pull request with the regex pattern added to the `$data` array in `Fixtures/Crawlers.php` and add the failing user agent to `tests/crawlers.txt`.
Failing that, just create an issue with the user agent you have found, and we'll take it from there :)
### Laravel Package
If you would like to use this with Laravel 4/5, please see [Laravel-Crawler-Detect](https://github.com/JayBizzle/Laravel-Crawler-Detect)
### Symfony Bundle
To use this library with Symfony 2/3/4, check out the [CrawlerDetectBundle](https://github.com/nicolasmure/CrawlerDetectBundle).
### YII2 Extension
To use this library with the YII2 framework, check out [yii2-crawler-detect](https://github.com/AlikDex/yii2-crawler-detect).
### ES6 Library
To use this library with NodeJS or any ES6 application based, check out [es6-crawler-detect](https://github.com/JefferyHus/es6-crawler-detect).
### .NET Library
To use this library in a .net standard (including .net core) based project, check out [NetCrawlerDetect](https://github.com/gplumb/NetCrawlerDetect).
### Nette Extension
To use this library with the Nette framework, checkout [NetteCrawlerDetect](https://github.com/JanGalek/Crawler-Detect).
### Ruby Gem
To use this library with Ruby on Rails or any Ruby-based application, check out [crawler_detect](https://github.com/loadkpi/crawler_detect) gem.
_Parts of this class are based on the brilliant [MobileDetect](https://github.com/serbanghita/Mobile-Detect)_
[![Analytics](https://ga-beacon.appspot.com/UA-72430465-1/Crawler-Detect/readme?pixel)](https://github.com/JayBizzle/Crawler-Detect)

View file

@ -0,0 +1,30 @@
{
"name": "jaybizzle/crawler-detect",
"type": "library",
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"keywords": ["crawler", "crawler detect", "crawler detector", "crawlerdetect", "php crawler detect"],
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"license": "MIT",
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5",
"satooshi/php-coveralls": "1.*"
},
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"scripts": {
"test": "vendor/bin/phpunit"
}
}

View file

@ -0,0 +1,41 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
require 'src/Fixtures/AbstractProvider.php';
require 'src/Fixtures/Crawlers.php';
require 'src/Fixtures/Exclusions.php';
require 'src/Fixtures/Headers.php';
$src = array(
'Crawlers',
'Exclusions',
'Headers',
);
foreach ($src as $class) {
$class = "Jaybizzle\\CrawlerDetect\\Fixtures\\$class";
$object = new $class;
outputJson($object);
outputTxt($object);
}
function outputJson($object)
{
$className = (new ReflectionClass($object))->getShortName();
file_put_contents("raw/$className.json", json_encode($object->getAll()));
}
function outputTxt($object)
{
$className = (new ReflectionClass($object))->getShortName();
file_put_contents("raw/$className.txt", implode($object->getAll(), PHP_EOL));
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1 @@
["Safari.[\\d\\.]*","Firefox.[\\d\\.]*"," Chrome.[\\d\\.]*","Chromium.[\\d\\.]*","MSIE.[\\d\\.]","Opera\\\/[\\d\\.]*","Mozilla.[\\d\\.]*","AppleWebKit.[\\d\\.]*","Trident.[\\d\\.]*","Windows NT.[\\d\\.]*","Android [\\d\\.]*","Macintosh.","Ubuntu","Linux","[ ]Intel","Mac OS X [\\d_]*","(like )?Gecko(.[\\d\\.]*)?","KHTML,","CriOS.[\\d\\.]*","CPU iPhone OS ([0-9_])* like Mac OS X","CPU OS ([0-9_])* like Mac OS X","iPod","compatible","x86_..","i686","x64","X11","rv:[\\d\\.]*","Version.[\\d\\.]*","WOW64","Win64","Dalvik.[\\d\\.]*"," \\.NET CLR [\\d\\.]*","Presto.[\\d\\.]*","Media Center PC","BlackBerry","Build","Opera Mini\\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\\/\\d{1,2}\\.","Opera"," \\.NET[\\d\\.]*","cubot","; M bot","; CRONO","; B bot","; IDbot","; ID bot","; POWER BOT",";"]

View file

@ -0,0 +1,48 @@
Safari.[\d\.]*
Firefox.[\d\.]*
Chrome.[\d\.]*
Chromium.[\d\.]*
MSIE.[\d\.]
Opera\/[\d\.]*
Mozilla.[\d\.]*
AppleWebKit.[\d\.]*
Trident.[\d\.]*
Windows NT.[\d\.]*
Android [\d\.]*
Macintosh.
Ubuntu
Linux
[ ]Intel
Mac OS X [\d_]*
(like )?Gecko(.[\d\.]*)?
KHTML,
CriOS.[\d\.]*
CPU iPhone OS ([0-9_])* like Mac OS X
CPU OS ([0-9_])* like Mac OS X
iPod
compatible
x86_..
i686
x64
X11
rv:[\d\.]*
Version.[\d\.]*
WOW64
Win64
Dalvik.[\d\.]*
\.NET CLR [\d\.]*
Presto.[\d\.]*
Media Center PC
BlackBerry
Build
Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.
Opera
\.NET[\d\.]*
cubot
; M bot
; CRONO
; B bot
; IDbot
; ID bot
; POWER BOT
;

View file

@ -0,0 +1 @@
["HTTP_USER_AGENT","HTTP_X_OPERAMINI_PHONE_UA","HTTP_X_DEVICE_USER_AGENT","HTTP_X_ORIGINAL_USER_AGENT","HTTP_X_SKYFIRE_PHONE","HTTP_X_BOLT_PHONE_UA","HTTP_DEVICE_STOCK_UA","HTTP_X_UCBROWSER_DEVICE_UA","HTTP_FROM","HTTP_X_SCANNER"]

View file

@ -0,0 +1,10 @@
HTTP_USER_AGENT
HTTP_X_OPERAMINI_PHONE_UA
HTTP_X_DEVICE_USER_AGENT
HTTP_X_ORIGINAL_USER_AGENT
HTTP_X_SKYFIRE_PHONE
HTTP_X_BOLT_PHONE_UA
HTTP_DEVICE_STOCK_UA
HTTP_X_UCBROWSER_DEVICE_UA
HTTP_FROM
HTTP_X_SCANNER

View file

@ -0,0 +1,193 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect;
use Jaybizzle\CrawlerDetect\Fixtures\Crawlers;
use Jaybizzle\CrawlerDetect\Fixtures\Exclusions;
use Jaybizzle\CrawlerDetect\Fixtures\Headers;
class CrawlerDetect
{
/**
* The user agent.
*
* @var null
*/
protected $userAgent = null;
/**
* Headers that contain a user agent.
*
* @var array
*/
protected $httpHeaders = array();
/**
* Store regex matches.
*
* @var array
*/
protected $matches = array();
/**
* Crawlers object.
*
* @var \Jaybizzle\CrawlerDetect\Fixtures\Crawlers
*/
protected $crawlers;
/**
* Exclusions object.
*
* @var \Jaybizzle\CrawlerDetect\Fixtures\Exclusions
*/
protected $exclusions;
/**
* Headers object.
*
* @var \Jaybizzle\CrawlerDetect\Fixtures\Headers
*/
protected $uaHttpHeaders;
/**
* The compiled regex string.
*
* @var string
*/
protected $compiledRegex;
/**
* The compiled exclusions regex string.
*
* @var string
*/
protected $compiledExclusions;
/**
* Class constructor.
*/
public function __construct(array $headers = null, $userAgent = null)
{
$this->crawlers = new Crawlers();
$this->exclusions = new Exclusions();
$this->uaHttpHeaders = new Headers();
$this->compiledRegex = $this->compileRegex($this->crawlers->getAll());
$this->compiledExclusions = $this->compileRegex($this->exclusions->getAll());
$this->setHttpHeaders($headers);
$this->setUserAgent($userAgent);
}
/**
* Compile the regex patterns into one regex string.
*
* @param array
*
* @return string
*/
public function compileRegex($patterns)
{
return '('.implode('|', $patterns).')';
}
/**
* Set HTTP headers.
*
* @param array|null $httpHeaders
*/
public function setHttpHeaders($httpHeaders)
{
// Use global _SERVER if $httpHeaders aren't defined.
if (! is_array($httpHeaders) || ! count($httpHeaders)) {
$httpHeaders = $_SERVER;
}
// Clear existing headers.
$this->httpHeaders = array();
// Only save HTTP headers. In PHP land, that means
// only _SERVER vars that start with HTTP_.
foreach ($httpHeaders as $key => $value) {
if (strpos($key, 'HTTP_') === 0) {
$this->httpHeaders[$key] = $value;
}
}
}
/**
* Return user agent headers.
*
* @return array
*/
public function getUaHttpHeaders()
{
return $this->uaHttpHeaders->getAll();
}
/**
* Set the user agent.
*
* @param string $userAgent
*/
public function setUserAgent($userAgent)
{
if (is_null($userAgent)) {
foreach ($this->getUaHttpHeaders() as $altHeader) {
if (isset($this->httpHeaders[$altHeader])) {
$userAgent .= $this->httpHeaders[$altHeader].' ';
}
}
}
return $this->userAgent = $userAgent;
}
/**
* Check user agent string against the regex.
*
* @param string|null $userAgent
*
* @return bool
*/
public function isCrawler($userAgent = null)
{
$agent = trim(preg_replace(
"/{$this->compiledExclusions}/i",
'',
$userAgent ?: $this->userAgent
));
if ($agent == '') {
return false;
}
$result = preg_match("/{$this->compiledRegex}/i", $agent, $matches);
if ($matches) {
$this->matches = $matches;
}
return (bool) $result;
}
/**
* Return the matches.
*
* @return string|null
*/
public function getMatches()
{
return isset($this->matches[0]) ? $this->matches[0] : null;
}
}

View file

@ -0,0 +1,32 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
abstract class AbstractProvider
{
/**
* The data set.
*
* @var array
*/
protected $data;
/**
* Return the data set.
*
* @return array
*/
public function getAll()
{
return $this->data;
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,72 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
class Exclusions extends AbstractProvider
{
/**
* List of strings to remove from the user agent before running the crawler regex
* Over a large list of user agents, this gives us about a 55% speed increase!
*
* @var array
*/
protected $data = array(
'Safari.[\d\.]*',
'Firefox.[\d\.]*',
' Chrome.[\d\.]*',
'Chromium.[\d\.]*',
'MSIE.[\d\.]',
'Opera\/[\d\.]*',
'Mozilla.[\d\.]*',
'AppleWebKit.[\d\.]*',
'Trident.[\d\.]*',
'Windows NT.[\d\.]*',
'Android [\d\.]*',
'Macintosh.',
'Ubuntu',
'Linux',
'[ ]Intel',
'Mac OS X [\d_]*',
'(like )?Gecko(.[\d\.]*)?',
'KHTML,',
'CriOS.[\d\.]*',
'CPU iPhone OS ([0-9_])* like Mac OS X',
'CPU OS ([0-9_])* like Mac OS X',
'iPod',
'compatible',
'x86_..',
'i686',
'x64',
'X11',
'rv:[\d\.]*',
'Version.[\d\.]*',
'WOW64',
'Win64',
'Dalvik.[\d\.]*',
' \.NET CLR [\d\.]*',
'Presto.[\d\.]*',
'Media Center PC',
'BlackBerry',
'Build',
'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
'Opera',
' \.NET[\d\.]*',
'cubot',
'; M bot',
'; CRONO',
'; B bot',
'; IDbot',
'; ID bot',
'; POWER BOT',
';', // Remove the following characters ;
);
}

View file

@ -0,0 +1,37 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
class Headers extends AbstractProvider
{
/**
* All possible HTTP headers that represent the user agent string.
*
* @var array
*/
protected $data = array(
// The default User-Agent string.
'HTTP_USER_AGENT',
// Header can occur on devices using Opera Mini.
'HTTP_X_OPERAMINI_PHONE_UA',
// Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
'HTTP_X_DEVICE_USER_AGENT',
'HTTP_X_ORIGINAL_USER_AGENT',
'HTTP_X_SKYFIRE_PHONE',
'HTTP_X_BOLT_PHONE_UA',
'HTTP_DEVICE_STOCK_UA',
'HTTP_X_UCBROWSER_DEVICE_UA',
// Sometimes, bots (especially Google) use a genuine user agent, but fill this header in with their email address
'HTTP_FROM',
'HTTP_X_SCANNER', // Seen in use by Netsparker
);
}