Merge pull request #846 from nupplaphil/features/6948-bot_detection

New Addon Bot detection
This commit is contained in:
Hypolite Petovan 2019-04-22 07:49:18 -04:00 committed by GitHub
commit cf741fe3e9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
27 changed files with 3793 additions and 0 deletions

32
blockbot/blockbot.php Normal file
View file

@ -0,0 +1,32 @@
<?php
/**
* Name: blockbot
* Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
* Version: 0.1
* Author: Philipp Holzer <admin@philipp.info>
*
*/
use Friendica\App;
use Friendica\Core\Hook;
use Friendica\Core\System;
use Jaybizzle\CrawlerDetect\CrawlerDetect;
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
/**
 * Addon install callback: subscribes blockbot_init_1() to the 'init_1' hook.
 */
function blockbot_install()
{
    $hookName = 'init_1';
    $callback = 'blockbot_init_1';

    Hook::register($hookName, __FILE__, $callback);
}
/**
 * Addon uninstall callback: removes the 'init_1' subscription that
 * blockbot_install() registered for blockbot_init_1().
 */
function blockbot_uninstall()
{
    $hookName = 'init_1';
    $callback = 'blockbot_init_1';

    Hook::unregister($hookName, __FILE__, $callback);
}
/**
 * 'init_1' hook handler: answers with HTTP 403 when CrawlerDetect classifies
 * the current request as coming from a bot/crawler/spider.
 *
 * @param App $a Application instance passed by the hook (not used here)
 */
function blockbot_init_1(App $a)
{
    $detector = new CrawlerDetect();

    // Regular visitors pass through untouched.
    if (!$detector->isCrawler()) {
        return;
    }

    System::httpExit(403, 'Bots are not allowed');
}

24
blockbot/composer.json Normal file
View file

@ -0,0 +1,24 @@
{
"name": "friendica-addons/blockbot",
"description": "Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.",
"type": "friendica-addon",
"authors": [
{
"name": "Philipp Holzer",
"email": "admin@philipp.info",
"homepage": "https://friendica.philipp.info/profile/nupplaphil",
"role": "Developer"
}
],
"require": {
"php": ">=5.6.0",
"jaybizzle/crawler-detect": "1.*"
},
"license": "BSD-3-Clause",
"minimum-stability": "stable",
"config": {
"optimize-autoloader": true,
"autoloader-suffix": "BlockBotAddon",
"preferred-install": "dist"
}
}

69
blockbot/composer.lock generated Normal file
View file

@ -0,0 +1,69 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "814fd867d00e99f84d12304e8e244aae",
"packages": [
{
"name": "jaybizzle/crawler-detect",
"version": "v1.2.80",
"source": {
"type": "git",
"url": "https://github.com/JayBizzle/Crawler-Detect.git",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/JayBizzle/Crawler-Detect/zipball/af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5",
"satooshi/php-coveralls": "1.*"
},
"type": "library",
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"keywords": [
"crawler",
"crawler detect",
"crawler detector",
"crawlerdetect",
"php crawler detect"
],
"time": "2019-04-05T19:52:02+00:00"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": {
"php": ">=5.6.0"
},
"platform-dev": []
}

7
blockbot/vendor/autoload.php vendored Normal file
View file

@ -0,0 +1,7 @@
<?php
// autoload.php @generated by Composer
// Load the generated bootstrap class for this addon's vendor directory.
require_once __DIR__ . '/composer/autoload_real.php';
// Build the class loader (or reuse the one already built) and return it to the including file.
return ComposerAutoloaderInitBlockBotAddon::getLoader();

445
blockbot/vendor/composer/ClassLoader.php vendored Normal file
View file

@ -0,0 +1,445 @@
<?php
/*
* This file is part of Composer.
*
* (c) Nils Adermann <naderman@naderman.de>
* Jordi Boggiano <j.boggiano@seld.be>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Composer\Autoload;
/**
* ClassLoader implements a PSR-0, PSR-4 and classmap class loader.
*
* $loader = new \Composer\Autoload\ClassLoader();
*
* // register classes with namespaces
* $loader->add('Symfony\Component', __DIR__.'/component');
* $loader->add('Symfony', __DIR__.'/framework');
*
* // activate the autoloader
* $loader->register();
*
* // to enable searching the include path (eg. for PEAR packages)
* $loader->setUseIncludePath(true);
*
* In this example, if you try to use a class in the Symfony\Component
* namespace or one of its children (Symfony\Component\Console for instance),
* the autoloader will first look for the class under the component/
* directory, and it will then fallback to the framework/ directory if not
* found before giving up.
*
* This class is loosely based on the Symfony UniversalClassLoader.
*
* @author Fabien Potencier <fabien@symfony.com>
* @author Jordi Boggiano <j.boggiano@seld.be>
* @see http://www.php-fig.org/psr/psr-0/
* @see http://www.php-fig.org/psr/psr-4/
*/
class ClassLoader
{
    // PSR-4
    /** @var array PSR-4 prefix lengths, indexed by the prefix's first character, then by prefix */
    private $prefixLengthsPsr4 = array();
    /** @var array PSR-4 base directories, indexed by prefix */
    private $prefixDirsPsr4 = array();
    /** @var array PSR-4 directories searched when no prefix matches */
    private $fallbackDirsPsr4 = array();

    // PSR-0
    /** @var array PSR-0 directories, indexed by the prefix's first character, then by prefix */
    private $prefixesPsr0 = array();
    /** @var array PSR-0 directories searched when no prefix matches */
    private $fallbackDirsPsr0 = array();

    /** @var bool Whether the PHP include path is searched as a last resort */
    private $useIncludePath = false;
    /** @var array Class name => file path */
    private $classMap = array();
    /** @var bool Whether a class-map miss ends the lookup immediately */
    private $classMapAuthoritative = false;
    /** @var array Class names already looked up and not found (name => true) */
    private $missingClasses = array();
    /** @var string|null APCu key prefix, or null when APCu caching is not in use */
    private $apcuPrefix;

    /**
     * Returns all registered PSR-0 prefixes as one flat map.
     *
     * @return array Prefix => list of directories
     */
    public function getPrefixes()
    {
        if (!empty($this->prefixesPsr0)) {
            // $prefixesPsr0 is keyed by first character. Those string keys must
            // be dropped before unpacking: PHP 8+ would otherwise pass them to
            // array_merge() as named arguments and throw.
            return call_user_func_array('array_merge', array_values($this->prefixesPsr0));
        }

        return array();
    }

    /**
     * @return array PSR-4 prefix => list of base directories
     */
    public function getPrefixesPsr4()
    {
        return $this->prefixDirsPsr4;
    }

    /**
     * @return array PSR-0 fallback directories
     */
    public function getFallbackDirs()
    {
        return $this->fallbackDirsPsr0;
    }

    /**
     * @return array PSR-4 fallback directories
     */
    public function getFallbackDirsPsr4()
    {
        return $this->fallbackDirsPsr4;
    }

    /**
     * @return array Class name => file path
     */
    public function getClassMap()
    {
        return $this->classMap;
    }

    /**
     * @param array $classMap Class to filename map
     */
    public function addClassMap(array $classMap)
    {
        if ($this->classMap) {
            $this->classMap = array_merge($this->classMap, $classMap);
        } else {
            $this->classMap = $classMap;
        }
    }

    /**
     * Registers a set of PSR-0 directories for a given prefix, either
     * appending or prepending to the ones previously set for this prefix.
     *
     * @param string       $prefix  The prefix
     * @param array|string $paths   The PSR-0 root directories
     * @param bool         $prepend Whether to prepend the directories
     */
    public function add($prefix, $paths, $prepend = false)
    {
        if (!$prefix) {
            // An empty prefix targets the fallback directory list.
            if ($prepend) {
                $this->fallbackDirsPsr0 = array_merge(
                    (array) $paths,
                    $this->fallbackDirsPsr0
                );
            } else {
                $this->fallbackDirsPsr0 = array_merge(
                    $this->fallbackDirsPsr0,
                    (array) $paths
                );
            }

            return;
        }

        $first = $prefix[0];
        if (!isset($this->prefixesPsr0[$first][$prefix])) {
            $this->prefixesPsr0[$first][$prefix] = (array) $paths;

            return;
        }
        if ($prepend) {
            $this->prefixesPsr0[$first][$prefix] = array_merge(
                (array) $paths,
                $this->prefixesPsr0[$first][$prefix]
            );
        } else {
            $this->prefixesPsr0[$first][$prefix] = array_merge(
                $this->prefixesPsr0[$first][$prefix],
                (array) $paths
            );
        }
    }

    /**
     * Registers a set of PSR-4 directories for a given namespace, either
     * appending or prepending to the ones previously set for this namespace.
     *
     * @param string       $prefix  The prefix/namespace, with trailing '\\'
     * @param array|string $paths   The PSR-4 base directories
     * @param bool         $prepend Whether to prepend the directories
     *
     * @throws \InvalidArgumentException
     */
    public function addPsr4($prefix, $paths, $prepend = false)
    {
        if (!$prefix) {
            // Register directories for the root namespace.
            if ($prepend) {
                $this->fallbackDirsPsr4 = array_merge(
                    (array) $paths,
                    $this->fallbackDirsPsr4
                );
            } else {
                $this->fallbackDirsPsr4 = array_merge(
                    $this->fallbackDirsPsr4,
                    (array) $paths
                );
            }
        } elseif (!isset($this->prefixDirsPsr4[$prefix])) {
            // Register directories for a new namespace.
            $length = strlen($prefix);
            if ('\\' !== $prefix[$length - 1]) {
                throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
            }
            $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
            $this->prefixDirsPsr4[$prefix] = (array) $paths;
        } elseif ($prepend) {
            // Prepend directories for an already registered namespace.
            $this->prefixDirsPsr4[$prefix] = array_merge(
                (array) $paths,
                $this->prefixDirsPsr4[$prefix]
            );
        } else {
            // Append directories for an already registered namespace.
            $this->prefixDirsPsr4[$prefix] = array_merge(
                $this->prefixDirsPsr4[$prefix],
                (array) $paths
            );
        }
    }

    /**
     * Registers a set of PSR-0 directories for a given prefix,
     * replacing any others previously set for this prefix.
     *
     * @param string       $prefix The prefix
     * @param array|string $paths  The PSR-0 base directories
     */
    public function set($prefix, $paths)
    {
        if (!$prefix) {
            $this->fallbackDirsPsr0 = (array) $paths;
        } else {
            $this->prefixesPsr0[$prefix[0]][$prefix] = (array) $paths;
        }
    }

    /**
     * Registers a set of PSR-4 directories for a given namespace,
     * replacing any others previously set for this namespace.
     *
     * @param string       $prefix The prefix/namespace, with trailing '\\'
     * @param array|string $paths  The PSR-4 base directories
     *
     * @throws \InvalidArgumentException
     */
    public function setPsr4($prefix, $paths)
    {
        if (!$prefix) {
            $this->fallbackDirsPsr4 = (array) $paths;
        } else {
            $length = strlen($prefix);
            if ('\\' !== $prefix[$length - 1]) {
                throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
            }
            $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
            $this->prefixDirsPsr4[$prefix] = (array) $paths;
        }
    }

    /**
     * Turns on searching the include path for class files.
     *
     * @param bool $useIncludePath
     */
    public function setUseIncludePath($useIncludePath)
    {
        $this->useIncludePath = $useIncludePath;
    }

    /**
     * Can be used to check if the autoloader uses the include path to check
     * for classes.
     *
     * @return bool
     */
    public function getUseIncludePath()
    {
        return $this->useIncludePath;
    }

    /**
     * Turns off searching the prefix and fallback directories for classes
     * that have not been registered with the class map.
     *
     * @param bool $classMapAuthoritative
     */
    public function setClassMapAuthoritative($classMapAuthoritative)
    {
        $this->classMapAuthoritative = $classMapAuthoritative;
    }

    /**
     * Should class lookup fail if not found in the current class map?
     *
     * @return bool
     */
    public function isClassMapAuthoritative()
    {
        return $this->classMapAuthoritative;
    }

    /**
     * APCu prefix to use to cache found/not-found classes, if the extension is enabled.
     *
     * @param string|null $apcuPrefix
     */
    public function setApcuPrefix($apcuPrefix)
    {
        // ini_get() returns the raw string, so a value such as "Off" would be
        // truthy; parse it as a boolean instead of relying on truthiness.
        $apcuEnabled = function_exists('apcu_fetch')
            && filter_var(ini_get('apc.enabled'), FILTER_VALIDATE_BOOLEAN);

        $this->apcuPrefix = $apcuEnabled ? $apcuPrefix : null;
    }

    /**
     * The APCu prefix in use, or null if APCu caching is not enabled.
     *
     * @return string|null
     */
    public function getApcuPrefix()
    {
        return $this->apcuPrefix;
    }

    /**
     * Registers this instance as an autoloader.
     *
     * @param bool $prepend Whether to prepend the autoloader or not
     */
    public function register($prepend = false)
    {
        spl_autoload_register(array($this, 'loadClass'), true, $prepend);
    }

    /**
     * Unregisters this instance as an autoloader.
     */
    public function unregister()
    {
        spl_autoload_unregister(array($this, 'loadClass'));
    }

    /**
     * Loads the given class or interface.
     *
     * @param  string    $class The name of the class
     * @return bool|null True if loaded, null otherwise
     */
    public function loadClass($class)
    {
        if ($file = $this->findFile($class)) {
            includeFile($file);

            return true;
        }
    }

    /**
     * Finds the path to the file where the class is defined.
     *
     * @param string $class The name of the class
     *
     * @return string|false The path if found, false otherwise
     */
    public function findFile($class)
    {
        // class map lookup
        if (isset($this->classMap[$class])) {
            return $this->classMap[$class];
        }
        if ($this->classMapAuthoritative || isset($this->missingClasses[$class])) {
            return false;
        }
        if (null !== $this->apcuPrefix) {
            $file = apcu_fetch($this->apcuPrefix.$class, $hit);
            if ($hit) {
                return $file;
            }
        }

        $file = $this->findFileWithExtension($class, '.php');

        // Search for Hack files if we are running on HHVM
        if (false === $file && defined('HHVM_VERSION')) {
            $file = $this->findFileWithExtension($class, '.hh');
        }

        if (null !== $this->apcuPrefix) {
            // Both hits and misses (false) are stored.
            apcu_add($this->apcuPrefix.$class, $file);
        }

        if (false === $file) {
            // Remember that this class does not exist.
            $this->missingClasses[$class] = true;
        }

        return $file;
    }

    /**
     * Resolves a class name to a file with the given extension, trying in
     * order: PSR-4 prefixes, PSR-4 fallback directories, PSR-0 prefixes,
     * PSR-0 fallback directories and, if enabled, the include path.
     *
     * @param string $class The name of the class
     * @param string $ext   File extension including the leading dot
     *
     * @return string|false The path if found, false otherwise
     */
    private function findFileWithExtension($class, $ext)
    {
        // PSR-4 lookup
        $logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext;

        $first = $class[0];
        if (isset($this->prefixLengthsPsr4[$first])) {
            // Strip one namespace segment per iteration, so the longest
            // registered prefix is tried first.
            $subPath = $class;
            while (false !== $lastPos = strrpos($subPath, '\\')) {
                $subPath = substr($subPath, 0, $lastPos);
                $search = $subPath . '\\';
                if (isset($this->prefixDirsPsr4[$search])) {
                    $pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1);
                    foreach ($this->prefixDirsPsr4[$search] as $dir) {
                        if (file_exists($file = $dir . $pathEnd)) {
                            return $file;
                        }
                    }
                }
            }
        }

        // PSR-4 fallback dirs
        foreach ($this->fallbackDirsPsr4 as $dir) {
            if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr4)) {
                return $file;
            }
        }

        // PSR-0 lookup
        if (false !== $pos = strrpos($class, '\\')) {
            // namespaced class name
            $logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1)
                . strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR);
        } else {
            // PEAR-like class name
            $logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext;
        }

        if (isset($this->prefixesPsr0[$first])) {
            foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) {
                if (0 === strpos($class, $prefix)) {
                    foreach ($dirs as $dir) {
                        if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
                            return $file;
                        }
                    }
                }
            }
        }

        // PSR-0 fallback dirs
        foreach ($this->fallbackDirsPsr0 as $dir) {
            if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
                return $file;
            }
        }

        // PSR-0 include paths.
        if ($this->useIncludePath && $file = stream_resolve_include_path($logicalPathPsr0)) {
            return $file;
        }

        return false;
    }
}
/**
* Scope isolated include.
*
* Prevents access to $this/self from included files.
*
* Declared as a plain function rather than a ClassLoader method, so the
* included file runs in this function's scope, where $file is the only
* local variable defined.
*
* @param string $file Path of the file to include
*/
function includeFile($file)
{
include $file;
}

21
blockbot/vendor/composer/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
Copyright (c) Nils Adermann, Jordi Boggiano
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -0,0 +1,14 @@
<?php
// autoload_classmap.php @generated by Composer
// The vendor directory: two levels above this file.
$vendorDir = dirname(dirname(__FILE__));
// Parent of the vendor directory; not referenced by any entry below.
$baseDir = dirname($vendorDir);
// Fully-qualified class name => file that defines it.
return array(
'Jaybizzle\\CrawlerDetect\\CrawlerDetect' => $vendorDir . '/jaybizzle/crawler-detect/src/CrawlerDetect.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\AbstractProvider' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/AbstractProvider.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Crawlers' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/Crawlers.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Exclusions' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/Exclusions.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Headers' => $vendorDir . '/jaybizzle/crawler-detect/src/Fixtures/Headers.php',
);

View file

@ -0,0 +1,9 @@
<?php
// autoload_namespaces.php @generated by Composer
// The vendor directory: two levels above this file.
$vendorDir = dirname(dirname(__FILE__));
// Parent of the vendor directory; unused here.
$baseDir = dirname($vendorDir);
// Empty map: this file registers no namespaces.
return array(
);

View file

@ -0,0 +1,10 @@
<?php
// autoload_psr4.php @generated by Composer
// The vendor directory: two levels above this file.
$vendorDir = dirname(dirname(__FILE__));
// Parent of the vendor directory; not referenced by any entry below.
$baseDir = dirname($vendorDir);
// PSR-4 namespace prefix => list of base directories.
return array(
'Jaybizzle\\CrawlerDetect\\' => array($vendorDir . '/jaybizzle/crawler-detect/src'),
);

View file

@ -0,0 +1,52 @@
<?php
// autoload_real.php @generated by Composer
/**
* Builds, configures and registers the Composer class loader for this vendor
* directory, keeping the instance so later calls return the same loader.
*/
class ComposerAutoloaderInitBlockBotAddon
{
// Loader created by the first getLoader() call; null until then.
private static $loader;
/**
* Temporary autoload callback that knows how to load exactly one class,
* Composer\Autoload\ClassLoader, from the file next to this one.
*
* @param string $class Name of the class being autoloaded
*/
public static function loadClassLoader($class)
{
if ('Composer\Autoload\ClassLoader' === $class) {
require __DIR__ . '/ClassLoader.php';
}
}
/**
* Returns the class loader, creating and registering it on first use.
*
* @return \Composer\Autoload\ClassLoader
*/
public static function getLoader()
{
if (null !== self::$loader) {
return self::$loader;
}
// loadClassLoader() is registered only around the instantiation below, so that
// `new ClassLoader()` can be resolved, and is removed again right afterwards.
spl_autoload_register(array('ComposerAutoloaderInitBlockBotAddon', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
spl_autoload_unregister(array('ComposerAutoloaderInitBlockBotAddon', 'loadClassLoader'));
// The static initializer is used only on PHP >= 5.6, outside HHVM, and when this file is not Zend-encoded.
$useStaticLoader = PHP_VERSION_ID >= 50600 && !defined('HHVM_VERSION') && (!function_exists('zend_loader_file_encoded') || !zend_loader_file_encoded());
if ($useStaticLoader) {
require_once __DIR__ . '/autoload_static.php';
call_user_func(\Composer\Autoload\ComposerStaticInitBlockBotAddon::getInitializer($loader));
} else {
// Otherwise feed the loader from the generated map files one entry at a time.
$map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) {
$loader->set($namespace, $path);
}
$map = require __DIR__ . '/autoload_psr4.php';
foreach ($map as $namespace => $path) {
$loader->setPsr4($namespace, $path);
}
$classMap = require __DIR__ . '/autoload_classmap.php';
if ($classMap) {
$loader->addClassMap($classMap);
}
}
// true => prepend: this loader goes to the front of the autoload queue.
$loader->register(true);
return $loader;
}
}

View file

@ -0,0 +1,40 @@
<?php
// autoload_static.php @generated by Composer
namespace Composer\Autoload;
/**
* Holds the autoload tables as static arrays and provides a closure that
* copies them into a ClassLoader instance.
*/
class ComposerStaticInitBlockBotAddon
{
// PSR-4 prefix lengths, grouped by the prefix's first character.
public static $prefixLengthsPsr4 = array (
'J' =>
array (
'Jaybizzle\\CrawlerDetect\\' => 24,
),
);
// PSR-4 prefix => list of base directories.
public static $prefixDirsPsr4 = array (
'Jaybizzle\\CrawlerDetect\\' =>
array (
0 => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src',
),
);
// Fully-qualified class name => file that defines it.
public static $classMap = array (
'Jaybizzle\\CrawlerDetect\\CrawlerDetect' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/CrawlerDetect.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\AbstractProvider' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/AbstractProvider.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Crawlers' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/Crawlers.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Exclusions' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/Exclusions.php',
'Jaybizzle\\CrawlerDetect\\Fixtures\\Headers' => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src/Fixtures/Headers.php',
);
/**
* Returns a closure that, when called, assigns the tables above to the
* given loader's properties of the same names.
*
* @param ClassLoader $loader Loader to populate
* @return \Closure
*/
public static function getInitializer(ClassLoader $loader)
{
// The closure is bound to ClassLoader's class scope so it can assign the loader's properties directly.
return \Closure::bind(function () use ($loader) {
$loader->prefixLengthsPsr4 = ComposerStaticInitBlockBotAddon::$prefixLengthsPsr4;
$loader->prefixDirsPsr4 = ComposerStaticInitBlockBotAddon::$prefixDirsPsr4;
$loader->classMap = ComposerStaticInitBlockBotAddon::$classMap;
}, null, ClassLoader::class);
}
}

53
blockbot/vendor/composer/installed.json vendored Normal file
View file

@ -0,0 +1,53 @@
[
{
"name": "jaybizzle/crawler-detect",
"version": "v1.2.80",
"version_normalized": "1.2.80.0",
"source": {
"type": "git",
"url": "https://github.com/JayBizzle/Crawler-Detect.git",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/JayBizzle/Crawler-Detect/zipball/af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"reference": "af6a36e6d69670df3f0a3ed8e21d4b8cc67a7847",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5",
"satooshi/php-coveralls": "1.*"
},
"time": "2019-04-05T19:52:02+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"keywords": [
"crawler",
"crawler detect",
"crawler detector",
"crawlerdetect",
"php crawler detect"
]
}
]

View file

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2015-2018 Mark Beech
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,72 @@
<p align="center"><a href="http://crawlerdetect.io/" target="_blank"><img src="https://cloud.githubusercontent.com/assets/340752/23082173/1bd1a396-f550-11e6-8aba-4d3c75edea2f.png" width="321" height="219" /></a><br><br>
<a href="http://crawlerdetect.io/" target="_blank">crawlerdetect.io</a>
<br><br>
</p>
<p align="center">
<a href="https://travis-ci.org/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/travis/JayBizzle/Crawler-Detect/master.svg?style=flat-square" /></a>
<a href="https://packagist.org/packages/jaybizzle/crawler-detect"><img src="https://img.shields.io/packagist/dm/JayBizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://scrutinizer-ci.com/g/JayBizzle/Crawler-Detect/?branch=master"><img src="https://img.shields.io/scrutinizer/g/JayBizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://github.com/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/badge/license-MIT-ff69b4.svg?style=flat-square" /></a>
<a href="https://packagist.org/packages/jaybizzle/crawler-detect"><img src="https://img.shields.io/packagist/v/jaybizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://styleci.io/repos/32755917"><img src="https://styleci.io/repos/32755917/shield" /></a>
<a href="https://coveralls.io/github/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/coveralls/JayBizzle/Crawler-Detect/master.svg?style=flat-square" /></a>
</p>
## About CrawlerDetect
CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent and http_from header. It can currently detect thousands of bots/spiders/crawlers.
### Installation
Run `composer require jaybizzle/crawler-detect 1.*` or add `"jaybizzle/crawler-detect" :"1.*"` to your `composer.json`.
### Usage
```PHP
use Jaybizzle\CrawlerDetect\CrawlerDetect;
$CrawlerDetect = new CrawlerDetect;
// Check the user agent of the current 'visitor'
if($CrawlerDetect->isCrawler()) {
// true if crawler user agent detected
}
// Pass a user agent as a string
if($CrawlerDetect->isCrawler('Mozilla/5.0 (compatible; Sosospider/2.0; +http://help.soso.com/webspider.htm)')) {
// true if crawler user agent detected
}
// Output the name of the bot that matched (if any)
echo $CrawlerDetect->getMatches();
```
### Contributing
If you find a bot/spider/crawler user agent that CrawlerDetect fails to detect, please submit a pull request with the regex pattern added to the `$data` array in `Fixtures/Crawlers.php` and add the failing user agent to `tests/crawlers.txt`.
Failing that, just create an issue with the user agent you have found, and we'll take it from there :)
### Laravel Package
If you would like to use this with Laravel 4/5, please see [Laravel-Crawler-Detect](https://github.com/JayBizzle/Laravel-Crawler-Detect)
### Symfony Bundle
To use this library with Symfony 2/3/4, check out the [CrawlerDetectBundle](https://github.com/nicolasmure/CrawlerDetectBundle).
### YII2 Extension
To use this library with the YII2 framework, check out [yii2-crawler-detect](https://github.com/AlikDex/yii2-crawler-detect).
### ES6 Library
To use this library with NodeJS or any ES6-based application, check out [es6-crawler-detect](https://github.com/JefferyHus/es6-crawler-detect).
### .NET Library
To use this library in a .net standard (including .net core) based project, check out [NetCrawlerDetect](https://github.com/gplumb/NetCrawlerDetect).
### Nette Extension
To use this library with the Nette framework, check out [NetteCrawlerDetect](https://github.com/JanGalek/Crawler-Detect).
### Ruby Gem
To use this library with Ruby on Rails or any Ruby-based application, check out [crawler_detect](https://github.com/loadkpi/crawler_detect) gem.
_Parts of this class are based on the brilliant [MobileDetect](https://github.com/serbanghita/Mobile-Detect)_
[![Analytics](https://ga-beacon.appspot.com/UA-72430465-1/Crawler-Detect/readme?pixel)](https://github.com/JayBizzle/Crawler-Detect)

View file

@ -0,0 +1,30 @@
{
"name": "jaybizzle/crawler-detect",
"type": "library",
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"keywords": ["crawler", "crawler detect", "crawler detector", "crawlerdetect", "php crawler detect"],
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"license": "MIT",
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5",
"satooshi/php-coveralls": "1.*"
},
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"scripts": {
"test": "vendor/bin/phpunit"
}
}

View file

@ -0,0 +1,41 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
require 'src/Fixtures/AbstractProvider.php';
require 'src/Fixtures/Crawlers.php';
require 'src/Fixtures/Exclusions.php';
require 'src/Fixtures/Headers.php';
// Short names of the fixture provider classes to export.
$src = array('Crawlers', 'Exclusions', 'Headers');

// Instantiate each provider and dump its data in both output formats.
foreach ($src as $class) {
    $class = 'Jaybizzle\\CrawlerDetect\\Fixtures\\' . $class;
    $object = new $class();

    outputJson($object);
    outputTxt($object);
}
/**
 * Writes the provider's data, JSON-encoded, to raw/<ShortClassName>.json
 * (path relative to the current working directory).
 *
 * @param object $object Provider exposing getAll()
 */
function outputJson($object)
{
    $reflection = new ReflectionClass($object);
    $className = $reflection->getShortName();
    $payload = json_encode($object->getAll());

    file_put_contents("raw/$className.json", $payload);
}
/**
 * Writes the provider's data, one entry per line, to raw/<ShortClassName>.txt
 * (path relative to the current working directory).
 *
 * @param object $object Provider exposing getAll(), which must return an array
 */
function outputTxt($object)
{
    $className = (new ReflectionClass($object))->getShortName();

    // Separator first: the legacy implode(array, string) argument order was
    // deprecated in PHP 7.4 and is a TypeError on PHP 8.
    file_put_contents("raw/$className.txt", implode(PHP_EOL, $object->getAll()));
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,1217 @@
.*Java.*outbrain
YLT
^b0t$
^bluefish
^Calypso v\/
^COMODO DCV
^DangDang
^DavClnt
^FDM
^git\/
^Goose\/
^Grabber
^HTTPClient\/
^Java\/
^Jeode\/
^Jetty\/
^Mail\/
^Mget
^Microsoft URL Control
^NG\/[0-9\.]
^NING\/
^PHP\/[0-9]
^RMA\/
^Ruby|Ruby\/[0-9]
^VSE\/[0-9]
^WordPress\.com
^XRL\/[0-9]
^ZmEu
008\/
13TABS
192\.comAgent
2ip\.ru
404enemy
7Siters
80legs
a\.pr-cy\.ru
a3logics\.in
A6-Indexer
Abonti
Aboundex
aboutthedomain
Accoona-AI-Agent
acoon
acrylicapps\.com\/pulp
Acunetix
AdAuth\/
adbeat
AddThis
ADmantX
AdminLabs
adressendeutschland
adscanner
Adstxtaggregator
agentslug
AHC
aihit
aiohttp\/
Airmail
akka-http\/
akula\/
alertra
alexa site audit
Alibaba\.Security\.Heimdall
Alligator
allloadin
AllSubmitter
alyze\.info
amagit
Anarchie
AndroidDownloadManager
Anemone
AngleSharp
annotate_google
Ant\.com
Anturis Agent
AnyEvent-HTTP\/
Apache Droid
Apache OpenOffice
Apache-HttpAsyncClient
Apache-HttpClient
ApacheBench
Apexoo
APIs-Google
AportWorm\/
AppBeat\/
AppEngine-Google
AppStoreScraperZ
Aprc\/[0-9]
Arachmo
arachnode
Arachnophilia
aria2
Arukereso
asafaweb
AskQuickly
Ask Jeeves
ASPSeek
Asterias
Astute
asynchttp
Attach
autocite
Autonomy
axios\/
B-l-i-t-z-B-O-T
Backlink-Ceck
backlink-check
BacklinkHttpStatus
BackStreet
BackWeb
Bad-Neighborhood
Badass
baidu\.com
Bandit
basicstate
BatchFTP
Battleztar Bazinga
baypup\/
BazQux
BBBike
BCKLINKS
BDFetch
BegunAdvertising
Bidtellect
BigBozz
Bigfoot
biglotron
BingLocalSearch
BingPreview
binlar
biNu image cacher
Bitacle
biz_Directory
Black Hole
Blackboard Safeassign
BlackWidow
BlockNote\.Net
Bloglines
Bloglovin
BlogPulseLive
BlogSearch
Blogtrottr
BlowFish
boitho\.com-dc
BPImageWalker
Braintree-Webhooks
Branch Metrics API
Branch-Passthrough
Brandprotect
BrandVerity
Brandwatch
Brodie\/
Browsershots
BUbiNG
Buck\/
Buddy
BuiltWith
Bullseye
BunnySlippers
Burf Search
Butterfly\/
BuzzSumo
CAAM\/[0-9]
CakePHP
Calculon
Canary%20Mail
CaretNail
catexplorador
CC Metadata Scaper
Cegbfeieh
censys
Cerberian Drtrs
CERT\.at-Statistics-Survey
cg-eye
changedetection
ChangesMeter
Charlotte
CheckHost
checkprivacy
CherryPicker
ChinaClaw
Chirp\/
chkme\.com
Chlooe
Chromaxa
CirrusExplorer
CISPA Vulnerability Notification
Citoid
CJNetworkQuality
Clarsentia
clips\.ua\.ac\.be
Cloud mapping
CloudEndure
CloudFlare-AlwaysOnline
Cloudinary
cmcm\.com
coccoc
cognitiveseo
colly -
CommaFeed
Commons-HttpClient
commonscan
contactbigdatafr
contentkingapp
convera
CookieReports
copyright sheriff
CopyRightCheck
Copyscape
Cosmos4j\.feedback
Covario-IDS
Crescent
Crowsnest
Criteo
CSHttp
curb
Curious George
curl
cuwhois\/
cybo\.com
DAP\/NetHTTP
DareBoost
DatabaseDriverMysqli
DataCha0s
Datafeedwatch
Datanyze
DataparkSearch
dataprovider
DataXu
Daum(oa)?[ \/][0-9]
Demon
DeuSu
developers\.google\.com\/\+\/web\/snippet\/
Devil
Digg
Digincore
DigitalPebble
Dirbuster
Discourse Forum Onebox
Disqus\/
Dispatch\/
DittoSpyder
dlvr
DMBrowser
DNSPod-reporting
docoloc
Dolphin http client
DomainAppender
Donuts Content Explorer
dotMailer content retrieval
dotSemantic
downforeveryoneorjustme
Download Wonder
downnotifier
DowntimeDetector
Drip
drupact
Drupal \(\+http:\/\/drupal\.org\/\)
DTS Agent
dubaiindex
EARTHCOM
Easy-Thumb
EasyDL
Ebingbong
ec2linkfinder
eCairn-Grabber
eCatch
ECCP
eContext\/
Ecxi
EirGrabber
ElectricMonk
elefent
EMail Exractor
EMail Wolf
EmailWolf
Embarcadero
Embed PHP Library
Embedly
endo\/
europarchive\.org
evc-batch
EventMachine HttpClient
Everwall Link Expander
Evidon
Evrinid
ExactSearch
ExaleadCloudview
Excel\/
exif
Exploratodo
Express WebPictures
Extreme Picture Finder
EyeNetIE
ezooms
facebookexternalhit
facebookplatform
fairshare
Faraday v
fasthttp
Faveeo
Favicon downloader
faviconkit
faviconarchive
FavOrg
Feed Wrangler
Feedable\/
Feedbin
FeedBooster
FeedBucket
FeedBunch\/
FeedBurner
feeder
Feedly
FeedshowOnline
Feedspot
Feedwind\/
FeedZcollector
feeltiptop
Fetch API
Fetch\/[0-9]
Fever\/[0-9]
FHscan
Fimap
findlink
findthatfile
FlashGet
FlipboardBrowserProxy
FlipboardProxy
FlipboardRSS
Flock\/
fluffy
Flunky
flynxapp
forensiq
FoundSeoTool
http:\/\/www.neomo.de\/
free thumbnails
Freeuploader
Funnelback
G-i-g-a-b-o-t
g00g1e\.net
ganarvisitas
geek-tools
Genieo
GentleSource
GetCode
Getintent
GetLinkInfo
getprismatic
GetRight
getroot
GetURLInfo\/
GetWeb
Ghost Inspector
GigablastOpenSource
GIS-LABS
github-camo
github\.com
Go [\d\.]* package http
Go http package
Go-Ahead-Got-It
Go-http-client
Go!Zilla
gobyus
gofetch
GomezAgent
gooblog
Goodzer\/
Google AppsViewer
Google Desktop
Google favicon
Google Keyword Suggestion
Google Keyword Tool
Google Page Speed Insights
Google PP Default
Google Search Console
Google Web Preview
Google-Adwords
Google-Apps-Script
Google-Calendar-Importer
Google-HotelAdsVerifier
Google-HTTP-Java-Client
Google-Publisher-Plugin
Google-SearchByImage
Google-Site-Verification
Google-Structured-Data-Testing-Tool
Google-Youtube-Links
google-xrawler
GoogleDocs
GoogleHC\/
GoogleProducer
GoogleSites
Google-Transparency-Report
Gookey
GoScraper
GoSpotCheck
gosquared-thumbnailer
Gotit
GoZilla
grabify
GrabNet
Grafula
Grammarly
GrapeFX
GreatNews
Gregarius
GRequests
grokkit
grouphigh
grub-client
gSOAP\/
GT::WWW
GTmetrix
GuzzleHttp
gvfs\/
HAA(A)?RTLAND http client
Haansoft
hackney\/
Hadi Agent
HappyApps-WebCheck
Hatena
Havij
HeadlessChrome
HEADMasterSEO
HeartRails_Capture
help@dataminr\.com
heritrix
historious
hkedcity
hledejLevne\.cz
Hloader
HMView
Holmes
HonesoSearchEngine
HootSuite Image proxy
Hootsuite-WebFeed
hosterstats
HostTracker
ht:\/\/check
htdig
HTMLparser
htmlyse
HTTP Banner Detection
HTTP_Compression_Test
http_request2
http_requester
http-get
HTTP-Header-Abfrage
http-kit
http-request\/
HTTP-Tiny
HTTP::Lite
http\.rb\/
http_get
HttpComponents
httphr
HTTPMon
httpRequest
httpscheck
httpssites_power
httpunit
HttpUrlConnection
httrack
huaweisymantec
HubSpot
Humanlinks
i2kconnect\/
Iblog
ichiro
Id-search
IdeelaborPlagiaat
IDG Twitter Links Resolver
IDwhois\/
Iframely
igdeSpyder
IlTrovatore
Image Fetch
Image Sucker
ImageEngine\/
ImageVisu\/
Imagga
imagineeasy
imgsizer
InAGist
inbound\.li parser
InDesign%20CC
Indy Library
InetURL
infegy
infohelfer
InfoTekies
InfoWizards Reciprocal Link
inpwrd\.com
instabid
Instapaper
Integrity
integromedb
Intelliseek
InterGET
internet_archive
Internet Ninja
InternetSeer
internetVista monitor
intraVnews
IODC
IOI
iplabel
ips-agent
IPS\/[0-9]
IPWorks HTTP\/S Component
iqdb\/
Iria
Irokez
isitup\.org
iskanie
isUp\.li
iThemes Sync\/
iZSearch
JAHHO
janforman
Jaunt\/
Jbrofuzz
Jersey\/
JetCar
Jigsaw
Jobboerse
JobFeed discovery
Jobg8 URL Monitor
jobo
Jobrapido
Jobsearch1\.5
JoinVision Generic
JolokiaPwn
Joomla
Jorgee
JS-Kit
JustView
Kaspersky Lab CFR link resolver
Kelny\/
Kerrigan\/
KeyCDN
Keyword Density
Keywords Research
KickFire
KimonoLabs\/
Kml-Google
knows\.is
KOCMOHABT
kouio
kube-probe
kulturarw3
KumKie
L\.webis
Larbin
Lavf\/
LeechFTP
LeechGet
letsencrypt
Lftp
LibVLC
LibWeb
Libwhisker
libwww
Licorne
Liferea\/
Lightspeedsystems
Lighthouse
Likse
Link Valet
link_thumbnailer
LinkAlarm\/
linkCheck
linkdex
LinkExaminer
linkfluence
linkpeek
LinkPreviewGenerator
LinkScan
LinksManager
LinkTiger
LinkWalker
Lipperhey
Litemage_walker
livedoor ScreenShot
LoadImpactRload
localsearch-web
LongURL API
looksystems\.net
ltx71
lua-resty-http
lwp-request
lwp-trivial
LWP::Simple
lycos
LYT\.SR
mabontland
Mag-Net
MagpieRSS
Mail\.Ru
MailChimp
Majestic12
makecontact\/
Mandrill
MapperCmd
marketinggrader
MarkMonitor
MarkWatch
Mass Downloader
masscan\/
Mata Hari
Mediapartners-Google
mediawords
MegaIndex\.ru
MeltwaterNews
Melvil Rawi
MemGator
Metaspinner
MetaURI
MFC_Tear_Sample
Microsearch
Microsoft Office
Microsoft Outlook
Microsoft Windows Network Diagnostics
Microsoft-WebDAV-MiniRedir
Microsoft Data Access
MIDown tool
MIIxpc
Mindjet
Miniature\.io
Miniflux
Mister PiX
mixdata dot com
mixed-content-scan
Mixmax-LinkPreview
mixnode
Mnogosearch
mogimogi
Mojeek
Mojolicious \(Perl\)
Monit\/
monitis
Monitority\/
montastic
MonTools
Moreover
Morfeus Fucking Scanner
Morning Paper
MovableType
mowser
Mrcgiguy
MS Web Services Client Protocol
MSFrontPage
mShots
MuckRack\/
muhstik-scan
MVAClient
MxToolbox\/
nagios
Najdi\.si
Name Intelligence
Nameprotect
Navroad
NearSite
Needle
Nessus
Net Vampire
NetAnts
NETCRAFT
NetLyzer
NetMechanic
NetNewsWire
Netpursual
netresearch
NetShelter ContentScan
Netsparker
NetTrack
Netvibes
NetZIP
Neustar WPM
NeutrinoAPI
NewRelicPinger
NewsBlur .*Finder
NewsGator
newsme
newspaper\/
Nexgate Ruby Client
NG-Search
Nibbler
NICErsPRO
Nikto
nineconnections
NLNZ_IAHarvester
Nmap Scripting Engine
node-superagent
node-urllib
node\.io
Nodemeter
NodePing
nominet\.org\.uk
nominet\.uk
Norton-Safeweb
Notifixious
notifyninja
nuhk
nutch
Nuzzel
nWormFeedFinder
nyawc\/
Nymesis
NYU
Ocelli\/
Octopus
oegp
Offline Explorer
Offline Navigator
og-scraper
okhttp
omgili
OMSC
Online Domain Tools
OpenCalaisSemanticProxy
Openfind
OpenLinkProfiler
Openstat\/
OpenVAS
Optimizer
Orbiter
OrgProbe\/
orion-semantics
Outlook-Express
Outlook-iOS
ow\.ly
Owler
ownCloud News
OxfordCloudService
Page Valet
page_verifier
page scorer
page2rss
PageGrabber
PagePeeker
PageScorer
Pagespeed\/
Panopta
panscient
Papa Foto
parsijoo
Pavuk
PayPal IPN
pcBrowser
Pcore-HTTP
Pearltrees
PECL::HTTP
peerindex
Peew
PeoplePal
Perlu -
PhantomJS Screenshoter
PhantomJS\/
Photon\/
phpservermon
Pi-Monster
Picscout
Picsearch
PictureFinder
Pimonster
ping\.blo\.gs
Pingability
PingAdmin\.Ru
Pingdom
Pingoscope
PingSpot
pinterest\.com
Pixray
Pizilla
Plagger\/
Ploetz \+ Zeller
Plukkie
plumanalytics
PocketImageCache
PocketParser
Pockey
POE-Component-Client-HTTP
Polymail\/
Pompos
Porkbun
Port Monitor
postano
PostmanRuntime
PostPost
postrank
PowerPoint\/
Priceonomics Analysis Engine
PrintFriendly
PritTorrent
Prlog
probethenet
Project 25499
prospectb2b
Protopage
ProWebWalker
proximic
PRTG Network Monitor
pshtt, https scanning
PTST
PTST\/[0-9]+
Pulsepoint XT3 web scraper
Pump
Python-httplib2
python-requests
Python-urllib
Qirina Hurdler
QQDownload
QrafterPro
Qseero
Qualidator
QueryN Metasearch
queuedriver
Quora Link Preview
Qwantify
Radian6
RankActive
RankFlex
RankSonicSiteAuditor
Re-re Studio
ReactorNetty
Readability
RealDownload
RealPlayer%20Downloader
RebelMouse
Recorder
RecurPost\/
redback\/
ReederForMac
ReGet
RepoMonkey
request\.js
reqwest\/
ResponseCodeTest
RestSharp
Riddler
Rival IQ
Robosourcer
Robozilla
ROI Hunter
RPT-HTTPClient
RSSOwl
safe-agent-scanner
SalesIntelligent
Saleslift
Sendsay\.Ru
SauceNAO
SBIder
scalaj-http
scan\.lol
ScanAlert
Scoop
scooter
ScoutJet
ScoutURLMonitor
ScrapeBox Page Scanner
SimpleScraper
Scrapy
Screaming
ScreenShotService
Scrubby
Scrutiny\/
search\.thunderstone
Search37
searchenginepromotionhelp
Searchestate
SearchExpress
SearchSight
Seeker
semanticdiscovery
semanticjuice
Semiocast HTTP client
Semrush
sentry\/
SEO Browser
Seo Servis
seo-nastroj\.cz
seo4ajax
Seobility
SEOCentro
SeoCheck
SEOkicks
Seomoz
SEOprofiler
SEOsearch
seoscanners
seositecheckup
SEOstats
servernfo
sexsearcher
Seznam
Shelob
Shodan
Shoppimon
ShopWiki
ShortLinkTranslate
shrinktheweb
Sideqik
SimplePie
SimplyFast
Siphon
SISTRIX
Site-Shot\/
Site Sucker
Site24x7
SiteBar
Sitebeam
Sitebulb\/
SiteCondor
SiteExplorer
SiteGuardian
Siteimprove
SiteIndexed
Sitemap(s)? Generator
SitemapGenerator
SiteMonitor
Siteshooter B0t
SiteSnagger
SiteSucker
SiteTruth
Sitevigil
sitexy\.com
SkypeUriPreview
Slack\/
slider\.com
slurp
SlySearch
SmartDownload
SMRF URL Expander
SMUrlExpander
Snake
Snappy
SnapSearch
Snarfer\/
SniffRSS
sniptracker
Snoopy
SnowHaze Search
sogou web
SortSite
Sottopop
sovereign\.ai
SpaceBison
SpamExperts
Spammen
Spanner
spaziodati
SPDYCheck
Specificfeeds
speedy
SPEng
Spinn3r
spray-can
Sprinklr
spyonweb
sqlmap
Sqlworm
Sqworm
SSL Labs
ssl-tools
StackRambler
Statastico\/
StatusCake
Steeler
Stratagems Kumo
Stroke\.cz
StudioFACA
StumbleUpon
suchen
Sucuri
summify
SuperHTTP
Surphace Scout
Suzuran
SwiteScraper
Symfony BrowserKit
Symfony2 BrowserKit
SynHttpClient-Built
Sysomos
sysscan
Szukacz
T0PHackTeam
tAkeOut
Tarantula\/
Taringa UGC
TarmotGezgin
Teleport
Telesoft
Telesphoreo
Telesphorep
Tenon\.io
teoma
terrainformatica
Test Certificate Info
testuri
Tetrahedron
The Drop Reaper
The Expert HTML Source Viewer
The Knowledge AI
The Intraformant
theinternetrules
TheNomad
Thinklab
Thumbshots
ThumbSniper
timewe\.net
TinEye
Tiny Tiny RSS
TLSProbe\/
Toata
topster
touche\.com
Traackr\.com
tracemyfile
Trackuity
TrapitAgent
Trendiction
Trendsmap
trendspottr
truwoGPS
TryJsoup
TulipChain
Turingos
Turnitin
tweetedtimes
Tweetminster
Tweezler\/
twibble
Twice
Twikle
Twingly
Twisted PageGetter
Typhoeus
ubermetrics-technologies
uclassify
UdmSearch
unchaos
unirest-java
UniversalFeedParser
Unshorten\.It
Untiny
UnwindFetchor
updated
updown\.io daemon
Upflow
Uptimia
Urlcheckr
URL Verifier
URLitor
urlresolver
Urlstat
URLTester
UrlTrends Ranking Updater
URLy Warning
URLy\.Warning
Vacuum
Vagabondo
VB Project
vBSEO
VCI
via ggpht\.com GoogleImageProxy
VidibleScraper
Virusdie
visionutils
vkShare
VoidEYE
Voil
voltron
voyager\/
VSAgent\/
VSB-TUO\/
Vulnbusters Meter
VYU2
w3af\.org
W3C_Unicorn
W3C-checklink
W3C-mobileOK
WAC-OFU
Wallpapers\/[0-9]+
WallpapersHD
wangling
Wappalyzer
WatchMouse
WbSrch\/
WDT\.io
web-capture\.net
Web-sniffer
Web Auto
Web Collage
Web Enhancer
Web Fetch
Web Fuck
Web Pix
Web Sauger
Web Sucker
Webalta
Webauskunft
WebAuto
WebCapture
WebClient\/
webcollage
WebCookies
WebCopier
WebCorp
WebDataStats
WebDoc
WebEnhancer
WebFetch
WebFuck
WebGazer
WebGo IS
WebImageCollector
WebImages
WebIndex
webkit2png
WebLeacher
webmastercoffee
webmon
WebPix
WebReaper
WebSauger
webscreenie
Webshag
Webshot
Website Quester
websitepulse agent
WebsiteQuester
Websnapr
WebSniffer
Webster
WebStripper
WebSucker
Webthumb\/
WebThumbnail
WebWhacker
WebZIP
WeLikeLinks
WEPA
WeSEE
wf84
Wfuzz\/
wget
WhatsApp
WhatsMyIP
WhatWeb
WhereGoes\?
Whibse
WhoRunsCoinHive
Whynder Magnet
Windows-RSS-Platform
WinPodder
wkhtmlto
wmtips
Woko
woorankreview
Word\/
WordPress\/
WordupinfoSearch
wotbox
WP Engine Install Performance API
wpif
wprecon\.com survey
WPScan
wscheck
Wtrace
WWW-Collector-E
WWW-Mechanize
WWW::Document
WWW::Mechanize
www\.monitor\.us
WWWOFFLE
x09Mozilla
x22Mozilla
XaxisSemanticsClassifier
Xenu Link Sleuth
XING-contenttabreceiver
xpymep([0-9]?)\.exe
Y!J-(ASR|BSC)
Y\!J-BRW
Yaanb
yacy
Yahoo Link Preview
YahooCacheSystem
YahooYSMcm
YandeG
Yandex(?!Search)
yanga
yeti
Yo-yo
Yoleo Consumer
yoogliFetchAgent
YottaaMonitor
Your-Website-Sucks
yourls\.org
YoYs\.net
YP\.PL
Zabbix
Zade
Zao
Zauba
Zemanta Aggregator
Zend_Http_Client
Zend\\Http\\Client
Zermelo
Zeus
zgrab
ZnajdzFoto
Zombie\.js
Zoom\.Mac
ZyBorg
[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)

View file

@ -0,0 +1 @@
["Safari.[\\d\\.]*","Firefox.[\\d\\.]*"," Chrome.[\\d\\.]*","Chromium.[\\d\\.]*","MSIE.[\\d\\.]","Opera\\\/[\\d\\.]*","Mozilla.[\\d\\.]*","AppleWebKit.[\\d\\.]*","Trident.[\\d\\.]*","Windows NT.[\\d\\.]*","Android [\\d\\.]*","Macintosh.","Ubuntu","Linux","[ ]Intel","Mac OS X [\\d_]*","(like )?Gecko(.[\\d\\.]*)?","KHTML,","CriOS.[\\d\\.]*","CPU iPhone OS ([0-9_])* like Mac OS X","CPU OS ([0-9_])* like Mac OS X","iPod","compatible","x86_..","i686","x64","X11","rv:[\\d\\.]*","Version.[\\d\\.]*","WOW64","Win64","Dalvik.[\\d\\.]*"," \\.NET CLR [\\d\\.]*","Presto.[\\d\\.]*","Media Center PC","BlackBerry","Build","Opera Mini\\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\\/\\d{1,2}\\.","Opera"," \\.NET[\\d\\.]*","cubot","; M bot","; CRONO","; B bot","; IDbot","; ID bot","; POWER BOT",";"]

View file

@ -0,0 +1,48 @@
Safari.[\d\.]*
Firefox.[\d\.]*
Chrome.[\d\.]*
Chromium.[\d\.]*
MSIE.[\d\.]
Opera\/[\d\.]*
Mozilla.[\d\.]*
AppleWebKit.[\d\.]*
Trident.[\d\.]*
Windows NT.[\d\.]*
Android [\d\.]*
Macintosh.
Ubuntu
Linux
[ ]Intel
Mac OS X [\d_]*
(like )?Gecko(.[\d\.]*)?
KHTML,
CriOS.[\d\.]*
CPU iPhone OS ([0-9_])* like Mac OS X
CPU OS ([0-9_])* like Mac OS X
iPod
compatible
x86_..
i686
x64
X11
rv:[\d\.]*
Version.[\d\.]*
WOW64
Win64
Dalvik.[\d\.]*
\.NET CLR [\d\.]*
Presto.[\d\.]*
Media Center PC
BlackBerry
Build
Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.
Opera
\.NET[\d\.]*
cubot
; M bot
; CRONO
; B bot
; IDbot
; ID bot
; POWER BOT
;

View file

@ -0,0 +1 @@
["HTTP_USER_AGENT","HTTP_X_OPERAMINI_PHONE_UA","HTTP_X_DEVICE_USER_AGENT","HTTP_X_ORIGINAL_USER_AGENT","HTTP_X_SKYFIRE_PHONE","HTTP_X_BOLT_PHONE_UA","HTTP_DEVICE_STOCK_UA","HTTP_X_UCBROWSER_DEVICE_UA","HTTP_FROM","HTTP_X_SCANNER"]

View file

@ -0,0 +1,10 @@
HTTP_USER_AGENT
HTTP_X_OPERAMINI_PHONE_UA
HTTP_X_DEVICE_USER_AGENT
HTTP_X_ORIGINAL_USER_AGENT
HTTP_X_SKYFIRE_PHONE
HTTP_X_BOLT_PHONE_UA
HTTP_DEVICE_STOCK_UA
HTTP_X_UCBROWSER_DEVICE_UA
HTTP_FROM
HTTP_X_SCANNER

View file

@ -0,0 +1,193 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect;
use Jaybizzle\CrawlerDetect\Fixtures\Crawlers;
use Jaybizzle\CrawlerDetect\Fixtures\Exclusions;
use Jaybizzle\CrawlerDetect\Fixtures\Headers;
class CrawlerDetect
{
    /**
     * User agent string checked by isCrawler() when it is called without an argument.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Request headers kept by setHttpHeaders(): only entries whose key starts with "HTTP_".
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches captured by the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * Provider of the crawler regex fragments.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Crawlers
     */
    protected $crawlers;

    /**
     * Provider of the regex fragments stripped from the agent before matching.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Exclusions
     */
    protected $exclusions;

    /**
     * Provider of the header names that may carry a user agent.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Headers
     */
    protected $uaHttpHeaders;

    /**
     * All crawler fragments compiled into one alternation group.
     *
     * @var string
     */
    protected $compiledRegex;

    /**
     * All exclusion fragments compiled into one alternation group.
     *
     * @var string
     */
    protected $compiledExclusions;

    /**
     * Build the detector: load the fixtures, compile both regexes and
     * capture the request headers and user agent.
     *
     * @param array|null  $headers   Header map to inspect; $_SERVER is used when null or empty.
     * @param string|null $userAgent Explicit user agent; derived from the headers when null.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->crawlers = new Crawlers();
        $this->exclusions = new Exclusions();
        $this->uaHttpHeaders = new Headers();

        $this->compiledRegex = $this->compileRegex($this->crawlers->getAll());
        $this->compiledExclusions = $this->compileRegex($this->exclusions->getAll());

        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Compile the regex patterns into one regex string.
     *
     * The fragments are joined with "|" and wrapped in a single group; no
     * delimiters or flags are added here.
     *
     * @param array $patterns Regex fragments.
     *
     * @return string
     */
    public function compileRegex($patterns)
    {
        return '('.implode('|', $patterns).')';
    }

    /**
     * Set HTTP headers.
     *
     * Replaces any previously stored headers.
     *
     * @param array|null $httpHeaders Header map; anything that is not a non-empty
     *                                array falls back to the global $_SERVER.
     */
    public function setHttpHeaders($httpHeaders)
    {
        // Use global _SERVER if $httpHeaders aren't defined.
        if (! is_array($httpHeaders) || ! count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // Clear existing headers.
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means
        // only _SERVER vars that start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (strpos($key, 'HTTP_') === 0) {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array Names of the headers that may contain a user agent.
     */
    public function getUaHttpHeaders()
    {
        return $this->uaHttpHeaders->getAll();
    }

    /**
     * Set the user agent.
     *
     * When null is given, the values of every known user-agent header present
     * in the stored headers are concatenated, each followed by a space. If none
     * of them is present the user agent stays null.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent)
    {
        if (is_null($userAgent)) {
            foreach ($this->getUaHttpHeaders() as $altHeader) {
                if (isset($this->httpHeaders[$altHeader])) {
                    $userAgent .= $this->httpHeaders[$altHeader].' ';
                }
            }
        }

        return $this->userAgent = $userAgent;
    }

    /**
     * Check user agent string against the regex.
     *
     * The exclusion fragments are stripped from the agent first; whatever is
     * left is matched case-insensitively against the crawler regex. The stored
     * matches always describe this call, so getMatches() returns null after a
     * call that did not detect a crawler.
     *
     * @param string|null $userAgent Agent to test; the stored user agent is used when empty.
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        // The stored user agent is null when no header supplied one; cast so
        // preg_replace() never receives null as its subject.
        $subject = (string) ($userAgent ?: $this->userAgent);

        // preg_replace() yields null on a PCRE error; the cast turns that into
        // an empty agent instead of handing null to trim().
        $agent = trim((string) preg_replace(
            "/{$this->compiledExclusions}/i",
            '',
            $subject
        ));

        // Drop the previous call's result so getMatches() cannot report a
        // stale crawler name.
        $this->matches = array();

        if ($agent === '') {
            return false;
        }

        return (bool) preg_match("/{$this->compiledRegex}/i", $agent, $this->matches);
    }

    /**
     * Return the matches.
     *
     * @return string|null The full text matched by the last isCrawler() call, or null if it found nothing.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}

View file

@ -0,0 +1,32 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
* Base class for the fixture providers: holds a data set in $data and
* exposes it through getAll().
*/
abstract class AbstractProvider
{
/**
* The data set.
*
* Declared without a default here; the concrete providers visible in this
* package redeclare it with their own array.
*
* @var array
*/
protected $data;
/**
* Return the data set.
*
* @return array The contents of $data, unmodified.
*/
public function getAll()
{
return $this->data;
}
}

View file

@ -0,0 +1,1240 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
* Fixture holding the regex fragments that identify crawler user agents.
*/
class Crawlers extends AbstractProvider
{
/**
* Array of regular expressions to match against the user agent.
*
* CrawlerDetect::compileRegex() joins these fragments with "|" and
* CrawlerDetect::isCrawler() wraps the result in "/" delimiters with the
* "i" flag, so every literal "/" in a fragment is escaped and matching is
* case-insensitive. Fragments are alternatives in the listed order; the
* final entry is a generic catch-all for names containing words such as
* "bot" or "spider".
*
* @var array
*/
protected $data = array(
'.*Java.*outbrain',
' YLT',
'^b0t$',
'^bluefish ',
'^Calypso v\/',
'^COMODO DCV',
'^DangDang',
'^DavClnt',
'^FDM ',
'^git\/',
'^Goose\/',
'^Grabber',
'^HTTPClient\/',
'^Java\/',
'^Jeode\/',
'^Jetty\/',
'^Mail\/',
'^Mget',
'^Microsoft URL Control',
'^NG\/[0-9\.]',
'^NING\/',
'^PHP\/[0-9]',
'^RMA\/',
'^Ruby|Ruby\/[0-9]',
'^VSE\/[0-9]',
'^WordPress\.com',
'^XRL\/[0-9]',
'^ZmEu',
'008\/',
'13TABS',
'192\.comAgent',
'2ip\.ru',
'404enemy',
'7Siters',
'80legs',
'a\.pr-cy\.ru',
'a3logics\.in',
'A6-Indexer',
'Abonti',
'Aboundex',
'aboutthedomain',
'Accoona-AI-Agent',
'acoon',
'acrylicapps\.com\/pulp',
'Acunetix',
'AdAuth\/',
'adbeat',
'AddThis',
'ADmantX',
'AdminLabs',
'adressendeutschland',
'adscanner',
'Adstxtaggregator',
'agentslug',
'AHC',
'aihit',
'aiohttp\/',
'Airmail',
'akka-http\/',
'akula\/',
'alertra',
'alexa site audit',
'Alibaba\.Security\.Heimdall',
'Alligator',
'allloadin',
'AllSubmitter',
'alyze\.info',
'amagit',
'Anarchie',
'AndroidDownloadManager',
'Anemone',
'AngleSharp',
'annotate_google',
'Ant\.com',
'Anturis Agent',
'AnyEvent-HTTP\/',
'Apache Droid',
'Apache OpenOffice',
'Apache-HttpAsyncClient',
'Apache-HttpClient',
'ApacheBench',
'Apexoo',
'APIs-Google',
'AportWorm\/',
'AppBeat\/',
'AppEngine-Google',
'AppStoreScraperZ',
'Aprc\/[0-9]',
'Arachmo',
'arachnode',
'Arachnophilia',
'aria2',
'Arukereso',
'asafaweb',
'AskQuickly',
'Ask Jeeves',
'ASPSeek',
'Asterias',
'Astute',
'asynchttp',
'Attach',
'autocite',
'Autonomy',
'axios\/',
'B-l-i-t-z-B-O-T',
'Backlink-Ceck',
'backlink-check',
'BacklinkHttpStatus',
'BackStreet',
'BackWeb',
'Bad-Neighborhood',
'Badass',
'baidu\.com',
'Bandit',
'basicstate',
'BatchFTP',
'Battleztar Bazinga',
'baypup\/',
'BazQux',
'BBBike',
'BCKLINKS',
'BDFetch',
'BegunAdvertising',
'Bidtellect',
'BigBozz',
'Bigfoot',
'biglotron',
'BingLocalSearch',
'BingPreview',
'binlar',
'biNu image cacher',
'Bitacle',
'biz_Directory',
'Black Hole',
'Blackboard Safeassign',
'BlackWidow',
'BlockNote\.Net',
'Bloglines',
'Bloglovin',
'BlogPulseLive',
'BlogSearch',
'Blogtrottr',
'BlowFish',
'boitho\.com-dc',
'BPImageWalker',
'Braintree-Webhooks',
'Branch Metrics API',
'Branch-Passthrough',
'Brandprotect',
'BrandVerity',
'Brandwatch',
'Brodie\/',
'Browsershots',
'BUbiNG',
'Buck\/',
'Buddy',
'BuiltWith',
'Bullseye',
'BunnySlippers',
'Burf Search',
'Butterfly\/',
'BuzzSumo',
'CAAM\/[0-9]',
'CakePHP',
'Calculon',
'Canary%20Mail',
'CaretNail',
'catexplorador',
'CC Metadata Scaper',
'Cegbfeieh',
'censys',
'Cerberian Drtrs',
'CERT\.at-Statistics-Survey',
'cg-eye',
'changedetection',
'ChangesMeter',
'Charlotte',
'CheckHost',
'checkprivacy',
'CherryPicker',
'ChinaClaw',
'Chirp\/',
'chkme\.com',
'Chlooe',
'Chromaxa',
'CirrusExplorer',
'CISPA Vulnerability Notification',
'Citoid',
'CJNetworkQuality',
'Clarsentia',
'clips\.ua\.ac\.be',
'Cloud mapping',
'CloudEndure',
'CloudFlare-AlwaysOnline',
'Cloudinary',
'cmcm\.com',
'coccoc',
'cognitiveseo',
'colly -',
'CommaFeed',
'Commons-HttpClient',
'commonscan',
'contactbigdatafr',
'contentkingapp',
'convera',
'CookieReports',
'copyright sheriff',
'CopyRightCheck',
'Copyscape',
'Cosmos4j\.feedback',
'Covario-IDS',
'Crescent',
'Crowsnest',
'Criteo',
'CSHttp',
'curb',
'Curious George',
'curl',
'cuwhois\/',
'cybo\.com',
'DAP\/NetHTTP',
'DareBoost',
'DatabaseDriverMysqli',
'DataCha0s',
'Datafeedwatch',
'Datanyze',
'DataparkSearch',
'dataprovider',
'DataXu',
'Daum(oa)?[ \/][0-9]',
'Demon',
'DeuSu',
'developers\.google\.com\/\+\/web\/snippet\/',
'Devil',
'Digg',
'Digincore',
'DigitalPebble',
'Dirbuster',
'Discourse Forum Onebox',
'Disqus\/',
'Dispatch\/',
'DittoSpyder',
'dlvr',
'DMBrowser',
'DNSPod-reporting',
'docoloc',
'Dolphin http client',
'DomainAppender',
'Donuts Content Explorer',
'dotMailer content retrieval',
'dotSemantic',
'downforeveryoneorjustme',
'Download Wonder',
'downnotifier',
'DowntimeDetector',
'Drip',
'drupact',
'Drupal \(\+http:\/\/drupal\.org\/\)',
'DTS Agent',
'dubaiindex',
'EARTHCOM',
'Easy-Thumb',
'EasyDL',
'Ebingbong',
'ec2linkfinder',
'eCairn-Grabber',
'eCatch',
'ECCP',
'eContext\/',
'Ecxi',
'EirGrabber',
'ElectricMonk',
'elefent',
'EMail Exractor',
'EMail Wolf',
'EmailWolf',
'Embarcadero',
'Embed PHP Library',
'Embedly',
'endo\/',
'europarchive\.org',
'evc-batch',
'EventMachine HttpClient',
'Everwall Link Expander',
'Evidon',
'Evrinid',
'ExactSearch',
'ExaleadCloudview',
'Excel\/',
'exif',
'Exploratodo',
'Express WebPictures',
'Extreme Picture Finder',
'EyeNetIE',
'ezooms',
'facebookexternalhit',
'facebookplatform',
'fairshare',
'Faraday v',
'fasthttp',
'Faveeo',
'Favicon downloader',
'faviconkit',
'faviconarchive',
'FavOrg',
'Feed Wrangler',
'Feedable\/',
'Feedbin',
'FeedBooster',
'FeedBucket',
'FeedBunch\/',
'FeedBurner',
'feeder',
'Feedly',
'FeedshowOnline',
'Feedspot',
'Feedwind\/',
'FeedZcollector',
'feeltiptop',
'Fetch API',
'Fetch\/[0-9]',
'Fever\/[0-9]',
'FHscan',
'Fimap',
'findlink',
'findthatfile',
'FlashGet',
'FlipboardBrowserProxy',
'FlipboardProxy',
'FlipboardRSS',
'Flock\/',
'fluffy',
'Flunky',
'flynxapp',
'forensiq',
'FoundSeoTool',
'http:\/\/www.neomo.de\/', // 'Francis [Bot]' - NOTE(review): the dots in this entry are unescaped, unlike the other host-name entries; confirm whether that is intended.
'free thumbnails',
'Freeuploader',
'Funnelback',
'G-i-g-a-b-o-t',
'g00g1e\.net',
'ganarvisitas',
'geek-tools',
'Genieo',
'GentleSource',
'GetCode',
'Getintent',
'GetLinkInfo',
'getprismatic',
'GetRight',
'getroot',
'GetURLInfo\/',
'GetWeb',
'Ghost Inspector',
'GigablastOpenSource',
'GIS-LABS',
'github-camo',
'github\.com',
'Go [\d\.]* package http',
'Go http package',
'Go-Ahead-Got-It',
'Go-http-client',
'Go!Zilla',
'gobyus',
'gofetch',
'GomezAgent',
'gooblog',
'Goodzer\/',
'Google AppsViewer',
'Google Desktop',
'Google favicon',
'Google Keyword Suggestion',
'Google Keyword Tool',
'Google Page Speed Insights',
'Google PP Default',
'Google Search Console',
'Google Web Preview',
'Google-Adwords',
'Google-Apps-Script',
'Google-Calendar-Importer',
'Google-HotelAdsVerifier',
'Google-HTTP-Java-Client',
'Google-Publisher-Plugin',
'Google-SearchByImage',
'Google-Site-Verification',
'Google-Structured-Data-Testing-Tool',
'Google-Youtube-Links',
'google-xrawler',
'GoogleDocs',
'GoogleHC\/',
'GoogleProducer',
'GoogleSites',
'Google-Transparency-Report',
'Gookey',
'GoScraper',
'GoSpotCheck',
'gosquared-thumbnailer',
'Gotit',
'GoZilla',
'grabify',
'GrabNet',
'Grafula',
'Grammarly',
'GrapeFX',
'GreatNews',
'Gregarius',
'GRequests',
'grokkit',
'grouphigh',
'grub-client',
'gSOAP\/',
'GT::WWW',
'GTmetrix',
'GuzzleHttp',
'gvfs\/',
'HAA(A)?RTLAND http client',
'Haansoft',
'hackney\/',
'Hadi Agent',
'HappyApps-WebCheck',
'Hatena',
'Havij',
'HeadlessChrome',
'HEADMasterSEO',
'HeartRails_Capture',
'help@dataminr\.com',
'heritrix',
'historious',
'hkedcity',
'hledejLevne\.cz',
'Hloader',
'HMView',
'Holmes',
'HonesoSearchEngine',
'HootSuite Image proxy',
'Hootsuite-WebFeed',
'hosterstats',
'HostTracker',
'ht:\/\/check',
'htdig',
'HTMLparser',
'htmlyse',
'HTTP Banner Detection',
'HTTP_Compression_Test',
'http_request2',
'http_requester',
'http-get',
'HTTP-Header-Abfrage',
'http-kit',
'http-request\/',
'HTTP-Tiny',
'HTTP::Lite',
'http\.rb\/',
'http_get',
'HttpComponents',
'httphr',
'HTTPMon',
'httpRequest',
'httpscheck',
'httpssites_power',
'httpunit',
'HttpUrlConnection',
'httrack',
'huaweisymantec',
'HubSpot ',
'Humanlinks',
'i2kconnect\/',
'Iblog',
'ichiro',
'Id-search',
'IdeelaborPlagiaat',
'IDG Twitter Links Resolver',
'IDwhois\/',
'Iframely',
'igdeSpyder',
'IlTrovatore',
'Image Fetch',
'Image Sucker',
'ImageEngine\/',
'ImageVisu\/',
'Imagga',
'imagineeasy',
'imgsizer',
'InAGist',
'inbound\.li parser',
'InDesign%20CC',
'Indy Library',
'InetURL',
'infegy',
'infohelfer',
'InfoTekies',
'InfoWizards Reciprocal Link',
'inpwrd\.com',
'instabid',
'Instapaper',
'Integrity',
'integromedb',
'Intelliseek',
'InterGET',
'internet_archive',
'Internet Ninja',
'InternetSeer',
'internetVista monitor',
'intraVnews',
'IODC',
'IOI',
'iplabel',
'ips-agent',
'IPS\/[0-9]',
'IPWorks HTTP\/S Component',
'iqdb\/',
'Iria',
'Irokez',
'isitup\.org',
'iskanie',
'isUp\.li',
'iThemes Sync\/',
'iZSearch',
'JAHHO',
'janforman',
'Jaunt\/',
'Jbrofuzz',
'Jersey\/',
'JetCar',
'Jigsaw',
'Jobboerse',
'JobFeed discovery',
'Jobg8 URL Monitor',
'jobo',
'Jobrapido',
'Jobsearch1\.5',
'JoinVision Generic',
'JolokiaPwn',
'Joomla',
'Jorgee',
'JS-Kit',
'JustView',
'Kaspersky Lab CFR link resolver',
'Kelny\/',
'Kerrigan\/',
'KeyCDN',
'Keyword Density',
'Keywords Research',
'KickFire',
'KimonoLabs\/',
'Kml-Google',
'knows\.is',
'KOCMOHABT',
'kouio',
'kube-probe',
'kulturarw3',
'KumKie',
'L\.webis',
'Larbin',
'Lavf\/',
'LeechFTP',
'LeechGet',
'letsencrypt',
'Lftp',
'LibVLC',
'LibWeb',
'Libwhisker',
'libwww',
'Licorne',
'Liferea\/',
'Lightspeedsystems',
'Lighthouse',
'Likse',
'Link Valet',
'link_thumbnailer',
'LinkAlarm\/',
'linkCheck',
'linkdex',
'LinkExaminer',
'linkfluence',
'linkpeek',
'LinkPreviewGenerator',
'LinkScan',
'LinksManager',
'LinkTiger',
'LinkWalker',
'Lipperhey',
'Litemage_walker',
'livedoor ScreenShot',
'LoadImpactRload',
'localsearch-web',
'LongURL API',
'looksystems\.net',
'ltx71',
'lua-resty-http',
'lwp-request',
'lwp-trivial',
'LWP::Simple',
'lycos',
'LYT\.SR',
'mabontland',
'Mag-Net',
'MagpieRSS',
'Mail\.Ru',
'MailChimp',
'Majestic12',
'makecontact\/',
'Mandrill',
'MapperCmd',
'marketinggrader',
'MarkMonitor',
'MarkWatch',
'Mass Downloader',
'masscan\/',
'Mata Hari',
'Mediapartners-Google',
'mediawords',
'MegaIndex\.ru',
'MeltwaterNews',
'Melvil Rawi',
'MemGator',
'Metaspinner',
'MetaURI',
'MFC_Tear_Sample',
'Microsearch',
'Microsoft Office ',
'Microsoft Outlook',
'Microsoft Windows Network Diagnostics',
'Microsoft-WebDAV-MiniRedir',
'Microsoft Data Access',
'MIDown tool',
'MIIxpc',
'Mindjet',
'Miniature\.io',
'Miniflux',
'Mister PiX',
'mixdata dot com',
'mixed-content-scan',
'Mixmax-LinkPreview',
'mixnode',
'Mnogosearch',
'mogimogi',
'Mojeek',
'Mojolicious \(Perl\)',
'Monit\/',
'monitis',
'Monitority\/',
'montastic',
'MonTools',
'Moreover',
'Morfeus Fucking Scanner',
'Morning Paper',
'MovableType',
'mowser',
'Mrcgiguy',
'MS Web Services Client Protocol',
'MSFrontPage',
'mShots',
'MuckRack\/',
'muhstik-scan',
'MVAClient',
'MxToolbox\/',
'nagios',
'Najdi\.si',
'Name Intelligence',
'Nameprotect',
'Navroad',
'NearSite',
'Needle',
'Nessus',
'Net Vampire',
'NetAnts',
'NETCRAFT',
'NetLyzer',
'NetMechanic',
'NetNewsWire',
'Netpursual',
'netresearch',
'NetShelter ContentScan',
'Netsparker',
'NetTrack',
'Netvibes',
'NetZIP',
'Neustar WPM',
'NeutrinoAPI',
'NewRelicPinger',
'NewsBlur .*Finder',
'NewsGator',
'newsme',
'newspaper\/',
'Nexgate Ruby Client',
'NG-Search',
'Nibbler',
'NICErsPRO',
'Nikto',
'nineconnections',
'NLNZ_IAHarvester',
'Nmap Scripting Engine',
'node-superagent',
'node-urllib',
'node\.io',
'Nodemeter',
'NodePing',
'nominet\.org\.uk',
'nominet\.uk',
'Norton-Safeweb',
'Notifixious',
'notifyninja',
'nuhk',
'nutch',
'Nuzzel',
'nWormFeedFinder',
'nyawc\/',
'Nymesis',
'NYU',
'Ocelli\/',
'Octopus',
'oegp',
'Offline Explorer',
'Offline Navigator',
'og-scraper',
'okhttp',
'omgili',
'OMSC',
'Online Domain Tools',
'OpenCalaisSemanticProxy',
'Openfind',
'OpenLinkProfiler',
'Openstat\/',
'OpenVAS',
'Optimizer',
'Orbiter',
'OrgProbe\/',
'orion-semantics',
'Outlook-Express',
'Outlook-iOS',
'ow\.ly',
'Owler',
'ownCloud News',
'OxfordCloudService',
'Page Valet',
'page_verifier',
'page scorer',
'page2rss',
'PageGrabber',
'PagePeeker',
'PageScorer',
'Pagespeed\/',
'Panopta',
'panscient',
'Papa Foto',
'parsijoo',
'Pavuk',
'PayPal IPN',
'pcBrowser',
'Pcore-HTTP',
'Pearltrees',
'PECL::HTTP',
'peerindex',
'Peew',
'PeoplePal',
'Perlu -',
'PhantomJS Screenshoter',
'PhantomJS\/',
'Photon\/',
'phpservermon',
'Pi-Monster',
'Picscout',
'Picsearch',
'PictureFinder',
'Pimonster',
'ping\.blo\.gs',
'Pingability',
'PingAdmin\.Ru',
'Pingdom',
'Pingoscope',
'PingSpot',
'pinterest\.com',
'Pixray',
'Pizilla',
'Plagger\/',
'Ploetz \+ Zeller',
'Plukkie',
'plumanalytics',
'PocketImageCache',
'PocketParser',
'Pockey',
'POE-Component-Client-HTTP',
'Polymail\/',
'Pompos',
'Porkbun',
'Port Monitor',
'postano',
'PostmanRuntime',
'PostPost',
'postrank',
'PowerPoint\/',
'Priceonomics Analysis Engine',
'PrintFriendly',
'PritTorrent',
'Prlog',
'probethenet',
'Project 25499',
'prospectb2b',
'Protopage',
'ProWebWalker',
'proximic',
'PRTG Network Monitor',
'pshtt, https scanning',
'PTST ',
'PTST\/[0-9]+',
'Pulsepoint XT3 web scraper',
'Pump',
'Python-httplib2',
'python-requests',
'Python-urllib',
'Qirina Hurdler',
'QQDownload',
'QrafterPro',
'Qseero',
'Qualidator',
'QueryN Metasearch',
'queuedriver',
'Quora Link Preview',
'Qwantify',
'Radian6',
'RankActive',
'RankFlex',
'RankSonicSiteAuditor',
'Re-re Studio',
'ReactorNetty',
'Readability',
'RealDownload',
'RealPlayer%20Downloader',
'RebelMouse',
'Recorder',
'RecurPost\/',
'redback\/',
'ReederForMac',
'ReGet',
'RepoMonkey',
'request\.js',
'reqwest\/',
'ResponseCodeTest',
'RestSharp',
'Riddler',
'Rival IQ',
'Robosourcer',
'Robozilla',
'ROI Hunter',
'RPT-HTTPClient',
'RSSOwl',
'safe-agent-scanner',
'SalesIntelligent',
'Saleslift',
'Sendsay\.Ru',
'SauceNAO',
'SBIder',
'scalaj-http',
'scan\.lol',
'ScanAlert',
'Scoop',
'scooter',
'ScoutJet',
'ScoutURLMonitor',
'ScrapeBox Page Scanner',
'SimpleScraper',
'Scrapy',
'Screaming',
'ScreenShotService',
'Scrubby',
'Scrutiny\/',
'search\.thunderstone',
'Search37',
'searchenginepromotionhelp',
'Searchestate',
'SearchExpress',
'SearchSight',
'Seeker',
'semanticdiscovery',
'semanticjuice',
'Semiocast HTTP client',
'Semrush',
'sentry\/',
'SEO Browser',
'Seo Servis',
'seo-nastroj\.cz',
'seo4ajax',
'Seobility',
'SEOCentro',
'SeoCheck',
'SEOkicks',
'Seomoz',
'SEOprofiler',
'SEOsearch',
'seoscanners',
'seositecheckup',
'SEOstats',
'servernfo',
'sexsearcher',
'Seznam',
'Shelob',
'Shodan',
'Shoppimon',
'ShopWiki',
'ShortLinkTranslate',
'shrinktheweb',
'Sideqik',
'SimplePie',
'SimplyFast',
'Siphon',
'SISTRIX',
'Site-Shot\/',
'Site Sucker',
'Site24x7',
'SiteBar',
'Sitebeam',
'Sitebulb\/',
'SiteCondor',
'SiteExplorer',
'SiteGuardian',
'Siteimprove',
'SiteIndexed',
'Sitemap(s)? Generator',
'SitemapGenerator',
'SiteMonitor',
'Siteshooter B0t',
'SiteSnagger',
'SiteSucker',
'SiteTruth',
'Sitevigil',
'sitexy\.com',
'SkypeUriPreview',
'Slack\/',
'slider\.com',
'slurp',
'SlySearch',
'SmartDownload',
'SMRF URL Expander',
'SMUrlExpander',
'Snake',
'Snappy',
'SnapSearch',
'Snarfer\/',
'SniffRSS',
'sniptracker',
'Snoopy',
'SnowHaze Search',
'sogou web',
'SortSite',
'Sottopop',
'sovereign\.ai',
'SpaceBison',
'SpamExperts',
'Spammen',
'Spanner',
'spaziodati',
'SPDYCheck',
'Specificfeeds',
'speedy',
'SPEng',
'Spinn3r',
'spray-can',
'Sprinklr ',
'spyonweb',
'sqlmap',
'Sqlworm',
'Sqworm',
'SSL Labs',
'ssl-tools',
'StackRambler',
'Statastico\/',
'StatusCake',
'Steeler',
'Stratagems Kumo',
'Stroke\.cz',
'StudioFACA',
'StumbleUpon',
'suchen',
'Sucuri',
'summify',
'SuperHTTP',
'Surphace Scout',
'Suzuran',
'SwiteScraper',
'Symfony BrowserKit',
'Symfony2 BrowserKit',
'SynHttpClient-Built',
'Sysomos',
'sysscan',
'Szukacz',
'T0PHackTeam',
'tAkeOut',
'Tarantula\/',
'Taringa UGC',
'TarmotGezgin',
'Teleport',
'Telesoft',
'Telesphoreo',
'Telesphorep',
'Tenon\.io',
'teoma',
'terrainformatica',
'Test Certificate Info',
'testuri',
'Tetrahedron',
'The Drop Reaper',
'The Expert HTML Source Viewer',
'The Knowledge AI',
'The Intraformant',
'theinternetrules',
'TheNomad',
'Thinklab',
'Thumbshots',
'ThumbSniper',
'timewe\.net',
'TinEye',
'Tiny Tiny RSS',
'TLSProbe\/',
'Toata',
'topster',
'touche\.com',
'Traackr\.com',
'tracemyfile',
'Trackuity',
'TrapitAgent',
'Trendiction',
'Trendsmap',
'trendspottr',
'truwoGPS',
'TryJsoup',
'TulipChain',
'Turingos',
'Turnitin',
'tweetedtimes',
'Tweetminster',
'Tweezler\/',
'twibble',
'Twice',
'Twikle',
'Twingly',
'Twisted PageGetter',
'Typhoeus',
'ubermetrics-technologies',
'uclassify',
'UdmSearch',
'unchaos',
'unirest-java',
'UniversalFeedParser',
'Unshorten\.It',
'Untiny',
'UnwindFetchor',
'updated',
'updown\.io daemon',
'Upflow',
'Uptimia',
'Urlcheckr',
'URL Verifier',
'URLitor',
'urlresolver',
'Urlstat',
'URLTester',
'UrlTrends Ranking Updater',
'URLy Warning',
'URLy\.Warning',
'Vacuum',
'Vagabondo',
'VB Project',
'vBSEO',
'VCI',
'via ggpht\.com GoogleImageProxy',
'VidibleScraper',
'Virusdie',
'visionutils',
'vkShare',
'VoidEYE',
'Voil',
'voltron',
'voyager\/',
'VSAgent\/',
'VSB-TUO\/',
'Vulnbusters Meter',
'VYU2',
'w3af\.org',
'W3C_Unicorn',
'W3C-checklink',
'W3C-mobileOK',
'WAC-OFU',
'Wallpapers\/[0-9]+',
'WallpapersHD',
'wangling',
'Wappalyzer',
'WatchMouse',
'WbSrch\/',
'WDT\.io',
'web-capture\.net',
'Web-sniffer',
'Web Auto',
'Web Collage',
'Web Enhancer',
'Web Fetch',
'Web Fuck',
'Web Pix',
'Web Sauger',
'Web Sucker',
'Webalta',
'Webauskunft',
'WebAuto',
'WebCapture',
'WebClient\/',
'webcollage',
'WebCookies',
'WebCopier',
'WebCorp',
'WebDataStats',
'WebDoc',
'WebEnhancer',
'WebFetch',
'WebFuck',
'WebGazer',
'WebGo IS',
'WebImageCollector',
'WebImages',
'WebIndex',
'webkit2png',
'WebLeacher',
'webmastercoffee',
'webmon ',
'WebPix',
'WebReaper',
'WebSauger',
'webscreenie',
'Webshag',
'Webshot',
'Website Quester',
'websitepulse agent',
'WebsiteQuester',
'Websnapr',
'WebSniffer',
'Webster',
'WebStripper',
'WebSucker',
'Webthumb\/',
'WebThumbnail',
'WebWhacker',
'WebZIP',
'WeLikeLinks',
'WEPA',
'WeSEE',
'wf84',
'Wfuzz\/',
'wget',
'WhatsApp',
'WhatsMyIP',
'WhatWeb',
'WhereGoes\?',
'Whibse',
'WhoRunsCoinHive',
'Whynder Magnet',
'Windows-RSS-Platform',
'WinPodder',
'wkhtmlto',
'wmtips',
'Woko',
'woorankreview',
'Word\/',
'WordPress\/',
'WordupinfoSearch',
'wotbox',
'WP Engine Install Performance API',
'wpif',
'wprecon\.com survey',
'WPScan',
'wscheck',
'Wtrace',
'WWW-Collector-E',
'WWW-Mechanize',
'WWW::Document',
'WWW::Mechanize',
'www\.monitor\.us',
'WWWOFFLE',
'x09Mozilla',
'x22Mozilla',
'XaxisSemanticsClassifier',
'Xenu Link Sleuth',
'XING-contenttabreceiver',
'xpymep([0-9]?)\.exe',
'Y!J-(ASR|BSC)',
'Y\!J-BRW',
'Yaanb',
'yacy',
'Yahoo Link Preview',
'YahooCacheSystem',
'YahooYSMcm',
'YandeG',
'Yandex(?!Search)',
'yanga',
'yeti',
'Yo-yo',
'Yoleo Consumer',
'yoogliFetchAgent',
'YottaaMonitor',
'Your-Website-Sucks',
'yourls\.org',
'YoYs\.net',
'YP\.PL',
'Zabbix',
'Zade',
'Zao',
'Zauba',
'Zemanta Aggregator',
'Zend_Http_Client',
'Zend\\\\Http\\\\Client',
'Zermelo',
'Zeus ',
'zgrab',
'ZnajdzFoto',
'Zombie\.js',
'Zoom\.Mac',
'ZyBorg',
// Generic catch-all: any name containing one of these crawler-related words.
'[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)',
);
}

View file

@ -0,0 +1,72 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
class Exclusions extends AbstractProvider
{
    /**
     * Regex fragments that are stripped out of the user agent string before
     * the crawler regex is run against it.
     *
     * The fragments cover common browser, engine, OS and device tokens.
     * Removing them first shortens the string the crawler regex has to scan;
     * over a large list of user agents this gives about a 55% speed increase.
     *
     * NOTE(review): the entries look order-sensitive (the specific
     * "; ... bot" entries precede the bare ";" catch-all), so the original
     * sequence is preserved exactly - confirm before reordering.
     *
     * @var array
     */
    protected $data = [
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        ' Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        // NOTE(review): unlike the neighbouring entries this one has no
        // trailing "*", so only a single version character is consumed.
        // Left unchanged; confirm whether that is intended.
        'MSIE.[\d\.]',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android [\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        '[ ]Intel',
        'Mac OS X [\d_]*',
        '(like )?Gecko(.[\d\.]*)?',
        'KHTML,',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        ' \.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
        'BlackBerry',
        'Build',
        'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
        'Opera',
        ' \.NET[\d\.]*',
        'cubot',
        '; M bot',
        '; CRONO',
        '; B bot',
        '; IDbot',
        '; ID bot',
        '; POWER BOT',
        // Last of all, strip any semicolon that is still left over.
        ';',
    ];
}

View file

@ -0,0 +1,37 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
class Headers extends AbstractProvider
{
    /**
     * Header keys (in their HTTP_* form) whose value may carry the user
     * agent string, or otherwise identify the requesting client.
     *
     * @var array
     */
    protected $data = [
        // Standard User-Agent header.
        'HTTP_USER_AGENT',

        // May be sent by devices browsing through Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',

        // Vodafone specific header, see
        // http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',

        // Bots (Google in particular) sometimes send a genuine user agent
        // but put their e-mail address into this header.
        'HTTP_FROM',

        // Seen in use by Netsparker.
        'HTTP_X_SCANNER',
    ];
}