New addon "tesseract" for OCR #1457

Merged
MrPetovan merged 3 commits from heluecht/friendica-addons:tesseract into develop 2024-01-15 23:58:42 +01:00
29 changed files with 1905 additions and 0 deletions

1
tesseract/README.md Normal file
View file

@ -0,0 +1 @@
To make the addon work, you have to install the tesseract-ocr command line tool.

5
tesseract/composer.json Normal file
View file

@ -0,0 +1,5 @@
{
"require": {
"thiagoalessio/tesseract_ocr": "^2.13"
}
}

66
tesseract/composer.lock generated Normal file
View file

@ -0,0 +1,66 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "778b5479cb5d2b31b57f40473a87f8eb",
"packages": [
{
"name": "thiagoalessio/tesseract_ocr",
"version": "2.13.0",
"source": {
"type": "git",
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"shasum": ""
},
"require": {
"php": "^5.3 || ^7.0 || ^8.0"
},
"require-dev": {
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
},
"type": "library",
"autoload": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "thiagoalessio",
"email": "thiagoalessio@me.com"
}
],
"description": "A wrapper to work with Tesseract OCR inside PHP.",
"keywords": [
"OCR",
"Tesseract",
"text recognition"
],
"time": "2023-10-05T21:14:48+00:00"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": [],
"platform-dev": [],
"platform-overrides": {
"php": "7.2"
},
"plugin-api-version": "1.1.0"
}

33
tesseract/tesseract.php Normal file
View file

@ -0,0 +1,33 @@
<?php
/**
* Name: Tesseract OCR
* Description: Use OCR to get text from images
* Version: 0.1
* Author: Michael Vogel <http://pirati.ca/profile/heluecht>
*/
use Friendica\Core\Hook;
use Friendica\Core\Logger;
use Friendica\Core\System;
use thiagoalessio\TesseractOCR\TesseractOCR;
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
/**
 * Addon install callback: registers the OCR hook handler of this addon
 * and logs that the installation ran.
 *
 * @return void
 */
function tesseract_install()
{
    $hookName = 'ocr-detection';
    $handler  = 'tesseract_ocr_detection';

    Hook::register($hookName, __FILE__, $handler);

    Logger::notice('installed tesseract');
}
/**
 * Hook callback for 'ocr-detection': runs Tesseract OCR on the image data
 * held in the media record and stores the recognized text as its description.
 *
 * Any error raised while creating or running the OCR wrapper is logged and
 * otherwise ignored, so a failing OCR run never aborts the caller.
 *
 * @param array $media Media record, passed by reference. Reads 'img_str'
 *                     (image data as a string) and writes 'description'
 *                     when the OCR run succeeds.
 *
 * @return void
 */
function tesseract_ocr_detection(&$media)
{
    // Without image data there is nothing to recognize: return before
    // touching the OCR wrapper so no PHP warnings or pointless external
    // process calls are produced.
    $imageData = $media['img_str'] ?? '';
    if (!is_string($imageData) || $imageData === '') {
        return;
    }

    try {
        // Instantiated inside the try block so that a failure while loading
        // or constructing the wrapper is logged like any other OCR error.
        $ocr = new TesseractOCR();
        $ocr->tempDir(System::getTempPath());
        $ocr->imageData($imageData, strlen($imageData));
        $media['description'] = $ocr->run();
    } catch (\Throwable $th) {
        Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
    }
}

7
tesseract/vendor/autoload.php vendored Normal file
View file

@ -0,0 +1,7 @@
<?php
// autoload.php @generated by Composer
// Load the generated bootstrap class, then return the class loader it
// builds on the first call and reuses on later calls.
require_once __DIR__ . '/composer/autoload_real.php';
return ComposerAutoloaderInit695d781792f754383aa61632167d066e::getLoader();

View file

@ -0,0 +1,445 @@
<?php
/*
* This file is part of Composer.
*
* (c) Nils Adermann <naderman@naderman.de>
* Jordi Boggiano <j.boggiano@seld.be>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Composer\Autoload;
/**
* ClassLoader implements a PSR-0, PSR-4 and classmap class loader.
*
* $loader = new \Composer\Autoload\ClassLoader();
*
* // register classes with namespaces
* $loader->add('Symfony\Component', __DIR__.'/component');
* $loader->add('Symfony', __DIR__.'/framework');
*
* // activate the autoloader
* $loader->register();
*
* // to enable searching the include path (eg. for PEAR packages)
* $loader->setUseIncludePath(true);
*
* In this example, if you try to use a class in the Symfony\Component
* namespace or one of its children (Symfony\Component\Console for instance),
* the autoloader will first look for the class under the component/
* directory, and it will then fallback to the framework/ directory if not
* found before giving up.
*
* This class is loosely based on the Symfony UniversalClassLoader.
*
* @author Fabien Potencier <fabien@symfony.com>
* @author Jordi Boggiano <j.boggiano@seld.be>
* @see http://www.php-fig.org/psr/psr-0/
* @see http://www.php-fig.org/psr/psr-4/
*/
class ClassLoader
{
// PSR-4
private $prefixLengthsPsr4 = array();
private $prefixDirsPsr4 = array();
private $fallbackDirsPsr4 = array();
// PSR-0
private $prefixesPsr0 = array();
private $fallbackDirsPsr0 = array();
private $useIncludePath = false;
private $classMap = array();
private $classMapAuthoritative = false;
private $missingClasses = array();
private $apcuPrefix;
/**
 * Returns all registered PSR-0 prefixes with their directories, flattened
 * from the per-first-character buckets into a single array.
 *
 * @return array Prefix => list of directories; empty when nothing is registered
 */
public function getPrefixes()
{
    if (empty($this->prefixesPsr0)) {
        return array();
    }

    $buckets = array_values($this->prefixesPsr0);

    return call_user_func_array('array_merge', $buckets);
}
/**
* Returns the registered PSR-4 namespace prefixes.
*
* @return array Map of namespace prefix to its list of base directories
*/
public function getPrefixesPsr4()
{
return $this->prefixDirsPsr4;
}
/**
* Returns the PSR-0 fallback directories, i.e. the directories that were
* registered with an empty prefix.
*
* @return array List of directories
*/
public function getFallbackDirs()
{
return $this->fallbackDirsPsr0;
}
/**
* Returns the PSR-4 fallback directories, i.e. the directories that were
* registered with an empty prefix (root namespace).
*
* @return array List of directories
*/
public function getFallbackDirsPsr4()
{
return $this->fallbackDirsPsr4;
}
/**
* Returns the class map consulted first by findFile().
*
* @return array Map of class name to file path
*/
public function getClassMap()
{
return $this->classMap;
}
/**
 * Adds entries to the class map. When a class name is already mapped, the
 * entry from $classMap replaces the existing one.
 *
 * @param array $classMap Class to filename map
 */
public function addClassMap(array $classMap)
{
    $this->classMap = $this->classMap
        ? array_merge($this->classMap, $classMap)
        : $classMap;
}
/**
 * Registers PSR-0 directories for a prefix, adding them before or after
 * the directories already known for that prefix. An empty prefix targets
 * the PSR-0 fallback directories instead.
 *
 * @param string       $prefix  The prefix
 * @param array|string $paths   The PSR-0 root directories
 * @param bool         $prepend Whether to prepend the directories
 */
public function add($prefix, $paths, $prepend = false)
{
    $dirs = (array) $paths;

    if (!$prefix) {
        $this->fallbackDirsPsr0 = $prepend
            ? array_merge($dirs, $this->fallbackDirsPsr0)
            : array_merge($this->fallbackDirsPsr0, $dirs);

        return;
    }

    // Prefixes are bucketed by their first character for faster lookup.
    $bucket = $prefix[0];

    if (isset($this->prefixesPsr0[$bucket][$prefix])) {
        $known = $this->prefixesPsr0[$bucket][$prefix];
        $dirs = $prepend
            ? array_merge($dirs, $known)
            : array_merge($known, $dirs);
    }

    $this->prefixesPsr0[$bucket][$prefix] = $dirs;
}
/**
 * Registers PSR-4 base directories for a namespace, adding them before or
 * after the directories already known for that namespace. An empty prefix
 * targets the PSR-4 fallback directories (root namespace) instead.
 *
 * @param string       $prefix  The prefix/namespace, with trailing '\\'
 * @param array|string $paths   The PSR-4 base directories
 * @param bool         $prepend Whether to prepend the directories
 *
 * @throws \InvalidArgumentException When a new non-empty prefix does not end with '\\'
 */
public function addPsr4($prefix, $paths, $prepend = false)
{
    $dirs = (array) $paths;

    // Root namespace: extend the fallback directories.
    if (!$prefix) {
        $this->fallbackDirsPsr4 = $prepend
            ? array_merge($dirs, $this->fallbackDirsPsr4)
            : array_merge($this->fallbackDirsPsr4, $dirs);

        return;
    }

    // Already registered namespace: extend its directory list.
    if (isset($this->prefixDirsPsr4[$prefix])) {
        $known = $this->prefixDirsPsr4[$prefix];
        $this->prefixDirsPsr4[$prefix] = $prepend
            ? array_merge($dirs, $known)
            : array_merge($known, $dirs);

        return;
    }

    // New namespace: validate it and record its length for the lookup.
    $length = strlen($prefix);
    if ('\\' !== $prefix[$length - 1]) {
        throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
    }

    $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
    $this->prefixDirsPsr4[$prefix] = $dirs;
}
/**
 * Replaces the PSR-0 directories registered for a prefix. An empty prefix
 * replaces the PSR-0 fallback directories.
 *
 * @param string       $prefix The prefix
 * @param array|string $paths  The PSR-0 base directories
 */
public function set($prefix, $paths)
{
    $dirs = (array) $paths;

    if ($prefix) {
        $this->prefixesPsr0[$prefix[0]][$prefix] = $dirs;

        return;
    }

    $this->fallbackDirsPsr0 = $dirs;
}
/**
 * Replaces the PSR-4 base directories registered for a namespace. An empty
 * prefix replaces the PSR-4 fallback directories.
 *
 * @param string       $prefix The prefix/namespace, with trailing '\\'
 * @param array|string $paths  The PSR-4 base directories
 *
 * @throws \InvalidArgumentException When a non-empty prefix does not end with '\\'
 */
public function setPsr4($prefix, $paths)
{
    if (!$prefix) {
        $this->fallbackDirsPsr4 = (array) $paths;

        return;
    }

    $prefixLength = strlen($prefix);
    $lastChar = $prefix[$prefixLength - 1];
    if ('\\' !== $lastChar) {
        throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
    }

    $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $prefixLength;
    $this->prefixDirsPsr4[$prefix] = (array) $paths;
}
/**
* Turns searching the PHP include path for class files on or off.
*
* When enabled, findFileWithExtension() tries stream_resolve_include_path()
* on the PSR-0 style path as its last lookup step.
*
* @param bool $useIncludePath
*/
public function setUseIncludePath($useIncludePath)
{
$this->useIncludePath = $useIncludePath;
}
/**
* Tells whether the autoloader also searches the PHP include path for
* class files.
*
* @return bool The value last passed to setUseIncludePath(), false by default
*/
public function getUseIncludePath()
{
return $this->useIncludePath;
}
/**
* Turns off searching the prefix and fallback directories for classes
* that have not been registered with the class map.
*
* When enabled, findFile() returns false for every class that is missing
* from the class map without doing any further lookup.
*
* @param bool $classMapAuthoritative
*/
public function setClassMapAuthoritative($classMapAuthoritative)
{
$this->classMapAuthoritative = $classMapAuthoritative;
}
/**
* Tells whether class lookup fails immediately for classes that are not
* in the class map.
*
* @return bool The value last passed to setClassMapAuthoritative(), false by default
*/
public function isClassMapAuthoritative()
{
return $this->classMapAuthoritative;
}
/**
 * Sets the APCu key prefix used to cache found/not-found classes.
 *
 * The prefix is only kept when the apcu_fetch() function exists and the
 * 'apc.enabled' ini setting evaluates to true; otherwise it is reset to
 * null, which disables the cache.
 *
 * @param string|null $apcuPrefix
 */
public function setApcuPrefix($apcuPrefix)
{
    $apcuUsable = function_exists('apcu_fetch')
        && filter_var(ini_get('apc.enabled'), FILTER_VALIDATE_BOOLEAN);

    $this->apcuPrefix = $apcuUsable ? $apcuPrefix : null;
}
/**
* Returns the APCu key prefix in use, or null if APCu caching is not enabled.
*
* findFile() prepends this prefix to the class name to build the cache key.
*
* @return string|null
*/
public function getApcuPrefix()
{
return $this->apcuPrefix;
}
/**
* Registers this instance's loadClass() method as an SPL autoloader.
*
* spl_autoload_register() is called with its second argument ($throw) set
* to true.
*
* @param bool $prepend Whether to prepend the autoloader or not
*/
public function register($prepend = false)
{
spl_autoload_register(array($this, 'loadClass'), true, $prepend);
}
/**
* Unregisters this instance as an autoloader by removing its loadClass()
* callback from the SPL autoload stack.
*/
public function unregister()
{
spl_autoload_unregister(array($this, 'loadClass'));
}
/**
 * Autoload callback: locates the file that defines the given class or
 * interface and includes it.
 *
 * @param string $class The name of the class
 * @return bool|null True if a file was found and included, null otherwise
 */
public function loadClass($class)
{
    $file = $this->findFile($class);
    if (!$file) {
        return null;
    }

    includeFile($file);

    return true;
}
/**
* Finds the path to the file where the class is defined.
*
* Lookup order: class map, in-memory list of known-missing classes, APCu
* cache (when a prefix is set), then the PSR-4/PSR-0 search performed by
* findFileWithExtension().
*
* @param string $class The name of the class
*
* @return string|false The path if found, false otherwise
*/
public function findFile($class)
{
// class map lookup
if (isset($this->classMap[$class])) {
return $this->classMap[$class];
}
// Stop here when the class map is authoritative or an earlier lookup in
// this request already failed for this class.
if ($this->classMapAuthoritative || isset($this->missingClasses[$class])) {
return false;
}
// A cache hit is returned as stored; the stored value may be false for a
// class that was not found when the entry was written (see apcu_add below).
if (null !== $this->apcuPrefix) {
$file = apcu_fetch($this->apcuPrefix.$class, $hit);
if ($hit) {
return $file;
}
}
$file = $this->findFileWithExtension($class, '.php');
// Search for Hack files if we are running on HHVM
if (false === $file && defined('HHVM_VERSION')) {
$file = $this->findFileWithExtension($class, '.hh');
}
// Cache the outcome, including a false result for a missing class.
if (null !== $this->apcuPrefix) {
apcu_add($this->apcuPrefix.$class, $file);
}
if (false === $file) {
// Remember that this class does not exist.
$this->missingClasses[$class] = true;
}
return $file;
}
/**
* Resolves a class name to an existing file using the given file extension.
*
* Lookup order: PSR-4 prefixes (longest matching namespace first), PSR-4
* fallback directories, PSR-0 prefixes, PSR-0 fallback directories and,
* when enabled via setUseIncludePath(), the PHP include path.
*
* @param string $class The name of the class
* @param string $ext File extension appended to the class path (e.g. '.php')
*
* @return string|false Path of the first matching file, false if none was found
*/
private function findFileWithExtension($class, $ext)
{
// PSR-4 lookup
$logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext;
$first = $class[0];
if (isset($this->prefixLengthsPsr4[$first])) {
$subPath = $class;
// Strip one namespace segment per iteration, so longer (more specific)
// prefixes are tried before shorter ones.
while (false !== $lastPos = strrpos($subPath, '\\')) {
$subPath = substr($subPath, 0, $lastPos);
$search = $subPath . '\\';
if (isset($this->prefixDirsPsr4[$search])) {
// Part of the path that follows the matched prefix.
$pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1);
foreach ($this->prefixDirsPsr4[$search] as $dir) {
if (file_exists($file = $dir . $pathEnd)) {
return $file;
}
}
}
}
}
// PSR-4 fallback dirs
foreach ($this->fallbackDirsPsr4 as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr4)) {
return $file;
}
}
// PSR-0 lookup
if (false !== $pos = strrpos($class, '\\')) {
// namespaced class name
// Only underscores in the class name part (after the last namespace
// separator) are turned into directory separators.
$logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1)
. strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR);
} else {
// PEAR-like class name
$logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext;
}
if (isset($this->prefixesPsr0[$first])) {
foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) {
if (0 === strpos($class, $prefix)) {
foreach ($dirs as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
return $file;
}
}
}
}
}
// PSR-0 fallback dirs
foreach ($this->fallbackDirsPsr0 as $dir) {
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
return $file;
}
}
// PSR-0 include paths.
if ($this->useIncludePath && $file = stream_resolve_include_path($logicalPathPsr0)) {
return $file;
}
return false;
}
}
/**
* Scope isolated include.
*
* Prevents access to $this/self from included files, because the include
* runs inside a plain function instead of a ClassLoader method.
*
* @param string $file Path of the file to include
*/
function includeFile($file)
{
include $file;
}

21
tesseract/vendor/composer/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
Copyright (c) Nils Adermann, Jordi Boggiano
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -0,0 +1,9 @@
<?php
// autoload_classmap.php @generated by Composer
// Directory that contains this file's directory (the vendor directory).
$vendorDir = dirname(dirname(__FILE__));
// Parent directory of the vendor directory.
$baseDir = dirname($vendorDir);
// Class map (class name => file); no entries are registered.
return array(
);

View file

@ -0,0 +1,9 @@
<?php
// autoload_namespaces.php @generated by Composer
// Directory that contains this file's directory (the vendor directory).
$vendorDir = dirname(dirname(__FILE__));
// Parent directory of the vendor directory.
$baseDir = dirname($vendorDir);
// Namespace map handed to ClassLoader::set() by autoload_real.php; no entries are registered.
return array(
);

View file

@ -0,0 +1,10 @@
<?php
// autoload_psr4.php @generated by Composer
// Directory that contains this file's directory (the vendor directory).
$vendorDir = dirname(dirname(__FILE__));
// Parent directory of the vendor directory.
$baseDir = dirname($vendorDir);
// PSR-4 map (namespace prefix => base directories) handed to ClassLoader::setPsr4() by autoload_real.php.
return array(
'thiagoalessio\\TesseractOCR\\' => array($vendorDir . '/thiagoalessio/tesseract_ocr/src'),
);

View file

@ -0,0 +1,55 @@
<?php
// autoload_real.php @generated by Composer
/**
* Composer-generated bootstrap that builds, configures and registers the
* class loader of this vendor directory.
*/
class ComposerAutoloaderInit695d781792f754383aa61632167d066e
{
// Loader instance cached by getLoader() so that repeated calls reuse it.
private static $loader;
/**
* Temporary autoload callback that only loads Composer's own ClassLoader class.
*
* @param string $class Name of the class being autoloaded
*/
public static function loadClassLoader($class)
{
if ('Composer\Autoload\ClassLoader' === $class) {
require __DIR__ . '/ClassLoader.php';
}
}
/**
* Builds, configures and registers the class loader on the first call and
* returns the cached instance on every later call.
*
* @return \Composer\Autoload\ClassLoader
*/
public static function getLoader()
{
if (null !== self::$loader) {
return self::$loader;
}
// loadClassLoader() is registered only for as long as it takes to
// instantiate ClassLoader and is removed again right after.
spl_autoload_register(array('ComposerAutoloaderInit695d781792f754383aa61632167d066e', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
spl_autoload_unregister(array('ComposerAutoloaderInit695d781792f754383aa61632167d066e', 'loadClassLoader'));
// The static initializer is used on PHP >= 5.6, except on HHVM or when
// zend_loader_file_encoded() reports an encoded file.
$useStaticLoader = PHP_VERSION_ID >= 50600 && !defined('HHVM_VERSION') && (!function_exists('zend_loader_file_encoded') || !zend_loader_file_encoded());
if ($useStaticLoader) {
require_once __DIR__ . '/autoload_static.php';
call_user_func(\Composer\Autoload\ComposerStaticInit695d781792f754383aa61632167d066e::getInitializer($loader));
} else {
// Otherwise the loader is filled from the map files that return plain arrays.
$map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) {
$loader->set($namespace, $path);
}
$map = require __DIR__ . '/autoload_psr4.php';
foreach ($map as $namespace => $path) {
$loader->setPsr4($namespace, $path);
}
$classMap = require __DIR__ . '/autoload_classmap.php';
if ($classMap) {
$loader->addClassMap($classMap);
}
}
// Prepend the loader to the autoload stack.
$loader->register(true);
return $loader;
}
}

View file

@ -0,0 +1,31 @@
<?php
// autoload_static.php @generated by Composer
namespace Composer\Autoload;
/**
* Composer-generated static autoload maps, plus an initializer that copies
* them into a ClassLoader instance.
*/
class ComposerStaticInit695d781792f754383aa61632167d066e
{
// PSR-4 prefixes grouped by their first character, each mapped to the
// length of the prefix string.
public static $prefixLengthsPsr4 = array (
't' =>
array (
'thiagoalessio\\TesseractOCR\\' => 27,
),
);
// PSR-4 prefixes mapped to their list of base directories.
public static $prefixDirsPsr4 = array (
'thiagoalessio\\TesseractOCR\\' =>
array (
0 => __DIR__ . '/..' . '/thiagoalessio/tesseract_ocr/src',
),
);
/**
* Returns a closure that copies the static PSR-4 maps into the given loader.
*
* The closure is bound to the ClassLoader class scope so that it can write
* the loader's private properties.
*
* @param ClassLoader $loader Loader to initialize
*
* @return \Closure
*/
public static function getInitializer(ClassLoader $loader)
{
return \Closure::bind(function () use ($loader) {
$loader->prefixLengthsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixLengthsPsr4;
$loader->prefixDirsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixDirsPsr4;
}, null, ClassLoader::class);
}
}

View file

@ -0,0 +1,48 @@
[
{
"name": "thiagoalessio/tesseract_ocr",
"version": "2.13.0",
"version_normalized": "2.13.0.0",
"source": {
"type": "git",
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"shasum": ""
},
"require": {
"php": "^5.3 || ^7.0 || ^8.0"
},
"require-dev": {
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
},
"time": "2023-10-05T21:14:48+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "thiagoalessio",
"email": "thiagoalessio@me.com"
}
],
"description": "A wrapper to work with Tesseract OCR inside PHP.",
"keywords": [
"OCR",
"Tesseract",
"text recognition"
]
}
]

View file

@ -0,0 +1,14 @@
---
build: false
install:
- ps: Set-Service wuauserv -StartupType Manual
- choco install php
- choco install capture2text --version 3.9
- choco install composer
- refreshenv
- cd %APPVEYOR_BUILD_FOLDER%
- composer install
test_script:
- php tests\run.php unit e2e

View file

@ -0,0 +1,19 @@
Copyright (c) 2012-2021 Thiago Alessio Pereira
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,508 @@
<img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/logo.png" alt="Tesseract OCR for PHP" align="right" width="320px"/>
# Tesseract OCR for PHP
A wrapper to work with Tesseract OCR inside PHP.
[![CI][ci_badge]][ci]
[![AppVeyor][appveyor_badge]][appveyor]
[![Codacy][codacy_badge]][codacy]
[![Test Coverage][test_coverage_badge]][test_coverage]
<br/>
[![Latest Stable Version][stable_version_badge]][packagist]
[![Total Downloads][total_downloads_badge]][packagist]
[![Monthly Downloads][monthly_downloads_badge]][packagist]
## Installation
Via [Composer][]:
$ composer require thiagoalessio/tesseract_ocr
:bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.**
<br/>
### ![][windows_icon] Note for Windows users
There are [many ways][tesseract_installation_on_windows] to install
[Tesseract OCR][] on your system, but if you just want something quick to
get up and running, I recommend installing the [Capture2Text][] package with
[Chocolatey][].
choco install capture2text --version 3.9
:warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary.
<br/>
### ![][macos_icon] Note for macOS users
With [MacPorts][] you can install support for individual languages, like so:
$ sudo port install tesseract-<langcode>
But that is not possible with [Homebrew][]. It comes only with **English** support
by default, so if you intend to use it for other languages, the quickest solution
is to install them all:
$ brew install tesseract tesseract-lang
<br/>
## Usage
### Basic usage
<img align="right" width="50%" title="The quick brown fox jumps over the lazy dog." src="./tests/EndToEnd/images/text.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('text.png'))
->run();
```
```
The quick brown fox
jumps over
the lazy dog.
```
<br/>
### Other languages
<img align="right" width="50%" title="Bülowstraße" src="./tests/EndToEnd/images/german.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('german.png'))
->lang('deu')
->run();
```
```
Bülowstraße
```
<br/>
### Multiple languages
<img align="right" width="50%" title="I eat すし y Pollo" src="./tests/EndToEnd/images/mixed-languages.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('mixed-languages.png'))
->lang('eng', 'jpn', 'spa')
->run();
```
```
I eat すし y Pollo
```
<br/>
### Inducing recognition
<img align="right" width="50%" title="8055" src="./tests/EndToEnd/images/8055.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('8055.png'))
->allowlist(range('A', 'Z'))
->run();
```
```
BOSS
```
<br/>
### Breaking CAPTCHAs
Yes, I know some of you might want to use this library for the *noble* purpose
of breaking CAPTCHAs, so please take a look at this comment:
<https://github.com/thiagoalessio/tesseract-ocr-for-php/issues/91#issuecomment-342290510>
## API
### run
Executes a `tesseract` command, optionally receiving an integer as `timeout`,
in case you experience stalled tesseract processes.
```php
$ocr = new TesseractOCR();
$ocr->run();
```
```php
$ocr = new TesseractOCR();
$timeout = 500;
$ocr->run($timeout);
```
### image
Define the path of an image to be recognized by `tesseract`.
```php
$ocr = new TesseractOCR();
$ocr->image('/path/to/image.png');
$ocr->run();
```
### imageData
Set the image to be recognized by `tesseract` from a string, with its size.
This can be useful when dealing with files that are already loaded in memory.
You can easily retrieve the image data and size of an image object:
```php
//Using Imagick
$data = $img->getImageBlob();
$size = $img->getImageLength();
//Using GD
ob_start();
// Note that you can use any format supported by tesseract
imagepng($img, null, 0);
$size = ob_get_length();
$data = ob_get_clean();
$ocr = new TesseractOCR();
$ocr->imageData($data, $size);
$ocr->run();
```
### executable
Define a custom location of the `tesseract` executable,
if for any reason it is not present in the `$PATH`.
```php
echo (new TesseractOCR('img.png'))
->executable('/path/to/tesseract')
->run();
```
### version
Returns the current version of `tesseract`.
```php
echo (new TesseractOCR())->version();
```
### availableLanguages
Returns a list of available languages/scripts.
```php
foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang;
```
__More info:__ <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages-and-scripts>
### tessdataDir
Specify a custom location for the tessdata directory.
```php
echo (new TesseractOCR('img.png'))
->tessdataDir('/path')
->run();
```
### userWords
Specify the location of user words file.
This is a plain text file containing a list of words that you want to be
considered as normal dictionary words by `tesseract`.
Useful when dealing with contents that contain technical terminology, jargon,
etc.
```
$ cat /path/to/user-words.txt
foo
bar
```
```php
echo (new TesseractOCR('img.png'))
->userWords('/path/to/user-words.txt')
->run();
```
### userPatterns
Specify the location of user patterns file.
If the contents you are dealing with have known patterns, this option can
greatly improve tesseract's recognition accuracy.
```
$ cat /path/to/user-patterns.txt
1-\d\d\d-GOOG-441
www.\n\\\*.com
```
```php
echo (new TesseractOCR('img.png'))
->userPatterns('/path/to/user-patterns.txt')
->run();
```
### lang
Define one or more languages to be used during the recognition.
A complete list of available languages can be found at:
<https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>
__Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')`
for proper recognition of Chinese.
```php
echo (new TesseractOCR('img.png'))
->lang('lang1', 'lang2', 'lang3')
->run();
```
### psm
Specify the Page Segmentation Method, which instructs `tesseract` how to
interpret the given image.
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality#page-segmentation-method>
```php
echo (new TesseractOCR('img.png'))
->psm(6)
->run();
```
### oem
Specify the OCR Engine Mode. (see `tesseract --help-oem`)
```php
echo (new TesseractOCR('img.png'))
->oem(2)
->run();
```
### dpi
Specify the image DPI. It is useful if your image does not contain this information in its metadata.
```php
echo (new TesseractOCR('img.png'))
->dpi(300)
->run();
```
### allowlist
This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`.
```php
echo (new TesseractOCR('img.png'))
->allowlist(range('a', 'z'), range(0, 9), '-_@')
->run();
```
### configFile
Specify a config file to be used. It can either be the path to your own
config file or the name of one of the predefined config files:
<https://github.com/tesseract-ocr/tesseract/tree/master/tessdata/configs>
```php
echo (new TesseractOCR('img.png'))
->configFile('hocr')
->run();
```
### setOutputFile
Specify an output file to be used. Be aware: if you set an output file, then
the option `withoutTempFiles` is ignored.
Tempfiles are written (and deleted) even if `withoutTempFiles = true`.
In combination with `configFile` you are able to get the `hocr`, `tsv` or
`pdf` files.
```php
echo (new TesseractOCR('img.png'))
->configFile('pdf')
->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf')
->run();
```
### digits
Shortcut for `->configFile('digits')`.
```php
echo (new TesseractOCR('img.png'))
->digits()
->run();
```
### hocr
Shortcut for `->configFile('hocr')`.
```php
echo (new TesseractOCR('img.png'))
->hocr()
->run();
```
### pdf
Shortcut for `->configFile('pdf')`.
```php
echo (new TesseractOCR('img.png'))
->pdf()
->run();
```
### quiet
Shortcut for `->configFile('quiet')`.
```php
echo (new TesseractOCR('img.png'))
->quiet()
->run();
```
### tsv
Shortcut for `->configFile('tsv')`.
```php
echo (new TesseractOCR('img.png'))
->tsv()
->run();
```
### txt
Shortcut for `->configFile('txt')`.
```php
echo (new TesseractOCR('img.png'))
->txt()
->run();
```
### tempDir
Define a custom directory to store temporary files generated by tesseract.
Make sure the directory actually exists and the user running `php` is allowed
to write in there.
```php
echo (new TesseractOCR('img.png'))
->tempDir('./my/custom/temp/dir')
->run();
```
### withoutTempFiles
Specify that `tesseract` should output the recognized text without writing to temporary files.
The data is gathered from the standard output of `tesseract` instead.
```php
echo (new TesseractOCR('img.png'))
->withoutTempFiles()
->run();
```
### Other options
Any configuration option offered by Tesseract can be used like that:
```php
echo (new TesseractOCR('img.png'))
->config('config_var', 'value')
->config('other_config_var', 'other value')
->run();
```
Or like that:
```php
echo (new TesseractOCR('img.png'))
->configVar('value')
->otherConfigVar('other value')
->run();
```
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>
### Thread-limit
Sometimes, it may be useful to limit the number of threads that tesseract is
allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)).
Set the maximum number of threads with the `threadLimit` method before calling `run`:
```php
echo (new TesseractOCR('img.png'))
->threadLimit(1)
->run();
```
## How to contribute
You can contribute to this project by:
* Opening an [Issue][] if you found a bug or wish to propose a new feature;
* Placing a [Pull Request][] with code that fixes a bug, corrects missing/wrong
documentation or implements a new feature;
Just make sure you take a look at our [Code of Conduct][] and [Contributing][]
instructions.
## License
tesseract-ocr-for-php is released under the [MIT License][].
<h2></h2><p align="center"><sub>Made with <sub><a href="#"><img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/heart.svg" alt="love" width="14px"/></a></sub> in Berlin</sub></p>
[ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main
[ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI
[appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true
[appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main
[codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2
[codacy]: https://www.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard
[test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf
[test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php
[stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg
[packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr
[total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg
[monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg
[Tesseract OCR]: https://github.com/tesseract-ocr/tesseract
[Composer]: http://getcomposer.org/
[windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg
[macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg
[tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows
[Capture2Text]: https://chocolatey.org/packages/capture2text
[Chocolatey]: https://chocolatey.org
[MacPorts]: https://www.macports.org
[Homebrew]: https://brew.sh
[@daijiale]: https://github.com/daijiale
[HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output
[TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github
[Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues
[Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls
[Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md
[Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md
[MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE

View file

@ -0,0 +1,4 @@
# Path-prefix rewrite rules in "before::after" form; every rule here has an empty
# right-hand side.
# NOTE(review): this looks like a Codecov `fixes` section that strips the CI checkout
# root (Linux, macOS and Windows runner paths) from reported file names; confirm
# against the CI setup.
fixes:
- "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
- "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
- "C:\\projects\\tesseract-ocr-for-php\\::"

View file

@ -0,0 +1,35 @@
{
"name": "thiagoalessio/tesseract_ocr",
"description": "A wrapper to work with Tesseract OCR inside PHP.",
"version": "2.13.0",
"type": "library",
"keywords": ["Tesseract", "OCR", "text recognition"],
"license": "MIT",
"authors": [
{
"name": "thiagoalessio",
"email": "thiagoalessio@me.com"
}
],
"support": {
"issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
"irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
"source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
},
"require": {
"php": "^5.3 || ^7.0 || ^8.0"
},
"require-dev": {
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
},
"autoload": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\Tests\\": "tests/"
}
}
}

View file

@ -0,0 +1,80 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Assembles the command line used to run the tesseract executable.
 *
 * The public properties are plain settings that are read when the command is
 * rendered to a string. The only processes this class starts itself are the
 * `--version` and `--list-langs` lookups.
 */
class Command
{
    /** @var string Name or path of the tesseract executable. */
    public $executable = 'tesseract';

    /** @var bool When false, "-" is emitted as the input argument instead of the image path. */
    public $useFileAsInput = true;

    /** @var bool When false, "-" is emitted as the output argument instead of a file path. */
    public $useFileAsOutput = true;

    /** @var array Extra arguments: plain strings, or callables that receive the tesseract version. */
    public $options = array();

    /** @var string|null Config file name, appended verbatim as the last argument. */
    public $configFile;

    /** @var string|null Directory for generated output files; the system temp dir when unset. */
    public $tempDir;

    /** @var mixed When truthy, exported as OMP_THREAD_LIMIT in front of the command. */
    public $threadLimit;

    /** @var mixed Image argument; escaped and used as the input path when $useFileAsInput is true. */
    public $image;

    /** @var mixed Size of the image data; not read by this class itself. */
    public $imageSize;

    /** @var string|null Output path without extension; generated lazily by getOutputFile(). */
    private $outputFile;

    /**
     * @param mixed       $image      Value stored in $image.
     * @param string|null $outputFile Output path without extension, or null to generate one on demand.
     */
    public function __construct($image=null, $outputFile=null)
    {
        $this->image = $image;
        $this->outputFile = $outputFile;
    }

    /**
     * Returns the command line; identical to casting the object to string.
     *
     * @return string
     */
    public function build() { return "$this"; }

    /**
     * Renders the full command line.
     *
     * Order of the parts: optional OMP_THREAD_LIMIT assignment, executable,
     * input argument, output argument, every entry of $options, config file.
     *
     * @return string
     */
    public function __toString()
    {
        $cmd = array();
        if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
        $cmd[] = self::escape($this->executable);
        $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-";
        $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-";

        // The version lookup spawns a process, so it is only performed (once)
        // when a callable option actually needs it.
        $version = null;
        $versionLoaded = false;
        foreach ($this->options as $option) {
            if (is_callable($option)) {
                if (!$versionLoaded) {
                    $version = $this->getTesseractVersion();
                    $versionLoaded = true;
                }
                $cmd[] = $option($version);
            } else {
                $cmd[] = "$option";
            }
        }

        if ($this->configFile) $cmd[] = $this->configFile;
        return join(' ', $cmd);
    }

    /**
     * Returns the output file path, generating a temp-file based one on first use.
     *
     * @param bool $withExt When true the extension is appended: the config file
     *                      name for hocr/tsv/pdf, otherwise "txt".
     * @return string
     */
    public function getOutputFile($withExt=true)
    {
        if (!$this->outputFile) {
            // tempnam() creates the file; only its base name is reused under getTempDir().
            $this->outputFile = $this->getTempDir()
                .DIRECTORY_SEPARATOR
                .basename(tempnam($this->getTempDir(), 'ocr'));
        }
        if (!$withExt) return $this->outputFile;

        $hasCustomExt = array('hocr', 'tsv', 'pdf');
        $ext = in_array($this->configFile, $hasCustomExt, true) ? $this->configFile : 'txt';
        return "{$this->outputFile}.{$ext}";
    }

    /**
     * @return string The configured temp directory, or the system default.
     */
    public function getTempDir()
    {
        return $this->tempDir ?: sys_get_temp_dir();
    }

    /**
     * Runs `<executable> --version` and returns the second space-separated
     * word of the first output line.
     *
     * @return string|null The version token, or null when the output is empty
     *                     or its first line has fewer than two words.
     */
    public function getTesseractVersion()
    {
        exec(self::escape($this->executable).' --version 2>&1', $output);
        if (empty($output)) return null;

        $outputParts = explode(' ', $output[0]);
        return isset($outputParts[1]) ? $outputParts[1] : null;
    }

    /**
     * Runs `<executable> --list-langs` and returns the sorted output lines,
     * with the first line (a header) removed.
     *
     * @return array
     */
    public function getAvailableLanguages()
    {
        exec(self::escape($this->executable) . ' --list-langs 2>&1', $output);
        array_shift($output);
        sort($output);
        return $output;
    }

    /**
     * Wraps a value in double quotes, backslash-escaping `$`, `"` and the
     * backtick; the backslash itself is also escaped except on Windows.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        $charlist = strtoupper(substr(PHP_OS, 0, 3)) === 'WIN' ? '$"`' : '$"\\`';
        return '"'.addcslashes($str, $charlist).'"';
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkTesseractVersion() when the installed
 * tesseract version is lower than the one a feature requires.
 */
class FeatureNotAvailableException extends TesseractOcrException {}

View file

@ -0,0 +1,120 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Static precondition and postcondition checks that turn low-level failures
 * into exceptions carrying a multi-line, human-readable message.
 *
 * Every check returns silently (null) on success and throws on failure.
 */
class FriendlyErrors
{
    /**
     * Ensures the image file exists.
     *
     * @param string $image Path to the image.
     * @throws ImageNotFoundException When the path does not exist.
     */
    public static function checkImagePath($image)
    {
        if (file_exists($image)) return;

        $currentDir = __DIR__;
        $msg = array();
        $msg[] = "Error! The image \"$image\" was not found.";
        $msg[] = '';
        $msg[] = "The current __DIR__ is $currentDir";
        $msg = join(PHP_EOL, $msg);

        throw new ImageNotFoundException($msg);
    }

    /**
     * Ensures the executable is either an existing file or resolvable by the
     * shell (`where.exe` on Windows, `type` elsewhere).
     *
     * @param string $executable Name or path of the executable.
     * @throws TesseractNotFoundException When the executable cannot be located.
     */
    public static function checkTesseractPresence($executable)
    {
        if (file_exists($executable)) return;

        $cmd = stripos(PHP_OS, 'win') === 0
            ? 'where.exe '.Command::escape($executable).' > NUL 2>&1'
            : 'type '.Command::escape($executable).' > /dev/null 2>&1';
        system($cmd, $exitCode);

        if ($exitCode == 0) return;

        $currentPath = getenv('PATH');
        $msg = array();
        $msg[] = "Error! The command \"$executable\" was not found.";
        $msg[] = '';
        $msg[] = 'Make sure you have Tesseract OCR installed on your system:';
        $msg[] = 'https://github.com/tesseract-ocr/tesseract';
        $msg[] = '';
        $msg[] = "The current \$PATH is $currentPath";
        $msg = join(PHP_EOL, $msg);

        throw new TesseractNotFoundException($msg);
    }

    /**
     * Ensures the command produced output: a non-empty output file when the
     * command writes to a file, otherwise a non-empty stdout string.
     *
     * @param object $command Command object (useFileAsOutput, getOutputFile(), string cast).
     * @param string $stdout  Captured standard output.
     * @param string $stderr  Captured standard error; appended to the message.
     * @throws UnsuccessfulCommandException When no output was produced.
     */
    public static function checkCommandExecution($command, $stdout, $stderr)
    {
        if ($command->useFileAsOutput) {
            $file = $command->getOutputFile();
            if (file_exists($file) && filesize($file) > 0) return;
        }

        if (!$command->useFileAsOutput && $stdout) {
            return;
        }

        $msg = array();
        $msg[] = 'Error! The command did not produce any output.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg[] = '';
        $msg[] = 'Returned message:';

        $arrayStderr = explode(PHP_EOL, (string) $stderr);
        // Drop only the empty element left by a trailing newline; when stderr
        // does not end with a newline its last line is real content.
        if (end($arrayStderr) === '') array_pop($arrayStderr);
        $msg = array_merge($msg, $arrayStderr);
        $msg = join(PHP_EOL, $msg);

        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the process handle is not FALSE, i.e. the process was created.
     *
     * @param mixed  $processHandle Process handle, or FALSE on failure.
     * @param string $command       Command line, included in the message.
     * @throws UnsuccessfulCommandException When the handle is FALSE.
     */
    public static function checkProcessCreation($processHandle, $command)
    {
        if ($processHandle !== FALSE) return;

        $msg = array();
        $msg[] = 'Error! The command could not be launched.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);

        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the tesseract version reported by the command is at least $expected.
     *
     * @param string $expected Minimum version.
     * @param string $action   Feature description used in the message.
     * @param object $command  Command object providing getTesseractVersion().
     * @throws FeatureNotAvailableException When the version is lower or unknown.
     */
    public static function checkTesseractVersion($expected, $action, $command)
    {
        // Strip a leading "v" without indexing into a possibly empty or null value.
        $actual = preg_replace('/^v/', '', (string) $command->getTesseractVersion());

        if (version_compare($actual, $expected, ">=")) return;

        $msg = array();
        $msg[] = "Error! $action is not available this tesseract version";
        $msg[] = "Required version is $expected, actual version is $actual";
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);

        throw new FeatureNotAvailableException($msg);
    }

    /**
     * Ensures $path can be written: creates its parent directory when missing,
     * then checks that the directory (and the file, if it exists) is writable.
     *
     * @param string $path Target file path.
     * @throws NoWritePermissionsForOutputFile When the directory or file is not writable.
     */
    public static function checkWritePermissions($path)
    {
        $directory = dirname($path);
        // Recursive so that nested, not-yet-existing directories are created too.
        if (!is_dir($directory)) mkdir($directory, 0777, true);

        $writableDirectory = is_writable($directory);
        $writableFile = true;
        if (file_exists($path)) $writableFile = is_writable($path);
        if ($writableFile && $writableDirectory) return;

        $msg = array();
        $msg[] = "Error! No permission to write to $path";
        $msg[] = "Make sure you have the right outputFile and permissions "
            ."to write to the folder";
        $msg[] = '';
        $msg = join(PHP_EOL, $msg);

        throw new NoWritePermissionsForOutputFile($msg);
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkImagePath() when the image path does not exist.
 */
class ImageNotFoundException extends TesseractOcrException {}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkWritePermissions() when the output file or
 * its directory is not writable.
 */
class NoWritePermissionsForOutputFile extends TesseractOcrException {}

View file

@ -0,0 +1,79 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Factories for command-line option fragments.
 *
 * Every factory returns a closure that produces the option string when
 * invoked. Closures that depend on the installed tesseract version take it as
 * their single argument.
 */
class Option
{
    /**
     * Page segmentation mode: "--psm N" for version 4 and later, "-psm N" before.
     *
     * @param mixed $psm
     * @return \Closure function($version): string
     */
    public static function psm($psm)
    {
        return function($version) use ($psm) {
            $version = preg_replace('/^v/', '', $version);
            return (version_compare($version, '4', '>=') ? '-' : '')."-psm $psm";
        };
    }

    /**
     * "--oem N"; the closure throws when the version is below 3.05.
     *
     * @param mixed $oem
     * @return \Closure function($version): string
     */
    public static function oem($oem)
    {
        return function($version) use ($oem) {
            Option::checkMinVersion('3.05', $version, 'oem');
            return "--oem $oem";
        };
    }

    /**
     * "--dpi N".
     *
     * @param mixed $dpi
     * @return \Closure function(): string
     */
    public static function dpi($dpi)
    {
        return function() use ($dpi) {
            return "--dpi $dpi";
        };
    }

    /**
     * "--user-words <quoted path>"; the closure throws when the version is below 3.04.
     *
     * @param string $path
     * @return \Closure function($version): string
     */
    public static function userWords($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-words');
            return '--user-words "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * "--user-patterns <quoted path>"; the closure throws when the version is below 3.04.
     *
     * @param string $path
     * @return \Closure function($version): string
     */
    public static function userPatterns($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-patterns');
            return '--user-patterns "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * "--tessdata-dir <quoted path>".
     *
     * @param string $path
     * @return \Closure function(): string
     */
    public static function tessdataDir($path)
    {
        return function() use ($path) {
            return '--tessdata-dir "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * "-l a+b+c" built from all arguments passed to this method.
     *
     * @return \Closure function(): string
     */
    public static function lang()
    {
        $languages = func_get_args();
        return function() use ($languages) {
            return '-l '.join('+', $languages);
        };
    }

    /**
     * '-c "name=value"', with the variable name converted from camelCase to
     * snake_case.
     *
     * @param string $var   Variable name, camelCase or already snake_case.
     * @param mixed  $value Value, concatenated as a string.
     * @return \Closure function(): string
     */
    public static function config($var, $value)
    {
        return function() use($var, $value) {
            $snakeCase = function($str) {
                // Prefix each run of capitals with "_" and keep the whole run;
                // capturing a single letter inside a repetition would drop all
                // but the last capital of a run such as "DAWG".
                return strtolower(preg_replace('/[A-Z]+/', '_$0', $str));
            };
            $pair = $snakeCase($var).'='.$value;
            return '-c "'.addcslashes($pair, '\\"').'"';
        };
    }

    /**
     * Throws when $currVersion is lower than $minVersion. A leading "v" is
     * ignored on both.
     *
     * @param string $minVersion  Minimum required version.
     * @param string $currVersion Installed version.
     * @param string $option      Option name used in the message.
     * @throws \Exception When the installed version is too old.
     */
    public static function checkMinVersion($minVersion, $currVersion, $option)
    {
        $minVersion = preg_replace('/^v/', '', $minVersion);
        $currVersion = preg_replace('/^v/', '', $currVersion);
        if (!version_compare($currVersion, $minVersion, '<')) return;

        $msg = "$option option is only available on Tesseract $minVersion or later.";
        $msg.= PHP_EOL."Your version of Tesseract is $currVersion";
        throw new \Exception($msg);
    }
}

View file

@ -0,0 +1,83 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Thin wrapper around a child process started with proc_open(), exposing its
 * stdin for writing and collecting stdout/stderr until it exits or a timeout
 * elapses.
 */
class Process {

    /** @var resource|null Pipe connected to the child's stdin. */
    private $stdin;

    /** @var resource|null Pipe connected to the child's stdout (non-blocking). */
    private $stdout;

    /** @var resource|null Pipe connected to the child's stderr (non-blocking). */
    private $stderr;

    /** @var resource Process handle returned by proc_open(). */
    private $handle;

    /** @var float Time of construction, used as the reference point for timeouts. */
    private $startTime;

    /**
     * Starts the command with three pipes (stdin, stdout, stderr).
     *
     * @param string $command Command line to execute.
     * @throws UnsuccessfulCommandException Via FriendlyErrors when the process cannot be created.
     */
    public function __construct($command)
    {
        $this->startTime = microtime(true);
        $streamDescriptors = [
            array("pipe", "r"),
            array("pipe", "w"),
            array("pipe", "w")
        ];
        $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);

        // Verify the process exists before touching $pipes, which is not
        // populated when proc_open() fails.
        FriendlyErrors::checkProcessCreation($this->handle, $command);
        list($this->stdin, $this->stdout, $this->stderr) = $pipes;

        // This can avoid deadlock in some cases (when the stderr buffer is
        // filled up before writing to stdout and vice-versa).
        stream_set_blocking($this->stdout, 0);
        stream_set_blocking($this->stderr, 0);
    }

    /**
     * Writes $data to the child's stdin, retrying after partial writes.
     *
     * @param string $data Bytes to send.
     * @param int    $len  Expected number of bytes.
     * @return bool True when exactly $len bytes were written.
     */
    public function write($data, $len)
    {
        $total = 0;
        $size = strlen($data);
        while ($total < $size) {
            $res = fwrite($this->stdin, substr($data, $total));
            // false signals an error, 0 means no progress; stop in both cases.
            if (!$res) break;
            $total += $res;
        }
        return $total === $len;
    }

    /**
     * Polls the child until it stops running or the timeout elapses,
     * accumulating whatever it writes to stdout and stderr.
     *
     * @param int|float $timeout Seconds since construction; 0 disables the timeout.
     * @return array ["out" => string, "err" => string]
     */
    public function wait($timeout = 0)
    {
        $running = true;
        $data = ["out" => "", "err" => ""];
        while (($running === true) && !$this->hasTimedOut($timeout))
        {
            $data["out"] .= fread($this->stdout, 8192);
            $data["err"] .= fread($this->stderr, 8192);
            $procInfo = proc_get_status($this->handle);
            $running = $procInfo["running"];
            if ($running) {
                usleep(1000); // Sleep 1ms to yield CPU time
            }
        }

        // The child may exit with more than one chunk still buffered in the
        // pipes; collect the remainder so the output is not truncated.
        $data["out"] .= $this->drain($this->stdout);
        $data["err"] .= $this->drain($this->stderr);
        return $data;
    }

    /**
     * Closes all pipes and the process.
     *
     * @return int Value returned by proc_close().
     */
    public function close()
    {
        $this->closeStream($this->stdin);
        $this->closeStream($this->stdout);
        $this->closeStream($this->stderr);
        return proc_close($this->handle);
    }

    /**
     * Closes the child's stdin.
     */
    public function closeStdin()
    {
        $this->closeStream($this->stdin);
    }

    /**
     * @param int|float $timeout Seconds; 0 or less never times out.
     * @return bool True when more than $timeout seconds passed since construction.
     */
    private function hasTimedOut($timeout)
    {
        return (($timeout > 0) && ($this->startTime + $timeout < microtime(true)));
    }

    /**
     * Closes the stream if it is still set and clears the caller's reference.
     *
     * @param resource|null $stream
     */
    private function closeStream(&$stream)
    {
        if ($stream !== NULL)
        {
            fclose($stream);
            $stream = NULL;
        }
    }

    /**
     * Reads everything currently available from a non-blocking stream.
     *
     * @param resource|null $stream
     * @return string
     */
    private function drain($stream)
    {
        $buffer = '';
        while (is_resource($stream) && !feof($stream)) {
            $chunk = fread($stream, 8192);
            if ($chunk === false || $chunk === '') break;
            $buffer .= $chunk;
        }
        return $buffer;
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkTesseractPresence() when the tesseract
 * executable cannot be located.
 */
class TesseractNotFoundException extends TesseractOcrException {}

View file

@ -0,0 +1,181 @@
<?php namespace thiagoalessio\TesseractOCR;
use thiagoalessio\TesseractOCR\Command;
use thiagoalessio\TesseractOCR\Option;
use thiagoalessio\TesseractOCR\FriendlyErrors;
/**
 * Fluent front end: configures a Command, runs it through Process and returns
 * the recognised text.
 *
 * Unknown method calls are turned into options via __call(): config-file
 * names, Option factory names, or arbitrary `-c name=value` pairs.
 */
class TesseractOCR
{
    /** @var Command Command being configured; created in the constructor when none is given. */
    public $command;

    /** @var string|null Path the output file is moved to after a successful run. */
    private $outputFile = null;

    /**
     * @param mixed        $image   Image path; cast to string and stored on the command.
     * @param Command|null $command Command to configure; a new Command when omitted.
     */
    public function __construct($image=null, $command=null)
    {
        $this->command = $command ?: new Command;
        $this->image("$image");
    }

    /**
     * Executes the command and returns the trimmed text.
     *
     * @param int|float $timeout Seconds to wait for the process; 0 waits indefinitely.
     * @return string
     * @throws TesseractOcrException When one of the FriendlyErrors checks fails.
     */
    public function run($timeout = 0)
    {
        try {
            if ($this->outputFile !== null) {
                FriendlyErrors::checkWritePermissions($this->outputFile);
                $this->command->useFileAsOutput = true;
            }

            FriendlyErrors::checkTesseractPresence($this->command->executable);
            if ($this->command->useFileAsInput) {
                FriendlyErrors::checkImagePath($this->command->image);
            }

            $process = new Process("{$this->command}");

            if (!$this->command->useFileAsInput) {
                $process->write($this->command->image, $this->command->imageSize);
                $process->closeStdin();
            }
            $output = $process->wait($timeout);

            FriendlyErrors::checkCommandExecution($this->command, $output["out"], $output["err"]);
        }
        // TesseractOcrException extends \Exception, so everything that was
        // handled before still is. Catching the base class also removes the
        // temp files when another exception (e.g. from an Option version
        // check) interrupts the run; the exception is rethrown unchanged.
        catch (\Exception $e) {
            if ($this->command->useFileAsOutput) $this->cleanTempFiles();
            throw $e;
        }

        if ($this->command->useFileAsOutput) {
            $text = file_get_contents($this->command->getOutputFile());

            if ($this->outputFile !== null) {
                rename($this->command->getOutputFile(), $this->outputFile);
            }

            $this->cleanTempFiles();
        }
        else
            $text = $output["out"];

        return trim($text, " \t\n\r\0\x0A\x0B\x0C");
    }

    /**
     * Supplies raw image data to be written to the process' stdin instead of
     * an image file path.
     *
     * @param string $image Image bytes.
     * @param int    $size  Number of bytes in $image.
     * @return $this
     * @throws FeatureNotAvailableException When tesseract is older than 3.03-rc1.
     */
    public function imageData($image, $size)
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command);
        $this->command->useFileAsInput = false;
        $this->command->image = $image;
        $this->command->imageSize = $size;
        return $this;
    }

    /**
     * Reads the result from the process' stdout instead of an output file.
     *
     * @return $this
     * @throws FeatureNotAvailableException When tesseract is older than 3.03-rc1.
     */
    public function withoutTempFiles()
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command);
        $this->command->useFileAsOutput = false;
        return $this;
    }

    /**
     * @param string $image Path of the image to recognise.
     * @return $this
     */
    public function image($image)
    {
        $this->command->image = $image;
        return $this;
    }

    /**
     * @param string $executable Name or path of the tesseract executable.
     * @return $this
     * @throws TesseractNotFoundException When the executable cannot be located.
     */
    public function executable($executable)
    {
        FriendlyErrors::checkTesseractPresence($executable);
        $this->command->executable = $executable;
        return $this;
    }

    /**
     * @param string $configFile Config file name appended to the command.
     * @return $this
     */
    public function configFile($configFile)
    {
        $this->command->configFile = $configFile;
        return $this;
    }

    /**
     * @param string $tempDir Directory used for generated output files.
     * @return $this
     */
    public function tempDir($tempDir)
    {
        $this->command->tempDir = $tempDir;
        return $this;
    }

    /**
     * @param mixed $limit Value exported as OMP_THREAD_LIMIT for the command.
     * @return $this
     */
    public function threadLimit($limit)
    {
        $this->command->threadLimit = $limit;
        return $this;
    }

    // @deprecated
    public function format($fmt) { return $this->configFile($fmt); }

    /**
     * @param string $path Destination the output file is moved to after run().
     * @return $this
     */
    public function setOutputFile($path) {
        $this->outputFile = $path;
        return $this;
    }

    /**
     * Restricts recognition to the given characters. Accepts any number of
     * strings and/or arrays of strings, which are concatenated into the
     * `tessedit_char_whitelist` config value.
     *
     * @return $this
     */
    public function allowlist()
    {
        $concat = function ($arg) { return is_array($arg) ? join('', $arg) : $arg; };
        $allowlist = join('', array_map($concat, func_get_args()));
        $this->command->options[] = Option::config('tessedit_char_whitelist', $allowlist);
        return $this;
    }

    /**
     * Deprecated alias of allowlist(); raises an E_USER_NOTICE.
     *
     * @return $this
     */
    public function whitelist()
    {
        $warningMsg = 'Notice: whitelist is deprecated, use allowlist instead.';
        trigger_error($warningMsg, E_USER_NOTICE);

        $concat = function ($arg) { return is_array($arg) ? join('', $arg) : $arg; };
        $allowlist = join('', array_map($concat, func_get_args()));
        return $this->allowlist($allowlist);
    }

    /**
     * @return string|null Version token reported by the command.
     */
    public function version()
    {
        return $this->command->getTesseractVersion();
    }

    /**
     * @return array Languages reported by the command.
     */
    public function availableLanguages()
    {
        return $this->command->getAvailableLanguages();
    }

    /**
     * Dispatches unknown methods, in this order: config-file names, Option
     * factory methods, then a generic `-c method=firstArgument` config pair.
     *
     * @param string $method
     * @param array  $args
     * @return $this
     */
    public function __call($method, $args)
    {
        if ($this->isConfigFile($method)) return $this->configFile($method);
        if ($this->isOption($method)) {
            $option = $this->getOptionClassName().'::'.$method;
            $this->command->options[] = call_user_func_array($option, $args);
            return $this;
        }
        $arg = empty($args) ? null : $args[0];
        $this->command->options[] = Option::config($method, $arg);
        return $this;
    }

    /**
     * @param string $name
     * @return bool True when $name is one of the known config file names.
     */
    private function isConfigFile($name)
    {
        return in_array($name, array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt'));
    }

    /**
     * @param string $name
     * @return bool True when the Option class has a method called $name.
     */
    private function isOption($name)
    {
        return in_array($name, get_class_methods($this->getOptionClassName()));
    }

    /**
     * @return string Fully qualified name of the Option class.
     */
    private function getOptionClassName()
    {
        return __NAMESPACE__.'\\Option';
    }

    /**
     * Deletes the command's output file, both without and with extension, if present.
     */
    private function cleanTempFiles()
    {
        if (file_exists($this->command->getOutputFile(false))) {
            unlink($this->command->getOutputFile(false));
        }
        if (file_exists($this->command->getOutputFile(true))) {
            unlink($this->command->getOutputFile(true));
        }
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Common abstract base of the exception classes declared in this library, so
 * that callers can catch all of them with a single catch clause.
 */
abstract class TesseractOcrException extends \Exception {}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors when the command cannot be launched
 * (checkProcessCreation) or produces no output (checkCommandExecution).
 */
class UnsuccessfulCommandException extends TesseractOcrException {}