Use OCR to fetch text in images

This commit is contained in:
Michael 2024-01-13 19:30:20 +00:00
parent 16b12e1545
commit 3a60229e5c
5 changed files with 93 additions and 7 deletions

View file

@ -75,7 +75,8 @@
"npm-asset/moment": "^2.24", "npm-asset/moment": "^2.24",
"npm-asset/perfect-scrollbar": "0.6.16", "npm-asset/perfect-scrollbar": "0.6.16",
"npm-asset/textcomplete": "^0.18.2", "npm-asset/textcomplete": "^0.18.2",
"npm-asset/typeahead.js": "^0.11.1" "npm-asset/typeahead.js": "^0.11.1",
"thiagoalessio/tesseract_ocr": "^2.13"
}, },
"suggest": { "suggest": {
"ext-imagick": "For faster image processing", "ext-imagick": "For faster image processing",

64
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "082b16e2c88895f1a03d5b0ffe678ba7", "content-hash": "131ca83d1c6f64092ff5220e4a14a101",
"packages": [ "packages": [
{ {
"name": "asika/simple-console", "name": "asika/simple-console",
@ -1317,6 +1317,24 @@
"html", "html",
"markdown" "markdown"
], ],
"funding": [
{
"url": "https://www.colinodell.com/sponsor",
"type": "custom"
},
{
"url": "https://www.paypal.me/colinpodell/10.00",
"type": "custom"
},
{
"url": "https://github.com/colinodell",
"type": "github"
},
{
"url": "https://www.patreon.com/colinodell",
"type": "patreon"
}
],
"time": "2020-07-01T00:34:03+00:00" "time": "2020-07-01T00:34:03+00:00"
}, },
{ {
@ -4224,6 +4242,50 @@
], ],
"time": "2023-01-26T09:26:14+00:00" "time": "2023-01-26T09:26:14+00:00"
}, },
{
"name": "thiagoalessio/tesseract_ocr",
"version": "2.13.0",
"source": {
"type": "git",
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
"shasum": ""
},
"require": {
"php": "^5.3 || ^7.0 || ^8.0"
},
"require-dev": {
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
},
"type": "library",
"autoload": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "thiagoalessio",
"email": "thiagoalessio@me.com"
}
],
"description": "A wrapper to work with Tesseract OCR inside PHP.",
"keywords": [
"OCR",
"Tesseract",
"text recognition"
],
"time": "2023-10-05T21:14:48+00:00"
},
{ {
"name": "ua-parser/uap-php", "name": "ua-parser/uap-php",
"version": "v3.9.14", "version": "v3.9.14",

View file

@ -208,13 +208,17 @@ class Media
$filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : ''; $filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : '';
if (($media['type'] == self::IMAGE) || ($filetype == 'image')) { if (($media['type'] == self::IMAGE) || ($filetype == 'image')) {
$imagedata = Images::getInfoFromURLCached($media['url']); $imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description']));
if ($imagedata) { if ($imagedata) {
$media['mimetype'] = $imagedata['mime']; $media['mimetype'] = $imagedata['mime'];
$media['size'] = $imagedata['size']; $media['size'] = $imagedata['size'];
$media['width'] = $imagedata[0]; $media['width'] = $imagedata[0];
$media['height'] = $imagedata[1]; $media['height'] = $imagedata[1];
$media['blurhash'] = $imagedata['blurhash'] ?? null; $media['blurhash'] = $imagedata['blurhash'] ?? null;
if (!empty($imagedata['description']) && empty($media['description'])) {
$media['description'] = $imagedata['description'];
Logger::debug('Detected text for image', $media);
}
} else { } else {
Logger::notice('No image data', ['media' => $media]); Logger::notice('No image data', ['media' => $media]);
} }

View file

@ -22,10 +22,12 @@
namespace Friendica\Util; namespace Friendica\Util;
use Friendica\Core\Logger; use Friendica\Core\Logger;
use Friendica\Core\System;
use Friendica\DI; use Friendica\DI;
use Friendica\Model\Photo; use Friendica\Model\Photo;
use Friendica\Network\HTTPClient\Client\HttpClientAccept; use Friendica\Network\HTTPClient\Client\HttpClientAccept;
use Friendica\Object\Image; use Friendica\Object\Image;
use thiagoalessio\TesseractOCR\TesseractOCR;
/** /**
* Image utilities * Image utilities
@ -181,10 +183,11 @@ class Images
* Gets info array from given URL, cached data has priority * Gets info array from given URL, cached data has priority
* *
* @param string $url * @param string $url
* @param bool $ocr Whether to also request OCR text detection for the image (passed through to getInfoFromURL)
* @return array Info * @return array Info
* @throws \Friendica\Network\HTTPException\InternalServerErrorException * @throws \Friendica\Network\HTTPException\InternalServerErrorException
*/ */
public static function getInfoFromURLCached(string $url): array public static function getInfoFromURLCached(string $url, bool $ocr = false): array
{ {
$data = []; $data = [];
@ -192,12 +195,12 @@ class Images
return $data; return $data;
} }
$cacheKey = 'getInfoFromURL:' . sha1($url); $cacheKey = 'getInfoFromURL:' . sha1($url . $ocr);
$data = DI::cache()->get($cacheKey); $data = DI::cache()->get($cacheKey);
if (empty($data) || !is_array($data)) { if (empty($data) || !is_array($data)) {
$data = self::getInfoFromURL($url); $data = self::getInfoFromURL($url, $ocr);
DI::cache()->set($cacheKey, $data); DI::cache()->set($cacheKey, $data);
} }
@ -209,10 +212,11 @@ class Images
* Gets info from URL uncached * Gets info from URL uncached
* *
* @param string $url * @param string $url
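* @param bool $ocr Whether to run Tesseract OCR on the image and store the detected text as "description" (only when system.tesseract_ocr is enabled)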
* @return array Info array * @return array Info array
* @throws \Friendica\Network\HTTPException\InternalServerErrorException * @throws \Friendica\Network\HTTPException\InternalServerErrorException
*/ */
public static function getInfoFromURL(string $url): array public static function getInfoFromURL(string $url, bool $ocr = false): array
{ {
$data = []; $data = [];
@ -257,6 +261,17 @@ class Images
if ($image->isValid()) { if ($image->isValid()) {
$data['blurhash'] = $image->getBlurHash(); $data['blurhash'] = $image->getBlurHash();
if ($ocr && DI::config()->get('system', 'tesseract_ocr')) {
$ocr = new TesseractOCR();
try {
$ocr->tempDir(System::getTempPath());
$ocr->imageData($img_str, strlen($img_str));
$data['description'] = $ocr->run();
} catch (\Throwable $th) {
Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
}
}
} }
$data['size'] = $filesize; $data['size'] = $filesize;

View file

@ -441,6 +441,10 @@ return [
// Don't show smilies. // Don't show smilies.
'no_smilies' => false, 'no_smilies' => false,
// tesseract_ocr (Boolean)
// Use Tesseract OCR to extract text from images
'tesseract_ocr' => false,
// optimize_all_tables (Boolean) // optimize_all_tables (Boolean)
// Optimizes all tables instead of only tables like workerqueue or the cache // Optimizes all tables instead of only tables like workerqueue or the cache
'optimize_all_tables' => false, 'optimize_all_tables' => false,