From 3a60229e5c0925bea134e89c91d2a75a0246f622 Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 13 Jan 2024 19:30:20 +0000 Subject: [PATCH 1/5] Use OCR to fetch text in images --- composer.json | 3 +- composer.lock | 64 +++++++++++++++++++++++++++++++++++++- src/Model/Post/Media.php | 6 +++- src/Util/Images.php | 23 +++++++++++--- static/defaults.config.php | 4 +++ 5 files changed, 93 insertions(+), 7 deletions(-) diff --git a/composer.json b/composer.json index 21603c7b27..903a3fab06 100644 --- a/composer.json +++ b/composer.json @@ -75,7 +75,8 @@ "npm-asset/moment": "^2.24", "npm-asset/perfect-scrollbar": "0.6.16", "npm-asset/textcomplete": "^0.18.2", - "npm-asset/typeahead.js": "^0.11.1" + "npm-asset/typeahead.js": "^0.11.1", + "thiagoalessio/tesseract_ocr": "^2.13" }, "suggest": { "ext-imagick": "For faster image processing", diff --git a/composer.lock b/composer.lock index a541913811..9442d41823 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "082b16e2c88895f1a03d5b0ffe678ba7", + "content-hash": "131ca83d1c6f64092ff5220e4a14a101", "packages": [ { "name": "asika/simple-console", @@ -1317,6 +1317,24 @@ "html", "markdown" ], + "funding": [ + { + "url": "https://www.colinodell.com/sponsor", + "type": "custom" + }, + { + "url": "https://www.paypal.me/colinpodell/10.00", + "type": "custom" + }, + { + "url": "https://github.com/colinodell", + "type": "github" + }, + { + "url": "https://www.patreon.com/colinodell", + "type": "patreon" + } + ], "time": "2020-07-01T00:34:03+00:00" }, { @@ -4224,6 +4242,50 @@ ], "time": "2023-01-26T09:26:14+00:00" }, + { + "name": "thiagoalessio/tesseract_ocr", + "version": "2.13.0", + "source": { + "type": "git", + "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "shasum": "" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "keywords": [ + "OCR", + "Tesseract", + "text recognition" + ], + "time": "2023-10-05T21:14:48+00:00" + }, { "name": "ua-parser/uap-php", "version": "v3.9.14", diff --git a/src/Model/Post/Media.php b/src/Model/Post/Media.php index df05db98d5..afd6ca8383 100644 --- a/src/Model/Post/Media.php +++ b/src/Model/Post/Media.php @@ -208,13 +208,17 @@ class Media $filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : ''; if (($media['type'] == self::IMAGE) || ($filetype == 'image')) { - $imagedata = Images::getInfoFromURLCached($media['url']); + $imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description'])); if ($imagedata) { $media['mimetype'] = $imagedata['mime']; $media['size'] = $imagedata['size']; $media['width'] = $imagedata[0]; $media['height'] = $imagedata[1]; $media['blurhash'] = $imagedata['blurhash'] ?? null; + if (!empty($imagedata['description']) && empty($media['description'])) { + $media['description'] = $imagedata['description']; + Logger::debug('Detected text for image', $media); + } } else { Logger::notice('No image data', ['media' => $media]); } diff --git a/src/Util/Images.php b/src/Util/Images.php index b44b1fb8f5..0d64601f1d 100644 --- a/src/Util/Images.php +++ b/src/Util/Images.php @@ -22,10 +22,12 @@ namespace Friendica\Util; use Friendica\Core\Logger; +use Friendica\Core\System; use Friendica\DI; use Friendica\Model\Photo; use Friendica\Network\HTTPClient\Client\HttpClientAccept; use Friendica\Object\Image; +use thiagoalessio\TesseractOCR\TesseractOCR; /** * Image utilities @@ -181,10 +183,11 @@ class Images * Gets info array from given URL, cached data has priority * * @param string $url + * @param bool $ocr * @return array Info * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - public static function getInfoFromURLCached(string $url): array + public static function getInfoFromURLCached(string $url, bool $ocr = false): array { $data = []; @@ -192,12 +195,12 @@ class Images return $data; } - $cacheKey = 'getInfoFromURL:' . sha1($url); + $cacheKey = 'getInfoFromURL:' . sha1($url . $ocr); $data = DI::cache()->get($cacheKey); if (empty($data) || !is_array($data)) { - $data = self::getInfoFromURL($url); + $data = self::getInfoFromURL($url, $ocr); DI::cache()->set($cacheKey, $data); } @@ -209,10 +212,11 @@ class Images * Gets info from URL uncached * * @param string $url + * @param bool $ocr * @return array Info array * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - public static function getInfoFromURL(string $url): array + public static function getInfoFromURL(string $url, bool $ocr = false): array { $data = []; @@ -257,6 +261,17 @@ class Images if ($image->isValid()) { $data['blurhash'] = $image->getBlurHash(); + + if ($ocr && DI::config()->get('system', 'tesseract_ocr')) { + $ocr = new TesseractOCR(); + try { + $ocr->tempDir(System::getTempPath()); + $ocr->imageData($img_str, strlen($img_str)); + $data['description'] = $ocr->run(); + } catch (\Throwable $th) { + Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]); + } + } } $data['size'] = $filesize; diff --git a/static/defaults.config.php b/static/defaults.config.php index 819b0ad85f..b3a7f49984 100644 --- a/static/defaults.config.php +++ b/static/defaults.config.php @@ -441,6 +441,10 @@ return [ // Don't show smilies. 'no_smilies' => false, + // tesseract_ocr (Boolean) + // Use Tesseract OCR to use OCR to fetch text from images + 'tesseract_ocr' => false, + // optimize_all_tables (Boolean) // Optimizes all tables instead of only tables like workerqueue or the cache 'optimize_all_tables' => false, From 1fc1e478f8826e6fa65fc4ed4a6d6a32c955f81e Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 13 Jan 2024 19:37:24 +0000 Subject: [PATCH 2/5] Improved config description --- static/defaults.config.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/static/defaults.config.php b/static/defaults.config.php index b3a7f49984..47501fe91f 100644 --- a/static/defaults.config.php +++ b/static/defaults.config.php @@ -442,7 +442,8 @@ return [ 'no_smilies' => false, // tesseract_ocr (Boolean) - // Use Tesseract OCR to use OCR to fetch text from images + // Use Tesseract OCR to use OCR to fetch text from images. + // The Tesseract OCR command line tool needs to be installed separately on the system. 'tesseract_ocr' => false, // optimize_all_tables (Boolean) From 7150faa09c21fc09e86b4f44b97418ca5086ce28 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 14 Jan 2024 18:38:22 +0000 Subject: [PATCH 3/5] Composer --- composer.lock | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/composer.lock b/composer.lock index 1e003d71c8..5125e8560f 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "131ca83d1c6f64092ff5220e4a14a101", + "content-hash": "b3decd4b776853666f122b85e56eedc9", "packages": [ { "name": "asika/simple-console", @@ -4311,6 +4311,50 @@ ], "time": "2020-10-23T14:02:19+00:00" }, + { + "name": "thiagoalessio/tesseract_ocr", + "version": "2.13.0", + "source": { + "type": "git", + "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "shasum": "" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "keywords": [ + "OCR", + "Tesseract", + "text recognition" + ], + "time": "2023-10-05T21:14:48+00:00" + }, { "name": "ua-parser/uap-php", "version": "v3.9.14", From a2f4b4cd3dbe80de5f2185ce5548ccca426266b4 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 14 Jan 2024 18:40:32 +0000 Subject: [PATCH 4/5] Composer --- composer.json | 3 +-- composer.lock | 64 +-------------------------------------------------- 2 files changed, 2 insertions(+), 65 deletions(-) diff --git a/composer.json b/composer.json index 7c88a640a4..730c24bafa 100644 --- a/composer.json +++ b/composer.json @@ -75,8 +75,7 @@ "npm-asset/moment": "^2.24", "npm-asset/perfect-scrollbar": "0.6.16", "npm-asset/textcomplete": "^0.18.2", - "npm-asset/typeahead.js": "^0.11.1", - "thiagoalessio/tesseract_ocr": "^2.13" + "npm-asset/typeahead.js": "^0.11.1" }, "suggest": { "ext-imagick": "For faster image processing", diff --git a/composer.lock b/composer.lock index 5125e8560f..a27b09497d 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "b3decd4b776853666f122b85e56eedc9", + "content-hash": "356019e5d0c92aae938f5292dd0fd103", "packages": [ { "name": "asika/simple-console", @@ -1412,24 +1412,6 @@ "html", "markdown" ], - "funding": [ - { - "url": "https://www.colinodell.com/sponsor", - "type": "custom" - }, - { - "url": "https://www.paypal.me/colinpodell/10.00", - "type": "custom" - }, - { - "url": "https://github.com/colinodell", - "type": "github" - }, - { - "url": "https://www.patreon.com/colinodell", - "type": "patreon" - } - ], "time": "2020-07-01T00:34:03+00:00" }, { @@ -4311,50 +4293,6 @@ ], "time": "2020-10-23T14:02:19+00:00" }, - { - "name": "thiagoalessio/tesseract_ocr", - "version": "2.13.0", - "source": { - "type": "git", - "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", - "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1", - "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1", - "shasum": "" - }, - "require": { - "php": "^5.3 || ^7.0 || ^8.0" - }, - "require-dev": { - "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" - }, - "type": "library", - "autoload": { - "psr-4": { - "thiagoalessio\\TesseractOCR\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "thiagoalessio", - "email": "thiagoalessio@me.com" - } - ], - "description": "A wrapper to work with Tesseract OCR inside PHP.", - "keywords": [ - "OCR", - "Tesseract", - "text recognition" - ], - "time": "2023-10-05T21:14:48+00:00" - }, { "name": "ua-parser/uap-php", "version": "v3.9.14", From a12fbf7ff337b3b1aa57a06d266364a620ab14f3 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 14 Jan 2024 19:18:41 +0000 Subject: [PATCH 5/5] Move to addon --- src/Util/Images.php | 18 +++++++----------- static/defaults.config.php | 5 ----- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/Util/Images.php b/src/Util/Images.php index 0d64601f1d..e5b8afbb54 100644 --- a/src/Util/Images.php +++ b/src/Util/Images.php @@ -21,13 +21,12 @@ namespace Friendica\Util; +use Friendica\Core\Hook; use Friendica\Core\Logger; -use Friendica\Core\System; use Friendica\DI; use Friendica\Model\Photo; use Friendica\Network\HTTPClient\Client\HttpClientAccept; use Friendica\Object\Image; -use thiagoalessio\TesseractOCR\TesseractOCR; /** * Image utilities @@ -262,15 +261,12 @@ class Images if ($image->isValid()) { $data['blurhash'] = $image->getBlurHash(); - if ($ocr && DI::config()->get('system', 'tesseract_ocr')) { - $ocr = new TesseractOCR(); - try { - $ocr->tempDir(System::getTempPath()); - $ocr->imageData($img_str, strlen($img_str)); - $data['description'] = $ocr->run(); - } catch (\Throwable $th) { - Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]); - } + if ($ocr) { + $media = ['img_str' => $img_str]; + Hook::callAll('ocr-detection', $media); + if (!empty($media['description'])) { + $data['description'] = $media['description']; + } } } diff --git a/static/defaults.config.php b/static/defaults.config.php index 47501fe91f..819b0ad85f 100644 --- a/static/defaults.config.php +++ b/static/defaults.config.php @@ -441,11 +441,6 @@ return [ // Don't show smilies. 'no_smilies' => false, - // tesseract_ocr (Boolean) - // Use Tesseract OCR to use OCR to fetch text from images. - // The Tesseract OCR command line tool needs to be installed separately on the system. - 'tesseract_ocr' => false, - // optimize_all_tables (Boolean) // Optimizes all tables instead of only tables like workerqueue or the cache 'optimize_all_tables' => false,