Use OCR to fetch text in images

2024-05-21 15:26:44 +02:00 · 2024-01-13 19:30:20 +00:00 · 2024-01-13 19:30:20 +00:00 · 3a60229e5c
parent 16b12e1545
commit 3a60229e5c
5 changed files with 93 additions and 7 deletions
--- a/composer.json
+++ b/composer.json
@ -75,7 +75,8 @@
 		"npm-asset/moment": "^2.24",
 		"npm-asset/perfect-scrollbar": "0.6.16",
 		"npm-asset/textcomplete": "^0.18.2",
-		"npm-asset/typeahead.js": "^0.11.1"
+		"npm-asset/typeahead.js": "^0.11.1",
+		"thiagoalessio/tesseract_ocr": "^2.13"
 	},
 	"suggest": {
 		"ext-imagick": "For faster image processing",
--- a/composer.lock
+++ b/composer.lock
@ -4,7 +4,7 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
        "This file is @generated automatically"
    ],
-    "content-hash": "082b16e2c88895f1a03d5b0ffe678ba7",
+    "content-hash": "131ca83d1c6f64092ff5220e4a14a101",
    "packages": [
        {
            "name": "asika/simple-console",
@ -1317,6 +1317,24 @@
                "html",
                "markdown"
            ],
+            "funding": [
+                {
+                    "url": "https://www.colinodell.com/sponsor",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://www.paypal.me/colinpodell/10.00",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://github.com/colinodell",
+                    "type": "github"
+                },
+                {
+                    "url": "https://www.patreon.com/colinodell",
+                    "type": "patreon"
+                }
+            ],
            "time": "2020-07-01T00:34:03+00:00"
        },
        {
@ -4224,6 +4242,50 @@
            ],
            "time": "2023-01-26T09:26:14+00:00"
        },
+        {
+            "name": "thiagoalessio/tesseract_ocr",
+            "version": "2.13.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "shasum": ""
+            },
+            "require": {
+                "php": "^5.3 || ^7.0 || ^8.0"
+            },
+            "require-dev": {
+                "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "thiagoalessio\\TesseractOCR\\": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "thiagoalessio",
+                    "email": "thiagoalessio@me.com"
+                }
+            ],
+            "description": "A wrapper to work with Tesseract OCR inside PHP.",
+            "keywords": [
+                "OCR",
+                "Tesseract",
+                "text recognition"
+            ],
+            "time": "2023-10-05T21:14:48+00:00"
+        },
        {
            "name": "ua-parser/uap-php",
            "version": "v3.9.14",
--- a/src/Model/Post/Media.php
+++ b/src/Model/Post/Media.php
@ -208,13 +208,17 @@ class Media
 		$filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : '';

 		if (($media['type'] == self::IMAGE) || ($filetype == 'image')) {
-			$imagedata = Images::getInfoFromURLCached($media['url']);
+			$imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description']));
 			if ($imagedata) {
 				$media['mimetype'] = $imagedata['mime'];
 				$media['size'] = $imagedata['size'];
 				$media['width'] = $imagedata[0];
 				$media['height'] = $imagedata[1];
 				$media['blurhash'] = $imagedata['blurhash'] ?? null;
+				if (!empty($imagedata['description']) && empty($media['description'])) {
+					$media['description'] = $imagedata['description'];
+					Logger::debug('Detected text for image', $media);
+				}
 			} else {
 				Logger::notice('No image data', ['media' => $media]);
 			}
--- a/src/Util/Images.php
+++ b/src/Util/Images.php
@ -22,10 +22,12 @@
 namespace Friendica\Util;

 use Friendica\Core\Logger;
+use Friendica\Core\System;
 use Friendica\DI;
 use Friendica\Model\Photo;
 use Friendica\Network\HTTPClient\Client\HttpClientAccept;
 use Friendica\Object\Image;
+use thiagoalessio\TesseractOCR\TesseractOCR;

 /**
 * Image utilities
@ -181,10 +183,11 @@ class Images
 	 * Gets info array from given URL, cached data has priority
 	 *
 	 * @param string $url
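+	 * @param bool   $ocr Run OCR on the image to detect text (only effective when the "system.tesseract_ocr" config option is enabled)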
 	 * @return array Info
 	 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 	 */
-	public static function getInfoFromURLCached(string $url): array
+	public static function getInfoFromURLCached(string $url, bool $ocr = false): array
 	{
 		$data = [];

@ -192,12 +195,12 @@ class Images
 			return $data;
 		}

-		$cacheKey = 'getInfoFromURL:' . sha1($url);
+		$cacheKey = 'getInfoFromURL:' . sha1($url . $ocr);

 		$data = DI::cache()->get($cacheKey);

 		if (empty($data) || !is_array($data)) {
-			$data = self::getInfoFromURL($url);
+			$data = self::getInfoFromURL($url, $ocr);

 			DI::cache()->set($cacheKey, $data);
 		}
@ -209,10 +212,11 @@ class Images
 	 * Gets info from URL uncached
 	 *
 	 * @param string $url
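+	 * @param bool   $ocr Run OCR on the image to detect text (only effective when the "system.tesseract_ocr" config option is enabled)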
 	 * @return array Info array
 	 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 	 */
-	public static function getInfoFromURL(string $url): array
+	public static function getInfoFromURL(string $url, bool $ocr = false): array
 	{
 		$data = [];

@ -257,6 +261,17 @@ class Images

 		if ($image->isValid()) {
 			$data['blurhash'] = $image->getBlurHash();
+			
+			if ($ocr && DI::config()->get('system', 'tesseract_ocr')) {
+				$ocr = new TesseractOCR();
+				try {
+					$ocr->tempDir(System::getTempPath());
+					$ocr->imageData($img_str, strlen($img_str));
+					$data['description'] = $ocr->run();
+				} catch (\Throwable $th) {
+					Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
+				}			
+			}
 		}

 		$data['size'] = $filesize;
--- a/static/defaults.config.php
+++ b/static/defaults.config.php
@ -441,6 +441,10 @@ return [
 		// Don't show smilies.
 		'no_smilies' => false,

+		// tesseract_ocr (Boolean)
+		// Use Tesseract OCR to detect text in images.
+		'tesseract_ocr' => false,
+
 		// optimize_all_tables (Boolean)
 		// Optimizes all tables instead of only tables like workerqueue or the cache
 		'optimize_all_tables' => false,