diff --git a/cld/README.md b/cld/README.md new file mode 100644 index 000000000..933dbceee --- /dev/null +++ b/cld/README.md @@ -0,0 +1,85 @@ +Compact Language Detector +=== +CLD2 is an advanced language detection library with high reliability. + +This addon depends on the CLD PHP module which is not included in any Linux distribution. +It needs to be built and installed by hand, which is not totally straightforward. + +Prerequisite +--- +To be able to build the extension, you need the CLD module and the files for the PHP module development. +On Debian you install the packages php-dev, libcld2-dev and libcld2-0. +Make sure to have installed the correct PHP version. +This means: if you have both PHP 8.0 and 8.2 on your system, you have to install php8.0-dev as well. + +Installation +--- +The original PHP extension is https://github.com/fntlnz/cld2-php-ext. +However, it doesn't support PHP8. +So https://github.com/hiteule/cld2-php-ext/tree/support-php8 has to be used. + +Download the source code: +``` +wget https://github.com/hiteule/cld2-php-ext/archive/refs/heads/support-php8.zip +``` + +Unzip it: +``` +unzip support-php8.zip +``` + +Change into the folder: +``` +cd cld2-php-ext-support-php8/ +``` + +Configure for the PHP API version: +``` +phpize +``` +(if you have got several PHP versions on your system, execute the command with the version that you run Friendica with, e.g. `phpize8.0`) + +Create the Makefile: +``` +./configure --with-cld2=/usr/include/cld2 +``` + +Have a look at the line `checking for PHP includes`. +When the output (for example `/usr/include/php/20220829`) doesn't match the API version that you got from `phpize`, then you have to change all the version codes in your `Makefile` afterwards. + +Create the module: +``` +make -j +``` + +Install it: +``` +sudo make install +``` + +Change to the folder with the available modules. 
use Friendica\Core\Hook;
use Friendica\Core\Logger;
use Friendica\DI;

/**
 * Registers this addon's handler for the 'detect_languages' hook.
 */
function cld_install()
{
	Hook::register('detect_languages', __FILE__, 'cld_detect_languages');
}

/**
 * Translates a language code reported by CLD2 into the code that is compared
 * against the original detection and the list of available languages.
 *
 * Codes without an entry in the table are returned unchanged.
 *
 * @param string $code Language code as returned by CLD2
 * @return string Translated language code
 */
function cld_normalize_language_code(string $code): string
{
	$aliases = [
		'pt'      => 'pt-PT',
		'az'      => 'az-Latn',
		'bs'      => 'bs-Latn',
		'el'      => 'el-monoton',
		'ht'      => 'fr',
		'iw'      => 'he',
		'jw'      => 'jv',
		'ms'      => 'ms-Latn',
		'no'      => 'nb',
		'sr'      => 'sr-Cyrl',
		'zh'      => 'zh-Hans',
		'zh-Hant' => 'zh-hant',
	];

	return $aliases[$code] ?? $code;
}

/**
 * Hook handler for 'detect_languages': runs CLD2 on the text and, when it
 * reliably detects an available language that differs from the current first
 * entry of $data['detected'], replaces $data['detected'] with that result.
 *
 * Reads $data['text'], $data['detected'] and $data['uri-id'] (the latter only
 * for logging). On success $data['detected'] is set to a single-entry array
 * mapping the language code to the CLD2 probability divided by 100.
 *
 * @param array $data Hook data, modified in place
 * @return void
 */
function cld_detect_languages(array &$data)
{
	if (!extension_loaded('cld2')) {
		Logger::warning('CLD2 is not installed.');
		return;
	}

	$cld2 = new \CLD2Detector();
	$cld2->setEncodingHint(\CLD2Encoding::UTF8); // optional, hints about text encoding
	$cld2->setPlainText(true);

	$result = $cld2->detect($data['text']);

	// The first key of the previous detection is treated as the current language.
	if (!empty($data['detected']) && is_array($data['detected'])) {
		$original = (string)array_key_first($data['detected']);
	} else {
		$original = '';
	}

	$detected = cld_normalize_language_code((string)$result['language_code']);

	// Languages that aren't supported via the base language detection.
	// NOTE(review): 'ht' is rewritten to 'fr' by cld_normalize_language_code()
	// before this check, so the 'ht' entry never matches - confirm which of
	// the two behaviours is intended.
	$unsupported = ['ceb', 'hmn', 'ht', 'kk', 'ky', 'mg', 'mk', 'ml', 'ny', 'or', 'pa', 'rw', 'su', 'st', 'tg', 'ts', 'xx-Qaai'];
	if (in_array($detected, $unsupported, true)) {
		return;
	}

	// Shared context for all debug messages below.
	$context = [
		'uri-id'      => $data['uri-id'] ?? null,
		'original'    => $original,
		'detected'    => $detected,
		'name'        => $result['language_name'],
		'probability' => $result['language_probability'],
		'text'        => $data['text'],
	];

	if (!$result['is_reliable']) {
		Logger::debug('Unreliable detection', $context);
		return;
	}

	if ($original === $detected) {
		return;
	}

	$available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true)));

	if (!in_array($detected, $available)) {
		Logger::debug('Unsupported language', $context);
		return;
	}

	Logger::debug('Detected different language', $context);

	// CLD2 probability is divided by 100 before it is stored.
	$data['detected'] = [$detected => $result['language_probability'] / 100];
}