From 981e6821d0a21c4889b11530fdc1ac59b5d0cbdb Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 30 Sep 2023 15:56:50 +0000 Subject: [PATCH] CLD: New plugin for language detection via CLD2 --- cld/README.md | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++ cld/cld.php | 75 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 cld/README.md create mode 100644 cld/cld.php diff --git a/cld/README.md b/cld/README.md new file mode 100644 index 00000000..6709facb --- /dev/null +++ b/cld/README.md @@ -0,0 +1,91 @@ +Compact Language Detector +=== +CLD2 is an advanced language dectection library with a high reliability. + +This addon depends on the CLD PHP module which is not included in any Linux distribution. +It needs to be built and installed by hand, which is not totally straightforward. + +Prerequisite +--- +To be able to build the extension, you need the CLD module and the files for the PHP module development. +On Debian you install the packages php-dev, libcld2-dev and libcld2-0. +Make sure to have installed the correct PHP version. +Means: When you have got both PHP 8.0 and 8.2 on your system, you have to install php8.0-dev as well. + +Installation +--- +The original PHP extension is https://github.com/fntlnz/cld2-php-ext. +However, it doesn't support PHP8. +So https://github.com/hiteule/cld2-php-ext/tree/support-php8 has to be used. + +Download the source code: +``` +wget https://github.com/hiteule/cld2-php-ext/archive/refs/heads/support-php8.zip +``` + +Unzip it: +``` +unzip support-php8.zip +``` + +Change into the folder: +``` +cd cld2-php-ext-support-php8/ +``` + +Configure for the PHP Api version: +``` +phpize +``` +(if you have got several PHP versions on your system, execute the command with the version that you run Friendica with, e.g. `phpize8.0`) + +Create the Makefile: +``` +./configure --with-cld2=/usr/include/cld2 +``` + +Have a look at the line `checking for PHP includes`. +When the output (for example `/usr/include/php/20220829` doesn't match the API version that you got from `phpize`, then you have to change all the version codes in your `Makefile` afterwards) + +Create the module: +``` +make -j +``` + +Install it: +``` +sudo make install +``` + +Change to the folder with the available modules. When you use PHP 8.0 on Debian it is: +``` +cd /etc/php/8.0/mods-available +``` + +Create the file `cld.ini` with this content: +``` +; configuration for php cld2 module +; priority=20 +extension=cld2.so +``` + +Change to the folder `conf.d` in the folder of your `php.ini`. +``` +cd /etc/php/8.0/cgi/conf.d +``` +This depends on the way you installed the PHP support for your webserver. Instead of `cgi` it could also be `apache2` or `fpm`. + +Create a symbolic link to install the module: +``` +ln -s /etc/php/8.0/mods-available/cld.ini +``` + +Then restart the apache or fpm (or whatever you use) to load the changed configuration. + +Call `/admin/phpinfo` on your webserver. +You then see the PHP Info. +Search for "cld2". +The module is installed, when you find it here. +**Only proceed when the module is installed** + +Now you can enable the addon. \ No newline at end of file diff --git a/cld/cld.php b/cld/cld.php new file mode 100644 index 00000000..4c54b2d0 --- /dev/null +++ b/cld/cld.php @@ -0,0 +1,75 @@ + + */ + +use Friendica\Core\Hook; +use Friendica\Core\Logger; +use Friendica\DI; + +function cld_install() +{ + Hook::register('get_language', 'addon/cld/cld.php', 'cld_get_language'); +} + +function cld_get_language(array &$data) +{ + if (!in_array('cld2', get_loaded_extensions())) { + Logger::warning('CLD2 is not installed.'); + return; + } + + $cld2 = new \CLD2Detector(); + + $cld2->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding + + $result = $cld2->detect($data['text']); + + if ($data['detected']) { + $original = array_key_first($data['detected']); + } else { + $original = ''; + } + + $detected = $result['language_code']; + if ($detected == 'pt') { + $detected = 'pt-PT'; + } elseif ($detected == 'el') { + $detected = 'el-monoton'; + } elseif ($detected == 'no') { + $detected = 'nb'; + } elseif ($detected == 'zh') { + $detected = 'zh-Hans'; + } elseif ($detected == 'zh-Hant') { + $detected = 'zh-hant'; + } + + if (!$result['is_reliable']) { + Logger::debug('Unreliable detection', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + return; + } + + if ($original == $detected) { +// return; + } + + // Nur aus Testgründen + if (in_array($detected, ['xx-Qaai', 'ht', 'ga'])) { + return; + } + + $available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true))); + + if (!in_array($detected, $available)) { + Logger::debug('Unsupported language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + return; + } + + Logger::debug('Detected', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + +// Logger::debug('Detected different language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + $data['detected'] = [$detected => $result['language_probability'] / 100]; +}