From 981e6821d0a21c4889b11530fdc1ac59b5d0cbdb Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 30 Sep 2023 15:56:50 +0000 Subject: [PATCH 1/5] CLD: New plugin for language detection via CLD2 --- cld/README.md | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++ cld/cld.php | 75 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 cld/README.md create mode 100644 cld/cld.php diff --git a/cld/README.md b/cld/README.md new file mode 100644 index 000000000..6709facb2 --- /dev/null +++ b/cld/README.md @@ -0,0 +1,91 @@ +Compact Language Detector +=== +CLD2 is an advanced language detection library with a high reliability. + +This addon depends on the CLD PHP module which is not included in any Linux distribution. +It needs to be built and installed by hand, which is not totally straightforward. + +Prerequisite +--- +To be able to build the extension, you need the CLD module and the files for the PHP module development. +On Debian you install the packages php-dev, libcld2-dev and libcld2-0. +Make sure to have installed the correct PHP version. +Means: When you have got both PHP 8.0 and 8.2 on your system, you have to install php8.0-dev as well. + +Installation +--- +The original PHP extension is https://github.com/fntlnz/cld2-php-ext. +However, it doesn't support PHP8. +So https://github.com/hiteule/cld2-php-ext/tree/support-php8 has to be used. + +Download the source code: +``` +wget https://github.com/hiteule/cld2-php-ext/archive/refs/heads/support-php8.zip +``` + +Unzip it: +``` +unzip support-php8.zip +``` + +Change into the folder: +``` +cd cld2-php-ext-support-php8/ +``` + +Configure for the PHP API version: +``` +phpize +``` +(if you have got several PHP versions on your system, execute the command with the version that you run Friendica with, e.g. `phpize8.0`) + +Create the Makefile: +``` +./configure --with-cld2=/usr/include/cld2 +``` + +Have a look at the line `checking for PHP includes`. 
+When the output (for example `/usr/include/php/20220829`) doesn't match the API version that you got from `phpize`, then you have to change all the version codes in your `Makefile` afterwards. + +Create the module: +``` +make -j +``` + +Install it: +``` +sudo make install +``` + +Change to the folder with the available modules. When you use PHP 8.0 on Debian it is: +``` +cd /etc/php/8.0/mods-available +``` + +Create the file `cld.ini` with this content: +``` +; configuration for php cld2 module +; priority=20 +extension=cld2.so +``` + +Change to the folder `conf.d` in the folder of your `php.ini`. +``` +cd /etc/php/8.0/cgi/conf.d +``` +This depends on the way you installed the PHP support for your webserver. Instead of `cgi` it could also be `apache2` or `fpm`. + +Create a symbolic link to install the module: +``` +ln -s /etc/php/8.0/mods-available/cld.ini +``` + +Then restart the apache or fpm (or whatever you use) to load the changed configuration. + +Call `/admin/phpinfo` on your webserver. +You then see the PHP Info. +Search for "cld2". +The module is installed, when you find it here. +**Only proceed when the module is installed** + +Now you can enable the addon. 
\ No newline at end of file diff --git a/cld/cld.php b/cld/cld.php new file mode 100644 index 000000000..4c54b2d0c --- /dev/null +++ b/cld/cld.php @@ -0,0 +1,75 @@ + + */ + +use Friendica\Core\Hook; +use Friendica\Core\Logger; +use Friendica\DI; + +function cld_install() +{ + Hook::register('get_language', 'addon/cld/cld.php', 'cld_get_language'); +} + +function cld_get_language(array &$data) +{ + if (!in_array('cld2', get_loaded_extensions())) { + Logger::warning('CLD2 is not installed.'); + return; + } + + $cld2 = new \CLD2Detector(); + + $cld2->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding + + $result = $cld2->detect($data['text']); + + if ($data['detected']) { + $original = array_key_first($data['detected']); + } else { + $original = ''; + } + + $detected = $result['language_code']; + if ($detected == 'pt') { + $detected = 'pt-PT'; + } elseif ($detected == 'el') { + $detected = 'el-monoton'; + } elseif ($detected == 'no') { + $detected = 'nb'; + } elseif ($detected == 'zh') { + $detected = 'zh-Hans'; + } elseif ($detected == 'zh-Hant') { + $detected = 'zh-hant'; + } + + if (!$result['is_reliable']) { + Logger::debug('Unreliable detection', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + return; + } + + if ($original == $detected) { +// return; + } + + // Nur aus Testgründen + if (in_array($detected, ['xx-Qaai', 'ht', 'ga'])) { + return; + } + + $available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true))); + + if (!in_array($detected, $available)) { + Logger::debug('Unsupported language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + return; + } + + Logger::debug('Detected', ['original' => $original, 'detected' => $detected, 'name' => 
$result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + +// Logger::debug('Detected different language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + $data['detected'] = [$detected => $result['language_probability'] / 100]; +} From 0eda161e04af171e43897ca0d780c1c20ff1ce21 Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 30 Sep 2023 16:00:26 +0000 Subject: [PATCH 2/5] Cleaned up code --- cld/cld.php | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cld/cld.php b/cld/cld.php index 4c54b2d0c..56bca64f7 100644 --- a/cld/cld.php +++ b/cld/cld.php @@ -53,11 +53,6 @@ function cld_get_language(array &$data) } if ($original == $detected) { -// return; - } - - // Nur aus Testgründen - if (in_array($detected, ['xx-Qaai', 'ht', 'ga'])) { return; } @@ -68,8 +63,6 @@ function cld_get_language(array &$data) return; } - Logger::debug('Detected', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); - -// Logger::debug('Detected different language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + Logger::debug('Detected different language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); $data['detected'] = [$detected => $result['language_probability'] / 100]; } From 80ce8551892ab7cbe7a16f40c1a019a05815353f Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 1 Oct 2023 04:14:10 +0000 Subject: [PATCH 3/5] Renamed hook --- cld/cld.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cld/cld.php b/cld/cld.php index 56bca64f7..54e7a73e4 
100644 --- a/cld/cld.php +++ b/cld/cld.php @@ -12,10 +12,10 @@ use Friendica\DI; function cld_install() { - Hook::register('get_language', 'addon/cld/cld.php', 'cld_get_language'); + Hook::register('detect_languages', 'addon/cld/cld.php', 'cld_detect_languages'); } -function cld_get_language(array &$data) +function cld_detect_languages(array &$data) { if (!in_array('cld2', get_loaded_extensions())) { Logger::warning('CLD2 is not installed.'); From 18266ea6ef1ba98ffd7023fb8537b90dc413fa4a Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 6 Oct 2023 03:54:45 +0000 Subject: [PATCH 4/5] Changed hook parameter / more languages added --- cld/cld.php | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/cld/cld.php b/cld/cld.php index 54e7a73e4..fc8fa8145 100644 --- a/cld/cld.php +++ b/cld/cld.php @@ -12,7 +12,7 @@ use Friendica\DI; function cld_install() { - Hook::register('detect_languages', 'addon/cld/cld.php', 'cld_detect_languages'); + Hook::register('detect_languages', __FILE__, 'cld_detect_languages'); } function cld_detect_languages(array &$data) @@ -25,6 +25,7 @@ function cld_detect_languages(array &$data) $cld2 = new \CLD2Detector(); $cld2->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding + $cld2->setPlainText(true); $result = $cld2->detect($data['text']); @@ -37,18 +38,37 @@ function cld_detect_languages(array &$data) $detected = $result['language_code']; if ($detected == 'pt') { $detected = 'pt-PT'; + } elseif ($detected == 'az') { + $detected = 'az-Latn'; + } elseif ($detected == 'bs') { + $detected = 'bs-Latn'; } elseif ($detected == 'el') { $detected = 'el-monoton'; + } elseif ($detected == 'ht') { + $detected = 'fr'; + } elseif ($detected == 'iw') { + $detected = 'he'; + } elseif ($detected == 'jw') { + $detected = 'jv'; + } elseif ($detected == 'ms') { + $detected = 'ms-Latn'; } elseif ($detected == 'no') { $detected = 'nb'; + } elseif ($detected == 'sr') { + $detected = 'sr-Cyrl'; } 
elseif ($detected == 'zh') { $detected = 'zh-Hans'; } elseif ($detected == 'zh-Hant') { $detected = 'zh-hant'; } + // languages that aren't supported via the base language detection + if (in_array($detected, ['ceb', 'hmn', 'ht', 'kk', 'ky', 'mg', 'mk', 'ml', 'ny', 'or', 'pa', 'rw', 'su', 'st', 'tg', 'ts', 'xx-Qaai'])) { + return; + } + if (!$result['is_reliable']) { - Logger::debug('Unreliable detection', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + Logger::debug('Unreliable detection', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); return; } @@ -59,10 +79,10 @@ function cld_detect_languages(array &$data) $available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true))); if (!in_array($detected, $available)) { - Logger::debug('Unsupported language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + Logger::debug('Unsupported language', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); return; } - Logger::debug('Detected different language', ['original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); + Logger::debug('Detected different language', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]); $data['detected'] = [$detected => $result['language_probability'] / 100]; 
} From 92251f4a6c8fa409b0a6ddef53dd6304390a9168 Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 6 Oct 2023 04:09:46 +0000 Subject: [PATCH 5/5] Updated CLD installation description --- cld/README.md | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cld/README.md b/cld/README.md index 6709facb2..933dbceee 100644 --- a/cld/README.md +++ b/cld/README.md @@ -57,27 +57,21 @@ Install it: sudo make install ``` -Change to the folder with the available modules. When you use PHP 8.0 on Debian it is: +Change to the folder with the available modules. When you use PHP 8.2 on Debian it is: ``` -cd /etc/php/8.0/mods-available +cd /etc/php/8.2/mods-available ``` -Create the file `cld.ini` with this content: +Create the file `cld2.ini` with this content: ``` ; configuration for php cld2 module ; priority=20 extension=cld2.so ``` -Change to the folder `conf.d` in the folder of your `php.ini`. +Enable the module for all versions and all sapi: ``` -cd /etc/php/8.0/cgi/conf.d -``` -This depends on the way you installed the PHP support for your webserver. Instead of `cgi` it could also be `apache2` or `fpm`. - -Create a symbolic link to install the module: -``` -ln -s /etc/php/8.0/mods-available/cld.ini +phpenmod -v ALL -s ALL cld2 ``` Then restart the apache or fpm (or whatever you use) to load the changed configuration.