CLD: New plugin for language detection via CLD2 #1425

Merged
MrPetovan merged 6 commits from heluecht/friendica-addons:cld2 into 2023.09-rc 2023-10-07 07:07:00 +02:00
2 changed files with 173 additions and 0 deletions

85
cld/README.md Normal file
View file

@ -0,0 +1,85 @@
Compact Language Detector
===
CLD2 is an advanced language dectection library with a high reliability.
This addon depends on the CLD PHP module which is not included in any Linux distribution.
It needs to be built and installed by hand, which is not totally straightforward.
Prerequisite
---
To be able to build the extension, you need the CLD module and the files for the PHP module development.
On Debian you install the packages php-dev, libcld2-dev and libcld2-0.
Make sure to have installed the correct PHP version.
Means: When you have got both PHP 8.0 and 8.2 on your system, you have to install php8.0-dev as well.
Installation
---
The original PHP extension is https://github.com/fntlnz/cld2-php-ext.
However, it doesn't support PHP8.
So https://github.com/hiteule/cld2-php-ext/tree/support-php8 has to be used.
Download the source code:
```
wget https://github.com/hiteule/cld2-php-ext/archive/refs/heads/support-php8.zip
```
Unzip it:
```
unzip support-php8.zip
```
Change into the folder:
```
cd cld2-php-ext-support-php8/
```
Configure for the PHP Api version:
```
phpize
```
(if you have got several PHP versions on your system, execute the command with the version that you run Friendica with, e.g. `phpize8.0`)
Create the Makefile:
```
./configure --with-cld2=/usr/include/cld2
```
Have a look at the line `checking for PHP includes`.
When the output (for example `/usr/include/php/20220829` doesn't match the API version that you got from `phpize`, then you have to change all the version codes in your `Makefile` afterwards)
Create the module:
```
make -j
```
Install it:
```
sudo make install
```
Change to the folder with the available modules. When you use PHP 8.2 on Debian it is:
```
cd /etc/php/8.2/mods-available
```
Create the file `cld2.ini` with this content:
```
; configuration for php cld2 module
; priority=20
extension=cld2.so
```
Enable the module for all versions and all sapi:
```
phpenmod -v ALL -s ALL cld2
```
Then restart the apache or fpm (or whatever you use) to load the changed configuration.
Call `/admin/phpinfo` on your webserver.
You then see the PHP Info.
Search for "cld2".
The module is installed, when you find it here.
**Only proceed when the module is installed**
Now you can enable the addon.

88
cld/cld.php Normal file
View file

@ -0,0 +1,88 @@
<?php
/**
* Name: Compact Language Detector
* Description: Improved language detection
* Version: 0.1
* Author: Michael Vogel <heluecht@pirati.ca>
*/
use Friendica\Core\Hook;
use Friendica\Core\Logger;
use Friendica\DI;
function cld_install()
{
Hook::register('detect_languages', __FILE__, 'cld_detect_languages');
}
function cld_detect_languages(array &$data)
{
if (!in_array('cld2', get_loaded_extensions())) {
Logger::warning('CLD2 is not installed.');
return;
}
$cld2 = new \CLD2Detector();
$cld2->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding
$cld2->setPlainText(true);
$result = $cld2->detect($data['text']);
if ($data['detected']) {
$original = array_key_first($data['detected']);
} else {
$original = '';
}
$detected = $result['language_code'];
if ($detected == 'pt') {
$detected = 'pt-PT';
} elseif ($detected == 'az') {
$detected = 'az-Latn';
} elseif ($detected == 'bs') {
$detected = 'bs-Latn';
} elseif ($detected == 'el') {
$detected = 'el-monoton';
} elseif ($detected == 'ht') {
$detected = 'fr';
} elseif ($detected == 'iw') {
$detected = 'he';
} elseif ($detected == 'jw') {
$detected = 'jv';
} elseif ($detected == 'ms') {
$detected = 'ms-Latn';
} elseif ($detected == 'no') {
$detected = 'nb';
} elseif ($detected == 'sr') {
$detected = 'sr-Cyrl';
} elseif ($detected == 'zh') {
$detected = 'zh-Hans';
} elseif ($detected == 'zh-Hant') {
$detected = 'zh-hant';
}
// languages that aren't supported via the base language detection
if (in_array($detected, ['ceb', 'hmn', 'ht', 'kk', 'ky', 'mg', 'mk', 'ml', 'ny', 'or', 'pa', 'rw', 'su', 'st', 'tg', 'ts', 'xx-Qaai'])) {
return;
}
if (!$result['is_reliable']) {
Logger::debug('Unreliable detection', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]);
return;
}
if ($original == $detected) {
return;
}
$available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true)));
if (!in_array($detected, $available)) {
Logger::debug('Unsupported language', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]);
return;
}
Logger::debug('Detected different language', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]);
$data['detected'] = [$detected => $result['language_probability'] / 100];
}