Merge pull request 'CLD: New plugin for language detection via CLD2' (#1425) from heluecht/friendica-addons:cld2 into 2023.09-rc
Reviewed-on: friendica/friendica-addons#1425
This commit is contained in:
commit
fbafa80815
85
cld/README.md
Normal file
85
cld/README.md
Normal file
|
@ -0,0 +1,85 @@
|
||||||
|
Compact Language Detector
|
||||||
|
===
|
||||||
|
CLD2 is an advanced language detection library with high reliability.
|
||||||
|
|
||||||
|
This addon depends on the CLD PHP module which is not included in any Linux distribution.
|
||||||
|
It needs to be built and installed by hand, which is not totally straightforward.
|
||||||
|
|
||||||
|
Prerequisite
|
||||||
|
---
|
||||||
|
To be able to build the extension, you need the CLD module and the files for the PHP module development.
|
||||||
|
On Debian you install the packages php-dev, libcld2-dev and libcld2-0.
|
||||||
|
Make sure to have installed the correct PHP version.
|
||||||
|
This means that if you have both PHP 8.0 and 8.2 on your system, you also have to install php8.0-dev.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
---
|
||||||
|
The original PHP extension is https://github.com/fntlnz/cld2-php-ext.
|
||||||
|
However, it doesn't support PHP8.
|
||||||
|
So https://github.com/hiteule/cld2-php-ext/tree/support-php8 has to be used.
|
||||||
|
|
||||||
|
Download the source code:
|
||||||
|
```
|
||||||
|
wget https://github.com/hiteule/cld2-php-ext/archive/refs/heads/support-php8.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
Unzip it:
|
||||||
|
```
|
||||||
|
unzip support-php8.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
Change into the folder:
|
||||||
|
```
|
||||||
|
cd cld2-php-ext-support-php8/
|
||||||
|
```
|
||||||
|
|
||||||
|
Configure for the PHP API version:
|
||||||
|
```
|
||||||
|
phpize
|
||||||
|
```
|
||||||
|
(if you have got several PHP versions on your system, execute the command with the version that you run Friendica with, e.g. `phpize8.0`)
|
||||||
|
|
||||||
|
Create the Makefile:
|
||||||
|
```
|
||||||
|
./configure --with-cld2=/usr/include/cld2
|
||||||
|
```
|
||||||
|
|
||||||
|
Have a look at the line `checking for PHP includes`.
|
||||||
|
When the output (for example `/usr/include/php/20220829`) doesn't match the API version that you got from `phpize`, you have to change all the version codes in your `Makefile` afterwards.
|
||||||
|
|
||||||
|
Create the module:
|
||||||
|
```
|
||||||
|
make -j
|
||||||
|
```
|
||||||
|
|
||||||
|
Install it:
|
||||||
|
```
|
||||||
|
sudo make install
|
||||||
|
```
|
||||||
|
|
||||||
|
Change to the folder with the available modules. When you use PHP 8.2 on Debian it is:
|
||||||
|
```
|
||||||
|
cd /etc/php/8.2/mods-available
|
||||||
|
```
|
||||||
|
|
||||||
|
Create the file `cld2.ini` with this content:
|
||||||
|
```
|
||||||
|
; configuration for php cld2 module
|
||||||
|
; priority=20
|
||||||
|
extension=cld2.so
|
||||||
|
```
|
||||||
|
|
||||||
|
Enable the module for all versions and all SAPIs:
|
||||||
|
```
|
||||||
|
phpenmod -v ALL -s ALL cld2
|
||||||
|
```
|
||||||
|
|
||||||
|
Then restart Apache or PHP-FPM (or whatever you use) to load the changed configuration.
|
||||||
|
|
||||||
|
Call `/admin/phpinfo` on your webserver.
|
||||||
|
You then see the PHP Info.
|
||||||
|
Search for "cld2".
|
||||||
|
The module is installed when you find it there.
|
||||||
|
**Only proceed when the module is installed**
|
||||||
|
|
||||||
|
Now you can enable the addon.
|
88
cld/cld.php
Normal file
88
cld/cld.php
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
<?php
|
||||||
|
/**
|
||||||
|
* Name: Compact Language Detector
|
||||||
|
* Description: Improved language detection
|
||||||
|
* Version: 0.1
|
||||||
|
* Author: Michael Vogel <heluecht@pirati.ca>
|
||||||
|
*/
|
||||||
|
|
||||||
|
use Friendica\Core\Hook;
|
||||||
|
use Friendica\Core\Logger;
|
||||||
|
use Friendica\DI;
|
||||||
|
|
||||||
|
/**
 * Installs the addon by attaching its handler to the 'detect_languages' hook.
 *
 * @return void
 */
function cld_install()
{
	$hookName = 'detect_languages';
	$handler  = 'cld_detect_languages';

	Hook::register($hookName, __FILE__, $handler);
}
|
||||||
|
|
||||||
|
/**
 * Hook handler for 'detect_languages': overrides the detected language with the CLD2 result.
 *
 * The text in $data['text'] is handed to the CLD2 detector. $data['detected'] is replaced by
 * a single entry (language code => probability / 100) only when all of the following hold:
 *  - the "cld2" PHP extension is loaded,
 *  - the detected language is not in the list of languages unsupported by the base detection,
 *  - CLD2 flags the result as reliable,
 *  - the detected language differs from the first key of the current $data['detected'],
 *  - the detected language is among the available languages reported by DI::l10n().
 * In every other case $data is left untouched.
 *
 * @param array $data Hook data. The keys 'text', 'detected' and 'uri-id' are read;
 *                    'detected' may be overwritten.
 *
 * @return void
 */
function cld_detect_languages(array &$data)
{
	if (!extension_loaded('cld2')) {
		Logger::warning('CLD2 is not installed.');
		return;
	}

	$cld2 = new \CLD2Detector();
	$cld2->setEncodingHint(\CLD2Encoding::UTF8); // optional, hints about text encoding
	$cld2->setPlainText(true);

	$text   = $data['text'] ?? '';
	$result = $cld2->detect($text);

	// The first key of the current detection result is treated as the "original" language.
	$previous = $data['detected'] ?? [];
	$original = (is_array($previous) && $previous) ? (string)array_key_first($previous) : '';

	// Translate codes reported by CLD2 into the codes this addon compares against.
	$aliases = [
		'pt'      => 'pt-PT',
		'az'      => 'az-Latn',
		'bs'      => 'bs-Latn',
		'el'      => 'el-monoton',
		'ht'      => 'fr',
		'iw'      => 'he',
		'jw'      => 'jv',
		'ms'      => 'ms-Latn',
		'no'      => 'nb',
		'sr'      => 'sr-Cyrl',
		'zh'      => 'zh-Hans',
		'zh-Hant' => 'zh-hant',
	];

	$code     = (string)($result['language_code'] ?? '');
	$detected = $aliases[$code] ?? $code;

	// languages that aren't supported via the base language detection
	// ('ht' is not listed: it has already been remapped to 'fr' by the alias table)
	$unsupported = ['ceb', 'hmn', 'kk', 'ky', 'mg', 'mk', 'ml', 'ny', 'or', 'pa', 'rw', 'su', 'st', 'tg', 'ts', 'xx-Qaai'];
	if (in_array($detected, $unsupported, true)) {
		return;
	}

	// Shared context for all debug messages below.
	$context = [
		'uri-id'      => $data['uri-id'] ?? null,
		'original'    => $original,
		'detected'    => $detected,
		'name'        => $result['language_name'] ?? null,
		'probability' => $result['language_probability'] ?? null,
		'text'        => $text,
	];

	if (empty($result['is_reliable'])) {
		Logger::debug('Unreliable detection', $context);
		return;
	}

	// Nothing to do when CLD2 agrees with the existing detection.
	if ($original === $detected) {
		return;
	}

	$available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true)));

	// array_keys() turns numeric-looking keys into integers, so compare as strings.
	if (!in_array($detected, array_map('strval', $available), true)) {
		Logger::debug('Unsupported language', $context);
		return;
	}

	Logger::debug('Detected different language', $context);
	$data['detected'] = [$detected => $result['language_probability'] / 100];
}
|
Loading…
Reference in a new issue