2023-09-30 17:56:50 +02:00
< ? php
/**
 * Name: Compact Language Detector
 * Description: Improved language detection
 * Version: 0.1
 * Author: Michael Vogel <heluecht@pirati.ca>
 */
use Friendica\Core\Hook ;
use Friendica\Core\Logger ;
use Friendica\DI ;
/**
 * Registers this file's cld_detect_languages() as a handler for the
 * 'detect_languages' hook.
 *
 * NOTE(review): presumably called by the addon system when the addon is
 * installed - confirm against the addon loader.
 *
 * @return void
 */
function cld_install()
{
	Hook::register('detect_languages', __FILE__, 'cld_detect_languages');
}
2023-10-01 06:14:10 +02:00
/**
 * Hook handler for 'detect_languages': re-checks the language of a text with
 * the CLD2 extension and adjusts the list of detected languages in place.
 *
 * Reads $data['text'] (the text to analyse), $data['detected'] (an array keyed
 * by language code whose first key is treated as the previously detected
 * language) and $data['uri-id'] (only used in log output). Only
 * $data['detected'] is ever modified.
 *
 * Outcomes:
 *  - cld2 extension or its classes missing: a warning is logged, no change.
 *  - three-letter or blacklisted language code: no change.
 *  - unreliable detection: the stored value of the first language is raised
 *    when CLD2 names the same language with a higher probability; otherwise
 *    no change.
 *  - language not among the codes known to the l10n service: no change.
 *  - otherwise: the CLD2 language is moved to the front of $data['detected']
 *    while the number of entries stays the same.
 *
 * @param array $data Hook data, passed by reference
 *
 * @return void
 */
function cld_detect_languages(array &$data)
{
	if (!in_array('cld2', get_loaded_extensions())) {
		Logger::warning('CLD2 is not installed.');
		return;
	}

	// Both classes of the extension are required; they are probed in this order.
	$required_classes = [
		'CLD2Detector' => 'CLD2Detector class does not exist.',
		'CLD2Encoding' => 'CLD2Encoding class does not exist.',
	];
	foreach ($required_classes as $class_name => $warning) {
		if (!class_exists($class_name)) {
			Logger::warning($warning);
			return;
		}
	}

	$detector = new \CLD2Detector();
	$detector->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding
	$detector->setPlainText(true);

	$result = $detector->detect($data['text']);

	// The first key of the incoming list is the language detected so far.
	$original = $data['detected'] ? array_key_first($data['detected']) : '';

	$detected = DI::l10n()->toISO6391($result['language_code']);

	// languages that aren't supported via the base language detection or tend to false detections
	$ignored = ['ht', 'kk', 'ku', 'ky', 'lg', 'mg', 'mk', 'mt', 'ny', 'rw', 'st', 'su', 'tg', 'ts', 'xx'];
	if ((strlen($detected) == 3) || in_array($detected, $ignored)) {
		return;
	}

	// The same context is attached to every debug message below. It is built
	// lazily so that nothing is read from $data or $result unless a message
	// is actually logged.
	$log_context = function () use (&$data, $original, $detected, $result) {
		return [
			'uri-id'      => $data['uri-id'],
			'original'    => $original,
			'detected'    => $detected,
			'name'        => $result['language_name'],
			'probability' => $result['language_probability'],
			'text'        => $data['text'],
		];
	};

	if (!$result['is_reliable']) {
		Logger::debug('Unreliable detection', $log_context());

		// An unreliable result may only confirm the existing first language,
		// and only by raising its stored value.
		if ($original == $detected) {
			$current     = $data['detected'][$original];
			$probability = $result['language_probability'] / 100;
			if ($current < $probability) {
				$data['detected'][$original] = $probability;
			}
		}
		return;
	}

	$language_codes = DI::l10n()->getLanguageCodes();
	if (!in_array($detected, array_keys($language_codes))) {
		Logger::debug('Unsupported language', $log_context());
		return;
	}

	if ($original != $detected) {
		Logger::debug('Detected different language', $log_context());
	}

	$count = count($data['detected']);
	if ($count == 0) {
		$data['detected'] = [$detected => $result['language_probability'] / 100];
		return;
	}

	// Put the CLD2 language first and keep the list at its previous size: if
	// the language was already listed, its old entry is removed; if it was
	// not, the last entry is dropped by the slice.
	unset($data['detected'][$detected]);
	$leading  = [$detected => $result['language_probability'] / 100];
	$trailing = array_slice($data['detected'], 0, $count - 1);

	$data['detected'] = array_merge($leading, $trailing);
}