mirror of
https://github.com/friendica/friendica
synced 2025-01-15 11:37:23 +01:00
Revert "Update languagedetect library"
This commit is contained in:
parent
c22920edba
commit
071946fa78
101 changed files with 3632 additions and 311 deletions
|
@ -16,8 +16,7 @@
|
|||
"ezyang/htmlpurifier": "~4.7.0",
|
||||
"mobiledetect/mobiledetectlib": "2.8.*",
|
||||
"league/html-to-markdown": "~4.4.1",
|
||||
"pear-pear.php.net/Text_Highlighter": "*",
|
||||
"pear-pear.php.net/Text_LanguageDetect": "*"
|
||||
"pear-pear.php.net/Text_Highlighter": "*"
|
||||
},
|
||||
"repositories": [
|
||||
{
|
||||
|
|
60
composer.lock
generated
60
composer.lock
generated
|
@ -4,7 +4,7 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "7499dcab40af67a3f23036e3a8d9587f",
|
||||
"content-hash": "802372ddf124ef949e80dd8dc1d38797",
|
||||
"packages": [
|
||||
{
|
||||
"name": "ezyang/htmlpurifier",
|
||||
|
@ -116,16 +116,16 @@
|
|||
},
|
||||
{
|
||||
"name": "mobiledetect/mobiledetectlib",
|
||||
"version": "2.8.26",
|
||||
"version": "2.8.25",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/serbanghita/Mobile-Detect.git",
|
||||
"reference": "a0ed86c9d7c04ae27fa6418b55e3beb04dfe3297"
|
||||
"reference": "f0896b5c7274d1450023b0b376240be902c3251c"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/serbanghita/Mobile-Detect/zipball/a0ed86c9d7c04ae27fa6418b55e3beb04dfe3297",
|
||||
"reference": "a0ed86c9d7c04ae27fa6418b55e3beb04dfe3297",
|
||||
"url": "https://api.github.com/repos/serbanghita/Mobile-Detect/zipball/f0896b5c7274d1450023b0b376240be902c3251c",
|
||||
"reference": "f0896b5c7274d1450023b0b376240be902c3251c",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
|
@ -164,14 +164,14 @@
|
|||
"mobile detector",
|
||||
"php mobile detect"
|
||||
],
|
||||
"time": "2017-08-29T18:23:54+00:00"
|
||||
"time": "2017-03-29T13:59:30+00:00"
|
||||
},
|
||||
{
|
||||
"name": "pear-pear.php.net/Archive_Tar",
|
||||
"version": "1.4.3",
|
||||
"version": "1.4.2",
|
||||
"dist": {
|
||||
"type": "file",
|
||||
"url": "https://pear.php.net/get/Archive_Tar-1.4.3.tgz",
|
||||
"url": "https://pear.php.net/get/Archive_Tar-1.4.2.tgz",
|
||||
"reference": null,
|
||||
"shasum": null
|
||||
},
|
||||
|
@ -179,7 +179,7 @@
|
|||
"php": ">=5.2.0.0"
|
||||
},
|
||||
"replace": {
|
||||
"pear-pear/archive_tar": "== 1.4.3.0"
|
||||
"pear-pear/archive_tar": "== 1.4.2.0"
|
||||
},
|
||||
"type": "pear-library",
|
||||
"autoload": {
|
||||
|
@ -226,10 +226,10 @@
|
|||
},
|
||||
{
|
||||
"name": "pear-pear.php.net/PEAR",
|
||||
"version": "1.10.5",
|
||||
"version": "1.10.4",
|
||||
"dist": {
|
||||
"type": "file",
|
||||
"url": "https://pear.php.net/get/PEAR-1.10.5.tgz",
|
||||
"url": "https://pear.php.net/get/PEAR-1.10.4.tgz",
|
||||
"reference": null,
|
||||
"shasum": null
|
||||
},
|
||||
|
@ -247,7 +247,7 @@
|
|||
"pear-pear.php.net/pear_frontend_web": "<=0.4.0.0"
|
||||
},
|
||||
"replace": {
|
||||
"pear-pear/pear": "== 1.10.5.0"
|
||||
"pear-pear/pear": "== 1.10.4.0"
|
||||
},
|
||||
"type": "pear-library",
|
||||
"autoload": {
|
||||
|
@ -324,36 +324,6 @@
|
|||
],
|
||||
"description": "Text_Highlighter is a package for syntax highlighting.\n\nIt provides a base class provining all the functionality,\nand a descendent classes geneator class.\n\nThe main idea is to simplify creation of subclasses\nimplementing syntax highlighting for particular language.\nSubclasses do not implement any new functioanality,\nthey just provide syntax highlighting rules.\nThe rules sources are in XML format.\n\nTo create a highlighter for a language, there is no need\nto code a new class manually. Simply describe the rules\nin XML file and use Text_Highlighter_Generator to create\na new class."
|
||||
},
|
||||
{
|
||||
"name": "pear-pear.php.net/Text_LanguageDetect",
|
||||
"version": "1.0.0",
|
||||
"dist": {
|
||||
"type": "file",
|
||||
"url": "https://pear.php.net/get/Text_LanguageDetect-1.0.0.tgz",
|
||||
"reference": null,
|
||||
"shasum": null
|
||||
},
|
||||
"require": {
|
||||
"ext-pcre": "*",
|
||||
"php": ">=5.4.0.0"
|
||||
},
|
||||
"replace": {
|
||||
"pear-pear/text_languagedetect": "== 1.0.0.0"
|
||||
},
|
||||
"type": "pear-library",
|
||||
"autoload": {
|
||||
"classmap": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"include-path": [
|
||||
"/"
|
||||
],
|
||||
"license": [
|
||||
"BSD"
|
||||
],
|
||||
"description": "Text_LanguageDetect can identify 52 human languages from text samples and return confidence scores for each."
|
||||
},
|
||||
{
|
||||
"name": "pear-pear.php.net/XML_Parser",
|
||||
"version": "1.3.7",
|
||||
|
@ -386,10 +356,10 @@
|
|||
},
|
||||
{
|
||||
"name": "pear-pear.php.net/XML_Util",
|
||||
"version": "1.4.3",
|
||||
"version": "1.4.2",
|
||||
"dist": {
|
||||
"type": "file",
|
||||
"url": "https://pear.php.net/get/XML_Util-1.4.3.tgz",
|
||||
"url": "https://pear.php.net/get/XML_Util-1.4.2.tgz",
|
||||
"reference": null,
|
||||
"shasum": null
|
||||
},
|
||||
|
@ -398,7 +368,7 @@
|
|||
"php": ">=5.4.0.0"
|
||||
},
|
||||
"replace": {
|
||||
"pear-pear/xml_util": "== 1.4.3.0"
|
||||
"pear-pear/xml_util": "== 1.4.2.0"
|
||||
},
|
||||
"type": "pear-library",
|
||||
"autoload": {
|
||||
|
|
|
@ -364,9 +364,13 @@ function item_add_language_opt(&$arr) {
|
|||
$postopts = "";
|
||||
}
|
||||
|
||||
require_once('library/langdet/Text/LanguageDetect.php');
|
||||
|
||||
$naked_body = preg_replace('/\[(.+?)\]/','', $arr['body']);
|
||||
$LanguageDetect = new Text_LanguageDetect();
|
||||
$lng = $LanguageDetect->detect($naked_body, 3);
|
||||
$l = new Text_LanguageDetect();
|
||||
//$lng = $l->detectConfidence($naked_body);
|
||||
//$arr['postopts'] = (($lng['language']) ? 'lang=' . $lng['language'] . ';' . $lng['confidence'] : '');
|
||||
$lng = $l->detect($naked_body, 3);
|
||||
|
||||
if (sizeof($lng) > 0) {
|
||||
if ($postopts != "") $postopts .= '&'; // arbitrary separator, to be reviewed
|
||||
|
|
157
library/langdet/README.rst
Normal file
157
library/langdet/README.rst
Normal file
|
@ -0,0 +1,157 @@
|
|||
*******************
|
||||
Text_LanguageDetect
|
||||
*******************
|
||||
PHP library to identify human languages from text samples.
|
||||
Returns confidence scores for each.
|
||||
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
PEAR
|
||||
----
|
||||
::
|
||||
|
||||
$ pear install Text_LanguageDetect
|
||||
|
||||
Composer
|
||||
--------
|
||||
::
|
||||
|
||||
$ composer require pear/text_languagedetect
|
||||
|
||||
|
||||
Usage
|
||||
=====
|
||||
Also see the examples in the ``docs/`` directory and
|
||||
the `official documentation`__.
|
||||
|
||||
__ http://pear.php.net/package/Text_LanguageDetect/docs
|
||||
|
||||
Language detection
|
||||
------------------
|
||||
Simple language detection::
|
||||
|
||||
<?php
|
||||
require_once 'Text/LanguageDetect.php';
|
||||
|
||||
$text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
|
||||
|
||||
$ld = new Text_LanguageDetect();
|
||||
$language = $ld->detectSimple($text);
|
||||
|
||||
echo $language;
|
||||
//output: german
|
||||
|
||||
Show the three most probable languages with their confidence score::
|
||||
|
||||
<?php
|
||||
require_once 'Text/LanguageDetect.php';
|
||||
|
||||
$text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
|
||||
|
||||
$ld = new Text_LanguageDetect();
|
||||
//3 most probable languages
|
||||
$results = $ld->detect($text, 3);
|
||||
|
||||
foreach ($results as $language => $confidence) {
|
||||
echo $language . ': ' . number_format($confidence, 2) . "\n";
|
||||
}
|
||||
|
||||
//output:
|
||||
//german: 0.35
|
||||
//dutch: 0.25
|
||||
//swedish: 0.20
|
||||
?>
|
||||
|
||||
|
||||
Language code
|
||||
-------------
|
||||
Instead of returning the full language name, ISO 639-1 two-letter or
|
||||
ISO 639-2 three-letter codes can be returned::
|
||||
|
||||
<?php
|
||||
require_once 'Text/LanguageDetect.php';
|
||||
$ld = new Text_LanguageDetect();
|
||||
|
||||
//will output the ISO 639-1 two-letter language code
|
||||
// "de"
|
||||
$ld->setNameMode(2);
|
||||
echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
|
||||
|
||||
//will output the ISO 639-2 three-letter language code
|
||||
// "deu"
|
||||
$ld->setNameMode(3);
|
||||
echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
|
||||
?>
|
||||
|
||||
|
||||
Supported languages
|
||||
===================
|
||||
- albanian
|
||||
- arabic
|
||||
- azeri
|
||||
- bengali
|
||||
- bulgarian
|
||||
- cebuano
|
||||
- croatian
|
||||
- czech
|
||||
- danish
|
||||
- dutch
|
||||
- english
|
||||
- estonian
|
||||
- farsi
|
||||
- finnish
|
||||
- french
|
||||
- german
|
||||
- hausa
|
||||
- hawaiian
|
||||
- hindi
|
||||
- hungarian
|
||||
- icelandic
|
||||
- indonesian
|
||||
- italian
|
||||
- kazakh
|
||||
- kyrgyz
|
||||
- latin
|
||||
- latvian
|
||||
- lithuanian
|
||||
- macedonian
|
||||
- mongolian
|
||||
- nepali
|
||||
- norwegian
|
||||
- pashto
|
||||
- pidgin
|
||||
- polish
|
||||
- portuguese
|
||||
- romanian
|
||||
- russian
|
||||
- serbian
|
||||
- slovak
|
||||
- slovene
|
||||
- somali
|
||||
- spanish
|
||||
- swahili
|
||||
- swedish
|
||||
- tagalog
|
||||
- turkish
|
||||
- ukrainian
|
||||
- urdu
|
||||
- uzbek
|
||||
- vietnamese
|
||||
- welsh
|
||||
|
||||
|
||||
Links
|
||||
=====
|
||||
Homepage
|
||||
http://pear.php.net/package/Text_LanguageDetect
|
||||
Bug tracker
|
||||
http://pear.php.net/bugs/search.php?cmd=display&package_name[]=Text_LanguageDetect
|
||||
Documentation
|
||||
http://pear.php.net/package/Text_LanguageDetect/docs
|
||||
Unit test status
|
||||
https://travis-ci.org/pear/Text_LanguageDetect
|
||||
|
||||
.. image:: https://travis-ci.org/pear/Text_LanguageDetect.svg?branch=master
|
||||
:target: https://travis-ci.org/pear/Text_LanguageDetect
|
|
@ -58,7 +58,7 @@ require_once 'Text/LanguageDetect/ISO639.php';
|
|||
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
|
||||
* @copyright 2005 Nicholas Pisarro
|
||||
* @license BSD http://www.opensource.org/licenses/bsd-license.php
|
||||
* @version Release: 1.0.0
|
||||
* @version Release: @package_version@
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
*/
|
||||
class Text_LanguageDetect
|
||||
|
@ -90,7 +90,7 @@ class Text_LanguageDetect
|
|||
*
|
||||
* @var string
|
||||
*/
|
||||
protected $_data_dir = 'D:\Mes Projets\Friendica\friendica\vendor/pear-pear.php.net/Text_LanguageDetect/data';
|
||||
protected $_data_dir = '@data_dir@';
|
||||
|
||||
/**
|
||||
* The trigram data for comparison
|
||||
|
@ -196,7 +196,7 @@ class Text_LanguageDetect
|
|||
|
||||
} elseif ($this->_data_dir != '@' . 'data_dir' . '@') {
|
||||
// if the data dir was set by the PEAR installer, use that
|
||||
return $this->_data_dir . '/' . $fname;
|
||||
return $this->_data_dir . '/Text_LanguageDetect/' . $fname;
|
||||
|
||||
} else {
|
||||
// assume this was just unpacked somewhere
|
|
@ -27,7 +27,7 @@
|
|||
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
|
||||
* @copyright 2006 Nicholas Pisarro
|
||||
* @license BSD http://www.opensource.org/licenses/bsd-license.php
|
||||
* @version Release: 1.0.0
|
||||
* @version Release: @package_version@
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
*/
|
||||
class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
7
library/langdet/data/build-unicode_blocks.php
Normal file
7
library/langdet/data/build-unicode_blocks.php
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
/**
 * Generate the serialized unicode_blocks.dat file shipped with the package.
 *
 * Loads the table returned by unicode_blocks.php (located next to this
 * script) and writes its serialize()d form to unicode_blocks.dat in the same
 * directory. Stops with exit status 1 and a message on stderr when the table
 * cannot be loaded or the data file cannot be written, so a broken .dat file
 * is never produced silently.
 */
$source = __DIR__ . '/unicode_blocks.php';
$target = __DIR__ . '/unicode_blocks.dat';

// include yields false when the file cannot be read; serializing that value
// would write a useless "b:0;" data file
$unicode_blocks = include $source;
if (!is_array($unicode_blocks)) {
    file_put_contents('php://stderr', "Could not load the block table from $source\n");
    exit(1);
}

// file_put_contents() returns false on failure
if (file_put_contents($target, serialize($unicode_blocks)) === false) {
    file_put_contents('php://stderr', "Could not write $target\n");
    exit(1);
}
?>
|
874
library/langdet/data/unicode_blocks.php
Normal file
874
library/langdet/data/unicode_blocks.php
Normal file
|
@ -0,0 +1,874 @@
|
|||
<?php
/**
 * Unicode block table.
 *
 * Returns a list with one row per block; each row holds, in this order,
 * the first code point, the last code point and the block name.
 * Rows are listed in ascending code point order and are indexed 0..144.
 */
return array(
    array(0x0000, 0x007F, 'Basic Latin'),
    array(0x0080, 0x00FF, 'Latin-1 Supplement'),
    array(0x0100, 0x017F, 'Latin Extended-A'),
    array(0x0180, 0x024F, 'Latin Extended-B'),
    array(0x0250, 0x02AF, 'IPA Extensions'),
    array(0x02B0, 0x02FF, 'Spacing Modifier Letters'),
    array(0x0300, 0x036F, 'Combining Diacritical Marks'),
    array(0x0370, 0x03FF, 'Greek and Coptic'),
    array(0x0400, 0x04FF, 'Cyrillic'),
    array(0x0500, 0x052F, 'Cyrillic Supplement'),
    array(0x0530, 0x058F, 'Armenian'),
    array(0x0590, 0x05FF, 'Hebrew'),
    array(0x0600, 0x06FF, 'Arabic'),
    array(0x0700, 0x074F, 'Syriac'),
    array(0x0750, 0x077F, 'Arabic Supplement'),
    array(0x0780, 0x07BF, 'Thaana'),
    array(0x0900, 0x097F, 'Devanagari'),
    array(0x0980, 0x09FF, 'Bengali'),
    array(0x0A00, 0x0A7F, 'Gurmukhi'),
    array(0x0A80, 0x0AFF, 'Gujarati'),
    array(0x0B00, 0x0B7F, 'Oriya'),
    array(0x0B80, 0x0BFF, 'Tamil'),
    array(0x0C00, 0x0C7F, 'Telugu'),
    array(0x0C80, 0x0CFF, 'Kannada'),
    array(0x0D00, 0x0D7F, 'Malayalam'),
    array(0x0D80, 0x0DFF, 'Sinhala'),
    array(0x0E00, 0x0E7F, 'Thai'),
    array(0x0E80, 0x0EFF, 'Lao'),
    array(0x0F00, 0x0FFF, 'Tibetan'),
    array(0x1000, 0x109F, 'Myanmar'),
    array(0x10A0, 0x10FF, 'Georgian'),
    array(0x1100, 0x11FF, 'Hangul Jamo'),
    array(0x1200, 0x137F, 'Ethiopic'),
    array(0x1380, 0x139F, 'Ethiopic Supplement'),
    array(0x13A0, 0x13FF, 'Cherokee'),
    array(0x1400, 0x167F, 'Unified Canadian Aboriginal Syllabics'),
    array(0x1680, 0x169F, 'Ogham'),
    array(0x16A0, 0x16FF, 'Runic'),
    array(0x1700, 0x171F, 'Tagalog'),
    array(0x1720, 0x173F, 'Hanunoo'),
    array(0x1740, 0x175F, 'Buhid'),
    array(0x1760, 0x177F, 'Tagbanwa'),
    array(0x1780, 0x17FF, 'Khmer'),
    array(0x1800, 0x18AF, 'Mongolian'),
    array(0x1900, 0x194F, 'Limbu'),
    array(0x1950, 0x197F, 'Tai Le'),
    array(0x1980, 0x19DF, 'New Tai Lue'),
    array(0x19E0, 0x19FF, 'Khmer Symbols'),
    array(0x1A00, 0x1A1F, 'Buginese'),
    array(0x1D00, 0x1D7F, 'Phonetic Extensions'),
    array(0x1D80, 0x1DBF, 'Phonetic Extensions Supplement'),
    array(0x1DC0, 0x1DFF, 'Combining Diacritical Marks Supplement'),
    array(0x1E00, 0x1EFF, 'Latin Extended Additional'),
    array(0x1F00, 0x1FFF, 'Greek Extended'),
    array(0x2000, 0x206F, 'General Punctuation'),
    array(0x2070, 0x209F, 'Superscripts and Subscripts'),
    array(0x20A0, 0x20CF, 'Currency Symbols'),
    array(0x20D0, 0x20FF, 'Combining Diacritical Marks for Symbols'),
    array(0x2100, 0x214F, 'Letterlike Symbols'),
    array(0x2150, 0x218F, 'Number Forms'),
    array(0x2190, 0x21FF, 'Arrows'),
    array(0x2200, 0x22FF, 'Mathematical Operators'),
    array(0x2300, 0x23FF, 'Miscellaneous Technical'),
    array(0x2400, 0x243F, 'Control Pictures'),
    array(0x2440, 0x245F, 'Optical Character Recognition'),
    array(0x2460, 0x24FF, 'Enclosed Alphanumerics'),
    array(0x2500, 0x257F, 'Box Drawing'),
    array(0x2580, 0x259F, 'Block Elements'),
    array(0x25A0, 0x25FF, 'Geometric Shapes'),
    array(0x2600, 0x26FF, 'Miscellaneous Symbols'),
    array(0x2700, 0x27BF, 'Dingbats'),
    array(0x27C0, 0x27EF, 'Miscellaneous Mathematical Symbols-A'),
    array(0x27F0, 0x27FF, 'Supplemental Arrows-A'),
    array(0x2800, 0x28FF, 'Braille Patterns'),
    array(0x2900, 0x297F, 'Supplemental Arrows-B'),
    array(0x2980, 0x29FF, 'Miscellaneous Mathematical Symbols-B'),
    array(0x2A00, 0x2AFF, 'Supplemental Mathematical Operators'),
    array(0x2B00, 0x2BFF, 'Miscellaneous Symbols and Arrows'),
    array(0x2C00, 0x2C5F, 'Glagolitic'),
    array(0x2C80, 0x2CFF, 'Coptic'),
    array(0x2D00, 0x2D2F, 'Georgian Supplement'),
    array(0x2D30, 0x2D7F, 'Tifinagh'),
    array(0x2D80, 0x2DDF, 'Ethiopic Extended'),
    array(0x2E00, 0x2E7F, 'Supplemental Punctuation'),
    array(0x2E80, 0x2EFF, 'CJK Radicals Supplement'),
    array(0x2F00, 0x2FDF, 'Kangxi Radicals'),
    array(0x2FF0, 0x2FFF, 'Ideographic Description Characters'),
    array(0x3000, 0x303F, 'CJK Symbols and Punctuation'),
    array(0x3040, 0x309F, 'Hiragana'),
    array(0x30A0, 0x30FF, 'Katakana'),
    array(0x3100, 0x312F, 'Bopomofo'),
    array(0x3130, 0x318F, 'Hangul Compatibility Jamo'),
    array(0x3190, 0x319F, 'Kanbun'),
    array(0x31A0, 0x31BF, 'Bopomofo Extended'),
    array(0x31C0, 0x31EF, 'CJK Strokes'),
    array(0x31F0, 0x31FF, 'Katakana Phonetic Extensions'),
    array(0x3200, 0x32FF, 'Enclosed CJK Letters and Months'),
    array(0x3300, 0x33FF, 'CJK Compatibility'),
    array(0x3400, 0x4DBF, 'CJK Unified Ideographs Extension A'),
    array(0x4DC0, 0x4DFF, 'Yijing Hexagram Symbols'),
    array(0x4E00, 0x9FFF, 'CJK Unified Ideographs'),
    array(0xA000, 0xA48F, 'Yi Syllables'),
    array(0xA490, 0xA4CF, 'Yi Radicals'),
    array(0xA700, 0xA71F, 'Modifier Tone Letters'),
    array(0xA800, 0xA82F, 'Syloti Nagri'),
    array(0xAC00, 0xD7AF, 'Hangul Syllables'),
    array(0xD800, 0xDB7F, 'High Surrogates'),
    array(0xDB80, 0xDBFF, 'High Private Use Surrogates'),
    array(0xDC00, 0xDFFF, 'Low Surrogates'),
    array(0xE000, 0xF8FF, 'Private Use Area'),
    array(0xF900, 0xFAFF, 'CJK Compatibility Ideographs'),
    array(0xFB00, 0xFB4F, 'Alphabetic Presentation Forms'),
    array(0xFB50, 0xFDFF, 'Arabic Presentation Forms-A'),
    array(0xFE00, 0xFE0F, 'Variation Selectors'),
    array(0xFE10, 0xFE1F, 'Vertical Forms'),
    array(0xFE20, 0xFE2F, 'Combining Half Marks'),
    array(0xFE30, 0xFE4F, 'CJK Compatibility Forms'),
    array(0xFE50, 0xFE6F, 'Small Form Variants'),
    array(0xFE70, 0xFEFF, 'Arabic Presentation Forms-B'),
    array(0xFF00, 0xFFEF, 'Halfwidth and Fullwidth Forms'),
    array(0xFFF0, 0xFFFF, 'Specials'),
    array(0x10000, 0x1007F, 'Linear B Syllabary'),
    array(0x10080, 0x100FF, 'Linear B Ideograms'),
    array(0x10100, 0x1013F, 'Aegean Numbers'),
    array(0x10140, 0x1018F, 'Ancient Greek Numbers'),
    array(0x10300, 0x1032F, 'Old Italic'),
    array(0x10330, 0x1034F, 'Gothic'),
    array(0x10380, 0x1039F, 'Ugaritic'),
    array(0x103A0, 0x103DF, 'Old Persian'),
    array(0x10400, 0x1044F, 'Deseret'),
    array(0x10450, 0x1047F, 'Shavian'),
    array(0x10480, 0x104AF, 'Osmanya'),
    array(0x10800, 0x1083F, 'Cypriot Syllabary'),
    array(0x10A00, 0x10A5F, 'Kharoshthi'),
    array(0x1D000, 0x1D0FF, 'Byzantine Musical Symbols'),
    array(0x1D100, 0x1D1FF, 'Musical Symbols'),
    array(0x1D200, 0x1D24F, 'Ancient Greek Musical Notation'),
    array(0x1D300, 0x1D35F, 'Tai Xuan Jing Symbols'),
    array(0x1D400, 0x1D7FF, 'Mathematical Alphanumeric Symbols'),
    array(0x20000, 0x2A6DF, 'CJK Unified Ideographs Extension B'),
    array(0x2F800, 0x2FA1F, 'CJK Compatibility Ideographs Supplement'),
    array(0xE0000, 0xE007F, 'Tags'),
    array(0xE0100, 0xE01EF, 'Variation Selectors Supplement'),
    array(0xF0000, 0xFFFFF, 'Supplementary Private Use Area-A'),
    array(0x100000, 0x10FFFF, 'Supplementary Private Use Area-B'),
);
?>
|
18
library/langdet/docs/confidence.php
Normal file
18
library/langdet/docs/confidence.php
Normal file
|
@ -0,0 +1,18 @@
|
|||
<?php
/**
 * Print the most probable languages of a sample text together with their
 * confidence scores, one "language: score" pair per line.
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$sample   = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';

// ask for the 3 most probable languages
$ranking = $detector->detect($sample, 3);

foreach ($ranking as $name => $score) {
    echo $name, ': ', number_format($score, 2), "\n";
}

// example output:
// german: 0.35
// dutch: 0.25
// swedish: 0.20
?>
|
15
library/langdet/docs/errorhandling.php
Normal file
15
library/langdet/docs/errorhandling.php
Normal file
|
@ -0,0 +1,15 @@
|
|||
<?php
/**
 * How to handle errors
 *
 * The detection calls are wrapped in try/catch; a
 * Text_LanguageDetect_Exception is caught and its message is reported.
 */
require_once 'Text/LanguageDetect.php';
require_once 'Text/LanguageDetect/Exception.php';

try {
    $ld = new Text_LanguageDetect();
    $lang = $ld->detectSimple('Das ist ein kleiner Text');
    echo "Language is: $lang\n";
} catch (Text_LanguageDetect_Exception $e) {
    // use getMessage(): concatenating the exception object itself would print
    // its full string form (class, file, line, stack trace), not the message
    echo 'An error occurred! Message: ' . $e->getMessage() . "\n";
}
?>
|
35
library/langdet/docs/example_clui.php
Normal file
35
library/langdet/docs/example_clui.php
Normal file
|
@ -0,0 +1,35 @@
|
|||
<?php

/**
 * example usage (CLI)
 *
 * Prints the sorted list of supported languages and their count, then reads
 * standard input line by line and prints, for every line, the result of
 * detect() (limit 4) and of detectUnicodeBlocks().
 *
 * @package Text_LanguageDetect
 * @version CVS: $Id$
 */

require_once 'Text/LanguageDetect.php';

$l = new Text_LanguageDetect;

$stdin = fopen('php://stdin', 'r');

echo "Supported languages:\n";
$langs = $l->getLanguages();
sort($langs);
echo implode(', ', $langs);

echo "\ntotal ", count($langs), "\n\n";

// Compare against false explicitly: fgets() returns false at end of input,
// while a final line consisting of "0" without a trailing newline is merely
// falsy and must not end the loop.
while (($line = fgets($stdin)) !== false) {
    $result = $l->detect($line, 4);
    print_r($result);
    $blocks = $l->detectUnicodeBlocks($line, true);
    print_r($blocks);
}

fclose($stdin);
unset($l);

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

?>
|
72
library/langdet/docs/example_web.php
Normal file
72
library/langdet/docs/example_web.php
Normal file
|
@ -0,0 +1,72 @@
|
|||
<?php

/**
 * example usage (web)
 *
 * Shows the supported languages and a form; when text is submitted in the
 * "q" request parameter, prints the language reported by detectConfidence()
 * and the unicode blocks reported by detectUnicodeBlocks().
 *
 * @package Text_LanguageDetect
 * @version CVS: $Id$
 */

// browsers will encode multi-byte characters wrong unless they think the page is utf8-encoded
header('Content-type: text/html; charset=utf-8', true);

require_once 'Text/LanguageDetect.php';

$l = new Text_LanguageDetect;

// $q is always defined so the template below never reads an unset variable
$q = '';
if (isset($_REQUEST['q']) && is_string($_REQUEST['q'])) {
    $q = $_REQUEST['q'];
    // Magic quotes no longer exist as of PHP 5.4; only undo them where they
    // can be active, otherwise stripslashes() would eat genuine backslashes.
    if (version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc()) {
        $q = stripslashes($q);
    }
}

?>
<html>
<head>
<title>Text_LanguageDetect demonstration</title>
</head>
<body>
<h2>Text_LanguageDetect</h2>
<?php
// full open tags are used throughout: short tags ("<?") depend on the
// short_open_tag setting and would otherwise be sent to the browser as text
echo "<small>Supported languages:\n";
$langs = $l->getLanguages();
sort($langs);
foreach ($langs as $lang) {
    echo ucfirst($lang), ', ';
}

echo '<br />total ', count($langs), '</small><br /><br />';

?>
<form method="post">
Enter text to identify language (at least a couple of sentences):<br />
<textarea name="q" wrap="virtual" cols="80" rows="8"><?php echo htmlspecialchars($q, ENT_QUOTES, 'UTF-8'); /* escape user input before echoing it back */ ?></textarea>
<br />
<input type="submit" value="Submit" />
</form>
<?php
if (strlen($q)) {
    $len = $l->utf8strlen($q);
    if ($len < 20) { // this value picked somewhat arbitrarily
        echo "Warning: string not very long ($len chars)<br />\n";
    }

    $result = $l->detectConfidence($q);

    if ($result == null) {
        echo "Text_LanguageDetect cannot identify this piece of text. <br /><br />\n";
    } else {
        echo "Text_LanguageDetect thinks this text is written in <b>{$result['language']}</b> ({$result['similarity']}, {$result['confidence']})<br /><br />\n";
    }

    $result = $l->detectUnicodeBlocks($q, false);
    if (!empty($result)) {
        arsort($result);
        echo "Unicode blocks present: ", implode(', ', array_keys($result)), "\n<br /><br />";
    }
}

unset($l);

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

?>
</body></html>
|
19
library/langdet/docs/iso.php
Normal file
19
library/langdet/docs/iso.php
Normal file
|
@ -0,0 +1,19 @@
|
|||
<?php
/**
 * Example: getting detection results as ISO language codes.
 *
 * The detector's "name mode" changes the way languages are accepted and
 * returned; this script runs the same sample through two of the modes.
 */
require_once 'Text/LanguageDetect.php';

$sample   = 'Das ist ein kleiner Text';
$detector = new Text_LanguageDetect();

// Name mode 2: ISO 639-1 two-letter language code, expected "de".
$detector->setNameMode(2);
$twoLetter = $detector->detectSimple($sample);
echo $twoLetter . "\n";

// Name mode 3: ISO 639-2 three-letter language code, expected "deu".
$detector->setNameMode(3);
$threeLetter = $detector->detectSimple($sample);
echo $threeLetter . "\n";
?>
|
11
library/langdet/docs/languages.php
Normal file
11
library/langdet/docs/languages.php
Normal file
|
@ -0,0 +1,11 @@
|
|||
<?php
/**
 * Example: print each language the detector supports, one per line.
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$names    = $detector->getLanguages();

foreach ($names as $name) {
    echo $name, "\n";
}
?>
|
10
library/langdet/docs/simple.php
Normal file
10
library/langdet/docs/simple.php
Normal file
|
@ -0,0 +1,10 @@
|
|||
<?php
/**
 * Example: detect the language of a single piece of text.
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$sample   = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';

// Expected detection result: german (shown in var_dump's own format).
var_dump($detector->detectSimple($sample));
?>
|
42
library/langdet/tests/PrivProxy.php
Normal file
42
library/langdet/tests/PrivProxy.php
Normal file
|
@ -0,0 +1,42 @@
|
|||
<?php
/**
 * Helper that enables access to private and protected methods and properties.
 *
 * Wraps an object and forwards method calls, property reads and property
 * writes to it through reflection, so that test code can reach members that
 * are not public.
 */
class PrivProxy
{
    /**
     * The wrapped object all calls and property accesses are forwarded to.
     *
     * @var object
     */
    private $obj;

    /**
     * @param object $obj Object whose members should be made reachable
     */
    public function __construct($obj)
    {
        $this->obj = $obj;
    }

    /**
     * Invokes the named method on the wrapped object, whatever its visibility.
     *
     * @param string $method    Method name
     * @param array  $arguments Arguments to pass on
     *
     * @return mixed Whatever the invoked method returns
     *
     * @throws ReflectionException When the wrapped object has no such method
     */
    public function __call($method, $arguments)
    {
        $rm = new ReflectionMethod($this->obj, $method);
        self::makeAccessible($rm);
        return $rm->invokeArgs($this->obj, $arguments);
    }

    /**
     * Rejects static calls on the proxy.
     *
     * A static call has no proxy instance and therefore no wrapped object to
     * forward to. The previous body dereferenced $this here, which aborted
     * every static call with "Using $this when not in object context"; a
     * catchable exception with an explanation is raised instead.
     *
     * @param string $method    Method name
     * @param array  $arguments Arguments (unused)
     *
     * @return void
     *
     * @throws BadMethodCallException Always
     */
    public static function __callStatic($method, $arguments)
    {
        throw new BadMethodCallException(
            'PrivProxy::' . $method . '() cannot be called statically: '
            . 'there is no wrapped object to forward to. '
            . 'Call the method on a PrivProxy instance instead.'
        );
    }

    /**
     * Writes a property of the wrapped object, whatever its visibility.
     *
     * @param string $var   Property name
     * @param mixed  $value New value
     *
     * @return void
     *
     * @throws ReflectionException When the wrapped object has no such property
     */
    public function __set($var, $value)
    {
        $rp = new ReflectionProperty($this->obj, $var);
        self::makeAccessible($rp);
        $rp->setValue($this->obj, $value);
    }

    /**
     * Reads a property of the wrapped object, whatever its visibility.
     *
     * @param string $var Property name
     *
     * @return mixed Current value of the property
     *
     * @throws ReflectionException When the wrapped object has no such property
     */
    public function __get($var)
    {
        $rp = new ReflectionProperty($this->obj, $var);
        self::makeAccessible($rp);
        return $rp->getValue($this->obj);
    }

    /**
     * Lifts the visibility restriction on a reflected method or property.
     *
     * Since PHP 8.1 reflected members are accessible by default and
     * setAccessible() has no effect, so it is only called on older runtimes.
     *
     * @param ReflectionMethod|ReflectionProperty $member Reflected member
     *
     * @return void
     */
    private static function makeAccessible($member)
    {
        if (PHP_VERSION_ID < 80100) {
            $member->setAccessible(true);
        }
    }
}
?>
|
2059
library/langdet/tests/Text_LanguageDetectTest.php
Normal file
2059
library/langdet/tests/Text_LanguageDetectTest.php
Normal file
File diff suppressed because it is too large
Load diff
72
library/langdet/tests/Text_LanguageDetect_ISO639Test.php
Normal file
72
library/langdet/tests/Text_LanguageDetect_ISO639Test.php
Normal file
|
@ -0,0 +1,72 @@
|
|||
<?php
|
||||
set_include_path(
|
||||
__DIR__ . '/../' . PATH_SEPARATOR . get_include_path()
|
||||
);
|
||||
|
||||
require_once 'Text/LanguageDetect/ISO639.php';
|
||||
|
||||
class Text_LanguageDetect_ISO639Test extends PHPUnit_Framework_TestCase
|
||||
{
|
||||
public function testNameToCode2()
|
||||
{
|
||||
$this->assertEquals(
|
||||
'de',
|
||||
Text_LanguageDetect_ISO639::nameToCode2('german')
|
||||
);
|
||||
}
|
||||
|
||||
public function testNameToCode2Fail()
|
||||
{
|
||||
$this->assertNull(
|
||||
Text_LanguageDetect_ISO639::nameToCode2('doesnotexist')
|
||||
);
|
||||
}
|
||||
|
||||
public function testNameToCode3()
|
||||
{
|
||||
$this->assertEquals(
|
||||
'fra',
|
||||
Text_LanguageDetect_ISO639::nameToCode3('french')
|
||||
);
|
||||
}
|
||||
|
||||
public function testNameToCode3Fail()
|
||||
{
|
||||
$this->assertNull(
|
||||
Text_LanguageDetect_ISO639::nameToCode3('doesnotexist')
|
||||
);
|
||||
}
|
||||
|
||||
public function testCode2ToName()
|
||||
{
|
||||
$this->assertEquals(
|
||||
'english',
|
||||
Text_LanguageDetect_ISO639::code2ToName('en')
|
||||
);
|
||||