Revert "Update languagedetect library"

This commit is contained in:
Tobias Diekershoff 2017-10-05 11:58:18 +02:00 committed by GitHub
parent c22920edba
commit 071946fa78
101 changed files with 3632 additions and 311 deletions

View file

@ -16,8 +16,7 @@
"ezyang/htmlpurifier": "~4.7.0",
"mobiledetect/mobiledetectlib": "2.8.*",
"league/html-to-markdown": "~4.4.1",
"pear-pear.php.net/Text_Highlighter": "*",
"pear-pear.php.net/Text_LanguageDetect": "*"
"pear-pear.php.net/Text_Highlighter": "*"
},
"repositories": [
{

60
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "7499dcab40af67a3f23036e3a8d9587f",
"content-hash": "802372ddf124ef949e80dd8dc1d38797",
"packages": [
{
"name": "ezyang/htmlpurifier",
@ -116,16 +116,16 @@
},
{
"name": "mobiledetect/mobiledetectlib",
"version": "2.8.26",
"version": "2.8.25",
"source": {
"type": "git",
"url": "https://github.com/serbanghita/Mobile-Detect.git",
"reference": "a0ed86c9d7c04ae27fa6418b55e3beb04dfe3297"
"reference": "f0896b5c7274d1450023b0b376240be902c3251c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/serbanghita/Mobile-Detect/zipball/a0ed86c9d7c04ae27fa6418b55e3beb04dfe3297",
"reference": "a0ed86c9d7c04ae27fa6418b55e3beb04dfe3297",
"url": "https://api.github.com/repos/serbanghita/Mobile-Detect/zipball/f0896b5c7274d1450023b0b376240be902c3251c",
"reference": "f0896b5c7274d1450023b0b376240be902c3251c",
"shasum": ""
},
"require": {
@ -164,14 +164,14 @@
"mobile detector",
"php mobile detect"
],
"time": "2017-08-29T18:23:54+00:00"
"time": "2017-03-29T13:59:30+00:00"
},
{
"name": "pear-pear.php.net/Archive_Tar",
"version": "1.4.3",
"version": "1.4.2",
"dist": {
"type": "file",
"url": "https://pear.php.net/get/Archive_Tar-1.4.3.tgz",
"url": "https://pear.php.net/get/Archive_Tar-1.4.2.tgz",
"reference": null,
"shasum": null
},
@ -179,7 +179,7 @@
"php": ">=5.2.0.0"
},
"replace": {
"pear-pear/archive_tar": "== 1.4.3.0"
"pear-pear/archive_tar": "== 1.4.2.0"
},
"type": "pear-library",
"autoload": {
@ -226,10 +226,10 @@
},
{
"name": "pear-pear.php.net/PEAR",
"version": "1.10.5",
"version": "1.10.4",
"dist": {
"type": "file",
"url": "https://pear.php.net/get/PEAR-1.10.5.tgz",
"url": "https://pear.php.net/get/PEAR-1.10.4.tgz",
"reference": null,
"shasum": null
},
@ -247,7 +247,7 @@
"pear-pear.php.net/pear_frontend_web": "<=0.4.0.0"
},
"replace": {
"pear-pear/pear": "== 1.10.5.0"
"pear-pear/pear": "== 1.10.4.0"
},
"type": "pear-library",
"autoload": {
@ -324,36 +324,6 @@
],
"description": "Text_Highlighter is a package for syntax highlighting.\n\nIt provides a base class provining all the functionality,\nand a descendent classes geneator class.\n\nThe main idea is to simplify creation of subclasses\nimplementing syntax highlighting for particular language.\nSubclasses do not implement any new functioanality,\nthey just provide syntax highlighting rules.\nThe rules sources are in XML format.\n\nTo create a highlighter for a language, there is no need\nto code a new class manually. Simply describe the rules\nin XML file and use Text_Highlighter_Generator to create\na new class."
},
{
"name": "pear-pear.php.net/Text_LanguageDetect",
"version": "1.0.0",
"dist": {
"type": "file",
"url": "https://pear.php.net/get/Text_LanguageDetect-1.0.0.tgz",
"reference": null,
"shasum": null
},
"require": {
"ext-pcre": "*",
"php": ">=5.4.0.0"
},
"replace": {
"pear-pear/text_languagedetect": "== 1.0.0.0"
},
"type": "pear-library",
"autoload": {
"classmap": [
""
]
},
"include-path": [
"/"
],
"license": [
"BSD"
],
"description": "Text_LanguageDetect can identify 52 human languages from text samples and return confidence scores for each."
},
{
"name": "pear-pear.php.net/XML_Parser",
"version": "1.3.7",
@ -386,10 +356,10 @@
},
{
"name": "pear-pear.php.net/XML_Util",
"version": "1.4.3",
"version": "1.4.2",
"dist": {
"type": "file",
"url": "https://pear.php.net/get/XML_Util-1.4.3.tgz",
"url": "https://pear.php.net/get/XML_Util-1.4.2.tgz",
"reference": null,
"shasum": null
},
@ -398,7 +368,7 @@
"php": ">=5.4.0.0"
},
"replace": {
"pear-pear/xml_util": "== 1.4.3.0"
"pear-pear/xml_util": "== 1.4.2.0"
},
"type": "pear-library",
"autoload": {

View file

@ -364,9 +364,13 @@ function item_add_language_opt(&$arr) {
$postopts = "";
}
require_once('library/langdet/Text/LanguageDetect.php');
$naked_body = preg_replace('/\[(.+?)\]/','', $arr['body']);
$LanguageDetect = new Text_LanguageDetect();
$lng = $LanguageDetect->detect($naked_body, 3);
$l = new Text_LanguageDetect();
//$lng = $l->detectConfidence($naked_body);
//$arr['postopts'] = (($lng['language']) ? 'lang=' . $lng['language'] . ';' . $lng['confidence'] : '');
$lng = $l->detect($naked_body, 3);
if (sizeof($lng) > 0) {
if ($postopts != "") $postopts .= '&'; // arbitrary separator, to be reviewed

157
library/langdet/README.rst Normal file
View file

@ -0,0 +1,157 @@
*******************
Text_LanguageDetect
*******************
PHP library to identify human languages from text samples.
Returns confidence scores for each.
Installation
============
PEAR
----
::
$ pear install Text_LanguageDetect
Composer
--------
::
$ composer require pear/text_languagedetect
Usage
=====
Also see the examples in the ``docs/`` directory and
the `official documentation`__.
__ http://pear.php.net/package/Text_LanguageDetect/docs
Language detection
------------------
Simple language detection::
<?php
require_once 'Text/LanguageDetect.php';
$text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
$ld = new Text_LanguageDetect();
$language = $ld->detectSimple($text);
echo $language;
//output: german
Show the three most probable languages with their confidence score::
<?php
require_once 'Text/LanguageDetect.php';
$text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
$ld = new Text_LanguageDetect();
//3 most probable languages
$results = $ld->detect($text, 3);
foreach ($results as $language => $confidence) {
echo $language . ': ' . number_format($confidence, 2) . "\n";
}
//output:
//german: 0.35
//dutch: 0.25
//swedish: 0.20
?>
Language code
-------------
Instead of returning the full language name, ISO 639-1 two-letter and
ISO 639-2 three-letter codes can be returned::
<?php
require_once 'Text/LanguageDetect.php';
$ld = new Text_LanguageDetect();
//will output the ISO 639-1 two-letter language code
// "de"
$ld->setNameMode(2);
echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
//will output the ISO 639-2 three-letter language code
// "deu"
$ld->setNameMode(3);
echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
?>
Supported languages
===================
- albanian
- arabic
- azeri
- bengali
- bulgarian
- cebuano
- croatian
- czech
- danish
- dutch
- english
- estonian
- farsi
- finnish
- french
- german
- hausa
- hawaiian
- hindi
- hungarian
- icelandic
- indonesian
- italian
- kazakh
- kyrgyz
- latin
- latvian
- lithuanian
- macedonian
- mongolian
- nepali
- norwegian
- pashto
- pidgin
- polish
- portuguese
- romanian
- russian
- serbian
- slovak
- slovene
- somali
- spanish
- swahili
- swedish
- tagalog
- turkish
- ukrainian
- urdu
- uzbek
- vietnamese
- welsh
Links
=====
Homepage
http://pear.php.net/package/Text_LanguageDetect
Bug tracker
http://pear.php.net/bugs/search.php?cmd=display&package_name[]=Text_LanguageDetect
Documentation
http://pear.php.net/package/Text_LanguageDetect/docs
Unit test status
https://travis-ci.org/pear/Text_LanguageDetect
.. image:: https://travis-ci.org/pear/Text_LanguageDetect.svg?branch=master
:target: https://travis-ci.org/pear/Text_LanguageDetect

View file

@ -58,7 +58,7 @@ require_once 'Text/LanguageDetect/ISO639.php';
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
* @copyright 2005 Nicholas Pisarro
* @license BSD http://www.opensource.org/licenses/bsd-license.php
* @version Release: 1.0.0
* @version Release: @package_version@
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
class Text_LanguageDetect
@ -90,7 +90,7 @@ class Text_LanguageDetect
*
* @var string
*/
protected $_data_dir = 'D:\Mes Projets\Friendica\friendica\vendor/pear-pear.php.net/Text_LanguageDetect/data';
protected $_data_dir = '@data_dir@';
/**
* The trigram data for comparison
@ -196,7 +196,7 @@ class Text_LanguageDetect
} elseif ($this->_data_dir != '@' . 'data_dir' . '@') {
// if the data dir was set by the PEAR installer, use that
return $this->_data_dir . '/' . $fname;
return $this->_data_dir . '/Text_LanguageDetect/' . $fname;
} else {
// assume this was just unpacked somewhere

View file

@ -27,7 +27,7 @@
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
* @copyright 2006 Nicholas Pisarro
* @license BSD http://www.opensource.org/licenses/bsd-license.php
* @version Release: 1.0.0
* @version Release: @package_version@
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
class Text_LanguageDetect_Parser extends Text_LanguageDetect

View file

@ -0,0 +1,7 @@
<?php
/**
 * Generate the serialized unicode_blocks.dat file shipped with the package.
 *
 * Loads the table returned by unicode_blocks.php (same directory as this
 * script) and writes its serialize()d form next to it as unicode_blocks.dat.
 *
 * The script aborts with an exception instead of silently producing a
 * corrupt data file when the source table cannot be loaded or the target
 * file cannot be written.
 */
$sourceFile = __DIR__ . '/unicode_blocks.php';
$targetFile = __DIR__ . '/unicode_blocks.dat';

// require instead of include: a missing source file must stop the script,
// otherwise the boolean false returned by include would get serialized.
$unicode_blocks = require $sourceFile;
if (!is_array($unicode_blocks)) {
    throw new RuntimeException(
        'Expected ' . $sourceFile . ' to return an array of unicode blocks'
    );
}

if (file_put_contents($targetFile, serialize($unicode_blocks)) === false) {
    throw new RuntimeException('Could not write ' . $targetFile);
}
?>

View file

@ -0,0 +1,874 @@
<?php
/**
 * Table of unicode blocks, returned to the including script.
 *
 * Each entry is a three-element array:
 *   0 => first code point of the block,
 *   1 => last code point of the block (inclusive),
 *   2 => name of the block.
 *
 * Entries are listed in ascending code point order.
 */
return array (
0 =>
array (
0 => 0x0000,
1 => 0x007F,
2 => 'Basic Latin',
),
1 =>
array (
0 => 0x0080,
1 => 0x00FF,
2 => 'Latin-1 Supplement',
),
2 =>
array (
0 => 0x0100,
1 => 0x017F,
2 => 'Latin Extended-A',
),
3 =>
array (
0 => 0x0180,
1 => 0x024F,
2 => 'Latin Extended-B',
),
4 =>
array (
0 => 0x0250,
1 => 0x02AF,
2 => 'IPA Extensions',
),
5 =>
array (
0 => 0x02B0,
1 => 0x02FF,
2 => 'Spacing Modifier Letters',
),
6 =>
array (
0 => 0x0300,
1 => 0x036F,
2 => 'Combining Diacritical Marks',
),
7 =>
array (
0 => 0x0370,
1 => 0x03FF,
2 => 'Greek and Coptic',
),
8 =>
array (
0 => 0x0400,
1 => 0x04FF,
2 => 'Cyrillic',
),
9 =>
array (
0 => 0x0500,
1 => 0x052F,
2 => 'Cyrillic Supplement',
),
10 =>
array (
0 => 0x0530,
1 => 0x058F,
2 => 'Armenian',
),
11 =>
array (
0 => 0x0590,
1 => 0x05FF,
2 => 'Hebrew',
),
12 =>
array (
0 => 0x0600,
1 => 0x06FF,
2 => 'Arabic',
),
13 =>
array (
0 => 0x0700,
1 => 0x074F,
2 => 'Syriac',
),
14 =>
array (
0 => 0x0750,
1 => 0x077F,
2 => 'Arabic Supplement',
),
15 =>
array (
0 => 0x0780,
1 => 0x07BF,
2 => 'Thaana',
),
16 =>
array (
0 => 0x0900,
1 => 0x097F,
2 => 'Devanagari',
),
17 =>
array (
0 => 0x0980,
1 => 0x09FF,
2 => 'Bengali',
),
18 =>
array (
0 => 0x0A00,
1 => 0x0A7F,
2 => 'Gurmukhi',
),
19 =>
array (
0 => 0x0A80,
1 => 0x0AFF,
2 => 'Gujarati',
),
20 =>
array (
0 => 0x0B00,
1 => 0x0B7F,
2 => 'Oriya',
),
21 =>
array (
0 => 0x0B80,
1 => 0x0BFF,
2 => 'Tamil',
),
22 =>
array (
0 => 0x0C00,
1 => 0x0C7F,
2 => 'Telugu',
),
23 =>
array (
0 => 0x0C80,
1 => 0x0CFF,
2 => 'Kannada',
),
24 =>
array (
0 => 0x0D00,
1 => 0x0D7F,
2 => 'Malayalam',
),
25 =>
array (
0 => 0x0D80,
1 => 0x0DFF,
2 => 'Sinhala',
),
26 =>
array (
0 => 0x0E00,
1 => 0x0E7F,
2 => 'Thai',
),
27 =>
array (
0 => 0x0E80,
1 => 0x0EFF,
2 => 'Lao',
),
28 =>
array (
0 => 0x0F00,
1 => 0x0FFF,
2 => 'Tibetan',
),
29 =>
array (
0 => 0x1000,
1 => 0x109F,
2 => 'Myanmar',
),
30 =>
array (
0 => 0x10A0,
1 => 0x10FF,
2 => 'Georgian',
),
31 =>
array (
0 => 0x1100,
1 => 0x11FF,
2 => 'Hangul Jamo',
),
32 =>
array (
0 => 0x1200,
1 => 0x137F,
2 => 'Ethiopic',
),
33 =>
array (
0 => 0x1380,
1 => 0x139F,
2 => 'Ethiopic Supplement',
),
34 =>
array (
0 => 0x13A0,
1 => 0x13FF,
2 => 'Cherokee',
),
35 =>
array (
0 => 0x1400,
1 => 0x167F,
2 => 'Unified Canadian Aboriginal Syllabics',
),
36 =>
array (
0 => 0x1680,
1 => 0x169F,
2 => 'Ogham',
),
37 =>
array (
0 => 0x16A0,
1 => 0x16FF,
2 => 'Runic',
),
38 =>
array (
0 => 0x1700,
1 => 0x171F,
2 => 'Tagalog',
),
39 =>
array (
0 => 0x1720,
1 => 0x173F,
2 => 'Hanunoo',
),
40 =>
array (
0 => 0x1740,
1 => 0x175F,
2 => 'Buhid',
),
41 =>
array (
0 => 0x1760,
1 => 0x177F,
2 => 'Tagbanwa',
),
42 =>
array (
0 => 0x1780,
1 => 0x17FF,
2 => 'Khmer',
),
43 =>
array (
0 => 0x1800,
1 => 0x18AF,
2 => 'Mongolian',
),
44 =>
array (
0 => 0x1900,
1 => 0x194F,
2 => 'Limbu',
),
45 =>
array (
0 => 0x1950,
1 => 0x197F,
2 => 'Tai Le',
),
46 =>
array (
0 => 0x1980,
1 => 0x19DF,
2 => 'New Tai Lue',
),
47 =>
array (
0 => 0x19E0,
1 => 0x19FF,
2 => 'Khmer Symbols',
),
48 =>
array (
0 => 0x1A00,
1 => 0x1A1F,
2 => 'Buginese',
),
49 =>
array (
0 => 0x1D00,
1 => 0x1D7F,
2 => 'Phonetic Extensions',
),
50 =>
array (
0 => 0x1D80,
1 => 0x1DBF,
2 => 'Phonetic Extensions Supplement',
),
51 =>
array (
0 => 0x1DC0,
1 => 0x1DFF,
2 => 'Combining Diacritical Marks Supplement',
),
52 =>
array (
0 => 0x1E00,
1 => 0x1EFF,
2 => 'Latin Extended Additional',
),
53 =>
array (
0 => 0x1F00,
1 => 0x1FFF,
2 => 'Greek Extended',
),
54 =>
array (
0 => 0x2000,
1 => 0x206F,
2 => 'General Punctuation',
),
55 =>
array (
0 => 0x2070,
1 => 0x209F,
2 => 'Superscripts and Subscripts',
),
56 =>
array (
0 => 0x20A0,
1 => 0x20CF,
2 => 'Currency Symbols',
),
57 =>
array (
0 => 0x20D0,
1 => 0x20FF,
2 => 'Combining Diacritical Marks for Symbols',
),
58 =>
array (
0 => 0x2100,
1 => 0x214F,
2 => 'Letterlike Symbols',
),
59 =>
array (
0 => 0x2150,
1 => 0x218F,
2 => 'Number Forms',
),
60 =>
array (
0 => 0x2190,
1 => 0x21FF,
2 => 'Arrows',
),
61 =>
array (
0 => 0x2200,
1 => 0x22FF,
2 => 'Mathematical Operators',
),
62 =>
array (
0 => 0x2300,
1 => 0x23FF,
2 => 'Miscellaneous Technical',
),
63 =>
array (
0 => 0x2400,
1 => 0x243F,
2 => 'Control Pictures',
),
64 =>
array (
0 => 0x2440,
1 => 0x245F,
2 => 'Optical Character Recognition',
),
65 =>
array (
0 => 0x2460,
1 => 0x24FF,
2 => 'Enclosed Alphanumerics',
),
66 =>
array (
0 => 0x2500,
1 => 0x257F,
2 => 'Box Drawing',
),
67 =>
array (
0 => 0x2580,
1 => 0x259F,
2 => 'Block Elements',
),
68 =>
array (
0 => 0x25A0,
1 => 0x25FF,
2 => 'Geometric Shapes',
),
69 =>
array (
0 => 0x2600,
1 => 0x26FF,
2 => 'Miscellaneous Symbols',
),
70 =>
array (
0 => 0x2700,
1 => 0x27BF,
2 => 'Dingbats',
),
71 =>
array (
0 => 0x27C0,
1 => 0x27EF,
2 => 'Miscellaneous Mathematical Symbols-A',
),
72 =>
array (
0 => 0x27F0,
1 => 0x27FF,
2 => 'Supplemental Arrows-A',
),
73 =>
array (
0 => 0x2800,
1 => 0x28FF,
2 => 'Braille Patterns',
),
74 =>
array (
0 => 0x2900,
1 => 0x297F,
2 => 'Supplemental Arrows-B',
),
75 =>
array (
0 => 0x2980,
1 => 0x29FF,
2 => 'Miscellaneous Mathematical Symbols-B',
),
76 =>
array (
0 => 0x2A00,
1 => 0x2AFF,
2 => 'Supplemental Mathematical Operators',
),
77 =>
array (
0 => 0x2B00,
1 => 0x2BFF,
2 => 'Miscellaneous Symbols and Arrows',
),
78 =>
array (
0 => 0x2C00,
1 => 0x2C5F,
2 => 'Glagolitic',
),
79 =>
array (
0 => 0x2C80,
1 => 0x2CFF,
2 => 'Coptic',
),
80 =>
array (
0 => 0x2D00,
1 => 0x2D2F,
2 => 'Georgian Supplement',
),
81 =>
array (
0 => 0x2D30,
1 => 0x2D7F,
2 => 'Tifinagh',
),
82 =>
array (
0 => 0x2D80,
1 => 0x2DDF,
2 => 'Ethiopic Extended',
),
83 =>
array (
0 => 0x2E00,
1 => 0x2E7F,
2 => 'Supplemental Punctuation',
),
84 =>
array (
0 => 0x2E80,
1 => 0x2EFF,
2 => 'CJK Radicals Supplement',
),
85 =>
array (
0 => 0x2F00,
1 => 0x2FDF,
2 => 'Kangxi Radicals',
),
86 =>
array (
0 => 0x2FF0,
1 => 0x2FFF,
2 => 'Ideographic Description Characters',
),
87 =>
array (
0 => 0x3000,
1 => 0x303F,
2 => 'CJK Symbols and Punctuation',
),
88 =>
array (
0 => 0x3040,
1 => 0x309F,
2 => 'Hiragana',
),
89 =>
array (
0 => 0x30A0,
1 => 0x30FF,
2 => 'Katakana',
),
90 =>
array (
0 => 0x3100,
1 => 0x312F,
2 => 'Bopomofo',
),
91 =>
array (
0 => 0x3130,
1 => 0x318F,
2 => 'Hangul Compatibility Jamo',
),
92 =>
array (
0 => 0x3190,
1 => 0x319F,
2 => 'Kanbun',
),
93 =>
array (
0 => 0x31A0,
1 => 0x31BF,
2 => 'Bopomofo Extended',
),
94 =>
array (
0 => 0x31C0,
1 => 0x31EF,
2 => 'CJK Strokes',
),
95 =>
array (
0 => 0x31F0,
1 => 0x31FF,
2 => 'Katakana Phonetic Extensions',
),
96 =>
array (
0 => 0x3200,
1 => 0x32FF,
2 => 'Enclosed CJK Letters and Months',
),
97 =>
array (
0 => 0x3300,
1 => 0x33FF,
2 => 'CJK Compatibility',
),
98 =>
array (
0 => 0x3400,
1 => 0x4DBF,
2 => 'CJK Unified Ideographs Extension A',
),
99 =>
array (
0 => 0x4DC0,
1 => 0x4DFF,
2 => 'Yijing Hexagram Symbols',
),
100 =>
array (
0 => 0x4E00,
1 => 0x9FFF,
2 => 'CJK Unified Ideographs',
),
101 =>
array (
0 => 0xA000,
1 => 0xA48F,
2 => 'Yi Syllables',
),
102 =>
array (
0 => 0xA490,
1 => 0xA4CF,
2 => 'Yi Radicals',
),
103 =>
array (
0 => 0xA700,
1 => 0xA71F,
2 => 'Modifier Tone Letters',
),
104 =>
array (
0 => 0xA800,
1 => 0xA82F,
2 => 'Syloti Nagri',
),
105 =>
array (
0 => 0xAC00,
1 => 0xD7AF,
2 => 'Hangul Syllables',
),
106 =>
array (
0 => 0xD800,
1 => 0xDB7F,
2 => 'High Surrogates',
),
107 =>
array (
0 => 0xDB80,
1 => 0xDBFF,
2 => 'High Private Use Surrogates',
),
108 =>
array (
0 => 0xDC00,
1 => 0xDFFF,
2 => 'Low Surrogates',
),
109 =>
array (
0 => 0xE000,
1 => 0xF8FF,
2 => 'Private Use Area',
),
110 =>
array (
0 => 0xF900,
1 => 0xFAFF,
2 => 'CJK Compatibility Ideographs',
),
111 =>
array (
0 => 0xFB00,
1 => 0xFB4F,
2 => 'Alphabetic Presentation Forms',
),
112 =>
array (
0 => 0xFB50,
1 => 0xFDFF,
2 => 'Arabic Presentation Forms-A',
),
113 =>
array (
0 => 0xFE00,
1 => 0xFE0F,
2 => 'Variation Selectors',
),
114 =>
array (
0 => 0xFE10,
1 => 0xFE1F,
2 => 'Vertical Forms',
),
115 =>
array (
0 => 0xFE20,
1 => 0xFE2F,
2 => 'Combining Half Marks',
),
116 =>
array (
0 => 0xFE30,
1 => 0xFE4F,
2 => 'CJK Compatibility Forms',
),
117 =>
array (
0 => 0xFE50,
1 => 0xFE6F,
2 => 'Small Form Variants',
),
118 =>
array (
0 => 0xFE70,
1 => 0xFEFF,
2 => 'Arabic Presentation Forms-B',
),
119 =>
array (
0 => 0xFF00,
1 => 0xFFEF,
2 => 'Halfwidth and Fullwidth Forms',
),
120 =>
array (
0 => 0xFFF0,
1 => 0xFFFF,
2 => 'Specials',
),
121 =>
array (
0 => 0x10000,
1 => 0x1007F,
2 => 'Linear B Syllabary',
),
122 =>
array (
0 => 0x10080,
1 => 0x100FF,
2 => 'Linear B Ideograms',
),
123 =>
array (
0 => 0x10100,
1 => 0x1013F,
2 => 'Aegean Numbers',
),
124 =>
array (
0 => 0x10140,
1 => 0x1018F,
2 => 'Ancient Greek Numbers',
),
125 =>
array (
0 => 0x10300,
1 => 0x1032F,
2 => 'Old Italic',
),
126 =>
array (
0 => 0x10330,
1 => 0x1034F,
2 => 'Gothic',
),
127 =>
array (
0 => 0x10380,
1 => 0x1039F,
2 => 'Ugaritic',
),
128 =>
array (
0 => 0x103A0,
1 => 0x103DF,
2 => 'Old Persian',
),
129 =>
array (
0 => 0x10400,
1 => 0x1044F,
2 => 'Deseret',
),
130 =>
array (
0 => 0x10450,
1 => 0x1047F,
2 => 'Shavian',
),
131 =>
array (
0 => 0x10480,
1 => 0x104AF,
2 => 'Osmanya',
),
132 =>
array (
0 => 0x10800,
1 => 0x1083F,
2 => 'Cypriot Syllabary',
),
133 =>
array (
0 => 0x10A00,
1 => 0x10A5F,
2 => 'Kharoshthi',
),
134 =>
array (
0 => 0x1D000,
1 => 0x1D0FF,
2 => 'Byzantine Musical Symbols',
),
135 =>
array (
0 => 0x1D100,
1 => 0x1D1FF,
2 => 'Musical Symbols',
),
136 =>
array (
0 => 0x1D200,
1 => 0x1D24F,
2 => 'Ancient Greek Musical Notation',
),
137 =>
array (
0 => 0x1D300,
1 => 0x1D35F,
2 => 'Tai Xuan Jing Symbols',
),
138 =>
array (
0 => 0x1D400,
1 => 0x1D7FF,
2 => 'Mathematical Alphanumeric Symbols',
),
139 =>
array (
0 => 0x20000,
1 => 0x2A6DF,
2 => 'CJK Unified Ideographs Extension B',
),
140 =>
array (
0 => 0x2F800,
1 => 0x2FA1F,
2 => 'CJK Compatibility Ideographs Supplement',
),
141 =>
array (
0 => 0xE0000,
1 => 0xE007F,
2 => 'Tags',
),
142 =>
array (
0 => 0xE0100,
1 => 0xE01EF,
2 => 'Variation Selectors Supplement',
),
143 =>
array (
0 => 0xF0000,
1 => 0xFFFFF,
2 => 'Supplementary Private Use Area-A',
),
144 =>
array (
0 => 0x100000,
1 => 0x10FFFF,
2 => 'Supplementary Private Use Area-B',
),
);
?>

View file

@ -0,0 +1,18 @@
<?php
require_once 'Text/LanguageDetect.php';

$sample   = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
$detector = new Text_LanguageDetect();

// Ask for the 3 most probable languages; the result maps each
// language name to its confidence score.
$candidates = $detector->detect($sample, 3);
foreach ($candidates as $name => $score) {
    printf("%s: %s\n", $name, number_format($score, 2));
}

// Expected output:
//   german: 0.35
//   dutch: 0.25
//   swedish: 0.20
?>

View file

@ -0,0 +1,15 @@
<?php
/**
 * How to handle errors
 *
 * Every failure inside the detection call is reported as a
 * Text_LanguageDetect_Exception, so a single catch block is enough.
 */
require_once 'Text/LanguageDetect.php';
require_once 'Text/LanguageDetect/Exception.php';

try {
    $ld = new Text_LanguageDetect();
    $lang = $ld->detectSimple('Das ist ein kleiner Text');
    echo "Language is: $lang\n";
} catch (Text_LanguageDetect_Exception $e) {
    // Print only the message: concatenating the exception object itself
    // would dump its full string form (class, file, stack trace) instead.
    echo 'An error occurred! Message: ' . $e->getMessage() . "\n";
}
?>

View file

@ -0,0 +1,35 @@
<?php
/**
 * example usage (CLI)
 *
 * Prints the supported languages, then reads text line by line from
 * standard input and prints, for every line, the 4 most probable
 * languages and the unicode blocks found in it.
 *
 * @package Text_LanguageDetect
 * @version CVS: $Id$
 */
require_once 'Text/LanguageDetect.php';

$l = new Text_LanguageDetect;
$stdin = fopen('php://stdin', 'r');

echo "Supported languages:\n";
$langs = $l->getLanguages();
sort($langs);
echo join(', ', $langs);
echo "\ntotal ", count($langs), "\n\n";

// Compare against false explicitly: a final line consisting of "0" without
// a trailing newline is falsy and would otherwise end the loop early.
while (($line = fgets($stdin)) !== false) {
    $result = $l->detect($line, 4);
    print_r($result);

    $blocks = $l->detectUnicodeBlocks($line, true);
    print_r($blocks);
}

fclose($stdin);
unset($l);
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>

View file

@ -0,0 +1,72 @@
<?php
/**
 * example usage (web)
 *
 * Renders a form with a text area; when text is submitted in the "q"
 * request parameter, shows the detected language and the unicode blocks
 * present in it.
 *
 * @package Text_LanguageDetect
 * @version CVS: $Id$
 */
// browsers will encode multi-byte characters wrong unless they think the page is utf8-encoded
header('Content-type: text/html; charset=utf-8', true);
require_once 'Text/LanguageDetect.php';

$l = new Text_LanguageDetect;

// Always define $q so the template below never reads an unset variable.
$q = '';
if (isset($_REQUEST['q']) && is_string($_REQUEST['q'])) {
    // No stripslashes() here: magic quotes no longer exist in the PHP
    // versions this package supports, so stripping would eat real
    // backslashes from the submitted text.
    $q = $_REQUEST['q'];
}
?>
<html>
<head>
<title>Text_LanguageDetect demonstration</title>
</head>
<body>
<h2>Text_LanguageDetect</h2>
<?php
echo "<small>Supported languages:\n";
$langs = $l->getLanguages();
sort($langs);
foreach ($langs as $lang) {
    echo ucfirst($lang), ', ';
}
echo '<br />total ', count($langs), '</small><br /><br />';
?>
<form method="post">
Enter text to identify language (at least a couple of sentences):<br />
<textarea name="q" wrap="virtual" cols="80" rows="8"><?php
// User-supplied text: escape it so it cannot break out of the textarea.
echo htmlspecialchars($q, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
?></textarea>
<br />
<input type="submit" value="Submit" />
</form>
<?php
if ($q !== '') {
    $len = $l->utf8strlen($q);
    if ($len < 20) { // this value picked somewhat arbitrarily
        echo "Warning: string not very long ($len chars)<br />\n";
    }

    $result = $l->detectConfidence($q);
    if (empty($result)) {
        echo "Text_LanguageDetect cannot identify this piece of text. <br /><br />\n";
    } else {
        echo "Text_LanguageDetect thinks this text is written in <b>{$result['language']}</b> ({$result['similarity']}, {$result['confidence']})<br /><br />\n";
    }

    $result = $l->detectUnicodeBlocks($q, false);
    if (!empty($result)) {
        arsort($result);
        echo "Unicode blocks present: ", join(', ', array_keys($result)), "\n<br /><br />";
    }
}
unset($l);
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>
</body></html>

View file

@ -0,0 +1,19 @@
<?php
/**
 * Demonstrates how to use ISO language codes.
 *
 * The "name mode" changes the way languages are accepted and returned.
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$sample   = 'Das ist ein kleiner Text';

// Name mode 2 yields the ISO 639-1 two-letter code ("de"),
// name mode 3 the ISO 639-2 three-letter code ("deu").
foreach (array(2, 3) as $nameMode) {
    $detector->setNameMode($nameMode);
    echo $detector->detectSimple($sample) . "\n";
}
?>

View file

@ -0,0 +1,11 @@
<?php
/**
 * List all supported languages, one per line
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$names    = $detector->getLanguages();
foreach ($names as $name) {
    echo "{$name}\n";
}
?>

View file

@ -0,0 +1,10 @@
<?php
require_once 'Text/LanguageDetect.php';

$sample   = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
$detector = new Text_LanguageDetect();

// Dump the single most probable language for the sample.
// Expected language: german
var_dump($detector->detectSimple($sample));
?>

View file

@ -0,0 +1,42 @@
<?php
/**
 * Helper that enables access to private and protected methods and properties.
 *
 * The proxy wraps one object. Instance method calls, property reads and
 * property writes made on the proxy are forwarded to the wrapped object
 * through reflection, regardless of the member's visibility.
 */
class PrivProxy
{
    /**
     * The wrapped object all calls are forwarded to.
     *
     * @var object
     */
    private $obj;

    /**
     * @param object $obj Object whose non-public members should be reachable
     */
    public function __construct($obj)
    {
        $this->obj = $obj;
    }

    /**
     * Invoke a (possibly private/protected) method on the wrapped object.
     *
     * @param string $method    Method name
     * @param array  $arguments Arguments passed on to the method
     *
     * @return mixed Whatever the invoked method returns
     *
     * @throws ReflectionException When the method does not exist
     */
    public function __call($method, $arguments)
    {
        $rm = new ReflectionMethod($this->obj, $method);
        // Only needed before PHP 8.1; later versions ignore visibility
        // for reflection calls on their own.
        if (PHP_VERSION_ID < 80100) {
            $rm->setAccessible(true);
        }
        return $rm->invokeArgs($this->obj, $arguments);
    }

    /**
     * Static calls cannot be forwarded: the proxy wraps an instance, and a
     * static context has no wrapped object to resolve the method on.
     *
     * @param string $method    Method name
     * @param array  $arguments Arguments of the attempted call
     *
     * @return void
     *
     * @throws BadMethodCallException Always
     */
    public static function __callStatic($method, $arguments)
    {
        throw new BadMethodCallException(
            'PrivProxy cannot forward the static call to "' . $method
            . '"; call the method on a PrivProxy instance instead'
        );
    }

    /**
     * Write a (possibly private/protected) property of the wrapped object.
     *
     * @param string $var   Property name
     * @param mixed  $value New value
     *
     * @return void
     *
     * @throws ReflectionException When the property does not exist
     */
    public function __set($var, $value)
    {
        $rp = new ReflectionProperty($this->obj, $var);
        if (PHP_VERSION_ID < 80100) {
            $rp->setAccessible(true);
        }
        $rp->setValue($this->obj, $value);
    }

    /**
     * Read a (possibly private/protected) property of the wrapped object.
     *
     * @param string $var Property name
     *
     * @return mixed Current value of the property
     *
     * @throws ReflectionException When the property does not exist
     */
    public function __get($var)
    {
        $rp = new ReflectionProperty($this->obj, $var);
        if (PHP_VERSION_ID < 80100) {
            $rp->setAccessible(true);
        }
        return $rp->getValue($this->obj);
    }
}
?>

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,72 @@
<?php
set_include_path(
__DIR__ . '/../' . PATH_SEPARATOR . get_include_path()
);
require_once 'Text/LanguageDetect/ISO639.php';
class Text_LanguageDetect_ISO639Test extends PHPUnit_Framework_TestCase
{
/**
 * The language name "german" is translated to the code "de".
 */
public function testNameToCode2()
{
    $code = Text_LanguageDetect_ISO639::nameToCode2('german');

    $this->assertEquals('de', $code);
}
/**
 * An unknown language name yields null from nameToCode2().
 */
public function testNameToCode2Fail()
{
    $code = Text_LanguageDetect_ISO639::nameToCode2('doesnotexist');

    $this->assertNull($code);
}
/**
 * The language name "french" is translated to the code "fra".
 */
public function testNameToCode3()
{
    $code = Text_LanguageDetect_ISO639::nameToCode3('french');

    $this->assertEquals('fra', $code);
}
/**
 * An unknown language name yields null from nameToCode3().
 */
public function testNameToCode3Fail()
{
    $code = Text_LanguageDetect_ISO639::nameToCode3('doesnotexist');

    $this->assertNull($code);
}
public function testCode2ToName()
{
$this->assertEquals(
'english',
Text_LanguageDetect_ISO639::code2ToName('en')
);