Merge pull request #4881 from MrPetovan/task/4867-guess-language-from-plaintext
Guess language from plaintext
This commit is contained in:
commit
37e5272f92
4 changed files with 203 additions and 159 deletions
|
@ -23,6 +23,12 @@ function babel_content()
|
||||||
'content' => visible_lf($bbcode)
|
'content' => visible_lf($bbcode)
|
||||||
];
|
];
|
||||||
|
|
||||||
|
$plain = Text\BBCode::toPlaintext($bbcode, false);
|
||||||
|
$results[] = [
|
||||||
|
'title' => L10n::t('BBCode::toPlaintext'),
|
||||||
|
'content' => visible_lf($plain)
|
||||||
|
];
|
||||||
|
|
||||||
$html = Text\BBCode::convert($bbcode);
|
$html = Text\BBCode::convert($bbcode);
|
||||||
$results[] = [
|
$results[] = [
|
||||||
'title' => L10n::t("BBCode::convert \x28raw HTML\x29"),
|
'title' => L10n::t("BBCode::convert \x28raw HTML\x29"),
|
||||||
|
|
|
@ -343,159 +343,20 @@ class BBCode extends BaseObject
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Convert a message into plaintext for connectors to other networks
|
* @brief Converts a BBCode text into plaintext
|
||||||
*
|
*
|
||||||
* @param array $b The message array that is about to be posted
|
* @param bool $keep_urls Whether to keep URLs in the resulting plaintext
|
||||||
* @param int $limit The maximum number of characters when posting to that network
|
|
||||||
* @param bool $includedlinks Has an attached link to be included into the message?
|
|
||||||
* @param int $htmlmode This triggers the behaviour of the bbcode conversion
|
|
||||||
* @param string $target_network Name of the network where the post should go to.
|
|
||||||
*
|
*
|
||||||
* @return string The converted message
|
* @return string
|
||||||
*/
|
*/
|
||||||
public static function toPlaintext($b, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = "")
|
public static function toPlaintext($text, $keep_urls = true)
|
||||||
{
|
{
|
||||||
// Remove the hash tags
|
$naked_text = preg_replace('/\[(.+?)\]/','', $text);
|
||||||
$URLSearchString = "^\[\]";
|
if (!$keep_urls) {
|
||||||
$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $b["body"]);
|
$naked_text = preg_replace('#https?\://[^\s<]+[^\s\.\)]#i', '', $naked_text);
|
||||||
|
|
||||||
// Add an URL element if the text contains a raw link
|
|
||||||
$body = preg_replace("/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism", '$1[url]$2[/url]', $body);
|
|
||||||
|
|
||||||
// Remove the abstract
|
|
||||||
$body = self::stripAbstract($body);
|
|
||||||
|
|
||||||
// At first look at data that is attached via "type-..." stuff
|
|
||||||
// This will hopefully replaced with a dedicated bbcode later
|
|
||||||
//$post = self::getAttachedData($b["body"]);
|
|
||||||
$post = self::getAttachedData($body, $b);
|
|
||||||
|
|
||||||
if (($b["title"] != "") && ($post["text"] != "")) {
|
|
||||||
$post["text"] = trim($b["title"]."\n\n".$post["text"]);
|
|
||||||
} elseif ($b["title"] != "") {
|
|
||||||
$post["text"] = trim($b["title"]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$abstract = "";
|
return $naked_text;
|
||||||
|
|
||||||
// Fetch the abstract from the given target network
|
|
||||||
if ($target_network != "") {
|
|
||||||
$default_abstract = self::getAbstract($b["body"]);
|
|
||||||
$abstract = self::getAbstract($b["body"], $target_network);
|
|
||||||
|
|
||||||
// If we post to a network with no limit we only fetch
|
|
||||||
// an abstract exactly for this network
|
|
||||||
if (($limit == 0) && ($abstract == $default_abstract)) {
|
|
||||||
$abstract = "";
|
|
||||||
}
|
|
||||||
} else {// Try to guess the correct target network
|
|
||||||
switch ($htmlmode) {
|
|
||||||
case 8:
|
|
||||||
$abstract = self::getAbstract($b["body"], NETWORK_TWITTER);
|
|
||||||
break;
|
|
||||||
case 7:
|
|
||||||
$abstract = self::getAbstract($b["body"], NETWORK_STATUSNET);
|
|
||||||
break;
|
|
||||||
case 6:
|
|
||||||
$abstract = self::getAbstract($b["body"], NETWORK_APPNET);
|
|
||||||
break;
|
|
||||||
default: // We don't know the exact target.
|
|
||||||
// We fetch an abstract since there is a posting limit.
|
|
||||||
if ($limit > 0) {
|
|
||||||
$abstract = self::getAbstract($b["body"]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($abstract != "") {
|
|
||||||
$post["text"] = $abstract;
|
|
||||||
|
|
||||||
if ($post["type"] == "text") {
|
|
||||||
$post["type"] = "link";
|
|
||||||
$post["url"] = $b["plink"];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$html = self::convert($post["text"].$post["after"], false, $htmlmode);
|
|
||||||
$msg = HTML::toPlaintext($html, 0, true);
|
|
||||||
$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
|
|
||||||
|
|
||||||
$link = "";
|
|
||||||
if ($includedlinks) {
|
|
||||||
if ($post["type"] == "link") {
|
|
||||||
$link = $post["url"];
|
|
||||||
} elseif ($post["type"] == "text") {
|
|
||||||
$link = $post["url"];
|
|
||||||
} elseif ($post["type"] == "video") {
|
|
||||||
$link = $post["url"];
|
|
||||||
} elseif ($post["type"] == "photo") {
|
|
||||||
$link = $post["image"];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (($msg == "") && isset($post["title"])) {
|
|
||||||
$msg = trim($post["title"]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (($msg == "") && isset($post["description"])) {
|
|
||||||
$msg = trim($post["description"]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the link is already contained in the post, then it neeedn't to be added again
|
|
||||||
// But: if the link is beyond the limit, then it has to be added.
|
|
||||||
if (($link != "") && strstr($msg, $link)) {
|
|
||||||
$pos = strpos($msg, $link);
|
|
||||||
|
|
||||||
// Will the text be shortened in the link?
|
|
||||||
// Or is the link the last item in the post?
|
|
||||||
if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
|
|
||||||
$msg = trim(str_replace($link, "", $msg));
|
|
||||||
} elseif (($limit == 0) || ($pos < $limit)) {
|
|
||||||
// The limit has to be increased since it will be shortened - but not now
|
|
||||||
// Only do it with Twitter (htmlmode = 8)
|
|
||||||
if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
|
|
||||||
$limit = $limit - 23 + strlen($link);
|
|
||||||
}
|
|
||||||
|
|
||||||
$link = "";
|
|
||||||
|
|
||||||
if ($post["type"] == "text") {
|
|
||||||
unset($post["url"]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($limit > 0) {
|
|
||||||
// Reduce multiple spaces
|
|
||||||
// When posted to a network with limited space, we try to gain space where possible
|
|
||||||
while (strpos($msg, " ") !== false) {
|
|
||||||
$msg = str_replace(" ", " ", $msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Twitter is using its own limiter, so we always assume that shortened links will have this length
|
|
||||||
if (iconv_strlen($link, "UTF-8") > 0) {
|
|
||||||
$limit = $limit - 23;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (iconv_strlen($msg, "UTF-8") > $limit) {
|
|
||||||
if (($post["type"] == "text") && isset($post["url"])) {
|
|
||||||
$post["url"] = $b["plink"];
|
|
||||||
} elseif (!isset($post["url"])) {
|
|
||||||
$limit = $limit - 23;
|
|
||||||
$post["url"] = $b["plink"];
|
|
||||||
// Which purpose has this line? It is now uncommented, but left as a reminder
|
|
||||||
//} elseif (strpos($b["body"], "[share") !== false) {
|
|
||||||
// $post["url"] = $b["plink"];
|
|
||||||
} elseif (PConfig::get($b["uid"], "system", "no_intelligent_shortening")) {
|
|
||||||
$post["url"] = $b["plink"];
|
|
||||||
}
|
|
||||||
$msg = Plaintext::shorten($msg, $limit);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$post["text"] = trim($msg);
|
|
||||||
|
|
||||||
return($post);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false)
|
public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false)
|
||||||
|
@ -1947,7 +1808,7 @@ class BBCode extends BaseObject
|
||||||
* @param string $addon The addon for which the abstract is meant for
|
* @param string $addon The addon for which the abstract is meant for
|
||||||
* @return string The abstract
|
* @return string The abstract
|
||||||
*/
|
*/
|
||||||
private static function getAbstract($text, $addon = "")
|
public static function getAbstract($text, $addon = "")
|
||||||
{
|
{
|
||||||
$abstract = "";
|
$abstract = "";
|
||||||
$abstracts = [];
|
$abstracts = [];
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
namespace Friendica\Model;
|
namespace Friendica\Model;
|
||||||
|
|
||||||
use Friendica\BaseObject;
|
use Friendica\BaseObject;
|
||||||
|
use Friendica\Content\Text;
|
||||||
use Friendica\Core\Addon;
|
use Friendica\Core\Addon;
|
||||||
use Friendica\Core\Config;
|
use Friendica\Core\Config;
|
||||||
use Friendica\Core\L10n;
|
use Friendica\Core\L10n;
|
||||||
|
@ -977,35 +978,35 @@ class Item extends BaseObject
|
||||||
* if possible and not already present.
|
* if possible and not already present.
|
||||||
* Expects "body" element to exist in $arr.
|
* Expects "body" element to exist in $arr.
|
||||||
*/
|
*/
|
||||||
private static function addLanguageInPostopts(&$arr)
|
private static function addLanguageInPostopts(&$item)
|
||||||
{
|
{
|
||||||
if (x($arr, 'postopts')) {
|
if (!empty($item['postopts'])) {
|
||||||
if (strstr($arr['postopts'], 'lang=')) {
|
if (strstr($item['postopts'], 'lang=')) {
|
||||||
// do not override
|
// do not override
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
$postopts = $arr['postopts'];
|
$postopts = $item['postopts'];
|
||||||
} else {
|
} else {
|
||||||
$postopts = "";
|
$postopts = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
$naked_body = preg_replace('/\[(.+?)\]/','', $arr['body']);
|
$naked_body = Text\BBCode::toPlaintext($item['body'], false);
|
||||||
$l = new Text_LanguageDetect();
|
|
||||||
$lng = $l->detect($naked_body, 3);
|
|
||||||
|
|
||||||
if (sizeof($lng) > 0) {
|
$languages = (new Text_LanguageDetect())->detect($naked_body, 3);
|
||||||
if ($postopts != "") {
|
|
||||||
|
if (sizeof($languages) > 0) {
|
||||||
|
if ($postopts != '') {
|
||||||
$postopts .= '&'; // arbitrary separator, to be reviewed
|
$postopts .= '&'; // arbitrary separator, to be reviewed
|
||||||
}
|
}
|
||||||
|
|
||||||
$postopts .= 'lang=';
|
$postopts .= 'lang=';
|
||||||
$sep = "";
|
$sep = "";
|
||||||
|
|
||||||
foreach ($lng as $language => $score) {
|
foreach ($languages as $language => $score) {
|
||||||
$postopts .= $sep . $language . ";" . $score;
|
$postopts .= $sep . $language . ";" . $score;
|
||||||
$sep = ':';
|
$sep = ':';
|
||||||
}
|
}
|
||||||
$arr['postopts'] = $postopts;
|
$item['postopts'] = $postopts;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
176
src/Model/ItemContent.php
Normal file
176
src/Model/ItemContent.php
Normal file
|
@ -0,0 +1,176 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file src/Model/ItemContent.php
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Friendica\Model;
|
||||||
|
|
||||||
|
use Friendica\BaseObject;
|
||||||
|
use Friendica\Content\Text;
|
||||||
|
use Friendica\Core\PConfig;
|
||||||
|
|
||||||
|
require_once 'boot.php';
|
||||||
|
require_once 'include/items.php';
|
||||||
|
require_once 'include/text.php';
|
||||||
|
|
||||||
|
class ItemContent extends BaseObject
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* @brief Convert a message into plaintext for connectors to other networks
|
||||||
|
*
|
||||||
|
* @param array $item The message array that is about to be posted
|
||||||
|
* @param int $limit The maximum number of characters when posting to that network
|
||||||
|
* @param bool $includedlinks Has an attached link to be included into the message?
|
||||||
|
* @param int $htmlmode This controls the behavior of the BBCode conversion
|
||||||
|
* @param string $target_network Name of the network where the post should go to.
|
||||||
|
*
|
||||||
|
* @see \Friendica\Content\Text\BBCode::getAttachedData
|
||||||
|
*
|
||||||
|
* @return array Same array structure than \Friendica\Content\Text\BBCode::getAttachedData
|
||||||
|
*/
|
||||||
|
public static function getPlaintextPost($item, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = '')
|
||||||
|
{
|
||||||
|
// Remove hashtags
|
||||||
|
$URLSearchString = '^\[\]';
|
||||||
|
$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $item['body']);
|
||||||
|
|
||||||
|
// Add an URL element if the text contains a raw link
|
||||||
|
$body = preg_replace('/([^\]\=\'"]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism',
|
||||||
|
'$1[url]$2[/url]', $body);
|
||||||
|
|
||||||
|
// Remove the abstract
|
||||||
|
$body = Text\BBCode::stripAbstract($body);
|
||||||
|
|
||||||
|
// At first look at data that is attached via "type-..." stuff
|
||||||
|
// This will hopefully replaced with a dedicated bbcode later
|
||||||
|
//$post = self::getAttachedData($b['body']);
|
||||||
|
$post = Text\BBCode::getAttachedData($body, $item);
|
||||||
|
|
||||||
|
if (($item['title'] != '') && ($post['text'] != '')) {
|
||||||
|
$post['text'] = trim($item['title'] . "\n\n" . $post['text']);
|
||||||
|
} elseif ($item['title'] != '') {
|
||||||
|
$post['text'] = trim($item['title']);
|
||||||
|
}
|
||||||
|
|
||||||
|
$abstract = '';
|
||||||
|
|
||||||
|
// Fetch the abstract from the given target network
|
||||||
|
if ($target_network != '') {
|
||||||
|
$default_abstract = Text\BBCode::getAbstract($item['body']);
|
||||||
|
$abstract = Text\BBCode::getAbstract($item['body'], $target_network);
|
||||||
|
|
||||||
|
// If we post to a network with no limit we only fetch
|
||||||
|
// an abstract exactly for this network
|
||||||
|
if (($limit == 0) && ($abstract == $default_abstract)) {
|
||||||
|
$abstract = '';
|
||||||
|
}
|
||||||
|
} else {// Try to guess the correct target network
|
||||||
|
switch ($htmlmode) {
|
||||||
|
case 8:
|
||||||
|
$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_TWITTER);
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_STATUSNET);
|
||||||
|
break;
|
||||||
|
case 6:
|
||||||
|
$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_APPNET);
|
||||||
|
break;
|
||||||
|
default: // We don't know the exact target.
|
||||||
|
// We fetch an abstract since there is a posting limit.
|
||||||
|
if ($limit > 0) {
|
||||||
|
$abstract = Text\BBCode::getAbstract($item['body']);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($abstract != '') {
|
||||||
|
$post['text'] = $abstract;
|
||||||
|
|
||||||
|
if ($post['type'] == 'text') {
|
||||||
|
$post['type'] = 'link';
|
||||||
|
$post['url'] = $item['plink'];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$html = Text\BBCode::convert($post['text'] . $post['after'], false, $htmlmode);
|
||||||
|
$msg = Text\HTML::toPlaintext($html, 0, true);
|
||||||
|
$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
|
||||||
|
|
||||||
|
$link = '';
|
||||||
|
if ($includedlinks) {
|
||||||
|
if ($post['type'] == 'link') {
|
||||||
|
$link = $post['url'];
|
||||||
|
} elseif ($post['type'] == 'text') {
|
||||||
|
$link = $post['url'];
|
||||||
|
} elseif ($post['type'] == 'video') {
|
||||||
|
$link = $post['url'];
|
||||||
|
} elseif ($post['type'] == 'photo') {
|
||||||
|
$link = $post['image'];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (($msg == '') && isset($post['title'])) {
|
||||||
|
$msg = trim($post['title']);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (($msg == '') && isset($post['description'])) {
|
||||||
|
$msg = trim($post['description']);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the link is already contained in the post, then it neeedn't to be added again
|
||||||
|
// But: if the link is beyond the limit, then it has to be added.
|
||||||
|
if (($link != '') && strstr($msg, $link)) {
|
||||||
|
$pos = strpos($msg, $link);
|
||||||
|
|
||||||
|
// Will the text be shortened in the link?
|
||||||
|
// Or is the link the last item in the post?
|
||||||
|
if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
|
||||||
|
$msg = trim(str_replace($link, '', $msg));
|
||||||
|
} elseif (($limit == 0) || ($pos < $limit)) {
|
||||||
|
// The limit has to be increased since it will be shortened - but not now
|
||||||
|
// Only do it with Twitter (htmlmode = 8)
|
||||||
|
if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
|
||||||
|
$limit = $limit - 23 + strlen($link);
|
||||||
|
}
|
||||||
|
|
||||||
|
$link = '';
|
||||||
|
|
||||||
|
if ($post['type'] == 'text') {
|
||||||
|
unset($post['url']);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($limit > 0) {
|
||||||
|
// Reduce multiple spaces
|
||||||
|
// When posted to a network with limited space, we try to gain space where possible
|
||||||
|
while (strpos($msg, ' ') !== false) {
|
||||||
|
$msg = str_replace(' ', ' ', $msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Twitter is using its own limiter, so we always assume that shortened links will have this length
|
||||||
|
if (iconv_strlen($link, 'UTF-8') > 0) {
|
||||||
|
$limit = $limit - 23;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (iconv_strlen($msg, 'UTF-8') > $limit) {
|
||||||
|
if (($post['type'] == 'text') && isset($post['url'])) {
|
||||||
|
$post['url'] = $item['plink'];
|
||||||
|
} elseif (!isset($post['url'])) {
|
||||||
|
$limit = $limit - 23;
|
||||||
|
$post['url'] = $item['plink'];
|
||||||
|
} elseif (strpos($item['body'], '[share') !== false) {
|
||||||
|
$post['url'] = $item['plink'];
|
||||||
|
} elseif (PConfig::get($item['uid'], 'system', 'no_intelligent_shortening')) {
|
||||||
|
$post['url'] = $item['plink'];
|
||||||
|
}
|
||||||
|
$msg = Text\Plaintext::shorten($msg, $limit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$post['text'] = trim($msg);
|
||||||
|
|
||||||
|
return $post;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue