Merge pull request #4881 from MrPetovan/task/4867-guess-language-from-plaintext

Guess language from plaintext
This commit is contained in:
Michael Vogel 2018-04-23 00:26:05 +02:00 committed by GitHub
commit 37e5272f92
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 203 additions and 159 deletions

View file

@ -23,6 +23,12 @@ function babel_content()
'content' => visible_lf($bbcode) 'content' => visible_lf($bbcode)
]; ];
$plain = Text\BBCode::toPlaintext($bbcode, false);
$results[] = [
'title' => L10n::t('BBCode::toPlaintext'),
'content' => visible_lf($plain)
];
$html = Text\BBCode::convert($bbcode); $html = Text\BBCode::convert($bbcode);
$results[] = [ $results[] = [
'title' => L10n::t("BBCode::convert \x28raw HTML\x29"), 'title' => L10n::t("BBCode::convert \x28raw HTML\x29"),

View file

@ -343,159 +343,20 @@ class BBCode extends BaseObject
} }
/** /**
* @brief Convert a message into plaintext for connectors to other networks * @brief Converts a BBCode text into plaintext
* *
* @param array $b The message array that is about to be posted * @param bool $keep_urls Whether to keep URLs in the resulting plaintext
* @param int $limit The maximum number of characters when posting to that network
* @param bool $includedlinks Has an attached link to be included into the message?
* @param int $htmlmode This triggers the behaviour of the bbcode conversion
* @param string $target_network Name of the network where the post should go to.
* *
* @return string The converted message * @return string
*/ */
public static function toPlaintext($b, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = "") public static function toPlaintext($text, $keep_urls = true)
{ {
// Remove the hash tags $naked_text = preg_replace('/\[(.+?)\]/','', $text);
$URLSearchString = "^\[\]"; if (!$keep_urls) {
$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $b["body"]); $naked_text = preg_replace('#https?\://[^\s<]+[^\s\.\)]#i', '', $naked_text);
// Add an URL element if the text contains a raw link
$body = preg_replace("/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism", '$1[url]$2[/url]', $body);
// Remove the abstract
$body = self::stripAbstract($body);
// At first look at data that is attached via "type-..." stuff
// This will hopefully replaced with a dedicated bbcode later
//$post = self::getAttachedData($b["body"]);
$post = self::getAttachedData($body, $b);
if (($b["title"] != "") && ($post["text"] != "")) {
$post["text"] = trim($b["title"]."\n\n".$post["text"]);
} elseif ($b["title"] != "") {
$post["text"] = trim($b["title"]);
} }
$abstract = ""; return $naked_text;
// Fetch the abstract from the given target network
if ($target_network != "") {
$default_abstract = self::getAbstract($b["body"]);
$abstract = self::getAbstract($b["body"], $target_network);
// If we post to a network with no limit we only fetch
// an abstract exactly for this network
if (($limit == 0) && ($abstract == $default_abstract)) {
$abstract = "";
}
} else {// Try to guess the correct target network
switch ($htmlmode) {
case 8:
$abstract = self::getAbstract($b["body"], NETWORK_TWITTER);
break;
case 7:
$abstract = self::getAbstract($b["body"], NETWORK_STATUSNET);
break;
case 6:
$abstract = self::getAbstract($b["body"], NETWORK_APPNET);
break;
default: // We don't know the exact target.
// We fetch an abstract since there is a posting limit.
if ($limit > 0) {
$abstract = self::getAbstract($b["body"]);
}
}
}
if ($abstract != "") {
$post["text"] = $abstract;
if ($post["type"] == "text") {
$post["type"] = "link";
$post["url"] = $b["plink"];
}
}
$html = self::convert($post["text"].$post["after"], false, $htmlmode);
$msg = HTML::toPlaintext($html, 0, true);
$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
$link = "";
if ($includedlinks) {
if ($post["type"] == "link") {
$link = $post["url"];
} elseif ($post["type"] == "text") {
$link = $post["url"];
} elseif ($post["type"] == "video") {
$link = $post["url"];
} elseif ($post["type"] == "photo") {
$link = $post["image"];
}
if (($msg == "") && isset($post["title"])) {
$msg = trim($post["title"]);
}
if (($msg == "") && isset($post["description"])) {
$msg = trim($post["description"]);
}
// If the link is already contained in the post, then it neeedn't to be added again
// But: if the link is beyond the limit, then it has to be added.
if (($link != "") && strstr($msg, $link)) {
$pos = strpos($msg, $link);
// Will the text be shortened in the link?
// Or is the link the last item in the post?
if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
$msg = trim(str_replace($link, "", $msg));
} elseif (($limit == 0) || ($pos < $limit)) {
// The limit has to be increased since it will be shortened - but not now
// Only do it with Twitter (htmlmode = 8)
if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
$limit = $limit - 23 + strlen($link);
}
$link = "";
if ($post["type"] == "text") {
unset($post["url"]);
}
}
}
}
if ($limit > 0) {
// Reduce multiple spaces
// When posted to a network with limited space, we try to gain space where possible
while (strpos($msg, " ") !== false) {
$msg = str_replace(" ", " ", $msg);
}
// Twitter is using its own limiter, so we always assume that shortened links will have this length
if (iconv_strlen($link, "UTF-8") > 0) {
$limit = $limit - 23;
}
if (iconv_strlen($msg, "UTF-8") > $limit) {
if (($post["type"] == "text") && isset($post["url"])) {
$post["url"] = $b["plink"];
} elseif (!isset($post["url"])) {
$limit = $limit - 23;
$post["url"] = $b["plink"];
// Which purpose has this line? It is now uncommented, but left as a reminder
//} elseif (strpos($b["body"], "[share") !== false) {
// $post["url"] = $b["plink"];
} elseif (PConfig::get($b["uid"], "system", "no_intelligent_shortening")) {
$post["url"] = $b["plink"];
}
$msg = Plaintext::shorten($msg, $limit);
}
}
$post["text"] = trim($msg);
return($post);
} }
public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false) public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false)
@ -1947,7 +1808,7 @@ class BBCode extends BaseObject
* @param string $addon The addon for which the abstract is meant for * @param string $addon The addon for which the abstract is meant for
* @return string The abstract * @return string The abstract
*/ */
private static function getAbstract($text, $addon = "") public static function getAbstract($text, $addon = "")
{ {
$abstract = ""; $abstract = "";
$abstracts = []; $abstracts = [];

View file

@ -7,6 +7,7 @@
namespace Friendica\Model; namespace Friendica\Model;
use Friendica\BaseObject; use Friendica\BaseObject;
use Friendica\Content\Text;
use Friendica\Core\Addon; use Friendica\Core\Addon;
use Friendica\Core\Config; use Friendica\Core\Config;
use Friendica\Core\L10n; use Friendica\Core\L10n;
@ -977,35 +978,35 @@ class Item extends BaseObject
* if possible and not already present. * if possible and not already present.
* Expects "body" element to exist in $arr. * Expects "body" element to exist in $arr.
*/ */
private static function addLanguageInPostopts(&$arr) private static function addLanguageInPostopts(&$item)
{ {
if (x($arr, 'postopts')) { if (!empty($item['postopts'])) {
if (strstr($arr['postopts'], 'lang=')) { if (strstr($item['postopts'], 'lang=')) {
// do not override // do not override
return; return;
} }
$postopts = $arr['postopts']; $postopts = $item['postopts'];
} else { } else {
$postopts = ""; $postopts = "";
} }
$naked_body = preg_replace('/\[(.+?)\]/','', $arr['body']); $naked_body = Text\BBCode::toPlaintext($item['body'], false);
$l = new Text_LanguageDetect();
$lng = $l->detect($naked_body, 3);
if (sizeof($lng) > 0) { $languages = (new Text_LanguageDetect())->detect($naked_body, 3);
if ($postopts != "") {
if (sizeof($languages) > 0) {
if ($postopts != '') {
$postopts .= '&'; // arbitrary separator, to be reviewed $postopts .= '&'; // arbitrary separator, to be reviewed
} }
$postopts .= 'lang='; $postopts .= 'lang=';
$sep = ""; $sep = "";
foreach ($lng as $language => $score) { foreach ($languages as $language => $score) {
$postopts .= $sep . $language . ";" . $score; $postopts .= $sep . $language . ";" . $score;
$sep = ':'; $sep = ':';
} }
$arr['postopts'] = $postopts; $item['postopts'] = $postopts;
} }
} }

176
src/Model/ItemContent.php Normal file
View file

@ -0,0 +1,176 @@
<?php
/**
* @file src/Model/ItemContent.php
*/
namespace Friendica\Model;
use Friendica\BaseObject;
use Friendica\Content\Text;
use Friendica\Core\PConfig;
require_once 'boot.php';
require_once 'include/items.php';
require_once 'include/text.php';
class ItemContent extends BaseObject
{
    /**
     * Number of characters a link is assumed to occupy on the target network.
     *
     * Twitter shortens every link with its own shortener, so links are always
     * budgeted with this fixed length when a posting limit applies.
     */
    const SHORT_LINK_LENGTH = 23;

    /**
     * @brief Convert a message into plaintext for connectors to other networks
     *
     * The item body is stripped of hashtag/mention links and of its abstract,
     * passed through BBCode::getAttachedData, converted to HTML and then to
     * plaintext. When a limit is given, the text is shortened and the item
     * permalink is stored as the post URL where needed.
     *
     * @param array  $item           The message array that is about to be posted.
     *                               Reads the keys 'body', 'title', 'plink' and 'uid'.
     * @param int    $limit          The maximum number of characters when posting to that network
     *                               (0 means no limit)
     * @param bool   $includedlinks  Has an attached link to be included into the message?
     * @param int    $htmlmode       This controls the behavior of the BBCode conversion.
     *                               Without a target network, 8, 7 and 6 select the
     *                               Twitter, StatusNet and App.net abstract respectively.
     * @param string $target_network Name of the network where the post should go to.
     *
     * @see \Friendica\Content\Text\BBCode::getAttachedData
     *
     * @return array Same array structure as \Friendica\Content\Text\BBCode::getAttachedData,
     *               with 'text' replaced by the resulting plaintext
     */
    public static function getPlaintextPost($item, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = '')
    {
        // Turn linked hashtags and mentions ("#[url=...]tag[/url]") into plain "#tag" / "@name"
        $URLSearchString = '^\[\]';
        $body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $item['body']);

        // Add an URL element if the text contains a raw link
        $body = preg_replace(
            '/([^\]\=\'"]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism',
            '$1[url]$2[/url]',
            $body
        );

        // Remove the abstract
        $body = Text\BBCode::stripAbstract($body);

        // At first look at data that is attached via "type-..." stuff
        // This will hopefully be replaced with a dedicated bbcode later
        $post = Text\BBCode::getAttachedData($body, $item);

        // Prepend the title to the text, or use the title alone when there is no text
        if ($item['title'] != '') {
            if ($post['text'] != '') {
                $post['text'] = trim($item['title'] . "\n\n" . $post['text']);
            } else {
                $post['text'] = trim($item['title']);
            }
        }

        // An abstract, when available, replaces the whole text
        $abstract = self::pickAbstract($item['body'], $limit, $htmlmode, $target_network);
        if ($abstract != '') {
            $post['text'] = $abstract;

            if ($post['type'] == 'text') {
                $post['type'] = 'link';
                $post['url'] = $item['plink'];
            }
        }

        $html = Text\BBCode::convert($post['text'] . $post['after'], false, $htmlmode);
        $msg = Text\HTML::toPlaintext($html, 0, true);
        $msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));

        $link = '';
        if ($includedlinks) {
            $link = self::attachedLink($post);

            if (($msg == '') && isset($post['title'])) {
                $msg = trim($post['title']);
            }

            if (($msg == '') && isset($post['description'])) {
                $msg = trim($post['description']);
            }

            // If the link is already contained in the post, then it needn't be added again
            // But: if the link is beyond the limit, then it has to be added.
            $bytePos = ($link != '') ? strpos($msg, $link) : false;
            if ($bytePos !== false) {
                // $limit is a character count, strpos() yields a byte offset:
                // convert the offset so multibyte text is compared correctly.
                $pos = iconv_strlen(substr($msg, 0, $bytePos), 'UTF-8');
                if ($pos === false) {
                    $pos = $bytePos;
                }

                $linkLength = iconv_strlen($link, 'UTF-8');
                if ($linkLength === false) {
                    $linkLength = strlen($link);
                }

                // Byte arithmetic is consistent on both sides here
                $linkIsLast = ($bytePos + strlen($link) == strlen($msg));

                // Will the text be shortened in the link?
                // Or is the link the last item in the post?
                if (($limit > 0) && ($pos < $limit) && (($pos + self::SHORT_LINK_LENGTH > $limit) || $linkIsLast)) {
                    $msg = trim(str_replace($link, '', $msg));
                } elseif (($limit == 0) || ($pos < $limit)) {
                    // The limit has to be increased since the link will be shortened - but not now
                    // Only do it with Twitter (htmlmode = 8)
                    if (($limit > 0) && ($linkLength > self::SHORT_LINK_LENGTH) && ($htmlmode == 8)) {
                        $limit = $limit - self::SHORT_LINK_LENGTH + $linkLength;
                    }

                    // The link stays inside the text, so it must not be attached separately
                    $link = '';

                    if ($post['type'] == 'text') {
                        unset($post['url']);
                    }
                }
            }
        }

        if ($limit > 0) {
            // Reduce multiple spaces
            // When posted to a network with limited space, we try to gain space where possible
            $msg = preg_replace('/\x20{2,}/', ' ', $msg);

            // Twitter is using its own limiter, so we always assume that shortened links will have this length
            if (iconv_strlen($link, 'UTF-8') > 0) {
                $limit = $limit - self::SHORT_LINK_LENGTH;
            }

            if (iconv_strlen($msg, 'UTF-8') > $limit) {
                if (!isset($post['url'])) {
                    // No URL so far: reserve room for the permalink that gets added
                    $limit = $limit - self::SHORT_LINK_LENGTH;
                    $post['url'] = $item['plink'];
                } elseif (($post['type'] == 'text')
                    || (strpos($item['body'], '[share') !== false)
                    || PConfig::get($item['uid'], 'system', 'no_intelligent_shortening')
                ) {
                    // The existing URL is replaced by the permalink to the full post
                    $post['url'] = $item['plink'];
                }

                $msg = Text\Plaintext::shorten($msg, $limit);
            }
        }

        $post['text'] = trim($msg);

        return $post;
    }

    /**
     * @brief Select the abstract of an item body that fits the target of the post
     *
     * @param string $body           The original item body
     * @param int    $limit          The maximum number of characters (0 means no limit)
     * @param int    $htmlmode       BBCode conversion mode, used to guess the target network
     * @param string $target_network Name of the target network, may be empty
     *
     * @return string The abstract, or an empty string when none applies
     */
    private static function pickAbstract($body, $limit, $htmlmode, $target_network)
    {
        // Fetch the abstract from the given target network
        if ($target_network != '') {
            $default_abstract = Text\BBCode::getAbstract($body);
            $abstract = Text\BBCode::getAbstract($body, $target_network);

            // If we post to a network with no limit we only fetch
            // an abstract exactly for this network
            if (($limit == 0) && ($abstract == $default_abstract)) {
                return '';
            }

            return $abstract;
        }

        // Try to guess the correct target network
        switch ($htmlmode) {
            case 8:
                return Text\BBCode::getAbstract($body, NETWORK_TWITTER);
            case 7:
                return Text\BBCode::getAbstract($body, NETWORK_STATUSNET);
            case 6:
                return Text\BBCode::getAbstract($body, NETWORK_APPNET);
        }

        // We don't know the exact target.
        // We fetch an abstract only when there is a posting limit.
        if ($limit > 0) {
            return Text\BBCode::getAbstract($body);
        }

        return '';
    }

    /**
     * @brief Return the link that belongs to the attachment of a post
     *
     * Posts of type "link", "text" and "video" use the 'url' element,
     * posts of type "photo" use the 'image' element.
     *
     * @param array $post Post array as returned by BBCode::getAttachedData
     *
     * @return string The link, or an empty string when the element is missing
     *                or the type is none of the above
     */
    private static function attachedLink(array $post)
    {
        if (in_array($post['type'], ['link', 'text', 'video'])) {
            return isset($post['url']) ? $post['url'] : '';
        }

        if ($post['type'] == 'photo') {
            return isset($post['image']) ? $post['image'] : '';
        }

        return '';
    }
}