Merge pull request #4881 from MrPetovan/task/4867-guess-language-from-plaintext
Guess language from plaintext
This commit is contained in:
commit
37e5272f92
4 changed files with 203 additions and 159 deletions
|
@ -23,6 +23,12 @@ function babel_content()
|
|||
'content' => visible_lf($bbcode)
|
||||
];
|
||||
|
||||
$plain = Text\BBCode::toPlaintext($bbcode, false);
|
||||
$results[] = [
|
||||
'title' => L10n::t('BBCode::toPlaintext'),
|
||||
'content' => visible_lf($plain)
|
||||
];
|
||||
|
||||
$html = Text\BBCode::convert($bbcode);
|
||||
$results[] = [
|
||||
'title' => L10n::t("BBCode::convert \x28raw HTML\x29"),
|
||||
|
|
|
@ -343,159 +343,20 @@ class BBCode extends BaseObject
|
|||
}
|
||||
|
||||
/**
|
||||
* @brief Convert a message into plaintext for connectors to other networks
|
||||
* @brief Converts a BBCode text into plaintext
|
||||
*
|
||||
* @param array $b The message array that is about to be posted
|
||||
* @param int $limit The maximum number of characters when posting to that network
|
||||
* @param bool $includedlinks Has an attached link to be included into the message?
|
||||
* @param int $htmlmode This triggers the behaviour of the bbcode conversion
|
||||
* @param string $target_network Name of the network where the post should go to.
|
||||
* @param bool $keep_urls Whether to keep URLs in the resulting plaintext
|
||||
*
|
||||
* @return string The converted message
|
||||
* @return string
|
||||
*/
|
||||
public static function toPlaintext($b, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = "")
|
||||
public static function toPlaintext($text, $keep_urls = true)
|
||||
{
|
||||
// Remove the hash tags
|
||||
$URLSearchString = "^\[\]";
|
||||
$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $b["body"]);
|
||||
|
||||
// Add an URL element if the text contains a raw link
|
||||
$body = preg_replace("/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism", '$1[url]$2[/url]', $body);
|
||||
|
||||
// Remove the abstract
|
||||
$body = self::stripAbstract($body);
|
||||
|
||||
// At first look at data that is attached via "type-..." stuff
|
||||
// This will hopefully replaced with a dedicated bbcode later
|
||||
//$post = self::getAttachedData($b["body"]);
|
||||
$post = self::getAttachedData($body, $b);
|
||||
|
||||
if (($b["title"] != "") && ($post["text"] != "")) {
|
||||
$post["text"] = trim($b["title"]."\n\n".$post["text"]);
|
||||
} elseif ($b["title"] != "") {
|
||||
$post["text"] = trim($b["title"]);
|
||||
$naked_text = preg_replace('/\[(.+?)\]/','', $text);
|
||||
if (!$keep_urls) {
|
||||
$naked_text = preg_replace('#https?\://[^\s<]+[^\s\.\)]#i', '', $naked_text);
|
||||
}
|
||||
|
||||
$abstract = "";
|
||||
|
||||
// Fetch the abstract from the given target network
|
||||
if ($target_network != "") {
|
||||
$default_abstract = self::getAbstract($b["body"]);
|
||||
$abstract = self::getAbstract($b["body"], $target_network);
|
||||
|
||||
// If we post to a network with no limit we only fetch
|
||||
// an abstract exactly for this network
|
||||
if (($limit == 0) && ($abstract == $default_abstract)) {
|
||||
$abstract = "";
|
||||
}
|
||||
} else {// Try to guess the correct target network
|
||||
switch ($htmlmode) {
|
||||
case 8:
|
||||
$abstract = self::getAbstract($b["body"], NETWORK_TWITTER);
|
||||
break;
|
||||
case 7:
|
||||
$abstract = self::getAbstract($b["body"], NETWORK_STATUSNET);
|
||||
break;
|
||||
case 6:
|
||||
$abstract = self::getAbstract($b["body"], NETWORK_APPNET);
|
||||
break;
|
||||
default: // We don't know the exact target.
|
||||
// We fetch an abstract since there is a posting limit.
|
||||
if ($limit > 0) {
|
||||
$abstract = self::getAbstract($b["body"]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($abstract != "") {
|
||||
$post["text"] = $abstract;
|
||||
|
||||
if ($post["type"] == "text") {
|
||||
$post["type"] = "link";
|
||||
$post["url"] = $b["plink"];
|
||||
}
|
||||
}
|
||||
|
||||
$html = self::convert($post["text"].$post["after"], false, $htmlmode);
|
||||
$msg = HTML::toPlaintext($html, 0, true);
|
||||
$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
|
||||
|
||||
$link = "";
|
||||
if ($includedlinks) {
|
||||
if ($post["type"] == "link") {
|
||||
$link = $post["url"];
|
||||
} elseif ($post["type"] == "text") {
|
||||
$link = $post["url"];
|
||||
} elseif ($post["type"] == "video") {
|
||||
$link = $post["url"];
|
||||
} elseif ($post["type"] == "photo") {
|
||||
$link = $post["image"];
|
||||
}
|
||||
|
||||
if (($msg == "") && isset($post["title"])) {
|
||||
$msg = trim($post["title"]);
|
||||
}
|
||||
|
||||
if (($msg == "") && isset($post["description"])) {
|
||||
$msg = trim($post["description"]);
|
||||
}
|
||||
|
||||
// If the link is already contained in the post, then it neeedn't to be added again
|
||||
// But: if the link is beyond the limit, then it has to be added.
|
||||
if (($link != "") && strstr($msg, $link)) {
|
||||
$pos = strpos($msg, $link);
|
||||
|
||||
// Will the text be shortened in the link?
|
||||
// Or is the link the last item in the post?
|
||||
if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
|
||||
$msg = trim(str_replace($link, "", $msg));
|
||||
} elseif (($limit == 0) || ($pos < $limit)) {
|
||||
// The limit has to be increased since it will be shortened - but not now
|
||||
// Only do it with Twitter (htmlmode = 8)
|
||||
if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
|
||||
$limit = $limit - 23 + strlen($link);
|
||||
}
|
||||
|
||||
$link = "";
|
||||
|
||||
if ($post["type"] == "text") {
|
||||
unset($post["url"]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($limit > 0) {
|
||||
// Reduce multiple spaces
|
||||
// When posted to a network with limited space, we try to gain space where possible
|
||||
while (strpos($msg, " ") !== false) {
|
||||
$msg = str_replace(" ", " ", $msg);
|
||||
}
|
||||
|
||||
// Twitter is using its own limiter, so we always assume that shortened links will have this length
|
||||
if (iconv_strlen($link, "UTF-8") > 0) {
|
||||
$limit = $limit - 23;
|
||||
}
|
||||
|
||||
if (iconv_strlen($msg, "UTF-8") > $limit) {
|
||||
if (($post["type"] == "text") && isset($post["url"])) {
|
||||
$post["url"] = $b["plink"];
|
||||
} elseif (!isset($post["url"])) {
|
||||
$limit = $limit - 23;
|
||||
$post["url"] = $b["plink"];
|
||||
// Which purpose has this line? It is now uncommented, but left as a reminder
|
||||
//} elseif (strpos($b["body"], "[share") !== false) {
|
||||
// $post["url"] = $b["plink"];
|
||||
} elseif (PConfig::get($b["uid"], "system", "no_intelligent_shortening")) {
|
||||
$post["url"] = $b["plink"];
|
||||
}
|
||||
$msg = Plaintext::shorten($msg, $limit);
|
||||
}
|
||||
}
|
||||
|
||||
$post["text"] = trim($msg);
|
||||
|
||||
return($post);
|
||||
return $naked_text;
|
||||
}
|
||||
|
||||
public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false)
|
||||
|
@ -1947,7 +1808,7 @@ class BBCode extends BaseObject
|
|||
* @param string $addon The addon for which the abstract is meant for
|
||||
* @return string The abstract
|
||||
*/
|
||||
private static function getAbstract($text, $addon = "")
|
||||
public static function getAbstract($text, $addon = "")
|
||||
{
|
||||
$abstract = "";
|
||||
$abstracts = [];
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
namespace Friendica\Model;
|
||||
|
||||
use Friendica\BaseObject;
|
||||
use Friendica\Content\Text;
|
||||
use Friendica\Core\Addon;
|
||||
use Friendica\Core\Config;
|
||||
use Friendica\Core\L10n;
|
||||
|
@ -977,35 +978,35 @@ class Item extends BaseObject
|
|||
* if possible and not already present.
|
||||
* Expects "body" element to exist in $arr.
|
||||
*/
|
||||
private static function addLanguageInPostopts(&$arr)
|
||||
private static function addLanguageInPostopts(&$item)
|
||||
{
|
||||
if (x($arr, 'postopts')) {
|
||||
if (strstr($arr['postopts'], 'lang=')) {
|
||||
if (!empty($item['postopts'])) {
|
||||
if (strstr($item['postopts'], 'lang=')) {
|
||||
// do not override
|
||||
return;
|
||||
}
|
||||
$postopts = $arr['postopts'];
|
||||
$postopts = $item['postopts'];
|
||||
} else {
|
||||
$postopts = "";
|
||||
}
|
||||
|
||||
$naked_body = preg_replace('/\[(.+?)\]/','', $arr['body']);
|
||||
$l = new Text_LanguageDetect();
|
||||
$lng = $l->detect($naked_body, 3);
|
||||
$naked_body = Text\BBCode::toPlaintext($item['body'], false);
|
||||
|
||||
if (sizeof($lng) > 0) {
|
||||
if ($postopts != "") {
|
||||
$languages = (new Text_LanguageDetect())->detect($naked_body, 3);
|
||||
|
||||
if (sizeof($languages) > 0) {
|
||||
if ($postopts != '') {
|
||||
$postopts .= '&'; // arbitrary separator, to be reviewed
|
||||
}
|
||||
|
||||
$postopts .= 'lang=';
|
||||
$sep = "";
|
||||
|
||||
foreach ($lng as $language => $score) {
|
||||
foreach ($languages as $language => $score) {
|
||||
$postopts .= $sep . $language . ";" . $score;
|
||||
$sep = ':';
|
||||
}
|
||||
$arr['postopts'] = $postopts;
|
||||
$item['postopts'] = $postopts;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
176
src/Model/ItemContent.php
Normal file
176
src/Model/ItemContent.php
Normal file
|
@ -0,0 +1,176 @@
|
|||
<?php
|
||||
|
||||
/**
|
||||
* @file src/Model/ItemContent.php
|
||||
*/
|
||||
|
||||
namespace Friendica\Model;
|
||||
|
||||
use Friendica\BaseObject;
|
||||
use Friendica\Content\Text;
|
||||
use Friendica\Core\PConfig;
|
||||
|
||||
require_once 'boot.php';
|
||||
require_once 'include/items.php';
|
||||
require_once 'include/text.php';
|
||||
|
||||
class ItemContent extends BaseObject
|
||||
{
|
||||
/**
|
||||
* @brief Convert a message into plaintext for connectors to other networks
|
||||
*
|
||||
* @param array $item The message array that is about to be posted
|
||||
* @param int $limit The maximum number of characters when posting to that network
|
||||
* @param bool $includedlinks Has an attached link to be included into the message?
|
||||
* @param int $htmlmode This controls the behavior of the BBCode conversion
|
||||
* @param string $target_network Name of the network where the post should go to.
|
||||
*
|
||||
* @see \Friendica\Content\Text\BBCode::getAttachedData
|
||||
*
|
||||
* @return array Same array structure as \Friendica\Content\Text\BBCode::getAttachedData
|
||||
*/
|
||||
public static function getPlaintextPost($item, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = '')
|
||||
{
|
||||
// Remove hashtags
|
||||
$URLSearchString = '^\[\]';
|
||||
$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $item['body']);
|
||||
|
||||
// Add a URL element if the text contains a raw link
|
||||
$body = preg_replace('/([^\]\=\'"]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism',
|
||||
'$1[url]$2[/url]', $body);
|
||||
|
||||
// Remove the abstract
|
||||
$body = Text\BBCode::stripAbstract($body);
|
||||
|
||||
// At first look at data that is attached via "type-..." stuff
|
||||
// This will hopefully be replaced with a dedicated bbcode later
|
||||
//$post = self::getAttachedData($b['body']);
|
||||
$post = Text\BBCode::getAttachedData($body, $item);
|
||||
|
||||
if (($item['title'] != '') && ($post['text'] != '')) {
|
||||
$post['text'] = trim($item['title'] . "\n\n" . $post['text']);
|
||||
} elseif ($item['title'] != '') {
|
||||
$post['text'] = trim($item['title']);
|
||||
}
|
||||
|
||||
$abstract = '';
|
||||
|
||||
// Fetch the abstract from the given target network
|
||||
if ($target_network != '') {
|
||||
$default_abstract = Text\BBCode::getAbstract($item['body']);
|
||||
$abstract = Text\BBCode::getAbstract($item['body'], $target_network);
|
||||
|
||||
// If we post to a network with no limit we only fetch
|
||||
// an abstract exactly for this network
|
||||
if (($limit == 0) && ($abstract == $default_abstract)) {
|
||||
$abstract = '';
|
||||
}
|
||||
} else {// Try to guess the correct target network
|
||||
switch ($htmlmode) {
|
||||
case 8:
|
||||
$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_TWITTER);
|
||||
break;
|
||||
case 7:
|
||||
$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_STATUSNET);
|
||||
break;
|
||||
case 6:
|
||||
$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_APPNET);
|
||||
break;
|
||||
default: // We don't know the exact target.
|
||||
// We fetch an abstract since there is a posting limit.
|
||||
if ($limit > 0) {
|
||||
$abstract = Text\BBCode::getAbstract($item['body']);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($abstract != '') {
|
||||
$post['text'] = $abstract;
|
||||
|
||||
if ($post['type'] == 'text') {
|
||||
$post['type'] = 'link';
|
||||
$post['url'] = $item['plink'];
|
||||
}
|
||||
}
|
||||
|
||||
$html = Text\BBCode::convert($post['text'] . $post['after'], false, $htmlmode);
|
||||
$msg = Text\HTML::toPlaintext($html, 0, true);
|
||||
$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
|
||||
|
||||
$link = '';
|
||||
if ($includedlinks) {
|
||||
if ($post['type'] == 'link') {
|
||||
$link = $post['url'];
|
||||
} elseif ($post['type'] == 'text') {
|
||||
$link = $post['url'];
|
||||
} elseif ($post['type'] == 'video') {
|
||||
$link = $post['url'];
|
||||
} elseif ($post['type'] == 'photo') {
|
||||
$link = $post['image'];
|
||||
}
|
||||
|
||||
if (($msg == '') && isset($post['title'])) {
|
||||
$msg = trim($post['title']);
|
||||
}
|
||||
|
||||
if (($msg == '') && isset($post['description'])) {
|
||||
$msg = trim($post['description']);
|
||||
}
|
||||
|
||||
// If the link is already contained in the post, then it needn't be added again
|
||||
// But: if the link is beyond the limit, then it has to be added.
|
||||
if (($link != '') && strstr($msg, $link)) {
|
||||
$pos = strpos($msg, $link);
|
||||
|
||||
// Will the text be shortened in the link?
|
||||
// Or is the link the last item in the post?
|
||||
if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
|
||||
$msg = trim(str_replace($link, '', $msg));
|
||||
} elseif (($limit == 0) || ($pos < $limit)) {
|
||||
// The limit has to be increased since it will be shortened - but not now
|
||||
// Only do it with Twitter (htmlmode = 8)
|
||||
if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
|
||||
$limit = $limit - 23 + strlen($link);
|
||||
}
|
||||
|
||||
$link = '';
|
||||
|
||||
if ($post['type'] == 'text') {
|
||||
unset($post['url']);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($limit > 0) {
|
||||
// Reduce multiple spaces
|
||||
// When posted to a network with limited space, we try to gain space where possible
|
||||
while (strpos($msg, ' ') !== false) {
|
||||
$msg = str_replace(' ', ' ', $msg);
|
||||
}
|
||||
|
||||
// Twitter is using its own limiter, so we always assume that shortened links will have this length
|
||||
if (iconv_strlen($link, 'UTF-8') > 0) {
|
||||
$limit = $limit - 23;
|
||||
}
|
||||
|
||||
if (iconv_strlen($msg, 'UTF-8') > $limit) {
|
||||
if (($post['type'] == 'text') && isset($post['url'])) {
|
||||
$post['url'] = $item['plink'];
|
||||
} elseif (!isset($post['url'])) {
|
||||
$limit = $limit - 23;
|
||||
$post['url'] = $item['plink'];
|
||||
} elseif (strpos($item['body'], '[share') !== false) {
|
||||
$post['url'] = $item['plink'];
|
||||
} elseif (PConfig::get($item['uid'], 'system', 'no_intelligent_shortening')) {
|
||||
$post['url'] = $item['plink'];
|
||||
}
|
||||
$msg = Text\Plaintext::shorten($msg, $limit);
|
||||
}
|
||||
}
|
||||
|
||||
$post['text'] = trim($msg);
|
||||
|
||||
return $post;
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue