Merge pull request #4881 from MrPetovan/task/4867-guess-language-from-plaintext

Guess language from plaintext
2024-05-12 10:39:38 +02:00 · 2018-04-23 00:26:05 +02:00 · 2018-04-23 00:26:05 +02:00 · 37e5272f92
parent 4e9236e9e9 d977ff78df
commit 37e5272f92
4 changed files with 203 additions and 159 deletions
--- a/mod/babel.php
+++ b/mod/babel.php
@ -23,6 +23,12 @@ function babel_content()
 					'content' => visible_lf($bbcode)
 				];
 				$plain = Text\BBCode::toPlaintext($bbcode, false);
 				$results[] = [
 					'title' => L10n::t('BBCode::toPlaintext'),
 					'content' => visible_lf($plain)
 				];
 				$html = Text\BBCode::convert($bbcode);
 				$results[] = [
 					'title' => L10n::t("BBCode::convert \x28raw HTML\x29"),
--- a/src/Content/Text/BBCode.php
+++ b/src/Content/Text/BBCode.php
@ -343,159 +343,20 @@ class BBCode extends BaseObject
 	}
 	/**
-	 * @brief Convert a message into plaintext for connectors to other networks
+	 * @brief Converts a BBCode text into plaintext
 	 *
-	 * @param array $b The message array that is about to be posted
+	 * @param bool $keep_urls Whether to keep URLs in the resulting plaintext
 	 * @param int $limit The maximum number of characters when posting to that network
 	 * @param bool $includedlinks Has an attached link to be included into the message?
 	 * @param int $htmlmode This triggers the behaviour of the bbcode conversion
 	 * @param string $target_network Name of the network where the post should go to.
 	 *
-	 * @return string The converted message
+	 * @return string
 	 */
-	public static function toPlaintext($b, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = "")
+	public static function toPlaintext($text, $keep_urls = true)
 	{
-		// Remove the hash tags
+		$naked_text = preg_replace('/\[(.+?)\]/','', $text);
-		$URLSearchString = "^\[\]";
+		if (!$keep_urls) {
-		$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $b["body"]);
+			$naked_text = preg_replace('#https?\://[^\s<]+[^\s\.\)]#i', '', $naked_text);
 		// Add an URL element if the text contains a raw link
 		$body = preg_replace("/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism", '$1[url]$2[/url]', $body);
 		// Remove the abstract
 		$body = self::stripAbstract($body);
 		// At first look at data that is attached via "type-..." stuff
 		// This will hopefully replaced with a dedicated bbcode later
 		//$post = self::getAttachedData($b["body"]);
 		$post = self::getAttachedData($body, $b);
 		if (($b["title"] != "") && ($post["text"] != "")) {
 			$post["text"] = trim($b["title"]."\n\n".$post["text"]);
 		} elseif ($b["title"] != "") {
 			$post["text"] = trim($b["title"]);
 		}
-		$abstract = "";
+		return $naked_text;
 		// Fetch the abstract from the given target network
 		if ($target_network != "") {
 			$default_abstract = self::getAbstract($b["body"]);
 			$abstract = self::getAbstract($b["body"], $target_network);
 			// If we post to a network with no limit we only fetch
 			// an abstract exactly for this network
 			if (($limit == 0) && ($abstract == $default_abstract)) {
 				$abstract = "";
 			}
 		} else {// Try to guess the correct target network
 			switch ($htmlmode) {
 				case 8:
 					$abstract = self::getAbstract($b["body"], NETWORK_TWITTER);
 					break;
 				case 7:
 					$abstract = self::getAbstract($b["body"], NETWORK_STATUSNET);
 					break;
 				case 6:
 					$abstract = self::getAbstract($b["body"], NETWORK_APPNET);
 					break;
 				default: // We don't know the exact target.
 					// We fetch an abstract since there is a posting limit.
 					if ($limit > 0) {
 						$abstract = self::getAbstract($b["body"]);
 					}
 			}
 		}
 		if ($abstract != "") {
 			$post["text"] = $abstract;
 			if ($post["type"] == "text") {
 				$post["type"] = "link";
 				$post["url"] = $b["plink"];
 			}
 		}
 		$html = self::convert($post["text"].$post["after"], false, $htmlmode);
 		$msg = HTML::toPlaintext($html, 0, true);
 		$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
 		$link = "";
 		if ($includedlinks) {
 			if ($post["type"] == "link") {
 				$link = $post["url"];
 			} elseif ($post["type"] == "text") {
 				$link = $post["url"];
 			} elseif ($post["type"] == "video") {
 				$link = $post["url"];
 			} elseif ($post["type"] == "photo") {
 				$link = $post["image"];
 			}
 			if (($msg == "") && isset($post["title"])) {
 				$msg = trim($post["title"]);
 			}
 			if (($msg == "") && isset($post["description"])) {
 				$msg = trim($post["description"]);
 			}
 			// If the link is already contained in the post, then it neeedn't to be added again
 			// But: if the link is beyond the limit, then it has to be added.
 			if (($link != "") && strstr($msg, $link)) {
 				$pos = strpos($msg, $link);
 				// Will the text be shortened in the link?
 				// Or is the link the last item in the post?
 				if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
 					$msg = trim(str_replace($link, "", $msg));
 				} elseif (($limit == 0) || ($pos < $limit)) {
 					// The limit has to be increased since it will be shortened - but not now
 					// Only do it with Twitter (htmlmode = 8)
 					if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
 						$limit = $limit - 23 + strlen($link);
 					}
 					$link = "";
 					if ($post["type"] == "text") {
 						unset($post["url"]);
 					}
 				}
 			}
 		}
 		if ($limit > 0) {
 			// Reduce multiple spaces
 			// When posted to a network with limited space, we try to gain space where possible
 			while (strpos($msg, "  ") !== false) {
 				$msg = str_replace("  ", " ", $msg);
 			}
 			// Twitter is using its own limiter, so we always assume that shortened links will have this length
 			if (iconv_strlen($link, "UTF-8") > 0) {
 				$limit = $limit - 23;
 			}
 			if (iconv_strlen($msg, "UTF-8") > $limit) {
 				if (($post["type"] == "text") && isset($post["url"])) {
 					$post["url"] = $b["plink"];
 				} elseif (!isset($post["url"])) {
 					$limit = $limit - 23;
 					$post["url"] = $b["plink"];
 				// Which purpose has this line? It is now uncommented, but left as a reminder
 				//} elseif (strpos($b["body"], "[share") !== false) {
 				//	$post["url"] = $b["plink"];
 				} elseif (PConfig::get($b["uid"], "system", "no_intelligent_shortening")) {
 					$post["url"] = $b["plink"];
 				}
 				$msg = Plaintext::shorten($msg, $limit);
 			}
 		}
 		$post["text"] = trim($msg);
 		return($post);
 	}
 	public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false)
@ -1947,7 +1808,7 @@ class BBCode extends BaseObject
 	 * @param string $addon The addon for which the abstract is meant for
 	 * @return string The abstract
 	 */
-	private static function getAbstract($text, $addon = "")
+	public static function getAbstract($text, $addon = "")
 	{
 		$abstract = "";
 		$abstracts = [];
--- a/src/Model/Item.php
+++ b/src/Model/Item.php
@ -7,6 +7,7 @@
 namespace Friendica\Model;
 use Friendica\BaseObject;
 use Friendica\Content\Text;
 use Friendica\Core\Addon;
 use Friendica\Core\Config;
 use Friendica\Core\L10n;
@ -977,35 +978,35 @@ class Item extends BaseObject
 	 * if possible and not already present.
 	 * Expects "body" element to exist in $arr.
 	 */
-	private static function addLanguageInPostopts(&$arr)
+	private static function addLanguageInPostopts(&$item)
 	{
-		if (x($arr, 'postopts')) {
+		if (!empty($item['postopts'])) {
-			if (strstr($arr['postopts'], 'lang=')) {
+			if (strstr($item['postopts'], 'lang=')) {
 				// do not override
 				return;
 			}
-			$postopts = $arr['postopts'];
+			$postopts = $item['postopts'];
 		} else {
 			$postopts = "";
 		}
-		$naked_body = preg_replace('/\[(.+?)\]/','', $arr['body']);
+		$naked_body = Text\BBCode::toPlaintext($item['body'], false);
 		$l = new Text_LanguageDetect();
 		$lng = $l->detect($naked_body, 3);
-		if (sizeof($lng) > 0) {
+		$languages = (new Text_LanguageDetect())->detect($naked_body, 3);
-			if ($postopts != "") {
+
 		if (sizeof($languages) > 0) {
 			if ($postopts != '') {
 				$postopts .= '&'; // arbitrary separator, to be reviewed
 			}
 			$postopts .= 'lang=';
 			$sep = "";
-			foreach ($lng as $language => $score) {
+			foreach ($languages as $language => $score) {
 				$postopts .= $sep . $language . ";" . $score;
 				$sep = ':';
 			}
-			$arr['postopts'] = $postopts;
+			$item['postopts'] = $postopts;
 		}
 	}
--- a/src/Model/ItemContent.php
+++ b/src/Model/ItemContent.php
@ -0,0 +1,176 @@
 <?php
 /**
 * @file src/Model/ItemContent.php
 */
 namespace Friendica\Model;
 use Friendica\BaseObject;
 use Friendica\Content\Text;
 use Friendica\Core\PConfig;
 require_once 'boot.php';
 require_once 'include/items.php';
 require_once 'include/text.php';
 class ItemContent extends BaseObject
 {
 	/**
 	 * @brief Convert a message into plaintext for connectors to other networks
 	 *
 	 * Processing steps, in order:
 	 *  - the [url] markup around "#"/"@" prefixed links is dropped, keeping prefix and label
 	 *  - raw http(s) links are wrapped into [url] elements
 	 *  - the abstract is stripped and the attached data is extracted from the body
 	 *  - the item title, if any, is put in front of the text
 	 *  - an abstract, if one applies, replaces the text
 	 *  - the result is converted from BBCode to HTML and then to plaintext
 	 *  - with a positive $limit, the text is compacted and shortened
 	 *
 	 * @param array  $item           The message array that is about to be posted.
 	 *                               The keys 'body', 'title', 'plink' and 'uid' are read.
 	 * @param int    $limit          The maximum number of characters when posting to that network (0 = no limit)
 	 * @param bool   $includedlinks  Has an attached link to be included into the message?
 	 * @param int    $htmlmode       This controls the behavior of the BBCode conversion. Without a target network,
 	 *                               8, 7 and 6 select the abstract for NETWORK_TWITTER, NETWORK_STATUSNET and
 	 *                               NETWORK_APPNET respectively.
 	 * @param string $target_network Name of the network where the post should go to.
 	 *
 	 * @see \Friendica\Content\Text\BBCode::getAttachedData
 	 *
 	 * @return array Same array structure as \Friendica\Content\Text\BBCode::getAttachedData,
 	 *               with 'text' holding the trimmed plaintext message
 	 */
 	public static function getPlaintextPost($item, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = '')
 	{
 		// Remove the link markup around hashtags and mentions, keeping the prefix and the label
 		$URLSearchString = '^\[\]';
 		$body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $item['body']);
 		// Add an URL element if the text contains a raw link
 		$body = preg_replace('/([^\]\=\'"]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism',
 			'$1[url]$2[/url]', $body);
 		// Remove the abstract
 		$body = Text\BBCode::stripAbstract($body);
 		// At first look at data that is attached via "type-..." stuff
 		// This will hopefully be replaced with a dedicated bbcode later
 		$post = Text\BBCode::getAttachedData($body, $item);
 		// A missing title is handled like an empty one instead of raising a notice
 		$title = isset($item['title']) ? $item['title'] : '';
 		if (($title != '') && ($post['text'] != '')) {
 			$post['text'] = trim($title . "\n\n" . $post['text']);
 		} elseif ($title != '') {
 			$post['text'] = trim($title);
 		}
 		$abstract = '';
 		// Fetch the abstract from the given target network
 		if ($target_network != '') {
 			$default_abstract = Text\BBCode::getAbstract($item['body']);
 			$abstract = Text\BBCode::getAbstract($item['body'], $target_network);
 			// If we post to a network with no limit we only fetch
 			// an abstract exactly for this network
 			if (($limit == 0) && ($abstract == $default_abstract)) {
 				$abstract = '';
 			}
 		} else {// Try to guess the correct target network
 			switch ($htmlmode) {
 				case 8:
 					$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_TWITTER);
 					break;
 				case 7:
 					$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_STATUSNET);
 					break;
 				case 6:
 					$abstract = Text\BBCode::getAbstract($item['body'], NETWORK_APPNET);
 					break;
 				default: // We don't know the exact target.
 					// We fetch an abstract since there is a posting limit.
 					if ($limit > 0) {
 						$abstract = Text\BBCode::getAbstract($item['body']);
 					}
 			}
 		}
 		// An abstract replaces the whole text; a plain text post then becomes a link to the original item
 		if ($abstract != '') {
 			$post['text'] = $abstract;
 			if ($post['type'] == 'text') {
 				$post['type'] = 'link';
 				$post['url'] = $item['plink'];
 			}
 		}
 		// The "after" part is optional, a missing one is handled like an empty string
 		$after = isset($post['after']) ? $post['after'] : '';
 		$html = Text\BBCode::convert($post['text'] . $after, false, $htmlmode);
 		$msg = Text\HTML::toPlaintext($html, 0, true);
 		$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
 		$link = '';
 		if ($includedlinks) {
 			// Photos are linked through their image, the other known types through their URL.
 			// Either key may be missing (the shortening code below already guards 'url' with isset()),
 			// in which case no link is used.
 			if (in_array($post['type'], ['link', 'text', 'video'])) {
 				$link = isset($post['url']) ? $post['url'] : '';
 			} elseif ($post['type'] == 'photo') {
 				$link = isset($post['image']) ? $post['image'] : '';
 			}
 			// Fall back to the attachment title, then to its description, when there is no text at all
 			if (($msg == '') && isset($post['title'])) {
 				$msg = trim($post['title']);
 			}
 			if (($msg == '') && isset($post['description'])) {
 				$msg = trim($post['description']);
 			}
 			// If the link is already contained in the post, then it doesn't need to be added again
 			// But: if the link is beyond the limit, then it has to be added.
 			// NOTE(review): $pos and strlen() count bytes while $limit is compared against
 			// iconv_strlen() (characters) further down - confirm this is intended for multi-byte text.
 			if (($link != '') && strstr($msg, $link)) {
 				$pos = strpos($msg, $link);
 				// Will the text be shortened in the link?
 				// Or is the link the last item in the post?
 				if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
 					$msg = trim(str_replace($link, '', $msg));
 				} elseif (($limit == 0) || ($pos < $limit)) {
 					// The limit has to be increased since it will be shortened - but not now
 					// Only do it with Twitter (htmlmode = 8)
 					if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
 						$limit = $limit - 23 + strlen($link);
 					}
 					$link = '';
 					if ($post['type'] == 'text') {
 						unset($post['url']);
 					}
 				}
 			}
 		}
 		if ($limit > 0) {
 			// Reduce multiple spaces
 			// When posted to a network with limited space, we try to gain space where possible
 			while (strpos($msg, '  ') !== false) {
 				$msg = str_replace('  ', ' ', $msg);
 			}
 			// Twitter is using its own limiter, so we always assume that shortened links will have this length
 			if (iconv_strlen($link, 'UTF-8') > 0) {
 				$limit = $limit - 23;
 			}
 			if (iconv_strlen($msg, 'UTF-8') > $limit) {
 				// The text is going to be cut: decide whether the post has to point to the original item
 				if (($post['type'] == 'text') && isset($post['url'])) {
 					$post['url'] = $item['plink'];
 				} elseif (!isset($post['url'])) {
 					// Reserve the room for the newly added link
 					$limit = $limit - 23;
 					$post['url'] = $item['plink'];
 				} elseif (strpos($item['body'], '[share') !== false) {
 					$post['url'] = $item['plink'];
 				} elseif (PConfig::get($item['uid'], 'system', 'no_intelligent_shortening')) {
 					$post['url'] = $item['plink'];
 				}
 				$msg = Text\Plaintext::shorten($msg, $limit);
 			}
 		}
 		$post['text'] = trim($msg);
 		return $post;
 	}
 }