From f3323aff5ec0ae918718432688ac82c66c8f99a2 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Wed, 17 Jun 2020 04:57:21 -0400 Subject: [PATCH] Deprecate page_info functions to new PageInfo class - Add tests for parts not using remote requests - Add scheme requirement for page info URLs - Add policy to keep label from stripped Page Info links --- include/items.php | 210 +++------------------- src/Content/PageInfo.php | 269 +++++++++++++++++++++++++++++ tests/src/Content/PageInfoMock.php | 38 ++++ tests/src/Content/PageInfoTest.php | 125 ++++++++++++++ 4 files changed, 457 insertions(+), 185 deletions(-) create mode 100644 src/Content/PageInfo.php create mode 100644 tests/src/Content/PageInfoMock.php create mode 100644 tests/src/Content/PageInfoTest.php diff --git a/include/items.php b/include/items.php index 2f218ea2a..16fe897be 100644 --- a/include/items.php +++ b/include/items.php @@ -19,209 +19,49 @@ * */ -use Friendica\Core\Hook; -use Friendica\Core\Logger; -use Friendica\Core\Protocol; -use Friendica\Core\Renderer; -use Friendica\Core\Session; -use Friendica\Database\DBA; -use Friendica\DI; -use Friendica\Model\Item; -use Friendica\Protocol\DFRN; -use Friendica\Protocol\Feed; -use Friendica\Protocol\OStatus; -use Friendica\Util\Network; -use Friendica\Util\ParseUrl; -use Friendica\Util\Strings; - -require_once __DIR__ . '/../mod/share.php'; - +/** + * @deprecated since 2020.06 + * @see \Friendica\Content\PageInfo::getFooterFromData + */ function add_page_info_data(array $data, $no_photos = false) { - Hook::callAll('page_info_data', $data); - - if (empty($data['type'])) { - return ''; - } - - // It maybe is a rich content, but if it does have everything that a link has, - // then treat it that way - if (($data["type"] == "rich") && is_string($data["title"]) && - is_string($data["text"]) && !empty($data["images"])) { - $data["type"] = "link"; - } - - $data["title"] = $data["title"] ?? 
''; - - if ((($data["type"] != "link") && ($data["type"] != "video") && ($data["type"] != "photo")) || ($data["title"] == $data["url"])) { - return ""; - } - - if ($no_photos && ($data["type"] == "photo")) { - return ""; - } - - // Escape some bad characters - $data["url"] = str_replace(["[", "]"], ["[", "]"], htmlentities($data["url"], ENT_QUOTES, 'UTF-8', false)); - $data["title"] = str_replace(["[", "]"], ["[", "]"], htmlentities($data["title"], ENT_QUOTES, 'UTF-8', false)); - - $text = "[attachment type='".$data["type"]."'"; - - if (empty($data["text"])) { - $data["text"] = $data["title"]; - } - - if (empty($data["text"])) { - $data["text"] = $data["url"]; - } - - if (!empty($data["url"])) { - $text .= " url='".$data["url"]."'"; - } - - if (!empty($data["title"])) { - $text .= " title='".$data["title"]."'"; - } - - // Only embedd a picture link when it seems to be a valid picture ("width" is set) - if (!empty($data["images"]) && !empty($data["images"][0]["width"])) { - $preview = str_replace(["[", "]"], ["[", "]"], htmlentities($data["images"][0]["src"], ENT_QUOTES, 'UTF-8', false)); - // if the preview picture is larger than 500 pixels then show it in a larger mode - // But only, if the picture isn't higher than large (To prevent huge posts) - if (!DI::config()->get('system', 'always_show_preview') && ($data["images"][0]["width"] >= 500) - && ($data["images"][0]["width"] >= $data["images"][0]["height"])) { - $text .= " image='".$preview."'"; - } else { - $text .= " preview='".$preview."'"; - } - } - - $text .= "]".$data["text"]."[/attachment]"; - - $hashtags = ""; - if (isset($data["keywords"]) && count($data["keywords"])) { - $hashtags = "\n"; - foreach ($data["keywords"] as $keyword) { - /// @TODO make a positive list of allowed characters - $hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword); - $hashtags .= "#[url=" . DI::baseUrl() . "/search?tag=" . $hashtag . "]" . $hashtag . 
"[/url] "; - } - } - - return "\n".$text.$hashtags; + $footer = \Friendica\Content\PageInfo::getFooterFromData($data, $no_photos); + + // Keep the legacy contract: an empty footer stays an empty string instead of becoming a lone line break + return $footer !== '' ? "\n" . $footer : ''; } +/** + * @deprecated since 2020.06 + * @see \Friendica\Content\PageInfo::queryUrl + */ function query_page_info($url, $photo = "", $keywords = false, $keyword_denylist = "") { - $data = ParseUrl::getSiteinfoCached($url, true); - - if ($photo != "") { - $data["images"][0]["src"] = $photo; - } - - Logger::log('fetch page info for ' . $url . ' ' . print_r($data, true), Logger::DEBUG); - - if (!$keywords && isset($data["keywords"])) { - unset($data["keywords"]); - } - - if (($keyword_denylist != "") && isset($data["keywords"])) { - $list = explode(", ", $keyword_denylist); - - foreach ($list as $keyword) { - $keyword = trim($keyword); - - $index = array_search($keyword, $data["keywords"]); - if ($index !== false) { - unset($data["keywords"][$index]); - } - } - } - - return $data; + return \Friendica\Content\PageInfo::queryUrl($url, $photo, $keywords, $keyword_denylist); } +/** + * @deprecated since 2020.06 + * @see \Friendica\Content\PageInfo::getTagsFromUrl() + */ function get_page_keywords($url, $photo = "", $keywords = false, $keyword_denylist = "") { - $data = query_page_info($url, $photo, $keywords, $keyword_denylist); - if (empty($data["keywords"]) || !is_array($data["keywords"])) { - return []; - } - - $taglist = []; - foreach ($data['keywords'] as $keyword) { - $hashtag = str_replace([" ", "+", "/", ".", "#", "'"], - ["", "", "", "", "", ""], $keyword); - - $taglist[] = $hashtag; - } - - return $taglist; + return $keywords ? 
\Friendica\Content\PageInfo::getTagsFromUrl($url, $photo, $keyword_denylist) : []; } +/** + * Legacy wrapper returning the page info footer of a URL prefixed with a line break, + * or an empty string when no footer could be built. + * + * @deprecated since 2020.06 + * @see \Friendica\Content\PageInfo::getFooterFromUrl + */ function add_page_info($url, $no_photos = false, $photo = "", $keywords = false, $keyword_denylist = "") { - $data = query_page_info($url, $photo, $keywords, $keyword_denylist); - - $text = ''; - - if (is_array($data)) { - $text = add_page_info_data($data, $no_photos); - } - - return $text; + $footer = \Friendica\Content\PageInfo::getFooterFromUrl($url, $no_photos, $photo, $keywords, $keyword_denylist); + + // Keep the legacy contract: an empty footer stays an empty string instead of becoming a lone line break + return $footer !== '' ? "\n" . $footer : ''; } +/** + * @deprecated since 2020.06 + * @see \Friendica\Content\PageInfo::appendToBody + */ function add_page_info_to_body($body, $texturl = false, $no_photos = false) { - Logger::log('add_page_info_to_body: fetch page info for body ' . $body, Logger::DEBUG); - - $URLSearchString = "^\[\]"; - - // Fix for Mastodon where the mentions are in a different format - $body = preg_replace("/\[url\=([$URLSearchString]*)\]([#!@])(.*?)\[\/url\]/ism", - '$2[url=$1]$3[/url]', $body); - - // Adding these spaces is a quick hack due to my problems with regular expressions :) - preg_match("/[^!#@]\[url\]([$URLSearchString]*)\[\/url\]/ism", " " . $body, $matches); - - if (!$matches) { - preg_match("/[^!#@]\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", " " . $body, $matches); - } - - // Convert urls without bbcode elements - if (!$matches && $texturl) { - preg_match("/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism", " ".$body, $matches); - - // Yeah, a hack. 
I really hate regular expressions :) - if ($matches) { - $matches[1] = $matches[2]; - } - } - - if ($matches) { - $footer = add_page_info($matches[1], $no_photos); - } - - // Remove the link from the body if the link is attached at the end of the post - if (isset($footer) && (trim($footer) != "") && (strpos($footer, $matches[1]))) { - $removedlink = trim(str_replace($matches[1], "", $body)); - if (($removedlink == "") || strstr($body, $removedlink)) { - $body = $removedlink; - } - - $removedlink = preg_replace("/\[url\=" . preg_quote($matches[1], '/') . "\](.*?)\[\/url\]/ism", '', $body); - if (($removedlink == "") || strstr($body, $removedlink)) { - $body = $removedlink; - } - } - - // Add the page information to the bottom - if (isset($footer) && (trim($footer) != "")) { - $body .= $footer; - } - - return $body; + return \Friendica\Content\PageInfo::appendToBody($body, $texturl, $no_photos); } /** diff --git a/src/Content/PageInfo.php b/src/Content/PageInfo.php new file mode 100644 index 000000000..7d6f2eb9f --- /dev/null +++ b/src/Content/PageInfo.php @@ -0,0 +1,269 @@ +. 
+ * + */ + +namespace Friendica\Content; + +use Friendica\Core\Hook; +use Friendica\Core\Logger; +use Friendica\DI; +use Friendica\Network\HTTPException; +use Friendica\Util\ParseUrl; +use Friendica\Util\Strings; + +/** + * Extracts trailing URLs from post bodies to transform them in enriched attachment tags through Site Info query + */ +class PageInfo +{ + /** + * Appends the page info footer of the relevant URL found in the body, if any. + * The body is returned unchanged when no relevant URL or no footer is available. + * + * @param string $body + * @param bool $searchNakedUrls + * @param bool $no_photos + * @return string + * @throws HTTPException\InternalServerErrorException + */ + public static function appendToBody(string $body, bool $searchNakedUrls = false, bool $no_photos = false) + { + Logger::info('add_page_info_to_body: fetch page info for body', ['body' => $body]); + + $url = self::getRelevantUrlFromBody($body, $searchNakedUrls); + if (!$url) { + return $body; + } + + $footer = self::getFooterFromUrl($url, $no_photos); + if (!$footer) { + return $body; + } + + $body = self::stripTrailingUrlFromBody($body, $url); + + $body .= "\n" . $footer; + + return $body; + } + + /** + * Queries the site info of the provided URL and renders it as an attachment footer. + * Returns an empty string when the query did not yield a site info array. + * + * @param string $url + * @param bool $no_photos + * @param string $photo + * @param bool $keywords + * @param string $keyword_denylist + * @return string + * @throws HTTPException\InternalServerErrorException + */ + public static function getFooterFromUrl(string $url, bool $no_photos = false, string $photo = '', bool $keywords = false, string $keyword_denylist = '') + { + $data = self::queryUrl($url, $photo, $keywords, $keyword_denylist); + + // queryUrl() is documented to return array|bool, only an array can be passed to getFooterFromData() + if (!is_array($data)) { + return ''; + } + + return self::getFooterFromData($data, $no_photos); + } + + /** + * @param array $data + * @param bool $no_photos + * @return string + * @throws HTTPException\InternalServerErrorException + */ + public static function getFooterFromData(array $data, bool $no_photos = false) + { + Hook::callAll('page_info_data', $data); + + if (empty($data['type'])) { + return ''; + } + + // It maybe is a rich content, but if it does have everything that a link has, + // then treat it that way + if (($data['type'] == 'rich') && 
is_string($data['title']) && + is_string($data['text']) && !empty($data['images'])) { + $data['type'] = 'link'; + } + + $data['title'] = $data['title'] ?? ''; + + if ((($data['type'] != 'link') && ($data['type'] != 'video') && ($data['type'] != 'photo')) || ($data['title'] == $data['url'])) { + return ''; + } + + if ($no_photos && ($data['type'] == 'photo')) { + return ''; + } + + // Escape some bad characters + $data['url'] = str_replace(['[', ']'], ['[', ']'], htmlentities($data['url'], ENT_QUOTES, 'UTF-8', false)); + $data['title'] = str_replace(['[', ']'], ['[', ']'], htmlentities($data['title'], ENT_QUOTES, 'UTF-8', false)); + + $text = "[attachment type='" . $data['type'] . "'"; + + if (empty($data['text'])) { + $data['text'] = $data['title']; + } + + if (empty($data['text'])) { + $data['text'] = $data['url']; + } + + if (!empty($data['url'])) { + $text .= " url='" . $data['url'] . "'"; + } + + if (!empty($data['title'])) { + $text .= " title='" . $data['title'] . "'"; + } + + // Only embedd a picture link when it seems to be a valid picture ("width" is set) + if (!empty($data['images']) && !empty($data['images'][0]['width'])) { + $preview = str_replace(['[', ']'], ['[', ']'], htmlentities($data['images'][0]['src'], ENT_QUOTES, 'UTF-8', false)); + // if the preview picture is larger than 500 pixels then show it in a larger mode + // But only, if the picture isn't higher than large (To prevent huge posts) + if (!DI::config()->get('system', 'always_show_preview') && ($data['images'][0]['width'] >= 500) + && ($data['images'][0]['width'] >= $data['images'][0]['height'])) { + $text .= " image='" . $preview . "'"; + } else { + $text .= " preview='" . $preview . "'"; + } + } + + $text .= ']' . $data['text'] . 
'[/attachment]'; + + $hashtags = ''; + if (!empty($data['keywords'])) { + $hashtags = "\n"; + foreach ($data['keywords'] as $keyword) { + /// @TODO make a positive list of allowed characters + $hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword); + $hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] '; + } + } + + return $text . $hashtags; + } + + /** + * Fetches the cached site info of the provided URL, optionally overriding the first image source + * and dropping or filtering the keyword list. + * + * @param string $url + * @param string $photo If not empty, replaces the source of the first image + * @param bool $keywords Whether the keywords should be kept in the returned data + * @param string $keyword_denylist Comma-separated list of keywords to remove + * @return array|bool + * @throws HTTPException\InternalServerErrorException + */ + public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '') + { + $data = ParseUrl::getSiteinfoCached($url, true); + + if ($photo != '') { + $data['images'][0]['src'] = $photo; + } + + if (!$keywords) { + unset($data['keywords']); + } elseif ($keyword_denylist && !empty($data['keywords']) && is_array($data['keywords'])) { + // Only filter when the site info actually carries a keyword list, array_search() requires an array + $list = explode(', ', $keyword_denylist); + + foreach ($list as $keyword) { + $keyword = trim($keyword); + + $index = array_search($keyword, $data['keywords']); + if ($index !== false) { + unset($data['keywords'][$index]); + } + } + } + + Logger::info('fetch page info for URL', ['url' => $url, 'data' => $data]); + + return $data; + } + + /** + * Returns the keywords of the provided URL stripped from characters not wanted in a tag. + * Returns an empty array when the site info has no keyword list. + * + * @param string $url + * @param string $photo + * @param string $keyword_denylist + * @return array + * @throws HTTPException\InternalServerErrorException + */ + public static function getTagsFromUrl(string $url, string $photo = '', string $keyword_denylist = '') + { + $data = self::queryUrl($url, $photo, true, $keyword_denylist); + + // The site info may come without keywords (or not be an array at all) + if (empty($data['keywords']) || !is_array($data['keywords'])) { + return []; + } + + $taglist = []; + foreach ($data['keywords'] as $keyword) { + $hashtag = str_replace([' ', '+', '/', '.', '#', "'"], + ['', '', '', '', '', ''], $keyword); + + $taglist[] = $hashtag; + } + + return $taglist; + } + + /** + * Picks a non-hashtag, non-mention, schemeful URL at the end of the provided body string to be converted into Page Info. 
+ * + * @param string $body + * @param bool $searchNakedUrls Whether we should pick a naked URL (outside of BBCode tags) as a last resort + * @return string|null + */ + protected static function getRelevantUrlFromBody(string $body, bool $searchNakedUrls = false) + { + $URLSearchString = 'https?://[^\[\]]*'; + + // Fix for Mastodon where the mentions are in a different format + $body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body); + + preg_match("~(?. + * + */ + +namespace Friendica\Test\src\Content; + +/** + * Class PageInfoMock + * + * Exposes protected methods for test in the inherited class + * + * @method static string|null getRelevantUrlFromBody(string $body, $searchNakedUrls = false) + * @method static string stripTrailingUrlFromBody(string $body, string $url) + */ +class PageInfoMock extends \Friendica\Content\PageInfo +{ + public static function __callStatic($name, $arguments) + { + return self::$name(...$arguments); + } +} diff --git a/tests/src/Content/PageInfoTest.php b/tests/src/Content/PageInfoTest.php new file mode 100644 index 000000000..6f9641564 --- /dev/null +++ b/tests/src/Content/PageInfoTest.php @@ -0,0 +1,125 @@ +. 
+ * + */ + +namespace Friendica\Test\src\Content; + +use Friendica\Test\MockedTest; + +class PageInfoTest extends MockedTest +{ + public function dataGetRelevantUrlFromBody() + { + return [ + 'end-of-content' => [ + 'expected' => 'http://example.com/end-of-content', + 'body' => 'Content[url]http://example.com/end-of-content[/url]', + ], + 'tag-no-attr' => [ + 'expected' => 'http://example.com/tag-no-attr', + 'body' => '[url]http://example.com/tag-no-attr[/url]', + ], + 'tag-attr' => [ + 'expected' => 'http://example.com/tag-attr', + 'body' => '[url=http://example.com/tag-attr]Example.com[/url]', + ], + 'mention' => [ + 'expected' => null, + 'body' => '@[url=http://example.com/mention]Mention[/url]', + ], + 'mention-exclusive' => [ + 'expected' => null, + 'body' => '@[url=http://example.com/mention-exclusive]Mention Exclusive[/url]', + ], + 'hashtag' => [ + 'expected' => null, + 'body' => '#[url=http://example.com/hashtag]hashtag[/url]', + ], + 'naked-url-unexpected' => [ + 'expected' => null, + 'body' => 'http://example.com/naked-url-unexpected', + ], + 'naked-url-expected' => [ + 'expected' => 'http://example.com/naked-url-expected', + 'body' => 'http://example.com/naked-url-expected', + 'searchNakedUrls' => true, + ], + 'naked-url-end-of-content-unexpected' => [ + 'expected' => null, + 'body' => 'Contenthttp://example.com/naked-url-end-of-content-unexpected', + 'searchNakedUrls' => true, + ], + 'naked-url-end-of-content-expected' => [ + 'expected' => 'http://example.com/naked-url-end-of-content-expected', + 'body' => 'Content http://example.com/naked-url-end-of-content-expected', + 'searchNakedUrls' => true, + ], + 'bug-8781-schemeless-link' => [ + 'expected' => null, + 'body' => '[url]/posts/2576978090fd0138ee4c005056264835[/url]', + ], + ]; + } + + /** + * @dataProvider dataGetRelevantUrlFromBody + * + * @param string|null $expected + * @param string $body + * @param bool $searchNakedUrls + */ + public function testGetRelevantUrlFromBody($expected, string 
$body, bool $searchNakedUrls = false) + { + $this->assertSame($expected, PageInfoMock::getRelevantUrlFromBody($body, $searchNakedUrls)); + } + + public function dataStripTrailingUrlFromBody() + { + return [ + 'naked-url-append' => [ + 'expected' => 'content', + 'body' => 'contenthttps://example.com', + 'url' => 'https://example.com', + ], + 'naked-url-not-at-the-end' => [ + 'expected' => 'https://example.comcontent', + 'body' => 'https://example.comcontent', + 'url' => 'https://example.com', + ], + 'bug-8781-labeled-link' => [ + 'expected' => 'link label', + 'body' => '[url=https://example.com]link label[/url]', + 'url' => 'https://example.com', + ], + ]; + } + + /** + * @dataProvider dataStripTrailingUrlFromBody + * + * @param string $expected + * @param string $body + * @param string $url + */ + public function testStripTrailingUrlFromBody(string $expected, string $body, string $url) + { + $this->assertSame($expected, PageInfoMock::stripTrailingUrlFromBody($body, $url)); + } +}