From 20fd25258a87931799c4f7672396220445390e1b Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 24 Feb 2024 11:35:32 +0000 Subject: [PATCH] Accidentally changes are reverted --- src/Content/OEmbed.php | 229 ++++++++++++++++++++++++++++++------ src/Content/Text/BBCode.php | 16 ++- src/Model/Post/Media.php | 2 +- src/Module/Oembed.php | 74 ++++++++++++ src/Util/ParseUrl.php | 54 ++++++++- static/defaults.config.php | 4 + static/routes.config.php | 5 + 7 files changed, 334 insertions(+), 50 deletions(-) create mode 100644 src/Module/Oembed.php diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index d89cfca3e1..445c52ced0 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -22,6 +22,8 @@ namespace Friendica\Content; use DOMDocument; +use DOMNode; +use DOMText; use DOMXPath; use Exception; use Friendica\Core\Cache\Enum\Duration; @@ -30,10 +32,10 @@ use Friendica\Core\Renderer; use Friendica\Database\Database; use Friendica\Database\DBA; use Friendica\DI; -use Friendica\Model\Post; use Friendica\Network\HTTPClient\Client\HttpClientAccept; use Friendica\Util\DateTimeFormat; use Friendica\Util\Network; +use Friendica\Util\ParseUrl; use Friendica\Util\Proxy; use Friendica\Util\Strings; @@ -47,15 +49,32 @@ use Friendica\Util\Strings; */ class OEmbed { + /** + * Callback for fetching URL, checking allowance and returning formatted HTML + * + * @param array $matches + * @return string Formatted HTML + */ + public static function replaceCallback(array $matches): string + { + $embedurl = $matches[1]; + $j = self::fetchURL($embedurl, !self::isAllowedURL($embedurl)); + $s = self::formatObject($j); + + return $s; + } + /** * Get data from an URL to embed its content. * * @param string $embedurl The URL from which the data should be fetched. + * @param bool $no_rich_type If set to true rich type content won't be fetched. 
+ * @param bool $use_parseurl Use the "ParseUrl" functionality to add additional data * * @return \Friendica\Object\OEmbed * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - public static function fetchURL(string $embedurl): \Friendica\Object\OEmbed + public static function fetchURL(string $embedurl, bool $no_rich_type = false, bool $use_parseurl = true): \Friendica\Object\OEmbed { $embedurl = trim($embedurl, '\'"'); @@ -100,7 +119,7 @@ class OEmbed $href = str_replace(['http://www.youtube.com/', 'http://player.vimeo.com/'], ['https://www.youtube.com/', 'https://player.vimeo.com/'], $href); $result = DI::httpClient()->fetchFull($href . '&maxwidth=' . $a->getThemeInfoValue('videowidth')); - if ($result->isSuccess()) { + if ($result->getReturnCode() === 200) { $json_string = $result->getBodyString(); break; } @@ -137,6 +156,60 @@ class OEmbed $oembed->html = str_replace(['http://www.youtube.com/', 'http://player.vimeo.com/'], ['https://www.youtube.com/', 'https://player.vimeo.com/'], $oembed->html); } + // Improve the OEmbed data with data from OpenGraph, Twitter cards and other sources + if ($use_parseurl) { + $data = ParseUrl::getSiteinfoCached($embedurl, false); + + if (($oembed->type == 'error') && empty($data['title']) && empty($data['text'])) { + return $oembed; + } + + if ($no_rich_type || ($oembed->type == 'error')) { + $oembed->html = ''; + $oembed->type = $data['type']; + + if ($oembed->type == 'photo') { + if (!empty($data['images'])) { + $oembed->url = $data['images'][0]['src']; + $oembed->width = $data['images'][0]['width']; + $oembed->height = $data['images'][0]['height']; + } else { + $oembed->type = 'link'; + } + } + } + + if (!empty($data['title'])) { + $oembed->title = $data['title']; + } + + if (!empty($data['text'])) { + $oembed->description = $data['text']; + } + + if (!empty($data['publisher_name'])) { + $oembed->provider_name = $data['publisher_name']; + } + + if (!empty($data['publisher_url'])) { + 
$oembed->provider_url = $data['publisher_url']; + } + + if (!empty($data['author_name'])) { + $oembed->author_name = $data['author_name']; + } + + if (!empty($data['author_url'])) { + $oembed->author_url = $data['author_url']; + } + + if (!empty($data['images']) && ($oembed->type != 'photo')) { + $oembed->thumbnail_url = $data['images'][0]['src']; + $oembed->thumbnail_width = $data['images'][0]['width']; + $oembed->thumbnail_height = $data['images'][0]['height']; + } + } + Hook::callAll('oembed_fetch_url', $embedurl, $oembed); return $oembed; @@ -146,20 +219,12 @@ class OEmbed * Returns a formatted string from OEmbed object * * @param \Friendica\Object\OEmbed $oembed - * @param int $uriid * @return string */ - private static function formatObject(\Friendica\Object\OEmbed $oembed, int $uriid): string + private static function formatObject(\Friendica\Object\OEmbed $oembed): string { $ret = '
'; - if (!self::isAllowedURL($oembed->url)) { - $oembed->html = ''; - if ($oembed->type == 'rich') { - $oembed->type = 'link'; - } - } - switch ($oembed->type) { case 'video': if ($oembed->thumbnail_url) { @@ -176,15 +241,15 @@ class OEmbed '$escapedhtml' => base64_encode($oembed->html), '$tw' => $tw, '$th' => $th, - '$turl' => Post\Link::getByLink($uriid, $oembed->thumbnail_url, Proxy::SIZE_MEDIUM), + '$turl' => $oembed->thumbnail_url, ]); } else { - $ret .= Proxy::proxifyHtml($oembed->html); + $ret = $oembed->html; } break; case 'photo': - $ret .= ''; + $ret .= ''; break; case 'link': @@ -229,14 +294,6 @@ class OEmbed $ret .= '' . $oembed->embed_url . ''; } $ret .= ""; - if ($oembed->type == 'link') { - if (!empty($oembed->thumbnail_url)) { - $ret .= ''; - } - if (!empty($oembed->description)) { - $ret .= '

' . $oembed->description . '

'; - } - } } elseif (!strpos($oembed->html, $oembed->embed_url)) { // add for html2bbcode conversion $ret .= '' . $oembed->title . ''; @@ -251,20 +308,51 @@ class OEmbed * Converts BBCode to HTML code * * @param string $text - * @param int $uriid * @return string */ - public static function BBCode2HTML(string $text, int $uriid): string + public static function BBCode2HTML(string $text): string { - if (!preg_match_all("/\[embed\](.+?)\[\/embed\]/is", $text, $matches, PREG_SET_ORDER)) { + if (DI::config()->get('system', 'no_oembed')) { + return preg_replace("/\[embed\](.+?)\[\/embed\]/is", "" . DI::l10n()->t('Embedding disabled') . " : $1", $text); + } + return preg_replace_callback("/\[embed\](.+?)\[\/embed\]/is", [self::class, 'replaceCallback'], $text); + } + + /** + * Find .... + * and replace it with [embed]url[/embed] + * + * @param string $text + * @return string + */ + public static function HTML2BBCode(string $text): string + { + // start parser only if 'oembed' is in text + if (strpos($text, 'oembed')) { + // convert non ascii chars to html entities + $html_text = mb_convert_encoding($text, 'HTML-ENTITIES', mb_detect_encoding($text)); + + // If it doesn't parse at all, just return the text. + $dom = new DOMDocument(); + if (!@$dom->loadHTML($html_text)) { + return $text; + } + $xpath = new DOMXPath($dom); + + $xattr = self::buildXPath('class', 'oembed'); + $entries = $xpath->query("//div[$xattr]"); + + $xattr = "@rel='oembed'"; //oe_build_xpath("rel","oembed"); + foreach ($entries as $e) { + $href = $xpath->evaluate("a[$xattr]/@href", $e)->item(0)->nodeValue; + if (!is_null($href)) { + $e->parentNode->replaceChild(new DOMText('[embed]' . $href . 
'[/embed]'), $e); + } + } + return self::getInnerHTML($dom->getElementsByTagName('body')->item(0)); + } else { return $text; } - - foreach ($matches as $match) { - $data = self::fetchURL($match[1]); - $text = str_replace($match[0], self::formatObject($data, $uriid), $text); - } - return $text; } /** @@ -274,7 +362,7 @@ class OEmbed * @return boolean * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - private static function isAllowedURL(string $url): bool + public static function isAllowedURL(string $url): bool { if (!DI::config()->get('system', 'no_oembed_rich_content')) { return true; @@ -299,13 +387,12 @@ class OEmbed * Returns a formatted HTML code from given URL and sets optional title * * @param string $url URL to fetch - * @param string $title title (default: what comes from OEmbed object) - * @param int $uriid + * @param string $title Optional title (default: what comes from OEmbed object) * @return string Formatted HTML */ - public static function getHTML(string $url, string $title, int $uriid): string + public static function getHTML(string $url, string $title = ''): string { - $o = self::fetchURL($url); + $o = self::fetchURL($url, !self::isAllowedURL($url)); if (!is_object($o) || property_exists($o, 'type') && $o->type == 'error') { throw new Exception('OEmbed failed for URL: ' . $url); @@ -315,8 +402,74 @@ class OEmbed $o->title = $title; } - $html = self::formatObject($o, $uriid); + $html = self::formatObject($o); return $html; } + + /** + * Generates the iframe HTML for an oembed attachment. + * + * Width and height are given by the remote, and are regularly too small for + * the generated iframe. + * + * The width is entirely discarded for the actual width of the post, while fixed + * height is used as a starting point before the inevitable resizing. + * + * Since the iframe is automatically resized on load, there are no need for ugly + * and impractical scrollbars. 
+ * + * @todo This function is currently unused until someone™ adds support for a separate OEmbed domain + * + * @param string $src Original remote URL to embed + * @param string $width + * @param string $height + * @return string Formatted HTML + * + * @throws \Friendica\Network\HTTPException\InternalServerErrorException + * @see oembed_format_object() + */ + private static function iframe(string $src, string $width, string $height): string + { + if (!$height || strstr($height, '%')) { + $height = '200'; + } + $width = '100%'; + + $src = DI::baseUrl() . '/oembed/' . Strings::base64UrlEncode($src); + return ''; + } + + /** + * Generates attribute search XPath string + * + * Generates an XPath query to select elements whose provided attribute contains + * the provided value in a space-separated list. + * + * @param string $attr Name of the attribute to search + * @param string $value Value to search in a space-separated list + * @return string + */ + private static function buildXPath(string $attr, $value): string + { + // https://www.westhoffswelt.de/blog/2009/6/9/select-html-elements-with-more-than-one-css-class-using-xpath + return "contains(normalize-space(@$attr), ' $value ') or substring(normalize-space(@$attr), 1, string-length('$value') + 1) = '$value ' or substring(normalize-space(@$attr), string-length(@$attr) - string-length('$value')) = ' $value' or @$attr = '$value'"; + } + + /** + * Returns the inner XML string of a provided DOMNode + * + * @param DOMNode $node + * @return string + */ + private static function getInnerHTML(DOMNode $node): string + { + $innerHTML = ''; + $children = $node->childNodes; + foreach ($children as $child) { + $innerHTML .= $child->ownerDocument->saveXML($child); + } + return $innerHTML; + } + } diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 3aed82df09..f5bba8ce42 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -452,7 +452,11 @@ class BBCode $return = ''; try { - 
$return = OEmbed::getHTML($data['url'], $data['title'], $uriid); + if ($tryoembed && OEmbed::isAllowedURL($data['url'])) { + $return = OEmbed::getHTML($data['url'], $data['title']); + } else { + throw new Exception('OEmbed is disabled for this attachment.'); + } } catch (Exception $e) { $data['title'] = ($data['title'] ?? '') ?: $data['url']; @@ -1354,12 +1358,12 @@ class BBCode * $match[1] = $url * $match[2] = $title or absent */ - $try_oembed_callback = function (array $match) use ($uriid) { + $try_oembed_callback = function (array $match) { $url = $match[1]; $title = $match[2] ?? ''; try { - $return = OEmbed::getHTML($url, $title, $uriid); + $return = OEmbed::getHTML($url, $title); } catch (Exception $ex) { $return = $match[0]; } @@ -1377,7 +1381,6 @@ class BBCode $text = preg_replace("#\[(\w*)](\n*)#ism", '$2[$1]', $text); $text = preg_replace("#(\n*)\[/(\w*)]#ism", '[/$2]$1', $text); - // oembed // Extract the private images which use data urls since preg has issues with // large data sizes. Stash them away while we do bbcode conversion, and then put them back // in after we've done all the regex matching. We cannot use any preg functions to do this. @@ -1807,7 +1810,7 @@ class BBCode } // oembed tag - $text = OEmbed::BBCode2HTML($text, $uriid); + $text = OEmbed::BBCode2HTML($text); // Avoid triple linefeeds through oembed $text = str_replace("


", "

", $text); @@ -2055,6 +2058,9 @@ class BBCode // Default iframe allowed domains/path $allowedIframeDomains = [ + DI::baseUrl()->getHost() + . (DI::baseUrl()->getPath() ? '/' . DI::baseUrl()->getPath() : '') + . '/oembed/', # The path part has to change with the source in Content\Oembed::iframe 'www.youtube.com/embed/', 'player.vimeo.com/video/', ]; diff --git a/src/Model/Post/Media.php b/src/Model/Post/Media.php index a22b9fba6b..346a6a1d00 100644 --- a/src/Model/Post/Media.php +++ b/src/Model/Post/Media.php @@ -365,7 +365,7 @@ class Media */ private static function addPage(array $media): array { - $data = ParseUrl::getSiteinfoCached($media['url']); + $data = ParseUrl::getSiteinfoCached($media['url'], false); $media['preview'] = $data['images'][0]['src'] ?? null; $media['preview-height'] = $data['images'][0]['height'] ?? null; $media['preview-width'] = $data['images'][0]['width'] ?? null; diff --git a/src/Module/Oembed.php b/src/Module/Oembed.php new file mode 100644 index 0000000000..68e13a2e86 --- /dev/null +++ b/src/Module/Oembed.php @@ -0,0 +1,74 @@ +. + * + */ + +namespace Friendica\Module; + +use Friendica\BaseModule; +use Friendica\Content; +use Friendica\Core\System; +use Friendica\DI; +use Friendica\Util\Strings; + +/** + * Oembed module + * + * Displays stored embed content based on a base64 hash of a remote URL + * + * Example: /oembed/aHR0cHM6Ly9... + * + * @author Hypolite Petovan + */ +class Oembed extends BaseModule +{ + protected function content(array $request = []): string + { + // Unused form: /oembed/b2h?url=... + if (DI::args()->getArgv()[1] == 'b2h') { + $url = ["", trim(hex2bin($_REQUEST['url']))]; + echo Content\OEmbed::replaceCallback($url); + System::exit(); + } + + // Unused form: /oembed/h2b?text=... 
+ if (DI::args()->getArgv()[1] == 'h2b') { + $text = trim(hex2bin($_REQUEST['text'])); + echo Content\OEmbed::HTML2BBCode($text); + System::exit(); + } + + // @TODO: Replace with parameter from router + if (DI::args()->getArgc() == 2) { + echo ''; + $url = Strings::base64UrlDecode(DI::args()->getArgv()[1]); + $j = Content\OEmbed::fetchURL($url); + + // workaround for media.ccc.de (and any other endpoint that return size 0) + if (substr($j->html, 0, 7) == "html, 'width="0"')) { + $j->html = '' . $j->html; + $j->html = str_replace('width="0"', '', $j->html); + $j->html = str_replace('height="0"', '', $j->html); + } + echo $j->html; + echo ''; + } + System::exit(); + } +} diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index af05f3b6e9..1287160576 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -23,6 +23,7 @@ namespace Friendica\Util; use DOMDocument; use DOMXPath; +use Friendica\Content\OEmbed; use Friendica\Content\Text\HTML; use Friendica\Protocol\HTTP\MediaType; use Friendica\Core\Hook; @@ -98,6 +99,8 @@ class ParseUrl * Search for cached embeddable data of an url otherwise fetch it * * @param string $url The url of the page which should be scraped + * @param bool $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops * * @return array which contains needed data for embedding * string 'url' => The url of the parsed page @@ -112,7 +115,7 @@ class ParseUrl * @see ParseUrl::getSiteinfo() for more information about scraping * embeddable content */ - public static function getSiteinfoCached(string $url): array + public static function getSiteinfoCached(string $url, bool $do_oembed = true): array { if (empty($url)) { return [ @@ -123,13 +126,15 @@ class ParseUrl $urlHash = hash('sha256', $url); - $parsed_url = DBA::selectFirst('parsed_url', ['content'], ['url_hash' => $urlHash]); + $parsed_url = DBA::selectFirst('parsed_url', ['content'], + ['url_hash' => $urlHash, 'oembed' => $do_oembed] + ); if 
(!empty($parsed_url['content'])) { $data = unserialize($parsed_url['content']); return $data; } - $data = self::getSiteinfo($url); + $data = self::getSiteinfo($url, $do_oembed); $expires = $data['expires']; @@ -139,7 +144,7 @@ class ParseUrl 'parsed_url', [ 'url_hash' => $urlHash, - 'oembed' => false, + 'oembed' => $do_oembed, 'url' => $url, 'content' => serialize($data), 'created' => DateTimeFormat::utcNow(), @@ -162,6 +167,8 @@ class ParseUrl * \ * * @param string $url The url of the page which should be scraped + * @param bool $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops * @param int $count Internal counter to avoid endless loops * * @return array which contains needed data for embedding @@ -187,7 +194,7 @@ class ParseUrl * * @endverbatim */ - public static function getSiteinfo(string $url, int $count = 1): array + public static function getSiteinfo(string $url, bool $do_oembed = true, int $count = 1): array { if (empty($url)) { return [ @@ -247,6 +254,41 @@ class ParseUrl $body = $curlResult->getBodyString(); + if ($do_oembed) { + $oembed_data = OEmbed::fetchURL($url, false, false); + + if (!empty($oembed_data->type)) { + if (!in_array($oembed_data->type, ['error', 'rich', 'image', 'video', 'audio', ''])) { + $siteinfo['type'] = $oembed_data->type; + } + + // See https://github.com/friendica/friendica/pull/5763#discussion_r217913178 + if ($siteinfo['type'] != 'photo') { + if (!empty($oembed_data->title)) { + $siteinfo['title'] = trim($oembed_data->title); + } + if (!empty($oembed_data->description)) { + $siteinfo['text'] = trim($oembed_data->description); + } + if (!empty($oembed_data->author_name)) { + $siteinfo['author_name'] = trim($oembed_data->author_name); + } + if (!empty($oembed_data->author_url)) { + $siteinfo['author_url'] = Network::sanitizeUrl($oembed_data->author_url); + } + if (!empty($oembed_data->provider_name)) { + $siteinfo['publisher_name'] = trim($oembed_data->provider_name); + } + if 
(!empty($oembed_data->provider_url)) { + $siteinfo['publisher_url'] = Network::sanitizeUrl($oembed_data->provider_url); + } + if (!empty($oembed_data->thumbnail_url)) { + $siteinfo['image'] = $oembed_data->thumbnail_url; + } + } + } + } + $charset = ''; try { // Look for a charset, first in headers @@ -309,7 +351,7 @@ class ParseUrl } } if ($content != '') { - $siteinfo = self::getSiteinfo($content, ++$count); + $siteinfo = self::getSiteinfo($content, $do_oembed, ++$count); return $siteinfo; } } diff --git a/static/defaults.config.php b/static/defaults.config.php index 8a13c61a39..7439c7fdf6 100644 --- a/static/defaults.config.php +++ b/static/defaults.config.php @@ -429,6 +429,10 @@ return [ // Don't do count calculations (currently only when showing photo albums). 'no_count' => false, + // no_oembed (Boolean) + // Don't use OEmbed to fetch more information about a link. + 'no_oembed' => false, + // no_redirect_list (Array) // List of domains where HTTP redirects should be ignored. 'no_redirect_list' => [], diff --git a/static/routes.config.php b/static/routes.config.php index 77d04f92c1..146255adc6 100644 --- a/static/routes.config.php +++ b/static/routes.config.php @@ -557,6 +557,11 @@ return [ '/objects/{guid}[/{activity}]' => [Module\ActivityPub\Objects::class, [R::GET]], + '/oembed' => [ + '/b2h' => [Module\Oembed::class, [R::GET]], + '/h2b' => [Module\Oembed::class, [R::GET]], + '/{hash}' => [Module\Oembed::class, [R::GET]], + ], '/outbox/{nickname}' => [Module\ActivityPub\Outbox::class, [R::GET, R::POST]], '/owa' => [Module\Owa::class, [R::GET]], '/openid' => [Module\Security\OpenID::class, [R::GET]],