From 262ee2b0b1a62dfcf78eca01a741d3cecfb34a40 Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 12 Mar 2021 23:04:51 +0000 Subject: [PATCH 1/3] Issue 10019: Fix embedding of media objects --- src/Content/Text/BBCode.php | 8 +++++--- src/Module/ParseUrl.php | 9 +++++---- src/Util/ParseUrl.php | 24 ++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 997370585..0b49e75a0 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -2257,10 +2257,10 @@ class BBCode return $result; } - $siteinfo = ParseUrl::getSiteinfoCached($url); + $type = ParseUrl::getContentType($url); - if (in_array($siteinfo['type'], ['image', 'video', 'audio'])) { - switch ($siteinfo['type']) { + if (in_array($type, ['image', 'video', 'audio'])) { + switch ($type) { case 'video': $bbcode = "\n" . '[video]' . $url . '[/video]' . "\n"; break; @@ -2275,6 +2275,8 @@ class BBCode return $bbcode; } + $siteinfo = ParseUrl::getSiteinfoCached($url); + unset($siteinfo['keywords']); // Bypass attachment if parse url for a comment diff --git a/src/Module/ParseUrl.php b/src/Module/ParseUrl.php index 8e72c4fa7..7138238d8 100644 --- a/src/Module/ParseUrl.php +++ b/src/Module/ParseUrl.php @@ -94,11 +94,10 @@ class ParseUrl extends BaseModule } if ($format == 'json') { - $siteinfo = Util\ParseUrl::getSiteinfoCached($url); + $type = Util\ParseUrl::getContentType($url); - if (empty($siteinfo['title']) && empty($siteinfo['text']) && empty($siteinfo['image']) - && in_array($siteinfo['type'], ['image', 'video', 'audio'])) { - switch ($siteinfo['type']) { + if (in_array($type, ['image', 'video', 'audio'])) { + switch ($type) { case 'video': $content_type = 'video'; break; @@ -114,6 +113,8 @@ class ParseUrl extends BaseModule $ret['data'] = ['url' => $url]; $ret['success'] = true; } else { + $siteinfo = Util\ParseUrl::getSiteinfoCached($url); + unset($siteinfo['keywords']); $ret['data'] = $siteinfo; diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index de280bcf8..a634545cd 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -51,6 +51,30 @@ class ParseUrl */ const MIN_DESC_COUNT = 100; + /** + * Fetch the content type of the given url + * @param string $url URL of the page + * @return string content type + */ + public static function getContentType(string $url) + { + $curlResult = DI::httpRequest()->head($url); + if (!$curlResult->isSuccess()) { + return ''; + } + + $contenttype = $curlResult->getHeader('Content-Type'); + if (empty($contenttype)) { + return ''; + } + + if (!preg_match('#(image|video|audio)/#i', $contenttype, $matches)) { + return ''; + } + + return array_pop($matches); + } + /** * Search for chached embeddable data of an url otherwise fetch it * From ffb92e3355883e2ed2228057c3ad95c4f028912f Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 13 Mar 2021 07:03:26 +0000 Subject: [PATCH 2/3] Integrate fetching of the content type into "getSiteinfo" --- src/Content/Text/BBCode.php | 8 +++----- src/Module/ParseUrl.php | 8 +++----- src/Util/ParseUrl.php | 8 +++++++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 0b49e75a0..997370585 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -2257,10 +2257,10 @@ class BBCode return $result; } - $type = ParseUrl::getContentType($url); + $siteinfo = ParseUrl::getSiteinfoCached($url); - if (in_array($type, ['image', 'video', 'audio'])) { - switch ($type) { + if (in_array($siteinfo['type'], ['image', 'video', 'audio'])) { + switch ($siteinfo['type']) { case 'video': $bbcode = "\n" . '[video]' . $url . '[/video]' . "\n"; break; @@ -2275,8 +2275,6 @@ class BBCode return $bbcode; } - $siteinfo = ParseUrl::getSiteinfoCached($url); - unset($siteinfo['keywords']); // Bypass attachment if parse url for a comment diff --git a/src/Module/ParseUrl.php b/src/Module/ParseUrl.php index 7138238d8..ed48ea1b2 100644 --- a/src/Module/ParseUrl.php +++ b/src/Module/ParseUrl.php @@ -94,10 +94,10 @@ class ParseUrl extends BaseModule } if ($format == 'json') { - $type = Util\ParseUrl::getContentType($url); + $siteinfo = Util\ParseUrl::getSiteinfoCached($url); - if (in_array($type, ['image', 'video', 'audio'])) { - switch ($type) { + if (in_array($siteinfo['type'], ['image', 'video', 'audio'])) { + switch ($siteinfo['type']) { case 'video': $content_type = 'video'; break; @@ -113,8 +113,6 @@ class ParseUrl extends BaseModule $ret['data'] = ['url' => $url]; $ret['success'] = true; } else { - $siteinfo = Util\ParseUrl::getSiteinfoCached($url); - unset($siteinfo['keywords']); $ret['data'] = $siteinfo; diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index a634545cd..83d0d84dc 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -210,6 +210,12 @@ class ParseUrl return $siteinfo; } + $type = self::getContentType($url); + if (in_array($type, ['image', 'video', 'audio'])) { + $siteinfo['type'] = $type; + return $siteinfo; + } + $curlResult = DI::httpRequest()->get($url); if (!$curlResult->isSuccess()) { return $siteinfo; @@ -251,7 +257,7 @@ class ParseUrl $oembed_data = OEmbed::fetchURL($url); if (!empty($oembed_data->type)) { - if (!in_array($oembed_data->type, ['error', 'rich', ''])) { + if (!in_array($oembed_data->type, ['error', 'rich', 'image', 'video', 'audio', ''])) { $siteinfo['type'] = $oembed_data->type; } From 7adbd73eca0e960a33e30ecc8c4fe63ca5da8fbf Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 13 Mar 2021 13:17:42 +0000 Subject: [PATCH 3/3] More general content type detection --- src/Util/ParseUrl.php | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 83d0d84dc..da6c88abb 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -54,25 +54,21 @@ class ParseUrl /** * Fetch the content type of the given url * @param string $url URL of the page - * @return string content type + * @return array content type */ public static function getContentType(string $url) { $curlResult = DI::httpRequest()->head($url); if (!$curlResult->isSuccess()) { - return ''; + return []; } $contenttype = $curlResult->getHeader('Content-Type'); if (empty($contenttype)) { - return ''; - } - - if (!preg_match('#(image|video|audio)/#i', $contenttype, $matches)) { - return ''; + return []; } - return array_pop($matches); + return explode('/', current(explode(';', $contenttype))); } /** @@ -211,8 +207,14 @@ class ParseUrl } $type = self::getContentType($url); - if (in_array($type, ['image', 'video', 'audio'])) { - $siteinfo['type'] = $type; + Logger::info('Got content-type', ['content-type' => $type, 'url' => $url]); + if (!empty($type) && in_array($type[0], ['image', 'video', 'audio'])) { + $siteinfo['type'] = $type[0]; + return $siteinfo; + } + + if ((count($type) >= 2) && (($type[0] != 'text') || ($type[1] != 'html'))) { + Logger::info('Unparseable content-type, quitting here, ', ['content-type' => $type, 'url' => $url]); return $siteinfo; } @@ -228,21 +230,6 @@ class ParseUrl return $siteinfo; } - // Native media type, no need for HTML parsing - $type = $curlResult->getHeader('Content-Type'); - if ($type) { - preg_match('#(image|video|audio)/#i', $type, $matches); - if ($matches) { - $siteinfo['type'] = array_pop($matches); - return $siteinfo; - } - } - - // If it isn't a HTML file then exit - if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) { - return $siteinfo; - } - if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) { if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) { $maxAge = max(86400, (int)array_pop($matches));