From 7adbd73eca0e960a33e30ecc8c4fe63ca5da8fbf Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 13 Mar 2021 13:17:42 +0000 Subject: [PATCH] More general content type detection --- src/Util/ParseUrl.php | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 83d0d84dc9..da6c88abb1 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -54,25 +54,21 @@ class ParseUrl /** * Fetch the content type of the given url * @param string $url URL of the page - * @return string content type + * @return array content type */ public static function getContentType(string $url) { $curlResult = DI::httpRequest()->head($url); if (!$curlResult->isSuccess()) { - return ''; + return []; } $contenttype = $curlResult->getHeader('Content-Type'); if (empty($contenttype)) { - return ''; - } - - if (!preg_match('#(image|video|audio)/#i', $contenttype, $matches)) { - return ''; + return []; } - return array_pop($matches); + return explode('/', current(explode(';', $contenttype))); } /** @@ -211,8 +207,14 @@ class ParseUrl } $type = self::getContentType($url); - if (in_array($type, ['image', 'video', 'audio'])) { - $siteinfo['type'] = $type; + Logger::info('Got content-type', ['content-type' => $type, 'url' => $url]); + if (!empty($type) && in_array($type[0], ['image', 'video', 'audio'])) { + $siteinfo['type'] = $type[0]; + return $siteinfo; + } + + if ((count($type) >= 2) && (($type[0] != 'text') || ($type[1] != 'html'))) { + Logger::info('Unparseable content-type, quitting here, ', ['content-type' => $type, 'url' => $url]); return $siteinfo; } @@ -228,21 +230,6 @@ class ParseUrl return $siteinfo; } - // Native media type, no need for HTML parsing - $type = $curlResult->getHeader('Content-Type'); - if ($type) { - preg_match('#(image|video|audio)/#i', $type, $matches); - if ($matches) { - $siteinfo['type'] = array_pop($matches); - return $siteinfo; - } - } - - // If it isn't a HTML file then exit - if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) { - return $siteinfo; - } - if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) { if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) { $maxAge = max(86400, (int)array_pop($matches));