More general content type detection

This commit is contained in:
Michael 2021-03-13 13:17:42 +00:00
parent acffafe6b9
commit 7adbd73eca

View file

@ -54,25 +54,21 @@ class ParseUrl
/**
 * Fetch the content type of the given url
 *
 * Issues a HEAD request for the URL and splits the media type of the
 * "Content-Type" response header at the slash. Header parameters such as
 * "; charset=utf-8" are dropped, so "text/html; charset=utf-8" yields
 * ['text', 'html'].
 *
 * @param string $url URL of the page
 * @return array Lower-cased media type parts (normally [type, subtype]);
 *               empty array when the request was not successful or no
 *               content type was provided
 */
public static function getContentType(string $url): array
{
    $curlResult = DI::httpRequest()->head($url);
    if (!$curlResult->isSuccess()) {
        return [];
    }

    $contenttype = $curlResult->getHeader('Content-Type');
    if (empty($contenttype)) {
        return [];
    }

    // Keep only the media type in front of the first ";". Media types are
    // case-insensitive and may be surrounded by optional whitespace, so the
    // value is normalised before it is compared against lower-case literals.
    $mediaType = strtolower(trim(current(explode(';', $contenttype))));
    if ($mediaType === '') {
        return [];
    }

    return explode('/', $mediaType);
}
/**
@ -211,8 +207,14 @@ class ParseUrl
}
$type = self::getContentType($url);
if (in_array($type, ['image', 'video', 'audio'])) {
$siteinfo['type'] = $type;
Logger::info('Got content-type', ['content-type' => $type, 'url' => $url]);
if (!empty($type) && in_array($type[0], ['image', 'video', 'audio'])) {
$siteinfo['type'] = $type[0];
return $siteinfo;
}
if ((count($type) >= 2) && (($type[0] != 'text') || ($type[1] != 'html'))) {
Logger::info('Unparseable content-type, quitting here, ', ['content-type' => $type, 'url' => $url]);
return $siteinfo;
}
@ -228,21 +230,6 @@ class ParseUrl
return $siteinfo;
}
// Native media type, no need for HTML parsing
$type = $curlResult->getHeader('Content-Type');
if ($type) {
preg_match('#(image|video|audio)/#i', $type, $matches);
if ($matches) {
$siteinfo['type'] = array_pop($matches);
return $siteinfo;
}
}
// If it isn't a HTML file then exit
if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
return $siteinfo;
}
if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) {
if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
$maxAge = max(86400, (int)array_pop($matches));