Add native media types and expiration to getSiteInfo

This commit is contained in:
Hypolite Petovan 2021-02-16 10:16:04 -05:00
parent 7de03eb13f
commit 69802554fd
3 changed files with 56 additions and 37 deletions

View file

@ -180,28 +180,3 @@ function parse_url_content(App $a)
exit(); exit();
} }
/**
 * Backwards-compatibility shim that forwards to ParseUrl::getSiteinfoCached().
 *
 * Note: The implementation has moved to ParseUrl.php. This global function is
 * kept only for legacy callers and will be removed in the future.
 *
 * @param string $url         The URL of the page which should be scraped
 * @param bool   $no_guessing If true, the parser doesn't search for preview pictures
 * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
 *                            to avoid endless loops
 *
 * @return array which contains the data needed for embedding
 *
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @see ParseUrl::getSiteinfoCached()
 *
 * @deprecated since version 3.6 use ParseUrl::getSiteinfoCached instead
 */
function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true)
{
	// Pure pass-through: all arguments are handed to the class method unchanged.
	return ParseUrl::getSiteinfoCached($url, $no_guessing, $do_oembed);
}

View file

@ -29,6 +29,7 @@ use Friendica\Core\Logger;
use Friendica\Database\Database; use Friendica\Database\Database;
use Friendica\Database\DBA; use Friendica\Database\DBA;
use Friendica\DI; use Friendica\DI;
use Friendica\Network\HTTPException;
/** /**
* Get information about a given URL * Get information about a given URL
@ -37,6 +38,9 @@ use Friendica\DI;
*/ */
class ParseUrl class ParseUrl
{ {
const DEFAULT_EXPIRATION_FAILURE = 'now + 1 day';
const DEFAULT_EXPIRATION_SUCCESS = 'now + 3 months';
/** /**
* Maximum number of characters for the description * Maximum number of characters for the description
*/ */
@ -65,18 +69,23 @@ class ParseUrl
* array 'images' => (optional) Array of preview pictures * array 'images' => (optional) Array of preview pictures
* string 'keywords' => (optional) The tags which belong to the content * string 'keywords' => (optional) The tags which belong to the content
* *
* @throws \Friendica\Network\HTTPException\InternalServerErrorException * @throws HTTPException\InternalServerErrorException
* @see ParseUrl::getSiteinfo() for more information about scraping * @see ParseUrl::getSiteinfo() for more information about scraping
* embeddable content * embeddable content
*/ */
public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true): array
{ {
if ($url == "") { if (empty($url)) {
return false; return [
'url' => '',
'type' => 'error',
];
} }
$urlHash = hash('sha256', $url);
$parsed_url = DBA::selectFirst('parsed_url', ['content'], $parsed_url = DBA::selectFirst('parsed_url', ['content'],
['url' => Strings::normaliseLink($url), 'guessing' => !$no_guessing, 'oembed' => $do_oembed] ['url_hash' => $urlHash, 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
); );
if (!empty($parsed_url['content'])) { if (!empty($parsed_url['content'])) {
$data = unserialize($parsed_url['content']); $data = unserialize($parsed_url['content']);
@ -85,12 +94,20 @@ class ParseUrl
$data = self::getSiteinfo($url, $no_guessing, $do_oembed); $data = self::getSiteinfo($url, $no_guessing, $do_oembed);
DBA::insert( $expires = $data['expires'];
unset($data['expires']);
DI::dba()->insert(
'parsed_url', 'parsed_url',
[ [
'url' => substr(Strings::normaliseLink($url), 0, 255), 'guessing' => !$no_guessing, 'url_hash' => $urlHash,
'oembed' => $do_oembed, 'content' => serialize($data), 'guessing' => !$no_guessing,
'created' => DateTimeFormat::utcNow() 'oembed' => $do_oembed,
'url' => $url,
'content' => serialize($data),
'created' => DateTimeFormat::utcNow(),
'expires' => $expires,
], ],
Database::INSERT_UPDATE Database::INSERT_UPDATE
); );
@ -117,7 +134,7 @@ class ParseUrl
* *
* @return array which contains needed data for embedding * @return array which contains needed data for embedding
* string 'url' => The url of the parsed page * string 'url' => The url of the parsed page
* string 'type' => Content type * string 'type' => Content type (error, link, photo, image, audio, video)
* string 'title' => (optional) The title of the content * string 'title' => (optional) The title of the content
* string 'text' => (optional) The description for the content * string 'text' => (optional) The description for the content
* string 'image' => (optional) A preview image of the content (only available if $no_guessing = false) * string 'image' => (optional) A preview image of the content (only available if $no_guessing = false)
@ -140,6 +157,13 @@ class ParseUrl
*/ */
public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1)
{ {
if (empty($url)) {
return [
'url' => '',
'type' => 'error',
];
}
// Check if the URL does contain a scheme // Check if the URL does contain a scheme
$scheme = parse_url($url, PHP_URL_SCHEME); $scheme = parse_url($url, PHP_URL_SCHEME);
@ -154,6 +178,7 @@ class ParseUrl
$siteinfo = [ $siteinfo = [
'url' => $url, 'url' => $url,
'type' => 'link', 'type' => 'link',
'expires' => DateTimeFormat::utc(self::DEFAULT_EXPIRATION_FAILURE),
]; ];
if ($count > 10) { if ($count > 10) {
@ -166,16 +191,35 @@ class ParseUrl
return $siteinfo; return $siteinfo;
} }
$siteinfo['expires'] = DateTimeFormat::utc(self::DEFAULT_EXPIRATION_SUCCESS);
// If the file is too large then exit // If the file is too large then exit
if (($curlResult->getInfo()['download_content_length'] ?? 0) > 1000000) { if (($curlResult->getInfo()['download_content_length'] ?? 0) > 1000000) {
return $siteinfo; return $siteinfo;
} }
// Native media type, no need for HTML parsing
$type = $curlResult->getHeader('Content-Type');
if ($type) {
preg_match('#(image|video|audio)/#i', $type, $matches);
if ($matches) {
$siteinfo['type'] = array_pop($matches);
return $siteinfo;
}
}
// If it isn't a HTML file then exit // If it isn't a HTML file then exit
if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) { if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
return $siteinfo; return $siteinfo;
} }
if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) {
if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
$maxAge = max(86400, (int)array_pop($matches));
$siteinfo['expires'] = DateTimeFormat::utc("now + $maxAge seconds");
}
}
$header = $curlResult->getHeader(); $header = $curlResult->getHeader();
$body = $curlResult->getBody(); $body = $curlResult->getBody();

View file

@ -64,7 +64,7 @@ class ClearCache
// Delete the cached OEmbed entries that are older than three months // Delete the cached OEmbed entries that are older than three months
DBA::delete('oembed', ["`created` < NOW() - INTERVAL 3 MONTH"]); DBA::delete('oembed', ["`created` < NOW() - INTERVAL 3 MONTH"]);
// Delete the cached "parse_url" entries that are older than three months // Delete the cached "parsed_url" entries that are expired
DBA::delete('parsed_url', ["`created` < NOW() - INTERVAL 3 MONTH"]); DBA::delete('parsed_url', ["`expires` < NOW()"]);
} }
} }