Merge pull request #7839 from annando/siteinfo

Don't guess the site info / restrict the description length
This commit is contained in:
Hypolite Petovan 2019-11-15 09:52:41 -05:00 committed by GitHub
commit 19bbae21de
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -17,6 +17,16 @@ use Friendica\Database\DBA;
*/
class ParseUrl
{
/**
* Maximum number of characters for the description
*/
const MAX_DESC_COUNT = 250;
/**
* Minimum number of characters for the description
*/
const MIN_DESC_COUNT = 100;
/**
* @brief Search for chached embeddable data of an url otherwise fetch it
*
@ -336,36 +346,7 @@ class ParseUrl
$siteinfo['type'] = 'link';
}
if (empty($siteinfo['image']) && !$no_guessing) {
$list = $xpath->query('//img[@src]');
foreach ($list as $node) {
$img_tag = [];
if ($node->attributes->length) {
foreach ($node->attributes as $attribute) {
$img_tag[$attribute->name] = $attribute->value;
}
}
$src = self::completeUrl($img_tag['src'], $url);
$photodata = Images::getInfoFromURLCached($src);
if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
if ($photodata[0] > 300) {
$photodata[1] = round($photodata[1] * (300 / $photodata[0]));
$photodata[0] = 300;
}
if ($photodata[1] > 300) {
$photodata[0] = round($photodata[0] * (300 / $photodata[1]));
$photodata[1] = 300;
}
$siteinfo['images'][] = [
'src' => $src,
'width' => $photodata[0],
'height' => $photodata[1]
];
}
}
} elseif (!empty($siteinfo['image'])) {
if (!empty($siteinfo['image'])) {
$src = self::completeUrl($siteinfo['image'], $url);
unset($siteinfo['image']);
@ -379,47 +360,15 @@ class ParseUrl
}
}
if ((@$siteinfo['text'] == '') && (@$siteinfo['title'] != '') && !$no_guessing) {
$text = '';
$list = $xpath->query('//div[@class="article"]');
foreach ($list as $node) {
if (strlen($node->nodeValue) > 40) {
$text .= ' ' . trim($node->nodeValue);
if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
$siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
$pos = mb_strrpos($siteinfo['text'], '.');
if ($pos > self::MIN_DESC_COUNT) {
$siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1);
}
}
if ($text == '') {
$list = $xpath->query('//div[@class="content"]');
foreach ($list as $node) {
if (strlen($node->nodeValue) > 40) {
$text .= ' ' . trim($node->nodeValue);
}
}
}
// If none text was found then take the paragraph content
if ($text == '') {
$list = $xpath->query('//p');
foreach ($list as $node) {
if (strlen($node->nodeValue) > 40) {
$text .= ' ' . trim($node->nodeValue);
}
}
}
if ($text != '') {
$text = trim(str_replace(["\n", "\r"], [' ', ' '], $text));
while (strpos($text, ' ')) {
$text = trim(str_replace(' ', ' ', $text));
}
$siteinfo['text'] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, 'UTF-8') . '...');
}
}
Logger::log('Siteinfo for ' . $url . ' ' . print_r($siteinfo, true), Logger::DEBUG);
Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]);
Hook::callAll('getsiteinfo', $siteinfo);