Don't guess the site info / restrict the description length
This commit is contained in:
parent
a8fa7e5187
commit
a3b7f08f78
1 changed file with 17 additions and 68 deletions
|
@ -17,6 +17,16 @@ use Friendica\Database\DBA;
|
||||||
*/
|
*/
|
||||||
class ParseUrl
|
class ParseUrl
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* Maximum number of characters for the description
|
||||||
|
*/
|
||||||
|
const MAX_DESC_COUNT = 250;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimum number of characters for the description
|
||||||
|
*/
|
||||||
|
const MIN_DESC_COUNT = 100;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Search for cached embeddable data of a URL, otherwise fetch it
|
* @brief Search for cached embeddable data of a URL, otherwise fetch it
|
||||||
*
|
*
|
||||||
|
@ -336,36 +346,7 @@ class ParseUrl
|
||||||
$siteinfo['type'] = 'link';
|
$siteinfo['type'] = 'link';
|
||||||
}
|
}
|
||||||
|
|
||||||
if (empty($siteinfo['image']) && !$no_guessing) {
|
if (!empty($siteinfo['image'])) {
|
||||||
$list = $xpath->query('//img[@src]');
|
|
||||||
foreach ($list as $node) {
|
|
||||||
$img_tag = [];
|
|
||||||
if ($node->attributes->length) {
|
|
||||||
foreach ($node->attributes as $attribute) {
|
|
||||||
$img_tag[$attribute->name] = $attribute->value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$src = self::completeUrl($img_tag['src'], $url);
|
|
||||||
$photodata = Images::getInfoFromURLCached($src);
|
|
||||||
|
|
||||||
if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
|
|
||||||
if ($photodata[0] > 300) {
|
|
||||||
$photodata[1] = round($photodata[1] * (300 / $photodata[0]));
|
|
||||||
$photodata[0] = 300;
|
|
||||||
}
|
|
||||||
if ($photodata[1] > 300) {
|
|
||||||
$photodata[0] = round($photodata[0] * (300 / $photodata[1]));
|
|
||||||
$photodata[1] = 300;
|
|
||||||
}
|
|
||||||
$siteinfo['images'][] = [
|
|
||||||
'src' => $src,
|
|
||||||
'width' => $photodata[0],
|
|
||||||
'height' => $photodata[1]
|
|
||||||
];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} elseif (!empty($siteinfo['image'])) {
|
|
||||||
$src = self::completeUrl($siteinfo['image'], $url);
|
$src = self::completeUrl($siteinfo['image'], $url);
|
||||||
|
|
||||||
unset($siteinfo['image']);
|
unset($siteinfo['image']);
|
||||||
|
@ -379,47 +360,15 @@ class ParseUrl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((@$siteinfo['text'] == '') && (@$siteinfo['title'] != '') && !$no_guessing) {
|
if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
|
||||||
$text = '';
|
$siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
|
||||||
|
$pos = mb_strrpos($siteinfo['text'], '.');
|
||||||
$list = $xpath->query('//div[@class="article"]');
|
if ($pos > self::MIN_DESC_COUNT) {
|
||||||
foreach ($list as $node) {
|
$siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1);
|
||||||
if (strlen($node->nodeValue) > 40) {
|
|
||||||
$text .= ' ' . trim($node->nodeValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($text == '') {
|
|
||||||
$list = $xpath->query('//div[@class="content"]');
|
|
||||||
foreach ($list as $node) {
|
|
||||||
if (strlen($node->nodeValue) > 40) {
|
|
||||||
$text .= ' ' . trim($node->nodeValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If no text was found then take the paragraph content
|
|
||||||
if ($text == '') {
|
|
||||||
$list = $xpath->query('//p');
|
|
||||||
foreach ($list as $node) {
|
|
||||||
if (strlen($node->nodeValue) > 40) {
|
|
||||||
$text .= ' ' . trim($node->nodeValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($text != '') {
|
|
||||||
$text = trim(str_replace(["\n", "\r"], [' ', ' '], $text));
|
|
||||||
|
|
||||||
while (strpos($text, ' ')) {
|
|
||||||
$text = trim(str_replace(' ', ' ', $text));
|
|
||||||
}
|
|
||||||
|
|
||||||
$siteinfo['text'] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, 'UTF-8') . '...');
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Logger::log('Siteinfo for ' . $url . ' ' . print_r($siteinfo, true), Logger::DEBUG);
|
Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]);
|
||||||
|
|
||||||
Hook::callAll('getsiteinfo', $siteinfo);
|
Hook::callAll('getsiteinfo', $siteinfo);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue