Merge pull request #10196 from annando/link-detection

Improved link detection
This commit is contained in:
Hypolite Petovan 2021-05-02 23:00:23 -04:00 committed by GitHub
commit 4395f73d1e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 7 deletions

View file

@ -253,10 +253,15 @@ class PageInfo
// Fix for Mastodon where the mentions are in a different format
$body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);
preg_match("~(?<![!#@])\[url]($URLSearchString)\[/url]$~is", $body, $matches);
// Remove all hashtags and mentions
$body = preg_replace("/([#@!])\[url\=(.*?)\](.*?)\[\/url\]/ism", '', $body);
// Search for pure links
preg_match("/\[url\](https?:.*?)\[\/url\]/ism", $body, $matches);
if (!$matches) {
preg_match("~(?<![!#@])\[url=($URLSearchString)].*\[/url]$~is", $body, $matches);
// Search for links with descriptions
preg_match("/\[url\=(https?:.*?)\].*?\[\/url\]/ism", $body, $matches);
}
if (!$matches && $searchNakedUrls) {

View file

@ -971,6 +971,8 @@ class Item
$item['raw-body'] = Post\Media::insertFromBody($item['uri-id'], $item['raw-body']);
$item['raw-body'] = self::setHashtags($item['raw-body']);
Post\Media::insertFromAttachmentData($item['uri-id'], $item['body']);
// Check for hashtags in the body and repair or add hashtag links
$item['body'] = self::setHashtags($item['body']);
@ -2646,7 +2648,7 @@ class Item
}
$body = $item['body'] ?? '';
$item['body'] = preg_replace("/\s*\[attachment .*?\].*?\[\/attachment\]\s*/ism", '', $item['body']);
$item['body'] = preg_replace("/\s*\[attachment .*?\].*?\[\/attachment\]\s*/ism", "\n", $item['body']);
self::putInCache($item);
$item['body'] = $body;
$s = $item["rendered-html"];
@ -2722,6 +2724,12 @@ class Item
*/
public static function containsLink(string $body, string $url)
{
// Strip the query string and fragment from the URL so that, for example, site parameters are ignored when testing whether the link is contained in the body
$urlparts = parse_url($url);
unset($urlparts['query']);
unset($urlparts['fragment']);
$url = Network::unparseURL($urlparts);
if (strpos($body, $url)) {
return true;
}

View file

@ -286,6 +286,8 @@ class Media
// Simplify image codes
$body = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", '[img]$3[/img]', $body);
$unshared_body = preg_replace("/\s*\[share .*?\].*?\[\/share\]\s*/ism", '', $body);
$attachments = [];
if (preg_match_all("#\[url=([^\]]+?)\]\s*\[img=([^\[\]]*)\]([^\[\]]*)\[\/img\]\s*\[/url\]#ism", $body, $pictures, PREG_SET_ORDER)) {
foreach ($pictures as $picture) {
@ -346,8 +348,11 @@ class Media
}
foreach ($attachments as $attachment) {
// Only store attachments that are part of the unshared body
if (strpos($unshared_body, $attachment['url']) !== false) {
self::insert($attachment);
}
}
return trim($body);
}
@ -360,6 +365,9 @@ class Media
*/
public static function insertFromAttachmentData(int $uriid, string $body)
{
// Don't look at the shared content
$body = preg_replace("/\s*\[share .*?\].*?\[\/share\]\s*/ism", '', $body);
$data = BBCode::getAttachmentData($body);
if (empty($data)) {
return;
@ -548,11 +556,19 @@ class Media
}
if ($media['type'] == self::IMAGE) {
if (!empty($media['preview'])) {
if (!empty($media['description'])) {
$body .= "\n[url=" . $media['url'] . "][img=" . $media['preview'] . ']' . $media['description'] .'[/img][/url]';
} else {
$body .= "\n[url=" . $media['url'] . "][img]" . $media['preview'] .'[/img][/url]';
}
} else {
if (!empty($media['description'])) {
$body .= "\n[img=" . $media['url'] . ']' . $media['description'] .'[/img]';
} else {
$body .= "\n[img]" . $media['url'] .'[/img]';
}
}
} elseif ($media['type'] == self::AUDIO) {
$body .= "\n[audio]" . $media['url'] . "[/audio]\n";
} elseif ($media['type'] == self::VIDEO) {