Merge pull request #10196 from annando/link-detection

Improved link detection
This commit is contained in:
Hypolite Petovan 2021-05-02 23:00:23 -04:00 committed by GitHub
commit 4395f73d1e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 7 deletions

View file

@ -253,10 +253,15 @@ class PageInfo
// Fix for Mastodon where the mentions are in a different format // Fix for Mastodon where the mentions are in a different format
$body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body); $body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);
preg_match("~(?<![!#@])\[url]($URLSearchString)\[/url]$~is", $body, $matches); // Remove all hashtags and mentions
$body = preg_replace("/([#@!])\[url\=(.*?)\](.*?)\[\/url\]/ism", '', $body);
// Search for pure links
preg_match("/\[url\](https?:.*?)\[\/url\]/ism", $body, $matches);
if (!$matches) { if (!$matches) {
preg_match("~(?<![!#@])\[url=($URLSearchString)].*\[/url]$~is", $body, $matches); // Search for links with descriptions
preg_match("/\[url\=(https?:.*?)\].*?\[\/url\]/ism", $body, $matches);
} }
if (!$matches && $searchNakedUrls) { if (!$matches && $searchNakedUrls) {

View file

@ -971,6 +971,8 @@ class Item
$item['raw-body'] = Post\Media::insertFromBody($item['uri-id'], $item['raw-body']); $item['raw-body'] = Post\Media::insertFromBody($item['uri-id'], $item['raw-body']);
$item['raw-body'] = self::setHashtags($item['raw-body']); $item['raw-body'] = self::setHashtags($item['raw-body']);
Post\Media::insertFromAttachmentData($item['uri-id'], $item['body']);
// Check for hashtags in the body and repair or add hashtag links // Check for hashtags in the body and repair or add hashtag links
$item['body'] = self::setHashtags($item['body']); $item['body'] = self::setHashtags($item['body']);
@ -2646,7 +2648,7 @@ class Item
} }
$body = $item['body'] ?? ''; $body = $item['body'] ?? '';
$item['body'] = preg_replace("/\s*\[attachment .*?\].*?\[\/attachment\]\s*/ism", '', $item['body']); $item['body'] = preg_replace("/\s*\[attachment .*?\].*?\[\/attachment\]\s*/ism", "\n", $item['body']);
self::putInCache($item); self::putInCache($item);
$item['body'] = $body; $item['body'] = $body;
$s = $item["rendered-html"]; $s = $item["rendered-html"];
@ -2722,6 +2724,12 @@ class Item
*/ */
public static function containsLink(string $body, string $url) public static function containsLink(string $body, string $url)
{ {
// Strip the query string and fragment so that, for example, site parameters are ignored when testing whether the link is contained in the body
$urlparts = parse_url($url);
unset($urlparts['query']);
unset($urlparts['fragment']);
$url = Network::unparseURL($urlparts);
if (strpos($body, $url)) { if (strpos($body, $url)) {
return true; return true;
} }

View file

@ -286,6 +286,8 @@ class Media
// Simplify image codes // Simplify image codes
$body = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", '[img]$3[/img]', $body); $body = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", '[img]$3[/img]', $body);
$unshared_body = preg_replace("/\s*\[share .*?\].*?\[\/share\]\s*/ism", '', $body);
$attachments = []; $attachments = [];
if (preg_match_all("#\[url=([^\]]+?)\]\s*\[img=([^\[\]]*)\]([^\[\]]*)\[\/img\]\s*\[/url\]#ism", $body, $pictures, PREG_SET_ORDER)) { if (preg_match_all("#\[url=([^\]]+?)\]\s*\[img=([^\[\]]*)\]([^\[\]]*)\[\/img\]\s*\[/url\]#ism", $body, $pictures, PREG_SET_ORDER)) {
foreach ($pictures as $picture) { foreach ($pictures as $picture) {
@ -346,7 +348,10 @@ class Media
} }
foreach ($attachments as $attachment) { foreach ($attachments as $attachment) {
self::insert($attachment); // Only store attachments that are part of the unshared body
if (strpos($unshared_body, $attachment['url']) !== false) {
self::insert($attachment);
}
} }
return trim($body); return trim($body);
@ -360,6 +365,9 @@ class Media
*/ */
public static function insertFromAttachmentData(int $uriid, string $body) public static function insertFromAttachmentData(int $uriid, string $body)
{ {
// Don't look at the shared content
$body = preg_replace("/\s*\[share .*?\].*?\[\/share\]\s*/ism", '', $body);
$data = BBCode::getAttachmentData($body); $data = BBCode::getAttachmentData($body);
if (empty($data)) { if (empty($data)) {
return; return;
@ -548,10 +556,18 @@ class Media
} }
if ($media['type'] == self::IMAGE) { if ($media['type'] == self::IMAGE) {
if (!empty($media['description'])) { if (!empty($media['preview'])) {
$body .= "\n[img=" . $media['url'] . ']' . $media['description'] .'[/img]'; if (!empty($media['description'])) {
$body .= "\n[url=" . $media['url'] . "][img=" . $media['preview'] . ']' . $media['description'] .'[/img][/url]';
} else {
$body .= "\n[url=" . $media['url'] . "][img]" . $media['preview'] .'[/img][/url]';
}
} else { } else {
$body .= "\n[img]" . $media['url'] .'[/img]'; if (!empty($media['description'])) {
$body .= "\n[img=" . $media['url'] . ']' . $media['description'] .'[/img]';
} else {
$body .= "\n[img]" . $media['url'] .'[/img]';
}
} }
} elseif ($media['type'] == self::AUDIO) { } elseif ($media['type'] == self::AUDIO) {
$body .= "\n[audio]" . $media['url'] . "[/audio]\n"; $body .= "\n[audio]" . $media['url'] . "[/audio]\n";