2020-06-17 10:57:21 +02:00
|
|
|
|
<?php
|
|
|
|
|
/**
|
2021-03-29 08:40:20 +02:00
|
|
|
|
* @copyright Copyright (C) 2010-2021, the Friendica project
|
2020-06-17 10:57:21 +02:00
|
|
|
|
*
|
|
|
|
|
* @license GNU AGPL version 3 or any later version
|
|
|
|
|
*
|
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU Affero General Public License as
|
|
|
|
|
* published by the Free Software Foundation, either version 3 of the
|
|
|
|
|
* License, or (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
namespace Friendica\Content;
|
|
|
|
|
|
|
|
|
|
use Friendica\Core\Hook;
|
|
|
|
|
use Friendica\Core\Logger;
|
|
|
|
|
use Friendica\DI;
|
|
|
|
|
use Friendica\Network\HTTPException;
|
|
|
|
|
use Friendica\Util\ParseUrl;
|
|
|
|
|
use Friendica\Util\Strings;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Extracts trailing URLs from post bodies to transform them in enriched attachment tags through Site Info query
|
|
|
|
|
*/
|
|
|
|
|
class PageInfo
|
|
|
|
|
{
|
|
|
|
|
/**
|
|
|
|
|
* @param string $body
|
|
|
|
|
* @param bool $searchNakedUrls
|
|
|
|
|
* @param bool $no_photos
|
|
|
|
|
* @return string
|
|
|
|
|
* @throws HTTPException\InternalServerErrorException
|
|
|
|
|
*/
|
2020-07-18 01:15:43 +02:00
|
|
|
|
public static function searchAndAppendToBody(string $body, bool $searchNakedUrls = false, bool $no_photos = false)
|
2020-06-17 10:57:21 +02:00
|
|
|
|
{
|
|
|
|
|
Logger::info('add_page_info_to_body: fetch page info for body', ['body' => $body]);
|
|
|
|
|
|
|
|
|
|
$url = self::getRelevantUrlFromBody($body, $searchNakedUrls);
|
|
|
|
|
if (!$url) {
|
|
|
|
|
return $body;
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-18 01:38:28 +02:00
|
|
|
|
$data = self::queryUrl($url);
|
|
|
|
|
if (!$data) {
|
2020-06-17 10:57:21 +02:00
|
|
|
|
return $body;
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-18 01:38:28 +02:00
|
|
|
|
return self::appendDataToBody($body, $data, $no_photos);
|
|
|
|
|
}
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
2020-07-18 01:38:28 +02:00
|
|
|
|
/**
|
|
|
|
|
* @param string $body
|
|
|
|
|
* @param array $data
|
|
|
|
|
* @param bool $no_photos
|
|
|
|
|
* @return string
|
|
|
|
|
* @throws HTTPException\InternalServerErrorException
|
|
|
|
|
*/
|
|
|
|
|
public static function appendDataToBody(string $body, array $data, bool $no_photos = false)
|
|
|
|
|
{
|
|
|
|
|
// Only one [attachment] tag per body is allowed
|
|
|
|
|
$existingAttachmentPos = strpos($body, '[attachment');
|
|
|
|
|
if ($existingAttachmentPos !== false) {
|
|
|
|
|
$linkTitle = $data['title'] ?: $data['url'];
|
|
|
|
|
// Additional link attachments are prepended before the existing [attachment] tag
|
|
|
|
|
$body = substr_replace($body, "\n[bookmark=" . $data['url'] . ']' . $linkTitle . "[/bookmark]\n", $existingAttachmentPos, 0);
|
|
|
|
|
} else {
|
2021-04-26 08:50:12 +02:00
|
|
|
|
$footer = self::getFooterFromData($data, $no_photos);
|
2020-07-18 01:38:28 +02:00
|
|
|
|
$body = self::stripTrailingUrlFromBody($body, $data['url']);
|
|
|
|
|
$body .= "\n" . $footer;
|
|
|
|
|
}
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
|
|
|
|
return $body;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param string $url
|
|
|
|
|
* @param bool $no_photos
|
|
|
|
|
* @param string $photo
|
|
|
|
|
* @param bool $keywords
|
|
|
|
|
* @param string $keyword_denylist
|
|
|
|
|
* @return string
|
|
|
|
|
* @throws HTTPException\InternalServerErrorException
|
|
|
|
|
*/
|
|
|
|
|
public static function getFooterFromUrl(string $url, bool $no_photos = false, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
|
|
|
|
|
{
|
|
|
|
|
$data = self::queryUrl($url, $photo, $keywords, $keyword_denylist);
|
|
|
|
|
|
|
|
|
|
return self::getFooterFromData($data, $no_photos);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param array $data
|
|
|
|
|
* @param bool $no_photos
|
|
|
|
|
* @return string
|
|
|
|
|
* @throws HTTPException\InternalServerErrorException
|
|
|
|
|
*/
|
|
|
|
|
public static function getFooterFromData(array $data, bool $no_photos = false)
|
|
|
|
|
{
|
|
|
|
|
Hook::callAll('page_info_data', $data);
|
|
|
|
|
|
|
|
|
|
if (empty($data['type'])) {
|
|
|
|
|
return '';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// It maybe is a rich content, but if it does have everything that a link has,
|
|
|
|
|
// then treat it that way
|
|
|
|
|
if (($data['type'] == 'rich') && is_string($data['title']) &&
|
|
|
|
|
is_string($data['text']) && !empty($data['images'])) {
|
|
|
|
|
$data['type'] = 'link';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$data['title'] = $data['title'] ?? '';
|
|
|
|
|
|
|
|
|
|
if ((($data['type'] != 'link') && ($data['type'] != 'video') && ($data['type'] != 'photo')) || ($data['title'] == $data['url'])) {
|
|
|
|
|
return '';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($no_photos && ($data['type'] == 'photo')) {
|
|
|
|
|
return '';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Escape some bad characters
|
2021-03-15 23:02:21 +01:00
|
|
|
|
$text = "[attachment";
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
2021-03-16 07:37:43 +01:00
|
|
|
|
foreach (['type', 'url', 'title', 'alternative_title', 'publisher_name', 'publisher_url', 'publisher_img', 'author_name', 'author_url', 'author_img'] as $field) {
|
2021-03-15 23:02:21 +01:00
|
|
|
|
if (!empty($data[$field])) {
|
|
|
|
|
$text .= " " . $field . "='" . str_replace(['[', ']'], ['[', ']'], htmlentities($data[$field], ENT_QUOTES, 'UTF-8', false)) . "'";
|
|
|
|
|
}
|
2020-06-17 10:57:21 +02:00
|
|
|
|
}
|
|
|
|
|
|
2020-09-20 19:47:44 +02:00
|
|
|
|
if (empty($data['text'])) {
|
|
|
|
|
$data['text'] = '';
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-17 10:57:21 +02:00
|
|
|
|
// Only embedd a picture link when it seems to be a valid picture ("width" is set)
|
|
|
|
|
if (!empty($data['images']) && !empty($data['images'][0]['width'])) {
|
|
|
|
|
$preview = str_replace(['[', ']'], ['[', ']'], htmlentities($data['images'][0]['src'], ENT_QUOTES, 'UTF-8', false));
|
|
|
|
|
// if the preview picture is larger than 500 pixels then show it in a larger mode
|
|
|
|
|
// But only, if the picture isn't higher than large (To prevent huge posts)
|
|
|
|
|
if (!DI::config()->get('system', 'always_show_preview') && ($data['images'][0]['width'] >= 500)
|
|
|
|
|
&& ($data['images'][0]['width'] >= $data['images'][0]['height'])) {
|
|
|
|
|
$text .= " image='" . $preview . "'";
|
|
|
|
|
} else {
|
|
|
|
|
$text .= " preview='" . $preview . "'";
|
2020-09-19 21:02:40 +02:00
|
|
|
|
|
|
|
|
|
if (empty($data['text'])) {
|
|
|
|
|
$data['text'] = $data['title'];
|
|
|
|
|
}
|
2021-04-26 08:50:12 +02:00
|
|
|
|
|
2020-09-19 21:02:40 +02:00
|
|
|
|
if (empty($data['text'])) {
|
|
|
|
|
$data['text'] = $data['url'];
|
|
|
|
|
}
|
2020-06-17 10:57:21 +02:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-15 23:02:21 +01:00
|
|
|
|
$text .= ']' . str_replace(['[', ']'], ['[', ']'], $data['text']) . '[/attachment]';
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
|
|
|
|
$hashtags = '';
|
|
|
|
|
if (!empty($data['keywords'])) {
|
|
|
|
|
$hashtags = "\n";
|
|
|
|
|
foreach ($data['keywords'] as $keyword) {
|
|
|
|
|
/// @TODO make a positive list of allowed characters
|
|
|
|
|
$hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword);
|
|
|
|
|
$hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] ';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $text . $hashtags;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param string $url
|
|
|
|
|
* @param string $photo
|
|
|
|
|
* @param bool $keywords
|
|
|
|
|
* @param string $keyword_denylist
|
|
|
|
|
* @return array|bool
|
|
|
|
|
* @throws HTTPException\InternalServerErrorException
|
|
|
|
|
*/
|
|
|
|
|
public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
|
|
|
|
|
{
|
2021-03-16 08:15:20 +01:00
|
|
|
|
$data = ParseUrl::getSiteinfoCached($url);
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
|
|
|
|
if ($photo != '') {
|
|
|
|
|
$data['images'][0]['src'] = $photo;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!$keywords) {
|
|
|
|
|
unset($data['keywords']);
|
2020-06-28 08:18:35 +02:00
|
|
|
|
} elseif ($keyword_denylist && !empty($data['keywords'])) {
|
2020-06-17 10:57:21 +02:00
|
|
|
|
$list = explode(', ', $keyword_denylist);
|
|
|
|
|
|
|
|
|
|
foreach ($list as $keyword) {
|
|
|
|
|
$keyword = trim($keyword);
|
|
|
|
|
|
|
|
|
|
$index = array_search($keyword, $data['keywords']);
|
|
|
|
|
if ($index !== false) {
|
|
|
|
|
unset($data['keywords'][$index]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Logger::info('fetch page info for URL', ['url' => $url, 'data' => $data]);
|
|
|
|
|
|
|
|
|
|
return $data;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param string $url
|
|
|
|
|
* @param string $photo
|
|
|
|
|
* @param string $keyword_denylist
|
|
|
|
|
* @return array
|
|
|
|
|
* @throws HTTPException\InternalServerErrorException
|
|
|
|
|
*/
|
|
|
|
|
public static function getTagsFromUrl(string $url, string $photo = '', string $keyword_denylist = '')
|
|
|
|
|
{
|
|
|
|
|
$data = self::queryUrl($url, $photo, true, $keyword_denylist);
|
|
|
|
|
|
2020-06-28 08:18:35 +02:00
|
|
|
|
if (empty($data['keywords'])) {
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-17 10:57:21 +02:00
|
|
|
|
$taglist = [];
|
|
|
|
|
foreach ($data['keywords'] as $keyword) {
|
|
|
|
|
$hashtag = str_replace([' ', '+', '/', '.', '#', "'"],
|
|
|
|
|
['', '', '', '', '', ''], $keyword);
|
|
|
|
|
|
|
|
|
|
$taglist[] = $hashtag;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $taglist;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Picks a non-hashtag, non-mention, schemeful URL at the end of the provided body string to be converted into Page Info.
|
|
|
|
|
*
|
|
|
|
|
* @param string $body
|
|
|
|
|
* @param bool $searchNakedUrls Whether we should pick a naked URL (outside of BBCode tags) as a last resort
|
|
|
|
|
* @return string|null
|
|
|
|
|
*/
|
2021-04-26 08:50:12 +02:00
|
|
|
|
public static function getRelevantUrlFromBody(string $body, bool $searchNakedUrls = false)
|
2020-06-17 10:57:21 +02:00
|
|
|
|
{
|
|
|
|
|
$URLSearchString = 'https?://[^\[\]]*';
|
|
|
|
|
|
|
|
|
|
// Fix for Mastodon where the mentions are in a different format
|
|
|
|
|
$body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);
|
|
|
|
|
|
2021-05-02 19:33:32 +02:00
|
|
|
|
// Remove all hashtags and mentions
|
|
|
|
|
$body = preg_replace("/([#@!])\[url\=(.*?)\](.*?)\[\/url\]/ism", '', $body);
|
|
|
|
|
|
|
|
|
|
// Search for pure links
|
2021-05-02 19:53:11 +02:00
|
|
|
|
preg_match("/\[url\](https?:.*?)\[\/url\]/ism", $body, $matches);
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
|
|
|
|
if (!$matches) {
|
2021-05-02 19:33:32 +02:00
|
|
|
|
// Search for links with descriptions
|
2021-05-02 19:53:11 +02:00
|
|
|
|
preg_match("/\[url\=(https?:.*?)\].*?\[\/url\]/ism", $body, $matches);
|
2020-06-17 10:57:21 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!$matches && $searchNakedUrls) {
|
2020-12-04 13:27:13 +01:00
|
|
|
|
preg_match(Strings::autoLinkRegEx(), $body, $matches);
|
2020-06-17 10:57:21 +02:00
|
|
|
|
if ($matches && !Strings::endsWith($body, $matches[1])) {
|
|
|
|
|
unset($matches);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $matches[1] ?? null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Remove the provided URL from the body if it is at the end of it.
|
2020-07-17 23:16:22 +02:00
|
|
|
|
* Keep the link label if it isn't the full URL or a shortened version of it.
|
2020-06-17 10:57:21 +02:00
|
|
|
|
*
|
|
|
|
|
* @param string $body
|
|
|
|
|
* @param string $url
|
2020-07-17 23:16:22 +02:00
|
|
|
|
* @return string
|
2020-06-17 10:57:21 +02:00
|
|
|
|
*/
|
|
|
|
|
protected static function stripTrailingUrlFromBody(string $body, string $url)
|
|
|
|
|
{
|
|
|
|
|
$quotedUrl = preg_quote($url, '#');
|
2020-07-17 23:16:22 +02:00
|
|
|
|
$body = preg_replace_callback("#(?:
|
2020-06-17 10:57:21 +02:00
|
|
|
|
\[url]$quotedUrl\[/url]|
|
|
|
|
|
\[url=$quotedUrl]$quotedUrl\[/url]|
|
|
|
|
|
\[url=$quotedUrl]([^[]*?)\[/url]|
|
|
|
|
|
$quotedUrl
|
2020-07-17 23:16:22 +02:00
|
|
|
|
)$#isx", function ($match) use ($url) {
|
|
|
|
|
// Stripping URLs with no label
|
2020-10-29 14:28:27 +01:00
|
|
|
|
if (empty($match[1])) {
|
2020-07-17 23:16:22 +02:00
|
|
|
|
return '';
|
|
|
|
|
}
|
2020-06-17 10:57:21 +02:00
|
|
|
|
|
2020-07-17 23:16:22 +02:00
|
|
|
|
// Stripping link labels that include a shortened version of the URL
|
2021-04-23 21:12:00 +02:00
|
|
|
|
$trimMatch = trim($match[1], '.…');
|
|
|
|
|
if (!empty($trimMatch) && strpos($url, $trimMatch) !== false) {
|
2020-07-17 23:16:22 +02:00
|
|
|
|
return '';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Keep all other labels
|
|
|
|
|
return $match[1];
|
|
|
|
|
}, $body);
|
|
|
|
|
|
|
|
|
|
return rtrim($body);
|
2020-06-17 10:57:21 +02:00
|
|
|
|
}
|
|
|
|
|
}
|