Friendica Communications Platform (please note that this is a clone of the repository at github, issues are handled there) https://friendi.ca
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

307 lines
9.0 KiB

  1. <?php
  2. /**
  3. * @copyright Copyright (C) 2020, Friendica
  4. *
  5. * @license GNU AGPL version 3 or any later version
  6. *
  7. * This program is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU Affero General Public License as
  9. * published by the Free Software Foundation, either version 3 of the
  10. * License, or (at your option) any later version.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU Affero General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Affero General Public License
  18. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  19. *
  20. */
  21. namespace Friendica\Content;
  22. use Friendica\Core\Hook;
  23. use Friendica\Core\Logger;
  24. use Friendica\DI;
  25. use Friendica\Network\HTTPException;
  26. use Friendica\Util\ParseUrl;
  27. use Friendica\Util\Strings;
  /**
   * Extracts trailing URLs from post bodies and turns them into enriched
   * [attachment] tags, using the data returned by a site info query.
   */
  class PageInfo
  {
      /**
       * Looks for a relevant URL at the end of the body, queries its page info and
       * appends the resulting attachment to the body.
       *
       * The body is returned unchanged when no URL is found or when the query
       * yields no usable data.
       *
       * @param string $body
       * @param bool   $searchNakedUrls Whether a naked URL (outside of BBCode tags) may be picked as a last resort
       * @param bool   $no_photos       Forwarded to the footer generation, which skips "photo" type data when set
       * @return string
       * @throws HTTPException\InternalServerErrorException
       */
      public static function searchAndAppendToBody(string $body, bool $searchNakedUrls = false, bool $no_photos = false)
      {
          Logger::info('add_page_info_to_body: fetch page info for body', ['body' => $body]);

          $url = self::getRelevantUrlFromBody($body, $searchNakedUrls);
          if (!$url) {
              return $body;
          }

          $data = self::queryUrl($url);
          // queryUrl() is documented as array|bool, only a non-empty array is usable here
          if (empty($data) || !is_array($data)) {
              return $body;
          }

          return self::appendDataToBody($body, $data, $no_photos);
      }

      /**
       * Adds the page info described by $data to the body.
       *
       * If the body already holds an [attachment] tag, the link is inserted as a
       * [bookmark] right before it. Otherwise the trailing URL is replaced by an
       * [attachment] footer, but only if such a footer could actually be built;
       * when it cannot, the body is returned untouched so the link isn't lost.
       *
       * @param string $body
       * @param array  $data      Page info data, expected to hold at least a 'url' key
       * @param bool   $no_photos
       * @return string
       * @throws HTTPException\InternalServerErrorException
       */
      public static function appendDataToBody(string $body, array $data, bool $no_photos = false)
      {
          $url = $data['url'] ?? '';

          // Only one [attachment] tag per body is allowed
          $existingAttachmentPos = strpos($body, '[attachment');
          if ($existingAttachmentPos !== false) {
              $linkTitle = ($data['title'] ?? '') ?: $url;
              // Additional link attachments are prepended before the existing [attachment] tag
              return substr_replace($body, "\n[bookmark=" . $url . ']' . $linkTitle . "[/bookmark]\n", $existingAttachmentPos, 0);
          }

          $footer = self::getFooterFromData($data, $no_photos);
          if ($footer === '') {
              // Nothing to attach: keep the original link in place
              return $body;
          }

          return self::stripTrailingUrlFromBody($body, $url) . "\n" . $footer;
      }

      /**
       * Queries the page info of the given URL and renders it as an attachment footer.
       *
       * @param string $url
       * @param bool   $no_photos
       * @param string $photo            Optional image URL that replaces the first image of the queried data
       * @param bool   $keywords         Whether keywords should be kept in the queried data
       * @param string $keyword_denylist Comma-and-space separated list of keywords to drop
       * @return string The footer, or an empty string when no data could be fetched
       * @throws HTTPException\InternalServerErrorException
       */
      public static function getFooterFromUrl(string $url, bool $no_photos = false, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
      {
          $data = self::queryUrl($url, $photo, $keywords, $keyword_denylist);
          if (!is_array($data)) {
              // A failed query must not end up in the array-typed parameter below
              return '';
          }

          return self::getFooterFromData($data, $no_photos);
      }

      /**
       * Renders page info data as an [attachment] BBCode tag, followed by hashtag
       * links when keywords are present.
       *
       * An empty string is returned when the data has no type, when the type is
       * neither link, video nor photo, when the title equals the URL, or when
       * photos are excluded and the data describes a photo.
       *
       * @param array $data      Keys read here: type, url, title, text, images, keywords
       * @param bool  $no_photos
       * @return string
       * @throws HTTPException\InternalServerErrorException
       */
      public static function getFooterFromData(array $data, bool $no_photos = false)
      {
          // $data is handed to the 'page_info_data' hook before being evaluated
          Hook::callAll('page_info_data', $data);

          if (empty($data['type'])) {
              return '';
          }

          // It maybe is a rich content, but if it does have everything that a link has,
          // then treat it that way
          if (($data['type'] == 'rich') && is_string($data['title'] ?? null)
              && is_string($data['text'] ?? null) && !empty($data['images'])) {
              $data['type'] = 'link';
          }

          $data['title'] = $data['title'] ?? '';
          $data['url']   = $data['url'] ?? '';

          if (!in_array($data['type'], ['link', 'video', 'photo'], true) || ($data['title'] == $data['url'])) {
              return '';
          }

          if ($no_photos && ($data['type'] == 'photo')) {
              return '';
          }

          // Escape some bad characters
          $data['url']   = self::escapeAttachmentValue($data['url']);
          $data['title'] = self::escapeAttachmentValue($data['title']);

          $text = "[attachment type='" . $data['type'] . "'";

          if (!empty($data['url'])) {
              $text .= " url='" . $data['url'] . "'";
          }

          if (!empty($data['title'])) {
              $text .= " title='" . $data['title'] . "'";
          }

          // Only embed a picture link when it seems to be a valid picture ("width" is set)
          if (!empty($data['images']) && !empty($data['images'][0]['width'])) {
              $image   = $data['images'][0];
              $preview = self::escapeAttachmentValue($image['src'] ?? '');

              // if the preview picture is larger than 500 pixels then show it in a larger mode
              // But only, if the picture isn't higher than large (To prevent huge posts)
              if (!DI::config()->get('system', 'always_show_preview') && ($image['width'] >= 500)
                  && ($image['width'] >= ($image['height'] ?? 0))) {
                  $text .= " image='" . $preview . "'";
              } else {
                  $text .= " preview='" . $preview . "'";

                  // Fall back to the title, then to the URL, when there is no text
                  if (empty($data['text'])) {
                      $data['text'] = $data['title'];
                  }

                  if (empty($data['text'])) {
                      $data['text'] = $data['url'];
                  }
              }
          }

          // The parentheses are required: "." binds tighter than "??", without them
          // the closing tag would never be appended
          $text .= ']' . ($data['text'] ?? '') . '[/attachment]';

          $hashtags = '';
          if (!empty($data['keywords'])) {
              $hashtags = "\n";
              foreach ($data['keywords'] as $keyword) {
                  /// @TODO make a positive list of allowed characters
                  $hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword);
                  $hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] ';
              }
          }

          return $text . $hashtags;
      }

      /**
       * Fetches the site info of the given URL and post-processes it.
       *
       * The first image is replaced by $photo when provided. Keywords are removed
       * entirely unless $keywords is set, in which case every keyword listed in
       * $keyword_denylist is dropped (all occurrences, remaining keys are kept).
       *
       * @param string $url
       * @param string $photo
       * @param bool   $keywords
       * @param string $keyword_denylist List of keywords separated by ", "
       * @return array|bool Whatever ParseUrl::getSiteinfoCached() returned, post-processed when it is an array
       * @throws HTTPException\InternalServerErrorException
       */
      public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
      {
          $data = ParseUrl::getSiteinfoCached($url, true);

          if ($photo != '') {
              // Start from an explicit array instead of relying on the automatic
              // conversion of false to array, which PHP 8.1 deprecates
              if (!is_array($data)) {
                  $data = [];
              }

              $data['images'][0]['src'] = $photo;
          }

          if (is_array($data)) {
              if (!$keywords) {
                  unset($data['keywords']);
              } elseif ($keyword_denylist && !empty($data['keywords']) && is_array($data['keywords'])) {
                  $denied = array_map('trim', explode(', ', $keyword_denylist));
                  // array_diff() compares string representations and preserves keys
                  $data['keywords'] = array_diff($data['keywords'], $denied);
              }
          }

          Logger::info('fetch page info for URL', ['url' => $url, 'data' => $data]);

          return $data;
      }

      /**
       * Returns the keywords of the given URL as a list of tag names.
       *
       * Spaces, '+', '/', '.', '#' and single quotes are removed from each keyword.
       *
       * @param string $url
       * @param string $photo
       * @param string $keyword_denylist
       * @return array List of tags, empty when the page has no keywords
       * @throws HTTPException\InternalServerErrorException
       */
      public static function getTagsFromUrl(string $url, string $photo = '', string $keyword_denylist = '')
      {
          $data = self::queryUrl($url, $photo, true, $keyword_denylist);
          if (empty($data['keywords'])) {
              return [];
          }

          $taglist = [];
          foreach ($data['keywords'] as $keyword) {
              $taglist[] = str_replace([' ', '+', '/', '.', '#', "'"], '', $keyword);
          }

          return $taglist;
      }

      /**
       * Picks a non-hashtag, non-mention, schemeful URL at the end of the provided body string to be converted into Page Info.
       *
       * The lookup order is: a trailing [url]...[/url] tag, then a trailing
       * [url=...]...[/url] tag, then (only if requested) a trailing naked URL.
       *
       * @param string $body
       * @param bool   $searchNakedUrls Whether we should pick a naked URL (outside of BBCode tags) as a last resort
       * @return string|null
       */
      protected static function getRelevantUrlFromBody(string $body, bool $searchNakedUrls = false)
      {
          $URLSearchString = 'https?://[^\[\]]*';

          // Fix for Mastodon where the mentions are in a different format
          $normalized = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);
          if ($normalized !== null) {
              // preg_replace() yields null on error, the untouched body is kept then
              $body = $normalized;
          }

          $matches = [];
          preg_match("~(?<![!#@])\[url]($URLSearchString)\[/url]$~is", $body, $matches);

          if (!$matches) {
              preg_match("~(?<![!#@])\[url=($URLSearchString)].*\[/url]$~is", $body, $matches);
          }

          if (!$matches && $searchNakedUrls) {
              preg_match('~(?<=\W|^)(?<![=\]])(https?://.+)$~is', $body, $matches);
              if ($matches && !Strings::endsWith($body, $matches[1])) {
                  $matches = [];
              }
          }

          return $matches[1] ?? null;
      }

      /**
       * Remove the provided URL from the body if it is at the end of it.
       * Keep the link label if it isn't the full URL or a shortened version of it.
       *
       * @param string $body
       * @param string $url
       * @return string The body without the trailing URL and without trailing whitespace
       */
      protected static function stripTrailingUrlFromBody(string $body, string $url)
      {
          $quotedUrl = preg_quote($url, '#');

          $stripped = preg_replace_callback("#(?:
              \[url]$quotedUrl\[/url]|
              \[url=$quotedUrl]$quotedUrl\[/url]|
              \[url=$quotedUrl]([^[]*?)\[/url]|
              $quotedUrl
          )$#isx", function ($match) use ($url) {
              // Stripping URLs with no label
              if (!isset($match[1])) {
                  return '';
              }

              // Dots and ellipsis at both ends are ignored for the comparison
              // (trim() works byte-wise on the multi-byte ellipsis)
              $label = trim($match[1], '.…');

              // Stripping link labels that are empty or include a shortened version of the URL.
              // The explicit empty check keeps PHP 7 and PHP 8 consistent, as strpos()
              // handles an empty needle differently in both versions.
              if ($label === '' || strpos($url, $label) !== false) {
                  return '';
              }

              // Keep all other labels
              return $match[1];
          }, $body);

          // preg_replace_callback() yields null on error, don't wipe the body then
          if ($stripped === null) {
              $stripped = $body;
          }

          return rtrim($stripped);
      }

      /**
       * Makes a value safe for use inside an [attachment] tag: HTML entities are
       * encoded (without double encoding) and square brackets are replaced by
       * their numeric entities.
       *
       * @param string $value
       * @return string
       */
      private static function escapeAttachmentValue(string $value)
      {
          return str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($value, ENT_QUOTES, 'UTF-8', false));
      }
  }