Friendica Communications Platform (please note that this is a clone of the repository at github, issues are handled there) https://friendi.ca
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

459 lines
13 KiB

  1. <?php
  2. /**
  3. * @file src/Util/ParseUrl.php
  4. * @brief Get information about a given URL
  5. */
  6. namespace Friendica\Util;
  7. use DOMDocument;
  8. use DOMXPath;
  9. use Friendica\Content\OEmbed;
  10. use Friendica\Core\Hook;
  11. use Friendica\Core\Logger;
  12. use Friendica\Database\DBA;
  13. /**
  14. * @brief Class with methods for extracting certain content from a URL
  15. */
  16. class ParseUrl
  17. {
  18. /**
  19. * Maximum number of characters for the description
  20. */
  21. const MAX_DESC_COUNT = 250;
  22. /**
  23. * Minimum number of characters for the description
  24. */
  25. const MIN_DESC_COUNT = 100;
  /**
   * @brief Search for cached embeddable data of an url, otherwise fetch it
   *
   * The cache lives in the "parsed_url" table and is keyed by the normalised
   * link together with the "guessing" and "oembed" flags. When no usable
   * cache entry exists the page is scraped via getSiteinfo() and the result
   * is written back to the table.
   *
   * @param string $url         The url of the page which should be scraped
   * @param bool   $no_guessing If true the parser doesn't search for
   *                            preview pictures
   * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
   *                            to avoid endless loops
   *
   * @return array|false false when $url is empty, otherwise the data
   *                     returned by getSiteinfo() (see there for the keys)
   *
   * @throws \Friendica\Network\HTTPException\InternalServerErrorException
   * @see   ParseUrl::getSiteinfo() for more information about scraping
   *        embeddable content
   */
  public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true)
  {
      if ($url == "") {
          return false;
      }

      // Normalise once; the same key is used for the lookup and the insert.
      $normalised = Strings::normaliseLink($url);

      $parsed_url = DBA::selectFirst('parsed_url', ['content'],
          ['url' => $normalised, 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
      );

      if (!empty($parsed_url['content'])) {
          // NOTE(review): unserialize() on stored data; the content is written
          // by this method only, but a JSON based format would be safer.
          $data = unserialize($parsed_url['content']);

          // unserialize() yields false for a damaged row. Do not hand that to
          // callers as if it were site data; refetch and overwrite the row.
          if (is_array($data)) {
              return $data;
          }

          Logger::log('Unusable cache entry for ' . $url . ', refetching', Logger::DEBUG);
      }

      $data = self::getSiteinfo($url, $no_guessing, $do_oembed);

      DBA::insert(
          'parsed_url',
          [
              'url' => $normalised, 'guessing' => !$no_guessing,
              'oembed' => $do_oembed, 'content' => serialize($data),
              'created' => DateTimeFormat::utcNow()
          ],
          true
      );

      return $data;
  }
  /**
   * @brief Parse a page for embeddable content information
   *
   * This method parses the url for meta data which can be used to embed
   * the content. Values are applied in this order, later ones overwriting
   * earlier ones: OEmbed data (if enabled), the \<title\> element,
   * \<meta name="..."\> tags (description, twitter cards, Dublin Core,
   * keywords) and finally Open Graph \<meta property="og:..."\> tags.
   *
   * A \<meta http-equiv="refresh"\> redirect is followed by calling this
   * method again with an increased $count.
   *
   * @param string $url         The url of the page which should be scraped
   * @param bool   $no_guessing If true the parser doesn't search for
   *                            preview pictures
   * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
   *                            to avoid endless loops
   * @param int    $count       Internal counter to avoid endless loops
   *
   * @return array which contains needed data for embedding
   *    string 'url'      => The url of the parsed page
   *    string 'type'     => Content type
   *    string 'title'    => The title of the content
   *    string 'text'     => The description for the content
   *    array  'images'   => Preview pictures, each with 'src', 'width' and 'height'
   *    array  'keywords' => The tags which belong to the content
   *
   * @throws \Friendica\Network\HTTPException\InternalServerErrorException
   * @todo  https://developers.google.com/+/plugins/snippet/
   * @verbatim
   * <meta itemprop="name" content="Awesome title">
   * <meta itemprop="description" content="An awesome description">
   * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
   *
   * <body itemscope itemtype="http://schema.org/Product">
   *   <h1 itemprop="name">Shiny Trinket</h1>
   *   <img itemprop="image" src="{image-url}" />
   *   <p itemprop="description">Shiny trinkets are shiny.</p>
   * </body>
   * @endverbatim
   */
  public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1)
  {
      $siteinfo = [];

      // Strip surrounding quotes first, otherwise a quoted URL is not
      // recognised as having a scheme and gets "http://" put in front of
      // the opening quote.
      $url = trim($url, "'");
      $url = trim($url, '"');

      // Check if the URL does contain a scheme
      $scheme = parse_url($url, PHP_URL_SCHEME);
      if ($scheme == '') {
          $url = 'http://' . trim($url, '/');
      }

      if ($count > 10) {
          Logger::log('Endless loop detected for ' . $url, Logger::DEBUG);
          return $siteinfo;
      }

      $url = Network::stripTrackingQueryParams($url);

      $siteinfo['url'] = $url;
      $siteinfo['type'] = 'link';

      $curlResult = Network::curl($url);
      if (!$curlResult->isSuccess()) {
          return $siteinfo;
      }

      // If the file is too large then exit
      if (($curlResult->getInfo()['download_content_length'] ?? 0) > 1000000) {
          return $siteinfo;
      }

      // If it isn't a HTML file then exit
      if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
          return $siteinfo;
      }

      $header = $curlResult->getHeader();
      $body = $curlResult->getBody();

      if ($do_oembed) {
          $oembed_data = OEmbed::fetchURL($url);

          if (!empty($oembed_data->type)) {
              if (!in_array($oembed_data->type, ['error', 'rich', ''])) {
                  $siteinfo['type'] = $oembed_data->type;
              }

              // See https://github.com/friendica/friendica/pull/5763#discussion_r217913178
              if ($siteinfo['type'] != 'photo') {
                  if (isset($oembed_data->title)) {
                      $siteinfo['title'] = trim($oembed_data->title);
                  }
                  if (isset($oembed_data->description)) {
                      $siteinfo['text'] = trim($oembed_data->description);
                  }
                  if (isset($oembed_data->thumbnail_url)) {
                      $siteinfo['image'] = $oembed_data->thumbnail_url;
                  }
              }
          }
      }

      // Fetch the first charset mentioned in the response header
      $charset = '';
      if (preg_match('/charset=(.*?)[\'"\s\n]/', $header, $matches)) {
          $charset = trim(trim(trim(array_pop($matches)), ';,'));
      }

      if ($charset && strtoupper($charset) != 'UTF-8') {
          // See https://github.com/friendica/friendica/issues/5470#issuecomment-418351211
          $charset = str_ireplace('latin-1', 'latin1', $charset);

          Logger::log('detected charset ' . $charset, Logger::DEBUG);

          // iconv() returns false for an unknown charset or an unconvertible
          // sequence. Keep the original body in that case instead of
          // replacing the whole page with false.
          $converted = @iconv($charset, 'UTF-8//TRANSLIT', $body);
          if ($converted !== false) {
              $body = $converted;
          } else {
              Logger::log('charset conversion from ' . $charset . ' failed for ' . $url, Logger::DEBUG);
          }
      }

      // NOTE(review): the 'HTML-ENTITIES' target is deprecated in recent PHP
      // versions; kept as is because the DOM parsing below relies on it.
      $body = mb_convert_encoding($body, 'HTML-ENTITIES', 'UTF-8');

      $doc = new DOMDocument();
      // loadHTML() rejects an empty string (warning on PHP 7, ValueError on
      // PHP 8), so only feed it real markup. An empty document simply yields
      // no matches in the queries below.
      if (is_string($body) && $body !== '') {
          @$doc->loadHTML($body);
      }

      // Remove elements whose text must not end up in the extracted data
      $removals = ['style', 'script', 'option', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'];
      foreach ($removals as $element) {
          XML::deleteNode($doc, $element);
      }

      $xpath = new DOMXPath($doc);

      // Follow <meta http-equiv="refresh" content="0; url=..."> redirects
      foreach ($xpath->query('//meta[@content]') as $node) {
          $meta_tag = self::getMetaTagAttributes($node);

          // The attribute value is case-insensitive ("Refresh" is common)
          if (strtolower(trim($meta_tag['http-equiv'] ?? '')) != 'refresh') {
              continue;
          }

          $content = '';
          foreach (explode(';', $meta_tag['content']) as $value) {
              // Usually there is a space after the semicolon: "0; url=..."
              $value = trim($value);
              if (strtolower(substr($value, 0, 4)) == 'url=') {
                  $content = trim(substr($value, 4));
              }
          }

          if ($content != '') {
              return self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count);
          }
      }

      $list = $xpath->query('//title');
      if ($list->length > 0) {
          $siteinfo['title'] = trim($list->item(0)->nodeValue);
      }

      foreach ($xpath->query('//meta[@name]') as $node) {
          $meta_tag = self::getMetaTagAttributes($node);

          if (empty($meta_tag['content'])) {
              continue;
          }

          // Already trimmed here, so the branches below can assign directly
          $meta_tag['content'] = trim(html_entity_decode($meta_tag['content'], ENT_QUOTES, 'UTF-8'));

          switch (strtolower($meta_tag['name'])) {
              case 'fulltitle':
              case 'twitter:title':
              case 'dc.title':
                  $siteinfo['title'] = $meta_tag['content'];
                  break;
              case 'description':
              case 'twitter:description':
              case 'dc.description':
                  $siteinfo['text'] = $meta_tag['content'];
                  break;
              case 'thumbnail':
              case 'twitter:image':
              case 'twitter:image:src':
                  $siteinfo['image'] = $meta_tag['content'];
                  break;
              case 'twitter:card':
                  // Detect photo pages
                  if ($meta_tag['content'] == 'summary_large_image') {
                      $siteinfo['type'] = 'photo';
                  }
                  break;
              case 'keywords':
              case 'news_keywords':
                  $keywords = explode(',', $meta_tag['content']);
                  break;
          }
      }

      if (isset($keywords)) {
          $siteinfo['keywords'] = [];
          foreach ($keywords as $keyword) {
              $keyword = trim($keyword);
              // Strict comparison: "007" and "7" are different keywords
              if (!in_array($keyword, $siteinfo['keywords'], true)) {
                  $siteinfo['keywords'][] = $keyword;
              }
          }
      }

      // Open Graph values take precedence over everything collected so far
      foreach ($xpath->query('//meta[@property]') as $node) {
          $meta_tag = self::getMetaTagAttributes($node);

          if (empty($meta_tag['content'])) {
              continue;
          }

          $meta_tag['content'] = trim(html_entity_decode($meta_tag['content'], ENT_QUOTES, 'UTF-8'));

          switch (strtolower($meta_tag['property'])) {
              case 'og:image':
                  $siteinfo['image'] = $meta_tag['content'];
                  break;
              case 'og:title':
                  $siteinfo['title'] = $meta_tag['content'];
                  break;
              case 'og:description':
                  $siteinfo['text'] = $meta_tag['content'];
                  break;
          }
      }

      // Prevent to have a photo type without an image
      if ((empty($siteinfo['image']) || !empty($siteinfo['text'])) && ($siteinfo['type'] == 'photo')) {
          $siteinfo['type'] = 'link';
      }

      // The single 'image' entry is replaced by an 'images' list that also
      // carries the dimensions; tiny pictures (10px or less) are dropped.
      if (!empty($siteinfo['image'])) {
          $src = self::completeUrl($siteinfo['image'], $url);

          unset($siteinfo['image']);

          $photodata = Images::getInfoFromURLCached($src);

          if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) {
              $siteinfo['images'][] = ['src' => $src,
                  'width' => $photodata[0],
                  'height' => $photodata[1]];
          }
      }

      // Shorten long descriptions, preferably at the end of a sentence
      if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
          $siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
          $pos = mb_strrpos($siteinfo['text'], '.');
          if ($pos > self::MIN_DESC_COUNT) {
              $siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1);
          }
      }

      Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]);

      Hook::callAll('getsiteinfo', $siteinfo);

      return $siteinfo;
  }

  /**
   * @brief Collect the attributes of a DOM node into an associative array
   *
   * @param \DOMNode $node The node (here: a meta element) to read
   *
   * @return array Attribute name => attribute value
   */
  private static function getMetaTagAttributes($node)
  {
      $attributes = [];

      if ($node->attributes->length) {
          foreach ($node->attributes as $attribute) {
              $attributes[$attribute->name] = $attribute->value;
          }
      }

      return $attributes;
  }
  /**
   * @brief Convert tags from CSV to an array
   *
   * Every CSV field is prefixed with a "#" sign. Empty input yields an
   * empty array.
   *
   * @param string $string Tags as a comma separated list
   *
   * @return array with formatted Hashtags
   */
  public static function convertTagsToArray($string)
  {
      // str_getcsv() turns an empty string into one null field, which would
      // end up as a lone "#" tag.
      if (trim((string) $string) === '') {
          return [];
      }

      // Separator, enclosure and escape are the function's classic defaults,
      // spelled out so the result does not depend on them.
      $arr_tags = str_getcsv($string, ',', '"', '\\');

      // add the # sign to every tag
      // (self::class instead of the string "self", which is a deprecated callable form)
      array_walk($arr_tags, [self::class, 'arrAddHashes']);

      return $arr_tags;
  }
  /**
   * @brief Prefix a tag with a hash sign
   *
   * Callback for array_walk(): the tag is passed by reference and is
   * modified in place.
   *
   * @param string $tag The pure tag name, receives the leading "#"
   * @param int    $k   Array key handed over by array_walk(), not used
   *
   * @return void
   */
  private static function arrAddHashes(&$tag, $k)
  {
      $tag = "#{$tag}";
  }
  /**
   * @brief Add a scheme (and, if needed, a host) to an url
   *
   * The src attribute of some html elements (e.g. images) can miss the
   * scheme or the host, so the missing parts are taken from the url of
   * the page that referenced it.
   *
   * - An url that already has a scheme is returned unchanged.
   * - A scheme-relative url ("//host/path") keeps its own host and port
   *   and only receives the scheme of the page.
   * - Any other url is appended to scheme, host and port of the page.
   *   Paths without a leading slash are attached to the host root.
   *
   * @param string $url    The url which possibly does have
   *                       a missing scheme (a link to an image)
   * @param string $scheme The url with a correct scheme
   *                       (e.g. the url from the webpage which does contain the image)
   *
   * @return string The url with a scheme; $url unchanged when $scheme does
   *                not provide the parts needed to complete it
   */
  private static function completeUrl($url, $scheme)
  {
      $urlarr = parse_url($url);
      if (!is_array($urlarr)) {
          // parse_url() returns false for seriously malformed urls
          $urlarr = [];
      }

      // If the url does already have a scheme
      // we can stop the process here
      if (isset($urlarr['scheme'])) {
          return $url;
      }

      $schemearr = parse_url($scheme);

      // Without a scheme (and a host from either side) nothing sensible
      // can be built, so hand back the input instead of "://".
      $hasHost = !empty($urlarr['host']) || !empty($schemearr['host']);
      if (!is_array($schemearr) || empty($schemearr['scheme']) || !$hasHost) {
          return $url;
      }

      $complete = $schemearr['scheme'] . '://';

      if (!empty($urlarr['host'])) {
          // Scheme-relative url: the authority belongs to the url itself
          $complete .= $urlarr['host'];
          if (!empty($urlarr['port'])) {
              $complete .= ':' . $urlarr['port'];
          }
      } else {
          $complete .= $schemearr['host'];
          if (!empty($schemearr['port'])) {
              $complete .= ':' . $schemearr['port'];
          }
      }

      if (!empty($urlarr['path'])) {
          if (strpos($urlarr['path'], '/') !== 0) {
              $complete .= '/';
          }

          $complete .= $urlarr['path'];
      }

      if (!empty($urlarr['query'])) {
          $complete .= '?' . $urlarr['query'];
      }

      if (!empty($urlarr['fragment'])) {
          $complete .= '#' . $urlarr['fragment'];
      }

      return $complete;
  }
  397. }