Friendica Communications Platform (please note that this is a clone of the repository at github, issues are handled there) https://friendi.ca
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ParseUrl.php 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. <?php
  2. /**
  3. * @file include/ParseUrl.php
  4. * @brief Get information about a given URL
  5. */
  6. namespace Friendica;
  7. use Friendica\Core\Config;
  8. use xml;
  9. use dba;
  10. use DomXPath;
  11. use DOMDocument;
  12. require_once("include/network.php");
  13. require_once("include/Photo.php");
  14. require_once("include/oembed.php");
  15. require_once("include/xml.php");
  16. /**
  17. * @brief Class with methods for extracting certain content from an url
  18. */
  19. class ParseUrl {
  20. /**
  21. * @brief Search for chached embeddable data of an url otherwise fetch it
  22. *
  23. * @param type $url The url of the page which should be scraped
  24. * @param type $no_guessing If true the parse doens't search for
  25. * preview pictures
  26. * @param type $do_oembed The false option is used by the function fetch_oembed()
  27. * to avoid endless loops
  28. *
  29. * @return array which contains needed data for embedding
  30. * string 'url' => The url of the parsed page
  31. * string 'type' => Content type
  32. * string 'title' => The title of the content
  33. * string 'text' => The description for the content
  34. * string 'image' => A preview image of the content (only available
  35. * if $no_geuessing = false
  36. * array'images' = Array of preview pictures
  37. * string 'keywords' => The tags which belong to the content
  38. *
  39. * @see ParseUrl::getSiteinfo() for more information about scraping
  40. * embeddable content
  41. */
  42. public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) {
  43. if ($url == "") {
  44. return false;
  45. }
  46. $r = q("SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d",
  47. dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed));
  48. if ($r) {
  49. $data = $r[0]["content"];
  50. }
  51. if (!is_null($data)) {
  52. $data = unserialize($data);
  53. return $data;
  54. }
  55. $data = self::getSiteinfo($url, $no_guessing, $do_oembed);
  56. dba::insert('parsed_url', array('url' => normalise_link($url), 'guessing' => !$no_guessing,
  57. 'oembed' => $do_oembed, 'content' => serialize($data),
  58. 'created' => datetime_convert()), true);
  59. return $data;
  60. }
  61. /**
  62. * @brief Parse a page for embeddable content information
  63. *
  64. * This method parses to url for meta data which can be used to embed
  65. * the content. If available it prioritizes Open Graph meta tags.
  66. * If this is not available it uses the twitter cards meta tags.
  67. * As fallback it uses standard html elements with meta informations
  68. * like \<title\>Awesome Title\</title\> or
  69. * \<meta name="description" content="An awesome description"\>
  70. *
  71. * @param type $url The url of the page which should be scraped
  72. * @param type $no_guessing If true the parse doens't search for
  73. * preview pictures
  74. * @param type $do_oembed The false option is used by the function fetch_oembed()
  75. * to avoid endless loops
  76. * @param type $count Internal counter to avoid endless loops
  77. *
  78. * @return array which contains needed data for embedding
  79. * string 'url' => The url of the parsed page
  80. * string 'type' => Content type
  81. * string 'title' => The title of the content
  82. * string 'text' => The description for the content
  83. * string 'image' => A preview image of the content (only available
  84. * if $no_geuessing = false
  85. * array'images' = Array of preview pictures
  86. * string 'keywords' => The tags which belong to the content
  87. *
  88. * @todo https://developers.google.com/+/plugins/snippet/
  89. * @verbatim
  90. * <meta itemprop="name" content="Awesome title">
  91. * <meta itemprop="description" content="An awesome description">
  92. * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
  93. *
  94. * <body itemscope itemtype="http://schema.org/Product">
  95. * <h1 itemprop="name">Shiny Trinket</h1>
  96. * <img itemprop="image" src="{image-url}" />
  97. * <p itemprop="description">Shiny trinkets are shiny.</p>
  98. * </body>
  99. * @endverbatim
  100. */
  101. public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) {
  102. $a = get_app();
  103. $siteinfo = array();
  104. // Check if the URL does contain a scheme
  105. $scheme = parse_url($url, PHP_URL_SCHEME);
  106. if ($scheme == "") {
  107. $url = "http://".trim($url, "/");
  108. }
  109. if ($count > 10) {
  110. logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
  111. return($siteinfo);
  112. }
  113. $url = trim($url, "'");
  114. $url = trim($url, '"');
  115. $url = strip_tracking_query_params($url);
  116. $siteinfo["url"] = $url;
  117. $siteinfo["type"] = "link";
  118. $data = z_fetch_url($url);
  119. if (!$data['success']) {
  120. return($siteinfo);
  121. }
  122. // If the file is too large then exit
  123. if ($data["info"]["download_content_length"] > 1000000) {
  124. return($siteinfo);
  125. }
  126. // If it isn't a HTML file then exit
  127. if (($data["info"]["content_type"] != "") && !strstr(strtolower($data["info"]["content_type"]), "html")) {
  128. return($siteinfo);
  129. }
  130. $header = $data["header"];
  131. $body = $data["body"];
  132. if ($do_oembed) {
  133. $oembed_data = oembed_fetch_url($url);
  134. if (!in_array($oembed_data->type, array("error", "rich", ""))) {
  135. $siteinfo["type"] = $oembed_data->type;
  136. }
  137. if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) {
  138. if (isset($oembed_data->title)) {
  139. $siteinfo["title"] = trim($oembed_data->title);
  140. }
  141. if (isset($oembed_data->description)) {
  142. $siteinfo["text"] = trim($oembed_data->description);
  143. }
  144. if (isset($oembed_data->thumbnail_url)) {
  145. $siteinfo["image"] = $oembed_data->thumbnail_url;
  146. }
  147. }
  148. }
  149. // Fetch the first mentioned charset. Can be in body or header
  150. $charset = "";
  151. if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) {
  152. $charset = trim(trim(trim(array_pop($matches)), ';,'));
  153. }
  154. if ($charset == "") {
  155. $charset = "utf-8";
  156. }
  157. if (($charset != "") && (strtoupper($charset) != "UTF-8")) {
  158. logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
  159. //$body = mb_convert_encoding($body, "UTF-8", $charset);
  160. $body = iconv($charset, "UTF-8//TRANSLIT", $body);
  161. }
  162. $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
  163. $doc = new DOMDocument();
  164. @$doc->loadHTML($body);
  165. xml::deleteNode($doc, "style");
  166. xml::deleteNode($doc, "script");
  167. xml::deleteNode($doc, "option");
  168. xml::deleteNode($doc, "h1");
  169. xml::deleteNode($doc, "h2");
  170. xml::deleteNode($doc, "h3");
  171. xml::deleteNode($doc, "h4");
  172. xml::deleteNode($doc, "h5");
  173. xml::deleteNode($doc, "h6");
  174. xml::deleteNode($doc, "ol");
  175. xml::deleteNode($doc, "ul");
  176. $xpath = new DomXPath($doc);
  177. $list = $xpath->query("//meta[@content]");
  178. foreach ($list as $node) {
  179. $attr = array();
  180. if ($node->attributes->length) {
  181. foreach ($node->attributes as $attribute) {
  182. $attr[$attribute->name] = $attribute->value;
  183. }
  184. }
  185. if (@$attr["http-equiv"] == "refresh") {
  186. $path = $attr["content"];
  187. $pathinfo = explode(";", $path);
  188. $content = "";
  189. foreach ($pathinfo as $value) {
  190. if (substr(strtolower($value), 0, 4) == "url=") {
  191. $content = substr($value, 4);
  192. }
  193. }
  194. if ($content != "") {
  195. $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count);
  196. return($siteinfo);
  197. }
  198. }
  199. }
  200. $list = $xpath->query("//title");
  201. if ($list->length > 0) {
  202. $siteinfo["title"] = trim($list->item(0)->nodeValue);
  203. }
  204. //$list = $xpath->query("head/meta[@name]");
  205. $list = $xpath->query("//meta[@name]");
  206. foreach ($list as $node) {
  207. $attr = array();
  208. if ($node->attributes->length) {
  209. foreach ($node->attributes as $attribute) {
  210. $attr[$attribute->name] = $attribute->value;
  211. }
  212. }
  213. $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
  214. if ($attr["content"] != "") {
  215. switch (strtolower($attr["name"])) {
  216. case "fulltitle":
  217. $siteinfo["title"] = trim($attr["content"]);
  218. break;
  219. case "description":
  220. $siteinfo["text"] = trim($attr["content"]);
  221. break;
  222. case "thumbnail":
  223. $siteinfo["image"] = $attr["content"];
  224. break;
  225. case "twitter:image":
  226. $siteinfo["image"] = $attr["content"];
  227. break;
  228. case "twitter:image:src":
  229. $siteinfo["image"] = $attr["content"];
  230. break;
  231. case "twitter:card":
  232. if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) {
  233. $siteinfo["type"] = $attr["content"];
  234. }
  235. break;
  236. case "twitter:description":
  237. $siteinfo["text"] = trim($attr["content"]);
  238. break;
  239. case "twitter:title":
  240. $siteinfo["title"] = trim($attr["content"]);
  241. break;
  242. case "dc.title":
  243. $siteinfo["title"] = trim($attr["content"]);
  244. break;
  245. case "dc.description":
  246. $siteinfo["text"] = trim($attr["content"]);
  247. break;
  248. case "keywords":
  249. $keywords = explode(",", $attr["content"]);
  250. break;
  251. case "news_keywords":
  252. $keywords = explode(",", $attr["content"]);
  253. break;
  254. }
  255. }
  256. if ($siteinfo["type"] == "summary") {
  257. $siteinfo["type"] = "link";
  258. }
  259. }
  260. if (isset($keywords)) {
  261. $siteinfo["keywords"] = array();
  262. foreach ($keywords as $keyword) {
  263. if (!in_array(trim($keyword), $siteinfo["keywords"])) {
  264. $siteinfo["keywords"][] = trim($keyword);
  265. }
  266. }
  267. }
  268. //$list = $xpath->query("head/meta[@property]");
  269. $list = $xpath->query("//meta[@property]");
  270. foreach ($list as $node) {
  271. $attr = array();
  272. if ($node->attributes->length) {
  273. foreach ($node->attributes as $attribute) {
  274. $attr[$attribute->name] = $attribute->value;
  275. }
  276. }
  277. $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
  278. if ($attr["content"] != "") {
  279. switch (strtolower($attr["property"])) {
  280. case "og:image":
  281. $siteinfo["image"] = $attr["content"];
  282. break;
  283. case "og:title":
  284. $siteinfo["title"] = trim($attr["content"]);
  285. break;
  286. case "og:description":
  287. $siteinfo["text"] = trim($attr["content"]);
  288. break;
  289. }
  290. }
  291. }
  292. if ((@$siteinfo["image"] == "") && !$no_guessing) {
  293. $list = $xpath->query("//img[@src]");
  294. foreach ($list as $node) {
  295. $attr = array();
  296. if ($node->attributes->length) {
  297. foreach ($node->attributes as $attribute) {
  298. $attr[$attribute->name] = $attribute->value;
  299. }
  300. }
  301. $src = self::completeUrl($attr["src"], $url);
  302. $photodata = get_photo_info($src);
  303. if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
  304. if ($photodata[0] > 300) {
  305. $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
  306. $photodata[0] = 300;
  307. }
  308. if ($photodata[1] > 300) {
  309. $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
  310. $photodata[1] = 300;
  311. }
  312. $siteinfo["images"][] = array("src" => $src,
  313. "width" => $photodata[0],
  314. "height" => $photodata[1]);
  315. }
  316. }
  317. } elseif ($siteinfo["image"] != "") {
  318. $src = self::completeUrl($siteinfo["image"], $url);
  319. unset($siteinfo["image"]);
  320. $photodata = get_photo_info($src);
  321. if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) {
  322. $siteinfo["images"][] = array("src" => $src,
  323. "width" => $photodata[0],
  324. "height" => $photodata[1]);
  325. }
  326. }
  327. if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) {
  328. $text = "";
  329. $list = $xpath->query("//div[@class='article']");
  330. foreach ($list as $node) {
  331. if (strlen($node->nodeValue) > 40) {
  332. $text .= " ".trim($node->nodeValue);
  333. }
  334. }
  335. if ($text == "") {
  336. $list = $xpath->query("//div[@class='content']");
  337. foreach ($list as $node) {
  338. if (strlen($node->nodeValue) > 40) {
  339. $text .= " ".trim($node->nodeValue);
  340. }
  341. }
  342. }
  343. // If none text was found then take the paragraph content
  344. if ($text == "") {
  345. $list = $xpath->query("//p");
  346. foreach ($list as $node) {
  347. if (strlen($node->nodeValue) > 40) {
  348. $text .= " ".trim($node->nodeValue);
  349. }
  350. }
  351. }
  352. if ($text != "") {
  353. $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));
  354. while (strpos($text, " ")) {
  355. $text = trim(str_replace(" ", " ", $text));
  356. }
  357. $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...');
  358. }
  359. }
  360. logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);
  361. call_hooks("getsiteinfo", $siteinfo);
  362. return($siteinfo);
  363. }
  364. /**
  365. * @brief Convert tags from CSV to an array
  366. *
  367. * @param string $string Tags
  368. * @return array with formatted Hashtags
  369. */
  370. public static function convertTagsToArray($string) {
  371. $arr_tags = str_getcsv($string);
  372. if (count($arr_tags)) {
  373. // add the # sign to every tag
  374. array_walk($arr_tags, array("self", "arrAddHashes"));
  375. return $arr_tags;
  376. }
  377. }
  378. /**
  379. * @brief Add a hasht sign to a string
  380. *
  381. * This method is used as callback function
  382. *
  383. * @param string $tag The pure tag name
  384. * @param int $k Counter for internal use
  385. */
  386. private static function arrAddHashes(&$tag, $k) {
  387. $tag = "#" . $tag;
  388. }
  389. /**
  390. * @brief Add a scheme to an url
  391. *
  392. * The src attribute of some html elements (e.g. images)
  393. * can miss the scheme so we need to add the correct
  394. * scheme
  395. *
  396. * @param string $url The url which possibly does have
  397. * a missing scheme (a link to an image)
  398. * @param string $scheme The url with a correct scheme
  399. * (e.g. the url from the webpage which does contain the image)
  400. *
  401. * @return string The url with a scheme
  402. */
  403. private static function completeUrl($url, $scheme) {
  404. $urlarr = parse_url($url);
  405. // If the url does allready have an scheme
  406. // we can stop the process here
  407. if (isset($urlarr["scheme"])) {
  408. return($url);
  409. }
  410. $schemearr = parse_url($scheme);
  411. $complete = $schemearr["scheme"]."://".$schemearr["host"];
  412. if (@$schemearr["port"] != "") {
  413. $complete .= ":".$schemearr["port"];
  414. }
  415. if (strpos($urlarr["path"],"/") !== 0) {
  416. $complete .= "/";
  417. }
  418. $complete .= $urlarr["path"];
  419. if (@$urlarr["query"] != "") {
  420. $complete .= "?".$urlarr["query"];
  421. }
  422. if (@$urlarr["fragment"] != "") {
  423. $complete .= "#".$urlarr["fragment"];
  424. }
  425. return($complete);
  426. }
  427. }