Friendica Communications Platform (please note that this is a clone of the repository at github, issues are handled there) https://friendi.ca
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

560 lines
15 KiB

11 years ago
11 years ago
11 years ago
11 years ago
  1. <?php
  2. /**
  3. * @file mod/parse_url.php
  4. *
  5. * @todo https://developers.google.com/+/plugins/snippet/
  6. *
  7. * @verbatim
  8. * <meta itemprop="name" content="Toller Titel">
  9. * <meta itemprop="description" content="Eine tolle Beschreibung">
  10. * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
  11. *
  12. * <body itemscope itemtype="http://schema.org/Product">
  13. * <h1 itemprop="name">Shiny Trinket</h1>
  14. * <img itemprop="image" src="{image-url}" />
  15. * <p itemprop="description">Shiny trinkets are shiny.</p>
  16. * </body>
  17. * @endverbatim
  18. */
  if (!function_exists('deletenode')) {
      /**
       * Remove every element matched by "//<node>" from a DOM document.
       *
       * The document is modified in place.
       *
       * @param DOMDocument $doc  Document to strip the elements from.
       * @param string      $node Node test that is appended verbatim to "//"
       *                          to build the XPath query (e.g. "script").
       */
      function deletenode(&$doc, $node)
      {
          $finder  = new DomXPath($doc);
          $matches = $finder->query("//".$node);

          foreach ($matches as $element) {
              $parent = $element->parentNode;
              $parent->removeChild($element);
          }
      }
  }
  /**
   * Complete a relative URL with scheme, host and port of a base URL.
   *
   * - URLs that already carry a scheme are returned unchanged.
   * - Protocol-relative URLs ("//host/path") keep their own host (and port)
   *   and only inherit the scheme of the base URL.
   * - All other URLs are attached to scheme://host[:port] of the base URL.
   *   A path that does not start with "/" is attached directly below the
   *   host root; it is NOT resolved against the directory of the base URL.
   * - If either URL cannot be parsed, or the base URL has no scheme/host to
   *   borrow, $url is returned unchanged.
   *
   * @param string $url    URL that may be relative.
   * @param string $scheme Absolute base URL that provides scheme, host and port.
   * @return string The completed URL.
   */
  function completeurl($url, $scheme) {
      $urlarr = parse_url($url);

      // Unparseable input or already absolute: nothing to complete.
      if (!is_array($urlarr) || isset($urlarr["scheme"])) {
          return($url);
      }

      $schemearr = parse_url($scheme);

      // Without a base scheme no absolute URL can be built.
      if (!is_array($schemearr) || !isset($schemearr["scheme"])) {
          return($url);
      }

      if (isset($urlarr["host"])) {
          // Protocol-relative URL: it names its own host, only the scheme is missing.
          $host = $urlarr["host"];
          $port = isset($urlarr["port"]) ? $urlarr["port"] : "";
      } elseif (isset($schemearr["host"])) {
          $host = $schemearr["host"];
          $port = isset($schemearr["port"]) ? $schemearr["port"] : "";
      } else {
          return($url);
      }

      $complete = $schemearr["scheme"]."://".$host;

      if ($port != "") {
          $complete .= ":".$port;
      }

      // parse_url() omits "path" for inputs like "?a=b" or "#top".
      $path = isset($urlarr["path"]) ? $urlarr["path"] : "";

      if (strpos($path, '/') !== 0) {
          $complete .= '/';
      }

      $complete .= $path;

      if (isset($urlarr["query"]) && ($urlarr["query"] != "")) {
          $complete .= "?".$urlarr["query"];
      }

      if (isset($urlarr["fragment"]) && ($urlarr["fragment"] != "")) {
          $complete .= "#".$urlarr["fragment"];
      }

      return($complete);
  }
  /**
   * Cached variant of parseurl_getsiteinfo().
   *
   * Looks the (normalised) URL up in the `parsed_url` table for the given
   * combination of guessing/oembed flags. On a hit the stored, serialized
   * result is returned; otherwise the page is fetched with
   * parseurl_getsiteinfo() and the result is stored before it is returned.
   *
   * @param string $url         URL to get the site information for.
   * @param bool   $no_guessing Passed through to parseurl_getsiteinfo().
   * @param bool   $do_oembed   Passed through to parseurl_getsiteinfo().
   * @return array|bool Site information, or false when $url is empty.
   */
  function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) {
      if ($url == "")
          return false;

      $normalised = normalise_link($url);
      $guessing = intval(!$no_guessing);
      $oembed = intval($do_oembed);

      $r = q("SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d",
          dbesc($normalised), $guessing, $oembed);

      // Start from a defined "no cache entry" state; the lookup may return no row.
      $data = null;
      if (is_array($r) && isset($r[0]["content"]))
          $data = $r[0]["content"];

      if (!is_null($data)) {
          // NOTE(review): unserialize() is only safe as long as this table is
          // written exclusively by the INSERT below.
          $data = unserialize($data);
          return $data;
      }

      $data = parseurl_getsiteinfo($url, $no_guessing, $do_oembed);

      q("INSERT INTO `parsed_url` (`url`, `guessing`, `oembed`, `content`) VALUES ('%s', %d, %d, '%s')",
          dbesc($normalised), $guessing, $oembed, dbesc(serialize($data)));

      return $data;
  }
  /**
   * Fetch a web page and collect preview information about it.
   *
   * The page is requested twice: a header-only request decides whether the
   * URL redirects, is too large or is not HTML; a second request fetches the
   * body, which is parsed for <title>, <meta> and (optionally) <img> and text
   * nodes. HTTP redirects and "meta refresh" targets are followed by
   * recursion, bounded by $count.
   *
   * Keys of the returned array:
   *  - "url", "type": always set, unless the recursion limit was hit (then the
   *    array is empty)
   *  - "title", "text": when found
   *  - "keywords": array of trimmed, unique keywords when the page has any
   *  - "images": list of arrays with "src", "width" and "height"
   * The array is passed through the 'getsiteinfo' hook before it is returned.
   *
   * @param string $url         URL to inspect; surrounding quotes are stripped.
   * @param bool   $no_guessing When true, images and text are only taken from
   *                            meta data and never guessed from <img>, <div>
   *                            or <p> elements.
   * @param bool   $do_oembed   When true, oembed_fetch_url() is queried as well.
   * @param int    $count       Recursion depth; more than 10 aborts.
   * @return array Site information as described above.
   */
  function parseurl_getsiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) {
      require_once("include/network.php");

      $a = get_app();

      $siteinfo = array();

      // Redirects are followed by recursion, so stop chains that never end.
      if ($count > 10) {
          logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
          return($siteinfo);
      }

      $url = trim($url, "'");
      $url = trim($url, '"');

      $url = original_url($url);

      $siteinfo["url"] = $url;
      $siteinfo["type"] = "link";

      // First request: headers only, to look at status, size and content type.
      $stamp1 = microtime(true);

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, 1);
      curl_setopt($ch, CURLOPT_NOBODY, 1);
      curl_setopt($ch, CURLOPT_TIMEOUT, 3);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      // CURLOPT_FOLLOWLOCATION stays off: redirects are followed manually below.
      curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());

      curl_exec($ch);
      $curl_info = @curl_getinfo($ch);
      curl_close($ch);

      $a->save_timestamp($stamp1, "network");

      if (!is_array($curl_info))
          $curl_info = array();

      $http_code = isset($curl_info['http_code']) ? intval($curl_info['http_code']) : 0;

      // Prefer "redirect_url"; "location" is only used when it is present.
      $redirect = "";
      if (isset($curl_info['redirect_url']) && ($curl_info['redirect_url'] != ""))
          $redirect = $curl_info['redirect_url'];
      elseif (isset($curl_info['location']) && ($curl_info['location'] != ""))
          $redirect = $curl_info['location'];

      if (in_array($http_code, array(301, 302, 303, 307)) && ($redirect != "")) {
          $siteinfo = parseurl_getsiteinfo($redirect, $no_guessing, $do_oembed, ++$count);
          return($siteinfo);
      }

      // if the file is too large then exit
      if (isset($curl_info["download_content_length"]) && ($curl_info["download_content_length"] > 1000000))
          return($siteinfo);

      // if it isn't a HTML file then exit
      $content_type = isset($curl_info["content_type"]) ? (string)$curl_info["content_type"] : "";
      if (($content_type != "") && !strstr(strtolower($content_type), "html"))
          return($siteinfo);

      if ($do_oembed) {
          require_once("include/oembed.php");

          $oembed_data = oembed_fetch_url($url);

          // Only use the answer when it is an object that carries a type.
          if (is_object($oembed_data) && isset($oembed_data->type)) {
              if ($oembed_data->type != "error")
                  $siteinfo["type"] = $oembed_data->type;

              if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) {
                  if (isset($oembed_data->title))
                      $siteinfo["title"] = $oembed_data->title;
                  if (isset($oembed_data->description))
                      $siteinfo["text"] = trim($oembed_data->description);
                  if (isset($oembed_data->thumbnail_url))
                      $siteinfo["image"] = $oembed_data->thumbnail_url;
              }
          }
      }

      // Second request: now fetch the body as well
      $stamp1 = microtime(true);

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, 1);
      curl_setopt($ch, CURLOPT_NOBODY, 0);
      curl_setopt($ch, CURLOPT_TIMEOUT, 10);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());

      $response = curl_exec($ch);
      curl_close($ch);

      $a->save_timestamp($stamp1, "network");

      // curl_exec() returns false on failure; continue with an empty document then.
      if (!is_string($response))
          $response = "";

      // Fetch the first mentioned charset. Can be in body or header
      $charset = "";
      if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $response, $matches))
          $charset = trim(trim(trim(array_pop($matches)), ';,'));

      if ($charset == "")
          $charset = "utf-8";

      // Headers and body are separated by the first empty line.
      $pos = strpos($response, "\r\n\r\n");

      if ($pos)
          $body = trim(substr($response, $pos));
      else
          $body = $response;

      if (strtoupper($charset) != "UTF-8") {
          logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);

          // Keep the unconverted body when the charset is unknown to iconv.
          $converted = iconv($charset, "UTF-8//TRANSLIT", $body);
          if ($converted !== false)
              $body = $converted;
      }

      $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");

      $doc = new DOMDocument();

      // loadHTML() rejects empty input; an empty document simply yields no matches below.
      if ($body != "")
          @$doc->loadHTML($body);

      // Drop elements whose content must not end up in the guessed text.
      foreach (array('style', 'script', 'option', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul') as $tag)
          deletenode($doc, $tag);

      $xpath = new DomXPath($doc);

      // Follow a "meta refresh" that names a target URL.
      $list = $xpath->query("//meta[@content]");
      foreach ($list as $node) {
          $attr = array();
          if ($node->attributes->length)
              foreach ($node->attributes as $attribute)
                  $attr[$attribute->name] = $attribute->value;

          if (isset($attr["http-equiv"]) && ($attr["http-equiv"] == 'refresh')) {
              $pathinfo = explode(";", $attr["content"]);
              $content = "";
              foreach ($pathinfo as $value) {
                  if (substr(strtolower($value), 0, 4) == "url=")
                      $content = substr($value, 4);
              }
              if ($content != "") {
                  $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count);
                  return($siteinfo);
              }
          }
      }

      $list = $xpath->query("//title");
      foreach ($list as $node)
          $siteinfo["title"] = html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");

      // <meta name="..."> values; later tags overwrite earlier ones.
      $list = $xpath->query("//meta[@name]");
      foreach ($list as $node) {
          $attr = array();
          if ($node->attributes->length)
              foreach ($node->attributes as $attribute)
                  $attr[$attribute->name] = $attribute->value;

          // A meta tag may lack the content attribute; treat it as empty.
          $content = isset($attr["content"]) ? trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")) : "";

          if ($content != "")
              switch (strtolower($attr["name"])) {
                  case "fulltitle":
                  case "twitter:title":
                  case "dc.title":
                      $siteinfo["title"] = $content;
                      break;
                  case "description":
                  case "twitter:description":
                  case "dc.description":
                      $siteinfo["text"] = $content;
                      break;
                  case "thumbnail":
                  case "twitter:image":
                  case "twitter:image:src":
                      $siteinfo["image"] = $content;
                      break;
                  case "twitter:card":
                      if (($siteinfo["type"] == "") || ($content == "photo"))
                          $siteinfo["type"] = $content;
                      break;
                  case "keywords":
                  case "news_keywords":
                      $keywords = explode(",", $content);
                      break;
              }

          // A twitter "summary" card is handled like a plain link.
          if ($siteinfo["type"] == "summary")
              $siteinfo["type"] = "link";
      }

      if (isset($keywords)) {
          $siteinfo["keywords"] = array();
          foreach ($keywords as $keyword)
              if (!in_array(trim($keyword), $siteinfo["keywords"]))
                  $siteinfo["keywords"][] = trim($keyword);
      }

      // Open Graph values override what was found so far.
      $list = $xpath->query("//meta[@property]");
      foreach ($list as $node) {
          $attr = array();
          if ($node->attributes->length)
              foreach ($node->attributes as $attribute)
                  $attr[$attribute->name] = $attribute->value;

          $content = isset($attr["content"]) ? trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")) : "";

          if ($content != "")
              switch (strtolower($attr["property"])) {
                  case "og:image":
                      $siteinfo["image"] = $content;
                      break;
                  case "og:title":
                      $siteinfo["title"] = $content;
                      break;
                  case "og:description":
                      $siteinfo["text"] = $content;
                      break;
              }
      }

      $meta_image = isset($siteinfo["image"]) ? $siteinfo["image"] : "";

      if (($meta_image == "") && !$no_guessing) {
          // No image announced by the page: take every <img> larger than 150x150.
          $list = $xpath->query("//img[@src]");
          foreach ($list as $node) {
              $attr = array();
              if ($node->attributes->length)
                  foreach ($node->attributes as $attribute)
                      $attr[$attribute->name] = $attribute->value;

              $src = completeurl($attr["src"], $url);
              $photodata = @getimagesize($src);

              if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
                  // Scale the reported dimensions so that neither exceeds 300.
                  if ($photodata[0] > 300) {
                      $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
                      $photodata[0] = 300;
                  }

                  if ($photodata[1] > 300) {
                      $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
                      $photodata[1] = 300;
                  }
                  $siteinfo["images"][] = array("src"=>$src,
                                  "width"=>$photodata[0],
                                  "height"=>$photodata[1]);
              }
          }
      } else {
          unset($siteinfo["image"]);

          // Only probe an image that was actually announced; with $no_guessing
          // and no image there is nothing to request.
          if ($meta_image != "") {
              $src = completeurl($meta_image, $url);
              $photodata = @getimagesize($src);

              if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10))
                  $siteinfo["images"][] = array("src"=>$src,
                                  "width"=>$photodata[0],
                                  "height"=>$photodata[1]);
          }
      }

      $has_text = isset($siteinfo["text"]) && ($siteinfo["text"] != "");
      $has_title = isset($siteinfo["title"]) && ($siteinfo["title"] != "");

      if (!$has_text && $has_title && !$no_guessing) {
          $text = "";

          // Try article containers first, then content containers, then plain
          // paragraphs; the first query that yields text wins.
          foreach (array("//div[@class='article']", "//div[@class='content']", "//p") as $query) {
              if ($text != "")
                  break;

              foreach ($xpath->query($query) as $node)
                  if (strlen($node->nodeValue) > 40)
                      $text .= " ".trim($node->nodeValue);
          }

          if ($text != "") {
              $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));

              // Collapse runs of spaces into a single space.
              while (strpos($text, "  ") !== false)
                  $text = trim(str_replace("  ", " ", $text));

              // Cut by characters, not bytes, so no UTF-8 sequence is split.
              $siteinfo["text"] = trim(html_entity_decode(mb_substr($text, 0, 350, "UTF-8"), ENT_QUOTES, "UTF-8").'...');
          }
      }

      logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);

      call_hooks('getsiteinfo', $siteinfo);

      return($siteinfo);
  }
  /**
   * array_walk() callback that turns a value into a hashtag by putting "#"
   * in front of it.
   *
   * @param mixed $item Array value, modified in place.
   * @param mixed $k    Array key (unused, required by array_walk()).
   */
  function arr_add_hashes(&$item,$k) {
      $prefix = '#';
      $item = $prefix . $item;
  }
  /**
   * Module entry point: build a bookmark snippet for a URL and echo it.
   *
   * Request parameters (all read from $_GET):
   *  - "binurl" (hex encoded) or "url": the link to attach
   *  - "title", "description": optional, tags are stripped from both
   *  - "tags": optional CSV list, every entry is turned into a "#tag"
   *
   * The snippet is BBCode when the local user has the 'richtext' feature
   * disabled, HTML otherwise. Output is produced by, in this order:
   *  1. the 'parse_link' hook, when it fills in "text",
   *  2. the request itself, when url, title and description are all given,
   *  3. parseurl_getsiteinfo(), wrapped in a [class=type-...] element.
   * Every path ends with killme().
   *
   * @param App $a Application object (not used by this function).
   */
  function parse_url_content(&$a) {

      $text = null;
      $title = '';
      $str_tags = '';

      $textmode = false;

      if (local_user() && (! feature_enabled(local_user(),'richtext')))
          $textmode = true;

      $br = (($textmode) ? "\n" : '<br />');

      if (x($_GET,'binurl'))
          $url = trim(hex2bin($_GET['binurl']));
      else
          $url = (isset($_GET['url']) ? trim($_GET['url']) : '');

      if (!empty($_GET['title']))
          $title = strip_tags(trim($_GET['title']));

      if (!empty($_GET['description']))
          $text = strip_tags(trim($_GET['description']));

      if (!empty($_GET['tags'])) {
          $arr_tags = str_getcsv($_GET['tags']);
          if (count($arr_tags)) {
              array_walk($arr_tags,'arr_add_hashes');
              $str_tags = $br . implode(' ',$arr_tags) . $br;
          }
      }

      // add url scheme if missing
      $arrurl = parse_url($url);
      if (!x($arrurl, 'scheme')) {
          if (x($arrurl, 'host'))
              $url = "http:".$url;
          else
              $url = "http://".$url;
      }

      logger('parse_url: ' . $url);

      if ($textmode)
          $template = '[bookmark=%s]%s[/bookmark]%s';
      else
          $template = "<a class=\"bookmark\" href=\"%s\" >%s</a>%s";

      // Give plugins the chance to provide the complete snippet.
      $arr = array('url' => $url, 'text' => '');

      call_hooks('parse_link', $arr);

      if (strlen($arr['text'])) {
          echo $arr['text'];
          killme();
      }

      // Everything was supplied with the request: no need to fetch the page.
      if ($url && $title && $text) {

          $title = str_replace(array("\r","\n"),array('',''),$title);

          if ($textmode) {
              $text = '[quote]' . trim($text) . '[/quote]' . $br;
              $link = $url;
          } else {
              $text = '<blockquote>' . htmlspecialchars(trim($text)) . '</blockquote><br />';
              $title = htmlspecialchars($title);
              // The URL ends up inside an HTML attribute.
              $link = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
          }

          $result = sprintf($template,$link,($title) ? $title : $link,$text) . $str_tags;

          logger('parse_url (unparsed): returns: ' . $result);

          echo $result;
          killme();
      }

      $siteinfo = parseurl_getsiteinfo($url);

      // Continue with the URL the page was finally fetched from. The result
      // is empty when the redirect limit was hit; keep the requested URL then.
      if (isset($siteinfo["url"]) && ($siteinfo["url"] != ""))
          $url = $siteinfo["url"];

      // If the link contains BBCode stuff, make a short link out of this to avoid parsing problems
      if ((strpos($url, '[') !== false) || (strpos($url, ']') !== false)) {
          require_once("include/network.php");
          $url = short_link($url);
      }

      $sitedata = "";

      if (isset($siteinfo["title"]) && ($siteinfo["title"] != "")) {
          $text = (isset($siteinfo["text"]) ? $siteinfo["text"] : "");
          $title = $siteinfo["title"];
      }

      $type = (isset($siteinfo["type"]) ? $siteinfo["type"] : "");
      $images = ((isset($siteinfo["images"]) && is_array($siteinfo["images"])) ? $siteinfo["images"] : array());

      $image = "";

      if (($type != "video") && (count($images) > 0)) {
          /* Execute below code only if image is present in siteinfo */

          $total_images = 0;
          $max_images = get_config('system','max_bookmark_images');
          if ($max_images === false)
              $max_images = 2;
          else
              $max_images = intval($max_images);

          foreach ($images as $imagedata) {
              if ($textmode)
                  $image .= '[img='.$imagedata["width"].'x'.$imagedata["height"].']'.$imagedata["src"].'[/img]' . "\n";
              else
                  $image .= '<img height="'.$imagedata["height"].'" width="'.$imagedata["width"].'" src="'.htmlspecialchars($imagedata["src"], ENT_QUOTES, 'UTF-8').'" alt="photo" /><br />';

              $total_images ++;

              // Stop once the configured number of images is reached (0 = no limit).
              if ($max_images && ($total_images >= $max_images))
                  break;
          }
      }

      if (strlen($text)) {
          if ($textmode)
              $text = '[quote]'.trim($text).'[/quote]';
          else
              $text = '<blockquote>'.htmlspecialchars(trim($text)).'</blockquote>';
      }

      if ($image)
          $text = $br.$br.$image.$text;
      else
          $text = $br.$text;

      $title = str_replace(array("\r","\n"),array('',''),$title);

      if ($textmode) {
          $link = $url;
      } else {
          // Title and URL come from the remote page; escape them for HTML output.
          $title = htmlspecialchars($title, ENT_QUOTES, 'UTF-8');
          $link = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
      }

      $result = sprintf($template,$link,($title) ? $title : $link,$text) . $str_tags;

      logger('parse_url: returns: ' . $result);

      $sitedata .= trim($result);

      if (($type == "video") && ($url != ""))
          echo "[class=type-video]".$sitedata."[/class]";
      elseif ($type != "photo")
          echo "[class=type-link]".$sitedata."[/class]";
      else
          echo "[class=type-photo]".$title.$br.$image."[/class]";

      killme();
  }
  442. ?>