From e9226eaf45e70bcd5a9a9f66b6b922dbc15c47ba Mon Sep 17 00:00:00 2001 From: rabuzarus Date: Thu, 24 Nov 2016 01:11:22 +0100 Subject: [PATCH 1/7] parse_url: recognize image/video/audio files + move functions into own class --- include/ParseUrl.php | 482 ++++++++++++++++++++++++++++++++++++++ include/items.php | 9 +- include/oembed.php | 75 +++--- include/plaintext.php | 22 +- mod/parse_url.php | 523 +++++++----------------------------------- 5 files changed, 625 insertions(+), 486 deletions(-) create mode 100644 include/ParseUrl.php diff --git a/include/ParseUrl.php b/include/ParseUrl.php new file mode 100644 index 000000000..8a3392e73 --- /dev/null +++ b/include/ParseUrl.php @@ -0,0 +1,482 @@ + 10) { + logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG); + return($siteinfo); + } + + $url = trim($url, "'"); + $url = trim($url, '"'); + + $url = original_url($url); + + $siteinfo["url"] = $url; + $siteinfo["type"] = "link"; + + $check_cert = Config::get("system", "verifyssl"); + + $stamp1 = microtime(true); + + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_HEADER, 1); + curl_setopt($ch, CURLOPT_NOBODY, 1); + curl_setopt($ch, CURLOPT_TIMEOUT, 3); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); + curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); + + $header = curl_exec($ch); + $curl_info = @curl_getinfo($ch); + $http_code = $curl_info["http_code"]; + curl_close($ch); + + $a->save_timestamp($stamp1, "network"); + + if ((($curl_info["http_code"] == "301") || ($curl_info["http_code"] == "302") || ($curl_info["http_code"] == "303") || ($curl_info["http_code"] == "307")) + && (($curl_info["redirect_url"] != "") || ($curl_info["location"] != ""))) { + if ($curl_info["redirect_url"] != "") { + $siteinfo = self::getSiteinfo($curl_info["redirect_url"], $no_guessing, $do_oembed, ++$count); + } else { + $siteinfo = self::getSiteinfo($curl_info["location"], $no_guessing, $do_oembed, ++$count); + } + return($siteinfo); + } + + // If the file is too large then exit + if ($curl_info["download_content_length"] > 1000000) { + return($siteinfo); + } + + // If it isn't a HTML file then exit + if (($curl_info["content_type"] != "") && !strstr(strtolower($curl_info["content_type"]), "html")) { + return($siteinfo); + } + + if ($do_oembed) { + + $oembed_data = oembed_fetch_url($url); + + if (!in_array($oembed_data->type, array("error", "rich"))) { + $siteinfo["type"] = $oembed_data->type; + } + + if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) { + if (isset($oembed_data->title)) { + $siteinfo["title"] = $oembed_data->title; + } + if (isset($oembed_data->description)) { + $siteinfo["text"] = trim($oembed_data->description); + } + if (isset($oembed_data->thumbnail_url)) { + $siteinfo["image"] = $oembed_data->thumbnail_url; + } + } + } + + $stamp1 = microtime(true); + + // Now fetch the body as well + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_HEADER, 1); + curl_setopt($ch, CURLOPT_NOBODY, 0); + curl_setopt($ch, CURLOPT_TIMEOUT, 10); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); + curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); + + $header = curl_exec($ch); + $curl_info = @curl_getinfo($ch); + $http_code = $curl_info["http_code"]; + curl_close($ch); + + $a->save_timestamp($stamp1, "network"); + + // Fetch the first mentioned charset. Can be in body or header + $charset = ""; + if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) { + $charset = trim(trim(trim(array_pop($matches)), ';,')); + } + + if ($charset == "") { + $charset = "utf-8"; + } + + $pos = strpos($header, "\r\n\r\n"); + + if ($pos) { + $body = trim(substr($header, $pos)); + } else { + $body = $header; + } + + if (($charset != "") && (strtoupper($charset) != "UTF-8")) { + logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG); + //$body = mb_convert_encoding($body, "UTF-8", $charset); + $body = iconv($charset, "UTF-8//TRANSLIT", $body); + } + + $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); + + $doc = new \DOMDocument(); + @$doc->loadHTML($body); + + self::deleteNode($doc, "style"); + self::deleteNode($doc, "script"); + self::deleteNode($doc, "option"); + self::deleteNode($doc, "h1"); + self::deleteNode($doc, "h2"); + self::deleteNode($doc, "h3"); + self::deleteNode($doc, "h4"); + self::deleteNode($doc, "h5"); + self::deleteNode($doc, "h6"); + self::deleteNode($doc, "ol"); + self::deleteNode($doc, "ul"); + + $xpath = new \DomXPath($doc); + + $list = $xpath->query("//meta[@content]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + if (@$attr["http-equiv"] == "refresh") { + $path = $attr["content"]; + $pathinfo = explode(";", $path); + $content = ""; + foreach ($pathinfo as $value) { + if (substr(strtolower($value), 0, 4) == "url=") { + $content = substr($value, 4); + } + } + if ($content != "") { + $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count); + return($siteinfo); + } + } + } + + $list = $xpath->query("//title"); + if ($list->length > 0) { + $siteinfo["title"] = $list->item(0)->nodeValue; + } + + //$list = $xpath->query("head/meta[@name]"); + $list = $xpath->query("//meta[@name]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); + + if ($attr["content"] != "") { + switch (strtolower($attr["name"])) { + case "fulltitle": + $siteinfo["title"] = $attr["content"]; + break; + case "description": + $siteinfo["text"] = $attr["content"]; + break; + case "thumbnail": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:image": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:image:src": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:card": + if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) { + $siteinfo["type"] = $attr["content"]; + } + break; + case "twitter:description": + $siteinfo["text"] = $attr["content"]; + break; + case "twitter:title": + $siteinfo["title"] = $attr["content"]; + break; + case "dc.title": + $siteinfo["title"] = $attr["content"]; + break; + case "dc.description": + $siteinfo["text"] = $attr["content"]; + break; + case "keywords": + $keywords = explode(",", $attr["content"]); + break; + case "news_keywords": + $keywords = explode(",", $attr["content"]); + break; + } + } + if ($siteinfo["type"] == "summary") { + $siteinfo["type"] = "link"; + } + } + + if (isset($keywords)) { + $siteinfo["keywords"] = array(); + foreach ($keywords as $keyword) { + if (!in_array(trim($keyword), $siteinfo["keywords"])) { + $siteinfo["keywords"][] = trim($keyword); + } + } + } + + //$list = $xpath->query("head/meta[@property]"); + $list = $xpath->query("//meta[@property]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); + + if ($attr["content"] != "") { + switch (strtolower($attr["property"])) { + case "og:image": + $siteinfo["image"] = $attr["content"]; + break; + case "og:title": + $siteinfo["title"] = $attr["content"]; + break; + case "og:description": + $siteinfo["text"] = $attr["content"]; + break; + } + } + } + + if ((@$siteinfo["image"] == "") && !$no_guessing) { + $list = $xpath->query("//img[@src]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $src = self::completeUrl($attr["src"], $url); + $photodata = get_photo_info($src); + + if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) { + if ($photodata[0] > 300) { + $photodata[1] = round($photodata[1] * (300 / $photodata[0])); + $photodata[0] = 300; + } + if ($photodata[1] > 300) { + $photodata[0] = round($photodata[0] * (300 / $photodata[1])); + $photodata[1] = 300; + } + $siteinfo["images"][] = array("src" => $src, + "width" => $photodata[0], + "height" => $photodata[1]); + } + + } + } elseif ($siteinfo["image"] != "") { + $src = self::completeUrl($siteinfo["image"], $url); + + unset($siteinfo["image"]); + + $photodata = get_photo_info($src); + + if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) { + $siteinfo["images"][] = array("src" => $src, + "width" => $photodata[0], + "height" => $photodata[1]); + } + } + + if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) { + $text = ""; + + $list = $xpath->query("//div[@class='article']"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + + if ($text == "") { + $list = $xpath->query("//div[@class='content']"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + } + + // If none text was found then take the paragraph content + if ($text == "") { + $list = $xpath->query("//p"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + } + + if ($text != "") { + $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text)); + + while (strpos($text, " ")) { + $text = trim(str_replace(" ", " ", $text)); + } + + $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...'); + } + } + + logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG); + + call_hooks("getsiteinfo", $siteinfo); + + return($siteinfo); + } + + /** + * @brief Convert tags from CSV to an array + * + * @param string $string Tags + * @return array with formatted Hashtags + */ + public static function convertTagsToArray($string) { + $arr_tags = str_getcsv($string); + if (count($arr_tags)) { + // add the # sign to every tag + array_walk($arr_tags, array("self", "arrAddHashes")); + + return $arr_tags; + } + } + + /** + * @brief Add a hasht sign to a string + * + * This method is used as callback function + * + * @param string $tag The pure tag name + * @param int $k Counter for internal use + */ + private static function arrAddHashes(&$tag, $k) { + $tag = "#" . $tag; + } + + private static function deleteNode(&$doc, $node) { + $xpath = new \DomXPath($doc); + $list = $xpath->query("//".$node); + foreach ($list as $child) { + $child->parentNode->removeChild($child); + } + } + + private static function completeUrl($url, $scheme) { + $urlarr = parse_url($url); + + if (isset($urlarr["scheme"])) { + return($url); + } + + $schemearr = parse_url($scheme); + + $complete = $schemearr["scheme"]."://".$schemearr["host"]; + + if (@$schemearr["port"] != "") { + $complete .= ":".$schemearr["port"]; + } + + if (strpos($urlarr["path"],"/") !== 0) { + $complete .= "/"; + } + + $complete .= $urlarr["path"]; + + if (@$urlarr["query"] != "") { + $complete .= "?".$urlarr["query"]; + } + + if (@$urlarr["fragment"] != "") { + $complete .= "#".$urlarr["fragment"]; + } + + return($complete); + } +} diff --git a/include/items.php b/include/items.php index 9b199aed3..e9354b62d 100644 --- a/include/items.php +++ b/include/items.php @@ -1,5 +1,11 @@ type == "error") OR ($no_rich_type AND ($j->type == "rich"))) { - require_once("mod/parse_url.php"); - $data = parseurl_getsiteinfo_cached($embedurl, true, false); + $data = ParseUrl::getSiteinfoCached($embedurl, true, false); $j->type = $data["type"]; if ($j->type == "photo") { @@ -143,12 +151,11 @@ function oembed_fetch_url($embedurl, $no_rich_type = false){ function oembed_format_object($j){ require_once("mod/proxy.php"); - $a = get_app(); $embedurl = $j->embedurl; $jhtml = oembed_iframe($j->embedurl,(isset($j->width) ? $j->width : null), (isset($j->height) ? $j->height : null) ); $ret=""; switch ($j->type) { - case "video": { + case "video": if (isset($j->thumbnail_url)) { $tw = (isset($j->thumbnail_width) && intval($j->thumbnail_width)) ? $j->thumbnail_width:200; $th = (isset($j->thumbnail_height) && intval($j->thumbnail_height)) ? $j->thumbnail_height:180; @@ -158,7 +165,7 @@ function oembed_format_object($j){ $th=120; $tw = $th*$tr; $tpl=get_markup_template('oembed_video.tpl'); $ret.=replace_macros($tpl, array( - '$baseurl' => $a->get_baseurl(), + '$baseurl' => App::get_baseurl(), '$embedurl'=>$embedurl, '$escapedhtml'=>base64_encode($jhtml), '$tw'=>$tw, @@ -170,43 +177,49 @@ function oembed_format_object($j){ $ret=$jhtml; } //$ret.="
"; - }; break; - case "photo": { + break; + case "photo": $ret.= ""; - }; break; - case "link": { - }; break; - case "rich": { + break; + case "link": + break; + case "rich": // not so safe.. - if (!get_config("system","no_oembed_rich_content")) + if (!Config::get("system","no_oembed_rich_content")) { $ret.= proxy_parse_html($jhtml); - }; break; + } + break; } // add link to source if not present in "rich" type if ($j->type!='rich' || !strpos($j->html,$embedurl) ){ $ret .= "

"; if (isset($j->title)) { - if (isset($j->provider_name)) + if (isset($j->provider_name)) { $ret .= $j->provider_name.": "; + } $embedlink = (isset($j->title))?$j->title:$embedurl; $ret .= "$embedlink"; - if (isset($j->author_name)) + if (isset($j->author_name)) { $ret.=" (".$j->author_name.")"; + } } elseif (isset($j->provider_name) OR isset($j->author_name)) { $embedlink = ""; - if (isset($j->provider_name)) + if (isset($j->provider_name)) { $embedlink .= $j->provider_name; + } if (isset($j->author_name)) { - if ($embedlink != "") + if ($embedlink != "") { $embedlink .= ": "; + } $embedlink .= $j->author_name; } - if (trim($embedlink) == "") + if (trim($embedlink) == "") { $embedlink = $embedurl; + } $ret .= "$embedlink"; } @@ -247,15 +260,14 @@ function oembed_iframe($src, $width, $height) { } $width = '100%'; - $a = get_app(); - $s = $a->get_baseurl() . '/oembed/'.base64url_encode($src); + $s = App::get_baseurl() . '/oembed/'.base64url_encode($src); return ''; } function oembed_bbcode2html($text){ - $stopoembed = get_config("system","no_oembed"); + $stopoembed = Config::get("system","no_oembed"); if ($stopoembed == true){ return preg_replace("/\[embed\](.+?)\[\/embed\]/is", "". t('Embedding disabled') ." : $1" ,$text); } @@ -268,13 +280,13 @@ function oe_build_xpath($attr, $value){ return "contains( normalize-space( @$attr ), ' $value ' ) or substring( normalize-space( @$attr ), 1, string-length( '$value' ) + 1 ) = '$value ' or substring( normalize-space( @$attr ), string-length( @$attr ) - string-length( '$value' ) ) = ' $value' or @$attr = '$value'"; } -function oe_get_inner_html( $node ) { - $innerHTML= ''; - $children = $node->childNodes; - foreach ($children as $child) { - $innerHTML .= $child->ownerDocument->saveXML( $child ); - } - return $innerHTML; +function oe_get_inner_html($node) { + $innerHTML= ''; + $children = $node->childNodes; + foreach ($children as $child) { + $innerHTML .= $child->ownerDocument->saveXML($child); + } + return $innerHTML; } /** @@ -283,15 +295,16 @@ function oe_get_inner_html( $node ) { */ function oembed_html2bbcode($text) { // start parser only if 'oembed' is in text - if (strpos($text, "oembed")){ + if (strpos($text, "oembed")) { // convert non ascii chars to html entities $html_text = mb_convert_encoding($text, 'HTML-ENTITIES', mb_detect_encoding($text)); // If it doesn't parse at all, just return the text. $dom = @DOMDocument::loadHTML($html_text); - if(! $dom) + if (! $dom) { return $text; + } $xpath = new DOMXPath($dom); $attr = "oembed"; diff --git a/include/plaintext.php b/include/plaintext.php index 539ef020d..d98d73655 100644 --- a/include/plaintext.php +++ b/include/plaintext.php @@ -1,6 +1,15 @@ query("//".$node); - foreach ($list as $child) - $child->parentNode->removeChild($child); - } -} +use \Friendica\ParseUrl; -function completeurl($url, $scheme) { - $urlarr = parse_url($url); - - if (isset($urlarr["scheme"])) - return($url); - - $schemearr = parse_url($scheme); - - $complete = $schemearr["scheme"]."://".$schemearr["host"]; - - if (@$schemearr["port"] != "") - $complete .= ":".$schemearr["port"]; - - if(strpos($urlarr['path'],'/') !== 0) - $complete .= '/'; - - $complete .= $urlarr["path"]; - - if (@$urlarr["query"] != "") - $complete .= "?".$urlarr["query"]; - - if (@$urlarr["fragment"] != "") - $complete .= "#".$urlarr["fragment"]; - - return($complete); -} - -function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) { - - if ($url == "") - return false; - - $r = q("SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d", - dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed)); - - if ($r) - $data = $r[0]["content"]; - - if (!is_null($data)) { - $data = unserialize($data); - return $data; - } - - $data = parseurl_getsiteinfo($url, $no_guessing, $do_oembed); - - q("INSERT INTO `parsed_url` (`url`, `guessing`, `oembed`, `content`, `created`) VALUES ('%s', %d, %d, '%s', '%s') - ON DUPLICATE KEY UPDATE `content` = '%s', `created` = '%s'", - dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed), - dbesc(serialize($data)), dbesc(datetime_convert()), - dbesc(serialize($data)), dbesc(datetime_convert())); - - return $data; -} - -function parseurl_getsiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) { - require_once("include/network.php"); - require_once("include/Photo.php"); - - $a = get_app(); - - $siteinfo = array(); - - // Check if the URL does contain a scheme - $scheme = parse_url($url, PHP_URL_SCHEME); - - if ($scheme == "") { - $url = "http://".trim($url, "/"); - } - - if ($count > 10) { - logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG); - return($siteinfo); - } - - $url = trim($url, "'"); - $url = trim($url, '"'); - - $url = original_url($url); - - $siteinfo["url"] = $url; - $siteinfo["type"] = "link"; - - $check_cert = get_config('system','verifyssl'); - - $stamp1 = microtime(true); - - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 1); - curl_setopt($ch, CURLOPT_TIMEOUT, 3); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); - - $header = curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info['http_code']; - curl_close($ch); - - $a->save_timestamp($stamp1, "network"); - - if ((($curl_info['http_code'] == "301") OR ($curl_info['http_code'] == "302") OR ($curl_info['http_code'] == "303") OR ($curl_info['http_code'] == "307")) - AND (($curl_info['redirect_url'] != "") OR ($curl_info['location'] != ""))) { - if ($curl_info['redirect_url'] != "") - $siteinfo = parseurl_getsiteinfo($curl_info['redirect_url'], $no_guessing, $do_oembed, ++$count); - else - $siteinfo = parseurl_getsiteinfo($curl_info['location'], $no_guessing, $do_oembed, ++$count); - return($siteinfo); - } - - // if the file is too large then exit - if ($curl_info["download_content_length"] > 1000000) - return($siteinfo); - - // if it isn't a HTML file then exit - if (($curl_info["content_type"] != "") AND !strstr(strtolower($curl_info["content_type"]),"html")) - return($siteinfo); - - if ($do_oembed) { - require_once("include/oembed.php"); - - $oembed_data = oembed_fetch_url($url); - - if (!in_array($oembed_data->type, array("error", "rich"))) { - $siteinfo["type"] = $oembed_data->type; - } - - if (($oembed_data->type == "link") AND ($siteinfo["type"] != "photo")) { - if (isset($oembed_data->title)) - $siteinfo["title"] = $oembed_data->title; - if (isset($oembed_data->description)) - $siteinfo["text"] = trim($oembed_data->description); - if (isset($oembed_data->thumbnail_url)) - $siteinfo["image"] = $oembed_data->thumbnail_url; - } - } - - $stamp1 = microtime(true); - - // Now fetch the body as well - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 0); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); - - $header = curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info['http_code']; - curl_close($ch); - - $a->save_timestamp($stamp1, "network"); - - // Fetch the first mentioned charset. Can be in body or header - $charset = ""; - if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) - $charset = trim(trim(trim(array_pop($matches)), ';,')); - - if ($charset == "") - $charset = "utf-8"; - - $pos = strpos($header, "\r\n\r\n"); - - if ($pos) - $body = trim(substr($header, $pos)); - else - $body = $header; - - if (($charset != '') AND (strtoupper($charset) != "UTF-8")) { - logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG); - //$body = mb_convert_encoding($body, "UTF-8", $charset); - $body = iconv($charset, "UTF-8//TRANSLIT", $body); - } - - $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); - - $doc = new DOMDocument(); - @$doc->loadHTML($body); - - deletenode($doc, 'style'); - deletenode($doc, 'script'); - deletenode($doc, 'option'); - deletenode($doc, 'h1'); - deletenode($doc, 'h2'); - deletenode($doc, 'h3'); - deletenode($doc, 'h4'); - deletenode($doc, 'h5'); - deletenode($doc, 'h6'); - deletenode($doc, 'ol'); - deletenode($doc, 'ul'); - - $xpath = new DomXPath($doc); - - $list = $xpath->query("//meta[@content]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - if (@$attr["http-equiv"] == 'refresh') { - $path = $attr["content"]; - $pathinfo = explode(";", $path); - $content = ""; - foreach ($pathinfo AS $value) { - if (substr(strtolower($value), 0, 4) == "url=") - $content = substr($value, 4); - } - if ($content != "") { - $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count); - return($siteinfo); - } - } - } - - $list = $xpath->query("//title"); - if ($list->length > 0) - $siteinfo["title"] = $list->item(0)->nodeValue; - - //$list = $xpath->query("head/meta[@name]"); - $list = $xpath->query("//meta[@name]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); - - if ($attr["content"] != "") - switch (strtolower($attr["name"])) { - case "fulltitle": - $siteinfo["title"] = $attr["content"]; - break; - case "description": - $siteinfo["text"] = $attr["content"]; - break; - case "thumbnail": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:image": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:image:src": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:card": - if (($siteinfo["type"] == "") OR ($attr["content"] == "photo")) - $siteinfo["type"] = $attr["content"]; - break; - case "twitter:description": - $siteinfo["text"] = $attr["content"]; - break; - case "twitter:title": - $siteinfo["title"] = $attr["content"]; - break; - case "dc.title": - $siteinfo["title"] = $attr["content"]; - break; - case "dc.description": - $siteinfo["text"] = $attr["content"]; - break; - case "keywords": - $keywords = explode(",", $attr["content"]); - break; - case "news_keywords": - $keywords = explode(",", $attr["content"]); - break; - } - if ($siteinfo["type"] == "summary") - $siteinfo["type"] = "link"; - } - - if (isset($keywords)) { - $siteinfo["keywords"] = array(); - foreach ($keywords as $keyword) - if (!in_array(trim($keyword), $siteinfo["keywords"])) - $siteinfo["keywords"][] = trim($keyword); - } - - //$list = $xpath->query("head/meta[@property]"); - $list = $xpath->query("//meta[@property]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); - - if ($attr["content"] != "") - switch (strtolower($attr["property"])) { - case "og:image": - $siteinfo["image"] = $attr["content"]; - break; - case "og:title": - $siteinfo["title"] = $attr["content"]; - break; - case "og:description": - $siteinfo["text"] = $attr["content"]; - break; - } - } - - if ((@$siteinfo["image"] == "") AND !$no_guessing) { - $list = $xpath->query("//img[@src]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - $src = completeurl($attr["src"], $url); - $photodata = get_photo_info($src); - - if (($photodata) && ($photodata[0] > 150) and ($photodata[1] > 150)) { - if ($photodata[0] > 300) { - $photodata[1] = round($photodata[1] * (300 / $photodata[0])); - $photodata[0] = 300; - } - if ($photodata[1] > 300) { - $photodata[0] = round($photodata[0] * (300 / $photodata[1])); - $photodata[1] = 300; - } - $siteinfo["images"][] = array("src"=>$src, - "width"=>$photodata[0], - "height"=>$photodata[1]); - } - - } - } elseif ($siteinfo["image"] != "") { - $src = completeurl($siteinfo["image"], $url); - - unset($siteinfo["image"]); - - $photodata = get_photo_info($src); - - if (($photodata) && ($photodata[0] > 10) and ($photodata[1] > 10)) - $siteinfo["images"][] = array("src"=>$src, - "width"=>$photodata[0], - "height"=>$photodata[1]); - } - - if ((@$siteinfo["text"] == "") AND (@$siteinfo["title"] != "") AND !$no_guessing) { - $text = ""; - - $list = $xpath->query("//div[@class='article']"); - foreach ($list as $node) - if (strlen($node->nodeValue) > 40) - $text .= " ".trim($node->nodeValue); - - if ($text == "") { - $list = $xpath->query("//div[@class='content']"); - foreach ($list as $node) - if (strlen($node->nodeValue) > 40) - $text .= " ".trim($node->nodeValue); - } - - // If none text was found then take the paragraph content - if ($text == "") { - $list = $xpath->query("//p"); - foreach ($list as $node) - if (strlen($node->nodeValue) > 40) - $text .= " ".trim($node->nodeValue); - } - - if ($text != "") { - $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text)); - - while (strpos($text, " ")) - $text = trim(str_replace(" ", " ", $text)); - - $siteinfo["text"] = trim(html_entity_decode(substr($text,0,350), ENT_QUOTES, "UTF-8").'...'); - } - } - - logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG); - - call_hooks('getsiteinfo', $siteinfo); - - return($siteinfo); -} - -function arr_add_hashes(&$item,$k) { - $item = '#' . $item; -} +require_once("include/items.php"); function parse_url_content(&$a) { - require_once("include/items.php"); - $text = null; - $str_tags = ''; + $str_tags = ""; $textmode = false; - if(local_user() && (! feature_enabled(local_user(),'richtext'))) + if (local_user() && (!feature_enabled(local_user(), "richtext"))) { $textmode = true; + } - //if($textmode) - $br = (($textmode) ? "\n" : '
'); + $br = (($textmode) ? "\n" : "
"); - if(x($_GET,'binurl')) - $url = trim(hex2bin($_GET['binurl'])); - else - $url = trim($_GET['url']); + if (x($_GET,"binurl")) { + $url = trim(hex2bin($_GET["binurl"])); + } else { + $url = trim($_GET["url"]); + } - if($_GET['title']) - $title = strip_tags(trim($_GET['title'])); + if ($_GET["title"]) { + $title = strip_tags(trim($_GET["title"])); + } - if($_GET['description']) - $text = strip_tags(trim($_GET['description'])); + if ($_GET["description"]) { + $text = strip_tags(trim($_GET["description"])); + } - if($_GET['tags']) { - $arr_tags = str_getcsv($_GET['tags']); - if(count($arr_tags)) { - array_walk($arr_tags,'arr_add_hashes'); - $str_tags = $br . implode(' ',$arr_tags) . $br; + if ($_GET["tags"]) { + $arr_tags = ParseUrl::convertTagsToArray($_GET["tags"]); + if (count($arr_tags)) { + $str_tags = $br . implode(" ", $arr_tags) . $br; } } - // add url scheme if missing + // Add url scheme if it is missing $arrurl = parse_url($url); - if (!x($arrurl, 'scheme')) { - if (x($arrurl, 'host')) + if (!x($arrurl, "scheme")) { + if (x($arrurl, "host")) { $url = "http:".$url; - else + } else { $url = "http://".$url; + } } - logger('parse_url: ' . $url); + logger("prse_url: " . $url); - if($textmode) - $template = '[bookmark=%s]%s[/bookmark]%s'; - else + // If the URL is a image, video or audio file format the URL with the corresponding + // BBCode media tag + $redirects = 0; + // Fetch the header of the URL + $result = z_fetch_url($url, false, $redirects, array("novalidate" => true, "nobody" => true)); + if($result["success"]) { + // Convert the header fields into an array + $hdrs = array(); + $h = explode("\n", $result["header"]); + foreach ($h as $l) { + list($k,$v) = array_map("trim", explode(":", trim($l), 2)); + $hdrs[$k] = $v; + } + if (array_key_exists("Content-Type", $hdrs)) { + $type = $hdrs["Content-Type"]; + } + if ($type) { + if(stripos($type, "image/") !== false) { + echo $br . "[img]" . $url . "[/img]" . $br; + killme(); + } + if (stripos($type, "video/") !== false) { + echo $br . "[video]" . $url . "[/video]" . $br; + killme(); + } + if (stripos($type, "audio/") !== false) { + echo $br . "[audio]" . $url . "[/audio]" . $br; + killme(); + } + } + } + + if ($textmode) { + $template = "[bookmark=%s]%s[/bookmark]%s"; + } else { $template = "%s%s"; + } - $arr = array('url' => $url, 'text' => ''); + $arr = array("url" => $url, "text" => ""); - call_hooks('parse_link', $arr); + call_hooks("parse_link", $arr); - if(strlen($arr['text'])) { - echo $arr['text']; + if (strlen($arr["text"])) { + echo $arr["text"]; killme(); } - if($url && $title && $text) { + if ($url && $title && $text) { - $title = str_replace(array("\r","\n"),array('',''),$title); + $title = str_replace(array("\r","\n"),array("",""),$title); - if($textmode) - $text = '[quote]' . trim($text) . '[/quote]' . $br; - else { - $text = '
' . htmlspecialchars(trim($text)) . '

'; + if ($textmode) { + $text = "[quote]" . trim($text) . "[/quote]" . $br; + } else { + $text = "
" . htmlspecialchars(trim($text)) . "

"; $title = htmlspecialchars($title); } - $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags; + $result = sprintf($template, $url, ($title) ? $title : $url, $text) . $str_tags; - logger('parse_url (unparsed): returns: ' . $result); + logger("parse_url (unparsed): returns: " . $result); echo $result; killme(); } - $siteinfo = parseurl_getsiteinfo($url); + // Fetch the information from the webpage + $siteinfo = ParseUrl::getSiteinfo($url); unset($siteinfo["keywords"]); + // Format it as BBCode attachment $info = add_page_info_data($siteinfo); - if (!$textmode) + if (!$textmode) { // Replace ' with ’ - not perfect - but the richtext editor has problems otherwise $info = str_replace(array("'"), array("’"), $info); + } echo $info; killme(); } -?> From 26664c22e0544ccda1405f6aafd890245bf29201 Mon Sep 17 00:00:00 2001 From: rabuzarus Date: Fri, 25 Nov 2016 16:59:31 +0100 Subject: [PATCH 2/7] parse_url: add some docu --- mod/parse_url.php | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mod/parse_url.php b/mod/parse_url.php index a6b7cd502..6104f6caa 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -1,6 +1,15 @@ true, "nobody" => true)); @@ -114,7 +123,8 @@ function parse_url_content(&$a) { killme(); } - + // If there is allready some content information submitted we don't + // need to parse the url for content. if ($url && $title && $text) { $title = str_replace(array("\r","\n"),array("",""),$title); @@ -134,7 +144,7 @@ function parse_url_content(&$a) { killme(); } - // Fetch the information from the webpage + // Fetch the information directly from the webpage $siteinfo = ParseUrl::getSiteinfo($url); unset($siteinfo["keywords"]); From c96a53b4b4ca2d46e9b4a61ff454196d2fcc054e Mon Sep 17 00:00:00 2001 From: rabuzarus <> Date: Sat, 26 Nov 2016 04:10:50 +0100 Subject: [PATCH 3/7] parse_url: fix typo --- mod/parse_url.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mod/parse_url.php b/mod/parse_url.php index 6104f6caa..0a9b096cb 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -8,7 +8,7 @@ * on the user settings - default is BBCode output). * If the user has enabled the richtext editor setting the output will be in html * (Note: This is not always possible and in some case not useful because - * the richtext editor doesn't support all ind of html). + * the richtext editor doesn't support all kind of html). * Otherwise the output will be constructed BBCode. * * @todo https://developers.google.com/+/plugins/snippet/ From 56e38dd6bd404c8c98c9dbda87769887e567048f Mon Sep 17 00:00:00 2001 From: rabuzarus Date: Sun, 27 Nov 2016 20:19:43 +0100 Subject: [PATCH 4/7] move function deletenode() to the xml class --- include/ParseUrl.php | 31 ++++++++++++------------------- include/html2bbcode.php | 40 +++++++++++++++++----------------------- include/xml.php | 18 ++++++++++++++++-- 3 files changed, 45 insertions(+), 44 deletions(-) diff --git a/include/ParseUrl.php b/include/ParseUrl.php index 8a3392e73..834c64475 100644 --- a/include/ParseUrl.php +++ b/include/ParseUrl.php @@ -12,6 +12,7 @@ use \Friendica\Core\Config; require_once("include/network.php"); require_once("include/Photo.php"); require_once("include/oembed.php"); +require_once("include/xml.php"); /** * @brief Class with methods for extracting certain content from an url @@ -184,17 +185,17 @@ class ParseUrl { $doc = new \DOMDocument(); @$doc->loadHTML($body); - self::deleteNode($doc, "style"); - self::deleteNode($doc, "script"); - self::deleteNode($doc, "option"); - self::deleteNode($doc, "h1"); - self::deleteNode($doc, "h2"); - self::deleteNode($doc, "h3"); - self::deleteNode($doc, "h4"); - self::deleteNode($doc, "h5"); - self::deleteNode($doc, "h6"); - self::deleteNode($doc, "ol"); - self::deleteNode($doc, "ul"); + \xml::deleteNode($doc, "style"); + \xml::deleteNode($doc, "script"); + \xml::deleteNode($doc, "option"); + \xml::deleteNode($doc, "h1"); + \xml::deleteNode($doc, "h2"); + \xml::deleteNode($doc, "h3"); + \xml::deleteNode($doc, "h4"); + \xml::deleteNode($doc, "h5"); + \xml::deleteNode($doc, "h6"); + \xml::deleteNode($doc, "ol"); + \xml::deleteNode($doc, "ul"); $xpath = new \DomXPath($doc); @@ -440,14 +441,6 @@ class ParseUrl { $tag = "#" . $tag; } - private static function deleteNode(&$doc, $node) { - $xpath = new \DomXPath($doc); - $list = $xpath->query("//".$node); - foreach ($list as $child) { - $child->parentNode->removeChild($child); - } - } - private static function completeUrl($url, $scheme) { $urlarr = parse_url($url); diff --git a/include/html2bbcode.php b/include/html2bbcode.php index 28e251aee..189ba91f1 100644 --- a/include/html2bbcode.php +++ b/include/html2bbcode.php @@ -1,11 +1,14 @@ query("//".$node); - foreach ($list as $child) - $child->parentNode->removeChild($child); -}} - function _replace_code_cb($m){ return "".str_replace("\n","
\n",$m[1]). "
"; } @@ -117,12 +111,12 @@ function html2bbcode($message) @$doc->loadHTML($message); - deletenode($doc, 'style'); - deletenode($doc, 'head'); - deletenode($doc, 'title'); - deletenode($doc, 'meta'); - deletenode($doc, 'xml'); - deletenode($doc, 'removeme'); + xml::deleteNode($doc, 'style'); + xml::deleteNode($doc, 'head'); + xml::deleteNode($doc, 'title'); + xml::deleteNode($doc, 'meta'); + xml::deleteNode($doc, 'xml'); + xml::deleteNode($doc, 'removeme'); $xpath = new DomXPath($doc); $list = $xpath->query("//pre"); @@ -239,7 +233,7 @@ function html2bbcode($message) node2bbcode($doc, 'iframe', array('src'=>'/(.+)/'), '[iframe]$1', '[/iframe]'); node2bbcode($doc, 'code', array(), '[code]', '[/code]'); - node2bbcode($doc, 'key', array(), '[code]', '[/code]'); + node2bbcode($doc, 'key', array(), '[code]', '[/code]'); $message = $doc->saveHTML(); diff --git a/include/xml.php b/include/xml.php index 3bb376aba..fd04ed1df 100644 --- a/include/xml.php +++ b/include/xml.php @@ -1,11 +1,12 @@ query("//".$node); + foreach ($list as $child) { + $child->parentNode->removeChild($child); + } + } } -?> From 85b51ee41c66e938d267100d79faedb270442de4 Mon Sep 17 00:00:00 2001 From: rabuzarus <> Date: Sun, 27 Nov 2016 20:42:40 +0100 Subject: [PATCH 5/7] xml:php - some code standard work --- include/xml.php | 139 ++++++++++++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 58 deletions(-) diff --git a/include/xml.php b/include/xml.php index fd04ed1df..9f7de8f33 100644 --- a/include/xml.php +++ b/include/xml.php @@ -24,15 +24,17 @@ class xml { public static function from_array($array, &$xml, $remove_header = false, $namespaces = array(), $root = true) { if ($root) { - foreach($array as $key => $value) { - foreach ($namespaces AS $nskey => $nsvalue) + foreach ($array as $key => $value) { + foreach ($namespaces AS $nskey => $nsvalue) { $key .= " xmlns".($nskey == "" ? "":":").$nskey.'="'.$nsvalue.'"'; + } if (is_array($value)) { $root = new SimpleXMLElement("<".$key."/>"); self::from_array($value, $root, $remove_header, $namespaces, false); - } else + } else { $root = new SimpleXMLElement("<".$key.">".xmlify($value).""); + } $dom = dom_import_simplexml($root)->ownerDocument; $dom->formatOutput = true; @@ -40,16 +42,18 @@ class xml { $xml_text = $dom->saveXML(); - if ($remove_header) + if ($remove_header) { $xml_text = trim(substr($xml_text, 21)); + } return $xml_text; } } foreach($array as $key => $value) { - if (!isset($element) AND isset($xml)) + if (!isset($element) AND isset($xml)) { $element = $xml; + } if (is_integer($key)) { if (isset($element)) { @@ -63,27 +67,31 @@ class xml { } $element_parts = explode(":", $key); - if ((count($element_parts) > 1) AND isset($namespaces[$element_parts[0]])) + if ((count($element_parts) > 1) AND isset($namespaces[$element_parts[0]])) { $namespace = $namespaces[$element_parts[0]]; - elseif (isset($namespaces[""])) { + } elseif (isset($namespaces[""])) { $namespace = $namespaces[""]; - } else + } else { $namespace = NULL; + } // Remove undefined namespaces from the key - if ((count($element_parts) > 1) AND is_null($namespace)) + if ((count($element_parts) > 1) AND is_null($namespace)) { $key = $element_parts[1]; + } if (substr($key, 0, 11) == "@attributes") { - if (!isset($element) OR !is_array($value)) + if (!isset($element) OR !is_array($value)) { continue; + } foreach ($value as $attr_key => $attr_value) { $element_parts = explode(":", $attr_key); - if ((count($element_parts) > 1) AND isset($namespaces[$element_parts[0]])) + if ((count($element_parts) > 1) AND isset($namespaces[$element_parts[0]])) { $namespace = $namespaces[$element_parts[0]]; - else + } else { $namespace = NULL; + } $element->addAttribute($attr_key, $attr_value, $namespace); } @@ -91,9 +99,9 @@ class xml { continue; } - if (!is_array($value)) + if (!is_array($value)) { $element = $xml->addChild($key, xmlify($value), $namespace); - elseif (is_array($value)) { + } elseif (is_array($value)) { $element = $xml->addChild($key, NULL, $namespace); self::from_array($value, $element, $remove_header, $namespaces, false); } @@ -112,8 +120,9 @@ class xml { $target->addChild($elementname, xmlify($source)); else { $child = $target->addChild($elementname); - foreach ($source->children() AS $childfield => $childentry) + foreach ($source->children() AS $childfield => $childentry) { self::copy($childentry, $child, $childfield); + } } } @@ -169,11 +178,11 @@ class xml { return(null); } - if (!is_string($xml_element) && - !is_array($xml_element) && - (get_class($xml_element) == 'SimpleXMLElement')) { - $xml_element_copy = $xml_element; - $xml_element = get_object_vars($xml_element); + if (!is_string($xml_element) + && !is_array($xml_element) + && (get_class($xml_element) == 'SimpleXMLElement')) { + $xml_element_copy = $xml_element; + $xml_element = get_object_vars($xml_element); } if (is_array($xml_element)) { @@ -182,7 +191,7 @@ class xml { return (trim(strval($xml_element_copy))); } - foreach($xml_element as $key=>$value) { + foreach ($xml_element as $key => $value) { $recursion_depth++; $result_array[strtolower($key)] = @@ -224,10 +233,12 @@ class xml { * * @return array The parsed XML in an array form. Use print_r() to see the resulting array structure. */ - public static function to_array($contents, $namespaces = true, $get_attributes=1, $priority = 'attribute') { - if(!$contents) return array(); + public static function to_array($contents, $namespaces = true, $get_attributes = 1, $priority = 'attribute') { + if (!$contents) { + return array(); + } - if(!function_exists('xml_parser_create')) { + if (!function_exists('xml_parser_create')) { logger('xml::to_array: parser function missing'); return array(); } @@ -236,12 +247,13 @@ class xml { libxml_use_internal_errors(true); libxml_clear_errors(); - if($namespaces) + if ($namespaces) { $parser = @xml_parser_create_ns("UTF-8",':'); - else + } else { $parser = @xml_parser_create(); + } - if(! $parser) { + if (! $parser) { logger('xml::to_array: xml_parser_create: no resource'); return array(); } @@ -253,10 +265,11 @@ class xml { @xml_parse_into_struct($parser, trim($contents), $xml_values); @xml_parser_free($parser); - if(! $xml_values) { + if (! $xml_values) { logger('xml::to_array: libxml: parse error: ' . $contents, LOGGER_DATA); - foreach(libxml_get_errors() as $err) + foreach (libxml_get_errors() as $err) { logger('libxml: parse: ' . $err->code . " at " . $err->line . ":" . $err->column . " : " . $err->message, LOGGER_DATA); + } libxml_clear_errors(); return; } @@ -271,8 +284,8 @@ class xml { // Go through the tags. $repeated_tag_index = array(); // Multiple tags with same name will be turned into an array - foreach($xml_values as $data) { - unset($attributes,$value); // Remove existing values, or there will be trouble + foreach ($xml_values as $data) { + unset($attributes, $value); // Remove existing values, or there will be trouble // This command will extract these variables into the foreach scope // tag(string), type(string), level(int), attributes(array). @@ -281,46 +294,54 @@ class xml { $result = array(); $attributes_data = array(); - if(isset($value)) { - if($priority == 'tag') $result = $value; - else $result['value'] = $value; // Put the value in a assoc array if we are in the 'Attribute' mode + if (isset($value)) { + if ($priority == 'tag') { + $result = $value; + } else { + $result['value'] = $value; // Put the value in a assoc array if we are in the 'Attribute' mode + } } //Set the attributes too. - if(isset($attributes) and $get_attributes) { - foreach($attributes as $attr => $val) { - if($priority == 'tag') $attributes_data[$attr] = $val; - else $result['@attributes'][$attr] = $val; // Set all the attributes in a array called 'attr' + if (isset($attributes) and $get_attributes) { + foreach ($attributes as $attr => $val) { + if($priority == 'tag') { + $attributes_data[$attr] = $val; + } else { + $result['@attributes'][$attr] = $val; // Set all the attributes in a array called 'attr' + } } } // See tag status and do the needed. - if($namespaces && strpos($tag,':')) { - $namespc = substr($tag,0,strrpos($tag,':')); - $tag = strtolower(substr($tag,strlen($namespc)+1)); + if ($namespaces && strpos($tag, ':')) { + $namespc = substr($tag, 0, strrpos($tag, ':')); + $tag = strtolower(substr($tag, strlen($namespc)+1)); $result['@namespace'] = $namespc; } $tag = strtolower($tag); - if($type == "open") { // The starting of the tag '' + if ($type == "open") { // The starting of the tag '' $parent[$level-1] = &$current; - if(!is_array($current) or (!in_array($tag, array_keys($current)))) { // Insert New tag + if (!is_array($current) or (!in_array($tag, array_keys($current)))) { // Insert New tag $current[$tag] = $result; - if($attributes_data) $current[$tag. '_attr'] = $attributes_data; + if ($attributes_data) { + $current[$tag. '_attr'] = $attributes_data; + } $repeated_tag_index[$tag.'_'.$level] = 1; $current = &$current[$tag]; } else { // There was another element with the same tag name - if(isset($current[$tag][0])) { // If there is a 0th element it is already an array + if (isset($current[$tag][0])) { // If there is a 0th element it is already an array $current[$tag][$repeated_tag_index[$tag.'_'.$level]] = $result; $repeated_tag_index[$tag.'_'.$level]++; } else { // This section will make the value an array if multiple tags with the same name appear together - $current[$tag] = array($current[$tag],$result); // This will combine the existing item and the new item together to make an array + $current[$tag] = array($current[$tag], $result); // This will combine the existing item and the new item together to make an array $repeated_tag_index[$tag.'_'.$level] = 2; - if(isset($current[$tag.'_attr'])) { // The attribute of the last(0th) tag must be moved as well + if (isset($current[$tag.'_attr'])) { // The attribute of the last(0th) tag must be moved as well $current[$tag]['0_attr'] = $current[$tag.'_attr']; unset($current[$tag.'_attr']); } @@ -330,35 +351,37 @@ class xml { $current = &$current[$tag][$last_item_index]; } - } elseif($type == "complete") { // Tags that ends in 1 line '' + } elseif ($type == "complete") { // Tags that ends in 1 line '' //See if the key is already taken. - if(!isset($current[$tag])) { //New Key + if (!isset($current[$tag])) { //New Key $current[$tag] = $result; $repeated_tag_index[$tag.'_'.$level] = 1; - if($priority == 'tag' and $attributes_data) $current[$tag. '_attr'] = $attributes_data; + if ($priority == 'tag' and $attributes_data) { + $current[$tag. '_attr'] = $attributes_data; + } } else { // If taken, put all things inside a list(array) - if(isset($current[$tag][0]) and is_array($current[$tag])) { // If it is already an array... + if (isset($current[$tag][0]) and is_array($current[$tag])) { // If it is already an array... // ...push the new element into that array. $current[$tag][$repeated_tag_index[$tag.'_'.$level]] = $result; - if($priority == 'tag' and $get_attributes and $attributes_data) { + if ($priority == 'tag' and $get_attributes and $attributes_data) { $current[$tag][$repeated_tag_index[$tag.'_'.$level] . '_attr'] = $attributes_data; } $repeated_tag_index[$tag.'_'.$level]++; } else { // If it is not an array... - $current[$tag] = array($current[$tag],$result); //...Make it an array using using the existing value and the new value + $current[$tag] = array($current[$tag], $result); //...Make it an array using using the existing value and the new value $repeated_tag_index[$tag.'_'.$level] = 1; - if($priority == 'tag' and $get_attributes) { - if(isset($current[$tag.'_attr'])) { // The attribute of the last(0th) tag must be moved as well + if ($priority == 'tag' and $get_attributes) { + if (isset($current[$tag.'_attr'])) { // The attribute of the last(0th) tag must be moved as well $current[$tag]['0_attr'] = $current[$tag.'_attr']; unset($current[$tag.'_attr']); } - if($attributes_data) { + if ($attributes_data) { $current[$tag][$repeated_tag_index[$tag.'_'.$level] . '_attr'] = $attributes_data; } } @@ -366,7 +389,7 @@ class xml { } } - } elseif($type == 'close') { // End of tag '' + } elseif ($type == 'close') { // End of tag '' $current = &$parent[$level-1]; } } @@ -381,7 +404,7 @@ class xml { * @param string $node Node name */ public static function deleteNode(&$doc, $node) { - $xpath = new \DomXPath($doc); + $xpath = new DomXPath($doc); $list = $xpath->query("//".$node); foreach ($list as $child) { $child->parentNode->removeChild($child); From f229d65f8535c63e97df980bf42db6639203590e Mon Sep 17 00:00:00 2001 From: rabuzarus <> Date: Sun, 27 Nov 2016 23:41:55 +0100 Subject: [PATCH 6/7] ParseUrl: some docu work --- include/ParseUrl.php | 79 +++++++++++++++++++++++++++++++++++++++++++- mod/parse_url.php | 15 ++------- 2 files changed, 80 insertions(+), 14 deletions(-) diff --git a/include/ParseUrl.php b/include/ParseUrl.php index 834c64475..549d705da 100644 --- a/include/ParseUrl.php +++ b/include/ParseUrl.php @@ -19,6 +19,28 @@ require_once("include/xml.php"); */ class ParseUrl { + /** + * @brief Search for chached embeddable data of an url otherwise fetch it + * + * @param type $url The url of the page which should be scraped + * @param type $no_guessing If true the parse doens't search for + * preview pictures + * @param type $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops + * + * @return array which contains needed data for embedding + * string 'url' => The url of the parsed page + * string 'type' => Content type + * string 'title' => The title of the content + * string 'text' => The description for the content + * string 'image' => A preview image of the content (only available + * if $no_geuessing = false + * array'images' = Array of preview pictures + * string 'keywords' => The tags which belong to the content + * + * @see ParseUrl::getSiteinfo() for more information about scraping + * embeddable content + */ public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) { if ($url == "") { @@ -47,7 +69,46 @@ class ParseUrl { return $data; } - + /** + * @brief Parse a page for embeddable content information + * + * This method parses to url for meta data which can be used to embed + * the content. If available it prioritizes Open Graph meta tags. + * If this is not available it uses the twitter cards meta tags. + * As fallback it uses standard html elements with meta informations + * like \Awesome Title\ or + * \ + * + * @param type $url The url of the page which should be scraped + * @param type $no_guessing If true the parse doens't search for + * preview pictures + * @param type $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops + * @param type $count Internal counter to avoid endless loops + * + * @return array which contains needed data for embedding + * string 'url' => The url of the parsed page + * string 'type' => Content type + * string 'title' => The title of the content + * string 'text' => The description for the content + * string 'image' => A preview image of the content (only available + * if $no_geuessing = false + * array'images' = Array of preview pictures + * string 'keywords' => The tags which belong to the content + * + * @todo https://developers.google.com/+/plugins/snippet/ + * @verbatim + * + * + * + * + * + *

Shiny Trinket

+ * + *

Shiny trinkets are shiny.

+ * + * @endverbatim + */ public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) { $a = get_app(); @@ -441,9 +502,25 @@ class ParseUrl { $tag = "#" . $tag; } + /** + * @brief Add a scheme to an url + * + * The src attribute of some html elements (e.g. images) + * can miss the scheme so we need to add the correct + * scheme + * + * @param string $url The url which possibly does have + * a missing scheme (a link to an image) + * @param string $scheme The url with a correct scheme + * (e.g. the url from the webpage which does contain the image) + * + * @return string The url with a scheme + */ private static function completeUrl($url, $scheme) { $urlarr = parse_url($url); + // If the url does allready have an scheme + // we can stop the process here if (isset($urlarr["scheme"])) { return($url); } diff --git a/mod/parse_url.php b/mod/parse_url.php index 0a9b096cb..4907c20bf 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -1,4 +1,5 @@ - * - * - * - * - *

Shiny Trinket

- * - *

Shiny trinkets are shiny.

- * - * @endverbatim + * @see ParseUrl::getSiteinfo() for more information about scraping embeddable content */ use \Friendica\ParseUrl; From 743378129b82a99ee5b59ca9ad0646ccd813d7df Mon Sep 17 00:00:00 2001 From: rabuzarus <> Date: Mon, 28 Nov 2016 15:26:51 +0100 Subject: [PATCH 7/7] legacy support for function parseurl_getsiteinfo_cached() --- mod/parse_url.php | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mod/parse_url.php b/mod/parse_url.php index 4907c20bf..410e08773 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -150,3 +150,27 @@ function parse_url_content(&$a) { killme(); } + +/** + * @brief Legacy function to call ParseUrl::getSiteinfoCached + * + * Note: We have moved the function to ParseUrl.php. This function is only for + * legacy support and will be remove in the future + * + * @param type $url The url of the page which should be scraped + * @param type $no_guessing If true the parse doens't search for + * preview pictures + * @param type $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops + * + * @return array which contains needed data for embedding + * + * @see ParseUrl::getSiteinfoCached() + * + * @todo Remove this function after all Addons has been changed to use + * ParseUrl::getSiteinfoCached + */ +function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) { + $siteinfo = ParseUrl::getSiteinfoCached($url, $no_guessing, $do_oembed); + return $siteinfo; +}