parse_url: Further improvements of the new method to fetch page data

2012-07-12 23:41:04 +02:00 · 2012-07-12 23:41:04 +02:00 · 02a1fc9cd0
parent 09034ce0ee
commit 02a1fc9cd0
2 changed files with 66 additions and 41 deletions
--- a/include/api.php
+++ b/include/api.php
@ -1727,5 +1727,6 @@ notifications/follow
 notifications/leave
 blocks/exists
 blocks/blocking
 lists
 */
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@ -1,6 +1,4 @@
 <?php
 require_once('include/Photo.php');
 if(!function_exists('deletenode')) {
 	function deletenode(&$doc, $node)
 	{
@ -11,6 +9,30 @@ if(!function_exists('deletenode')) {
 	}
 }
 /**
  * Turn a possibly relative URL into an absolute one, using a base URL.
  *
  * - A URL that already carries a scheme is returned unchanged.
  * - A protocol-relative URL ("//host/path") keeps its own host (and port)
  *   and only borrows the scheme of the base URL.
  * - A path starting with "/" is attached to the scheme/host/port of the base.
  * - A path without a leading "/" is resolved against the directory of the
  *   base URL's path (no "." / ".." normalisation is performed).
  * - An empty path is left empty; query and fragment of $url are appended
  *   when they are present and non-empty.
  *
  * If either argument cannot be parsed, or the base URL lacks a scheme or a
  * host, $url is returned unchanged because there is nothing to complete with.
  *
  * @param string $url    URL to complete (absolute or relative)
  * @param string $scheme Absolute base URL supplying scheme, host and port
  * @return string The completed URL, or $url itself if it cannot be completed
  */
 function completeurl($url, $scheme) {
         $urlarr = parse_url($url);

         // parse_url() returns false on seriously malformed input
         if (!is_array($urlarr))
                 return($url);

         // Already absolute, nothing to do
         if (isset($urlarr["scheme"]))
                 return($url);

         $schemearr = parse_url($scheme);

         if (!is_array($schemearr) || !isset($schemearr["scheme"]) || !isset($schemearr["host"]))
                 return($url);

         $path = (isset($urlarr["path"]) ? $urlarr["path"] : "");

         $complete = $schemearr["scheme"]."://";

         if (isset($urlarr["host"])) {
                 // Protocol-relative reference: keep the authority of $url
                 $complete .= $urlarr["host"];

                 if (isset($urlarr["port"]) && ($urlarr["port"] != ""))
                         $complete .= ":".$urlarr["port"];
         } else {
                 $complete .= $schemearr["host"];

                 if (isset($schemearr["port"]) && ($schemearr["port"] != ""))
                         $complete .= ":".$schemearr["port"];

                 // Relative path: resolve it against the directory of the base path,
                 // otherwise it would be glued directly onto the host name
                 if (($path != "") && (substr($path, 0, 1) != "/")) {
                         $basepath = (isset($schemearr["path"]) ? $schemearr["path"] : "/");
                         $slash = strrpos($basepath, "/");
                         $basedir = (($slash === false) ? "/" : substr($basepath, 0, $slash + 1));
                         $path = $basedir.$path;
                 }
         }

         $complete .= $path;

         if (isset($urlarr["query"]) && ($urlarr["query"] != ""))
                 $complete .= "?".$urlarr["query"];

         if (isset($urlarr["fragment"]) && ($urlarr["fragment"] != ""))
                 $complete .= "#".$urlarr["fragment"];

         return($complete);
 }
 function parseurl_getsiteinfo($url) {
 	$siteinfo = array();
@ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) {
 	$header = curl_exec($ch);
 	curl_close($ch);
-	if (preg_match('/charset=(.*?)\n/', $header, $matches))
+	// Fetch the first mentioned charset. Can be in body or header
 	if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
 		$charset = trim(array_pop($matches));
 	else
 		$charset = "utf-8";
@ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) {
 	$xpath = new DomXPath($doc);
-	$list = $xpath->query("head/title");
+	//$list = $xpath->query("head/title");
 	$list = $xpath->query("//title");
 	foreach ($list as $node)
 		$siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
-	$list = $xpath->query("head/meta[@name]");
+	//$list = $xpath->query("head/meta[@name]");
 	$list = $xpath->query("//meta[@name]");
 	foreach ($list as $node) {
 		$attr = array();
 		if ($node->attributes->length)
@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) {
 		}
 	}
-	$list = $xpath->query("head/meta[@property]");
+	//$list = $xpath->query("head/meta[@property]");
 	$list = $xpath->query("//meta[@property]");
 	foreach ($list as $node) {
 		$attr = array();
 		if ($node->attributes->length)
@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) {
                                foreach ($node->attributes as $attribute)
                                        $attr[$attribute->name] = $attribute->value;
-                        // guess mimetype from headers or filename
+			$src = completeurl($attr["src"], $url);
-                        $type = guess_image_type($attr["src"],true);
+			$photodata = getimagesize($src);
-                        $i = fetch_url($attr["src"]);
+			if (($photodata[0] > 150) and ($photodata[1] > 150)) {
-                        $ph = new Photo($i, $type);
+				if ($photodata[0] > 300) {
-
+					$photodata[1] = $photodata[1] * (300 / $photodata[0]);
-			if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) {
+					$photodata[0] = 300;
-				if ($siteinfo["image"] == "")
+				}
-	                                $siteinfo["image"] = $attr["src"];
+				if ($photodata[1] > 300) {
-
+					$photodata[0] = $photodata[0] * (300 / $photodata[1]);
-				if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
+					$photodata[1] = 300;
-					$ph->scaleImage(300);
+				}
-	                                $siteinfo["images"][] = array("src"=>$attr["src"],
+				$siteinfo["images"][] = array("src"=>$src,
-									"width"=>$ph->getWidth(),
+								"width"=>$photodata[0],
-									"height"=>$ph->getHeight());
+								"height"=>$photodata[1]);
 				} else
 	                                $siteinfo["images"][] = array("src"=>$attr["src"],
 									"width"=>$ph->getWidth(),
 									"height"=>$ph->getHeight());
 			}
                }
        } else {
-		// guess mimetype from headers or filename
+		$src = completeurl($siteinfo["image"], $url);
-                $type = guess_image_type($siteinfo["image"],true);
+		$photodata = getimagesize($src);
-                $i = fetch_url($siteinfo["image"]);
+		if (($photodata[0] > 10) and ($photodata[1] > 10))
-                $ph = new Photo($i, $type);
+			$siteinfo["images"][] = array("src"=>$src,
-
+							"width"=>$photodata[0],
-		if($ph->is_valid())
+							"height"=>$photodata[1]);
 			$siteinfo["images"][] = array("src"=>$siteinfo["image"],
 							"width"=>$ph->getWidth(),
 							"height"=>$ph->getHeight());
 	}
 	if ($siteinfo["text"] == "") {
@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) {
 		$list = $xpath->query("//div[@class='article']");
 		foreach ($list as $node)
-			$text .= " ".trim($node->nodeValue);
+			if (strlen($node->nodeValue) > 40)
 				$text .= " ".trim($node->nodeValue);
 		if ($text == "") {
 			$list = $xpath->query("//div[@class='content']");
 			foreach ($list as $node)
-				$text .= " ".trim($node->nodeValue);
+				if (strlen($node->nodeValue) > 40)
 					$text .= " ".trim($node->nodeValue);
 		}
 		// If none text was found then take the paragraph content
 		if ($text == "") {
 			$list = $xpath->query("//p");
 			foreach ($list as $node)
-				$text .= " ".trim($node->nodeValue);
+				if (strlen($node->nodeValue) > 40)
 					$text .= " ".trim($node->nodeValue);
 		}
 		if ($text != "") {
@ -238,9 +261,9 @@ function parse_url_content(&$a) {
 	if($url && $title && $text) {
 		if($textmode)
-			$text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br;
+			$text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
 		else
-			$text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />';
+			$text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';
 		$title = str_replace(array("\r","\n"),array('',''),$title);
@ -255,7 +278,8 @@ function parse_url_content(&$a) {
 	$siteinfo = parseurl_getsiteinfo($url);
 	if($siteinfo["title"] == "") {
-		echo sprintf($template,$url,$url,'') . $str_tags;
+		echo print_r($siteinfo, true);
 		//echo sprintf($template,$url,$url,'') . $str_tags;
 		killme();
 	} else {
 		$text = $siteinfo["text"];
@ -305,7 +329,7 @@ function parse_url_content(&$a) {
 	}
 	if($image) {
-		$text = $br.$br.$image.$br.$text;
+		$text = $br.$br.$image.$text;
 	}
 	$title = str_replace(array("\r","\n"),array('',''),$title);
@ -313,6 +337,6 @@ function parse_url_content(&$a) {
 	logger('parse_url: returns: ' . $result);
-	echo $result;
+	echo trim($result);
 	killme();
 }