parse_url: Further improvements of the new method to fetch page data
This commit is contained in:
		
					parent
					
						
							
								09034ce0ee
							
						
					
				
			
			
				commit
				
					
						02a1fc9cd0
					
				
			
		
					 2 changed files with 66 additions and 41 deletions
				
			
		|  | @ -1727,5 +1727,6 @@ notifications/follow | |||
| notifications/leave | ||||
| blocks/exists | ||||
| blocks/blocking | ||||
| lists | ||||
| */ | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,4 @@ | |||
| <?php | ||||
| require_once('include/Photo.php'); | ||||
| 
 | ||||
| if(!function_exists('deletenode')) { | ||||
| 	function deletenode(&$doc, $node) | ||||
| 	{ | ||||
|  | @ -11,6 +9,30 @@ if(!function_exists('deletenode')) { | |||
| 	} | ||||
| } | ||||
| 
 | ||||
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * The missing parts (scheme, host, port) are taken from a second, absolute
 * URL - typically the address of the page on which $url was found.
 *
 * Handled forms of $url:
 *  - absolute ("http://host/a.png"): returned unchanged
 *  - protocol-relative ("//host/a.png"): only the scheme is taken from $scheme
 *  - host-relative ("/a.png"): scheme, host and port are taken from $scheme
 *  - path-relative ("a.png"): additionally resolved against the directory
 *    part of the path of $scheme
 *
 * Dot segments ("./", "../") are not collapsed, and a reference without a
 * path (e.g. "?a=b") is appended directly to the host.
 *
 * @param string $url    The URL that should be completed
 * @param string $scheme Absolute URL that provides the missing parts
 *
 * @return string The completed URL, or $url unchanged if it is already
 *                absolute or if either argument can't be used
 */
function completeurl($url, $scheme) {
        $urlarr = parse_url($url);

        // parse_url() returns false on seriously malformed input. In that case
        // - and when the URL already is absolute - there is nothing to complete.
        if (!is_array($urlarr) || isset($urlarr["scheme"]))
                return($url);

        $schemearr = parse_url($scheme);

        // Without scheme and host in the base URL the result would only be garbage
        if (!is_array($schemearr) || !isset($schemearr["scheme"]) || !isset($schemearr["host"]))
                return($url);

        $complete = $schemearr["scheme"]."://";

        if (isset($urlarr["host"])) {
                // Protocol-relative URL: it carries its own host (and maybe port)
                $complete .= $urlarr["host"];

                if (isset($urlarr["port"]))
                        $complete .= ":".$urlarr["port"];
        } else {
                $complete .= $schemearr["host"];

                if (isset($schemearr["port"]))
                        $complete .= ":".$schemearr["port"];
        }

        $path = (isset($urlarr["path"]) ? $urlarr["path"] : "");

        // A path without leading slash is relative to the directory of the base
        // document. Appending it directly to the host would yield an invalid URL.
        if (($path != "") && ($path[0] != "/")) {
                $basedir = "/";

                if (!isset($urlarr["host"]) && isset($schemearr["path"])) {
                        $slash = strrpos($schemearr["path"], "/");
                        if ($slash !== false)
                                $basedir = substr($schemearr["path"], 0, $slash + 1);
                }

                $path = $basedir.$path;
        }

        $complete .= $path;

        if (isset($urlarr["query"]) && ($urlarr["query"] != ""))
                $complete .= "?".$urlarr["query"];

        if (isset($urlarr["fragment"]) && ($urlarr["fragment"] != ""))
                $complete .= "#".$urlarr["fragment"];

        return($complete);
}
| 
 | ||||
| function parseurl_getsiteinfo($url) { | ||||
| 	$siteinfo = array(); | ||||
| 
 | ||||
|  | @ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) { | |||
| 	$header = curl_exec($ch); | ||||
| 	curl_close($ch); | ||||
| 
 | ||||
| 	if (preg_match('/charset=(.*?)\n/', $header, $matches)) | ||||
| 	// Fetch the first mentioned charset. Can be in body or header
 | ||||
| 	if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) | ||||
| 		$charset = trim(array_pop($matches)); | ||||
| 	else | ||||
| 		$charset = "utf-8"; | ||||
|  | @ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) { | |||
| 
 | ||||
| 	$xpath = new DomXPath($doc); | ||||
| 
 | ||||
| 	$list = $xpath->query("head/title"); | ||||
| 	//$list = $xpath->query("head/title");
 | ||||
| 	$list = $xpath->query("//title"); | ||||
| 	foreach ($list as $node) | ||||
| 		$siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8"); | ||||
| 
 | ||||
| 	$list = $xpath->query("head/meta[@name]"); | ||||
| 	//$list = $xpath->query("head/meta[@name]");
 | ||||
| 	$list = $xpath->query("//meta[@name]"); | ||||
| 	foreach ($list as $node) { | ||||
| 		$attr = array(); | ||||
| 		if ($node->attributes->length) | ||||
|  | @ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) { | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	$list = $xpath->query("head/meta[@property]"); | ||||
| 	//$list = $xpath->query("head/meta[@property]");
 | ||||
| 	$list = $xpath->query("//meta[@property]"); | ||||
| 	foreach ($list as $node) { | ||||
| 		$attr = array(); | ||||
| 		if ($node->attributes->length) | ||||
|  | @ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) { | |||
|                                 foreach ($node->attributes as $attribute) | ||||
|                                         $attr[$attribute->name] = $attribute->value; | ||||
| 
 | ||||
|                         // guess mimetype from headers or filename
 | ||||
|                         $type = guess_image_type($attr["src"],true); | ||||
| 			$src = completeurl($attr["src"], $url); | ||||
| 			$photodata = getimagesize($src); | ||||
| 
 | ||||
|                         $i = fetch_url($attr["src"]); | ||||
|                         $ph = new Photo($i, $type); | ||||
| 
 | ||||
| 			if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) { | ||||
| 				if ($siteinfo["image"] == "") | ||||
| 	                                $siteinfo["image"] = $attr["src"]; | ||||
| 
 | ||||
| 				if($ph->getWidth() > 300 || $ph->getHeight() > 300) { | ||||
| 					$ph->scaleImage(300); | ||||
| 	                                $siteinfo["images"][] = array("src"=>$attr["src"], | ||||
| 									"width"=>$ph->getWidth(), | ||||
| 									"height"=>$ph->getHeight()); | ||||
| 				} else | ||||
| 	                                $siteinfo["images"][] = array("src"=>$attr["src"], | ||||
| 									"width"=>$ph->getWidth(), | ||||
| 									"height"=>$ph->getHeight()); | ||||
| 			if (($photodata[0] > 150) and ($photodata[1] > 150)) { | ||||
| 				if ($photodata[0] > 300) { | ||||
| 					$photodata[1] = $photodata[1] * (300 / $photodata[0]); | ||||
| 					$photodata[0] = 300; | ||||
| 				} | ||||
| 				if ($photodata[1] > 300) { | ||||
| 					$photodata[0] = $photodata[0] * (300 / $photodata[1]); | ||||
| 					$photodata[1] = 300; | ||||
| 				} | ||||
| 				$siteinfo["images"][] = array("src"=>$src, | ||||
| 								"width"=>$photodata[0], | ||||
| 								"height"=>$photodata[1]); | ||||
| 			} | ||||
| 
 | ||||
|                 } | ||||
|         } else { | ||||
| 		// guess mimetype from headers or filename
 | ||||
|                 $type = guess_image_type($siteinfo["image"],true); | ||||
| 		$src = completeurl($siteinfo["image"], $url); | ||||
| 		$photodata = getimagesize($src); | ||||
| 
 | ||||
|                 $i = fetch_url($siteinfo["image"]); | ||||
|                 $ph = new Photo($i, $type); | ||||
| 
 | ||||
| 		if($ph->is_valid()) | ||||
| 			$siteinfo["images"][] = array("src"=>$siteinfo["image"], | ||||
| 							"width"=>$ph->getWidth(), | ||||
| 							"height"=>$ph->getHeight()); | ||||
| 		if (($photodata[0] > 10) and ($photodata[1] > 10)) | ||||
| 			$siteinfo["images"][] = array("src"=>$src, | ||||
| 							"width"=>$photodata[0], | ||||
| 							"height"=>$photodata[1]); | ||||
| 	} | ||||
| 
 | ||||
| 	if ($siteinfo["text"] == "") { | ||||
|  | @ -155,11 +175,13 @@ function parseurl_getsiteinfo($url) { | |||
| 
 | ||||
| 		$list = $xpath->query("//div[@class='article']"); | ||||
| 		foreach ($list as $node) | ||||
| 			if (strlen($node->nodeValue) > 40) | ||||
| 				$text .= " ".trim($node->nodeValue); | ||||
| 
 | ||||
| 		if ($text == "") { | ||||
| 			$list = $xpath->query("//div[@class='content']"); | ||||
| 			foreach ($list as $node) | ||||
| 				if (strlen($node->nodeValue) > 40) | ||||
| 					$text .= " ".trim($node->nodeValue); | ||||
| 		} | ||||
| 
 | ||||
|  | @ -167,6 +189,7 @@ function parseurl_getsiteinfo($url) { | |||
| 		if ($text == "") { | ||||
| 			$list = $xpath->query("//p"); | ||||
| 			foreach ($list as $node) | ||||
| 				if (strlen($node->nodeValue) > 40) | ||||
| 					$text .= " ".trim($node->nodeValue); | ||||
| 		} | ||||
| 
 | ||||
|  | @ -238,9 +261,9 @@ function parse_url_content(&$a) { | |||
| 	if($url && $title && $text) { | ||||
| 
 | ||||
| 		if($textmode) | ||||
| 			$text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br; | ||||
| 			$text = $br . '[quote]' . trim($text) . '[/quote]' . $br; | ||||
| 		else | ||||
| 			$text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />'; | ||||
| 			$text = '<br /><blockquote>' . trim($text) . '</blockquote><br />'; | ||||
| 
 | ||||
| 		$title = str_replace(array("\r","\n"),array('',''),$title); | ||||
| 
 | ||||
|  | @ -255,7 +278,8 @@ function parse_url_content(&$a) { | |||
| 	$siteinfo = parseurl_getsiteinfo($url); | ||||
| 
 | ||||
| 	if($siteinfo["title"] == "") { | ||||
| 		echo sprintf($template,$url,$url,'') . $str_tags; | ||||
| 		echo print_r($siteinfo, true); | ||||
| 		//echo sprintf($template,$url,$url,'') . $str_tags;
 | ||||
| 		killme(); | ||||
| 	} else { | ||||
| 		$text = $siteinfo["text"]; | ||||
|  | @ -305,7 +329,7 @@ function parse_url_content(&$a) { | |||
| 	} | ||||
| 
 | ||||
| 	if($image) { | ||||
| 		$text = $br.$br.$image.$br.$text; | ||||
| 		$text = $br.$br.$image.$text; | ||||
| 	} | ||||
| 	$title = str_replace(array("\r","\n"),array('',''),$title); | ||||
| 
 | ||||
|  | @ -313,6 +337,6 @@ function parse_url_content(&$a) { | |||
| 
 | ||||
| 	logger('parse_url: returns: ' . $result); | ||||
| 
 | ||||
| 	echo $result; | ||||
| 	echo trim($result); | ||||
| 	killme(); | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue