From 02a1fc9cd08fba2168895d1892a91d8143323848 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Thu, 12 Jul 2012 23:41:04 +0200 Subject: [PATCH] parse_url: Further improvements of the new method to fetch page data --- include/api.php | 1 + mod/parse_url.php | 106 ++++++++++++++++++++++++++++------------------ 2 files changed, 66 insertions(+), 41 deletions(-) diff --git a/include/api.php b/include/api.php index 3858b9fe32..e0b788424e 100644 --- a/include/api.php +++ b/include/api.php @@ -1727,5 +1727,6 @@ notifications/follow notifications/leave blocks/exists blocks/blocking +lists */ diff --git a/mod/parse_url.php b/mod/parse_url.php index 97e1658c89..4d894969aa 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -1,6 +1,4 @@ query("head/title"); + //$list = $xpath->query("head/title"); + $list = $xpath->query("//title"); foreach ($list as $node) $siteinfo["title"] = html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8"); - $list = $xpath->query("head/meta[@name]"); + //$list = $xpath->query("head/meta[@name]"); + $list = $xpath->query("//meta[@name]"); foreach ($list as $node) { $attr = array(); if ($node->attributes->length) @@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) { } } - $list = $xpath->query("head/meta[@property]"); + //$list = $xpath->query("head/meta[@property]"); + $list = $xpath->query("//meta[@property]"); foreach ($list as $node) { $attr = array(); if ($node->attributes->length) @@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) { foreach ($node->attributes as $attribute) $attr[$attribute->name] = $attribute->value; - // guess mimetype from headers or filename - $type = guess_image_type($attr["src"],true); + $src = completeurl($attr["src"], $url); + $photodata = getimagesize($src); - $i = fetch_url($attr["src"]); - $ph = new Photo($i, $type); - - if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) { - if ($siteinfo["image"] == "") - $siteinfo["image"] = $attr["src"]; - - if($ph->getWidth() > 300 || 
$ph->getHeight() > 300) { - $ph->scaleImage(300); - $siteinfo["images"][] = array("src"=>$attr["src"], - "width"=>$ph->getWidth(), - "height"=>$ph->getHeight()); - } else - $siteinfo["images"][] = array("src"=>$attr["src"], - "width"=>$ph->getWidth(), - "height"=>$ph->getHeight()); + if (($photodata[0] > 150) and ($photodata[1] > 150)) { + if ($photodata[0] > 300) { + $photodata[1] = $photodata[1] * (300 / $photodata[0]); + $photodata[0] = 300; + } + if ($photodata[1] > 300) { + $photodata[0] = $photodata[0] * (300 / $photodata[1]); + $photodata[1] = 300; + } + $siteinfo["images"][] = array("src"=>$src, + "width"=>$photodata[0], + "height"=>$photodata[1]); } + } } else { - // guess mimetype from headers or filename - $type = guess_image_type($siteinfo["image"],true); + $src = completeurl($siteinfo["image"], $url); + $photodata = getimagesize($src); - $i = fetch_url($siteinfo["image"]); - $ph = new Photo($i, $type); - - if($ph->is_valid()) - $siteinfo["images"][] = array("src"=>$siteinfo["image"], - "width"=>$ph->getWidth(), - "height"=>$ph->getHeight()); + if (($photodata[0] > 10) and ($photodata[1] > 10)) + $siteinfo["images"][] = array("src"=>$src, + "width"=>$photodata[0], + "height"=>$photodata[1]); } if ($siteinfo["text"] == "") { @@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) { $list = $xpath->query("//div[@class='article']"); foreach ($list as $node) - $text .= " ".trim($node->nodeValue); + if (strlen($node->nodeValue) > 40) + $text .= " ".trim($node->nodeValue); if ($text == "") { $list = $xpath->query("//div[@class='content']"); foreach ($list as $node) - $text .= " ".trim($node->nodeValue); + if (strlen($node->nodeValue) > 40) + $text .= " ".trim($node->nodeValue); } // If none text was found then take the paragraph content if ($text == "") { $list = $xpath->query("//p"); foreach ($list as $node) - $text .= " ".trim($node->nodeValue); + if (strlen($node->nodeValue) > 40) + $text .= " ".trim($node->nodeValue); } if ($text != "") { @@ -238,9 
+261,9 @@ function parse_url_content(&$a) { if($url && $title && $text) { if($textmode) - $text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br; + $text = $br . '[quote]' . trim($text) . '[/quote]' . $br; else - $text = '<br /><blockquote>' . trim($text) . '</blockquote><br />'; + $text = '<blockquote>' . trim($text) . '</blockquote><br />'; $title = str_replace(array("\r","\n"),array('',''),$title); @@ -255,7 +278,8 @@ function parse_url_content(&$a) { $siteinfo = parseurl_getsiteinfo($url); if($siteinfo["title"] == "") { - echo sprintf($template,$url,$url,'') . $str_tags; + echo print_r($siteinfo, true); + //echo sprintf($template,$url,$url,'') . $str_tags; killme(); } else { $text = $siteinfo["text"]; @@ -305,7 +329,7 @@ function parse_url_content(&$a) { } if($image) { - $text = $br.$br.$image.$br.$text; + $text = $br.$br.$image.$text; } $title = str_replace(array("\r","\n"),array('',''),$title); @@ -313,6 +337,6 @@ function parse_url_content(&$a) { logger('parse_url: returns: ' . $result); - echo $result; + echo trim($result); killme(); }