Suppress some scraping errors when confronted with hybrid/strange
feeds that provide an insufficient Content-Type and choke the HTML parser.
This commit is contained in:
parent
fce9988f73
commit
ee45dee932
2 changed files with 21 additions and 2 deletions
3
boot.php
3
boot.php
|
@ -1366,6 +1366,7 @@ function lrdd($uri) {
|
||||||
else {
|
else {
|
||||||
$html = fetch_url($uri);
|
$html = fetch_url($uri);
|
||||||
$headers = $a->get_curl_headers();
|
$headers = $a->get_curl_headers();
|
||||||
|
logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
|
||||||
$lines = explode("\n",$headers);
|
$lines = explode("\n",$headers);
|
||||||
if(count($lines)) {
|
if(count($lines)) {
|
||||||
foreach($lines as $line) {
|
foreach($lines as $line) {
|
||||||
|
@ -1377,6 +1378,8 @@ function lrdd($uri) {
|
||||||
// don't try and run feeds through the html5 parser
|
// don't try and run feeds through the html5 parser
|
||||||
if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
|
if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
|
||||||
return array();
|
return array();
|
||||||
|
if(stristr($html,'<rss') || stristr($html,'<feed'))
|
||||||
|
return array();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(! isset($link)) {
|
if(! isset($link)) {
|
||||||
|
|
|
@ -8,12 +8,18 @@ function scrape_dfrn($url) {
|
||||||
$a = get_app();
|
$a = get_app();
|
||||||
|
|
||||||
$ret = array();
|
$ret = array();
|
||||||
|
|
||||||
|
logger('scrape_dfrn: url=' . $url);
|
||||||
|
|
||||||
$s = fetch_url($url);
|
$s = fetch_url($url);
|
||||||
|
|
||||||
if(! $s)
|
if(! $s)
|
||||||
return $ret;
|
return $ret;
|
||||||
|
|
||||||
$headers = $a->get_curl_headers();
|
$headers = $a->get_curl_headers();
|
||||||
|
logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);
|
||||||
|
|
||||||
|
|
||||||
$lines = explode("\n",$headers);
|
$lines = explode("\n",$headers);
|
||||||
if(count($lines)) {
|
if(count($lines)) {
|
||||||
foreach($lines as $line) {
|
foreach($lines as $line) {
|
||||||
|
@ -93,12 +99,17 @@ function scrape_meta($url) {
|
||||||
$a = get_app();
|
$a = get_app();
|
||||||
|
|
||||||
$ret = array();
|
$ret = array();
|
||||||
|
|
||||||
|
logger('scrape_meta: url=' . $url);
|
||||||
|
|
||||||
$s = fetch_url($url);
|
$s = fetch_url($url);
|
||||||
|
|
||||||
if(! $s)
|
if(! $s)
|
||||||
return $ret;
|
return $ret;
|
||||||
|
|
||||||
$headers = $a->get_curl_headers();
|
$headers = $a->get_curl_headers();
|
||||||
|
logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);
|
||||||
|
|
||||||
$lines = explode("\n",$headers);
|
$lines = explode("\n",$headers);
|
||||||
if(count($lines)) {
|
if(count($lines)) {
|
||||||
foreach($lines as $line) {
|
foreach($lines as $line) {
|
||||||
|
@ -135,6 +146,9 @@ function scrape_vcard($url) {
|
||||||
$a = get_app();
|
$a = get_app();
|
||||||
|
|
||||||
$ret = array();
|
$ret = array();
|
||||||
|
|
||||||
|
logger('scrape_vcard: url=' . $url);
|
||||||
|
|
||||||
$s = fetch_url($url);
|
$s = fetch_url($url);
|
||||||
|
|
||||||
if(! $s)
|
if(! $s)
|
||||||
|
@ -190,15 +204,17 @@ function scrape_feed($url) {
|
||||||
return $ret;
|
return $ret;
|
||||||
|
|
||||||
$headers = $a->get_curl_headers();
|
$headers = $a->get_curl_headers();
|
||||||
|
logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);
|
||||||
|
|
||||||
$lines = explode("\n",$headers);
|
$lines = explode("\n",$headers);
|
||||||
if(count($lines)) {
|
if(count($lines)) {
|
||||||
foreach($lines as $line) {
|
foreach($lines as $line) {
|
||||||
if(stristr($line,'content-type:')) {
|
if(stristr($line,'content-type:')) {
|
||||||
if(stristr($line,'application/atom+xml')) {
|
if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
|
||||||
$ret['feed_atom'] = $url;
|
$ret['feed_atom'] = $url;
|
||||||
return $ret;
|
return $ret;
|
||||||
}
|
}
|
||||||
if(stristr($line,'application/rss+xml')) {
|
if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
|
||||||
$ret['feed_rss'] = $url;
|
$ret['feed_rss'] = $url;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue