ee45dee932
feeds that provide insufficient content-type and choke the html parser.
243 lines
5.3 KiB
PHP
243 lines
5.3 KiB
PHP
<?php
|
|
|
|
require_once('library/HTML5/Parser.php');
|
|
|
|
if(! function_exists('scrape_dfrn')) {
/**
 * Fetch a page and collect the DFRN endpoints and hCard profile fields it advertises.
 *
 * The result may contain the following keys, each only when found in the page:
 *  - 'feed_atom' : href of a <link rel="alternate" type="application/atom+xml">
 *  - 'dfrn-*'    : href of every <link> whose rel starts with "dfrn-" (keyed by the rel value)
 *  - 'nick'      : the part between "acct:" and "@" in the url-decoded href of a <link rel="lrdd">
 *  - 'fn', 'photo', 'key' : taken from elements nested inside an element whose class contains "vcard"
 *
 * @param string $url address of the page to scrape
 * @return array associative array of discovered values; empty when the fetch
 *               returns nothing, the response is an Atom/RSS feed, or parsing fails
 */
function scrape_dfrn($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_dfrn: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// NOTE(review): presumably these are the response headers of the fetch above - confirm against get_curl_headers()
	$headers = $a->get_curl_headers();
	logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);

	// explode() always yields at least one element, so no emptiness check is needed
	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('link');

	// get DFRN link elements

	foreach($items as $item) {
		$x = $item->getAttribute('rel');
		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
			$ret['feed_atom'] = $item->getAttribute('href');
		if(substr($x,0,5) == "dfrn-")
			$ret[$x] = $item->getAttribute('href');
		if($x === 'lrdd') {
			// the lrdd href carries an url-encoded "acct:nick@host" reference; keep only the nick
			$decoded = urldecode($item->getAttribute('href'));
			if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
				$ret['nick'] = $matches[1];
		}
	}

	// Pull out hCard profile elements

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
			// inspect every descendant of the vcard container; a later match overwrites an earlier one
			$level2 = $item->getElementsByTagName('*');
			foreach($level2 as $x) {
				if(attribute_contains($x->getAttribute('class'),'fn'))
					$ret['fn'] = $x->textContent;
				if(attribute_contains($x->getAttribute('class'),'photo'))
					$ret['photo'] = $x->getAttribute('src');
				if(attribute_contains($x->getAttribute('class'),'key'))
					$ret['key'] = $x->textContent;
			}
		}
	}

	return $ret;
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(! function_exists('validate_dfrn')) {
/**
 * Count how many of the required DFRN fields fail the x() check in a scrape result.
 *
 * @param array $a scrape result, e.g. as produced by scrape_dfrn()
 * @return int number of required entries for which x($a, <key>) is falsy (0 to 5)
 */
function validate_dfrn($a) {

	// every one of these entries has to pass the x() check
	$required = array('key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify', 'dfrn-poll');

	$missing = 0;

	foreach($required as $field) {
		if(x($a,$field))
			continue;
		$missing ++;
	}

	return $missing;
}}
|
|
|
|
if(! function_exists('scrape_meta')) {
/**
 * Fetch a page and collect the DFRN values it advertises through <meta> elements.
 *
 * Every <meta> element whose name attribute starts with "dfrn-" contributes one
 * entry: the name is the array key and the content attribute is the value.
 *
 * @param string $url address of the page to scrape
 * @return array associative array of "dfrn-*" => content; empty when the fetch
 *               returns nothing, the response is an Atom/RSS feed, or parsing fails
 */
function scrape_meta($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_meta: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// NOTE(review): presumably these are the response headers of the fetch above - confirm against get_curl_headers()
	$headers = $a->get_curl_headers();
	logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);

	// explode() always yields at least one element, so no emptiness check is needed
	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('meta');

	// get DFRN meta elements

	foreach($items as $item) {
		$x = $item->getAttribute('name');
		if(substr($x,0,5) == "dfrn-")
			$ret[$x] = $item->getAttribute('content');
	}

	return $ret;
}}
|
|
|
|
|
|
if(! function_exists('scrape_vcard')) {
/**
 * Fetch a page and extract hCard profile fields from it.
 *
 * Looks at every element nested inside an element whose class contains "vcard"
 * and fills in, when present:
 *  - 'fn'    : text content of an element whose class contains "fn"
 *  - 'photo' : src attribute of an element whose class contains "photo" or "avatar"
 *  - 'nick'  : text content of an element whose class contains "nickname" or "uid"
 *
 * @param string $url address of the page to scrape
 * @return array associative array of discovered values; empty when the fetch
 *               returns nothing, the response is an Atom/RSS feed, or parsing fails
 */
function scrape_vcard($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_vcard: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// NOTE(review): presumably these are the response headers of the fetch above - confirm against get_curl_headers()
	$headers = $a->get_curl_headers();

	// explode() always yields at least one element, so no emptiness check is needed
	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	// Pull out hCard profile elements

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
			// inspect every descendant of the vcard container; a later match overwrites an earlier one
			$level2 = $item->getElementsByTagName('*');
			foreach($level2 as $x) {
				if(attribute_contains($x->getAttribute('class'),'fn'))
					$ret['fn'] = $x->textContent;
				if((attribute_contains($x->getAttribute('class'),'photo'))
					|| (attribute_contains($x->getAttribute('class'),'avatar')))
					$ret['photo'] = $x->getAttribute('src');
				if((attribute_contains($x->getAttribute('class'),'nickname'))
					|| (attribute_contains($x->getAttribute('class'),'uid')))
					$ret['nick'] = $x->textContent;
			}
		}
	}

	return $ret;
}}
|
|
|
|
|
|
if(! function_exists('scrape_feed')) {
/**
 * Work out which Atom and/or RSS feed a URL provides.
 *
 * If the response carries a content-type header and either that header or the
 * body itself identifies an Atom or RSS document, the URL is reported as the
 * feed directly. Otherwise the body is parsed as HTML and the
 * <link rel="alternate"> elements are searched for feed references.
 *
 * @param string $url address to probe
 * @return array may contain 'feed_atom' and/or 'feed_rss'; empty when the fetch
 *               returns nothing, parsing fails, or no feed is found
 */
function scrape_feed($url) {

	$a = get_app();

	$ret = array();
	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// NOTE(review): presumably these are the response headers of the fetch above - confirm against get_curl_headers()
	$headers = $a->get_curl_headers();
	logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);

	// explode() always yields at least one element, so no emptiness check is needed
	foreach(explode("\n",$headers) as $line) {
		if(stristr($line,'content-type:')) {
			// the body is sniffed as well, so feeds sent with a generic content-type are still recognised
			if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
				$ret['feed_atom'] = $url;
				return $ret;
			}
			if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
				$ret['feed_rss'] = $url;
				return $ret;
			}
		}
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('link');

	// get Atom/RSS link elements

	foreach($items as $item) {
		$x = $item->getAttribute('rel');
		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
			$ret['feed_atom'] = $item->getAttribute('href');
		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/rss+xml'))
			$ret['feed_rss'] = $item->getAttribute('href');
	}

	return $ret;
}}