2010-07-02 01:48:07 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
require_once('library/HTML5/Parser.php');
|
2011-08-12 12:01:11 +02:00
|
|
|
require_once('include/crypto.php');
|
2016-02-14 19:50:59 +01:00
|
|
|
require_once('include/feed.php');
|
2016-07-04 00:14:08 +02:00
|
|
|
require_once('include/Probe.php');
|
2010-07-02 01:48:07 +02:00
|
|
|
|
|
|
|
if(! function_exists('scrape_dfrn')) {
/**
 * @brief Collect DFRN profile data for a profile URL
 *
 * First the "/noscrape/" variant of the URL is requested (with "/hcard/"
 * replaced in the URL). If that returns JSON containing a non-empty "nick",
 * the decoded JSON is returned as-is. Otherwise the page at $url is fetched
 * and parsed as HTML: <link> elements supply "feed_atom", every "dfrn-*" rel
 * and "nick" (from an lrdd link), while elements inside a "vcard" element
 * supply "guid", "nickname", "fn", "searchable", "key", "url" and "photo".
 * Whatever the noscrape request returned (minus "nick") is merged over the
 * scraped values.
 *
 * @param string $url Profile URL to scrape
 * @param boolean $dont_probe When true, probe_url() is not called and no "addr" is added
 *
 * @return array Scraped values; empty or partial when the page cannot be
 *               fetched, is a feed, or cannot be parsed
 */
function scrape_dfrn($url, $dont_probe = false) {

	$a = get_app();

	$ret = array();

	logger('scrape_dfrn: url=' . $url);

	// Try to fetch the data from noscrape. This is faster than parsing the HTML
	$noscrape = str_replace("/hcard/", "/noscrape/", $url);
	$noscrapejson = fetch_url($noscrape);
	$noscrapedata = array();
	if ($noscrapejson) {
		$noscrapedata = json_decode($noscrapejson, true);

		if (is_array($noscrapedata)) {
			// A usable nick means the noscrape answer is complete enough on its own.
			if (isset($noscrapedata["nick"]) && ($noscrapedata["nick"] != ""))
				return($noscrapedata);
			else
				unset($noscrapedata["nick"]);
		} else
			$noscrapedata = array();
	}

	$s = fetch_url($url);

	if (!$s)
		return $ret;

	// Read the headers right after fetching the page they belong to.
	// NOTE(review): probe_url() below presumably performs further requests that
	// replace the headers stored in the app object - confirm against get_curl_headers().
	$headers = $a->get_curl_headers();

	if (!$dont_probe) {
		$probe = probe_url($url);

		if (isset($probe["addr"]))
			$ret["addr"] = $probe["addr"];
	}

	logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);

	foreach (explode("\n", $headers) as $line) {
		// don't try and run feeds through the html5 parser
		if (stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	// Initialise first so a parser exception leaves a defined (empty) value behind.
	$dom = null;
	try {
		$dom = HTML5_Parser::parse($s);
	} catch (DOMException $e) {
		logger('scrape_dfrn: parse error: ' . $e);
	}

	if (! $dom)
		return $ret;

	// get DFRN link elements
	$links = $dom->getElementsByTagName('link');

	foreach ($links as $link) {
		$rel = $link->getAttribute('rel');

		if (($rel === 'alternate') && ($link->getAttribute('type') === 'application/atom+xml'))
			$ret['feed_atom'] = $link->getAttribute('href');

		if (substr($rel,0,5) == "dfrn-") {
			$ret[$rel] = $link->getAttribute('href');
		}

		if ($rel === 'lrdd') {
			$decoded = urldecode($link->getAttribute('href'));
			if (preg_match('/acct:([^@]*)@/',$decoded,$matches))
				$ret['nick'] = $matches[1];
		}
	}

	// Pull out hCard profile elements
	$largest_photo = 0;

	$items = $dom->getElementsByTagName('*');
	foreach ($items as $item) {
		if (! attribute_contains($item->getAttribute('class'), 'vcard'))
			continue;

		$level2 = $item->getElementsByTagName('*');
		foreach ($level2 as $element) {
			$class = $element->getAttribute('class');

			if (attribute_contains($class,'uid'))
				$ret['guid'] = $element->textContent;
			if (attribute_contains($class,'nickname'))
				$ret['nickname'] = $element->textContent;
			if (attribute_contains($class,'fn'))
				$ret['fn'] = $element->textContent;
			if (attribute_contains($class,'searchable'))
				$ret['searchable'] = $element->textContent;
			if (attribute_contains($class,'key'))
				$ret['key'] = $element->textContent;
			if (attribute_contains($class,'url'))
				$ret['url'] = $element->textContent;

			if (attribute_contains($class,'photo') || attribute_contains($class,'avatar')) {
				$size = intval($element->getAttribute('width'));
				// dfrn prefers 175, so if we find this, we set largest_photo so it can't be topped.
				if (($size > $largest_photo) || ($size == 175) || (! $largest_photo)) {
					$ret['photo'] = $element->getAttribute('src');
					$largest_photo = (($size == 175) ? 9999 : $size);
				}
			}
		}
	}

	return array_merge($ret, $noscrapedata);
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(! function_exists('validate_dfrn')) {
/**
 * @brief Count the mandatory DFRN fields that are missing from a scrape result
 *
 * Each of "key", "dfrn-request", "dfrn-confirm", "dfrn-notify" and
 * "dfrn-poll" is checked with x(); every field for which x() yields a
 * falsy result adds one to the count.
 *
 * @param array $a Values to validate, e.g. the result of scrape_dfrn()
 *
 * @return int Number of failed checks (0 when all five pass)
 */
function validate_dfrn($a) {
	$required = array('key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify', 'dfrn-poll');

	$missing = 0;
	foreach ($required as $field) {
		if (! x($a, $field)) {
			$missing++;
		}
	}

	return $missing;
}}
|
|
|
|
|
2011-08-18 08:10:55 +02:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
* Probe a network address to discover what kind of protocols we need to communicate with it.
|
|
|
|
*
|
|
|
|
* Warning: this function is a bit touchy and there are some subtle dependencies within the logic flow.
|
|
|
|
* Edit with care.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
* PROBE_DIASPORA has a bias towards returning Diaspora information
|
|
|
|
* while PROBE_NORMAL has a bias towards dfrn/zot - in the case where
|
2012-02-19 19:27:54 +01:00
|
|
|
* an address (such as a Friendica address) supports more than one type
|
2015-11-05 00:42:38 +01:00
|
|
|
* of network.
|
2011-08-18 08:10:55 +02:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
2016-07-04 00:14:08 +02:00
|
|
|
define('PROBE_NORMAL', 0);
define('PROBE_DIASPORA', 1);

/**
 * @brief Probe an address by delegating to Probe::uri()
 *
 * With PROBE_DIASPORA the network hint NETWORK_DIASPORA is handed to
 * Probe::uri(); with any other mode an empty hint is passed.
 *
 * @param string $url Address to probe
 * @param int $mode PROBE_NORMAL or PROBE_DIASPORA
 * @param int $level Not used by this function
 *
 * @return mixed Whatever Probe::uri() returns
 */
function probe_url($url, $mode = PROBE_NORMAL, $level = 1) {
	$network = (($mode == PROBE_DIASPORA) ? NETWORK_DIASPORA : "");

	return Probe::uri($url, $network);
}
|
2015-02-16 22:11:51 +01:00
|
|
|
|
2016-02-14 11:56:23 +01:00
|
|
|
/**
|
|
|
|
* @brief Find the matching part between two url
|
|
|
|
*
|
|
|
|
* @param string $url1
|
|
|
|
* @param string $url2
|
|
|
|
* @return string The matching part
|
|
|
|
*/
|
|
|
|
/**
 * @brief Find the matching part between two url
 *
 * Both URLs are passed through normalise_link() and split with parse_url().
 * Scheme, host and port have to be identical; after that the paths are
 * compared segment by segment and the common leading segments are kept.
 * Segments are compared strictly as strings, so "01" and "1" do not match.
 *
 * @param string $url1
 * @param string $url2
 *
 * @return string The matching part (passed through normalise_link()), or an
 *                empty string when either URL is empty, has no host, or when
 *                scheme, host or port differ
 */
function matching_url($url1, $url2) {

	if (($url1 == "") OR ($url2 == ""))
		return "";

	$url1 = normalise_link($url1);
	$url2 = normalise_link($url2);

	$parts1 = parse_url($url1);
	$parts2 = parse_url($url2);

	if (!isset($parts1["host"]) OR !isset($parts2["host"]))
		return "";

	// parse_url() omits absent components, so read the optional ones defensively.
	$scheme1 = (isset($parts1["scheme"]) ? $parts1["scheme"] : "");
	$scheme2 = (isset($parts2["scheme"]) ? $parts2["scheme"] : "");

	if ($scheme1 !== $scheme2)
		return "";

	if ($parts1["host"] !== $parts2["host"])
		return "";

	// A missing port is treated like port 0, i.e. "no explicit port".
	$port1 = (isset($parts1["port"]) ? intval($parts1["port"]) : 0);
	$port2 = (isset($parts2["port"]) ? intval($parts2["port"]) : 0);

	if ($port1 !== $port2)
		return "";

	$match = $scheme1."://".$parts1["host"];

	if ($port1)
		$match .= ":".$port1;

	$pathparts1 = explode("/", (isset($parts1["path"]) ? $parts1["path"] : ""));
	$pathparts2 = explode("/", (isset($parts2["path"]) ? $parts2["path"] : ""));

	// Only walk over segments that exist in both paths.
	$limit = min(count($pathparts1), count($pathparts2));

	$path = "";
	for ($i = 0; $i < $limit; $i++) {
		if ($pathparts1[$i] !== $pathparts2[$i])
			break;

		$path .= $pathparts1[$i]."/";
	}

	$match .= $path;

	return normalise_link($match);
}
|