friendica/include/Scrape.php

<?php

require_once('library/HTML5/Parser.php');
require_once('include/crypto.php');
require_once('include/feed.php');
require_once('include/Probe.php');

if(! function_exists('scrape_dfrn')) {
/**
 * @brief Collect DFRN profile data for a profile URL
 *
 * The "noscrape" JSON endpoint (the URL with "/hcard/" replaced by "/noscrape/")
 * is tried first. When it delivers a non-empty "nick" the decoded JSON is
 * returned unchanged. Otherwise the page behind $url is fetched and parsed as
 * HTML: <link> elements provide the atom feed, the "dfrn-*" endpoints and the
 * nick, elements below a "vcard" container provide the profile fields.
 *
 * @param string $url Profile URL that should be scraped
 * @param boolean $dont_probe If true, probe_url() is not called and no "addr" is added
 *
 * @return array The scraped fields merged with the noscrape values (noscrape values
 *               win on identical keys). The array is empty or partial when the page
 *               can't be fetched, is a feed, or can't be parsed.
 */
function scrape_dfrn($url, $dont_probe = false) {

	$a = get_app();

	$ret = array();

	logger('scrape_dfrn: url=' . $url);

	// Try to fetch the data from noscrape. This is faster than parsing the HTML
	$noscrape = str_replace("/hcard/", "/noscrape/", $url);
	$noscrapejson = fetch_url($noscrape);
	$noscrapedata = array();
	if ($noscrapejson) {
		$noscrapedata = json_decode($noscrapejson, true);

		if (is_array($noscrapedata)) {
			// isset() avoids an "undefined index" notice when the answer has no "nick"
			if (isset($noscrapedata["nick"]) && ($noscrapedata["nick"] != ""))
				return($noscrapedata);
			else
				unset($noscrapedata["nick"]);
		} else
			$noscrapedata = array();
	}

	$s = fetch_url($url);

	if (!$s)
		return $ret;

	if (!$dont_probe) {
		$probe = probe_url($url);

		if (isset($probe["addr"]))
			$ret["addr"] = $probe["addr"];
	}

	// NOTE(review): the headers are read after probe_url() has run. If probe_url()
	// performs own requests, these may not be the headers of the fetch of $url - confirm.
	$headers = $a->get_curl_headers();
	logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);

	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	// Predefine the variable so that the check below works when the parser throws
	$dom = null;

	try {
		$dom = HTML5_Parser::parse($s);
	} catch (DOMException $e) {
		logger('scrape_dfrn: parse error: ' . $e);
	}

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('link');

	// get DFRN link elements

	foreach($items as $item) {
		$rel = $item->getAttribute('rel');
		if(($rel === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
			$ret['feed_atom'] = $item->getAttribute('href');
		if(substr($rel,0,5) == "dfrn-") {
			$ret[$rel] = $item->getAttribute('href');
		}
		if($rel === 'lrdd') {
			// The nick is the local part of the "acct:" address inside the lrdd link
			$decoded = urldecode($item->getAttribute('href'));
			if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
				$ret['nick'] = $matches[1];
		}
	}

	// Pull out hCard profile elements

	$largest_photo = 0;

	// Class name => key in the result array for the plain text fields
	$textfields = array(
		'uid'        => 'guid',
		'nickname'   => 'nickname',
		'fn'         => 'fn',
		'searchable' => 'searchable',
		'key'        => 'key',
		'url'        => 'url',
	);

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(! attribute_contains($item->getAttribute('class'), 'vcard'))
			continue;

		$level2 = $item->getElementsByTagName('*');
		foreach($level2 as $element) {
			// Fetch the class attribute only once per element
			$class = $element->getAttribute('class');

			foreach($textfields as $classname => $field) {
				if(attribute_contains($class, $classname))
					$ret[$field] = $element->textContent;
			}

			if(attribute_contains($class,'photo') || attribute_contains($class,'avatar')) {
				$size = intval($element->getAttribute('width'));
				// dfrn prefers 175, so if we find this, we set largest_size so it can't be topped.
				if(($size > $largest_photo) || ($size == 175) || (! $largest_photo)) {
					$ret['photo'] = $element->getAttribute('src');
					$largest_photo = (($size == 175) ? 9999 : $size);
				}
			}
		}
	}
	return array_merge($ret, $noscrapedata);
}}


if(! function_exists('validate_dfrn')) {
/**
 * @brief Count the mandatory DFRN fields that are missing in a scrape result
 *
 * @param array $a Scraped data, e.g. the result of scrape_dfrn()
 *
 * @return int Number of required fields for which x() reports no value (0 = all present)
 */
function validate_dfrn($a) {
	// The public key and the four DFRN endpoints have to be present
	$required = array('key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify', 'dfrn-poll');

	$missing = 0;
	foreach($required as $field) {
		if(x($a, $field))
			continue;

		$missing = $missing + 1;
	}

	return $missing;
}}

/**
 *
 * Probe a network address to discover what kind of protocols we need to communicate with it.
 *
 * Warning: this function is a bit touchy and there are some subtle dependencies within the logic flow.
 * Edit with care.
 *
 */

/**
 *
 * PROBE_DIASPORA has a bias towards returning Diaspora information
 * while PROBE_NORMAL has a bias towards dfrn/zot - in the case where
 * an address (such as a Friendica address) supports more than one type
 * of network.
 *
 */


// Values for the $mode parameter of probe_url()
define('PROBE_NORMAL',   0); // default: probe_url() hands an empty network to Probe::uri()
define('PROBE_DIASPORA', 1); // probe_url() hands NETWORK_DIASPORA to Probe::uri()

/**
 * @brief Probe an address by delegating to Probe::uri()
 *
 * @param string $url Address that should be probed
 * @param int $mode PROBE_NORMAL or PROBE_DIASPORA. Only PROBE_DIASPORA requests a
 *                  specific network (NETWORK_DIASPORA), every other value requests none.
 * @param int $level Not used by this function
 *
 * @return mixed Whatever Probe::uri() returns for the address
 */
function probe_url($url, $mode = PROBE_NORMAL, $level = 1) {

	$network = (($mode == PROBE_DIASPORA) ? NETWORK_DIASPORA : "");

	return Probe::uri($url, $network);
}

/**
 * @brief Find the common leading part of two URLs
 *
 * Both URLs are normalised with normalise_link() first. Scheme, host and port
 * have to be equal, otherwise there is no match at all. The path is then
 * compared segment by segment and the result ends at the first segment
 * that differs.
 *
 * @param string $url1
 * @param string $url2
 *
 * @return string The matching part (passed through normalise_link()), or an
 *                empty string if an URL is empty, has no host, or scheme,
 *                host or port differ
 */
function matching_url($url1, $url2) {

	if (($url1 == "") OR ($url2 == ""))
		return "";

	$url1 = normalise_link($url1);
	$url2 = normalise_link($url2);

	$parts1 = parse_url($url1);
	$parts2 = parse_url($url2);

	if (!isset($parts1["host"]) OR !isset($parts2["host"]))
		return "";

	// parse_url() only returns the components that are present in the URL,
	// so optional ones are read with a fallback instead of raising notices.
	$scheme1 = (isset($parts1["scheme"]) ? $parts1["scheme"] : "");
	$scheme2 = (isset($parts2["scheme"]) ? $parts2["scheme"] : "");

	if ($scheme1 != $scheme2)
		return "";

	if ($parts1["host"] != $parts2["host"])
		return "";

	$port1 = (isset($parts1["port"]) ? $parts1["port"] : null);
	$port2 = (isset($parts2["port"]) ? $parts2["port"] : null);

	if ($port1 != $port2)
		return "";

	$match = $scheme1."://".$parts1["host"];

	if ($port1)
		$match .= ":".$port1;

	$pathparts1 = explode("/", (isset($parts1["path"]) ? $parts1["path"] : ""));
	$pathparts2 = explode("/", (isset($parts2["path"]) ? $parts2["path"] : ""));

	// Never read beyond the end of the shorter path
	$limit = min(count($pathparts1), count($pathparts2));

	$path = "";
	for ($i = 0; $i < $limit; $i++) {
		// Strict comparison: segments like "01" and "1" are different paths
		if ($pathparts1[$i] !== $pathparts2[$i])
			break;

		$path .= $pathparts1[$i]."/";
	}

	$match .= $path;

	return normalise_link($match);
}
Initial checkin 2010-07-02 01:48:07 +02:00			`<?php`

			`require_once('library/HTML5/Parser.php');`
improved diaspora discovery 2011-08-12 12:01:11 +02:00			`require_once('include/crypto.php');`
Simplepie is removed since we don't use it anymore 2016-02-14 19:50:59 +01:00			`require_once('include/feed.php');`
Removed old code 2016-07-04 00:14:08 +02:00			`require_once('include/Probe.php');`
Initial checkin 2010-07-02 01:48:07 +02:00
			`if(! function_exists('scrape_dfrn')) {`
scrape_dfrn now scrapes the address as well. 2015-11-25 18:46:02 +01:00			`function scrape_dfrn($url, $dont_probe = false) {`
Initial checkin 2010-07-02 01:48:07 +02:00
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 23:55:29 +01:00			`$a = get_app();`

Initial checkin 2010-07-02 01:48:07 +02:00			`$ret = array();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 23:48:27 +01:00
			`logger('scrape_dfrn: url=' . $url);`

Improvements how gcontact entries are updated 2016-02-13 12:26:58 +01:00			`// Try to fetch the data from noscrape. This is faster than parsing the HTML`
			`$noscrape = str_replace("/hcard/", "/noscrape/", $url);`
			`$noscrapejson = fetch_url($noscrape);`
			`$noscrapedata = array();`
			`if ($noscrapejson) {`
			`$noscrapedata = json_decode($noscrapejson, true);`

Avoid errors when noscrape data can't be fetched. 2016-02-13 22:20:00 +01:00			`if (is_array($noscrapedata)) {`
Improvements how gcontact entries are updated 2016-02-13 12:26:58 +01:00			`if ($noscrapedata["nick"] != "")`
			`return($noscrapedata);`
Bugfix: The nickname vanished/better way to fetch the alias 2016-04-02 15:41:55 +02:00			`else`
			`unset($noscrapedata["nick"]);`
Avoid errors when noscrape data can't be fetched. 2016-02-13 22:20:00 +01:00			`} else`
			`$noscrapedata = array();`
Improvements how gcontact entries are updated 2016-02-13 12:26:58 +01:00			`}`

Initial checkin 2010-07-02 01:48:07 +02:00			`$s = fetch_url($url);`

Bugfix: The nickname vanished/better way to fetch the alias 2016-04-02 15:41:55 +02:00			`if (!$s)`
Initial checkin 2010-07-02 01:48:07 +02:00			`return $ret;`

Bugfix: There was an endless loop in the probe_url function ... 2015-11-26 21:58:01 +01:00			`if (!$dont_probe) {`
			`$probe = probe_url($url);`
scrape_dfrn now scrapes the address as well. 2015-11-25 18:46:02 +01:00
Bugfix: There was an endless loop in the probe_url function ... 2015-11-26 21:58:01 +01:00			`if (isset($probe["addr"]))`
			`$ret["addr"] = $probe["addr"];`
			`}`
scrape_dfrn now scrapes the address as well. 2015-11-25 18:46:02 +01:00
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 23:55:29 +01:00			`$headers = $a->get_curl_headers();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 23:48:27 +01:00			`logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);`


don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 23:55:29 +01:00			`$lines = explode("\n",$headers);`
			`if(count($lines)) {`
Removed deprecated twitter code, since twitter had changed its API long time ago. 2014-04-04 10:42:12 +02:00			`foreach($lines as $line) {`
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 23:55:29 +01:00			`// don't try and run feeds through the html5 parser`
			`if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) \|\| (stristr($line,'application/rss+xml'))))`
			`return ret;`
			`}`
			`}`

fix various html parse errors 2011-10-21 01:48:07 +02:00			`try {`
			`$dom = HTML5_Parser::parse($s);`
			`} catch (DOMException $e) {`
			`logger('scrape_dfrn: parse error: ' . $e);`
			`}`
Initial checkin 2010-07-02 01:48:07 +02:00
			`if(! $dom)`
			`return $ret;`

			`$items = $dom->getElementsByTagName('link');`

			`// get DFRN link elements`

			`foreach($items as $item) {`
			`$x = $item->getAttribute('rel');`
following random feeds 2011-01-24 05:09:34 +01:00			`if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))`
			`$ret['feed_atom'] = $item->getAttribute('href');`
fix several probe related issues 2011-08-18 08:01:44 +02:00			`if(substr($x,0,5) == "dfrn-") {`
Initial checkin 2010-07-02 01:48:07 +02:00			`$ret[$x] = $item->getAttribute('href');`
fix several probe related issues 2011-08-18 08:01:44 +02:00			`}`
add nicknames to contact records (going forward and retroactive) 2010-10-23 10:20:26 +02:00			`if($x === 'lrdd') {`
			`$decoded = urldecode($item->getAttribute('href'));`
			`if(preg_match('/acct:([^@]*)@/',$decoded,$matches))`
			`$ret['nick'] = $matches[1];`
			`}`
Initial checkin 2010-07-02 01:48:07 +02:00			`}`

			`// Pull out hCard profile elements`

fix several probe related issues 2011-08-18 08:01:44 +02:00			`$largest_photo = 0;`

Initial checkin 2010-07-02 01:48:07 +02:00			`$items = $dom->getElementsByTagName('*');`
			`foreach($items as $item) {`
			`if(attribute_contains($item->getAttribute('class'), 'vcard')) {`
			`$level2 = $item->getElementsByTagName('*');`
			`foreach($level2 as $x) {`
Support for the case when the guid and public key will vanish from webfinger 2016-06-25 00:38:47 +02:00			`if(attribute_contains($x->getAttribute('class'),'uid'))`
			`$ret['guid'] = $x->textContent;`
			`if(attribute_contains($x->getAttribute('class'),'nickname'))`
			`$ret['nickname'] = $x->textContent;`
			`if(attribute_contains($x->getAttribute('class'),'fn'))`
Initial checkin 2010-07-02 01:48:07 +02:00			`$ret['fn'] = $x->textContent;`
Support for the case when the guid and public key will vanish from webfinger 2016-06-25 00:38:47 +02:00			`if(attribute_contains($x->getAttribute('class'),'searchable'))`
			`$ret['searchable'] = $x->textContent;`
			`if(attribute_contains($x->getAttribute('class'),'key'))`
			`$ret['key'] = $x->textContent;`
			`if(attribute_contains($x->getAttribute('class'),'url'))`
			`$ret['url'] = $x->textContent;`
fix several probe related issues 2011-08-18 08:01:44 +02:00			`if((attribute_contains($x->getAttribute('class'),'photo'))`
			`\|\| (attribute_contains($x->getAttribute('class'),'avatar'))) {`
			`$size = intval($x->getAttribute('width'));`
			`// dfrn prefers 175, so if we find this, we set largest_size so it can't be topped.`
			`if(($size > $largest_photo) \|\| ($size == 175) \|\| (! $largest_photo)) {`
			`$ret['photo'] = $x->getAttribute('src');`
			`$largest_photo = (($size == 175) ? 9999 : $size);`
			`}`
			`}`
two-way subscriptions working with federated social accounts 2010-10-26 06:52:30 +02:00			`}`
Initial checkin 2010-07-02 01:48:07 +02:00			`}`
			`}`
Improvements how gcontact entries are updated 2016-02-13 12:26:58 +01:00			`return array_merge($ret, $noscrapedata);`
Initial checkin 2010-07-02 01:48:07 +02:00			`}}`






			`if(! function_exists('validate_dfrn')) {`
			`function validate_dfrn($a) {`
			`$errors = 0;`
			`if(! x($a,'key'))`
			`$errors ++;`
			`if(! x($a,'dfrn-request'))`
			`$errors ++;`
			`if(! x($a,'dfrn-confirm'))`
			`$errors ++;`
			`if(! x($a,'dfrn-notify'))`
			`$errors ++;`
			`if(! x($a,'dfrn-poll'))`
			`$errors ++;`
			`return $errors;`
			`}}`

documentation 2011-08-18 08:10:55 +02:00			`/**`
			`*`
			`* Probe a network address to discover what kind of protocols we need to communicate with it.`
			`*`
			`* Warning: this function is a bit touchy and there are some subtle dependencies within the logic flow.`
			`* Edit with care.`
			`*`
			`*/`

			`/**`
			`*`
			`* PROBE_DIASPORA has a bias towards returning Diaspora information`
			`* while PROBE_NORMAL has a bias towards dfrn/zot - in the case where`
Scrape.php - Friendika -> Friendica 2012-02-19 19:27:54 +01:00			`* an address (such as a Friendica address) supports more than one type`
Friendica contacts where detected as OStatus contacts when they should be Diaspora contacts 2015-11-05 00:42:38 +01:00			`* of network.`
documentation 2011-08-18 08:10:55 +02:00			`*`
			`*/`


Removed old code 2016-07-04 00:14:08 +02:00			`define('PROBE_NORMAL', 0);`
			`define('PROBE_DIASPORA', 1);`
relocated "follow" url scraping code so it can be used elsewhere (contact repair, etc.) 2011-04-15 09:59:00 +02:00
probe_url: Better detection for non-standard installations of GNU Social 2015-06-14 01:52:26 +02:00			`function probe_url($url, $mode = PROBE_NORMAL, $level = 1) {`
Workaround for misconfigured Friendica servers 2015-12-06 18:52:19 +01:00
Removed old code 2016-07-04 00:14:08 +02:00			`if ($mode == PROBE_DIASPORA)`
			`$network = NETWORK_DIASPORA;`
			`else`
			`$network = "";`
Workaround for misconfigured Friendica servers 2015-12-06 18:52:19 +01:00
Removed old code 2016-07-04 00:14:08 +02:00			`$data = Probe::uri($url, $network);`
Caching for scrape, keywords for remote_self, notifications for addresses that aren't in your contact list. 2015-01-20 22:54:25 +01:00
Removed old code 2016-07-04 00:14:08 +02:00			`return $data;`
relocated "follow" url scraping code so it can be used elsewhere (contact repair, etc.) 2011-04-15 09:59:00 +02:00			`}`
Improved probe_url, fixed wrong network detection. 2015-02-16 22:11:51 +01:00
"addr" and "server_url" are now generated directly in "update_gcontact" if not given. 2016-02-14 11:56:23 +01:00			`/**`
			`* @brief Find the matching part between two url`
			`*`
			`* @param string $url1`
			`* @param string $url2`
			`* @return string The matching part`
			`*/`
			`function matching_url($url1, $url2) {`

			`if (($url1 == "") OR ($url2 == ""))`
			`return "";`

			`$url1 = normalise_link($url1);`
			`$url2 = normalise_link($url2);`

			`$parts1 = parse_url($url1);`
			`$parts2 = parse_url($url2);`

			`if (!isset($parts1["host"]) OR !isset($parts2["host"]))`
			`return "";`

			`if ($parts1["scheme"] != $parts2["scheme"])`
			`return "";`

			`if ($parts1["host"] != $parts2["host"])`
			`return "";`

			`if ($parts1["port"] != $parts2["port"])`
			`return "";`

			`$match = $parts1["scheme"]."://".$parts1["host"];`

			`if ($parts1["port"])`
			`$match .= ":".$parts1["port"];`

			`$pathparts1 = explode("/", $parts1["path"]);`
			`$pathparts2 = explode("/", $parts2["path"]);`
Improved probe_url, fixed wrong network detection. 2015-02-16 22:11:51 +01:00
			`$i = 0;`
"addr" and "server_url" are now generated directly in "update_gcontact" if not given. 2016-02-14 11:56:23 +01:00			`$path = "";`
			`do {`
			`$path1 = $pathparts1[$i];`
			`$path2 = $pathparts2[$i];`
Improved probe_url, fixed wrong network detection. 2015-02-16 22:11:51 +01:00
"addr" and "server_url" are now generated directly in "update_gcontact" if not given. 2016-02-14 11:56:23 +01:00			`if ($path1 == $path2)`
			`$path .= $path1."/";`

			`} while (($path1 == $path2) AND ($i++ <= count($pathparts1)));`

			`$match .= $path;`

			`return normalise_link($match);`
Improved probe_url, fixed wrong network detection. 2015-02-16 22:11:51 +01:00			`}`