Merge pull request #8671 from MrPetovan/bug/8623-relative-href
Add support for relative URL in HTML documents
This commit is contained in:
commit
58f06a830f
2 changed files with 174 additions and 29 deletions
|
@ -1756,50 +1756,87 @@ class Probe
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Checks HTML page for RSS feed link
 *
 * Only the href of the first <link rel="alternate" type="application/rss+xml">
 * element found in the document head is considered. A relative href is resolved
 * against the page link (or the document's <base href> if there is one).
 *
 * @param string $url  Page link, used as the base for resolving a relative feed href
 * @param string $body Page body string
 * @return string|false Feed link, empty string if the document advertises no RSS feed,
 *                      or false if body was empty or an invalid HTML document
 */
public static function getFeedLink(string $url, string $body)
{
    // DOMDocument::loadHTML() refuses an empty source: it throws a ValueError on
    // PHP 8 (which the @ operator cannot silence) and warns on older versions.
    if ($body === '') {
        return false;
    }

    $doc = new DOMDocument();
    if (!@$doc->loadHTML($body)) {
        return false;
    }

    $xpath = new DOMXPath($doc);

    // string() on a node-set yields the value of its first node in document order,
    // or an empty string when nothing matched.
    $feedUrl = trim((string)$xpath->evaluate(
        'string(/html/head/link[@type="application/rss+xml" and @rel="alternate"]/@href)'
    ));

    if ($feedUrl === '') {
        return '';
    }

    return self::ensureAbsoluteLinkFromHTMLDoc($feedUrl, $url, $xpath);
}
|
||||||
|
|
||||||
if ($feeds->length == 0) {
|
/**
 * Return an absolute URL in the context of a HTML document retrieved from the provided URL.
 *
 * Loosely based on RFC 1808
 *
 * Handled reference forms:
 * - absolute URL (returned unchanged)
 * - relative scheme (//host/path)
 * - root path (/path)
 * - relative path (path, ./path, ../path)
 * - query-only or fragment-only reference (?query, #fragment)
 *
 * The query string and fragment of the href are carried over to the result.
 *
 * @see https://tools.ietf.org/html/rfc1808
 *
 * @param string   $href  The potential relative href found in the HTML document
 * @param string   $base  The HTML document URL
 * @param DOMXPath $xpath The HTML document XPath
 * @return string The resolved URL, or the href unchanged if either URL cannot be parsed
 */
private static function ensureAbsoluteLinkFromHTMLDoc(string $href, string $base, DOMXPath $xpath)
{
    if (filter_var($href, FILTER_VALIDATE_URL)) {
        return $href;
    }

    // A <base href> element in the document takes precedence over the document URL
    $base = $xpath->evaluate('string(/html/head/base/@href)') ?: $base;

    $baseParts = parse_url($base);
    $hrefParts = parse_url($href);

    // parse_url() returns false on seriously malformed input; nothing can be resolved then
    if (!is_array($baseParts) || !is_array($hrefParts)) {
        return $href;
    }

    $hrefPath = $hrefParts['path'] ?? '';
    $keepBaseQuery = false;

    if (isset($hrefParts['host'])) {
        // Relative scheme case (//host/path): the whole authority comes from the href,
        // so the port and credentials of the base URL must not leak into the result.
        if (isset($hrefParts['scheme'])) {
            $baseParts['scheme'] = $hrefParts['scheme'];
        }

        $baseParts['host'] = $hrefParts['host'];
        foreach (['port', 'user', 'pass'] as $key) {
            if (isset($hrefParts[$key])) {
                $baseParts[$key] = $hrefParts[$key];
            } else {
                unset($baseParts[$key]);
            }
        }

        $path = $hrefPath;
    } elseif ($hrefPath === '') {
        // Query-only (?a=b) or fragment-only (#a) case: the reference targets the document itself.
        // Naked domain case (scheme://basehost) falls back to the root path.
        $path = $baseParts['path'] ?? '/';
        $keepBaseQuery = !isset($hrefParts['query']);
    } elseif ($hrefPath[0] === '/') {
        // Root path case (/path)
        $path = $hrefPath;
    } else {
        // Remove the filename part of the base path if it exists (/base/path/file)
        $directory = implode('/', array_slice(explode('/', $baseParts['path'] ?? '/'), 0, -1));

        // Resolve arbitrary relative path
        // Lifted from https://www.php.net/manual/en/function.realpath.php#84012
        $rawSegments = explode('/', $directory . '/' . $hrefPath);
        $absolutes = [];
        foreach ($rawSegments as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }

            if ($segment === '..') {
                array_pop($absolutes);
            } else {
                $absolutes[] = $segment;
            }
        }

        $path = '/' . implode('/', $absolutes);

        // A reference ending in "/", "." or ".." designates a directory: keep its trailing slash
        if ($absolutes !== [] && in_array(end($rawSegments), ['', '.', '..'], true)) {
            $path .= '/';
        }
    }

    $baseParts['path'] = $path;

    // The query string and fragment of the document URL do not apply to the target,
    // those of the href do.
    if (!$keepBaseQuery) {
        unset($baseParts['query']);
    }
    unset($baseParts['fragment']);

    if (isset($hrefParts['query'])) {
        $baseParts['query'] = $hrefParts['query'];
    }

    if (isset($hrefParts['fragment'])) {
        $baseParts['fragment'] = $hrefParts['fragment'];
    }

    return Network::unparseURL($baseParts);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1826,7 +1863,7 @@ class Probe
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
$feed_url = self::getFeedLink($url);
|
$feed_url = self::getFeedLink($url, $feed);
|
||||||
|
|
||||||
if (!$feed_url) {
|
if (!$feed_url) {
|
||||||
return false;
|
return false;
|
||||||
|
|
108
tests/src/Network/ProbeTest.php
Normal file
108
tests/src/Network/ProbeTest.php
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Friendica\Test\src\Network;
|
||||||
|
|
||||||
|
use Friendica\Network\Probe;
|
||||||
|
use PHPUnit\Framework\TestCase;
|
||||||
|
|
||||||
|
/**
 * Tests for the RSS feed link detection of Probe::getFeedLink(), with a focus on
 * the resolution of relative hrefs against the document URL or its <base href>.
 */
class ProbeTest extends TestCase
{
    /** HTML document advertising a feed, without <base> element */
    const TEMPLATENOBASE = '
<!DOCTYPE html>
<html lang="en-us">
<head>
<title>Example Blog</title>
<link href="{{$link}}" rel="alternate" type="application/rss+xml" title="Example Blog" />
<link href="{{$link}}" rel="feed" type="application/rss+xml" title="Example Blog" />
</head>
<body>
<p>Hello World!</p>
</body>
</html>';

    /** HTML document advertising a feed, with a <base> element overriding the document URL */
    const TEMPLATEBASE = '
<!DOCTYPE html>
<html lang="en-us">
<head>
<title>Example Blog</title>
<link href="{{$link}}" rel="alternate" type="application/rss+xml" title="Example Blog" />
<link href="{{$link}}" rel="feed" type="application/rss+xml" title="Example Blog" />
<base href="{{$url}}">
</head>
<body>
<p>Hello World!</p>
</body>
</html>';

    /** Expected feed link, indexed by base URL, then by href found in the document */
    const EXPECTED = [
        'https://example.org/path/to/blog/index.php' => [
            'index.xml'               => 'https://example.org/path/to/blog/index.xml',
            './index.xml'             => 'https://example.org/path/to/blog/index.xml',
            '../index.xml'            => 'https://example.org/path/to/index.xml',
            '/index.xml'              => 'https://example.org/index.xml',
            '//example.com/index.xml' => 'https://example.com/index.xml',
        ],
        'https://example.org/path/to/blog/' => [
            'index.xml'               => 'https://example.org/path/to/blog/index.xml',
            './index.xml'             => 'https://example.org/path/to/blog/index.xml',
            '../index.xml'            => 'https://example.org/path/to/index.xml',
            '/index.xml'              => 'https://example.org/index.xml',
            '//example.com/index.xml' => 'https://example.com/index.xml',
        ],
        'https://example.org/blog/' => [
            'index.xml'               => 'https://example.org/blog/index.xml',
            './index.xml'             => 'https://example.org/blog/index.xml',
            '../index.xml'            => 'https://example.org/index.xml',
            '/index.xml'              => 'https://example.org/index.xml',
            '//example.com/index.xml' => 'https://example.com/index.xml',
        ],
        'https://example.org' => [
            'index.xml'               => 'https://example.org/index.xml',
            './index.xml'             => 'https://example.org/index.xml',
            '../index.xml'            => 'https://example.org/index.xml',
            '/index.xml'              => 'https://example.org/index.xml',
            '//example.com/index.xml' => 'https://example.com/index.xml',
        ],
    ];

    /**
     * Replaces every {{name}} placeholder of the template by its value.
     *
     * @param string $template Template containing {{name}} placeholders
     * @param array  $vars     Replacement values indexed by placeholder name
     * @return string
     */
    private function replaceMacros($template, $vars)
    {
        foreach ($vars as $var => $value) {
            $template = str_replace('{{' . $var . '}}', $value, $template);
        }

        return $template;
    }

    /**
     * Flattens EXPECTED into one named data set per base URL/href pair, so that
     * each combination is reported on its own instead of the first failure
     * hiding all the following ones.
     *
     * @return array
     */
    public static function dataFeedLink(): array
    {
        $data = [];
        foreach (self::EXPECTED as $url => $hrefs) {
            foreach ($hrefs as $href => $expected) {
                $data['base url = ' . $url . ' | href = ' . $href] = [$url, $href, $expected];
            }
        }

        return $data;
    }

    /**
     * The href is resolved against the URL the document was retrieved from.
     *
     * @small
     * @dataProvider dataFeedLink
     *
     * @param string $url      Document URL
     * @param string $href     Feed href found in the document
     * @param string $expected Expected absolute feed link
     */
    public function testGetFeedLinkNoBase(string $url, string $href, string $expected)
    {
        $body = $this->replaceMacros(self::TEMPLATENOBASE, ['$link' => $href]);

        $feedLink = Probe::getFeedLink($url, $body);

        $this->assertSame($expected, $feedLink, 'base url = ' . $url . ' | href = ' . $href);
    }

    /**
     * The href is resolved against the <base href> of the document, whatever the
     * URL the document was retrieved from.
     *
     * @small
     * @dataProvider dataFeedLink
     *
     * @param string $url      URL set in the <base> element of the document
     * @param string $href     Feed href found in the document
     * @param string $expected Expected absolute feed link
     */
    public function testGetFeedLinkBase(string $url, string $href, string $expected)
    {
        $body = $this->replaceMacros(self::TEMPLATEBASE, ['$url' => $url, '$link' => $href]);

        $feedLink = Probe::getFeedLink('http://example.com', $body);

        $this->assertSame($expected, $feedLink, 'base url = ' . $url . ' | href = ' . $href);
    }
}
|
Loading…
Reference in a new issue