Use mattwright/urlresolver for HTTPClient::finalUrl()

This commit is contained in:
Philipp Holzer 2021-08-23 14:28:25 +02:00
parent e576af218b
commit 8385ee7a61
No known key found for this signature in database
GPG key ID: 9A28B7D4FF5667BD
5 changed files with 83 additions and 115 deletions

View file

@ -69,7 +69,8 @@
"npm-asset/perfect-scrollbar": "0.6.16", "npm-asset/perfect-scrollbar": "0.6.16",
"npm-asset/textcomplete": "^0.18.2", "npm-asset/textcomplete": "^0.18.2",
"npm-asset/typeahead.js": "^0.11.1", "npm-asset/typeahead.js": "^0.11.1",
"minishlink/web-push": "^6.0" "minishlink/web-push": "^6.0",
"mattwright/urlresolver": "^2.0"
}, },
"repositories": [ "repositories": [
{ {

48
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "7d6dee6e449da931e8fe209e61b2e78e", "content-hash": "c9e0a9eacc23d884012042eeab01cc8b",
"packages": [ "packages": [
{ {
"name": "asika/simple-console", "name": "asika/simple-console",
@ -1133,6 +1133,52 @@
], ],
"time": "2017-07-19T15:11:19+00:00" "time": "2017-07-19T15:11:19+00:00"
}, },
{
"name": "mattwright/urlresolver",
"version": "2.0",
"source": {
"type": "git",
"url": "https://github.com/mattwright/URLResolver.php.git",
"reference": "416039192cb6d9158bdacd68349bceff8739b857"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/mattwright/URLResolver.php/zipball/416039192cb6d9158bdacd68349bceff8739b857",
"reference": "416039192cb6d9158bdacd68349bceff8739b857",
"shasum": ""
},
"require": {
"ext-curl": "*",
"ext-mbstring": "*",
"php": ">=5.3"
},
"type": "library",
"autoload": {
"psr-4": {
"mattwright\\": "."
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Wright",
"email": "mw@mattwright.com"
}
],
"description": "PHP class that attempts to resolve URLs to a final, canonical link.",
"homepage": "https://github.com/mattwright/URLResolver.php",
"keywords": [
"canonical",
"link",
"redirect",
"resolve",
"url"
],
"time": "2019-01-18T00:59:34+00:00"
},
{ {
"name": "michelf/php-markdown", "name": "michelf/php-markdown",
"version": "1.9.0", "version": "1.9.0",

View file

@ -10,6 +10,7 @@ use Friendica\Network\IHTTPClient;
use Friendica\Util\Profiler; use Friendica\Util\Profiler;
use GuzzleHttp\Client; use GuzzleHttp\Client;
use GuzzleHttp\RequestOptions; use GuzzleHttp\RequestOptions;
use mattwright\URLResolver;
use Psr\Http\Message\RequestInterface; use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface; use Psr\Http\Message\UriInterface;
@ -85,6 +86,13 @@ class HTTPClientFactory extends BaseFactory
], ],
]); ]);
return new HTTPClient($logger, $this->profiler, $this->config, $userAgent, $guzzle); $resolver = new URLResolver();
$resolver->setUserAgent($userAgent);
$resolver->setMaxRedirects(10);
$resolver->setRequestTimeout(10);
// if the file is too large then exit
$resolver->setMaxResponseDataSize(1000000);
return new HTTPClient($logger, $this->profiler, $guzzle, $resolver);
} }
} }

View file

@ -21,9 +21,6 @@
namespace Friendica\Network; namespace Friendica\Network;
use DOMDocument;
use DomXPath;
use Friendica\Core\Config\IConfig;
use Friendica\Core\System; use Friendica\Core\System;
use Friendica\Util\Network; use Friendica\Util\Network;
use Friendica\Util\Profiler; use Friendica\Util\Profiler;
@ -32,6 +29,7 @@ use GuzzleHttp\Cookie\FileCookieJar;
use GuzzleHttp\Exception\RequestException; use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\TransferException; use GuzzleHttp\Exception\TransferException;
use GuzzleHttp\RequestOptions; use GuzzleHttp\RequestOptions;
use mattwright\URLResolver;
use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\ResponseInterface;
use Psr\Log\LoggerInterface; use Psr\Log\LoggerInterface;
@ -44,20 +42,17 @@ class HTTPClient implements IHTTPClient
private $logger; private $logger;
/** @var Profiler */ /** @var Profiler */
private $profiler; private $profiler;
/** @var IConfig */
private $config;
/** @var string */
private $userAgent;
/** @var Client */ /** @var Client */
private $client; private $client;
/** @var URLResolver */
private $resolver;
public function __construct(LoggerInterface $logger, Profiler $profiler, IConfig $config, string $userAgent, Client $client) public function __construct(LoggerInterface $logger, Profiler $profiler, Client $client, URLResolver $resolver)
{ {
$this->logger = $logger; $this->logger = $logger;
$this->profiler = $profiler; $this->profiler = $profiler;
$this->config = $config; $this->client = $client;
$this->userAgent = $userAgent; $this->resolver = $resolver;
$this->client = $client;
} }
/** /**
@ -97,6 +92,11 @@ class HTTPClient implements IHTTPClient
return CurlResult::createErrorCurl($url); return CurlResult::createErrorCurl($url);
} }
if (Network::isRedirectBlocked($url)) {
$this->logger->info('Domain should not be redirected.', ['url' => $url]);
return CurlResult::createErrorCurl($url);
}
$conf = []; $conf = [];
if (!empty($opts['cookiejar'])) { if (!empty($opts['cookiejar'])) {
@ -197,10 +197,12 @@ class HTTPClient implements IHTTPClient
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false) public function finalUrl(string $url)
{ {
$this->profiler->startRecording('network');
if (Network::isLocalLink($url)) { if (Network::isLocalLink($url)) {
$this->logger->info('Local link', ['url' => $url, 'callstack' => System::callstack(20)]); $this->logger->debug('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
} }
if (Network::isUrlBlocked($url)) { if (Network::isUrlBlocked($url)) {
@ -215,104 +217,19 @@ class HTTPClient implements IHTTPClient
$url = Network::stripTrackingQueryParams($url); $url = Network::stripTrackingQueryParams($url);
if ($depth > 10) {
return $url;
}
$url = trim($url, "'"); $url = trim($url, "'");
$this->profiler->startRecording('network'); // Designate a temporary file that will store cookies during the session.
// Some websites test the browser for cookie support, so this enhances results.
$this->resolver->setCookieJar(tempnam(get_temppath() , 'url_resolver-'));
$ch = curl_init(); $urlResult = $this->resolver->resolveURL($url);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_NOBODY, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
curl_exec($ch); if ($urlResult->didErrorOccur()) {
$curl_info = @curl_getinfo($ch); throw new TransferException($urlResult->getErrorMessageString());
$http_code = $curl_info['http_code'];
curl_close($ch);
$this->profiler->stopRecording();
if ($http_code == 0) {
return $url;
} }
if (in_array($http_code, ['301', '302'])) { return $urlResult->getURL();
if (!empty($curl_info['redirect_url'])) {
return $this->finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody);
} elseif (!empty($curl_info['location'])) {
return $this->finalUrl($curl_info['location'], ++$depth, $fetchbody);
}
}
// Check for redirects in the meta elements of the body if there are no redirects in the header.
if (!$fetchbody) {
return $this->finalUrl($url, ++$depth, true);
}
// if the file is too large then exit
if ($curl_info["download_content_length"] > 1000000) {
return $url;
}
// if it isn't a HTML file then exit
if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) {
return $url;
}
$this->profiler->startRecording('network');
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_NOBODY, 0);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
$body = curl_exec($ch);
curl_close($ch);
$this->profiler->stopRecording();
if (trim($body) == "") {
return $url;
}
// Check for redirect in meta elements
$doc = new DOMDocument();
@$doc->loadHTML($body);
$xpath = new DomXPath($doc);
$list = $xpath->query("//meta[@content]");
foreach ($list as $node) {
$attr = [];
if ($node->attributes->length) {
foreach ($node->attributes as $attribute) {
$attr[$attribute->name] = $attribute->value;
}
}
if (@$attr["http-equiv"] == 'refresh') {
$path = $attr["content"];
$pathinfo = explode(";", $path);
foreach ($pathinfo as $value) {
if (substr(strtolower($value), 0, 4) == "url=") {
return $this->finalUrl(substr($value, 4), ++$depth);
}
}
}
}
return $url;
} }
/** /**

View file

@ -104,14 +104,10 @@ interface IHTTPClient
* through HTTP code or meta refresh tags. Stops after 10 redirections. * through HTTP code or meta refresh tags. Stops after 10 redirections.
* *
* @param string $url A user-submitted URL * @param string $url A user-submitted URL
* @param int $depth The current redirection recursion level (internal)
* @param bool $fetchbody Whether to fetch the body or not after the HEAD requests
* *
* @return string A canonical URL * @return string A canonical URL
* @throws \Friendica\Network\HTTPException\InternalServerErrorException * @throws \Friendica\Network\HTTPException\InternalServerErrorException
* @see ParseUrl::getSiteinfo * @see ParseUrl::getSiteinfo
*
* @todo Remove the $fetchbody parameter that generates an extraneous HEAD request
*/ */
public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false); public function finalUrl(string $url);
} }