Use mattwright/urlresolver for HTTPClient::finalUrl()
This commit is contained in:
parent
e576af218b
commit
8385ee7a61
5 changed files with 83 additions and 115 deletions
|
@ -69,7 +69,8 @@
|
|||
"npm-asset/perfect-scrollbar": "0.6.16",
|
||||
"npm-asset/textcomplete": "^0.18.2",
|
||||
"npm-asset/typeahead.js": "^0.11.1",
|
||||
"minishlink/web-push": "^6.0"
|
||||
"minishlink/web-push": "^6.0",
|
||||
"mattwright/urlresolver": "^2.0"
|
||||
},
|
||||
"repositories": [
|
||||
{
|
||||
|
|
48
composer.lock
generated
48
composer.lock
generated
|
@ -4,7 +4,7 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "7d6dee6e449da931e8fe209e61b2e78e",
|
||||
"content-hash": "c9e0a9eacc23d884012042eeab01cc8b",
|
||||
"packages": [
|
||||
{
|
||||
"name": "asika/simple-console",
|
||||
|
@ -1133,6 +1133,52 @@
|
|||
],
|
||||
"time": "2017-07-19T15:11:19+00:00"
|
||||
},
|
||||
{
|
||||
"name": "mattwright/urlresolver",
|
||||
"version": "2.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/mattwright/URLResolver.php.git",
|
||||
"reference": "416039192cb6d9158bdacd68349bceff8739b857"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/mattwright/URLResolver.php/zipball/416039192cb6d9158bdacd68349bceff8739b857",
|
||||
"reference": "416039192cb6d9158bdacd68349bceff8739b857",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-curl": "*",
|
||||
"ext-mbstring": "*",
|
||||
"php": ">=5.3"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"mattwright\\": "."
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Matt Wright",
|
||||
"email": "mw@mattwright.com"
|
||||
}
|
||||
],
|
||||
"description": "PHP class that attempts to resolve URLs to a final, canonical link.",
|
||||
"homepage": "https://github.com/mattwright/URLResolver.php",
|
||||
"keywords": [
|
||||
"canonical",
|
||||
"link",
|
||||
"redirect",
|
||||
"resolve",
|
||||
"url"
|
||||
],
|
||||
"time": "2019-01-18T00:59:34+00:00"
|
||||
},
|
||||
{
|
||||
"name": "michelf/php-markdown",
|
||||
"version": "1.9.0",
|
||||
|
|
|
@ -10,6 +10,7 @@ use Friendica\Network\IHTTPClient;
|
|||
use Friendica\Util\Profiler;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\RequestOptions;
|
||||
use mattwright\URLResolver;
|
||||
use Psr\Http\Message\RequestInterface;
|
||||
use Psr\Http\Message\ResponseInterface;
|
||||
use Psr\Http\Message\UriInterface;
|
||||
|
@ -85,6 +86,13 @@ class HTTPClientFactory extends BaseFactory
|
|||
],
|
||||
]);
|
||||
|
||||
return new HTTPClient($logger, $this->profiler, $this->config, $userAgent, $guzzle);
|
||||
$resolver = new URLResolver();
|
||||
$resolver->setUserAgent($userAgent);
|
||||
$resolver->setMaxRedirects(10);
|
||||
$resolver->setRequestTimeout(10);
|
||||
// if the file is too large then exit
|
||||
$resolver->setMaxResponseDataSize(1000000);
|
||||
|
||||
return new HTTPClient($logger, $this->profiler, $guzzle, $resolver);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,9 +21,6 @@
|
|||
|
||||
namespace Friendica\Network;
|
||||
|
||||
use DOMDocument;
|
||||
use DomXPath;
|
||||
use Friendica\Core\Config\IConfig;
|
||||
use Friendica\Core\System;
|
||||
use Friendica\Util\Network;
|
||||
use Friendica\Util\Profiler;
|
||||
|
@ -32,6 +29,7 @@ use GuzzleHttp\Cookie\FileCookieJar;
|
|||
use GuzzleHttp\Exception\RequestException;
|
||||
use GuzzleHttp\Exception\TransferException;
|
||||
use GuzzleHttp\RequestOptions;
|
||||
use mattwright\URLResolver;
|
||||
use Psr\Http\Message\ResponseInterface;
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
|
@ -44,20 +42,17 @@ class HTTPClient implements IHTTPClient
|
|||
private $logger;
|
||||
/** @var Profiler */
|
||||
private $profiler;
|
||||
/** @var IConfig */
|
||||
private $config;
|
||||
/** @var string */
|
||||
private $userAgent;
|
||||
/** @var Client */
|
||||
private $client;
|
||||
/** @var URLResolver */
|
||||
private $resolver;
|
||||
|
||||
public function __construct(LoggerInterface $logger, Profiler $profiler, IConfig $config, string $userAgent, Client $client)
|
||||
public function __construct(LoggerInterface $logger, Profiler $profiler, Client $client, URLResolver $resolver)
|
||||
{
|
||||
$this->logger = $logger;
|
||||
$this->profiler = $profiler;
|
||||
$this->config = $config;
|
||||
$this->userAgent = $userAgent;
|
||||
$this->client = $client;
|
||||
$this->resolver = $resolver;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -97,6 +92,11 @@ class HTTPClient implements IHTTPClient
|
|||
return CurlResult::createErrorCurl($url);
|
||||
}
|
||||
|
||||
if (Network::isRedirectBlocked($url)) {
|
||||
$this->logger->info('Domain should not be redirected.', ['url' => $url]);
|
||||
return CurlResult::createErrorCurl($url);
|
||||
}
|
||||
|
||||
$conf = [];
|
||||
|
||||
if (!empty($opts['cookiejar'])) {
|
||||
|
@ -197,10 +197,12 @@ class HTTPClient implements IHTTPClient
|
|||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false)
|
||||
public function finalUrl(string $url)
|
||||
{
|
||||
$this->profiler->startRecording('network');
|
||||
|
||||
if (Network::isLocalLink($url)) {
|
||||
$this->logger->info('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
|
||||
$this->logger->debug('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
|
||||
}
|
||||
|
||||
if (Network::isUrlBlocked($url)) {
|
||||
|
@ -215,104 +217,19 @@ class HTTPClient implements IHTTPClient
|
|||
|
||||
$url = Network::stripTrackingQueryParams($url);
|
||||
|
||||
if ($depth > 10) {
|
||||
return $url;
|
||||
}
|
||||
|
||||
$url = trim($url, "'");
|
||||
|
||||
$this->profiler->startRecording('network');
|
||||
// Designate a temporary file that will store cookies during the session.
|
||||
// Some websites test the browser for cookie support, so this enhances results.
|
||||
$this->resolver->setCookieJar(tempnam(get_temppath() , 'url_resolver-'));
|
||||
|
||||
$ch = curl_init();
|
||||
curl_setopt($ch, CURLOPT_URL, $url);
|
||||
curl_setopt($ch, CURLOPT_HEADER, 1);
|
||||
curl_setopt($ch, CURLOPT_NOBODY, 1);
|
||||
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
|
||||
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
|
||||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
||||
curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
|
||||
$urlResult = $this->resolver->resolveURL($url);
|
||||
|
||||
curl_exec($ch);
|
||||
$curl_info = @curl_getinfo($ch);
|
||||
$http_code = $curl_info['http_code'];
|
||||
curl_close($ch);
|
||||
|
||||
$this->profiler->stopRecording();
|
||||
|
||||
if ($http_code == 0) {
|
||||
return $url;
|
||||
if ($urlResult->didErrorOccur()) {
|
||||
throw new TransferException($urlResult->getErrorMessageString());
|
||||
}
|
||||
|
||||
if (in_array($http_code, ['301', '302'])) {
|
||||
if (!empty($curl_info['redirect_url'])) {
|
||||
return $this->finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody);
|
||||
} elseif (!empty($curl_info['location'])) {
|
||||
return $this->finalUrl($curl_info['location'], ++$depth, $fetchbody);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for redirects in the meta elements of the body if there are no redirects in the header.
|
||||
if (!$fetchbody) {
|
||||
return $this->finalUrl($url, ++$depth, true);
|
||||
}
|
||||
|
||||
// if the file is too large then exit
|
||||
if ($curl_info["download_content_length"] > 1000000) {
|
||||
return $url;
|
||||
}
|
||||
|
||||
// if it isn't a HTML file then exit
|
||||
if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) {
|
||||
return $url;
|
||||
}
|
||||
|
||||
$this->profiler->startRecording('network');
|
||||
|
||||
$ch = curl_init();
|
||||
curl_setopt($ch, CURLOPT_URL, $url);
|
||||
curl_setopt($ch, CURLOPT_HEADER, 0);
|
||||
curl_setopt($ch, CURLOPT_NOBODY, 0);
|
||||
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
|
||||
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
|
||||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
||||
curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
|
||||
|
||||
$body = curl_exec($ch);
|
||||
curl_close($ch);
|
||||
|
||||
$this->profiler->stopRecording();
|
||||
|
||||
if (trim($body) == "") {
|
||||
return $url;
|
||||
}
|
||||
|
||||
// Check for redirect in meta elements
|
||||
$doc = new DOMDocument();
|
||||
@$doc->loadHTML($body);
|
||||
|
||||
$xpath = new DomXPath($doc);
|
||||
|
||||
$list = $xpath->query("//meta[@content]");
|
||||
foreach ($list as $node) {
|
||||
$attr = [];
|
||||
if ($node->attributes->length) {
|
||||
foreach ($node->attributes as $attribute) {
|
||||
$attr[$attribute->name] = $attribute->value;
|
||||
}
|
||||
}
|
||||
|
||||
if (@$attr["http-equiv"] == 'refresh') {
|
||||
$path = $attr["content"];
|
||||
$pathinfo = explode(";", $path);
|
||||
foreach ($pathinfo as $value) {
|
||||
if (substr(strtolower($value), 0, 4) == "url=") {
|
||||
return $this->finalUrl(substr($value, 4), ++$depth);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $url;
|
||||
return $urlResult->getURL();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -104,14 +104,10 @@ interface IHTTPClient
|
|||
* through HTTP code or meta refresh tags. Stops after 10 redirections.
|
||||
*
|
||||
* @param string $url A user-submitted URL
|
||||
* @param int $depth The current redirection recursion level (internal)
|
||||
* @param bool $fetchbody Whether to fetch the body or not after the HEAD requests
|
||||
*
|
||||
* @return string A canonical URL
|
||||
* @throws \Friendica\Network\HTTPException\InternalServerErrorException
|
||||
* @see ParseUrl::getSiteinfo
|
||||
*
|
||||
* @todo Remove the $fetchbody parameter that generates an extraneous HEAD request
|
||||
*/
|
||||
public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false);
|
||||
public function finalUrl(string $url);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue