From 8385ee7a612b33c2e2af2533d922f4a8a21e6dd4 Mon Sep 17 00:00:00 2001
From: Philipp <admin@philipp.info>
Date: Mon, 23 Aug 2021 14:28:25 +0200
Subject: [PATCH] Use mattwright/urlresolver for HTTPClient::finalUrl()

---
 composer.json                     |   3 +-
 composer.lock                     |  48 ++++++++++-
 src/Factory/HTTPClientFactory.php |  10 ++-
 src/Network/HTTPClient.php        | 131 ++++++------------------------
 src/Network/IHTTPClient.php       |   6 +-
 5 files changed, 83 insertions(+), 115 deletions(-)

diff --git a/composer.json b/composer.json
index 2dd5dec7b..bf0559254 100644
--- a/composer.json
+++ b/composer.json
@@ -69,7 +69,8 @@
 		"npm-asset/perfect-scrollbar": "0.6.16",
 		"npm-asset/textcomplete": "^0.18.2",
 		"npm-asset/typeahead.js": "^0.11.1",
-		"minishlink/web-push": "^6.0"
+		"minishlink/web-push": "^6.0",
+		"mattwright/urlresolver": "^2.0"
 	},
 	"repositories": [
 		{
diff --git a/composer.lock b/composer.lock
index 5e8f1a20a..906a681e4 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4,7 +4,7 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
         "This file is @generated automatically"
     ],
-    "content-hash": "7d6dee6e449da931e8fe209e61b2e78e",
+    "content-hash": "c9e0a9eacc23d884012042eeab01cc8b",
     "packages": [
         {
             "name": "asika/simple-console",
@@ -1133,6 +1133,52 @@
             ],
             "time": "2017-07-19T15:11:19+00:00"
         },
+        {
+            "name": "mattwright/urlresolver",
+            "version": "2.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/mattwright/URLResolver.php.git",
+                "reference": "416039192cb6d9158bdacd68349bceff8739b857"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/mattwright/URLResolver.php/zipball/416039192cb6d9158bdacd68349bceff8739b857",
+                "reference": "416039192cb6d9158bdacd68349bceff8739b857",
+                "shasum": ""
+            },
+            "require": {
+                "ext-curl": "*",
+                "ext-mbstring": "*",
+                "php": ">=5.3"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "mattwright\\": "."
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Matt Wright",
+                    "email": "mw@mattwright.com"
+                }
+            ],
+            "description": "PHP class that attempts to resolve URLs to a final, canonical link.",
+            "homepage": "https://github.com/mattwright/URLResolver.php",
+            "keywords": [
+                "canonical",
+                "link",
+                "redirect",
+                "resolve",
+                "url"
+            ],
+            "time": "2019-01-18T00:59:34+00:00"
+        },
         {
             "name": "michelf/php-markdown",
             "version": "1.9.0",
diff --git a/src/Factory/HTTPClientFactory.php b/src/Factory/HTTPClientFactory.php
index 636f8a46d..c1cb47541 100644
--- a/src/Factory/HTTPClientFactory.php
+++ b/src/Factory/HTTPClientFactory.php
@@ -10,6 +10,7 @@ use Friendica\Network\IHTTPClient;
 use Friendica\Util\Profiler;
 use GuzzleHttp\Client;
 use GuzzleHttp\RequestOptions;
+use mattwright\URLResolver;
 use Psr\Http\Message\RequestInterface;
 use Psr\Http\Message\ResponseInterface;
 use Psr\Http\Message\UriInterface;
@@ -85,6 +86,13 @@ class HTTPClientFactory extends BaseFactory
 			],
 		]);
 
-		return new HTTPClient($logger, $this->profiler, $this->config, $userAgent, $guzzle);
+		$resolver = new URLResolver();
+		$resolver->setUserAgent($userAgent);
+		$resolver->setMaxRedirects(10);
+		$resolver->setRequestTimeout(10);
+		// Cap the response data handled per request at 1,000,000 bytes (presumably the resolver stops reading beyond that - confirm)
+		$resolver->setMaxResponseDataSize(1000000);
+
+		return new HTTPClient($logger, $this->profiler, $guzzle, $resolver);
 	}
 }
diff --git a/src/Network/HTTPClient.php b/src/Network/HTTPClient.php
index 000d3c76a..d83b805df 100644
--- a/src/Network/HTTPClient.php
+++ b/src/Network/HTTPClient.php
@@ -21,9 +21,6 @@
 
 namespace Friendica\Network;
 
-use DOMDocument;
-use DomXPath;
-use Friendica\Core\Config\IConfig;
 use Friendica\Core\System;
 use Friendica\Util\Network;
 use Friendica\Util\Profiler;
@@ -32,6 +29,7 @@ use GuzzleHttp\Cookie\FileCookieJar;
 use GuzzleHttp\Exception\RequestException;
 use GuzzleHttp\Exception\TransferException;
 use GuzzleHttp\RequestOptions;
+use mattwright\URLResolver;
 use Psr\Http\Message\ResponseInterface;
 use Psr\Log\LoggerInterface;
 
@@ -44,20 +42,17 @@ class HTTPClient implements IHTTPClient
 	private $logger;
 	/** @var Profiler */
 	private $profiler;
-	/** @var IConfig */
-	private $config;
-	/** @var string */
-	private $userAgent;
 	/** @var Client */
 	private $client;
+	/** @var URLResolver */
+	private $resolver;
 
-	public function __construct(LoggerInterface $logger, Profiler $profiler, IConfig $config, string $userAgent, Client $client)
+	public function __construct(LoggerInterface $logger, Profiler $profiler, Client $client, URLResolver $resolver)
 	{
-		$this->logger    = $logger;
-		$this->profiler  = $profiler;
-		$this->config    = $config;
-		$this->userAgent = $userAgent;
-		$this->client    = $client;
+		$this->logger   = $logger;
+		$this->profiler = $profiler;
+		$this->client   = $client;
+		$this->resolver = $resolver;
 	}
 
 	/**
@@ -97,6 +92,11 @@ class HTTPClient implements IHTTPClient
 			return CurlResult::createErrorCurl($url);
 		}
 
+		if (Network::isRedirectBlocked($url)) {
+			$this->logger->info('Domain should not be redirected.', ['url' => $url]);
+			return CurlResult::createErrorCurl($url);
+		}
+
 		$conf = [];
 
 		if (!empty($opts['cookiejar'])) {
@@ -197,10 +197,10 @@ class HTTPClient implements IHTTPClient
 	/**
 	 * {@inheritDoc}
 	 */
-	public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false)
+	public function finalUrl(string $url)
 	{
 		if (Network::isLocalLink($url)) {
-			$this->logger->info('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
+			$this->logger->debug('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
 		}
 
 		if (Network::isUrlBlocked($url)) {
@@ -215,104 +215,24 @@ class HTTPClient implements IHTTPClient
 
 		$url = Network::stripTrackingQueryParams($url);
 
-		if ($depth > 10) {
-			return $url;
-		}
-
 		$url = trim($url, "'");
 
 		$this->profiler->startRecording('network');
 
-		$ch = curl_init();
-		curl_setopt($ch, CURLOPT_URL, $url);
-		curl_setopt($ch, CURLOPT_HEADER, 1);
-		curl_setopt($ch, CURLOPT_NOBODY, 1);
-		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
-		curl_setopt($ch, CURLOPT_TIMEOUT, 10);
-		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-		curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
+		// Designate a temporary file that will store cookies during the session.
+		// Some websites test the browser for cookie support, so this enhances results.
+		$this->resolver->setCookieJar(tempnam(get_temppath() , 'url_resolver-'));
+
+		$urlResult = $this->resolver->resolveURL($url);
 
-		curl_exec($ch);
-		$curl_info = @curl_getinfo($ch);
-		$http_code = $curl_info['http_code'];
-		curl_close($ch);
-
+		// Stop the 'network' recording before the result is inspected, so the error path below cannot leave it open.
 		$this->profiler->stopRecording();
 
-		if ($http_code == 0) {
-			return $url;
+		if ($urlResult->didErrorOccur()) {
+			throw new TransferException($urlResult->getErrorMessageString());
 		}
 
-		if (in_array($http_code, ['301', '302'])) {
-			if (!empty($curl_info['redirect_url'])) {
-				return $this->finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody);
-			} elseif (!empty($curl_info['location'])) {
-				return $this->finalUrl($curl_info['location'], ++$depth, $fetchbody);
-			}
-		}
-
-		// Check for redirects in the meta elements of the body if there are no redirects in the header.
-		if (!$fetchbody) {
-			return $this->finalUrl($url, ++$depth, true);
-		}
-
-		// if the file is too large then exit
-		if ($curl_info["download_content_length"] > 1000000) {
-			return $url;
-		}
-
-		// if it isn't a HTML file then exit
-		if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) {
-			return $url;
-		}
-
-		$this->profiler->startRecording('network');
-
-		$ch = curl_init();
-		curl_setopt($ch, CURLOPT_URL, $url);
-		curl_setopt($ch, CURLOPT_HEADER, 0);
-		curl_setopt($ch, CURLOPT_NOBODY, 0);
-		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
-		curl_setopt($ch, CURLOPT_TIMEOUT, 10);
-		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-		curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
-
-		$body = curl_exec($ch);
-		curl_close($ch);
-
-		$this->profiler->stopRecording();
-
-		if (trim($body) == "") {
-			return $url;
-		}
-
-		// Check for redirect in meta elements
-		$doc = new DOMDocument();
-		@$doc->loadHTML($body);
-
-		$xpath = new DomXPath($doc);
-
-		$list = $xpath->query("//meta[@content]");
-		foreach ($list as $node) {
-			$attr = [];
-			if ($node->attributes->length) {
-				foreach ($node->attributes as $attribute) {
-					$attr[$attribute->name] = $attribute->value;
-				}
-			}
-
-			if (@$attr["http-equiv"] == 'refresh') {
-				$path = $attr["content"];
-				$pathinfo = explode(";", $path);
-				foreach ($pathinfo as $value) {
-					if (substr(strtolower($value), 0, 4) == "url=") {
-						return $this->finalUrl(substr($value, 4), ++$depth);
-					}
-				}
-			}
-		}
-
-		return $url;
+		return $urlResult->getURL();
 	}
 
 	/**
diff --git a/src/Network/IHTTPClient.php b/src/Network/IHTTPClient.php
index 8fa5285d2..180908eed 100644
--- a/src/Network/IHTTPClient.php
+++ b/src/Network/IHTTPClient.php
@@ -104,14 +104,10 @@ interface IHTTPClient
 	 * through HTTP code or meta refresh tags. Stops after 10 redirections.
 	 *
 	 * @param string $url       A user-submitted URL
-	 * @param int    $depth     The current redirection recursion level (internal)
-	 * @param bool   $fetchbody Wether to fetch the body or not after the HEAD requests
 	 *
 	 * @return string A canonical URL
 	 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 	 * @see   ParseUrl::getSiteinfo
-	 *
-	 * @todo  Remove the $fetchbody parameter that generates an extraneous HEAD request
 	 */
-	public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false);
+	public function finalUrl(string $url);
 }