From 0ee5bf55b14af19227c8843f900bd1dd00e08a99 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Mon, 6 Jun 2022 01:52:21 -0400 Subject: [PATCH] Replace direct curl uses with Guzzle HTTP client - Add http dependency with custom User Agent - Simplify profile poll to remove the second profile call to check availability - Remove obsolete Network::fetchURL and Network::testURL --- src/classes/Pollers/Directory.php | 38 ++------------ src/classes/Pollers/Profile.php | 31 ++++++----- src/classes/Pollers/Server.php | 85 +++++++++++++------------------ src/classes/Utils/Network.php | 50 ------------------ src/classes/Utils/Scrape.php | 10 ++-- src/dependencies.php | 14 ++++- 6 files changed, 76 insertions(+), 152 deletions(-) diff --git a/src/classes/Pollers/Directory.php b/src/classes/Pollers/Directory.php index 53c0f9e..68a9f38 100644 --- a/src/classes/Pollers/Directory.php +++ b/src/classes/Pollers/Directory.php @@ -10,9 +10,9 @@ use Friendica\Directory\Utils\Network; class Directory { /** - * @var \Atlas\Pdo\Connection + * @var \GuzzleHttp\ClientInterface */ - private $atlas; + private $http; /** * @var \Friendica\Directory\Models\ProfilePollQueue */ @@ -30,12 +30,12 @@ class Directory ]; public function __construct( - \Atlas\Pdo\Connection $atlas, + \GuzzleHttp\ClientInterface $http, \Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel, \Psr\Log\LoggerInterface $logger, array $settings) { - $this->atlas = $atlas; + $this->http = $http; $this->profilePollQueueModel = $profilePollQueueModel; $this->logger = $logger; $this->settings = array_merge($this->settings, $settings); @@ -82,35 +82,7 @@ class Directory $path = '/sync/pull/since/' . $last_polled; } - //Prepare the CURL call. - $handle = curl_init(); - $options = array( - //Timeouts - CURLOPT_TIMEOUT => max($this->settings['probe_timeout'], 1), //Minimum of 1 second timeout. - CURLOPT_CONNECTTIMEOUT => 1, - //Redirecting - CURLOPT_FOLLOWLOCATION => true, - CURLOPT_MAXREDIRS => 8, - //SSL - CURLOPT_SSL_VERIFYPEER => true, - // CURLOPT_VERBOSE => true, - // CURLOPT_CERTINFO => true, - CURLOPT_SSL_VERIFYHOST => 2, - CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS, - //Basic request - CURLOPT_USERAGENT => Network::USER_AGENT, - CURLOPT_RETURNTRANSFER => true, - CURLOPT_URL => $directory_url . $path - ); - curl_setopt_array($handle, $options); - - $this->logger->info('Pulling profiles from directory URL: ' . $directory_url . $path); - - //Probe the site. - $pull_data = curl_exec($handle); - - //Done with CURL now. - curl_close($handle); + $pull_data = $this->http->get($directory_url . $path, ['timeout' => max($this->settings['probe_timeout'], 1)])->getBody()->getContents(); $data = json_decode($pull_data, true); diff --git a/src/classes/Pollers/Profile.php b/src/classes/Pollers/Profile.php index 9dd546f..f9d66ef 100644 --- a/src/classes/Pollers/Profile.php +++ b/src/classes/Pollers/Profile.php @@ -18,6 +18,11 @@ class Profile */ private $atlas; + /** + * @var \GuzzleHttp\ClientInterface + */ + private $http; + /** * @var \Friendica\Directory\Models\Server */ @@ -43,6 +48,7 @@ class Profile public function __construct( \Atlas\Pdo\Connection $atlas, + \GuzzleHttp\ClientInterface $http, \Friendica\Directory\Models\Server $serverModel, \Friendica\Directory\Models\Profile $profileModel, \Psr\Log\LoggerInterface $logger, @@ -50,6 +56,7 @@ class Profile ) { $this->atlas = $atlas; + $this->http = $http; $this->serverModel = $serverModel; $this->profileModel = $profileModel; $this->logger = $logger; @@ -122,25 +129,23 @@ class Profile ); } - //Skip the profile scrape? - $noscrape = $server['noscrape_url']; + $available = false; $params = []; - if ($noscrape) { + + //Skip the profile scrape? + if ($server['noscrape_url']) { $this->logger->debug('Calling ' . $server['noscrape_url'] . '/' . $username); - $params = \Friendica\Directory\Utils\Scrape::retrieveNoScrapeData($server['noscrape_url'] . '/' . $username); - $noscrape = !!$params; //If the result was false, do a scrape after all. + $params = \Friendica\Directory\Utils\Scrape::retrieveNoScrapeData($this->http, $server['noscrape_url'] . '/' . $username); + $available = !!$params; //If the result was false, do a scrape after all. } - $available = true; - - if ($noscrape) { - $available = Network::testURL($profile_uri); - $this->logger->debug('Testing ' . $profile_uri . ': ' . ($available?'Success':'Failure')); - } else { + if (!$available) { $this->logger->notice('Parsing profile page ' . $profile_uri); - $params = \Friendica\Directory\Utils\Scrape::retrieveProfileData($profile_uri); + $params = \Friendica\Directory\Utils\Scrape::retrieveProfileData($this->http, $profile_uri); $params['language'] = $server['language']; + + $available = !empty($params['fn']); } // Empty result is due to an offline site. @@ -301,7 +306,7 @@ class Profile $status = false; if ($profile_id) { - $img_str = \Friendica\Directory\Utils\Network::fetchURL($params['photo'], true); + $img_str = $this->http->get($params['photo'])->getBody()->getContents(); $img = new \Friendica\Directory\Utils\Photo($img_str); if ($img->getImage()) { $img->scaleImageSquare(80); diff --git a/src/classes/Pollers/Server.php b/src/classes/Pollers/Server.php index 96ef29b..3be6131 100644 --- a/src/classes/Pollers/Server.php +++ b/src/classes/Pollers/Server.php @@ -3,7 +3,8 @@ namespace Friendica\Directory\Pollers; use ByJG\Util\WebRequest; -use Friendica\Directory\Utils\Network; +use GuzzleHttp\Exception\RequestException; +use GuzzleHttp\TransferStats; /** * @author Hypolite Petovan @@ -14,6 +15,10 @@ class Server * @var \Atlas\Pdo\Connection */ private $atlas; + /** + * @var \GuzzleHttp\ClientInterface + */ + private $http; /** * @var \Friendica\Directory\Models\ProfilePollQueue */ @@ -41,6 +46,7 @@ class Server public function __construct( \Atlas\Pdo\Connection $atlas, + \GuzzleHttp\ClientInterface $http, \Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel, \Friendica\Directory\Models\Server $serverModel, \Psr\SimpleCache\CacheInterface $simplecache, @@ -48,6 +54,7 @@ class Server array $settings) { $this->atlas = $atlas; + $this->http = $http; $this->profilePollQueueModel = $profilePollQueueModel; $this->serverModel = $serverModel; $this->simplecache = $simplecache; @@ -247,65 +254,41 @@ class Server private function getProbeResult(string $base_url): array { - //Prepare the CURL call. - $handle = curl_init(); - $options = array( - //Timeouts - CURLOPT_TIMEOUT => max($this->settings['probe_timeout'], 1), //Minimum of 1 second timeout. - CURLOPT_CONNECTTIMEOUT => 1, - //Redirecting - CURLOPT_FOLLOWLOCATION => true, - CURLOPT_MAXREDIRS => 8, - //SSL - CURLOPT_SSL_VERIFYPEER => true, - // CURLOPT_VERBOSE => true, - // CURLOPT_CERTINFO => true, - CURLOPT_SSL_VERIFYHOST => 2, - CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS, - //Basic request - CURLOPT_USERAGENT => Network::USER_AGENT, - CURLOPT_RETURNTRANSFER => true, - CURLOPT_URL => $base_url . '/friendica/json' - ); - curl_setopt_array($handle, $options); + $curl_info = null; - //Probe the site. - $probe_start = microtime(true); - $probe_data = curl_exec($handle); - $probe_end = microtime(true); + $options = [ + 'timeout' => max($this->settings['probe_timeout'], 1), + 'on_stats' => function (TransferStats $transferStats) use (&$curl_info) { + $curl_info = $transferStats->getHandlerStats(); + } + ]; - //Check for SSL problems. - $curl_statuscode = curl_errno($handle); - $sslcert_issues = in_array($curl_statuscode, array( - 60, //Could not authenticate certificate with known CA's - 83 //Issuer check failed - )); + $sslcert_issues = false; - //When it's the certificate that doesn't work. - if ($sslcert_issues) { - //Probe again, without strict SSL. - $options[CURLOPT_SSL_VERIFYPEER] = false; - - //Replace the handle. - curl_close($handle); - $handle = curl_init(); - curl_setopt_array($handle, $options); - - //Probe. + try { + //Probe the site. $probe_start = microtime(true); - $probe_data = curl_exec($handle); + $probe_data = $this->http->get($base_url . '/friendica/json', $options)->getBody()->getContents(); $probe_end = microtime(true); + } catch (RequestException $e) { + if (!in_array($e->getHandlerContext()['errno'], [ + 60, //Could not authenticate certificate with known CA's + 83 //Issuer check failed + ])) { + throw $e; + } - //Store new status. - $curl_statuscode = curl_errno($handle); + $sslcert_issues = true; + + //When it's the certificate that doesn't work, we probe again without strict SSL. + $options['verify'] = false; + + $probe_start = microtime(true); + $probe_data = $this->http->get($base_url . '/friendica/json', $options)->getBody()->getContents(); + $probe_end = microtime(true); } - //Gather more meta. $time = round(($probe_end - $probe_start) * 1000); - $curl_info = curl_getinfo($handle); - - //Done with CURL now. - curl_close($handle); try { $data = json_decode($probe_data, true); diff --git a/src/classes/Utils/Network.php b/src/classes/Utils/Network.php index 14b0c13..1542fc8 100644 --- a/src/classes/Utils/Network.php +++ b/src/classes/Utils/Network.php @@ -15,56 +15,6 @@ namespace Friendica\Directory\Utils; */ class Network { - const USER_AGENT = 'friendica-directory-probe-1.0'; - - public static function fetchURL(string $url, bool $binary = false, int $timeout = 20): string - { - $ch = curl_init($url); - if (!$ch) { - return false; - } - - curl_setopt($ch, CURLOPT_HEADER, 0); - curl_setopt($ch, CURLOPT_TIMEOUT, max($timeout, 1)); //Minimum of 1 second timeout. - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_MAXREDIRS, 8); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT); - if ($binary) { - curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1); - } - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - $s = curl_exec($ch); - curl_close($ch); - return $s; - } - - public static function testURL(string $url, int $timeout = 20): bool - { - $ch = curl_init($url); - if (!$ch) { - return false; - } - - curl_setopt($ch, CURLOPT_HEADER , 0); - curl_setopt($ch, CURLOPT_TIMEOUT , max($timeout, 1)); //Minimum of 1 second timeout. - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_MAXREDIRS , 8); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true); - curl_setopt($ch, CURLOPT_NOBODY , true); - curl_setopt($ch, CURLOPT_USERAGENT , self::USER_AGENT); - - curl_exec($ch); - - $responseCode = intval(curl_getinfo($ch, CURLINFO_RESPONSE_CODE)); - - $testSuccess = curl_errno($ch) === 0 && $responseCode < 400; - - curl_close($ch); - - return $testSuccess; - } - /** * Check if a hostname is public and non-reserved * diff --git a/src/classes/Utils/Scrape.php b/src/classes/Utils/Scrape.php index 2189b58..f9f31ac 100644 --- a/src/classes/Utils/Scrape.php +++ b/src/classes/Utils/Scrape.php @@ -2,6 +2,8 @@ namespace Friendica\Directory\Utils; +use GuzzleHttp\ClientInterface; + /** * @author Hypolite Petovan */ @@ -12,10 +14,10 @@ class Scrape * @param string $url * @return array|false */ - public static function retrieveNoScrapeData(string $url) + public static function retrieveNoScrapeData(ClientInterface $http, string $url) { $submit_noscrape_start = microtime(true); - $data = Network::fetchURL($url); + $data = $http->get($url)->getBody()->getContents(); $submit_noscrape_request_end = microtime(true); if (empty($data)) { @@ -42,7 +44,7 @@ class Scrape return $params; } - public static function retrieveProfileData(string $url, int $max_nodes = 3500): array + public static function retrieveProfileData(ClientInterface $http, string $url, int $max_nodes = 3500): array { $minNodes = 100; //Lets do at least 100 nodes per type. @@ -56,7 +58,7 @@ class Scrape $scrape_start = microtime(true); $params = []; - $html = Network::fetchURL($url, false, $timeout); + $html = $http->get($url, ['timeout' => $timeout])->getBody()->getContents();; $scrape_fetch_end = microtime(true); diff --git a/src/dependencies.php b/src/dependencies.php index 162ace8..ec6c5a0 100644 --- a/src/dependencies.php +++ b/src/dependencies.php @@ -89,6 +89,16 @@ $container['migration'] = function (ContainerInterface $c): ByJG\DbMigration\Mig return $migration; }; +$container['http'] = function (ContainerInterface $c): GuzzleHttp\ClientInterface { + $version = file_get_contents(__DIR__ . '/../VERSION'); + + if (!$version || !preg_match('/^\s*\d\.\d\.\d\s*$/', $version)) { + $version = '0.0.0'; + } + + return new GuzzleHttp\Client(['timeout' => 20, 'headers' => ['User-Agent' => 'FriendicaDirectory/' . trim($version) . ' ' . \GuzzleHttp\default_user_agent()]]); +}; + // Internal Dependency Injection $container[\Friendica\Directory\Models\Profile::class] = function (ContainerInterface $c): Friendica\Directory\Models\Profile { @@ -106,7 +116,7 @@ $container[\Friendica\Directory\Models\Server::class] = function (ContainerInter $container[\Friendica\Directory\Pollers\Directory::class] = function (ContainerInterface $c): Friendica\Directory\Pollers\Directory { $settings = $c->get('settings')['poller']; return new Friendica\Directory\Pollers\Directory( - $c->get('atlas'), + $c->get('http'), $c->get(\Friendica\Directory\Models\ProfilePollQueue::class), $c->get('logger'), $settings ?: [] @@ -117,6 +127,7 @@ $container[\Friendica\Directory\Pollers\Profile::class] = function (ContainerInt $settings = $c->get('settings')['poller']; return new Friendica\Directory\Pollers\Profile( $c->get('atlas'), + $c->get('http'), $c->get(\Friendica\Directory\Models\Server::class), $c->get(\Friendica\Directory\Models\Profile::class), $c->get('logger'), @@ -128,6 +139,7 @@ $container[\Friendica\Directory\Pollers\Server::class] = function (ContainerInte $settings = $c->get('settings')['poller']; return new Friendica\Directory\Pollers\Server( $c->get('atlas'), + $c->get('http'), $c->get(\Friendica\Directory\Models\ProfilePollQueue::class), $c->get(\Friendica\Directory\Models\Server::class), $c->get('simplecache'),