Replace direct curl uses with Guzzle HTTP client

- Add an `http` container dependency (a Guzzle client) that sends a custom User-Agent header
- Simplify the profile poll by removing the second profile request that only checked availability
- Remove obsolete Network::fetchURL and Network::testURL
This commit is contained in:
Hypolite Petovan 2022-06-06 01:52:21 -04:00
parent 5829108c0e
commit 0ee5bf55b1
6 changed files with 76 additions and 152 deletions

View file

@ -10,9 +10,9 @@ use Friendica\Directory\Utils\Network;
class Directory class Directory
{ {
/** /**
* @var \Atlas\Pdo\Connection * @var \GuzzleHttp\ClientInterface
*/ */
private $atlas; private $http;
/** /**
* @var \Friendica\Directory\Models\ProfilePollQueue * @var \Friendica\Directory\Models\ProfilePollQueue
*/ */
@ -30,12 +30,12 @@ class Directory
]; ];
public function __construct( public function __construct(
\Atlas\Pdo\Connection $atlas, \GuzzleHttp\ClientInterface $http,
\Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel, \Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel,
\Psr\Log\LoggerInterface $logger, \Psr\Log\LoggerInterface $logger,
array $settings) array $settings)
{ {
$this->atlas = $atlas; $this->http = $http;
$this->profilePollQueueModel = $profilePollQueueModel; $this->profilePollQueueModel = $profilePollQueueModel;
$this->logger = $logger; $this->logger = $logger;
$this->settings = array_merge($this->settings, $settings); $this->settings = array_merge($this->settings, $settings);
@ -82,35 +82,7 @@ class Directory
$path = '/sync/pull/since/' . $last_polled; $path = '/sync/pull/since/' . $last_polled;
} }
//Prepare the CURL call. $pull_data = $this->http->get($directory_url . $path, ['timeout' => max($this->settings['probe_timeout'], 1)])->getBody()->getContents();
$handle = curl_init();
$options = array(
//Timeouts
CURLOPT_TIMEOUT => max($this->settings['probe_timeout'], 1), //Minimum of 1 second timeout.
CURLOPT_CONNECTTIMEOUT => 1,
//Redirecting
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 8,
//SSL
CURLOPT_SSL_VERIFYPEER => true,
// CURLOPT_VERBOSE => true,
// CURLOPT_CERTINFO => true,
CURLOPT_SSL_VERIFYHOST => 2,
CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
//Basic request
CURLOPT_USERAGENT => Network::USER_AGENT,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_URL => $directory_url . $path
);
curl_setopt_array($handle, $options);
$this->logger->info('Pulling profiles from directory URL: ' . $directory_url . $path);
//Probe the site.
$pull_data = curl_exec($handle);
//Done with CURL now.
curl_close($handle);
$data = json_decode($pull_data, true); $data = json_decode($pull_data, true);

View file

@ -18,6 +18,11 @@ class Profile
*/ */
private $atlas; private $atlas;
/**
* @var \GuzzleHttp\ClientInterface
*/
private $http;
/** /**
* @var \Friendica\Directory\Models\Server * @var \Friendica\Directory\Models\Server
*/ */
@ -43,6 +48,7 @@ class Profile
public function __construct( public function __construct(
\Atlas\Pdo\Connection $atlas, \Atlas\Pdo\Connection $atlas,
\GuzzleHttp\ClientInterface $http,
\Friendica\Directory\Models\Server $serverModel, \Friendica\Directory\Models\Server $serverModel,
\Friendica\Directory\Models\Profile $profileModel, \Friendica\Directory\Models\Profile $profileModel,
\Psr\Log\LoggerInterface $logger, \Psr\Log\LoggerInterface $logger,
@ -50,6 +56,7 @@ class Profile
) )
{ {
$this->atlas = $atlas; $this->atlas = $atlas;
$this->http = $http;
$this->serverModel = $serverModel; $this->serverModel = $serverModel;
$this->profileModel = $profileModel; $this->profileModel = $profileModel;
$this->logger = $logger; $this->logger = $logger;
@ -122,25 +129,23 @@ class Profile
); );
} }
//Skip the profile scrape? $available = false;
$noscrape = $server['noscrape_url'];
$params = []; $params = [];
if ($noscrape) {
//Skip the profile scrape?
if ($server['noscrape_url']) {
$this->logger->debug('Calling ' . $server['noscrape_url'] . '/' . $username); $this->logger->debug('Calling ' . $server['noscrape_url'] . '/' . $username);
$params = \Friendica\Directory\Utils\Scrape::retrieveNoScrapeData($server['noscrape_url'] . '/' . $username); $params = \Friendica\Directory\Utils\Scrape::retrieveNoScrapeData($this->http, $server['noscrape_url'] . '/' . $username);
$noscrape = !!$params; //If the result was false, do a scrape after all. $available = !!$params; //If the result was false, do a scrape after all.
} }
$available = true; if (!$available) {
if ($noscrape) {
$available = Network::testURL($profile_uri);
$this->logger->debug('Testing ' . $profile_uri . ': ' . ($available?'Success':'Failure'));
} else {
$this->logger->notice('Parsing profile page ' . $profile_uri); $this->logger->notice('Parsing profile page ' . $profile_uri);
$params = \Friendica\Directory\Utils\Scrape::retrieveProfileData($profile_uri); $params = \Friendica\Directory\Utils\Scrape::retrieveProfileData($this->http, $profile_uri);
$params['language'] = $server['language']; $params['language'] = $server['language'];
$available = !empty($params['fn']);
} }
// Empty result is due to an offline site. // Empty result is due to an offline site.
@ -301,7 +306,7 @@ class Profile
$status = false; $status = false;
if ($profile_id) { if ($profile_id) {
$img_str = \Friendica\Directory\Utils\Network::fetchURL($params['photo'], true); $img_str = $this->http->get($params['photo'])->getBody()->getContents();
$img = new \Friendica\Directory\Utils\Photo($img_str); $img = new \Friendica\Directory\Utils\Photo($img_str);
if ($img->getImage()) { if ($img->getImage()) {
$img->scaleImageSquare(80); $img->scaleImageSquare(80);

View file

@ -3,7 +3,8 @@
namespace Friendica\Directory\Pollers; namespace Friendica\Directory\Pollers;
use ByJG\Util\WebRequest; use ByJG\Util\WebRequest;
use Friendica\Directory\Utils\Network; use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\TransferStats;
/** /**
* @author Hypolite Petovan <hypolite@mrpetovan.com> * @author Hypolite Petovan <hypolite@mrpetovan.com>
@ -14,6 +15,10 @@ class Server
* @var \Atlas\Pdo\Connection * @var \Atlas\Pdo\Connection
*/ */
private $atlas; private $atlas;
/**
* @var \GuzzleHttp\ClientInterface
*/
private $http;
/** /**
* @var \Friendica\Directory\Models\ProfilePollQueue * @var \Friendica\Directory\Models\ProfilePollQueue
*/ */
@ -41,6 +46,7 @@ class Server
public function __construct( public function __construct(
\Atlas\Pdo\Connection $atlas, \Atlas\Pdo\Connection $atlas,
\GuzzleHttp\ClientInterface $http,
\Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel, \Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel,
\Friendica\Directory\Models\Server $serverModel, \Friendica\Directory\Models\Server $serverModel,
\Psr\SimpleCache\CacheInterface $simplecache, \Psr\SimpleCache\CacheInterface $simplecache,
@ -48,6 +54,7 @@ class Server
array $settings) array $settings)
{ {
$this->atlas = $atlas; $this->atlas = $atlas;
$this->http = $http;
$this->profilePollQueueModel = $profilePollQueueModel; $this->profilePollQueueModel = $profilePollQueueModel;
$this->serverModel = $serverModel; $this->serverModel = $serverModel;
$this->simplecache = $simplecache; $this->simplecache = $simplecache;
@ -247,65 +254,41 @@ class Server
private function getProbeResult(string $base_url): array private function getProbeResult(string $base_url): array
{ {
//Prepare the CURL call. $curl_info = null;
$handle = curl_init();
$options = array(
//Timeouts
CURLOPT_TIMEOUT => max($this->settings['probe_timeout'], 1), //Minimum of 1 second timeout.
CURLOPT_CONNECTTIMEOUT => 1,
//Redirecting
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 8,
//SSL
CURLOPT_SSL_VERIFYPEER => true,
// CURLOPT_VERBOSE => true,
// CURLOPT_CERTINFO => true,
CURLOPT_SSL_VERIFYHOST => 2,
CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
//Basic request
CURLOPT_USERAGENT => Network::USER_AGENT,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_URL => $base_url . '/friendica/json'
);
curl_setopt_array($handle, $options);
$options = [
'timeout' => max($this->settings['probe_timeout'], 1),
'on_stats' => function (TransferStats $transferStats) use (&$curl_info) {
$curl_info = $transferStats->getHandlerStats();
}
];
$sslcert_issues = false;
try {
//Probe the site. //Probe the site.
$probe_start = microtime(true); $probe_start = microtime(true);
$probe_data = curl_exec($handle); $probe_data = $this->http->get($base_url . '/friendica/json', $options)->getBody()->getContents();
$probe_end = microtime(true); $probe_end = microtime(true);
} catch (RequestException $e) {
//Check for SSL problems. if (!in_array($e->getHandlerContext()['errno'], [
$curl_statuscode = curl_errno($handle);
$sslcert_issues = in_array($curl_statuscode, array(
60, //Could not authenticate certificate with known CA's 60, //Could not authenticate certificate with known CA's
83 //Issuer check failed 83 //Issuer check failed
)); ])) {
throw $e;
//When it's the certificate that doesn't work.
if ($sslcert_issues) {
//Probe again, without strict SSL.
$options[CURLOPT_SSL_VERIFYPEER] = false;
//Replace the handle.
curl_close($handle);
$handle = curl_init();
curl_setopt_array($handle, $options);
//Probe.
$probe_start = microtime(true);
$probe_data = curl_exec($handle);
$probe_end = microtime(true);
//Store new status.
$curl_statuscode = curl_errno($handle);
} }
//Gather more meta. $sslcert_issues = true;
$time = round(($probe_end - $probe_start) * 1000);
$curl_info = curl_getinfo($handle);
//Done with CURL now. //When it's the certificate that doesn't work, we probe again without strict SSL.
curl_close($handle); $options['verify'] = false;
$probe_start = microtime(true);
$probe_data = $this->http->get($base_url . '/friendica/json', $options)->getBody()->getContents();
$probe_end = microtime(true);
}
$time = round(($probe_end - $probe_start) * 1000);
try { try {
$data = json_decode($probe_data, true); $data = json_decode($probe_data, true);

View file

@ -15,56 +15,6 @@ namespace Friendica\Directory\Utils;
*/ */
class Network class Network
{ {
const USER_AGENT = 'friendica-directory-probe-1.0';
public static function fetchURL(string $url, bool $binary = false, int $timeout = 20): string
{
$ch = curl_init($url);
if (!$ch) {
return false;
}
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_TIMEOUT, max($timeout, 1)); //Minimum of 1 second timeout.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 8);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT);
if ($binary) {
curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1);
}
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
$s = curl_exec($ch);
curl_close($ch);
return $s;
}
public static function testURL(string $url, int $timeout = 20): bool
{
$ch = curl_init($url);
if (!$ch) {
return false;
}
curl_setopt($ch, CURLOPT_HEADER , 0);
curl_setopt($ch, CURLOPT_TIMEOUT , max($timeout, 1)); //Minimum of 1 second timeout.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS , 8);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($ch, CURLOPT_NOBODY , true);
curl_setopt($ch, CURLOPT_USERAGENT , self::USER_AGENT);
curl_exec($ch);
$responseCode = intval(curl_getinfo($ch, CURLINFO_RESPONSE_CODE));
$testSuccess = curl_errno($ch) === 0 && $responseCode < 400;
curl_close($ch);
return $testSuccess;
}
/** /**
* Check if a hostname is public and non-reserved * Check if a hostname is public and non-reserved
* *

View file

@ -2,6 +2,8 @@
namespace Friendica\Directory\Utils; namespace Friendica\Directory\Utils;
use GuzzleHttp\ClientInterface;
/** /**
* @author Hypolite Petovan <hypolite@mrpetovan.com> * @author Hypolite Petovan <hypolite@mrpetovan.com>
*/ */
@ -12,10 +14,10 @@ class Scrape
* @param string $url * @param string $url
* @return array|false * @return array|false
*/ */
public static function retrieveNoScrapeData(string $url) public static function retrieveNoScrapeData(ClientInterface $http, string $url)
{ {
$submit_noscrape_start = microtime(true); $submit_noscrape_start = microtime(true);
$data = Network::fetchURL($url); $data = $http->get($url)->getBody()->getContents();
$submit_noscrape_request_end = microtime(true); $submit_noscrape_request_end = microtime(true);
if (empty($data)) { if (empty($data)) {
@ -42,7 +44,7 @@ class Scrape
return $params; return $params;
} }
public static function retrieveProfileData(string $url, int $max_nodes = 3500): array public static function retrieveProfileData(ClientInterface $http, string $url, int $max_nodes = 3500): array
{ {
$minNodes = 100; //Lets do at least 100 nodes per type. $minNodes = 100; //Lets do at least 100 nodes per type.
@ -56,7 +58,7 @@ class Scrape
$scrape_start = microtime(true); $scrape_start = microtime(true);
$params = []; $params = [];
$html = Network::fetchURL($url, false, $timeout); $html = $http->get($url, ['timeout' => $timeout])->getBody()->getContents();;
$scrape_fetch_end = microtime(true); $scrape_fetch_end = microtime(true);

View file

@ -89,6 +89,16 @@ $container['migration'] = function (ContainerInterface $c): ByJG\DbMigration\Mig
return $migration; return $migration;
}; };
// Shared HTTP client service: a Guzzle client with a 20 second default timeout
// whose User-Agent advertises the directory version read from the VERSION file.
$container['http'] = function (ContainerInterface $c): GuzzleHttp\ClientInterface {
    $versionFile = __DIR__ . '/../VERSION';

    // Guard the read so a missing or unreadable VERSION file falls through to
    // the default below instead of emitting a PHP warning.
    $version = is_readable($versionFile) ? file_get_contents($versionFile) : false;

    // Accept multi-digit components (e.g. "1.10.0"); anything that is not a
    // plain x.y.z version string is replaced by the "0.0.0" placeholder.
    if (!$version || !preg_match('/^\s*\d+\.\d+\.\d+\s*$/', $version)) {
        $version = '0.0.0';
    }

    return new GuzzleHttp\Client([
        'timeout' => 20,
        'headers' => [
            // Prefix Guzzle's default agent string with our own product token.
            'User-Agent' => 'FriendicaDirectory/' . trim($version) . ' ' . \GuzzleHttp\default_user_agent(),
        ],
    ]);
};
// Internal Dependency Injection // Internal Dependency Injection
$container[\Friendica\Directory\Models\Profile::class] = function (ContainerInterface $c): Friendica\Directory\Models\Profile { $container[\Friendica\Directory\Models\Profile::class] = function (ContainerInterface $c): Friendica\Directory\Models\Profile {
@ -106,7 +116,7 @@ $container[\Friendica\Directory\Models\Server::class] = function (ContainerInter
$container[\Friendica\Directory\Pollers\Directory::class] = function (ContainerInterface $c): Friendica\Directory\Pollers\Directory { $container[\Friendica\Directory\Pollers\Directory::class] = function (ContainerInterface $c): Friendica\Directory\Pollers\Directory {
$settings = $c->get('settings')['poller']; $settings = $c->get('settings')['poller'];
return new Friendica\Directory\Pollers\Directory( return new Friendica\Directory\Pollers\Directory(
$c->get('atlas'), $c->get('http'),
$c->get(\Friendica\Directory\Models\ProfilePollQueue::class), $c->get(\Friendica\Directory\Models\ProfilePollQueue::class),
$c->get('logger'), $c->get('logger'),
$settings ?: [] $settings ?: []
@ -117,6 +127,7 @@ $container[\Friendica\Directory\Pollers\Profile::class] = function (ContainerInt
$settings = $c->get('settings')['poller']; $settings = $c->get('settings')['poller'];
return new Friendica\Directory\Pollers\Profile( return new Friendica\Directory\Pollers\Profile(
$c->get('atlas'), $c->get('atlas'),
$c->get('http'),
$c->get(\Friendica\Directory\Models\Server::class), $c->get(\Friendica\Directory\Models\Server::class),
$c->get(\Friendica\Directory\Models\Profile::class), $c->get(\Friendica\Directory\Models\Profile::class),
$c->get('logger'), $c->get('logger'),
@ -128,6 +139,7 @@ $container[\Friendica\Directory\Pollers\Server::class] = function (ContainerInte
$settings = $c->get('settings')['poller']; $settings = $c->get('settings')['poller'];
return new Friendica\Directory\Pollers\Server( return new Friendica\Directory\Pollers\Server(
$c->get('atlas'), $c->get('atlas'),
$c->get('http'),
$c->get(\Friendica\Directory\Models\ProfilePollQueue::class), $c->get(\Friendica\Directory\Models\ProfilePollQueue::class),
$c->get(\Friendica\Directory\Models\Server::class), $c->get(\Friendica\Directory\Models\Server::class),
$c->get('simplecache'), $c->get('simplecache'),