Replace direct curl uses with Guzzle HTTP client

- Add http dependency with custom User Agent
- Simplify profile poll to remove the second profile call to check availability
- Remove obsolete Network::fetchURL and Network::testURL
This commit is contained in:
Hypolite Petovan 2022-06-06 01:52:21 -04:00
parent 5829108c0e
commit 0ee5bf55b1
6 changed files with 76 additions and 152 deletions

View File

@ -10,9 +10,9 @@ use Friendica\Directory\Utils\Network;
class Directory
{
/**
* @var \Atlas\Pdo\Connection
* @var \GuzzleHttp\ClientInterface
*/
private $atlas;
private $http;
/**
* @var \Friendica\Directory\Models\ProfilePollQueue
*/
@ -30,12 +30,12 @@ class Directory
];
public function __construct(
\Atlas\Pdo\Connection $atlas,
\GuzzleHttp\ClientInterface $http,
\Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel,
\Psr\Log\LoggerInterface $logger,
array $settings)
{
$this->atlas = $atlas;
$this->http = $http;
$this->profilePollQueueModel = $profilePollQueueModel;
$this->logger = $logger;
$this->settings = array_merge($this->settings, $settings);
@ -82,35 +82,7 @@ class Directory
$path = '/sync/pull/since/' . $last_polled;
}
//Prepare the CURL call.
$handle = curl_init();
$options = array(
//Timeouts
CURLOPT_TIMEOUT => max($this->settings['probe_timeout'], 1), //Minimum of 1 second timeout.
CURLOPT_CONNECTTIMEOUT => 1,
//Redirecting
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 8,
//SSL
CURLOPT_SSL_VERIFYPEER => true,
// CURLOPT_VERBOSE => true,
// CURLOPT_CERTINFO => true,
CURLOPT_SSL_VERIFYHOST => 2,
CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
//Basic request
CURLOPT_USERAGENT => Network::USER_AGENT,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_URL => $directory_url . $path
);
curl_setopt_array($handle, $options);
$this->logger->info('Pulling profiles from directory URL: ' . $directory_url . $path);
//Probe the site.
$pull_data = curl_exec($handle);
//Done with CURL now.
curl_close($handle);
$pull_data = $this->http->get($directory_url . $path, ['timeout' => max($this->settings['probe_timeout'], 1)])->getBody()->getContents();
$data = json_decode($pull_data, true);

View File

@ -18,6 +18,11 @@ class Profile
*/
private $atlas;
/**
* @var \GuzzleHttp\ClientInterface
*/
private $http;
/**
* @var \Friendica\Directory\Models\Server
*/
@ -43,6 +48,7 @@ class Profile
public function __construct(
\Atlas\Pdo\Connection $atlas,
\GuzzleHttp\ClientInterface $http,
\Friendica\Directory\Models\Server $serverModel,
\Friendica\Directory\Models\Profile $profileModel,
\Psr\Log\LoggerInterface $logger,
@ -50,6 +56,7 @@ class Profile
)
{
$this->atlas = $atlas;
$this->http = $http;
$this->serverModel = $serverModel;
$this->profileModel = $profileModel;
$this->logger = $logger;
@ -122,25 +129,23 @@ class Profile
);
}
//Skip the profile scrape?
$noscrape = $server['noscrape_url'];
$available = false;
$params = [];
if ($noscrape) {
//Skip the profile scrape?
if ($server['noscrape_url']) {
$this->logger->debug('Calling ' . $server['noscrape_url'] . '/' . $username);
$params = \Friendica\Directory\Utils\Scrape::retrieveNoScrapeData($server['noscrape_url'] . '/' . $username);
$noscrape = !!$params; //If the result was false, do a scrape after all.
$params = \Friendica\Directory\Utils\Scrape::retrieveNoScrapeData($this->http, $server['noscrape_url'] . '/' . $username);
$available = !!$params; //If the result was false, do a scrape after all.
}
$available = true;
if ($noscrape) {
$available = Network::testURL($profile_uri);
$this->logger->debug('Testing ' . $profile_uri . ': ' . ($available?'Success':'Failure'));
} else {
if (!$available) {
$this->logger->notice('Parsing profile page ' . $profile_uri);
$params = \Friendica\Directory\Utils\Scrape::retrieveProfileData($profile_uri);
$params = \Friendica\Directory\Utils\Scrape::retrieveProfileData($this->http, $profile_uri);
$params['language'] = $server['language'];
$available = !empty($params['fn']);
}
// Empty result is due to an offline site.
@ -301,7 +306,7 @@ class Profile
$status = false;
if ($profile_id) {
$img_str = \Friendica\Directory\Utils\Network::fetchURL($params['photo'], true);
$img_str = $this->http->get($params['photo'])->getBody()->getContents();
$img = new \Friendica\Directory\Utils\Photo($img_str);
if ($img->getImage()) {
$img->scaleImageSquare(80);

View File

@ -3,7 +3,8 @@
namespace Friendica\Directory\Pollers;
use ByJG\Util\WebRequest;
use Friendica\Directory\Utils\Network;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\TransferStats;
/**
* @author Hypolite Petovan <hypolite@mrpetovan.com>
@ -14,6 +15,10 @@ class Server
* @var \Atlas\Pdo\Connection
*/
private $atlas;
/**
* @var \GuzzleHttp\ClientInterface
*/
private $http;
/**
* @var \Friendica\Directory\Models\ProfilePollQueue
*/
@ -41,6 +46,7 @@ class Server
public function __construct(
\Atlas\Pdo\Connection $atlas,
\GuzzleHttp\ClientInterface $http,
\Friendica\Directory\Models\ProfilePollQueue $profilePollQueueModel,
\Friendica\Directory\Models\Server $serverModel,
\Psr\SimpleCache\CacheInterface $simplecache,
@ -48,6 +54,7 @@ class Server
array $settings)
{
$this->atlas = $atlas;
$this->http = $http;
$this->profilePollQueueModel = $profilePollQueueModel;
$this->serverModel = $serverModel;
$this->simplecache = $simplecache;
@ -247,65 +254,41 @@ class Server
private function getProbeResult(string $base_url): array
{
//Prepare the CURL call.
$handle = curl_init();
$options = array(
//Timeouts
CURLOPT_TIMEOUT => max($this->settings['probe_timeout'], 1), //Minimum of 1 second timeout.
CURLOPT_CONNECTTIMEOUT => 1,
//Redirecting
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 8,
//SSL
CURLOPT_SSL_VERIFYPEER => true,
// CURLOPT_VERBOSE => true,
// CURLOPT_CERTINFO => true,
CURLOPT_SSL_VERIFYHOST => 2,
CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
//Basic request
CURLOPT_USERAGENT => Network::USER_AGENT,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_URL => $base_url . '/friendica/json'
);
curl_setopt_array($handle, $options);
$curl_info = null;
$options = [
'timeout' => max($this->settings['probe_timeout'], 1),
'on_stats' => function (TransferStats $transferStats) use (&$curl_info) {
$curl_info = $transferStats->getHandlerStats();
}
];
$sslcert_issues = false;
try {
//Probe the site.
$probe_start = microtime(true);
$probe_data = curl_exec($handle);
$probe_data = $this->http->get($base_url . '/friendica/json', $options)->getBody()->getContents();
$probe_end = microtime(true);
//Check for SSL problems.
$curl_statuscode = curl_errno($handle);
$sslcert_issues = in_array($curl_statuscode, array(
} catch (RequestException $e) {
if (!in_array($e->getHandlerContext()['errno'], [
60, //Could not authenticate certificate with known CA's
83 //Issuer check failed
));
//When it's the certificate that doesn't work.
if ($sslcert_issues) {
//Probe again, without strict SSL.
$options[CURLOPT_SSL_VERIFYPEER] = false;
//Replace the handle.
curl_close($handle);
$handle = curl_init();
curl_setopt_array($handle, $options);
//Probe.
$probe_start = microtime(true);
$probe_data = curl_exec($handle);
$probe_end = microtime(true);
//Store new status.
$curl_statuscode = curl_errno($handle);
])) {
throw $e;
}
//Gather more meta.
$time = round(($probe_end - $probe_start) * 1000);
$curl_info = curl_getinfo($handle);
$sslcert_issues = true;
//Done with CURL now.
curl_close($handle);
//When it's the certificate that doesn't work, we probe again without strict SSL.
$options['verify'] = false;
$probe_start = microtime(true);
$probe_data = $this->http->get($base_url . '/friendica/json', $options)->getBody()->getContents();
$probe_end = microtime(true);
}
$time = round(($probe_end - $probe_start) * 1000);
try {
$data = json_decode($probe_data, true);

View File

@ -15,56 +15,6 @@ namespace Friendica\Directory\Utils;
*/
class Network
{
const USER_AGENT = 'friendica-directory-probe-1.0';
/**
 * Fetches the body of a URL with cURL.
 *
 * Follows up to 8 redirects and identifies itself with self::USER_AGENT.
 *
 * @param string $url     URL to fetch
 * @param bool   $binary  Kept for backward compatibility only; it used to toggle CURLOPT_BINARYTRANSFER,
 *                        which PHP's cURL extension no longer uses (CURLOPT_RETURNTRANSFER already yields raw bytes)
 * @param int    $timeout Total transfer timeout in seconds, raised to a minimum of 1
 * @return string The response body, or an empty string if the handle could not be created or the transfer failed
 */
public static function fetchURL(string $url, bool $binary = false, int $timeout = 20): string
{
	$ch = curl_init($url);
	if ($ch === false) {
		// The declared return type is string: return '' explicitly instead of relying on false being coerced.
		return '';
	}

	curl_setopt_array($ch, [
		CURLOPT_HEADER         => 0,
		CURLOPT_TIMEOUT        => max($timeout, 1), //Minimum of 1 second timeout.
		CURLOPT_FOLLOWLOCATION => true,
		CURLOPT_MAXREDIRS      => 8,
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_USERAGENT      => self::USER_AGENT,
		// URLs passed in may originate from remote data, so only allow web protocols (no file://, ftp://, ...).
		CURLOPT_PROTOCOLS      => CURLPROTO_HTTP | CURLPROTO_HTTPS,
		// NOTE(review): peer verification is deliberately left disabled to preserve existing behavior;
		// confirm whether certificates should be verified here.
		CURLOPT_SSL_VERIFYPEER => false,
	]);

	$body = curl_exec($ch);
	curl_close($ch);

	// curl_exec() returns false on failure; normalize to the declared string return type.
	return is_string($body) ? $body : '';
}
/**
 * Checks whether a URL can be reached without downloading its body.
 *
 * The request is sent with CURLOPT_NOBODY, follows up to 8 redirects and verifies the TLS peer.
 *
 * @param string $url     URL to test
 * @param int    $timeout Total transfer timeout in seconds, raised to a minimum of 1
 * @return bool True when cURL reported no error and the response code is below 400
 */
public static function testURL(string $url, int $timeout = 20): bool
{
	$handle = curl_init($url);
	if (!$handle) {
		return false;
	}

	curl_setopt_array($handle, [
		CURLOPT_HEADER         => 0,
		CURLOPT_TIMEOUT        => max($timeout, 1), //Minimum of 1 second timeout.
		CURLOPT_FOLLOWLOCATION => true,
		CURLOPT_MAXREDIRS      => 8,
		CURLOPT_SSL_VERIFYPEER => true,
		CURLOPT_NOBODY         => true,
		CURLOPT_USERAGENT      => self::USER_AGENT,
	]);
	curl_exec($handle);

	// Read both indicators before the handle is closed.
	$httpStatus = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
	$transferOk = curl_errno($handle) === 0;
	curl_close($handle);

	return $transferOk && $httpStatus < 400;
}
/**
* Check if a hostname is public and non-reserved
*

View File

@ -2,6 +2,8 @@
namespace Friendica\Directory\Utils;
use GuzzleHttp\ClientInterface;
/**
* @author Hypolite Petovan <hypolite@mrpetovan.com>
*/
@ -12,10 +14,10 @@ class Scrape
* @param string $url
* @return array|false
*/
public static function retrieveNoScrapeData(string $url)
public static function retrieveNoScrapeData(ClientInterface $http, string $url)
{
$submit_noscrape_start = microtime(true);
$data = Network::fetchURL($url);
$data = $http->get($url)->getBody()->getContents();
$submit_noscrape_request_end = microtime(true);
if (empty($data)) {
@ -42,7 +44,7 @@ class Scrape
return $params;
}
public static function retrieveProfileData(string $url, int $max_nodes = 3500): array
public static function retrieveProfileData(ClientInterface $http, string $url, int $max_nodes = 3500): array
{
$minNodes = 100; //Lets do at least 100 nodes per type.
@ -56,7 +58,7 @@ class Scrape
$scrape_start = microtime(true);
$params = [];
$html = Network::fetchURL($url, false, $timeout);
$html = $http->get($url, ['timeout' => $timeout])->getBody()->getContents();;
$scrape_fetch_end = microtime(true);

View File

@ -89,6 +89,16 @@ $container['migration'] = function (ContainerInterface $c): ByJG\DbMigration\Mig
return $migration;
};
// Guzzle HTTP client registered as the 'http' service.
// Its User-Agent carries the directory version read from the VERSION file, followed by Guzzle's default agent string.
$container['http'] = function (ContainerInterface $c): GuzzleHttp\ClientInterface {
	$versionFile = __DIR__ . '/../VERSION';

	// Guard the read so a missing or unreadable VERSION file falls back quietly instead of raising a warning.
	$version = is_readable($versionFile) ? file_get_contents($versionFile) : false;

	// Accept multi-digit components (e.g. 2.10.0); anything else is reported as 0.0.0.
	if (!$version || !preg_match('/^\s*\d+\.\d+\.\d+\s*$/', $version)) {
		$version = '0.0.0';
	}

	return new GuzzleHttp\Client([
		'timeout' => 20,
		'headers' => [
			'User-Agent' => 'FriendicaDirectory/' . trim($version) . ' ' . \GuzzleHttp\default_user_agent(),
		],
	]);
};
// Internal Dependency Injection
$container[\Friendica\Directory\Models\Profile::class] = function (ContainerInterface $c): Friendica\Directory\Models\Profile {
@ -106,7 +116,7 @@ $container[\Friendica\Directory\Models\Server::class] = function (ContainerInter
$container[\Friendica\Directory\Pollers\Directory::class] = function (ContainerInterface $c): Friendica\Directory\Pollers\Directory {
$settings = $c->get('settings')['poller'];
return new Friendica\Directory\Pollers\Directory(
$c->get('atlas'),
$c->get('http'),
$c->get(\Friendica\Directory\Models\ProfilePollQueue::class),
$c->get('logger'),
$settings ?: []
@ -117,6 +127,7 @@ $container[\Friendica\Directory\Pollers\Profile::class] = function (ContainerInt
$settings = $c->get('settings')['poller'];
return new Friendica\Directory\Pollers\Profile(
$c->get('atlas'),
$c->get('http'),
$c->get(\Friendica\Directory\Models\Server::class),
$c->get(\Friendica\Directory\Models\Profile::class),
$c->get('logger'),
@ -128,6 +139,7 @@ $container[\Friendica\Directory\Pollers\Server::class] = function (ContainerInte
$settings = $c->get('settings')['poller'];
return new Friendica\Directory\Pollers\Server(
$c->get('atlas'),
$c->get('http'),
$c->get(\Friendica\Directory\Models\ProfilePollQueue::class),
$c->get(\Friendica\Directory\Models\Server::class),
$c->get('simplecache'),