Added syncing (push and pull) and refactored a few functions.
This commit is contained in:
parent
0026b08a33
commit
1fe9bb9b5b
78
.htconfig.php
Normal file
78
.htconfig.php
Normal file
|
@ -0,0 +1,78 @@
|
|||
<?php
|
||||
|
||||
//MySQL host.
|
||||
$db_host = 'localhost';
|
||||
$db_user = 'friendica-dir';
|
||||
$db_pass = 'thisisyourpasswordbuddy';
|
||||
$db_data = 'friendica-dir';
|
||||
|
||||
// Choose a legal default timezone. If you are unsure, use "America/Los_Angeles".
|
||||
// It can be changed later and only applies to timestamps for anonymous viewers.
|
||||
$default_timezone = 'Europe/Amsterdam';
|
||||
|
||||
// What is your site name?
|
||||
$a->config['sitename'] = "EXPERIMENTAL Friendica public directory";
|
||||
|
||||
//Settings related to the syncing feature.
|
||||
$a->config['syncing'] = array(
|
||||
|
||||
//Pulling may be quite intensive at first when it has to do a full sync and your directory is empty.
|
||||
//This timeout should be shorter than your cronjob interval. Preferably with a little breathing room.
|
||||
'timeout' => 3*60, //3 minutes
|
||||
|
||||
//Push new submits to the `sync-target` entries?
|
||||
'enable_pushing' => true,
|
||||
|
||||
//Maximum amount of items per batch per target to push to other sync-targets.
|
||||
//For example: 3 targets x 20 items = 60 requests.
|
||||
'max_push_items' => 10,
|
||||
|
||||
//Pull updates from the `sync-target` entries?
|
||||
'enable_pulling' => true,
|
||||
|
||||
//This is your normal amount of threads for pulling.
|
||||
//With regular intervals, there's no need to give this a high value.
|
||||
//But when your server is brand new, you may want to keep this high for the first day or two.
|
||||
'pulling_threads' => 25,
|
||||
|
||||
//How many items should we crawl per sync?
|
||||
'max_pull_items' => 250
|
||||
|
||||
);
|
||||
|
||||
//Things related to site-health monitoring.
|
||||
$a->config['site-health'] = array(
|
||||
|
||||
//Wait for at least ... before probing a site again.
|
||||
//The longer this value, the more "stable" site-healths will be over time.
|
||||
//Note: If a bad (negative) health site submits something, a probe will be performed regardless.
|
||||
'min_probe_delay' => 3*24*3600, // 3 days
|
||||
|
||||
//Probes get a simple /friendica/json file from the server.
|
||||
//Feel free to set this timeout to a very tight value.
|
||||
'probe_timeout' => 5, // seconds
|
||||
|
||||
//Imports should be fast. Feel free to prioritize healthy sites.
|
||||
'skip_import_threshold' => -20
|
||||
|
||||
);
|
||||
|
||||
//Things related to the maintenance cronjob.
|
||||
$a->config['maintenance'] = array(
|
||||
|
||||
//This is to prevent I/O blocking. Will cost you some RAM overhead though.
|
||||
//A good server should handle much more than this default, so you can tweak this.
|
||||
'threads' => 10,
|
||||
|
||||
//Limit the amount of scrapes per execution of the maintainer.
|
||||
//This will depend a lot on the frequency with which you call the maintainer.
|
||||
//If you have 10 threads and 80 max_scrapes, that means each thread will handle 8 scrapes.
|
||||
'max_scrapes' => 80,
|
||||
|
||||
//Wait for at least ... before scraping a profile again.
|
||||
'min_scrape_delay' => 3*24*3600, // 3 days
|
||||
|
||||
//At which health value should we start removing profiles?
|
||||
'remove_profile_health_threshold' => -60
|
||||
|
||||
);
|
10
README.md
10
README.md
|
@ -1,4 +1,8 @@
|
|||
dir
|
||||
===
|
||||
# Friendica Global Directory
|
||||
|
||||
Friendica Global Directory
|
||||
Example cronjob.
|
||||
|
||||
```
|
||||
*/30 * * * * www-data cd /var/www/friendica-directory; php include/cron_maintain.php
|
||||
*/5 * * * * www-data cd /var/www/friendica-directory; php include/cron_sync.php
|
||||
```
|
15
dfrndir.sql
15
dfrndir.sql
|
@ -203,12 +203,25 @@ CREATE TABLE IF NOT EXISTS `sync-targets` (
|
|||
`base_url` varchar(255) NOT NULL,
|
||||
`pull` bit(1) NOT NULL DEFAULT b'0',
|
||||
`push` bit(1) NOT NULL DEFAULT b'1',
|
||||
`dt_last_pull` bigint unsigned NULL DEFAULT NULL,
|
||||
PRIMARY KEY (`base_url`),
|
||||
KEY `push` (`push`),
|
||||
KEY `pull` (`pull`)
|
||||
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `sync-queue` (
|
||||
CREATE TABLE IF NOT EXISTS `sync-push-queue` (
|
||||
`url` varchar(255) NOT NULL,
|
||||
PRIMARY KEY (`url`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `sync-pull-queue` (
|
||||
`url` varchar(255) NOT NULL,
|
||||
PRIMARY KEY (`url`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `sync-timestamps` (
|
||||
`url` varchar(255) NOT NULL,
|
||||
`modified` datetime NOT NULL,
|
||||
PRIMARY KEY (`url`),
|
||||
KEY `modified` (`modified`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;
|
||||
|
|
93
include/cron_sync.php
Normal file
93
include/cron_sync.php
Normal file
|
@ -0,0 +1,93 @@
|
|||
<?php
|
||||
|
||||
/*
|
||||
|
||||
#TODO:
|
||||
|
||||
* First do the pulls then the pushes.
|
||||
If pull prevents the push, the push queue just creates a backlog until it gets a chance to push.
|
||||
|
||||
* When doing a first-pull, there's a safety mechanism for the timeout and detecting duplicate attempts.
|
||||
|
||||
1. Perform all JSON pulls on the source servers.
|
||||
2. Combine the results into one giant pool of URLs.
|
||||
3. Write this pool to a file (TODO-file).
|
||||
4. Shuffle the pool in RAM.
|
||||
5. Start threads for crawling.
|
||||
6. Every finished crawl attempt (successful or not) should write to a 2nd file (DONE-file).
|
||||
|
||||
IF the first-pull times out, don't do anything else.
|
||||
Otherwise, mark the dates we last performed a pull from each server.
|
||||
|
||||
* When resuming a first-pull.
|
||||
|
||||
1. Check for the TODO-file and the DONE-file.
|
||||
2. Remove the entries in the DONE-file from the pool in the TODO-file.
|
||||
3. Replace the TODO-file with the updated pool.
|
||||
4. Perform steps 4, 5 and 6 (shuffle, create threads and crawl) from before.
|
||||
|
||||
This way you can resume without repeating attempts.
|
||||
|
||||
* Write documentation about syncing.
|
||||
|
||||
* Create "official" directory policy for my directory.
|
||||
|
||||
* Decide if a retry mechanism is desirable for pulling (for the failed attempts).
|
||||
After all, you did imply trust when you indicated to pull from that source...
|
||||
This could be done easily by doing a /sync/pull/all again from those sources.
|
||||
|
||||
* Decide if cron_sync.php should be split into separate push, pull and pull-all commands.
|
||||
|
||||
*/
|
||||
|
||||
// Debug stuff.
|
||||
ini_set('display_errors', 1);
|
||||
ini_set('log_errors','0');
|
||||
error_reporting(E_ALL^E_NOTICE);
|
||||
|
||||
$start_syncing = time();
|
||||
|
||||
//Startup.
|
||||
require_once('boot.php');
|
||||
$a = new App;
|
||||
|
||||
//Create a simple log function for CLI use.
|
||||
global $verbose;

//Verbose mode is requested by passing "verbose" as the first CLI argument.
//Guard the lookup: cron normally calls this script without any arguments.
$verbose = isset($argv[1]) && $argv[1] === 'verbose';

/**
 * Logs a message and, in verbose mode or on fatal errors, echoes it to the CLI.
 * @param  string  $message The message to log.
 * @param  boolean $fatal   When true, the message is always echoed and the script exits with status 1.
 * @return void
 */
function msg($message, $fatal=false){
	global $verbose;
	if($verbose || $fatal) echo($message.PHP_EOL);
	logger($message);
	if($fatal) exit(1);
}
|
||||
|
||||
//Config.
|
||||
require_once(".htconfig.php");
|
||||
|
||||
//Connect the DB.
|
||||
require_once("dba.php");
|
||||
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);
|
||||
|
||||
//Import syncing functions.
|
||||
require_once('sync.php');
|
||||
|
||||
//Get work for pulling.
|
||||
$pull_batch = get_pulling_job($a);
|
||||
|
||||
//Get work for pushing.
|
||||
list($push_targets, $push_batch) = get_pushing_job($a);
|
||||
|
||||
//Close the connection for now. Process forking and DB connections are not the best of friends.
|
||||
$db->getdb()->close();
|
||||
|
||||
if(count($pull_batch))
|
||||
run_pulling_job($a, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install);
|
||||
|
||||
//Do our multi-fork job, if we have a batch and targets.
|
||||
if(count($push_targets) && count($push_batch))
|
||||
run_pushing_job($push_targets, $push_batch, $db_host, $db_user, $db_pass, $db_data, $install);
|
||||
|
||||
//Log the time it took.
|
||||
$time = time() - $start_syncing;
|
||||
msg("Syncing completed. Took $time seconds.");
|
438
include/sync.php
Normal file
438
include/sync.php
Normal file
|
@ -0,0 +1,438 @@
|
|||
<?php
|
||||
|
||||
/**
 * Adds this URL to our pulling queue.
 * Does nothing when pulling is disabled in the syncing config.
 * @param  string $url
 * @return void
 */
function sync_pull($url)
{

	global $a;

	//If we support it that is.
	if($a->config['syncing']['enable_pulling']){

		//`url` is the primary key of the queue, so a URL that is already
		//queued must be skipped instead of raising a duplicate-key error.
		q("INSERT IGNORE INTO `sync-pull-queue` (`url`) VALUES ('%s')", dbesc($url));

	}

}
|
||||
|
||||
/**
 * Adds this URL to our pushing queue and marks it as modified using sync_mark.
 * The queueing is skipped when pushing is disabled; sync_mark is always called.
 * @param  string $url
 * @return void
 */
function sync_push($url)
{

	global $a;

	//If we support it that is.
	if($a->config['syncing']['enable_pushing']){

		//`url` is the primary key of the queue. The same profile can be submitted
		//again before the queue is flushed, so ignore entries that already exist.
		q("INSERT IGNORE INTO `sync-push-queue` (`url`) VALUES ('%s')", dbesc($url));

	}

	sync_mark($url);

}
|
||||
|
||||
/**
 * Mark a URL as modified in some way or form.
 * This will cause anyone that pulls our changes to see this profile listed.
 * Does nothing when `enable_pulling` is off in the syncing config.
 * @param  string $url
 * @return void
 */
function sync_mark($url)
{

	global $a;

	//If we support it that is.
	if(!$a->config['syncing']['enable_pulling']){
		return;
	}

	//`url` is the primary key, so a single upsert either creates the row or
	//refreshes its timestamp. This avoids the race between a separate
	//SELECT and INSERT/UPDATE when two submits arrive at the same time.
	q(
		"INSERT INTO `sync-timestamps` (`url`, `modified`) VALUES ('%s', NOW())
		ON DUPLICATE KEY UPDATE `modified`=NOW()",
		dbesc($url)
	);

}
|
||||
|
||||
/**
 * For a single fork during the push jobs.
 * Takes a lower priority and pushes a batch of items.
 * @param  array $target A sync-target database row (its `base_url` is used).
 * @param  array $batch  The batch of items to submit; each item has a `url` key.
 * @return void
 */
function push_worker($target, $batch)
{

	//Lets be nice, we're only doing a background job here...
	//With a single target this runs without forking, so pcntl may be unavailable.
	if(function_exists('pcntl_setpriority')){
		pcntl_setpriority(5);
	}

	//Find our target's submit URL.
	$submit = $target['base_url'].'/submit';

	foreach($batch as $item){
		set_time_limit(30); //This should work for 1 submit.
		msg("Submitting {$item['url']} to $submit");

		//The URL is sent hex-encoded as the `url` query parameter.
		fetch_url($submit.'?url='.bin2hex($item['url']));
	}

}
|
||||
|
||||
/**
 * Fetches every sync-target row that has its `push` bit set.
 * @return array Push targets, as returned by the query helper.
 */
function get_push_targets()
{
	$push_targets = q("SELECT * FROM `sync-targets` WHERE `push`=b'1'");
	return $push_targets;
}
|
||||
|
||||
/**
 * Reads the next batch of URL's from the push queue.
 * The batch size is capped by the `max_push_items` syncing setting.
 * @param  object $a The App instance.
 * @return array     Batch of URL's.
 */
function get_push_batch($a)
{
	$limit = intval($a->config['syncing']['max_push_items']);
	return q("SELECT * FROM `sync-push-queue` LIMIT %u", $limit);
}
|
||||
|
||||
/**
 * Collects everything needed for a pushing job: the targets and a batch of URL's.
 * Both arrays are empty when pushing is disabled; the batch is empty when there are no targets.
 * @param  object $a The App instance.
 * @return array     array($targets, $batch), suitable for list().
 */
function get_pushing_job($a)
{

	//No pushing if it's disabled.
	if(!$a->config['syncing']['enable_pushing']){
		return array(array(), array());
	}

	//Find our targets.
	$targets = get_push_targets();

	//Without targets there is no point in reading the queue.
	if(!count($targets)){
		msg('Pushing enabled, but no push targets.');
		return array($targets, array());
	}

	//We have targets, so fetch the batch. No batch, means no work.
	$batch = get_push_batch($a);
	if(!count($batch)){
		msg('Empty pushing queue.');
	}

	return array($targets, $batch);

}
|
||||
|
||||
/**
 * Runs a pushing job, creating a child process for each target.
 * When all workers finished cleanly, the pushed items are removed from the push queue.
 * @param  array  $targets Pushing targets.
 * @param  array  $batch   Batch of URL's to push.
 * @param  string $db_host DB host to connect to.
 * @param  string $db_user DB user to connect with.
 * @param  string $db_pass DB pass to connect with.
 * @param  string $db_data Fourth dba constructor argument ($db_data from .htconfig.php).
 * @param  mixed  $install Passed through to the dba constructor as its fifth argument.
 * @return void
 */
function run_pushing_job($targets, $batch, $db_host, $db_user, $db_pass, $db_data, $install)
{

	//Without targets or items nothing gets pushed, so nothing may be removed from the queue either.
	if(empty($targets) || empty($batch)){
		msg('Nothing to push.');
		return;
	}

	//Create a thread for each target we want to serve push messages to.
	//Not good creating more, because it would stress their server too much.
	$threadc = count($targets);
	$threads = array();

	//Do we only have 1 target? No need for threads.
	if($threadc === 1){
		msg('No threads needed. Only one pushing target.');
		push_worker($targets[0], $batch);
	}

	//When we need threads.
	else{

		//POSIX threads only.
		if(!function_exists('pcntl_fork')){
			msg('Error: no pcntl_fork support. Are you running a different OS? Report an issue please.', true);
		}

		//Debug...
		$items = count($batch);
		msg("Creating $threadc push threads for $items items.");

		//Loop while we need more threads.
		for($i = 0; $i < $threadc; $i++){

			$pid = pcntl_fork();

			//pcntl_strerror() needs the error number of the failed call.
			if($pid === -1){
				msg('Error: something went wrong with the fork. '.pcntl_strerror(pcntl_get_last_error()), true);
			}

			//You're a child, go do some labor!
			if($pid === 0){push_worker($targets[$i], $batch); exit;}

			//Store the list of PID's.
			$threads[] = $pid;

		}

	}

	//Wait for all child processes.
	$threading_problems = false;
	foreach($threads as $pid){
		pcntl_waitpid($pid, $status);
		if($status !== 0){
			$threading_problems = true;
			msg("Bad process return value $pid:$status");
		}
	}

	//Keep the items queued when a worker failed, so they get retried next run.
	if($threading_problems){
		return;
	}

	//Reconnect: the caller closes the DB connection before forking.
	global $db;
	$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

	//Create a query for deleting this queue.
	$where = array();
	foreach($batch as $item) $where[] = dbesc($item['url']);
	$where = "WHERE `url` IN ('".implode("', '", $where)."')";

	//Remove the items from queue.
	q("DELETE FROM `sync-push-queue` $where LIMIT %u", count($batch));
	msg('Removed items from push queue.');

}
|
||||
|
||||
/**
 * Reads a random batch of URL's from the pull queue and logs its size.
 * The batch size is capped by the `max_pull_items` syncing setting.
 * @param  object $a The App instance.
 * @return array     Batch of URL's to pull.
 */
function get_queued_pull_batch($a)
{
	$limit = intval($a->config['syncing']['max_pull_items']);

	//Randomize this, to prevent scraping the same servers too much or dead URL's.
	$queued = q("SELECT * FROM `sync-pull-queue` ORDER BY RAND() LIMIT %u", $limit);

	$amount = count($queued);
	msg(sprintf('Pulling %u items from queue.', $amount));

	return $queued;
}
|
||||
|
||||
/**
 * Fetches every sync-target row that has its `pull` bit set.
 * @return array Pull targets, as returned by the query helper.
 */
function get_pull_targets()
{
	$pull_targets = q("SELECT * FROM `sync-targets` WHERE `pull`=b'1'");
	return $pull_targets;
}
|
||||
|
||||
/**
 * Asks every pull target for its (new) profile URL's, stores them in the
 * pull queue and returns a first batch from that queue.
 *
 * A target that was never pulled before (empty `dt_last_pull`) is asked for
 * everything; other targets are asked for changes since their last pull.
 * Each target that answered with a `now` value gets that value stored as its
 * new `dt_last_pull`.
 *
 * @param  object $a The App instance.
 * @return array     Batch of URL's to pull; empty when there are no targets.
 */
function get_remote_pull_batch($a)
{

	//Find our targets.
	$targets = get_pull_targets();

	msg(sprintf('Pulling from %u remote targets.', count($targets)));

	//No targets, means no batch.
	if(!count($targets))
		return array();

	//Profile URL's as keys, to remove duplicates.
	$urls = array();

	//Remote timestamps of the successful pulls: base_url => now.
	$pulled = array();

	foreach($targets as $target){

		//First pull, or an update?
		if(!$target['dt_last_pull'])
			$pull_url = $target['base_url'].'/sync/pull/all';
		else
			$pull_url = $target['base_url'].'/sync/pull/since/'.intval($target['dt_last_pull']);

		//Go for it :D
		$pull_data = json_decode(fetch_url($pull_url), true);

		//If we didn't get a JSON object.
		if(!is_array($pull_data)){
			msg(sprintf('Failed to pull from "%s".', $pull_url));
			continue;
		}

		//An error response has no results; only walk the list when it is there.
		if(isset($pull_data['results']) && is_array($pull_data['results'])){
			foreach($pull_data['results'] as $profile_url){
				if(is_string($profile_url)) $urls[$profile_url] = true;
			}
		}

		//Remember the remote clock, so the next pull continues from there.
		if(!empty($pull_data['now']))
			$pulled[$target['base_url']] = intval($pull_data['now']);

	}

	//Now that we have our URL's. Store them in the queue.
	foreach($urls as $profile_url => $bool){
		if($profile_url) sync_pull((string)$profile_url);
	}

	//Mark each source that answered with the timestamp of pulling.
	foreach($pulled as $base_url => $now){
		q("UPDATE `sync-targets` SET `dt_last_pull`=%u WHERE `base_url`='%s'", $now, dbesc($base_url));
	}

	//Finally, return a batch of this.
	return get_queued_pull_batch($a);

}
|
||||
|
||||
/**
 * Gathers an array of URL's to scrape for a pulling job.
 * Items already in the pull queue are served first; only when the queue is
 * empty are the pulling targets asked for new URL's.
 * @param  object $a The App instance.
 * @return array     URL's to scrape; always an array, empty when there is no work.
 */
function get_pulling_job($a)
{

	//No pulling today...
	if(!$a->config['syncing']['enable_pulling'])
		return array();

	//Firstly, finish the items from our queue.
	$batch = get_queued_pull_batch($a);
	if(is_array($batch) && count($batch)) return $batch;

	//If that is empty, fill the queue with remote items and return a batch of that.
	$batch = get_remote_pull_batch($a);

	//Callers run count() on the result, so never hand back null or false.
	return is_array($batch) ? $batch : array();

}
|
||||
|
||||
/**
 * The body of one forked child during a pull job.
 * Lowers its own priority, opens its own DB connection and submits its share of the batch.
 * @param  int    $i          The index number of this worker (for round-robin).
 * @param  int    $threadc    The amount of workers (for round-robin).
 * @param  array  $pull_batch A batch of URL's to pull.
 * @param  string $db_host    DB host to connect to.
 * @param  string $db_user    DB user to connect with.
 * @param  string $db_pass    DB pass to connect with.
 * @param  mixed  $db_data    Passed to the dba constructor as its fourth argument.
 * @param  mixed  $install    Passed to the dba constructor as its fifth argument.
 * @return void
 */
function pull_worker($i, $threadc, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install)
{

	global $db;

	//Lets be nice, we're only doing maintenance here...
	pcntl_setpriority(5);

	//Every worker gets a personal DB connection.
	$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

	//Round-robin: take every $threadc-th entry, starting at our own index.
	$workload = array();
	for($index = $i; isset($pull_batch[$index]); $index += $threadc){
		$workload[] = $pull_batch[$index];
	}

	//Work through our share, last entry first.
	foreach(array_reverse($workload) as $entry){
		set_time_limit(20); //This should work for 1 submit.
		msg("Submitting ".$entry['url']);
		run_submit($entry['url']);
	}

}
|
||||
|
||||
/**
 * Runs a pulling job, creating several child processes to do so.
 * When all workers finished cleanly, the pulled items are removed from the pull queue.
 * @param  object $a          The App instance.
 * @param  array  $pull_batch A batch of URL's to pull.
 * @param  string $db_host    DB host to connect to.
 * @param  string $db_user    DB user to connect with.
 * @param  string $db_pass    DB pass to connect with.
 * @param  string $db_data    Fourth dba constructor argument ($db_data from .htconfig.php).
 * @param  mixed  $install    Passed through to the dba constructor as its fifth argument.
 * @return void
 */
function run_pulling_job($a, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install)
{

	//An empty batch needs no workers and no queue cleanup.
	if(empty($pull_batch)){
		return;
	}

	//We need the scraper.
	require_once('include/submit.php');

	//POSIX threads only.
	if(!function_exists('pcntl_fork')){
		msg('Error: no pcntl_fork support. Are you running a different OS? Report an issue please.', true);
	}

	//Create the threads we need.
	//Don't need more threads than items, but always at least one: with zero
	//workers the batch would be removed from the queue without being processed.
	$items = count($pull_batch);
	$threadc = max(1, min(intval($a->config['syncing']['pulling_threads']), $items));
	$threads = array();

	msg("Creating $threadc pulling threads for $items profiles.");

	//Build the threads.
	for($i = 0; $i < $threadc; $i++){

		$pid = pcntl_fork();

		//pcntl_strerror() needs the error number of the failed call.
		if($pid === -1){
			msg('Error: something went wrong with the fork. '.pcntl_strerror(pcntl_get_last_error()), true);
		}

		//You're a child, go do some labor!
		if($pid === 0){pull_worker($i, $threadc, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install); exit;}

		//Store the list of PID's.
		$threads[] = $pid;

	}

	//Wait for all child processes.
	$threading_problems = false;
	foreach($threads as $pid){
		pcntl_waitpid($pid, $status);
		if($status !== 0){
			$threading_problems = true;
			msg("Bad process return value $pid:$status");
		}
	}

	//Keep the items queued when a worker failed, so they get retried next run.
	if($threading_problems){
		return;
	}

	//Reconnect: the caller closes the DB connection before forking.
	global $db;
	$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

	//Create a query for deleting this queue.
	$where = array();
	foreach($pull_batch as $item) $where[] = dbesc($item['url']);
	$where = "WHERE `url` IN ('".implode("', '", $where)."')";

	//Remove the items from queue.
	q("DELETE FROM `sync-pull-queue` $where LIMIT %u", count($pull_batch));
	msg('Removed items from pull queue.');

}
|
|
@ -1,141 +0,0 @@
|
|||
<?php
|
||||
|
||||
// Debug stuff.
|
||||
ini_set('display_errors', 1);
|
||||
ini_set('log_errors','0');
|
||||
error_reporting(E_ALL^E_NOTICE);
|
||||
|
||||
$start_syncing = time();
|
||||
|
||||
//Startup.
|
||||
require_once('boot.php');
|
||||
$a = new App;
|
||||
|
||||
//Create a simple log function for CLI use.
|
||||
$verbose = $argv[1] === 'verbose';
|
||||
$msg = function($message, $fatal=false)use($verbose){
|
||||
if($verbose || $fatal) echo($message.PHP_EOL);
|
||||
logger($message);
|
||||
if($fatal) exit(1);
|
||||
};
|
||||
|
||||
//Config.
|
||||
require_once(".htconfig.php");
|
||||
|
||||
//No pushing? Leave... because we haven't implemented pulling yet.
|
||||
if(!$a->config['syncing']['enable_pushing']){
|
||||
$msg('No push support enabled in your settings.', true);
|
||||
}
|
||||
|
||||
//Connect the DB.
|
||||
require_once("dba.php");
|
||||
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);
|
||||
|
||||
//Find our targets.
|
||||
$targets = q("SELECT * FROM `sync-targets` WHERE `push`=b'1'");
|
||||
if(!count($targets)) $msg('No targets.', true); //No targets, means no work.
|
||||
|
||||
//Get our batch of URL's.
|
||||
$batch = q("SELECT * FROM `sync-queue` LIMIT %u", intval($a->config['syncing']['max_push_items']));
|
||||
if(!count($batch)) $msg('Empty queue.', true); //No batch, means no work.
|
||||
|
||||
//Close the connection for now. Process forking and DB connections are not the best of friends.
|
||||
$db->getdb()->close();
|
||||
|
||||
//Create a thread for each target we want to serve push messages to.
|
||||
//No good creating more, because it would stress their server too much.
|
||||
$threadc = count($targets);
|
||||
$threads = array();
|
||||
|
||||
//Do we only have 1 target? No need for threads.
|
||||
if($threadc === 1){
|
||||
//Pretend to be worker #1.
|
||||
$pid = 0;
|
||||
$i = 0;
|
||||
$main = true;
|
||||
$msg('No threads needed. Only one pushing target.');
|
||||
}
|
||||
|
||||
//When we need threads.
|
||||
else{
|
||||
|
||||
//POSIX threads only.
|
||||
if(!function_exists('pcntl_fork')){
|
||||
$msg('Error: no pcntl_fork support. Are you running a different OS? Report an issue please.', true);
|
||||
}
|
||||
|
||||
//Debug...
|
||||
$items = count($batch);
|
||||
$msg("Creating $threadc push threads for $items items.");
|
||||
|
||||
//Loop while we need more threads.
|
||||
for($i = 0; $i < $threadc; $i++){
|
||||
|
||||
$pid = pcntl_fork();
|
||||
if($pid === -1) $msg('Error: something went wrong with the fork. '.pcntl_strerror(), true);
|
||||
|
||||
//You're a child, go do some labor!
|
||||
if($pid === 0) break;
|
||||
|
||||
//Store the list of PID's.
|
||||
if($pid > 0) $threads[] = $pid;
|
||||
|
||||
}
|
||||
|
||||
//Are we the main thread?
|
||||
$main = $pid !== 0;
|
||||
|
||||
}
|
||||
|
||||
//The work for child processes.
|
||||
if($pid === 0){
|
||||
|
||||
//Lets be nice, we're only doing a background job here...
|
||||
pcntl_setpriority(5);
|
||||
|
||||
//Find our target's submit URL.
|
||||
$submit = $targets[$i]['base_url'].'/submit';
|
||||
|
||||
foreach($batch as $item){
|
||||
set_time_limit(30); //This should work for 1 submit.
|
||||
$msg("Submitting {$item['url']} to $submit");
|
||||
fetch_url($submit.'?url='.bin2hex($item['url']));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//The main process.
|
||||
if($main){
|
||||
|
||||
//Wait for all child processes.
|
||||
$all_good = true;
|
||||
foreach($threads as $pid){
|
||||
pcntl_waitpid($pid, $status);
|
||||
if($status !== 0){
|
||||
$all_good = false;
|
||||
$msg("Bad process return value $pid:$status");
|
||||
}
|
||||
}
|
||||
|
||||
//If we did not have any "threading" problems.
|
||||
if($all_good){
|
||||
|
||||
//Reconnect
|
||||
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);
|
||||
|
||||
//Create a query for deleting this queue.
|
||||
$where = array();
|
||||
foreach($batch as $item) $where[] = dbesc($item['url']);
|
||||
$where = "WHERE `url` IN ('".implode("', '", $where)."')";
|
||||
|
||||
//Remove the items from queue.
|
||||
q("DELETE FROM `sync-queue` $where LIMIT %u", intval($a->config['syncing']['max_push_items']));
|
||||
$msg('Removed items from queue.');
|
||||
|
||||
}
|
||||
|
||||
//Log the time it took.
|
||||
$time = time() - $start_syncing;
|
||||
$msg("Syncing completed. Took $time seconds.");
|
||||
|
||||
}
|
|
@ -28,17 +28,23 @@ function admin_content(&$a) {
|
|||
$flagged = 'No entries.';
|
||||
}
|
||||
|
||||
//Get the backlog size.
|
||||
$res = q("SELECT count(*) as `count` FROM `profile` WHERE `updated` < '%s'",
|
||||
//Get the maintenance backlog size.
|
||||
$res = q("SELECT count(*) as `count` FROM `profile` WHERE `updated` < '%s'",
|
||||
dbesc(date('Y-m-d H:i:s', time()-$a->config['maintenance']['min_scrape_delay'])));
|
||||
$backlog = 'unknown';
|
||||
if(count($res)){ $backlog = $res[0]['count'].' entries'; }
|
||||
$maintenance_backlog = 'unknown';
|
||||
if(count($res)){ $maintenance_backlog = $res[0]['count'].' entries'; }
|
||||
|
||||
//Get the pulling backlog size.
|
||||
$res = q("SELECT count(*) as `count` FROM `sync-pull-queue`");
|
||||
$pulling_backlog = 'unknown';
|
||||
if(count($res)){ $pulling_backlog = $res[0]['count'].' entries'; }
|
||||
|
||||
$tpl = file_get_contents('view/admin.tpl');
|
||||
return replace_macros($tpl, array(
|
||||
'$present' => is_file('.htimport') ? ' (present)' : '',
|
||||
'$flagged' => $flagged,
|
||||
'$backlog' => $backlog,
|
||||
'$maintenance_backlog' => $maintenance_backlog,
|
||||
'$pulling_backlog' => $pulling_backlog,
|
||||
'$maintenance_size' => $a->config['maintenance']['max_scrapes'].' items per maintenance call.'
|
||||
));
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
<?php
|
||||
|
||||
require_once('include/submit.php');
|
||||
require_once('include/sync.php');
|
||||
|
||||
function submit_content(&$a) {
|
||||
|
||||
|
@ -8,10 +9,7 @@ function submit_content(&$a) {
|
|||
$url = hex2bin(notags(trim($_GET['url'])));
|
||||
|
||||
//Currently we simply push RAW URL's to our targets.
|
||||
//If we support it that is.
|
||||
if($a->config['syncing']['enable_pushing']){
|
||||
q("INSERT INTO `sync-queue` (`url`) VALUES ('%s')", dbesc($url));
|
||||
}
|
||||
sync_push($url);
|
||||
|
||||
//Run the submit sequence.
|
||||
run_submit($url);
|
||||
|
|
80
mod/sync.php
Normal file
80
mod/sync.php
Normal file
|
@ -0,0 +1,80 @@
|
|||
<?php
|
||||
|
||||
/**
 * JSON endpoint for directory syncing. Always ends the request with exit.
 *
 * Without arguments it reports whether pulling and pushing are enabled.
 * `pull/all` returns the result of do_pull_all(), `pull/since/<timestamp>`
 * the result of do_pull(). Anything else yields an "Unknown method." error.
 *
 * @param  object $a The App instance (argc, argv and config are read).
 * @return void
 */
function sync_content(&$a)
{

	header('Content-type: application/json; charset=utf-8');

	//When no arguments were given, return a json token to show we support this method.
	if($a->argc < 2){
		echo json_encode(array(
			'pulling_enabled'=>!!$a->config['syncing']['enable_pulling'],
			'pushing_enabled'=>!!$a->config['syncing']['enable_pushing']
		));
		exit;
	}

	//Read the optional path segments defensively; a notice would end up inside the JSON body.
	$method = $a->argv[1];
	$action = isset($a->argv[2]) ? $a->argv[2] : null;

	if($method === 'pull'){

		if(!$a->config['syncing']['enable_pulling']){
			echo json_encode(array('error'=>'Pulling disabled.')); exit;
		}

		if($action === 'all'){
			echo json_encode(do_pull_all()); exit;
		}

		if($action === 'since'){
			//A missing timestamp is passed as 0, which do_pull rejects with an error.
			$since = isset($a->argv[3]) ? $a->argv[3] : 0;
			echo json_encode(do_pull($since)); exit;
		}

	}

	//Unknown method, or an unknown pull action.
	echo json_encode(array('error'=>'Unknown method.')); exit;

}
|
||||
|
||||
/**
 * Lists the URL's from `sync-timestamps` that were modified after a given moment.
 * @param  mixed $since Unix timestamp; anything that intval() turns into 0 is rejected.
 * @return array        Either array('error' => ...) or array with `now`, `count` and `results`.
 */
function do_pull($since)
{

	$timestamp = intval($since);
	if($timestamp === 0){
		return array('error' => 'Must set a since timestamp.');
	}

	//Recently modified items.
	$rows = q("SELECT * FROM `sync-timestamps` WHERE `modified` > '%s'", date('Y-m-d H:i:s', $timestamp));

	//Keying by URL removes all duplicates.
	$unique = array();
	foreach($rows as $row){
		$unique[$row['url']] = $row['url'];
	}

	//Drop the keys again, so it's a flat array.
	$results = array_values($unique);

	//Format it nicely.
	$response = array();
	$response['now'] = time();
	$response['count'] = count($results);
	$response['results'] = $results;
	return $response;

}
|
||||
|
||||
/**
 * Lists the `homepage` of every row in the `profile` table, without duplicates.
 * @return array Array with `now` (current Unix time), `count` and `results`.
 */
function do_pull_all()
{

	//Find all the profiles.
	$rows = q("SELECT `homepage` FROM `profile`");

	//Keying by homepage removes all duplicates.
	$unique = array();
	foreach($rows as $row){
		$unique[$row['homepage']] = $row['homepage'];
	}

	//Drop the keys again, so it's a flat array.
	$results = array_values($unique);

	//Format it nicely.
	$response = array();
	$response['now'] = time();
	$response['count'] = count($results);
	$response['results'] = $results;
	return $response;

}
|
|
@ -6,14 +6,22 @@
|
|||
<div class="maintenance-wrapper">
|
||||
<h1>Maintenance</h1>
|
||||
<p>
|
||||
<strong>Current backlog: $backlog</strong><br>
|
||||
<strong>Current maintenance backlog: $maintenance_backlog</strong><br>
|
||||
<i>$maintenance_size</i>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="pulling-wrapper">
|
||||
<h1>Pulling</h1>
|
||||
<p>
|
||||
<strong>Current pulling backlog: $pulling_backlog</strong><br>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="import-wrapper">
|
||||
<h1>Import tools</h1>
|
||||
<h2>Mirror a directory</h2>
|
||||
<p>This is very slow, faster would be to use pull targets as that is multi-threaded.</p>
|
||||
<form method="POST">
|
||||
<label>Extract URL's:</label>
|
||||
<input type="text" name="dir_import_url" value="http://dir.friendica.com">
|
||||
|
|
Loading…
Reference in a new issue