93 lines
2.8 KiB
PHP
93 lines
2.8 KiB
PHP
<?php
|
|
|
|
/*
|
|
|
|
#TODO:
|
|
|
|
* First do the pulls then the pushes.
|
|
If pull prevents the push, the push queue just creates a backlog until it gets a chance to push.
|
|
|
|
* When doing a first-pull, there's a safety mechanism for the timeout and detecting duplicate attempts.
|
|
|
|
1. Perform all JSON pulls on the source servers.
|
|
2. Combine the results into one giant pool of URLs.
|
|
3. Write this pool to a file (TODO-file).
|
|
4. Shuffle the pool in RAM.
|
|
5. Start threads for crawling.
|
|
6. Every finished crawl attempt (successful or not) should write to a 2nd file (DONE-file).
|
|
|
|
IF the first-pull times out, don't do anything else.
|
|
Otherwise, mark the dates we last performed a pull from each server.
|
|
|
|
* When resuming a first-pull.
|
|
|
|
1. Check for the TODO-file and the DONE-file.
|
|
2. Remove the entries in the DONE-file from the pool in the TODO-file.
|
|
3. Replace the TODO-file with the updated pool.
|
|
4. Perform steps 4, 5 and 6 (shuffle, create threads and crawl) from before.
|
|
|
|
This way you can resume without repeating attempts.
|
|
|
|
* Write documentation about syncing.
|
|
|
|
* Create "official" directory policy for my directory.
|
|
|
|
* Decide if a retry mechanism is desirable for pulling (for the failed attempts).
|
|
After all, you did imply trust when you indicated to pull from that source...
|
|
This could be done easily by doing a /sync/pull/all again from those sources.
|
|
|
|
* Decide if cron_sync.php should be split into push pull and pull-all commands.
|
|
|
|
*/
|
|
|
|
// Debug stuff.
|
|
ini_set('display_errors', 1);
|
|
ini_set('log_errors','0');
|
|
error_reporting(E_ALL^E_NOTICE);
|
|
|
|
$start_syncing = time();
|
|
|
|
//Startup.
|
|
require_once('boot.php');
|
|
$a = new App;
|
|
|
|
//Create a simple log function for CLI use.
|
|
global $verbose;
|
|
$verbose = $argv[1] === 'verbose';
|
|
|
|
function msg($message, $fatal=false){
|
|
global $verbose;
|
|
if($verbose || $fatal) echo($message.PHP_EOL);
|
|
logger($message);
|
|
if($fatal) exit(1);
|
|
};
|
|
|
|
//Config.
|
|
require_once(".htconfig.php");
|
|
|
|
//Connect the DB.
|
|
require_once("dba.php");
|
|
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);
|
|
|
|
//Import syncing functions.
|
|
require_once('sync.php');
|
|
|
|
//Get work for pulling.
|
|
$pull_batch = get_pulling_job($a);
|
|
|
|
//Get work for pushing.
|
|
list($push_targets, $push_batch) = get_pushing_job($a);
|
|
|
|
//Close the connection for now. Process forking and DB connections are not the best of friends.
|
|
$db->getdb()->close();
|
|
|
|
if(count($pull_batch))
|
|
run_pulling_job($a, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install);
|
|
|
|
//Do our multi-fork job, if we have a batch and targets.
|
|
if(count($push_targets) && count($push_batch))
|
|
run_pushing_job($push_targets, $push_batch, $db_host, $db_user, $db_pass, $db_data, $install);
|
|
|
|
//Log the time it took.
|
|
$time = time() - $start_syncing;
|
|
msg("Syncing completed. Took $time seconds.");
|