Added syncing (push and pull) and refactored a few functions.
This commit is contained in:
parent
0026b08a33
commit
1fe9bb9b5b
11 changed files with 732 additions and 155 deletions
78
.htconfig.php
Normal file
78
.htconfig.php
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
<?php

//MySQL connection settings.
$db_host = 'localhost';
$db_user = 'friendica-dir';
$db_pass = 'thisisyourpasswordbuddy';
$db_data = 'friendica-dir';

//Choose a legal default timezone. If you are unsure, use "America/Los_Angeles".
//It can be changed later and only applies to timestamps for anonymous viewers.
$default_timezone = 'Europe/Amsterdam';

//What is your site name?
$a->config['sitename'] = "EXPERIMENTAL Friendica public directory";

//Settings related to the syncing feature.
$a->config['syncing'] = array(

    //Pulling may be quite intensive at first when it has to do a full sync and your directory is empty.
    //This timeout should be shorter than your cronjob interval. Preferably with a little breathing room.
    'timeout' => 3*60, //3 minutes

    //Push new submits to the `sync-target` entries?
    'enable_pushing' => true,

    //Maximum amount of items per batch per target to push to other sync-targets.
    //For example: 3 targets x20 items = 60 requests.
    'max_push_items' => 10,

    //Pull updates from the `sync-target` entries?
    'enable_pulling' => true,

    //This is your normal amount of threads for pulling.
    //With regular intervals, there's no need to give this a high value.
    //But when your server is brand new, you may want to keep this high for the first day or two.
    'pulling_threads' => 25,

    //How many items should we crawl per sync?
    'max_pull_items' => 250

);

//Things related to site-health monitoring.
$a->config['site-health'] = array(

    //Wait for at least ... before probing a site again.
    //The longer this value, the more "stable" site-healths will be over time.
    //Note: If a bad (negative) health site submits something, a probe will be performed regardless.
    'min_probe_delay' => 3*24*3600, // 3 days

    //Probes get a simple /friendica/json file from the server.
    //Feel free to set this timeout to a very tight value.
    'probe_timeout' => 5, // seconds

    //Imports should be fast. Feel free to prioritize healthy sites.
    'skip_import_threshold' => -20

);

//Things related to the maintenance cronjob.
$a->config['maintenance'] = array(

    //This is to prevent I/O blocking. Will cost you some RAM overhead though.
    //A good server should handle much more than this default, so you can tweak this.
    'threads' => 10,

    //Limit the amount of scrapes per execution of the maintainer.
    //This will depend a lot on the frequency with which you call the maintainer.
    //If you have 10 threads and 80 max_scrapes, that means each thread will handle 8 scrapes.
    'max_scrapes' => 80,

    //Wait for at least ... before scraping a profile again.
    'min_scrape_delay' => 3*24*3600, // 3 days

    //At which health value should we start removing profiles?
    'remove_profile_health_threshold' => -60

);
|
10
README.md
10
README.md
|
@ -1,4 +1,8 @@
|
||||||
dir
|
# Friendica Global Directory
|
||||||
===
|
|
||||||
|
|
||||||
Friendica Global Directory
|
Example cronjob.
|
||||||
|
|
||||||
|
```
|
||||||
|
*/30 * * * * www-data cd /var/www/friendica-directory; php include/cron_maintain.php
|
||||||
|
*/5 * * * * www-data cd /var/www/friendica-directory; php include/cron_sync.php
|
||||||
|
```
|
15
dfrndir.sql
15
dfrndir.sql
|
@ -203,12 +203,25 @@ CREATE TABLE IF NOT EXISTS `sync-targets` (
|
||||||
`base_url` varchar(255) NOT NULL,
|
`base_url` varchar(255) NOT NULL,
|
||||||
`pull` bit(1) NOT NULL DEFAULT b'0',
|
`pull` bit(1) NOT NULL DEFAULT b'0',
|
||||||
`push` bit(1) NOT NULL DEFAULT b'1',
|
`push` bit(1) NOT NULL DEFAULT b'1',
|
||||||
|
`dt_last_pull` bigint unsigned NULL DEFAULT NULL,
|
||||||
PRIMARY KEY (`base_url`),
|
PRIMARY KEY (`base_url`),
|
||||||
KEY `push` (`push`),
|
KEY `push` (`push`),
|
||||||
KEY `pull` (`pull`)
|
KEY `pull` (`pull`)
|
||||||
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;
|
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS `sync-queue` (
|
CREATE TABLE IF NOT EXISTS `sync-push-queue` (
|
||||||
`url` varchar(255) NOT NULL,
|
`url` varchar(255) NOT NULL,
|
||||||
PRIMARY KEY (`url`)
|
PRIMARY KEY (`url`)
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;
|
||||||
|
|
||||||
|
-- Queue of profile URL's waiting to be pulled in by the sync cronjob.
CREATE TABLE IF NOT EXISTS `sync-pull-queue` (
  `url` varchar(255) NOT NULL,
  PRIMARY KEY (`url`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;

-- Last-modified timestamp per profile URL, served to directories that pull from us.
CREATE TABLE IF NOT EXISTS `sync-timestamps` (
  `url` varchar(255) NOT NULL,
  `modified` datetime NOT NULL,
  PRIMARY KEY (`url`),
  KEY `modified` (`modified`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ;
|
||||||
|
|
93
include/cron_sync.php
Normal file
93
include/cron_sync.php
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
#TODO:
|
||||||
|
|
||||||
|
* First do the pulls then the pushes.
|
||||||
|
If pull prevents the push, the push queue just creates a backlog until it gets a chance to push.
|
||||||
|
|
||||||
|
* When doing a first-pull, there's a safety mechanism for the timeout and detecting duplicate attempts.
|
||||||
|
|
||||||
|
1. Perform all JSON pulls on the source servers.
|
||||||
|
2. Combine the results into one giant pool of URLs.
|
||||||
|
3. Write this pool to a file (TODO-file).
|
||||||
|
4. Shuffle the pool in RAM.
|
||||||
|
5. Start threads for crawling.
|
||||||
|
6. Every finished crawl attempt (successful or not) should write to a 2nd file (DONE-file).
|
||||||
|
|
||||||
|
IF the first-pull times out, don't do anything else.
|
||||||
|
Otherwise, mark the dates we last performed a pull from each server.
|
||||||
|
|
||||||
|
* When resuming a first-pull.
|
||||||
|
|
||||||
|
1. Check for the TODO-file and the DONE-file.
|
||||||
|
2. Remove the entries in the DONE-file from the pool in the TODO-file.
|
||||||
|
3. Replace the TODO-file with the updated pool.
|
||||||
|
4. Perform steps 4, 5 and 6 (shuffle, create threads and crawl) from before.
|
||||||
|
|
||||||
|
This way you can resume without repeating attempts.
|
||||||
|
|
||||||
|
* Write documentation about syncing.
|
||||||
|
|
||||||
|
* Create "official" directory policy for my directory.
|
||||||
|
|
||||||
|
* Decide if a retry mechanism is desirable for pulling (for the failed attempts).
|
||||||
|
After all, you did imply trust when you indicated to pull from that source...
|
||||||
|
This could be done easily by doing a /sync/pull/all again from those sources.
|
||||||
|
|
||||||
|
* Decide if cron_sync.php should be split into push pull and pull-all commands.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Debug stuff.
ini_set('display_errors', 1);
ini_set('log_errors', '0');
error_reporting(E_ALL^E_NOTICE);

//Remember when we started, for the duration log line at the end.
$start_syncing = time();

//Startup.
require_once('boot.php');
$a = new App;

//Create a simple log function for CLI use.
global $verbose;
//Guard the CLI argument with isset(): reading $argv[1] directly raises an
//undefined-offset notice when the script is run without arguments.
$verbose = isset($argv[1]) && $argv[1] === 'verbose';

/**
 * Logs a message, echoing it on the console when running verbosely.
 *
 * @param string $message The message to log.
 * @param bool   $fatal   When true, exit the process with status 1 after logging.
 * @return void
 */
function msg($message, $fatal=false){
    global $verbose;
    if($verbose || $fatal) echo($message.PHP_EOL);
    logger($message);
    if($fatal) exit(1);
}

//Config.
require_once(".htconfig.php");

//Connect the DB.
require_once("dba.php");
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

//Import syncing functions.
require_once('sync.php');

//Get work for pulling.
$pull_batch = get_pulling_job($a);

//Get work for pushing.
list($push_targets, $push_batch) = get_pushing_job($a);

//Close the connection for now. Process forking and DB connections are not the best of friends.
$db->getdb()->close();

//Do the pulls first; the push queue simply builds a backlog meanwhile.
//is_array() guards against the job function returning null instead of a batch.
if(is_array($pull_batch) && count($pull_batch))
    run_pulling_job($a, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install);

//Do our multi-fork push job, if we have a batch and targets.
if(count($push_targets) && count($push_batch))
    run_pushing_job($push_targets, $push_batch, $db_host, $db_user, $db_pass, $db_data, $install);

//Log the time it took.
$time = time() - $start_syncing;
msg("Syncing completed. Took $time seconds.");
|
438
include/sync.php
Normal file
438
include/sync.php
Normal file
|
@ -0,0 +1,438 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Queue a profile URL for pulling.
 *
 * Stores the URL in the `sync-pull-queue` table, from which the pulling
 * workers pick it up later. Does nothing when pulling is disabled.
 *
 * @param string $url Profile URL to queue.
 * @return void
 */
function sync_pull($url)
{
    global $a;

    //Only queue when pulling support is enabled.
    if(!$a->config['syncing']['enable_pulling']) return;

    q("INSERT INTO `sync-pull-queue` (`url`) VALUES ('%s')", dbesc($url));
}
|
||||||
|
|
||||||
|
/**
 * Queue a profile URL for pushing and mark it as modified.
 *
 * The URL is added to `sync-push-queue` (when pushing is enabled) and is
 * always passed to sync_mark() to record the modification timestamp.
 *
 * @param string $url Profile URL to queue.
 * @return void
 */
function sync_push($url)
{
    global $a;

    //Queue for pushing, when pushing support is enabled.
    if($a->config['syncing']['enable_pushing'])
        q("INSERT INTO `sync-push-queue` (`url`) VALUES ('%s')", dbesc($url));

    //Always record the modification, regardless of push support.
    sync_mark($url);
}
|
||||||
|
|
||||||
|
/**
 * Mark a URL as modified in some way or form.
 *
 * This will cause anyone that pulls our changes to see this profile listed.
 * Only done when pulling support is enabled, since the timestamps exist to
 * serve pull requests.
 *
 * @param string $url Profile URL to mark.
 * @return void
 */
function sync_mark($url)
{
    global $a;

    //If we support it that is.
    if(!$a->config['syncing']['enable_pulling']){
        return;
    }

    //Atomic upsert. The previous SELECT-then-INSERT/UPDATE pair raced:
    //two concurrent submits could both see "missing" and both INSERT,
    //making the second fail on the primary key.
    q("INSERT INTO `sync-timestamps` (`url`, `modified`) VALUES ('%s', NOW())
       ON DUPLICATE KEY UPDATE `modified`=NOW()", dbesc($url));
}
|
||||||
|
|
||||||
|
/**
 * Worker body for a single push fork.
 *
 * Lowers its own process priority and submits every item of the batch to
 * the target's /submit endpoint, one HTTP request per item.
 *
 * @param array $target A sync-target database row.
 * @param array $batch  The batch of items to submit.
 * @return void
 */
function push_worker($target, $batch)
{
    //Background job: be nice and lower our priority.
    pcntl_setpriority(5);

    //The target's submit endpoint.
    $submit_url = $target['base_url'].'/submit';

    foreach($batch as $item){
        set_time_limit(30); //This should work for 1 submit.
        msg("Submitting {$item['url']} to $submit_url");
        fetch_url($submit_url.'?url='.bin2hex($item['url']));
    }
}
|
||||||
|
|
||||||
|
/**
 * Fetches all sync-targets that have pushing enabled.
 *
 * @return array Push target rows from the `sync-targets` table.
 */
function get_push_targets(){
    $targets = q("SELECT * FROM `sync-targets` WHERE `push`=b'1'");
    return $targets;
}
|
||||||
|
|
||||||
|
/**
 * Fetches a batch of queued URL's to push, capped by `max_push_items`.
 *
 * @param object $a The App instance (provides the syncing config).
 * @return array Rows from the `sync-push-queue` table.
 */
function get_push_batch($a){
    $limit = intval($a->config['syncing']['max_push_items']);
    return q("SELECT * FROM `sync-push-queue` LIMIT %u", $limit);
}
|
||||||
|
|
||||||
|
/**
 * Gets the push targets together with a batch of queued URL's.
 *
 * When pushing is disabled, or there is nothing to do, the corresponding
 * arrays come back empty so the caller can simply count() them.
 *
 * @param object $a The App instance.
 * @return array list($targets, $batch): target rows and queued URL rows.
 */
function get_pushing_job($a)
{
    //Pushing disabled: nothing to do at all.
    if(!$a->config['syncing']['enable_pushing'])
        return array(array(), array());

    //Find our targets.
    $targets = get_push_targets();

    //Without targets there is no point fetching a batch.
    if(!count($targets)){
        msg('Pushing enabled, but no push targets.');
        return array($targets, array());
    }

    //We have targets; get our batch.
    $batch = get_push_batch($a);
    if(!count($batch)) msg('Empty pushing queue.'); //No batch, means no work.

    return array($targets, $batch);
}
|
||||||
|
|
||||||
|
/**
 * Runs a pushing job, forking one worker per target.
 *
 * A single target is handled inline without forking. When every worker
 * exits cleanly, the pushed batch is removed from `sync-push-queue`.
 *
 * @param array  $targets Pushing targets (rows from `sync-targets`).
 * @param array  $batch   Batch of URL rows to push.
 * @param string $db_host DB host to connect to.
 * @param string $db_user DB user to connect with.
 * @param string $db_pass DB pass to connect with.
 * @param mixed  $db_data Database name (passed straight to dba — confirm).
 * @param mixed  $install Install flag (passed straight to dba — confirm).
 * @return void
 */
function run_pushing_job($targets, $batch, $db_host, $db_user, $db_pass, $db_data, $install)
{
    //One worker per target; more would only stress the remote servers.
    $worker_count = count($targets);
    $child_pids = array();

    //A single target needs no forking at all.
    if($worker_count === 1){
        msg('No threads needed. Only one pushing target.');
        push_worker($targets[0], $batch);
    }

    //Multiple targets: fork a child per target.
    elseif($worker_count > 1){

        //Forking is POSIX-only.
        if(!function_exists('pcntl_fork')){
            msg('Error: no pcntl_fork support. Are you running a different OS? Report an issue please.', true);
        }

        $items = count($batch);
        msg("Creating $worker_count push threads for $items items.");

        for($i = 0; $i < $worker_count; $i++){

            $pid = pcntl_fork();
            if($pid === -1) msg('Error: something went wrong with the fork. '.pcntl_strerror(), true);

            //Child: push to our assigned target, then leave.
            if($pid === 0){ push_worker($targets[$i], $batch); exit; }

            //Parent: remember the child's PID.
            if($pid > 0) $child_pids[] = $pid;
        }
    }

    //Collect all children and note any bad exit codes.
    $threading_problems = false;
    foreach($child_pids as $pid){
        pcntl_waitpid($pid, $status);
        if($status !== 0){
            $threading_problems = true;
            msg("Bad process return value $pid:$status");
        }
    }

    //Only clear the queue when every worker finished cleanly.
    if(!$threading_problems){

        //Forking invalidated our connection; reconnect.
        global $db;
        $db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

        //Build the WHERE clause for the pushed items.
        $escaped_urls = array();
        foreach($batch as $item) $escaped_urls[] = dbesc($item['url']);
        $where = "WHERE `url` IN ('".implode("', '", $escaped_urls)."')";

        //Remove the items from queue.
        q("DELETE FROM `sync-push-queue` $where LIMIT %u", count($batch));
        msg('Removed items from push queue.');
    }
}
|
||||||
|
|
||||||
|
/**
 * Fetches a batch of URL's from the pull queue, capped by `max_pull_items`.
 *
 * The order is randomized so repeated runs don't keep hammering the same
 * servers or the same dead URL's.
 *
 * @param object $a The App instance.
 * @return array Rows from the `sync-pull-queue` table.
 */
function get_queued_pull_batch($a){
    $limit = intval($a->config['syncing']['max_pull_items']);
    $batch = q("SELECT * FROM `sync-pull-queue` ORDER BY RAND() LIMIT %u", $limit);
    msg(sprintf('Pulling %u items from queue.', count($batch)));
    return $batch;
}
|
||||||
|
|
||||||
|
/**
 * Fetches all sync-targets that have pulling enabled.
 *
 * @return array Pull target rows from the `sync-targets` table.
 */
function get_pull_targets(){
    $targets = q("SELECT * FROM `sync-targets` WHERE `pull`=b'1'");
    return $targets;
}
|
||||||
|
|
||||||
|
/**
 * Refills the pull queue from the remote pull targets and returns a batch.
 *
 * Each target is asked for either its full URL list (first pull) or the
 * URL's modified since our last pull. The de-duplicated union is queued
 * via sync_pull(), each successfully pulled target has its `dt_last_pull`
 * advanced to the target's own "now" value, and a fresh queue batch is
 * returned.
 *
 * @param object $a The App instance.
 * @return array Rows from the `sync-pull-queue` table (possibly empty).
 */
function get_remote_pull_batch($a)
{
    //Find our targets.
    $targets = get_pull_targets();

    msg(sprintf('Pulling from %u remote targets.', count($targets)));

    //No targets, means no batch.
    if(!count($targets))
        return array();

    //Pull a list of URL's from each target.
    //Iterate by reference: the original by-value loop threw away the
    //fetched pull_data, so the timestamp loop below never saw it.
    $urls = array();
    foreach($targets as &$target){

        //First pull, or an update?
        if(!$target['dt_last_pull'])
            $url = $target['base_url'].'/sync/pull/all';
        else
            $url = $target['base_url'].'/sync/pull/since/'.intval($target['dt_last_pull']);

        //Go for it :D
        $target['pull_data'] = json_decode(fetch_url($url), true);

        //If we didn't get any JSON.
        if($target['pull_data'] === null){
            msg(sprintf('Failed to pull from "%s".', $url));
            continue;
        }

        //Add all entries as keys, to remove duplicates.
        foreach($target['pull_data']['results'] as $result_url)
            $urls[$result_url] = true;

    }
    unset($target); //Break the lingering reference.

    //Now that we have our URL's. Store them in the queue.
    foreach($urls as $url => $seen){
        if($url) sync_pull($url);
    }

    //Mark each successfully pulled source with its own pull timestamp.
    //Bug fix: this previously read $targets['pull_data'] / $targets['base_url']
    //(the whole array) instead of $target, so it never updated anything.
    foreach($targets as $target){
        if($target['pull_data'] && $target['pull_data']['now'])
            q("UPDATE `sync-targets` SET `dt_last_pull`=%u WHERE `base_url`='%s'",
              $target['pull_data']['now'], dbesc($target['base_url']));
    }

    //Finally, return a batch of this.
    return get_queued_pull_batch($a);
}
|
||||||
|
|
||||||
|
/**
 * Gathers a batch of URL's to scrape from the pulling targets.
 *
 * First drains the local pull queue; only when that is empty does it
 * refill the queue from the remote targets.
 *
 * @param object $a The App instance.
 * @return array URL rows to scrape (empty when there is no work).
 */
function get_pulling_job($a)
{
    //No pulling today...
    if(!$a->config['syncing']['enable_pulling'])
        return array();

    //Firstly, finish the items from our queue.
    $batch = get_queued_pull_batch($a);
    if(count($batch)) return $batch;

    //If that is empty, fill the queue with remote items and return a batch of that.
    $batch = get_remote_pull_batch($a);
    if(count($batch)) return $batch;

    //Bug fix: always return an array. The original fell through here and
    //returned null, making the caller's count() unsafe.
    return array();
}
|
||||||
|
|
||||||
|
/**
 * Worker body for a single pull fork.
 *
 * Lowers its own priority, opens a private DB connection, takes its
 * round-robin share of the batch and resubmits each URL.
 *
 * @param int    $i          The index number of this worker (for round-robin).
 * @param int    $threadc    The amount of workers (for round-robin).
 * @param array  $pull_batch A batch of URL rows to pull.
 * @param string $db_host    DB host to connect to.
 * @param string $db_user    DB user to connect with.
 * @param string $db_pass    DB pass to connect with.
 * @param mixed  $db_data    Database name (passed straight to dba — confirm).
 * @param mixed  $install    Install flag (passed straight to dba — confirm).
 * @return void
 */
function pull_worker($i, $threadc, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install)
{
    //Maintenance work only: be nice and lower our priority.
    pcntl_setpriority(5);

    //Forked children need their own DB connection.
    global $db;
    $db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

    //Round-robin: worker $i takes items $i, $i+$threadc, $i+2*$threadc, ...
    $workload = array();
    for($n = $i; isset($pull_batch[$n]); $n += $threadc)
        $workload[] = $pull_batch[$n];

    //Work through our share, one resubmit at a time.
    while(count($workload)){
        $entry = array_pop($workload);
        set_time_limit(20); //This should work for 1 submit.
        msg("Submitting ".$entry['url']);
        run_submit($entry['url']);
    }
}
|
||||||
|
|
||||||
|
/**
 * Runs a pulling job, forking several workers to share the batch.
 *
 * When every worker exits cleanly, the pulled batch is removed from
 * `sync-pull-queue`.
 *
 * @param object $a          The App instance.
 * @param array  $pull_batch A batch of URL rows to pull.
 * @param string $db_host    DB host to connect to.
 * @param string $db_user    DB user to connect with.
 * @param string $db_pass    DB pass to connect with.
 * @param mixed  $db_data    Database name (passed straight to dba — confirm).
 * @param mixed  $install    Install flag (passed straight to dba — confirm).
 * @return void
 */
function run_pulling_job($a, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install)
{
    //The workers resubmit profiles, so they need the scraper.
    require_once('include/submit.php');

    //Forking is POSIX-only.
    if(!function_exists('pcntl_fork')){
        msg('Error: no pcntl_fork support. Are you running a different OS? Report an issue please.', true);
    }

    //Never spawn more workers than there are items.
    $items = count($pull_batch);
    $worker_count = min($a->config['syncing']['pulling_threads'], $items);
    $child_pids = array();

    msg("Creating $worker_count pulling threads for $items profiles.");

    //Fork the workers.
    for($i = 0; $i < $worker_count; $i++){

        $pid = pcntl_fork();
        if($pid === -1) msg('Error: something went wrong with the fork. '.pcntl_strerror(), true);

        //Child: work through our round-robin share, then leave.
        if($pid === 0){ pull_worker($i, $worker_count, $pull_batch, $db_host, $db_user, $db_pass, $db_data, $install); exit; }

        //Parent: remember the child's PID.
        if($pid > 0) $child_pids[] = $pid;
    }

    //Collect all children and note any bad exit codes.
    $threading_problems = false;
    foreach($child_pids as $pid){
        pcntl_waitpid($pid, $status);
        if($status !== 0){
            $threading_problems = true;
            msg("Bad process return value $pid:$status");
        }
    }

    //Only clear the queue when every worker finished cleanly.
    if(!$threading_problems){

        //Forking invalidated our connection; reconnect.
        global $db;
        $db = new dba($db_host, $db_user, $db_pass, $db_data, $install);

        //Build the WHERE clause for the pulled items.
        $escaped_urls = array();
        foreach($pull_batch as $item) $escaped_urls[] = dbesc($item['url']);
        $where = "WHERE `url` IN ('".implode("', '", $escaped_urls)."')";

        //Remove the items from queue.
        q("DELETE FROM `sync-pull-queue` $where LIMIT %u", count($pull_batch));
        msg('Removed items from pull queue.');
    }
}
|
|
@ -1,141 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
// Debug stuff.
|
|
||||||
ini_set('display_errors', 1);
|
|
||||||
ini_set('log_errors','0');
|
|
||||||
error_reporting(E_ALL^E_NOTICE);
|
|
||||||
|
|
||||||
$start_syncing = time();
|
|
||||||
|
|
||||||
//Startup.
|
|
||||||
require_once('boot.php');
|
|
||||||
$a = new App;
|
|
||||||
|
|
||||||
//Create a simple log function for CLI use.
|
|
||||||
$verbose = $argv[1] === 'verbose';
|
|
||||||
$msg = function($message, $fatal=false)use($verbose){
|
|
||||||
if($verbose || $fatal) echo($message.PHP_EOL);
|
|
||||||
logger($message);
|
|
||||||
if($fatal) exit(1);
|
|
||||||
};
|
|
||||||
|
|
||||||
//Config.
|
|
||||||
require_once(".htconfig.php");
|
|
||||||
|
|
||||||
//No pushing? Leave... because we haven't implemented pulling yet.
|
|
||||||
if(!$a->config['syncing']['enable_pushing']){
|
|
||||||
$msg('No push support enabled in your settings.', true);
|
|
||||||
}
|
|
||||||
|
|
||||||
//Connect the DB.
|
|
||||||
require_once("dba.php");
|
|
||||||
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);
|
|
||||||
|
|
||||||
//Find our targets.
|
|
||||||
$targets = q("SELECT * FROM `sync-targets` WHERE `push`=b'1'");
|
|
||||||
if(!count($targets)) $msg('No targets.', true); //No targets, means no work.
|
|
||||||
|
|
||||||
//Get our batch of URL's.
|
|
||||||
$batch = q("SELECT * FROM `sync-queue` LIMIT %u", intval($a->config['syncing']['max_push_items']));
|
|
||||||
if(!count($batch)) $msg('Empty queue.', true); //No batch, means no work.
|
|
||||||
|
|
||||||
//Close the connection for now. Process forking and DB connections are not the best of friends.
|
|
||||||
$db->getdb()->close();
|
|
||||||
|
|
||||||
//Create a thread for each target we want to serve push messages to.
|
|
||||||
//No good creating more, because it would stress their server too much.
|
|
||||||
$threadc = count($targets);
|
|
||||||
$threads = array();
|
|
||||||
|
|
||||||
//Do we only have 1 target? No need for threads.
|
|
||||||
if($threadc === 1){
|
|
||||||
//Pretend to be worker #1.
|
|
||||||
$pid = 0;
|
|
||||||
$i = 0;
|
|
||||||
$main = true;
|
|
||||||
$msg('No threads needed. Only one pushing target.');
|
|
||||||
}
|
|
||||||
|
|
||||||
//When we need threads.
|
|
||||||
else{
|
|
||||||
|
|
||||||
//POSIX threads only.
|
|
||||||
if(!function_exists('pcntl_fork')){
|
|
||||||
$msg('Error: no pcntl_fork support. Are you running a different OS? Report an issue please.', true);
|
|
||||||
}
|
|
||||||
|
|
||||||
//Debug...
|
|
||||||
$items = count($batch);
|
|
||||||
$msg("Creating $threadc push threads for $items items.");
|
|
||||||
|
|
||||||
//Loop while we need more threads.
|
|
||||||
for($i = 0; $i < $threadc; $i++){
|
|
||||||
|
|
||||||
$pid = pcntl_fork();
|
|
||||||
if($pid === -1) $msg('Error: something went wrong with the fork. '.pcntl_strerror(), true);
|
|
||||||
|
|
||||||
//You're a child, go do some labor!
|
|
||||||
if($pid === 0) break;
|
|
||||||
|
|
||||||
//Store the list of PID's.
|
|
||||||
if($pid > 0) $threads[] = $pid;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//Are we the main thread?
|
|
||||||
$main = $pid !== 0;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//The work for child processes.
|
|
||||||
if($pid === 0){
|
|
||||||
|
|
||||||
//Lets be nice, we're only doing a background job here...
|
|
||||||
pcntl_setpriority(5);
|
|
||||||
|
|
||||||
//Find our target's submit URL.
|
|
||||||
$submit = $targets[$i]['base_url'].'/submit';
|
|
||||||
|
|
||||||
foreach($batch as $item){
|
|
||||||
set_time_limit(30); //This should work for 1 submit.
|
|
||||||
$msg("Submitting {$item['url']} to $submit");
|
|
||||||
fetch_url($submit.'?url='.bin2hex($item['url']));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//The main process.
|
|
||||||
if($main){
|
|
||||||
|
|
||||||
//Wait for all child processes.
|
|
||||||
$all_good = true;
|
|
||||||
foreach($threads as $pid){
|
|
||||||
pcntl_waitpid($pid, $status);
|
|
||||||
if($status !== 0){
|
|
||||||
$all_good = false;
|
|
||||||
$msg("Bad process return value $pid:$status");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//If we did not have any "threading" problems.
|
|
||||||
if($all_good){
|
|
||||||
|
|
||||||
//Reconnect
|
|
||||||
$db = new dba($db_host, $db_user, $db_pass, $db_data, $install);
|
|
||||||
|
|
||||||
//Create a query for deleting this queue.
|
|
||||||
$where = array();
|
|
||||||
foreach($batch as $item) $where[] = dbesc($item['url']);
|
|
||||||
$where = "WHERE `url` IN ('".implode("', '", $where)."')";
|
|
||||||
|
|
||||||
//Remove the items from queue.
|
|
||||||
q("DELETE FROM `sync-queue` $where LIMIT %u", intval($a->config['syncing']['max_push_items']));
|
|
||||||
$msg('Removed items from queue.');
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//Log the time it took.
|
|
||||||
$time = time() - $start_syncing;
|
|
||||||
$msg("Syncing completed. Took $time seconds.");
|
|
||||||
|
|
||||||
}
|
|
|
@ -28,17 +28,23 @@ function admin_content(&$a) {
|
||||||
$flagged = 'No entries.';
|
$flagged = 'No entries.';
|
||||||
}
|
}
|
||||||
|
|
||||||
//Get the backlog size.
|
//Get the maintenance backlog size.
|
||||||
$res = q("SELECT count(*) as `count` FROM `profile` WHERE `updated` < '%s'",
|
$res = q("SELECT count(*) as `count` FROM `profile` WHERE `updated` < '%s'",
|
||||||
dbesc(date('Y-m-d H:i:s', time()-$a->config['maintenance']['min_scrape_delay'])));
|
dbesc(date('Y-m-d H:i:s', time()-$a->config['maintenance']['min_scrape_delay'])));
|
||||||
$backlog = 'unknown';
|
$maintenance_backlog = 'unknown';
|
||||||
if(count($res)){ $backlog = $res[0]['count'].' entries'; }
|
if(count($res)){ $maintenance_backlog = $res[0]['count'].' entries'; }
|
||||||
|
|
||||||
|
//Get the pulling backlog size.
|
||||||
|
$res = q("SELECT count(*) as `count` FROM `sync-pull-queue`");
|
||||||
|
$pulling_backlog = 'unknown';
|
||||||
|
if(count($res)){ $pulling_backlog = $res[0]['count'].' entries'; }
|
||||||
|
|
||||||
$tpl = file_get_contents('view/admin.tpl');
|
$tpl = file_get_contents('view/admin.tpl');
|
||||||
return replace_macros($tpl, array(
|
return replace_macros($tpl, array(
|
||||||
'$present' => is_file('.htimport') ? ' (present)' : '',
|
'$present' => is_file('.htimport') ? ' (present)' : '',
|
||||||
'$flagged' => $flagged,
|
'$flagged' => $flagged,
|
||||||
'$backlog' => $backlog,
|
'$maintenance_backlog' => $maintenance_backlog,
|
||||||
|
'$pulling_backlog' => $pulling_backlog,
|
||||||
'$maintenance_size' => $a->config['maintenance']['max_scrapes'].' items per maintenance call.'
|
'$maintenance_size' => $a->config['maintenance']['max_scrapes'].' items per maintenance call.'
|
||||||
));
|
));
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once('include/submit.php');
|
require_once('include/submit.php');
|
||||||
|
require_once('include/sync.php');
|
||||||
|
|
||||||
function submit_content(&$a) {
|
function submit_content(&$a) {
|
||||||
|
|
||||||
|
@ -8,10 +9,7 @@ function submit_content(&$a) {
|
||||||
$url = hex2bin(notags(trim($_GET['url'])));
|
$url = hex2bin(notags(trim($_GET['url'])));
|
||||||
|
|
||||||
//Currently we simply push RAW URL's to our targets.
|
//Currently we simply push RAW URL's to our targets.
|
||||||
//If we support it that is.
|
sync_push($url);
|
||||||
if($a->config['syncing']['enable_pushing']){
|
|
||||||
q("INSERT INTO `sync-queue` (`url`) VALUES ('%s')", dbesc($url));
|
|
||||||
}
|
|
||||||
|
|
||||||
//Run the submit sequence.
|
//Run the submit sequence.
|
||||||
run_submit($url);
|
run_submit($url);
|
||||||
|
|
80
mod/sync.php
Normal file
80
mod/sync.php
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
function sync_content(&$a)
|
||||||
|
{
|
||||||
|
|
||||||
|
header('Content-type: application/json; charset=utf-8');
|
||||||
|
|
||||||
|
//When no arguments were given, return a json token to show we support this method.
|
||||||
|
if($a->argc < 2){
|
||||||
|
echo json_encode(array(
|
||||||
|
'pulling_enabled'=>!!$a->config['syncing']['enable_pulling'],
|
||||||
|
'pushing_enabled'=>!!$a->config['syncing']['enable_pushing']
|
||||||
|
));
|
||||||
|
exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Method switcher here.
|
||||||
|
else{
|
||||||
|
switch($a->argv[1]){
|
||||||
|
case 'pull':
|
||||||
|
if(!$a->config['syncing']['enable_pulling']){
|
||||||
|
echo json_encode(array('error'=>'Pulling disabled.')); exit;
|
||||||
|
}
|
||||||
|
switch ($a->argv[2]) {
|
||||||
|
case 'all': echo json_encode(do_pull_all()); exit;
|
||||||
|
case 'since': echo json_encode(do_pull($a->argv[3])); exit;
|
||||||
|
}
|
||||||
|
default: echo json_encode(array('error'=>'Unknown method.')); exit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function do_pull($since)
|
||||||
|
{
|
||||||
|
|
||||||
|
if(!intval($since)){
|
||||||
|
return array('error' => 'Must set a since timestamp.');
|
||||||
|
}
|
||||||
|
|
||||||
|
//Recently modified items.
|
||||||
|
$r = q("SELECT * FROM `sync-timestamps` WHERE `modified` > '%s'", date('Y-m-d H:i:s', intval($since)));
|
||||||
|
|
||||||
|
//This removes all duplicates.
|
||||||
|
$profiles = array();
|
||||||
|
foreach($r as $row) $profiles[$row['url']] = $row['url'];
|
||||||
|
|
||||||
|
//This removes the keys, so it's a flat array.
|
||||||
|
$results = array_values($profiles);
|
||||||
|
|
||||||
|
//Format it nicely.
|
||||||
|
return array(
|
||||||
|
'now' => time(),
|
||||||
|
'count' => count($results),
|
||||||
|
'results' => $results
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function do_pull_all()
|
||||||
|
{
|
||||||
|
|
||||||
|
//Find all the profiles.
|
||||||
|
$r = q("SELECT `homepage` FROM `profile`");
|
||||||
|
|
||||||
|
//This removes all duplicates.
|
||||||
|
$profiles = array();
|
||||||
|
foreach($r as $row) $profiles[$row['homepage']] = $row['homepage'];
|
||||||
|
|
||||||
|
//This removes the keys, so it's a flat array.
|
||||||
|
$results = array_values($profiles);
|
||||||
|
|
||||||
|
//Format it nicely.
|
||||||
|
return array(
|
||||||
|
'now' => time(),
|
||||||
|
'count' => count($results),
|
||||||
|
'results' => $results
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
|
@ -6,14 +6,22 @@
|
||||||
<div class="maintenance-wrapper">
|
<div class="maintenance-wrapper">
|
||||||
<h1>Maintenance</h1>
|
<h1>Maintenance</h1>
|
||||||
<p>
|
<p>
|
||||||
<strong>Current backlog: $backlog</strong><br>
|
<strong>Current maintenance backlog: $maintenance_backlog</strong><br>
|
||||||
<i>$maintenance_size</i>
|
<i>$maintenance_size</i>
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="pulling-wrapper">
|
||||||
|
<h1>Pulling</h1>
|
||||||
|
<p>
|
||||||
|
<strong>Current pulling backlog: $pulling_backlog</strong><br>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="import-wrapper">
|
<div class="import-wrapper">
|
||||||
<h1>Import tools</h1>
|
<h1>Import tools</h1>
|
||||||
<h2>Mirror a directory</h2>
|
<h2>Mirror a directory</h2>
|
||||||
|
<p>This is very slow, faster would be to use pull targets as that is multi-threaded.</p>
|
||||||
<form method="POST">
|
<form method="POST">
|
||||||
<label>Extract URL's:</label>
|
<label>Extract URL's:</label>
|
||||||
<input type="text" name="dir_import_url" value="http://dir.friendica.com">
|
<input type="text" name="dir_import_url" value="http://dir.friendica.com">
|
||||||
|
|
Loading…
Add table
Reference in a new issue