From b833a8d25556e03b0c56d306798993472a32f0f2 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 22 Oct 2017 23:48:00 -0400 Subject: [PATCH 1/4] Fix cron_maintain - Update profile before fetching scrape url - Honor $parms['hide'] before validating dfrn site - Fetch maintenance items oldest first - Add backlog size in log - Add pid to logger for easier threaded cron debug --- boot.php | 2 +- include/cron_maintain.php | 21 ++++++++++++++++----- include/submit.php | 24 +++++++++++++++++------- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/boot.php b/boot.php index 0f4f36a4..9944e881 100644 --- a/boot.php +++ b/boot.php @@ -79,7 +79,7 @@ if (!function_exists('logger')) { } require_once('include/datetime.php'); - @file_put_contents($logfile, datetime_convert() . ':' . ' ' . $msg . "\n", FILE_APPEND); + @file_put_contents($logfile, datetime_convert() . ' [#' . getmypid() . '] ' . $msg . "\n", FILE_APPEND); return; } } diff --git a/include/cron_maintain.php b/include/cron_maintain.php index 960fe901..0e1051d4 100644 --- a/include/cron_maintain.php +++ b/include/cron_maintain.php @@ -21,12 +21,23 @@ require_once '.htconfig.php'; require_once 'dba.php'; $db = new dba($db_host, $db_user, $db_pass, $db_data, $install); -//Get our set of items. Youngest items first, after the threshold. -//This may be counter-intuitive, but is to prevent items that fail to update from blocking the rest. + +//Get the maintenance backlog size. +$res = q("SELECT count(*) as `count` +FROM `profile` +WHERE `updated` < '%s'", + dbesc(date('Y-m-d H:i:s', time() - $a->config['maintenance']['min_scrape_delay'])) +); +$maintenance_backlog = 'unknown'; +if (count($res)) { + $maintenance_backlog = $res[0]['count'] . ' entries left'; +} + +//Get our set of items. Oldest items first, after the threshold. 
$res = q("SELECT `id`, `homepage`, `censored` FROM `profile` WHERE `updated` < '%s' -ORDER BY `updated` DESC +ORDER BY `updated` ASC LIMIT %u", dbesc(date('Y-m-d H:i:s', time() - $a->config['maintenance']['min_scrape_delay'])), intval($a->config['maintenance']['max_scrapes']) @@ -56,9 +67,9 @@ $threads = array(); //Debug... if ($verbose) { - echo "Creating $threadc maintainer threads for $items profiles." . PHP_EOL; + echo "Creating $threadc maintainer threads for $items profiles, $maintenance_backlog" . PHP_EOL; } -logger("Creating $threadc maintainer threads for $items profiles."); +logger("Creating $threadc maintainer threads for $items profiles. $maintenance_backlog"); for ($i = 0; $i < $threadc; $i++) { diff --git a/include/submit.php b/include/submit.php index 255657b4..90876c2d 100644 --- a/include/submit.php +++ b/include/submit.php @@ -29,6 +29,14 @@ function run_submit($url) { if(count($r)) { $profile_exists = true; $profile_id = $r[0]['id']; + + $r = q("UPDATE `profile` SET + `updated` = '%s' + WHERE `id` = %d LIMIT 1", + + dbesc(datetime_convert()), + intval($profile_id) + ); } //Remove duplicates. @@ -86,19 +94,21 @@ function run_submit($url) { nuke_record($url); return true; //This is a good update. } - - //This is most likely a problem with the site configuration. Ignore. - elseif(validate_dfrn($parms)) { - return false; - } - + if((x($parms,'hide')) || (! (x($parms,'fn')) && (x($parms,'photo')))) { if($profile_exists) { + logger('Profile inferred to be opted out of the directory.'); nuke_record($url); } return true; //This is a good update. } - + + //This is most likely a problem with the site configuration. Ignore. 
+ if(validate_dfrn($parms)) { + logger('Site is unavailable'); + return false; + } + $photo = $parms['photo']; dbesc_array($parms); From cdb8670dea79f1400a76adc0134d18c7b86f2d43 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 22 Oct 2017 23:48:17 -0400 Subject: [PATCH 2/4] Fix whitespaces in include/submit.php --- include/submit.php | 64 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/include/submit.php b/include/submit.php index 90876c2d..754474a9 100644 --- a/include/submit.php +++ b/include/submit.php @@ -6,17 +6,17 @@ require_once('site-health.php'); function run_submit($url) { global $a; - + if(! strlen($url)) return false; logger('Updating: ' . $url); - + //First run a notice script for the site it is hosted on. $site_health = notice_site($url, true); - + $submit_start = microtime(true); - + $nurl = str_replace(array('https:','//www.'), array('http:','//'), $url); $profile_exists = false; @@ -26,7 +26,7 @@ function run_submit($url) { dbesc($nurl) ); - if(count($r)) { + if(count($r)) { $profile_exists = true; $profile_id = $r[0]['id']; @@ -38,7 +38,7 @@ function run_submit($url) { intval($profile_id) ); } - + //Remove duplicates. if(count($r) > 1){ for($i=1; $iconfig['maintenance']['remove_profile_health_threshold']){ logger('Nuked bad health record.'); nuke_record($url); } - + return false; - + } - + //We don't care about valid dfrn if the user indicates to be hidden. 
elseif($parms['explicit-hide'] && $profile_exists) { logger('User opted out of the directory.'); @@ -117,18 +117,18 @@ function run_submit($url) { $parms['comm'] = intval($parms['comm']); if($profile_exists) { - $r = q("UPDATE `profile` SET - `name` = '%s', + $r = q("UPDATE `profile` SET + `name` = '%s', `pdesc` = '%s', - `locality` = '%s', - `region` = '%s', - `postal-code` = '%s', - `country-name` = '%s', + `locality` = '%s', + `region` = '%s', + `postal-code` = '%s', + `country-name` = '%s', `homepage` = '%s', `nurl` = '%s', `comm` = %d, `tags` = '%s', - `updated` = '%s' + `updated` = '%s' WHERE `id` = %d LIMIT 1", $parms['fn'], @@ -140,7 +140,7 @@ function run_submit($url) { dbesc($url), dbesc($nurl), intval($parms['comm']), - $parms['tags'], + $parms['tags'], dbesc(datetime_convert()), intval($profile_id) ); @@ -206,15 +206,15 @@ function run_submit($url) { } } } - + $submit_photo_start = microtime(true); - + require_once("Photo.php"); $photo_failure = false; - + $status = false; - + if($profile_id) { $img_str = fetch_url($photo,true); $img = new Photo($img_str); @@ -232,11 +232,11 @@ function run_submit($url) { nuke_record($url); return false; } - + $submit_end = microtime(true); $photo_time = round(($submit_end - $submit_photo_start) * 1000); $time = round(($submit_end - $submit_start) * 1000); - + //Record the scrape speed in a scrapes table. if($site_health && $status) q( "INSERT INTO `site-scrape` (`site_health_id`, `dt_performed`, `request_time`, `scrape_time`, `photo_time`, `total_time`)". 
@@ -247,7 +247,7 @@ function run_submit($url) { $photo_time, $time ); - + return $status; } From ef7551df814ef5386440d42be001139dcab71cd9 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 22 Oct 2017 23:55:27 -0400 Subject: [PATCH 3/4] Fix empty $parms check - $parms always contains the `_timings` key --- include/submit.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/submit.php b/include/submit.php index 754474a9..255f6021 100644 --- a/include/submit.php +++ b/include/submit.php @@ -75,7 +75,7 @@ function run_submit($url) { } //Empty result is due to an offline site. - if(!count($parms)){ + if(!count($parms) > 1){ //For large sites this could lower the health too quickly, so don't track health. //But for sites that are already in bad status. Do a cleanup now. From 163d7d2b4b75834342891ff411ff1e8e1a0471d3 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Mon, 23 Oct 2017 00:34:04 -0400 Subject: [PATCH 4/4] Add profile availability field - Add available field to profile table - Make profile unavailable in directory until update succeeds --- include/submit.php | 3 +++ mod/directory.php | 4 ++-- mod/search.php | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/submit.php b/include/submit.php index 255f6021..c454a356 100644 --- a/include/submit.php +++ b/include/submit.php @@ -31,6 +31,7 @@ function run_submit($url) { $profile_id = $r[0]['id']; $r = q("UPDATE `profile` SET + `available` = 0, `updated` = '%s' WHERE `id` = %d LIMIT 1", @@ -56,6 +57,7 @@ function run_submit($url) { //Skip the scrape? :D $noscrape = $site_health && $site_health['no_scrape_url']; + if($noscrape){ //Find out who to look up. 
@@ -128,6 +130,7 @@ function run_submit($url) { `nurl` = '%s', `comm` = %d, `tags` = '%s', + `available` = 1, `updated` = '%s' WHERE `id` = %d LIMIT 1", diff --git a/mod/directory.php b/mod/directory.php index 9f1aa9c3..bd3a879d 100644 --- a/mod/directory.php +++ b/mod/directory.php @@ -46,7 +46,7 @@ function directory_content(App $a) $sql_extra = str_replace('%', '%%', $sql_extra); - $r = q("SELECT COUNT(*) AS `total` FROM `profile` WHERE `censored` = 0 $sql_extra "); + $r = q("SELECT COUNT(*) AS `total` FROM `profile` WHERE `censored` = 0 AND `available` = 1 $sql_extra "); if (count($r)) { $total = $r[0]['total']; $a->set_pager_total($total); @@ -58,7 +58,7 @@ function directory_content(App $a) $order = ' ORDER BY `updated` DESC, `id` DESC '; } - $r = q("SELECT * FROM `profile` WHERE `censored` = 0 $sql_extra $order LIMIT %d , %d ", + $r = q("SELECT * FROM `profile` WHERE `censored` = 0 AND `available` = 1 $sql_extra $order LIMIT %d , %d ", intval($a->pager['start']), intval($a->pager['itemspage']) ); diff --git a/mod/search.php b/mod/search.php index c88795c7..ad07863b 100644 --- a/mod/search.php +++ b/mod/search.php @@ -60,7 +60,7 @@ function search_content(App $a) $sql_extra = str_replace('%', '%%', $sql_extra); $total = 0; - $r = q("SELECT COUNT(*) AS `total` FROM `profile` WHERE `censored` = 0 $sql_extra "); + $r = q("SELECT COUNT(*) AS `total` FROM `profile` WHERE `censored` = 0 AND `available` = 1 $sql_extra "); if (count($r)) { $total = $r[0]['total']; $a->set_pager_total($total); @@ -72,7 +72,7 @@ function search_content(App $a) $order = ' ORDER BY `updated` DESC, `id` DESC '; } - $r = q("SELECT * FROM `profile` WHERE `censored` = 0 $sql_extra $order LIMIT %d , %d ", + $r = q("SELECT * FROM `profile` WHERE `censored` = 0 AND `available` = 1 $sql_extra $order LIMIT %d , %d ", intval($a->pager['start']), intval($a->pager['itemspage']) );