Adding site-health and noscrape support.

Beanow 2014-07-10 23:43:25 +02:00
commit a69a9d2278
15 changed files with 1025 additions and 61 deletions


@@ -1,21 +1,27 @@
<?php
require_once('datetime.php');
require_once('site-health.php');
function run_submit($url) {
global $a;
if(! strlen($url))
return false;
logger('Updating: ' . $url);
//First, run a notice script for the site this profile is hosted on.
$site_health = notice_site($url, true);
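//notice_site() comes from site-health.php (not shown in this diff). From its use below it is
//assumed to return either false or the site's health record as an array with at least the
//keys `id`, `base_url`, `no_scrape_url` and `health_score`.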
$submit_start = microtime(true);
$nurl = str_replace(array('https:','//www.'), array('http:','//'), $url);
$profile_exists = false;
$r = q("SELECT * FROM `profile` WHERE ( `homepage` = '%s' OR `nurl` = '%s' ) LIMIT 1",
$r = q("SELECT * FROM `profile` WHERE ( `homepage` = '%s' OR `nurl` = '%s' )",
dbesc($url),
dbesc($nurl)
);
@@ -24,25 +30,74 @@ function run_submit(&$a, $url) {
$profile_exists = true;
$profile_id = $r[0]['id'];
}
//Remove duplicates; the SELECT above returns every matching profile, so stale copies can be cleaned up here.
if(count($r) > 1){
for($i=1; $i<count($r); $i++){
logger('Removed duplicate profile '.intval($r[$i]['id']));
q("DELETE FROM `photo` WHERE `profile-id` = %d LIMIT 1",
intval($r[$i]['id'])
);
q("DELETE FROM `profile` WHERE `id` = %d LIMIT 1",
intval($r[$i]['id'])
);
}
}
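//The cached photo row appears to be keyed by the profile id, so it is dropped together
//with the duplicate profile record.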
require_once('Scrape.php');
//Skip the scrape? :D
$noscrape = $site_health && $site_health['no_scrape_url'];
if($noscrape){
//Find out who to look up.
$which = str_replace($site_health['base_url'], '', $url);
$noscrape = preg_match('~/profile/([^/]+)~', $which, $matches) === 1;
//If that did not fail...
if($noscrape){
$parms = noscrape_dfrn($site_health['no_scrape_url'].'/'.$matches[1]);
$noscrape = !!$parms; //If the result was false, do a scrape after all.
}
}
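//Worked example (hypothetical URLs): for $url = 'https://example.com/profile/alice' with
//base_url 'https://example.com' and no_scrape_url 'https://example.com/noscrape',
//$matches[1] is 'alice', so noscrape_dfrn() fetches 'https://example.com/noscrape/alice'
//instead of scraping the profile page.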
if(!$noscrape){
$parms = scrape_dfrn($url);
}
// logger('dir_submit: ' . print_r($parms,true));
//An empty result usually means the site is offline.
if(!count($parms)){
//For large sites this could lower the health score too quickly, so don't track health here.
//But if the site is already in bad health, do a cleanup now.
if($profile_exists && $site_health['health_score'] < $a->config['maintenance']['remove_profile_health_threshold']){
logger('Nuked bad health record.');
nuke_record($url);
}
return false;
}
//Even when the DFRN data is valid, remove an existing record if the user explicitly asked to be hidden.
elseif($parms['explicit-hide'] && $profile_exists) {
logger('User opted out of the directory.');
nuke_record($url);
}
//This is most likely a problem with the site configuration. Ignore.
elseif(validate_dfrn($parms)) {
return false;
}
if((x($parms,'hide')) || (! (x($parms,'fn')) && (x($parms,'photo')))) {
if($profile_exists) {
nuke_record($url);
}
return false;
}
$photo = $parms['photo'];
dbesc_array($parms);
@@ -146,11 +201,15 @@ function run_submit(&$a, $url) {
}
}
}
$submit_photo_start = microtime(true);
require_once("Photo.php");
$photo_failure = false;
$status = false;
$img_str = fetch_url($photo,true);
$img = new Photo($img_str);
if($img) {
@@ -162,12 +221,28 @@ function run_submit(&$a, $url) {
dbesc($a->get_baseurl() . '/photo/' . $profile_id . '.jpg'),
intval($profile_id)
);
$status = true;
}
else{
nuke_record($url);
return false;
}
$submit_end = microtime(true);
$photo_time = round(($submit_end - $submit_photo_start) * 1000);
$time = round(($submit_end - $submit_start) * 1000);
//Record the scrape speed in a scrapes table.
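//The `_timings` values are assumed to be filled in by scrape_dfrn()/noscrape_dfrn() and,
//like $photo_time and $time above, to be measured in milliseconds.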
if($site_health && $status) q(
"INSERT INTO `site-scrape` (`site_health_id`, `dt_performed`, `request_time`, `scrape_time`, `photo_time`, `total_time`)".
"VALUES (%u, NOW(), %u, %u, %u, %u)",
$site_health['id'],
$parms['_timings']['fetch'],
$parms['_timings']['scrape'],
$photo_time,
$time
);
return $status;
}
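
The noscrape_dfrn() helper used above is presumably defined in one of the other changed files (likely Scrape.php, which is required earlier) and does not appear in this diff. As orientation only, here is a minimal sketch of such a helper, assuming the no-scrape endpoint returns the profile fields as JSON under the same keys scrape_dfrn() produces and that timings are reported in milliseconds; the name noscrape_dfrn_sketch and all details below are illustrative, not the code from this commit.

<?php
//Hypothetical sketch; the commit's real noscrape_dfrn() may differ.
function noscrape_dfrn_sketch($noscrape_url) {
	$fetch_start = microtime(true);

	//fetch_url() is the same helper run_submit() uses to download the profile photo.
	$data = fetch_url($noscrape_url);
	$fetch_time = round((microtime(true) - $fetch_start) * 1000);

	//Returning false makes run_submit() fall back to a full scrape.
	if(! strlen($data))
		return false;

	//Assumption: the endpoint answers with a JSON object using the same field names
	//as the scraped $parms (fn, photo, hide, ...).
	$parms = json_decode($data, true);
	if(! is_array($parms) || ! count($parms))
		return false;

	//Timings in the shape run_submit() reads them: $parms['_timings']['fetch'|'scrape'],
	//assumed to be milliseconds.
	$parms['_timings'] = array(
		'fetch'  => $fetch_time,
		'scrape' => 0, //no HTML scraping was needed
	);

	return $parms;
}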