Adding site-health and noscrape support.

This commit is contained in:
Beanow 2014-07-10 23:43:25 +02:00
parent 1bac9fb268
commit a69a9d2278
15 changed files with 1025 additions and 61 deletions

View file

@ -144,3 +144,58 @@ CREATE TABLE IF NOT EXISTS `user` (
`password` char(255) NOT NULL, `password` char(255) NOT NULL,
PRIMARY KEY (`uid`) PRIMARY KEY (`uid`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ; ) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;
-- --------------------------------------------------------
--
-- Table structure for table `site-health`
--
-- One row per known site. Rows are created when a site is first noticed
-- and refreshed every time the site is probed.
CREATE TABLE IF NOT EXISTS `site-health` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
-- "scheme://host" of the site, as derived from a submitted profile URL.
`base_url` varchar(255) NOT NULL,
-- Running score; the probe scoring code keeps it between -100 and +100.
`health_score` int(11) NOT NULL DEFAULT 0,
-- Copied from the probe response when the site advertises one.
`no_scrape_url` varchar(255) NULL DEFAULT NULL,
-- Set once, when the row is inserted.
`dt_first_noticed` datetime NOT NULL,
-- Updated whenever a probe response could be parsed.
`dt_last_seen` datetime NULL DEFAULT NULL,
-- Updated on every probe, successful or not.
`dt_last_probed` datetime NULL DEFAULT NULL,
-- NOTE(review): no writer for this column is visible here; confirm its use.
`dt_last_heartbeat` datetime NULL DEFAULT NULL,
-- The columns below are copied from the site's probe response.
`name` varchar(255) NULL DEFAULT NULL,
`version` varchar(255) NULL DEFAULT NULL,
-- Plugin names joined with CRLF.
`plugins` text NULL DEFAULT NULL,
`reg_policy` char(32) NULL DEFAULT NULL,
`info` text NULL DEFAULT NULL,
`admin_name` varchar(255) NULL DEFAULT NULL,
`admin_profile` varchar(255) NULL DEFAULT NULL,
-- NULL: probed over plain HTTP (or never probed); 1: certificate verified;
-- 0: the probe only succeeded after certificate verification was relaxed.
`ssl_state` bit(1) NULL DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `base_url` (`base_url`),
KEY `health_score` (`health_score`),
KEY `dt_last_seen` (`dt_last_seen`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;
--
-- Table structure for table `site-probe`
--
-- History of probes whose response could be parsed, one row per probe.
CREATE TABLE IF NOT EXISTS `site-probe` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
-- References `site-health`.`id` (not enforced; MyISAM has no foreign keys).
`site_health_id` int(10) unsigned NOT NULL,
`dt_performed` datetime NOT NULL,
-- Duration of the probe request in milliseconds.
`request_time` int(10) unsigned NOT NULL,
PRIMARY KEY (`id`),
KEY `site_health_id` (`site_health_id`),
KEY `dt_performed` (`dt_performed`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;
--
-- Table structure for table `site-scrape`
--
-- Timing history of profile submissions, one row per successful submit.
-- All *_time columns are in milliseconds.
CREATE TABLE IF NOT EXISTS `site-scrape` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
-- References `site-health`.`id` (not enforced; MyISAM has no foreign keys).
`site_health_id` int(10) unsigned NOT NULL,
`dt_performed` datetime NOT NULL,
-- Time spent fetching the profile data (the '_timings' => 'fetch' value).
`request_time` int(10) unsigned NOT NULL,
-- Time spent extracting data from the response ('_timings' => 'scrape').
`scrape_time` int(10) unsigned NOT NULL,
-- Time spent fetching and storing the profile photo.
`photo_time` int(10) unsigned NOT NULL,
-- Duration of the whole submit run.
`total_time` int(10) unsigned NOT NULL,
PRIMARY KEY (`id`),
KEY `site_health_id` (`site_health_id`),
KEY `dt_performed` (`dt_performed`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ;

View file

@ -11,15 +11,41 @@ function attribute_contains($attr,$s) {
}} }}
if(! function_exists('noscrape_dfrn')) {
/**
 * Fetches a profile through a site's "noscrape" JSON endpoint.
 *
 * @param string $url Full noscrape URL of the profile.
 * @return array|false The decoded profile fields, with 'tags' flattened to a
 *                     space separated string and a '_timings' entry holding
 *                     the fetch and decode durations in milliseconds.
 *                     False when nothing usable was returned.
 */
function noscrape_dfrn($url) {

    $submit_noscrape_start = microtime(true);
    $data = fetch_url($url);
    $submit_noscrape_request_end = microtime(true);

    if(empty($data)) return false;

    $parms = json_decode($data, true);

    //Valid JSON is not necessarily an object/array: a bare string or number
    //decodes to a scalar, which can neither be counted nor extended below.
    if(!is_array($parms) || !count($parms)) return false;

    //Tags are optional; treat a missing key as "no tags".
    $parms['tags'] = isset($parms['tags']) ? implode(' ', (array)$parms['tags']) : '';

    $submit_noscrape_end = microtime(true);
    $parms['_timings'] = array(
        'fetch' => round(($submit_noscrape_request_end - $submit_noscrape_start) * 1000),
        'scrape' => round(($submit_noscrape_end - $submit_noscrape_request_end) * 1000)
    );

    return $parms;

}}
if(! function_exists('scrape_dfrn')) { if(! function_exists('scrape_dfrn')) {
function scrape_dfrn($url, $max_nodes=5000) { function scrape_dfrn($url, $max_nodes=3500) {
$minNodes = 100; //Lets do at least 100 nodes per type. $minNodes = 100; //Lets do at least 100 nodes per type.
$timeout = 10; //Timeout will affect batch processing. $timeout = 10; //Timeout will affect batch processing.
//Try and cheat our way into faster profiles.
if(strpos($url, 'tab=profile') === false){
$url .= (strpos($url, '?') > 0 ? '&' : '?').'tab=profile';
}
$scrape_start = microtime(true);
$ret = array(); $ret = array();
$s = fetch_url($url, $timeout); $s = fetch_url($url, $timeout);
$scrape_fetch_end = microtime(true);
if(! $s) if(! $s)
return $ret; return $ret;
@ -28,30 +54,36 @@ function scrape_dfrn($url, $max_nodes=5000) {
if(! $dom) if(! $dom)
return $ret; return $ret;
$items = $dom->getElementsByTagName('meta'); $items = $dom->getElementsByTagName('meta');
// get DFRN link elements // get DFRN link elements
$nodes_left = max(intval($max_nodes), $minNodes); $nodes_left = max(intval($max_nodes), $minNodes);
$targets = array('hide', 'comm', 'tags');
$targets_left = count($targets);
foreach($items as $item) { foreach($items as $item) {
$x = $item->getAttribute('name'); $x = $item->getAttribute('name');
if($x == 'dfrn-global-visibility') { if($x == 'dfrn-global-visibility') {
$z = strtolower(trim($item->getAttribute('content'))); $z = strtolower(trim($item->getAttribute('content')));
if($z != 'true') if($z != 'true')
$ret['hide'] = 1; $ret['hide'] = 1;
if($z === 'false')
$ret['explicit-hide'] = 1;
$targets_left = pop_scrape_target($targets, 'hide');
} }
if($x == 'friendika.community' || $x == 'friendica.community') { if($x == 'friendika.community' || $x == 'friendica.community') {
$z = strtolower(trim($item->getAttribute('content'))); $z = strtolower(trim($item->getAttribute('content')));
if($z == 'true') if($z == 'true')
$ret['comm'] = 1; $ret['comm'] = 1;
$targets_left = pop_scrape_target($targets, 'comm');
} }
if($x == 'keywords') { if($x == 'keywords') {
$z = str_replace(',',' ',strtolower(trim($item->getAttribute('content')))); $z = str_replace(',',' ',strtolower(trim($item->getAttribute('content'))));
if(strlen($z)) if(strlen($z))
$ret['tags'] = $z; $ret['tags'] = $z;
$targets_left = pop_scrape_target($targets, 'tags');
} }
$nodes_left--; $nodes_left--;
if($nodes_left <= 0) break; if($nodes_left <= 0 || $targets_left <= 0) break;
} }
$items = $dom->getElementsByTagName('link'); $items = $dom->getElementsByTagName('link');
@ -71,37 +103,69 @@ function scrape_dfrn($url, $max_nodes=5000) {
$nodes_left = max(intval($max_nodes), $minNodes); $nodes_left = max(intval($max_nodes), $minNodes);
$items = $dom->getElementsByTagName('*'); $items = $dom->getElementsByTagName('*');
$targets = array('fn', 'pdesc', 'photo', 'key', 'locality', 'region', 'postal-code', 'country-name', 'gender', 'marital');
$targets_left = count($targets);
foreach($items as $item) { foreach($items as $item) {
if(attribute_contains($item->getAttribute('class'), 'vcard')) { if(attribute_contains($item->getAttribute('class'), 'vcard')) {
$level2 = $item->getElementsByTagName('*'); $level2 = $item->getElementsByTagName('*');
foreach($level2 as $x) { foreach($level2 as $x) {
if(attribute_contains($x->getAttribute('class'),'fn')) if(attribute_contains($x->getAttribute('class'),'fn')){
$ret['fn'] = $x->textContent; $ret['fn'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'title')) $targets_left = pop_scrape_target($targets, 'fn');
}
if(attribute_contains($x->getAttribute('class'),'title')){
$ret['pdesc'] = $x->textContent; $ret['pdesc'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'photo')) $targets_left = pop_scrape_target($targets, 'pdesc');
}
if(attribute_contains($x->getAttribute('class'),'photo')){
$ret['photo'] = $x->getAttribute('src'); $ret['photo'] = $x->getAttribute('src');
if(attribute_contains($x->getAttribute('class'),'key')) $targets_left = pop_scrape_target($targets, 'photo');
}
if(attribute_contains($x->getAttribute('class'),'key')){
$ret['key'] = $x->textContent; $ret['key'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'locality')) $targets_left = pop_scrape_target($targets, 'key');
}
if(attribute_contains($x->getAttribute('class'),'locality')){
$ret['locality'] = $x->textContent; $ret['locality'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'region')) $targets_left = pop_scrape_target($targets, 'locality');
}
if(attribute_contains($x->getAttribute('class'),'region')){
$ret['region'] = $x->textContent; $ret['region'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'postal-code')) $targets_left = pop_scrape_target($targets, 'region');
}
if(attribute_contains($x->getAttribute('class'),'postal-code')){
$ret['postal-code'] = $x->textContent; $ret['postal-code'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'country-name')) $targets_left = pop_scrape_target($targets, 'postal-code');
}
if(attribute_contains($x->getAttribute('class'),'country-name')){
$ret['country-name'] = $x->textContent; $ret['country-name'] = $x->textContent;
if(attribute_contains($x->getAttribute('class'),'x-gender')) $targets_left = pop_scrape_target($targets, 'country-name');
}
if(attribute_contains($x->getAttribute('class'),'x-gender')){
$ret['gender'] = $x->textContent; $ret['gender'] = $x->textContent;
$targets_left = pop_scrape_target($targets, 'gender');
} }
} }
if(attribute_contains($item->getAttribute('class'),'marital-text')) }
if(attribute_contains($item->getAttribute('class'),'marital-text')){
$ret['marital'] = $item->textContent; $ret['marital'] = $item->textContent;
$nodes_left--; $targets_left = pop_scrape_target($targets, 'marital');
if($nodes_left <= 0) break;
} }
$nodes_left--;
if($nodes_left <= 0 || $targets_left <= 0) break;
}
$scrape_end = microtime(true);
$fetch_time = round(($scrape_fetch_end - $scrape_start) * 1000);
$scrape_time = round(($scrape_end - $scrape_fetch_end) * 1000);
$ret['_timings'] = array(
'fetch' => $fetch_time,
'scrape' => $scrape_time
);
return $ret; return $ret;
}} }}
@ -121,3 +185,10 @@ function validate_dfrn($a) {
return $errors; return $errors;
}} }}
if(! function_exists('pop_scrape_target')) {
/**
 * Removes a scrape target from the list of targets still being looked for.
 *
 * Safe to call repeatedly for the same name: when the name is not (or no
 * longer) in the list, the list is left untouched.
 *
 * @param array  $array List of remaining target names, modified in place.
 * @param string $name  The target that has just been found.
 * @return int Number of targets still remaining.
 */
function pop_scrape_target(&$array, $name) {
    $at = array_search($name, $array, true);
    //array_search() yields false for a miss. Using false as an array key
    //means key 0, so an unguarded unset would drop an unrelated target.
    if($at !== false) {
        unset($array[$at]);
    }
    return count($array);
}}

15
include/g.line-min.js vendored Normal file
View file

@ -0,0 +1,15 @@
/*!
* g.Raphael 0.51 - Charting library, based on Raphaël
*
* Copyright (c) 2009-2012 Dmitry Baranovskiy (http://g.raphaeljs.com)
* Licensed under the MIT (http://www.opensource.org/licenses/mit-license.php) license.
*/
(function(){function S(h,o){for(var p=h.length/o,m=0,k=p,b=0,i=[];m<h.length;)k--,0>k?(b+=h[m]*(1+k),i.push(b/p),b=h[m++]*-k,k+=p):b+=1*h[m++];return i}function E(h,o,p,m,k,b,i,c){var F,f,u,w;function J(a){for(var s=[],e=0,G=b.length;e<G;e++)s=s.concat(b[e]);s.sort(function(a,e){return a-e});for(var c=[],g=[],e=0,G=s.length;e<G;e++)s[e]!=s[e-1]&&c.push(s[e])&&g.push(o+d+(s[e]-v)*A);for(var s=c,G=s.length,l=a||h.set(),e=0;e<G;e++){var c=g[e]-(g[e]-(g[e-1]||o))/2,f=((g[e+1]||o+m)-g[e])/2+(g[e]-(g[e-
1]||o))/2,j;a?j={}:l.push(j=h.rect(c-1,p,Math.max(f+1,1),k).attr({stroke:"none",fill:"#000",opacity:0}));j.values=[];j.symbols=h.set();j.y=[];j.x=g[e];j.axis=s[e];for(var f=0,r=i.length;f<r;f++)for(var c=b[f]||b[0],n=0,u=c.length;n<u;n++)c[n]==s[e]&&(j.values.push(i[f][n]),j.y.push(p+k-d-(i[f][n]-y)*H),j.symbols.push(q.symbols[f][n]));a&&a.call(j)}!a&&(t=l)}function N(a){for(var g=a||h.set(),e,c=0,j=i.length;c<j;c++)for(var f=0,m=i[c].length;f<m;f++){var l=o+d+((b[c]||b[0])[f]-v)*A,n=o+d+((b[c]||
b[0])[f?f-1:1]-v)*A,r=p+k-d-(i[c][f]-y)*H;a?e={}:g.push(e=h.circle(l,r,Math.abs(n-l)/2).attr({stroke:"#000",fill:"#000",opacity:1}));e.x=l;e.y=r;e.value=i[c][f];e.line=q.lines[c];e.shade=q.shades[c];e.symbol=q.symbols[c][f];e.symbols=q.symbols[c];e.axis=(b[c]||b[0])[f];a&&a.call(e)}!a&&(C=g)}c=c||{};h.raphael.is(b[0],"array")||(b=[b]);h.raphael.is(i[0],"array")||(i=[i]);for(var d=c.gutter||10,l=Math.max(b[0].length,i[0].length),O=c.symbol||"",P=c.colors||this.colors,t=null,C=null,q=h.set(),g=[],a=
0,n=i.length;a<n;a++)l=Math.max(l,i[a].length);for(var K=h.set(),a=0,n=i.length;a<n;a++)c.shade&&K.push(h.path().attr({stroke:"none",fill:P[a],opacity:c.nostroke?1:0.3})),i[a].length>m-2*d&&(i[a]=S(i[a],m-2*d),l=m-2*d),b[a]&&b[a].length>m-2*d&&(b[a]=S(b[a],m-2*d));var g=Array.prototype.concat.apply([],b),l=Array.prototype.concat.apply([],i),g=this.snapEnds(Math.min.apply(Math,g),Math.max.apply(Math,g),b[0].length-1),v=g.from,g=g.to,l=this.snapEnds(Math.min.apply(Math,l),Math.max.apply(Math,l),i[0].length-
1),y=l.from,a=l.to,A=(m-2*d)/(g-v||1),H=(k-2*d)/(a-y||1),l=h.set();c.axis&&(n=(c.axis+"").split(/[,\s]+/),+n[0]&&l.push(this.axis(o+d,p+d,m-2*d,v,g,c.axisxstep||Math.floor((m-2*d)/20),2,h)),+n[1]&&l.push(this.axis(o+m-d,p+k-d,k-2*d,y,a,c.axisystep||Math.floor((k-2*d)/20),3,h)),+n[2]&&l.push(this.axis(o+d,p+k-d,m-2*d,v,g,c.axisxstep||Math.floor((m-2*d)/20),0,h)),+n[3]&&l.push(this.axis(o+d,p+k-d,k-2*d,y,a,c.axisystep||Math.floor((k-2*d)/20),1,h)));for(var Q=h.set(),R=h.set(),E,a=0,n=i.length;a<n;a++){c.nostroke||
Q.push(E=h.path().attr({stroke:P[a],"stroke-width":c.width||2,"stroke-linejoin":"round","stroke-linecap":"round","stroke-dasharray":c.dash||""}));for(var D=Raphael.is(O,"array")?O[a]:O,I=h.set(),g=[],j=0,T=i[a].length;j<T;j++){var x=o+d+((b[a]||b[0])[j]-v)*A,z=p+k-d-(i[a][j]-y)*H;(Raphael.is(D,"array")?D[j]:D)&&I.push(h[Raphael.is(D,"array")?D[j]:D](x,z,3*(c.width||2)).attr({fill:P[a],stroke:"none"}));if(c.smooth){if(j&&j!=T-1){f=o+d+((b[a]||b[0])[j-1]-v)*A;var L=p+k-d-(i[a][j-1]-y)*H;u=x;w=z;var r=
o+d+((b[a]||b[0])[j+1]-v)*A,B=p+k-d-(i[a][j+1]-y)*H,M=(u-f)/2;F=(r-u)/2;f=Math.atan((u-f)/Math.abs(w-L));r=Math.atan((r-u)/Math.abs(w-B));f=L<w?Math.PI-f:f;r=B<w?Math.PI-r:r;B=Math.PI/2-(f+r)%(2*Math.PI)/2;L=M*Math.sin(B+f);f=M*Math.cos(B+f);M=F*Math.sin(B+r);r=F*Math.cos(B+r);F=u-L;f=w+f;u+=M;w+=r;g=g.concat([F,f,x,z,u,w])}j||(g=["M",x,z,"C",x,z])}else g=g.concat([j?"L":"M",x,z])}c.smooth&&(g=g.concat([x,z,x,z]));R.push(I);c.shade&&K[a].attr({path:g.concat(["L",x,p+k-d,"L",o+d+((b[a]||b[0])[0]-v)*
A,p+k-d,"z"]).join(",")});!c.nostroke&&E.attr({path:g.join(",")})}q.push(Q,K,R,l,t,C);q.lines=Q;q.shades=K;q.symbols=R;q.axis=l;q.hoverColumn=function(a,c){!t&&J();t.mouseover(a).mouseout(c);return this};q.clickColumn=function(a){!t&&J();t.click(a);return this};q.hrefColumn=function(a){var c=h.raphael.is(arguments[0],"array")?arguments[0]:arguments;if(!(arguments.length-1)&&typeof a=="object")for(var e in a)for(var b=0,d=t.length;b<d;b++)t[b].axis==e&&t[b].attr("href",a[e]);!t&&J();b=0;for(d=c.length;b<
d;b++)t[b]&&t[b].attr("href",c[b]);return this};q.hover=function(a,b){!C&&N();C.mouseover(a).mouseout(b);return this};q.click=function(a){!C&&N();C.click(a);return this};q.each=function(a){N(a);return this};q.eachColumn=function(a){J(a);return this};return q}var I=function(){};I.prototype=Raphael.g;E.prototype=new I;Raphael.fn.linechart=function(h,o,p,m,k,b,i){return new E(this,h,o,p,m,k,b,i)}})();

7
include/g.raphael.js Normal file

File diff suppressed because one or more lines are too long

11
include/raphael.js Normal file

File diff suppressed because one or more lines are too long

342
include/site-health.php Normal file
View file

@ -0,0 +1,342 @@
<?php
/*
  Registers the site that a submitted URL belongs to and returns its
  site-health row.

  A site that is not tracked yet gets a fresh site-health row.
  With $check_health enabled the site may also be probed right away (a CURL
  request): always for a newly added site, and for a known site when its
  health score is below -40 or when the last probe is older than the
  configured minimum probe delay.
  Only enable $check_health when there is execution time to spare. A submit
  is a good moment for it though, as the site is most likely online then.

  Returns the site-health row, or false when the site was only just added
  and has not been probed.
*/
if(! function_exists('notice_site')){
function notice_site($url, $check_health=false)
{

    global $a;

    $lookup = "SELECT * FROM `site-health` WHERE `base_url`= '%s' ORDER BY `id` ASC LIMIT 1";

    //Reduce the URL to its site and look that up.
    $site = parse_site_from_url($url);
    $rows = q($lookup, dbesc($site));
    $known = !empty($rows) && isset($rows[0]);

    //New site: start tracking it.
    if(!$known){

        q(
            "INSERT INTO `site-health` (`base_url`, `dt_first_noticed`) VALUES ('%s', NOW())",
            dbesc($site)
        );

        //Without permission to probe there is no row to hand back yet.
        if(!$check_health){
            return false;
        }

        //Read the new row back, as the probe needs its ID.
        $rows = q($lookup, dbesc($site));
        if(empty($rows) || !isset($rows[0])){
            return false;
        }

        $entry = $rows[0];
        run_site_probe($rows[0]['id'], $entry);
        return $entry;

    }

    //Known site: probe when allowed and when it is worth doing so.
    $entry = $rows[0];
    if($check_health){

        //A site in bad health that just submitted something is probably doing better now.
        $in_bad_health = $entry['health_score'] < -40;

        //The delay check keeps probing from being left entirely to the batches.
        if($in_bad_health || strtotime($entry['dt_last_probed']) < time()-$a->config['site-health']['min_probe_delay']){
            run_site_probe($entry['id'], $entry);
        }

    }

    return $entry;

}}
//Extracts the site ("scheme://host") from a given URL.
//Returns an empty string when the URL has no usable scheme or host,
//so callers can test the result for truthiness.
if(! function_exists('parse_site_from_url')){
function parse_site_from_url($url)
{

    //Currently a simple implementation, but may improve over time.
    #TODO: support subdirectories?
    if(!is_string($url)){
        return '';
    }

    //parse_url() returns false for seriously malformed URLs and simply leaves
    //out the components it cannot find.
    $urlMeta = parse_url($url);
    if(!is_array($urlMeta)
        || !isset($urlMeta['scheme']) || $urlMeta['scheme'] === ''
        || !isset($urlMeta['host']) || $urlMeta['host'] === ''
    ){
        return '';
    }

    return $urlMeta['scheme'].'://'.$urlMeta['host'];

}}
//Performs a probe of the site with the given site-health ID.
//You may need to notice the site first before you know its ID.
if(! function_exists('run_site_probe')){
/**
 * Probes a tracked site and stores the outcome.
 *
 * Requests <base_url>/friendica/json, records the response time in
 * `site-probe` when the response could be parsed, and updates the site's
 * `site-health` row (health score, probe date and, on success, the data the
 * site reported about itself).
 *
 * @param int   $id        ID of the `site-health` row to probe.
 * @param array $entry_out Receives the refreshed `site-health` row. It is
 *                         left untouched when the row cannot be read back.
 * @throws \Exception When no site-health row exists for $id.
 */
function run_site_probe($id, &$entry_out)
{

    global $a;

    //Get the site information from the DB, based on the ID.
    $result = q(
        "SELECT * FROM `site-health` WHERE `id`= %u ORDER BY `id` ASC LIMIT 1",
        intval($id)
    );

    //Abort the probe if site is not known.
    if(!$result || !isset($result[0])){
        logger('Unknown site-health ID being probed: '.$id);
        throw new \Exception('Unknown site-health ID being probed: '.$id);
    }

    //Shortcut.
    $entry = $result[0];
    $base_url = $entry['base_url'];
    $probe_location = $base_url.'/friendica/json';

    //Prepare the CURL call.
    $handle = curl_init();
    $options = array(

        //Timeouts
        CURLOPT_TIMEOUT => max($a->config['site-health']['probe_timeout'], 1), //Minimum of 1 second timeout.
        CURLOPT_CONNECTTIMEOUT => 1,

        //Redirecting
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 8,

        //SSL
        CURLOPT_SSL_VERIFYPEER => true,
        // CURLOPT_VERBOSE => true,
        // CURLOPT_CERTINFO => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,

        //Basic request
        CURLOPT_USERAGENT => 'friendica-directory-probe-0.1',
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_URL => $probe_location

    );
    curl_setopt_array($handle, $options);

    //Probe the site.
    $probe_start = microtime(true);
    $probe_data = curl_exec($handle);
    $probe_end = microtime(true);

    //Check for SSL problems.
    $sslcert_issues = in_array(curl_errno($handle), array(
        60, //Could not authenticate certificate with known CA's
        83  //Issuer check failed
    ));

    //When it's the certificate that doesn't work.
    if($sslcert_issues){

        //Probe again, without strict SSL.
        $options[CURLOPT_SSL_VERIFYPEER] = false;

        //Replace the handler.
        curl_close($handle);
        $handle = curl_init();
        curl_setopt_array($handle, $options);

        //Probe.
        $probe_start = microtime(true);
        $probe_data = curl_exec($handle);
        $probe_end = microtime(true);

    }

    //Response time in milliseconds.
    $time = intval(round(($probe_end - $probe_start) * 1000));

    //Done with CURL now.
    curl_close($handle);

    #TODO: if the site redirects elsewhere (compare CURLINFO_EFFECTIVE_URL with the base URL), notice that site and record an issue.

    //json_decode() signals failure through its return value, it does not throw.
    //Only a JSON object is a usable probe response; curl_exec() gives false on failure.
    $data = is_string($probe_data) ? json_decode($probe_data) : null;
    $parse_failed = !is_object($data);

    //Reads an optional text property from the response, defaulting to ''.
    //The response comes from a remote server, so no field can be taken for granted.
    $text = function($object, $property){
        if(is_object($object) && isset($object->$property) && is_scalar($object->$property)){
            return (string)$object->$property;
        }
        return '';
    };

    $parsedDataQuery = '';
    if(!$parse_failed){

        //Record the probe speed in a probes table.
        q(
            "INSERT INTO `site-probe` (`site_health_id`, `dt_performed`, `request_time`)".
            "VALUES (%u, NOW(), %u)",
            intval($entry['id']),
            $time
        );

        $plugins = (isset($data->plugins) && is_array($data->plugins))
            ? implode("\r\n", array_filter($data->plugins, 'is_scalar'))
            : '';
        $admin = isset($data->admin) ? $data->admin : null;

        //Update any health calculations or otherwise processed data.
        $parsedDataQuery = sprintf(
            "`dt_last_seen` = NOW(),
            `name` = '%s',
            `version` = '%s',
            `plugins` = '%s',
            `reg_policy` = '%s',
            `info` = '%s',
            `admin_name` = '%s',
            `admin_profile` = '%s',
            ",
            dbesc($text($data, 'site_name')),
            dbesc($text($data, 'version')),
            dbesc($plugins),
            dbesc($text($data, 'register_policy')),
            dbesc($text($data, 'info')),
            dbesc($text($admin, 'name')),
            dbesc($text($admin, 'profile'))
        );

        //Did we use HTTPS?
        $urlMeta = parse_url($probe_location);
        if(is_array($urlMeta) && isset($urlMeta['scheme']) && $urlMeta['scheme'] == 'https'){
            $parsedDataQuery .= "`ssl_state` = b'".($sslcert_issues ? '0' : '1')."',";
        } else {
            $parsedDataQuery .= "`ssl_state` = NULL,";
        }

        //Do we have a no scrape supporting node? :D
        if(isset($data->no_scrape_url) && is_scalar($data->no_scrape_url)){
            $parsedDataQuery .= sprintf("`no_scrape_url` = '%s',", dbesc((string)$data->no_scrape_url));
        }

        //This fragment becomes part of the query template handed to q(), which
        //fills in printf-style placeholders. A literal '%' in the remote data
        //(an info text saying "100% free", for example) must therefore be doubled.
        $parsedDataQuery = str_replace('%', '%%', $parsedDataQuery);

    }

    //Get the new health.
    $version = $parse_failed ? '' : $text($data, 'version');
    $health = health_score_after_probe($entry['health_score'], !$parse_failed, $time, $version, $sslcert_issues);

    //Update the health.
    q("UPDATE `site-health` SET
        `health_score` = '%d',
        ".$parsedDataQuery."
        `dt_last_probed` = NOW()
        WHERE `id` = %d LIMIT 1",
        $health,
        intval($entry['id'])
    );

    //Get the site information from the DB, based on the ID.
    $result = q(
        "SELECT * FROM `site-health` WHERE `id`= %u ORDER BY `id` ASC LIMIT 1",
        intval($entry['id'])
    );

    //Return updated entry data.
    if($result && isset($result[0])){
        $entry_out = $result[0];
    }

}}
//Determines the new health score after a probe has been executed.
if(! function_exists('health_score_after_probe')){
/**
 * Calculates a site's health score after a probe.
 *
 * @param int         $current       The score before the probe.
 * @param bool        $probe_success Whether the probe gave a usable response.
 * @param int|null    $time          Response time in milliseconds, if known.
 * @param string|null $version       Version the site reported, e.g. "3.2.1".
 * @param bool|null   $ssl_issues    True when the certificate failed verification.
 * @return int The new score, always between -100 and +100.
 */
function health_score_after_probe($current, $probe_success, $time=null, $version=null, $ssl_issues=null)
{

    //Probe failed, costs you 30 points.
    if(!$probe_success) return max($current-30, -100);

    //A good probe gives you 20 points.
    $current += 20;

    //Speed scoring.
    if(intval($time) > 0){

        //Penalty / bonus points.
        if      ($time > 800) $current -= 10; //Bad speed.
        elseif  ($time > 400) $current -= 5;  //Still not good.
        elseif  ($time > 250) $current += 0;  //This is normal.
        elseif  ($time > 120) $current += 5;  //Good speed.
        else                  $current += 10; //Excellent speed.

        //Cap for bad speeds.
        if      ($time > 800) $current = min(40, $current);
        elseif  ($time > 400) $current = min(60, $current);

    }

    //Version check.
    if(!empty($version)){

        $versionParts = explode('.', $version);
        $major = intval($versionParts[0]);
        //A version without a minor part ("3") counts as minor version 0.
        $minor = isset($versionParts[1]) ? intval($versionParts[1]) : 0;

        //Older than 3.x.x?
        //Your score can not go above 30 health.
        if($major < 3){
            $current = min($current, 30);
        }

        //Older than 3.2.x?
        //The major version has to be checked as well, or 4.0 would count as outdated.
        elseif($major === 3 && $minor < 2){
            $current -= 5; //Somewhat outdated.
        }

        #TODO: See if this needs to be more dynamic.
        #TODO: See if this is a proper indicator of health.

    }

    //SSL problems? That's a big deal.
    if($ssl_issues === true){
        $current -= 10;
    }

    //Don't go beyond +100 or -100.
    return max(min(100, $current), -100);

}}
//Maps a health score onto a descriptive name, usable as a CSS class.
if(! function_exists('health_score_to_name')){
function health_score_to_name($score)
{

    //Upper bound (exclusive) of each band, in ascending order.
    $bands = array(
        array(-50, 'very-bad'),
        array(0,   'bad'),
        array(30,  'neutral'),
        array(50,  'ok'),
        array(80,  'good')
    );

    foreach($bands as $band){
        if($score < $band[0]){
            return $band[1];
        }
    }

    //Everything from 80 upwards.
    return 'perfect';

}}

View file

@ -1,21 +1,27 @@
<?php <?php
require_once('datetime.php'); require_once('datetime.php');
require_once('site-health.php');
function run_submit(&$a, $url) { function run_submit($url) {
global $a;
if(! strlen($url)) if(! strlen($url))
return false; return false;
logger('Updating: ' . $url); logger('Updating: ' . $url);
//First run a notice script for the site it is hosted on.
$site_health = notice_site($url, true);
$submit_start = microtime(true);
$nurl = str_replace(array('https:','//www.'), array('http:','//'), $url); $nurl = str_replace(array('https:','//www.'), array('http:','//'), $url);
$profile_exists = false; $profile_exists = false;
$r = q("SELECT * FROM `profile` WHERE ( `homepage` = '%s' OR `nurl` = '%s' ) LIMIT 1", $r = q("SELECT * FROM `profile` WHERE ( `homepage` = '%s' OR `nurl` = '%s' )",
dbesc($url), dbesc($url),
dbesc($nurl) dbesc($nurl)
); );
@ -25,14 +31,63 @@ function run_submit(&$a, $url) {
$profile_id = $r[0]['id']; $profile_id = $r[0]['id'];
} }
//Remove duplicates.
if(count($r) > 1){
for($i=1; $i<count($r); $i++){
logger('Removed duplicate profile '.intval($r[$i]['id']));
q("DELETE FROM `photo` WHERE `profile-id` = %d LIMIT 1",
intval($r[$i]['id'])
);
q("DELETE FROM `profile` WHERE `id` = %d LIMIT 1",
intval($r[$i]['id'])
);
}
}
require_once('Scrape.php'); require_once('Scrape.php');
//Skip the scrape? :D
$noscrape = $site_health && $site_health['no_scrape_url'];
if($noscrape){
//Find out who to look up.
$which = str_replace($site_health['base_url'], '', $url);
$noscrape = preg_match('~/profile/([^/]+)~', $which, $matches) === 1;
//If that did not fail...
if($noscrape){
$parms = noscrape_dfrn($site_health['no_scrape_url'].'/'.$matches[1]);
$noscrape = !!$parms; //If the result was false, do a scrape after all.
}
}
if(!$noscrape){
$parms = scrape_dfrn($url); $parms = scrape_dfrn($url);
}
// logger('dir_submit: ' . print_r($parms,true)); //Empty result is due to an offline site.
if(!count($parms)){
if((! count($parms)) || (validate_dfrn($parms))) { //For large sites this could lower the health too quickly, so don't track health.
//But for sites that are already in bad status. Do a cleanup now.
if($profile_exists && $site_health['health_score'] < $a->config['maintenance']['remove_profile_health_threshold']){
logger('Nuked bad health record.');
nuke_record($url);
}
return false;
}
//We don't care about valid dfrn if the user indicates to be hidden.
elseif($parms['explicit-hide'] && $profile_exists) {
logger('User opted out of the directory.');
nuke_record($url);
}
//This is most likely a problem with the site configuration. Ignore.
elseif(validate_dfrn($parms)) {
return false; return false;
} }
@ -147,10 +202,14 @@ function run_submit(&$a, $url) {
} }
} }
$submit_photo_start = microtime(true);
require_once("Photo.php"); require_once("Photo.php");
$photo_failure = false; $photo_failure = false;
$status = false;
$img_str = fetch_url($photo,true); $img_str = fetch_url($photo,true);
$img = new Photo($img_str); $img = new Photo($img_str);
if($img) { if($img) {
@ -162,12 +221,28 @@ function run_submit(&$a, $url) {
dbesc($a->get_baseurl() . '/photo/' . $profile_id . '.jpg'), dbesc($a->get_baseurl() . '/photo/' . $profile_id . '.jpg'),
intval($profile_id) intval($profile_id)
); );
$status = true;
} }
else{ else{
nuke_record($url); nuke_record($url);
return false;
} }
return true;
$submit_end = microtime(true);
$photo_time = round(($submit_end - $submit_photo_start) * 1000);
$time = round(($submit_end - $submit_start) * 1000);
//Record the scrape speed in a scrapes table.
if($site_health && $status) q(
"INSERT INTO `site-scrape` (`site_health_id`, `dt_performed`, `request_time`, `scrape_time`, `photo_time`, `total_time`)".
"VALUES (%u, NOW(), %u, %u, %u, %u)",
$site_health['id'],
$parms['_timings']['fetch'],
$parms['_timings']['scrape'],
$photo_time,
$time
);
return $status;
} }

314
mod/health.php Normal file
View file

@ -0,0 +1,314 @@
<?php
require_once('include/site-health.php');
/**
 * Entry point of the health module; picks the page to render.
 *
 * /health/<id> shows the details of one site, ?s=<term> runs a search and
 * anything else shows the summary.
 *
 * @param object $a The application object (provides argc and argv).
 * @return string|null The rendered page content.
 */
function health_content(&$a) {

    if($a->argc > 1){
        return health_details($a, $a->argv[1]);
    }

    //The search term is optional and, being request data, might be an array (?s[]=x).
    if(isset($_GET['s']) && is_string($_GET['s']) && $_GET['s']){
        return health_search($a, $_GET['s']);
    }

    return health_summary($a);

}
/**
 * Renders the results of searching tracked sites by (part of) their base URL.
 *
 * Lists at most 100 sites, best health first, each with its number of
 * known profiles.
 *
 * @param object $a      The application object (unused here).
 * @param string $search The search term; at least 3 characters are required.
 * @return string The rendered search page.
 */
function health_search(&$a, $search)
{

    if(strlen($search) < 3){
        $result = 'Please use at least 3 characters in your search';
    }

    else {

        $sites = q("SELECT * FROM `site-health` WHERE `base_url` LIKE '%%%s%%' ORDER BY `health_score` DESC LIMIT 100", dbesc($search));
        if(is_array($sites) && count($sites)){
            $result = '';
            foreach($sites as $site){

                //Get user count.
                $site['users'] = 0;
                $counted = q(
                    "SELECT COUNT(*) as `users` FROM `profile`
                    WHERE `homepage` LIKE '%s%%'",
                    dbesc($site['base_url'])
                );
                if(is_array($counted) && count($counted)){
                    $site['users'] = $counted[0]['users'];
                }

                //The base URL originates from submitted URLs, so escape it for HTML.
                $result .=
                    '<span class="health '.health_score_to_name($site['health_score']).'">&hearts;</span> '.
                    '<a href="/health/'.intval($site['id']).'">' . htmlspecialchars($site['base_url'], ENT_QUOTES, 'UTF-8') . '</a> '.
                    '(' . intval($site['users']) . ')'.
                    "<br />\r\n";

            }

        }

        else {
            $result = 'No results';
        }

    }

    $tpl = file_get_contents('view/health_search.tpl');
    //NOTE(review): $search is request data and is passed on as-is. Confirm that
    //the template or replace_macros() escapes it before it reaches the page.
    return replace_macros($tpl, array(
        '$searched' => $search,
        '$result' => $result
    ));

}
/**
 * Renders the overview of healthy public sites.
 *
 * Lists the sites with open registration that have a health score above 20
 * and at least 10 known profiles, ordered by a score that weighs health
 * against the number of users.
 *
 * @param object $a The application object (unused here).
 * @return string The rendered summary page.
 */
function health_summary(&$a){

    $sites = array();

    //Find the user count per site.
    $r = q("SELECT `homepage` FROM `profile` WHERE 1");
    if(is_array($r)) {
        foreach($r as $rr) {
            $site = parse_site_from_url($rr['homepage']);
            if($site) {
                if(!isset($sites[$site]))
                    $sites[$site] = 0;
                $sites[$site] ++;
            }
        }
    }

    //See if we have a health for them.
    $sites_with_health = array();
    $site_healths = array();

    $r = q("SELECT * FROM `site-health` WHERE `reg_policy`='REGISTER_OPEN'");
    if(is_array($r)) {
        foreach($r as $rr) {
            //A tracked site does not necessarily have any profiles in the directory.
            $users = isset($sites[$rr['base_url']]) ? $sites[$rr['base_url']] : 0;
            $sites_with_health[$rr['base_url']] = (($users / 100) + 10) * intval($rr['health_score']);
            $site_healths[$rr['base_url']] = $rr;
        }
    }

    arsort($sites_with_health);
    $total = 0;
    $public_sites = '';
    foreach($sites_with_health as $k => $v)
    {

        //Skip unhealthy sites.
        //The list is ordered by the weighted score, not by health alone, so a
        //healthy site can still follow an unhealthy one with many users.
        $site = $site_healths[$k];
        if($site['health_score'] <= 20) continue;

        //Skip small sites.
        $users = isset($sites[$k]) ? $sites[$k] : 0;
        if($users < 10) continue;

        //The base URL originates from submitted URLs, so escape it for HTML.
        $public_sites .=
            '<span class="health '.health_score_to_name($site['health_score']).'">&hearts;</span> '.
            '<a href="/health/'.intval($site['id']).'">' . htmlspecialchars($k, ENT_QUOTES, 'UTF-8') . '</a> '.
            '(' . $users . ')'.
            "<br />\r\n";
        $total ++;

    }
    $public_sites .= "<br>Total: $total<br />\r\n";

    //Nothing gathers version statistics yet; pass an empty value for the template slot.
    $versions = '';

    $tpl = file_get_contents('view/health_summary.tpl');
    return replace_macros($tpl, array(
        '$versions' => $versions,
        '$public_sites' => $public_sites
    ));

}
function health_details($a, $id)
{
//The overall health status.
$r = q(
"SELECT * FROM `site-health`
WHERE `id`=%u",
intval($id)
);
if(!count($r)){
$a->error = 404;
return;
}
$site = $r[0];
//Figure out SSL state.
$urlMeta = parse_url($site['base_url']);
if($urlMeta['scheme'] !== 'https'){
$ssl_state = 'No';
}else{
switch ($site['ssl_state']) {
case null: $ssl_state = 'Yes, but not yet verified.'; break;
case '0': $ssl_state = 'Certificate error!'; break;
case '1': $ssl_state = '&radic; Yes, verified.'; break;
}
$ssl_state .= ' <a href="https://www.ssllabs.com/ssltest/analyze.html?d='.$urlMeta['host'].'" target="_blank">Detailed test</a>';
}
//Get user count.
$site['users'] = 0;
$r = q(
"SELECT COUNT(*) as `users` FROM `profile`
WHERE `homepage` LIKE '%s%%'",
dbesc($site['base_url'])
);
if(count($r)){
$site['users'] = $r[0]['users'];
}
//Get avg probe speed.
$r = q(
"SELECT AVG(`request_time`) as `avg_probe_time` FROM `site-probe`
WHERE `site_health_id` = %u",
intval($site['id'])
);
if(count($r)){
$site['avg_probe_time'] = $r[0]['avg_probe_time'];
}
//Get scraping / submit speeds.
$r = q(
"SELECT
AVG(`request_time`) as `avg_profile_time`,
AVG(`scrape_time`) as `avg_scrape_time`,
AVG(`photo_time`) as `avg_photo_time`,
AVG(`total_time`) as `avg_submit_time`
FROM `site-scrape`
WHERE `site_health_id` = %u",
intval($site['id'])
);
if(count($r)){
$site['avg_profile_time'] = $r[0]['avg_profile_time'];
$site['avg_scrape_time'] = $r[0]['avg_scrape_time'];
$site['avg_photo_time'] = $r[0]['avg_photo_time'];
$site['avg_submit_time'] = $r[0]['avg_submit_time'];
}
//Get probe speed data.
$r = q(
"SELECT `request_time`, `dt_performed` FROM `site-probe`
WHERE `site_health_id` = %u",
intval($site['id'])
);
if(count($r)){
//Include graphael line charts.
$a->page['htmlhead'] .= '<script type="text/javascript" src="'.$a->get_baseurl().'/include/raphael.js"></script>'.PHP_EOL;
$a->page['htmlhead'] .= '<script type="text/javascript" src="'.$a->get_baseurl().'/include/g.raphael.js"></script>'.PHP_EOL;
$a->page['htmlhead'] .= '<script type="text/javascript" src="'.$a->get_baseurl().'/include/g.line-min.js"></script>';
$speeds = array();
$times = array();
$mintime = time();
foreach($r as $row){
$speeds[] = $row['request_time'];
$time = strtotime($row['dt_performed']);
$times[] = $time;
if($mintime > $time) $mintime = $time;
}
for($i=0; $i < count($times); $i++){
$times[$i] -= $mintime;
$times[$i] = floor($times[$i] / (24*3600));
}
$a->page['htmlhead'] .=
'<script type="text/javascript">
jQuery(function($){
var r = Raphael("probe-chart")
, x = ['.implode(',', $times).']
, y = ['.implode(',', $speeds).']
;
r.linechart(30, 15, 400, 300, x, [y], {symbol:"circle", axis:"0 0 0 1", shade:true, width:1.5}).hoverColumn(function () {
this.tags = r.set();
for (var i = 0, ii = this.y.length; i < ii; i++) {
this.tags.push(r.popup(this.x, this.y[i], this.values[i]+"ms", "right", 5).insertBefore(this).attr([{ fill: "#eee" }, { fill: this.symbols[i].attr("fill") }]));
}
}, function () {
this.tags && this.tags.remove();
});
});
</script>';
}
//Get scrape speed data.
$r = q(
"SELECT AVG(`total_time`) as `avg_time`, date(`dt_performed`) as `date` FROM `site-scrape`
WHERE `site_health_id` = %u GROUP BY `date`",
intval($site['id'])
// date('Y-m-d H:i:s', time()-(3*24*3600)) //Max 3 days old.
);
if($r && count($r)){
//Include graphael line charts.
$a->page['htmlhead'] .= '<script type="text/javascript" src="'.$a->get_baseurl().'/include/raphael.js"></script>'.PHP_EOL;
$a->page['htmlhead'] .= '<script type="text/javascript" src="'.$a->get_baseurl().'/include/g.raphael.js"></script>'.PHP_EOL;
$a->page['htmlhead'] .= '<script type="text/javascript" src="'.$a->get_baseurl().'/include/g.line-min.js"></script>';
$speeds = array();
$times = array();
$mintime = time();
foreach($r as $row){
$speeds[] = $row['avg_time'];
$time = strtotime($row['date']);
$times[] = $time;
if($mintime > $time) $mintime = $time;
}
for($i=0; $i < count($times); $i++){
$times[$i] -= $mintime;
$times[$i] = floor($times[$i] / (24*3600));
}
$a->page['htmlhead'] .=
'<script type="text/javascript">
jQuery(function($){
var r = Raphael("scrape-chart")
, x = ['.implode(',', $times).']
, y = ['.implode(',', $speeds).']
;
r.linechart(30, 15, 400, 300, x, [y], {shade:true, axis:"0 0 0 1", width:1}).hoverColumn(function () {
this.tags = r.set();
for (var i = 0, ii = this.y.length; i < ii; i++) {
this.tags.push(r.popup(this.x, this.y[i], Math.round(this.values[i])+"ms", "right", 5).insertBefore(this));
}
}, function () {
this.tags && this.tags.remove();
});
});
</script>';
}
$tpl .= file_get_contents('view/health_details.tpl');
return replace_macros($tpl, array(
'$name' => $site['name'],
'$base_url' => $site['base_url'],
'$health_score' => $site['health_score'],
'$health_name' => health_score_to_name($site['health_score']),
'$no_scrape_support' => !empty($site['no_scrape_url']) ? '&radic; Supports noscrape' : '',
'$dt_first_noticed' => $site['dt_first_noticed'],
'$dt_last_seen' => $site['dt_last_seen'],
'$version' => $site['version'],
'$plugins' => $site['plugins'],
'$reg_policy' => $site['reg_policy'],
'$info' => $site['info'],
'$admin_name' => $site['admin_name'],
'$admin_profile' => $site['admin_profile'],
'$users' => $site['users'],
'$ssl_state' => $ssl_state,
'$avg_probe_time' => round($site['avg_probe_time']),
'$avg_profile_time' => round($site['avg_profile_time']),
'$avg_scrape_time' => round($site['avg_scrape_time']),
'$avg_photo_time' => round($site['avg_photo_time']),
'$avg_submit_time' => round($site['avg_submit_time'])
));
}

View file

@ -5,30 +5,7 @@ require_once('include/submit.php');
// Module entry point for submissions: reads the `url` query parameter,
// trims it, passes it through notags(), hex-decodes it, hands the result
// to run_submit() and then ends the request with exit.
function submit_content(&$a) { function submit_content(&$a) {
$url = hex2bin(notags(trim($_GET['url']))); $url = hex2bin(notags(trim($_GET['url'])));
run_submit($a, $url); run_submit($url);
exit; exit;
} }
/**
 * Remove the directory entries stored for a profile URL.
 *
 * Looks up every `profile` row whose `homepage` equals the given URL or
 * whose `nurl` equals its normalised form, then deletes the matching
 * `photo` row and the `profile` row itself.
 *
 * @param string $url Profile URL whose records should be removed.
 * @return void
 */
function nuke_record($url) {
	// Normalised form used for the `nurl` comparison: scheme forced to
	// "http:" and a leading "www." host prefix dropped.
	$nurl = str_replace(array('https:','//www.'), array('http:','//'), $url);

	$r = q("SELECT `id` FROM `profile` WHERE ( `homepage` = '%s' OR `nurl` = '%s' ) ",
		dbesc($url),
		dbesc($nurl)
	);

	// NOTE(review): q() presumably returns a non-array (e.g. false) when the
	// query fails - confirm. count() on such a value warns on PHP 7.2+ and
	// throws a TypeError on PHP 8, so bail out before iterating.
	if(! is_array($r)) {
		return;
	}

	foreach($r as $rr) {
		$id = intval($rr['id']);

		q("DELETE FROM `photo` WHERE `profile-id` = %d LIMIT 1",
			$id
		);
		q("DELETE FROM `profile` WHERE `id` = %d LIMIT 1",
			$id
		);
	}
}

24
mod/versions.php Normal file
View file

@ -0,0 +1,24 @@
<?php
/**
 * Build the "Used versions" page.
 *
 * Lists every distinct non-NULL `version` found in `site-health`, highest
 * version string first, followed by the number of sites reporting it.
 * Versions reported by ten or more sites are wrapped in <b>.
 *
 * @param object $a Application object (unused here; kept for the module
 *                  *_content() calling convention).
 * @return string The view/versions.tpl template with $versions filled in.
 */
function versions_content(&$a){

	//Grab a version list.
	$r = q("SELECT count(*) as `count`, `version` FROM `site-health` WHERE `version` IS NOT NULL GROUP BY `version` ORDER BY `version` DESC");

	$versions = '';

	// NOTE(review): q() presumably returns a non-array (e.g. false) on a
	// failed query - confirm. is_array() keeps count()/foreach from warning
	// (PHP 7.2+) or throwing a TypeError (PHP 8) in that case.
	if(is_array($r)){
		foreach($r as $version){
			$count = intval($version['count']);
			$popular = ($count >= 10);

			// The version text is stored data that presumably originates from
			// remote sites, so escape it before placing it into HTML.
			$label = htmlspecialchars($version['version'], ENT_QUOTES, 'UTF-8');

			$versions .=
				($popular ? '<b>' : '').
				$label . ' ('.$count.')<br>'."\r\n".
				($popular ? '</b>' : '');
		}
	}

	// Plain assignment: the original appended to an undefined $tpl.
	$tpl = file_get_contents('view/versions.tpl');

	return replace_macros($tpl, array(
		'$versions' => $versions
	));
}

33
view/health_details.tpl Normal file
View file

@ -0,0 +1,33 @@
<h1>
<span class="health $health_name">&hearts;</span> $name<br>
<sup><a href="$base_url">$base_url</a></sup>
</h1>
<p><a href="/health">&laquo; Back to index</a></p>
<div class="meta">
<h3>General information</h3>
<div class="users">$users users</div>
<div class="version">Friendica $version</div>
<div class="first_noticed">First noticed: $dt_first_noticed</div>
<div class="last_seen">Last update: $dt_last_seen</div>
</div>
<div class="security">
<h3>Security</h3>
<div class="ssl_state">HTTPS: $ssl_state</div>
</div>
<div class="performance">
<h3>Performance information</h3>
<div style="float:left;margin-right:30px;padding-top:20px;">
<div class="probe_speed">Probe speed: $avg_probe_timems</div>
<div class="photo_speed">Photo speed: $avg_photo_timems</div>
<div class="profile_speed">Profile speed: $avg_profile_timems</div>
<div class="scrape_speed">Scrape speed: $avg_scrape_timems</div>
<div class="submit_speed">Submit speed: $avg_submit_timems</div>
<span class="health perfect">$no_scrape_support</span>
</div>
<div id="probe-chart" class="speed-chart">Probe speed</div>
<div id="scrape-chart" class="speed-chart">Submit speed</div>
</div>

10
view/health_search.tpl Normal file
View file

@ -0,0 +1,10 @@
<h1>Search your site</h1>
<form method="GET">
<label>Your site URL:</label>
<input type="text" name="s" placeholder="example.com" value="$searched" />
<input type="submit" value="Search" />
</form>
<p><a href="/health">&laquo; Back to index</a></p>
<h1>Search results</h1>
<div class="result-sites">$result</div>

14
view/health_summary.tpl Normal file
View file

@ -0,0 +1,14 @@
<h1>Search your site</h1>
<form method="GET">
<label>Your site URL:</label>
<input type="text" name="s" placeholder="example.com" />
<input type="submit" value="Search" />
</form>
<h1>Healthy public sites</h1>
<p>
These are sites with their registration set to an open policy and a decent health score.<br>
Not on the list: try searching.<br>
More info: ask <a href="https://fc.oscp.info/profile/beanow">Beanow</a>.
</p>
<div class="public-sites">$public_sites</div>

View file

@ -1587,3 +1587,17 @@ input#dfrn-url {
margin-left: 20px; margin-left: 20px;
} }
/* Health indicator glyph: slightly enlarged and aligned to the bottom of the line. */
.health{font-size:120%; vertical-align:bottom;}
/* One colour per health level, running from red (very-bad) to green (perfect).
   NOTE(review): the second class is presumably the name produced by
   health_score_to_name() - confirm the names match. */
.health.very-bad{ color:#f99; }
.health.bad{ color:#f1ba7a; }
.health.neutral{ color:#e6e782; }
.health.ok{ color:#bef273; }
.health.good{ color:#7cf273; }
.health.perfect{ color:#33ff80; }
/* Fixed-size, left-floated box with centred text that holds one speed chart. */
.speed-chart{
float:left;
width:480px;
height:320px;
text-align:center;
}

2
view/versions.tpl Normal file
View file

@ -0,0 +1,2 @@
<h1>Used versions</h1>
<div class="version-list">$versions</div>