Bound the time and work spent on each URL during batch import.

* boot.php: fetch_url() accepts an optional $timeout (default 20) and applies it as CURLOPT_TIMEOUT, clamped to at least 1 second.
* include/Scrape.php: scrape_dfrn() accepts an optional $max_nodes (default 5000, raised to at least 100) and leaves each of its three DOM loops once that many nodes have been visited. It fetches the page with a 10 second timeout, passed as fetch_url()'s third argument with $binary kept at false.
* mod/import.php: $perBatch drops from 10 to 2, the per-URL set_time_limit() drops from 20 to 15, and the "items left" notice also reports the success / total / error counters.

diff --git a/boot.php b/boot.php index de18161e..786f3045 100755 --- a/boot.php +++ b/boot.php @@ -227,11 +227,12 @@ function t($s) { if(! function_exists('fetch_url')) { -function fetch_url($url,$binary = false) { +function fetch_url($url,$binary = false, $timeout=20) { $ch = curl_init($url); if(! $ch) return false; - curl_setopt($ch, CURLOPT_HEADER, 0); + curl_setopt($ch, CURLOPT_HEADER, 0); + curl_setopt($ch, CURLOPT_TIMEOUT, max(intval($timeout), 1)); //Minimum of 1 second timeout. curl_setopt($ch, CURLOPT_FOLLOWLOCATION,true); curl_setopt($ch, CURLOPT_MAXREDIRS,8); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); diff --git a/include/Scrape.php b/include/Scrape.php index 9cbf1ebc..1edd0672 100755 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -12,10 +12,13 @@ function attribute_contains($attr,$s) { if(! function_exists('scrape_dfrn')) { -function scrape_dfrn($url) { - +function scrape_dfrn($url, $max_nodes=5000) { + + $minNodes = 100; //Lets do at least 100 nodes per type. + $timeout = 10; //Timeout will affect batch processing. + $ret = array(); - $s = fetch_url($url); + $s = fetch_url($url, false, $timeout); if(! 
$s) return $ret; @@ -29,7 +32,7 @@ function scrape_dfrn($url) { $items = $dom->getElementsByTagName('meta'); // get DFRN link elements - + $nodes_left = max(intval($max_nodes), $minNodes); foreach($items as $item) { $x = $item->getAttribute('name'); if($x == 'dfrn-global-visibility') { @@ -47,20 +50,26 @@ function scrape_dfrn($url) { if(strlen($z)) $ret['tags'] = $z; } + $nodes_left--; + if($nodes_left <= 0) break; } $items = $dom->getElementsByTagName('link'); // get DFRN link elements - + + $nodes_left = max(intval($max_nodes), $minNodes); foreach($items as $item) { $x = $item->getAttribute('rel'); if(substr($x,0,5) == "dfrn-") $ret[$x] = $item->getAttribute('href'); + $nodes_left--; + if($nodes_left <= 0) break; } // Pull out hCard profile elements - + + $nodes_left = max(intval($max_nodes), $minNodes); $items = $dom->getElementsByTagName('*'); foreach($items as $item) { if(attribute_contains($item->getAttribute('class'), 'vcard')) { @@ -89,6 +98,8 @@ function scrape_dfrn($url) { } if(attribute_contains($item->getAttribute('class'),'marital-text')) $ret['marital'] = $item->textContent; + $nodes_left--; + if($nodes_left <= 0) break; } return $ret; }} diff --git a/mod/import.php b/mod/import.php index 63451c82..b5e09c0f 100644 --- a/mod/import.php +++ b/mod/import.php @@ -33,7 +33,7 @@ function import_post(&$a) //Per batch setting. $perPage = 200; - $perBatch = 10; + $perBatch = 2; if($batch){ @@ -62,7 +62,7 @@ function import_post(&$a) for($i=0; $i<$perBatch; $i++){ if($url = array_shift($list)){ - set_time_limit(20); + set_time_limit(15); $_SESSION['import_total']++; $_SESSION['import_failed']++; try{ @@ -77,16 +77,16 @@ function import_post(&$a) $left = count($list); + $s = $_SESSION['import_success']; + $total = $_SESSION['import_total']; + $errors = $_SESSION['import_failed']; if($left > 0){ - notice("$left items left in batch."); + notice("$left items left in batch.
Stats: $s / $total success, $errors errors."); file_put_contents($file, implode("\r\n", $list)); $fid = uniqid('autosubmit_'); echo '
'. ''; } else { - $s = $_SESSION['import_success']; - $total = $_SESSION['import_total']; - $errors = $_SESSION['import_failed']; notice("Completed batch! $s / $total success. $errors errors."); unlink($file); unset($_SESSION['import_progress']);