From c398974918347e3491302d63177c787675f53e74 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Mon, 8 Dec 2014 11:02:03 +0100 Subject: [PATCH 1/3] Logging to analyse the stale gprobe processes --- include/gprobe.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/gprobe.php b/include/gprobe.php index 0cf32e95fe..36650eb9ae 100644 --- a/include/gprobe.php +++ b/include/gprobe.php @@ -10,7 +10,7 @@ function gprobe_run(&$argv, &$argc){ if(is_null($a)) { $a = new App; } - + if(is_null($db)) { @include(".htconfig.php"); require_once("include/dba.php"); @@ -37,6 +37,8 @@ function gprobe_run(&$argv, &$argc){ dbesc(normalise_link($url)) ); + logger("gprobe start for ".normalise_link($url), LOGGER_DEBUG); + if(! count($r)) { $arr = probe_url($url); @@ -55,7 +57,8 @@ function gprobe_run(&$argv, &$argc){ } if(count($r)) poco_load(0,0,$r[0]['id'], str_replace('/profile/','/poco/',$r[0]['url'])); - + + logger("gprobe end for ".normalise_link($url), LOGGER_DEBUG); return; } From 620ee6be0bdb4e706abfa2172d3dfb3e2d035e25 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Mon, 8 Dec 2014 14:26:44 +0100 Subject: [PATCH 2/3] Cleanup the HTML before parsing it. --- include/Scrape.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/Scrape.php b/include/Scrape.php index 99784af336..ad9e030a39 100644 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -263,6 +263,11 @@ function scrape_feed($url) { } try { + // Cleanup invalid HTML + $doc = new DOMDocument(); + @$doc->loadHTML($s); + $s = $doc->saveHTML(); + $dom = HTML5_Parser::parse($s); } catch (DOMException $e) { logger('scrape_feed: parse error: ' . $e); From 04106ff6f52b03d0d5b009f07a7b6ab11107f0c0 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Mon, 8 Dec 2014 22:37:49 +0100 Subject: [PATCH 3/3] Moving the clean up code to a central place. --- include/Scrape.php | 5 ----- library/HTML5/Parser.php | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/Scrape.php b/include/Scrape.php index ad9e030a39..99784af336 100644 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -263,11 +263,6 @@ function scrape_feed($url) { } try { - // Cleanup invalid HTML - $doc = new DOMDocument(); - @$doc->loadHTML($s); - $s = $doc->saveHTML(); - $dom = HTML5_Parser::parse($s); } catch (DOMException $e) { logger('scrape_feed: parse error: ' . $e); diff --git a/library/HTML5/Parser.php b/library/HTML5/Parser.php index 5f9ca560e5..c7faf875ad 100644 --- a/library/HTML5/Parser.php +++ b/library/HTML5/Parser.php @@ -17,6 +17,12 @@ class HTML5_Parser * @return Parsed HTML as DOMDocument */ static public function parse($text, $builder = null) { + + // Cleanup invalid HTML + $doc = new DOMDocument(); + @$doc->loadHTML($text); + $text = $doc->saveHTML(); + $tokenizer = new HTML5_Tokenizer($text, $builder); $tokenizer->parse(); return $tokenizer->save();