Purify HTML before trying to parse wild URLs. This way it should at least parse.

This commit is contained in:
Friendika 2011-07-04 23:02:04 -07:00
parent 92831c9416
commit 24d41e2c6e
1 changed file with 13 additions and 4 deletions

View File

@ -1,6 +1,7 @@
<?php <?php
require_once('library/HTML5/Parser.php'); require_once('library/HTML5/Parser.php');
require_once('library/HTMLPurifier.auto.php');
function parse_url_content(&$a) { function parse_url_content(&$a) {
@ -31,16 +32,25 @@ function parse_url_content(&$a) {
killme(); killme();
} }
logger('parse_url: data: ' . $s, LOGGER_DATA);
if(! $s) { if(! $s) {
echo sprintf($template,$url,$url,''); echo sprintf($template,$url,$url,'');
killme(); killme();
} }
$config = HTMLPurifier_Config::createDefault();
$config->set('Cache.DefinitionImpl', null);
$purifier = new HTMLPurifier($config);
$s = $purifier->purify($s);
$dom = @HTML5_Parser::parse($s); $dom = @HTML5_Parser::parse($s);
if(! $dom) if(! $dom) {
return $ret; echo sprintf($template,$url,$url,'');
killme();
}
$items = $dom->getElementsByTagName('title'); $items = $dom->getElementsByTagName('title');
@ -51,7 +61,6 @@ function parse_url_content(&$a) {
} }
} }
$divs = $dom->getElementsByTagName('div'); $divs = $dom->getElementsByTagName('div');
if($divs) { if($divs) {
foreach($divs as $div) { foreach($divs as $div) {
@ -94,6 +103,6 @@ function parse_url_content(&$a) {
$text = '<br />' . $text; $text = '<br />' . $text;
} }
echo sprintf($template,$url,$title,$text); echo sprintf($template,$url,($title) ? $title : $url,$text);
killme(); killme();
} }