Do a slightly better job of finding relevant content when scraping submitted links

This commit is contained in:
Friendika 2010-12-19 19:04:37 -08:00
parent 24a9a41f96
commit 2d9718fee9
1 changed file with 34 additions and 9 deletions

View File

@ -6,6 +6,8 @@ function parse_url_content(&$a) {
$url = trim($_GET['url']); $url = trim($_GET['url']);
$text = null;
$template = "<a href=\"%s\" >%s</a>%s"; $template = "<a href=\"%s\" >%s</a>%s";
if($url) if($url)
@ -34,15 +36,38 @@ function parse_url_content(&$a) {
} }
} }
$items = $dom->getElementsByTagName('p');
if($items) { $divs = $dom->getElementsByTagName('div');
foreach($items as $item) { if($divs) {
$text = $item->textContent; foreach($divs as $div) {
$text = strip_tags($text); $class = $div->getAttribute('class');
if(strlen($text) < 100) if($class && stristr($class,'article')) {
continue; $items = $div->getElementsByTagName('p');
$text = substr($text,0,250) . '...' ; if($items) {
break; foreach($items as $item) {
$text = $item->textContent;
$text = strip_tags($text);
if(strlen($text) < 100)
continue;
$text = substr($text,0,250) . '...' ;
break;
}
}
}
}
}
if(! $text) {
$items = $dom->getElementsByTagName('p');
if($items) {
foreach($items as $item) {
$text = $item->textContent;
$text = strip_tags($text);
if(strlen($text) < 100)
continue;
$text = substr($text,0,250) . '...' ;
break;
}
} }
} }