Do a slightly better job of finding relevant content when scraping submitted links

This commit is contained in:
Friendika 2010-12-19 19:04:37 -08:00
parent 24a9a41f96
commit 2d9718fee9

View file

@ -6,6 +6,8 @@ function parse_url_content(&$a) {
$url = trim($_GET['url']); $url = trim($_GET['url']);
$text = null;
$template = "<a href=\"%s\" >%s</a>%s"; $template = "<a href=\"%s\" >%s</a>%s";
if($url) if($url)
@ -34,6 +36,28 @@ function parse_url_content(&$a) {
} }
} }
$divs = $dom->getElementsByTagName('div');
if($divs) {
foreach($divs as $div) {
$class = $div->getAttribute('class');
if($class && stristr($class,'article')) {
$items = $div->getElementsByTagName('p');
if($items) {
foreach($items as $item) {
$text = $item->textContent;
$text = strip_tags($text);
if(strlen($text) < 100)
continue;
$text = substr($text,0,250) . '...' ;
break;
}
}
}
}
}
if(! $text) {
$items = $dom->getElementsByTagName('p'); $items = $dom->getElementsByTagName('p');
if($items) { if($items) {
foreach($items as $item) { foreach($items as $item) {
@ -45,6 +69,7 @@ function parse_url_content(&$a) {
break; break;
} }
} }
}
if(strlen($text)) { if(strlen($text)) {
$text = '<br />' . $text; $text = '<br />' . $text;