Do a slightly better job of finding relevant content when scraping submitted links
This commit is contained in:
parent
24a9a41f96
commit
2d9718fee9
1 changed file with 34 additions and 9 deletions
|
@ -6,6 +6,8 @@ function parse_url_content(&$a) {
|
|||
|
||||
$url = trim($_GET['url']);
|
||||
|
||||
$text = null;
|
||||
|
||||
$template = "<a href=\"%s\" >%s</a>%s";
|
||||
|
||||
if($url)
|
||||
|
@ -34,6 +36,28 @@ function parse_url_content(&$a) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
$divs = $dom->getElementsByTagName('div');
|
||||
if($divs) {
|
||||
foreach($divs as $div) {
|
||||
$class = $div->getAttribute('class');
|
||||
if($class && stristr($class,'article')) {
|
||||
$items = $div->getElementsByTagName('p');
|
||||
if($items) {
|
||||
foreach($items as $item) {
|
||||
$text = $item->textContent;
|
||||
$text = strip_tags($text);
|
||||
if(strlen($text) < 100)
|
||||
continue;
|
||||
$text = substr($text,0,250) . '...' ;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(! $text) {
|
||||
$items = $dom->getElementsByTagName('p');
|
||||
if($items) {
|
||||
foreach($items as $item) {
|
||||
|
@ -45,6 +69,7 @@ function parse_url_content(&$a) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(strlen($text)) {
|
||||
$text = '<br />' . $text;
|
||||
|
|
Loading…
Reference in a new issue