Do a slightly better job of finding relevant content when scraping submitted links
This commit is contained in:
parent
24a9a41f96
commit
2d9718fee9
1 changed file with 34 additions and 9 deletions
|
@@ -6,6 +6,8 @@ function parse_url_content(&$a) {
|
||||||
|
|
||||||
$url = trim($_GET['url']);
|
$url = trim($_GET['url']);
|
||||||
|
|
||||||
|
$text = null;
|
||||||
|
|
||||||
$template = "<a href=\"%s\" >%s</a>%s";
|
$template = "<a href=\"%s\" >%s</a>%s";
|
||||||
|
|
||||||
if($url)
|
if($url)
|
||||||
|
@@ -34,6 +36,28 @@ function parse_url_content(&$a) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
$divs = $dom->getElementsByTagName('div');
|
||||||
|
if($divs) {
|
||||||
|
foreach($divs as $div) {
|
||||||
|
$class = $div->getAttribute('class');
|
||||||
|
if($class && stristr($class,'article')) {
|
||||||
|
$items = $div->getElementsByTagName('p');
|
||||||
|
if($items) {
|
||||||
|
foreach($items as $item) {
|
||||||
|
$text = $item->textContent;
|
||||||
|
$text = strip_tags($text);
|
||||||
|
if(strlen($text) < 100)
|
||||||
|
continue;
|
||||||
|
$text = substr($text,0,250) . '...' ;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(! $text) {
|
||||||
$items = $dom->getElementsByTagName('p');
|
$items = $dom->getElementsByTagName('p');
|
||||||
if($items) {
|
if($items) {
|
||||||
foreach($items as $item) {
|
foreach($items as $item) {
|
||||||
|
@@ -45,6 +69,7 @@ function parse_url_content(&$a) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(strlen($text)) {
|
if(strlen($text)) {
|
||||||
$text = '<br />' . $text;
|
$text = '<br />' . $text;
|
||||||
|
|
Loading…
Reference in a new issue