Do a slightly better job of finding relevant content when scraping submitted links

2010-12-19 19:04:37 -08:00 · 2010-12-19 19:04:37 -08:00 · 2d9718fee9
parent 24a9a41f96
commit 2d9718fee9
1 changed file with 34 additions and 9 deletions
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -6,6 +6,8 @@ function parse_url_content(&$a) {

 	$url = trim($_GET['url']);

+	$text = null;
+
 	$template = "<a href=\"%s\" >%s</a>%s";

 	if($url) 
@@ -34,6 +36,28 @@ function parse_url_content(&$a) {
 		}
 	}

+
+	$divs = $dom->getElementsByTagName('div');
+	if($divs) {
+		foreach($divs as $div) {
+			$class = $div->getAttribute('class');
+			if($class && stristr($class,'article')) {
+				$items = $div->getElementsByTagName('p');
+				if($items) {
+					foreach($items as $item) {
+						$text = $item->textContent;
+						$text = strip_tags($text);
+						if(strlen($text) < 100)
+							continue;
+						$text = substr($text,0,250) . '...' ;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	if(! $text) {
 		$items = $dom->getElementsByTagName('p');
 		if($items) {
 			foreach($items as $item) {
@@ -45,6 +69,7 @@ function parse_url_content(&$a) {
 				break;
 			}
 		}
+	}

 	if(strlen($text)) {
 		$text = '<br />' . $text;