From a0f77e180024ad8e0275034416088a7513dca067 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 4 Aug 2019 10:11:59 -0400 Subject: [PATCH] Improve HTML::toPlaintext - Ignore empty trimmed text nodes - Ignore anchor links - Ignore blank tags and avoid adding a doctype to transitional DOM objects --- src/Content/Text/HTML.php | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Content/Text/HTML.php b/src/Content/Text/HTML.php index 4be217b3e..b9132c5d4 100644 --- a/src/Content/Text/HTML.php +++ b/src/Content/Text/HTML.php @@ -56,6 +56,7 @@ class HTML $xpath = new DOMXPath($doc); + /** @var \DOMNode[] $list */ $list = $xpath->query("//" . $tag); foreach ($list as $node) { $attr = []; @@ -98,9 +99,12 @@ class HTML $node->parentNode->insertBefore($StartCode, $node); if ($node->hasChildNodes()) { + /** @var \DOMNode $child */ foreach ($node->childNodes as $child) { - $newNode = $child->cloneNode(true); - $node->parentNode->insertBefore($newNode, $node); + if (trim($child->nodeValue)) { + $newNode = $child->cloneNode(true); + $node->parentNode->insertBefore($newNode, $node); + } } } @@ -559,6 +563,8 @@ class HTML $ignore = false; } + $ignore = $ignore || strpos($treffer[1], '#') === 0; + if (!$ignore) { $urls[$treffer[1]] = $treffer[1]; } @@ -582,7 +588,7 @@ class HTML $message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8"); - @$doc->loadHTML($message); + @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS); $message = $doc->saveHTML(); // Remove eventual UTF-8 BOM @@ -591,7 +597,7 @@ class HTML // Collecting all links $urls = self::collectURLs($message); - @$doc->loadHTML($message); + @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS); self::tagToBBCode($doc, 'html', [], '', ''); self::tagToBBCode($doc, 'body', [], '', '');