Improve HTML::toPlaintext

- Ignore empty trimmed text nodes
- Ignore anchor links (URLs starting with "#")
- Ignore blank tags and avoid adding a doctype to transitional DOM objects
This commit is contained in:
Hypolite Petovan 2019-08-04 10:11:59 -04:00
parent d5fc4a268d
commit a0f77e1800

View file

@ -56,6 +56,7 @@ class HTML
$xpath = new DOMXPath($doc); $xpath = new DOMXPath($doc);
/** @var \DOMNode[] $list */
$list = $xpath->query("//" . $tag); $list = $xpath->query("//" . $tag);
foreach ($list as $node) { foreach ($list as $node) {
$attr = []; $attr = [];
@ -98,9 +99,12 @@ class HTML
$node->parentNode->insertBefore($StartCode, $node); $node->parentNode->insertBefore($StartCode, $node);
if ($node->hasChildNodes()) { if ($node->hasChildNodes()) {
/** @var \DOMNode $child */
foreach ($node->childNodes as $child) { foreach ($node->childNodes as $child) {
$newNode = $child->cloneNode(true); if (trim($child->nodeValue)) {
$node->parentNode->insertBefore($newNode, $node); $newNode = $child->cloneNode(true);
$node->parentNode->insertBefore($newNode, $node);
}
} }
} }
@ -559,6 +563,8 @@ class HTML
$ignore = false; $ignore = false;
} }
$ignore = $ignore || strpos($treffer[1], '#') === 0;
if (!$ignore) { if (!$ignore) {
$urls[$treffer[1]] = $treffer[1]; $urls[$treffer[1]] = $treffer[1];
} }
@ -582,7 +588,7 @@ class HTML
$message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8"); $message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8");
@$doc->loadHTML($message); @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
$message = $doc->saveHTML(); $message = $doc->saveHTML();
// Remove eventual UTF-8 BOM // Remove eventual UTF-8 BOM
@ -591,7 +597,7 @@ class HTML
// Collecting all links // Collecting all links
$urls = self::collectURLs($message); $urls = self::collectURLs($message);
@$doc->loadHTML($message); @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
self::tagToBBCode($doc, 'html', [], '', ''); self::tagToBBCode($doc, 'html', [], '', '');
self::tagToBBCode($doc, 'body', [], '', ''); self::tagToBBCode($doc, 'body', [], '', '');