Improve HTML::toPlaintext
- Ignore text nodes that are empty after trimming - Ignore anchor links - Ignore blank tags and avoid adding a doctype to transitional DOM objects
This commit is contained in:
parent
d5fc4a268d
commit
a0f77e1800
1 changed files with 10 additions and 4 deletions
|
@ -56,6 +56,7 @@ class HTML
|
||||||
|
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
||||||
|
/** @var \DOMNode[] $list */
|
||||||
$list = $xpath->query("//" . $tag);
|
$list = $xpath->query("//" . $tag);
|
||||||
foreach ($list as $node) {
|
foreach ($list as $node) {
|
||||||
$attr = [];
|
$attr = [];
|
||||||
|
@ -98,11 +99,14 @@ class HTML
|
||||||
$node->parentNode->insertBefore($StartCode, $node);
|
$node->parentNode->insertBefore($StartCode, $node);
|
||||||
|
|
||||||
if ($node->hasChildNodes()) {
|
if ($node->hasChildNodes()) {
|
||||||
|
/** @var \DOMNode $child */
|
||||||
foreach ($node->childNodes as $child) {
|
foreach ($node->childNodes as $child) {
|
||||||
|
if (trim($child->nodeValue)) {
|
||||||
$newNode = $child->cloneNode(true);
|
$newNode = $child->cloneNode(true);
|
||||||
$node->parentNode->insertBefore($newNode, $node);
|
$node->parentNode->insertBefore($newNode, $node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$node->parentNode->insertBefore($EndCode, $node);
|
$node->parentNode->insertBefore($EndCode, $node);
|
||||||
$node->parentNode->removeChild($node);
|
$node->parentNode->removeChild($node);
|
||||||
|
@ -559,6 +563,8 @@ class HTML
|
||||||
$ignore = false;
|
$ignore = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$ignore = $ignore || strpos($treffer[1], '#') === 0;
|
||||||
|
|
||||||
if (!$ignore) {
|
if (!$ignore) {
|
||||||
$urls[$treffer[1]] = $treffer[1];
|
$urls[$treffer[1]] = $treffer[1];
|
||||||
}
|
}
|
||||||
|
@ -582,7 +588,7 @@ class HTML
|
||||||
|
|
||||||
$message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8");
|
$message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8");
|
||||||
|
|
||||||
@$doc->loadHTML($message);
|
@$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
|
||||||
|
|
||||||
$message = $doc->saveHTML();
|
$message = $doc->saveHTML();
|
||||||
// Remove eventual UTF-8 BOM
|
// Remove eventual UTF-8 BOM
|
||||||
|
@ -591,7 +597,7 @@ class HTML
|
||||||
// Collecting all links
|
// Collecting all links
|
||||||
$urls = self::collectURLs($message);
|
$urls = self::collectURLs($message);
|
||||||
|
|
||||||
@$doc->loadHTML($message);
|
@$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
|
||||||
|
|
||||||
self::tagToBBCode($doc, 'html', [], '', '');
|
self::tagToBBCode($doc, 'html', [], '', '');
|
||||||
self::tagToBBCode($doc, 'body', [], '', '');
|
self::tagToBBCode($doc, 'body', [], '', '');
|
||||||
|
|
Loading…
Reference in a new issue