From 93823ecef572e09db9a23aef8a002c43d716c8a4 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 14 Mar 2021 13:40:32 -0400 Subject: [PATCH] Move HTML purification to own method in Content\Text\HTML --- src/Content/Text/BBCode.php | 32 ++++++-------------- src/Content/Text/HTML.php | 59 +++++++++++++++++++++++++++++++++++++ src/Module/Debug/Babel.php | 4 +-- 3 files changed, 69 insertions(+), 26 deletions(-) diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 1a93008810..c6095cd419 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -29,12 +29,10 @@ use Friendica\Content\Item; use Friendica\Content\OEmbed; use Friendica\Content\PageInfo; use Friendica\Content\Smilies; -use Friendica\Content\Text\HTMLPurifier_URIScheme_cid; use Friendica\Core\Hook; use Friendica\Core\Logger; use Friendica\Core\Protocol; use Friendica\Core\Renderer; -use Friendica\Core\System; use Friendica\DI; use Friendica\Model\Contact; use Friendica\Model\Event; @@ -1877,28 +1875,16 @@ class BBCode $text ); - \HTMLPurifier_URISchemeRegistry::instance()->register('cid', new HTMLPurifier_URIScheme_cid()); + // Default iframe allowed domains/path + $allowedIframeDomains = [ + DI::baseUrl()->getHostname() + . (DI::baseUrl()->getUrlPath() ? '/' . DI::baseUrl()->getUrlPath() : '') + . '/oembed/', # The path part has to change with the source in Content\Oembed::iframe + 'www.youtube.com/embed/', + 'player.vimeo.com/video/', + ]; - $config = \HTMLPurifier_HTML5Config::createDefault(); - $config->set('HTML.Doctype', 'HTML5'); - $config->set('HTML.SafeIframe', true); - $config->set('URI.SafeIframeRegexp', '%^(?: - https://www.youtube.com/embed/ - | - https://player.vimeo.com/video/ - | - ' . DI::baseUrl() . 
'/oembed/ # Has to change with the source in Content\Oembed::iframe - )%xi'); - $config->set('Attr.AllowedRel', [ - 'noreferrer' => true, - 'noopener' => true, - ]); - $config->set('Attr.AllowedFrameTargets', [ - '_blank' => true, - ]); - - $HTMLPurifier = new \HTMLPurifier($config); - $text = $HTMLPurifier->purify($text); + $text = HTML::purify($text, $allowedIframeDomains); return $text; } diff --git a/src/Content/Text/HTML.php b/src/Content/Text/HTML.php index 975be8b1ff..c77b84db8a 100644 --- a/src/Content/Text/HTML.php +++ b/src/Content/Text/HTML.php @@ -961,4 +961,63 @@ class HTML { return str_replace('&amp;', '&', $s); } + + /** + * Clean an HTML text for potentially harmful code + * + * @param string $text + * @param array $allowedIframeDomains List of allowed iframe source domains without the scheme + * @return string + */ + public static function purify(string $text, array $allowedIframeDomains = []): string + { + // Allows cid: URL scheme + \HTMLPurifier_URISchemeRegistry::instance()->register('cid', new HTMLPurifier_URIScheme_cid()); + + $config = \HTMLPurifier_HTML5Config::createDefault(); + $config->set('HTML.Doctype', 'HTML5'); + + // Used to remove iframe with src attribute filtered out + $config->set('AutoFormat.RemoveEmpty', true); + + $config->set('HTML.SafeIframe', true); + + array_walk($allowedIframeDomains, function (&$domain) { + // Allow the domain and all its eventual sub-domains + $domain = '(?:(?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)*' . preg_quote(trim($domain, '/'), '%'); + }); + + $config->set('URI.SafeIframeRegexp', + '%^https://(?: + ' . implode('|', $allowedIframeDomains) . 
' + ) + (?:/|$) # Prevents bogus domains like youtube.com.fake.tld + %xi' + ); + + $config->set('Attr.AllowedRel', [ + 'noreferrer' => true, + 'noopener' => true, + ]); + $config->set('Attr.AllowedFrameTargets', [ + '_blank' => true, + ]); + + /* Uncomment to debug HTMLPurifier behavior + $config->set('Core.CollectErrors', true); + $config->set('Core.MaintainLineNumbers', true); + */ + + $HTMLPurifier = new \HTMLPurifier($config); + + $text = $HTMLPurifier->purify($text); + + /** @var \HTMLPurifier_ErrorCollector $errorCollector */ + /* Uncomment to debug HTML Purifier behavior + $errorCollector = $HTMLPurifier->context->get('ErrorCollector'); + var_dump($errorCollector->getRaw()); + */ + + return $text; + } } diff --git a/src/Module/Debug/Babel.php b/src/Module/Debug/Babel.php index 322b742fbe..52f6614454 100644 --- a/src/Module/Debug/Babel.php +++ b/src/Module/Debug/Babel.php @@ -180,9 +180,7 @@ class Babel extends BaseModule 'content' => $html ]; - $config = \HTMLPurifier_Config::createDefault(); - $HTMLPurifier = new \HTMLPurifier($config); - $purified = $HTMLPurifier->purify($html); + $purified = Text\HTML::purify($html); $results[] = [ 'title' => DI::l10n()->t('HTML Purified (raw)'),