diff --git a/composer.json b/composer.json index f2aeaa8eb2..8b4a630d3e 100644 --- a/composer.json +++ b/composer.json @@ -15,6 +15,7 @@ "require": { "ezyang/htmlpurifier": "~4.7.0", "mobiledetect/mobiledetectlib": "2.8.*", + "league/html-to-markdown": "~4.4.1", "pear-pear.php.net/Text_Highlighter": "*" }, "repositories": [ diff --git a/composer.lock b/composer.lock index b35d1ca120..b46614f6e3 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "This file is @generated automatically" ], - "content-hash": "4d3a9e742e7ad746fb7206f3b5aff5af", + "content-hash": "802372ddf124ef949e80dd8dc1d38797", "packages": [ { "name": "ezyang/htmlpurifier", @@ -50,6 +50,70 @@ ], "time": "2015-08-05T01:03:42+00:00" }, + { + "name": "league/html-to-markdown", + "version": "4.4.1", + "source": { + "type": "git", + "url": "https://github.com/thephpleague/html-to-markdown.git", + "reference": "82ea375b5b2b1da1da222644c0565c695bf88186" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/thephpleague/html-to-markdown/zipball/82ea375b5b2b1da1da222644c0565c695bf88186", + "reference": "82ea375b5b2b1da1da222644c0565c695bf88186", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-xml": "*", + "php": ">=5.3.3" + }, + "require-dev": { + "mikehaertl/php-shellcommand": "~1.1.0", + "phpunit/phpunit": "4.*", + "scrutinizer/ocular": "~1.1" + }, + "bin": [ + "bin/html-to-markdown" + ], + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "4.5-dev" + } + }, + "autoload": { + "psr-4": { + "League\\HTMLToMarkdown\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Colin O'Dell", + "email": "colinodell@gmail.com", + "homepage": "http://www.colinodell.com", + "role": "Lead Developer" + }, + { + "name": "Nick Cernis", + "email": "nick@cern.is", + "homepage": "http://modernnerd.net", + "role": "Original Author" + } + ], + "description": "An HTML-to-markdown conversion helper for PHP", + "homepage": "https://github.com/thephpleague/html-to-markdown", + "keywords": [ + "html", + "markdown" + ], + "time": "2017-03-16T00:45:59+00:00" + }, { "name": "mobiledetect/mobiledetectlib", "version": "2.8.25", diff --git a/library/html-to-markdown/.gitignore b/library/html-to-markdown/.gitignore deleted file mode 100644 index aa429a9a12..0000000000 --- a/library/html-to-markdown/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -~* -vendor -composer.lock \ No newline at end of file diff --git a/library/html-to-markdown/.travis.yml b/library/html-to-markdown/.travis.yml deleted file mode 100644 index 48b3e64a41..0000000000 --- a/library/html-to-markdown/.travis.yml +++ /dev/null @@ -1,6 +0,0 @@ -language: php -php: - - "5.5" - - "5.4" - - "5.3" -script: phpunit --no-configuration HTML_To_MarkdownTest ./tests/HTML_To_MarkdownTest.php \ No newline at end of file diff --git a/library/html-to-markdown/HTML_To_Markdown.php b/library/html-to-markdown/HTML_To_Markdown.php deleted file mode 100644 index 109780eddf..0000000000 --- a/library/html-to-markdown/HTML_To_Markdown.php +++ /dev/null @@ -1,598 +0,0 @@ - - * @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub. - * @link http://twitter.com/nickcernis Nick on twitter. - * @license http://www.opensource.org/licenses/mit-license.php MIT - */ -class HTML_To_Markdown -{ - /** - * @var DOMDocument The root of the document tree that holds our HTML. - */ - private $document; - - /** - * @var string|boolean The Markdown version of the original HTML, or false if conversion failed - */ - private $output; - - /** - * @var array Class-wide options users can override. - */ - private $options = array( - 'header_style' => 'setext', // Set to "atx" to output H1 and H2 headers as # Header1 and ## Header2 - 'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML - 'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output. - 'bold_style' => '**', // Set to '__' if you prefer the underlined style - 'italic_style' => '*', // Set to '_' if you prefer the underlined style - 'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: "meta style script" - ); - - - /** - * Constructor - * - * Set up a new DOMDocument from the supplied HTML, convert it to Markdown, and store it in $this->$output. - * - * @param string $html The HTML to convert to Markdown. - * @param array $overrides [optional] List of style and error display overrides. - */ - public function __construct($html = null, $overrides = null) - { - if ($overrides) - $this->options = array_merge($this->options, $overrides); - - if ($html) - $this->convert($html); - } - - - /** - * Setter for conversion options - * - * @param $name - * @param $value - */ - public function set_option($name, $value) - { - $this->options[$name] = $value; - } - - - /** - * Convert - * - * Loads HTML and passes to get_markdown() - * - * @param $html - * @return string The Markdown version of the html - */ - public function convert($html) - { - $html = preg_replace('~>\s+<~', '><', $html); // Strip white space between tags to prevent creation of empty #text nodes - - $this->document = new DOMDocument(); - - if ($this->options['suppress_errors']) - libxml_use_internal_errors(true); // Suppress conversion errors (from http://bit.ly/pCCRSX ) - - $this->document->loadHTML('' . $html); // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt ) - $this->document->encoding = 'UTF-8'; - - if ($this->options['suppress_errors']) - libxml_clear_errors(); - - return $this->get_markdown($html); - } - - - /** - * Is Child Of? - * - * Is the node a child of the given parent tag? - * - * @param $parent_name string|array The name of the parent node(s) to search for e.g. 'code' or array('pre', 'code') - * @param $node - * @return bool - */ - private static function is_child_of($parent_name, $node) - { - for ($p = $node->parentNode; $p != false; $p = $p->parentNode) { - if (is_null($p)) - return false; - - if ( is_array($parent_name) && in_array($p->nodeName, $parent_name) ) - return true; - - if ($p->nodeName == $parent_name) - return true; - } - return false; - } - - - /** - * Convert Children - * - * Recursive function to drill into the DOM and convert each node into Markdown from the inside out. - * - * Finds children of each node and convert those to #text nodes containing their Markdown equivalent, - * starting with the innermost element and working up to the outermost element. - * - * @param $node - */ - private function convert_children($node) - { - // Don't convert HTML code inside and
 blocks to Markdown - that should stay as HTML
-        if (self::is_child_of(array('pre', 'code'), $node))
-            return;
-
-        // If the node has children, convert those to Markdown first
-        if ($node->hasChildNodes()) {
-            $length = $node->childNodes->length;
-
-            for ($i = 0; $i < $length; $i++) {
-                $child = $node->childNodes->item($i);
-                $this->convert_children($child);
-            }
-        }
-
-        // Now that child nodes have been converted, convert the original node
-        $markdown = $this->convert_to_markdown($node);
-
-        // Create a DOM text node containing the Markdown equivalent of the original node
-        $markdown_node = $this->document->createTextNode($markdown);
-
-        // Replace the old $node e.g. "

Title

" with the new $markdown_node e.g. "### Title" - $node->parentNode->replaceChild($markdown_node, $node); - } - - - /** - * Get Markdown - * - * Sends the body node to convert_children() to change inner nodes to Markdown #text nodes, then saves and - * returns the resulting converted document as a string in Markdown format. - * - * @return string|boolean The converted HTML as Markdown, or false if conversion failed - */ - private function get_markdown() - { - // Work on the entire DOM tree (including head and body) - $input = $this->document->getElementsByTagName("html")->item(0); - - if (!$input) - return false; - - // Convert all children of this root element. The DOMDocument stored in $this->doc will - // then consist of #text nodes, each containing a Markdown version of the original node - // that it replaced. - $this->convert_children($input); - - // Sanitize and return the body contents as a string. - $markdown = $this->document->saveHTML(); // stores the DOMDocument as a string - $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); - $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); // Double decode to cover cases like &nbsp; http://www.php.net/manual/en/function.htmlentities.php#99984 - $markdown = preg_replace("/]+>/", "", $markdown); // Strip doctype declaration - $unwanted = array('', '', '', '', '', '', '', ' '); - $markdown = str_replace($unwanted, '', $markdown); // Strip unwanted tags - $markdown = trim($markdown, "\n\r\0\x0B"); - - $this->output = $markdown; - - return $markdown; - } - - - /** - * Convert to Markdown - * - * Converts an individual node into a #text node containing a string of its Markdown equivalent. - * - * Example: An

node with text content of "Title" becomes a text node with content of "### Title" - * - * @param $node - * @return string The converted HTML as Markdown - */ - private function convert_to_markdown($node) - { - $tag = $node->nodeName; // the type of element, e.g. h1 - $value = $node->nodeValue; // the value of that element, e.g. The Title - - // Strip nodes named in remove_nodes - $tags_to_remove = explode(' ', $this->options['remove_nodes']); - if ( in_array($tag, $tags_to_remove) ) - return false; - - switch ($tag) { - case "p": - $markdown = (trim($value)) ? rtrim($value) . PHP_EOL . PHP_EOL : ''; - break; - case "pre": - $markdown = PHP_EOL . $this->convert_code($node) . PHP_EOL; - break; - case "h1": - case "h2": - $markdown = $this->convert_header($tag, $node); - break; - case "h3": - $markdown = "### " . $value . PHP_EOL . PHP_EOL; - break; - case "h4": - $markdown = "#### " . $value . PHP_EOL . PHP_EOL; - break; - case "h5": - $markdown = "##### " . $value . PHP_EOL . PHP_EOL; - break; - case "h6": - $markdown = "###### " . $value . PHP_EOL . PHP_EOL; - break; - case "em": - case "i": - case "strong": - case "b": - $markdown = $this->convert_emphasis($tag, $value); - break; - case "hr": - $markdown = "- - - - - -" . PHP_EOL . PHP_EOL; - break; - case "br": - $markdown = " " . PHP_EOL; - break; - case "blockquote": - $markdown = $this->convert_blockquote($node); - break; - case "code": - $markdown = $this->convert_code($node); - break; - case "ol": - case "ul": - $markdown = $value . PHP_EOL; - break; - case "li": - $markdown = $this->convert_list($node); - break; - case "img": - $markdown = $this->convert_image($node); - break; - case "a": - $markdown = $this->convert_anchor($node); - break; - case "#text": - $markdown = preg_replace('~\s+~', ' ', $value); - $markdown = preg_replace('~^#~', '\\\\#', $markdown); - break; - case "#comment": - $markdown = ''; - break; - case "div": - $markdown = ($this->options['strip_tags']) ? $value . PHP_EOL . PHP_EOL : html_entity_decode($node->C14N()); - break; - default: - // If strip_tags is false (the default), preserve tags that don't have Markdown equivalents, - // such as nodes on their own. C14N() canonicalizes the node to a string. - // See: http://www.php.net/manual/en/domnode.c14n.php - $markdown = ($this->options['strip_tags']) ? $value : html_entity_decode($node->C14N()); - } - - return $markdown; - } - - - /** - * Convert Header - * - * Converts h1 and h2 headers to Markdown-style headers in setext style, - * matching the number of underscores with the length of the title. - * - * e.g. Header 1 Header Two - * ======== ---------- - * - * Returns atx headers instead if $this->options['header_style'] is "atx" - * - * e.g. # Header 1 ## Header Two - * - * @param string $level The header level, including the "h". e.g. h1 - * @param string $node The node to convert. - * @return string The Markdown version of the header. - */ - private function convert_header($level, $node) - { - $content = $node->nodeValue; - - if (!$this->is_child_of('blockquote', $node) && $this->options['header_style'] == "setext") { - $length = (function_exists('mb_strlen')) ? mb_strlen($content, 'utf-8') : strlen($content); - $underline = ($level == "h1") ? "=" : "-"; - $markdown = $content . PHP_EOL . str_repeat($underline, $length) . PHP_EOL . PHP_EOL; // setext style - } else { - $prefix = ($level == "h1") ? "# " : "## "; - $markdown = $prefix . $content . PHP_EOL . PHP_EOL; // atx style - } - - return $markdown; - } - - - /** - * Converts inline styles - * This function is used to render strong and em tags - * - * eg bold text becomes **bold text** or __bold text__ - * - * @param string $tag - * @param string $value - * @return string - */ - private function convert_emphasis($tag, $value) - { - if ($tag == 'i' || $tag == 'em') { - $markdown = $this->options['italic_style'] . $value . $this->options['italic_style']; - } else { - $markdown = $this->options['bold_style'] . $value . $this->options['bold_style']; - } - - return $markdown; - } - - - /** - * Convert Image - * - * Converts tags to Markdown. - * - * e.g. alt text - * becomes ![alt text](/path/img.jpg "Title") - * - * @param $node - * @return string - */ - private function convert_image($node) - { - $src = $node->getAttribute('src'); - $alt = $node->getAttribute('alt'); - $title = $node->getAttribute('title'); - - if ($title != "") { - $markdown = '![' . $alt . '](' . $src . ' "' . $title . '")'; // No newlines added. should be in a block-level element. - } else { - $markdown = '![' . $alt . '](' . $src . ')'; - } - - return $markdown; - } - - - /** - * Convert Anchor - * - * Converts tags to Markdown. - * - * e.g. Modern Nerd - * becomes [Modern Nerd](http://modernnerd.net "Title") - * - * @param $node - * @return string - */ - private function convert_anchor($node) - { - $href = $node->getAttribute('href'); - $title = $node->getAttribute('title'); - $text = $node->nodeValue; - - if ($title != "") { - $markdown = '[' . $text . '](' . $href . ' "' . $title . '")'; - } else { - $markdown = '[' . $text . '](' . $href . ')'; - } - - if (! $href) - $markdown = html_entity_decode($node->C14N()); - - // Append a space if the node after this one is also an anchor - $next_node_name = $this->get_next_node_name($node); - - if ($next_node_name == 'a') - $markdown = $markdown . ' '; - - return $markdown; - } - - - /** - * Convert List - * - * Converts