From 2f46675a89b1dcb453654be4be2117114a3d0a65 Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Sun, 22 Feb 2015 17:38:28 +0100 Subject: [PATCH 01/13] New routines for markdown to html and html to markdown. --- include/bb2diaspora.php | 19 +- include/bbcode.php | 7 +- library/html-to-markdown/HTML_To_Markdown.php | 592 ++++ library/html-to-markdown/LICENSE | 20 + library/html-to-markdown/README.md | 138 + library/markdown.php | 2932 +---------------- library/parsedown/LICENSE.txt | 20 + library/parsedown/Parsedown.php | 1528 +++++++++ library/parsedown/README.md | 48 + 9 files changed, 2368 insertions(+), 2936 deletions(-) create mode 100644 library/html-to-markdown/HTML_To_Markdown.php create mode 100644 library/html-to-markdown/LICENSE create mode 100644 library/html-to-markdown/README.md create mode 100644 library/parsedown/LICENSE.txt create mode 100755 library/parsedown/Parsedown.php create mode 100644 library/parsedown/README.md diff --git a/include/bb2diaspora.php b/include/bb2diaspora.php index 7107c49139..272b69dff9 100644 --- a/include/bb2diaspora.php +++ b/include/bb2diaspora.php @@ -5,7 +5,8 @@ require_once("include/event.php"); require_once("library/markdown.php"); require_once("include/html2bbcode.php"); require_once("include/bbcode.php"); -require_once("include/markdownify/markdownify.php"); +require_once("library/html-to-markdown/HTML_To_Markdown.php"); +//require_once("include/markdownify/markdownify.php"); // we don't want to support a bbcode specific markdown interpreter @@ -21,15 +22,15 @@ function diaspora2bb($s) { $s = str_replace("\r","",$s); //
is invalid. Replace it with the valid expression - $s = str_replace(array("
", "

", "

", '

'),array("
", "
", "
", "
"),$s); - - $s = preg_replace('/\@\{(.+?)\; (.+?)\@(.+?)\}/','@[url=https://$3/u/$2]$1[/url]',$s); + //$s = str_replace(array("
", "

", "

", '

'),array("
", "
", "
", "
"),$s); // Escaping the hash tags $s = preg_replace('/\#([^\s\#])/','#$1',$s); $s = Markdown($s); + $s = preg_replace('/\@\{(.+?)\; (.+?)\@(.+?)\}/','@[url=https://$3/u/$2]$1[/url]',$s); + $s = str_replace('#','#',$s); $s = html2bbcode($s); @@ -92,12 +93,15 @@ function bb2diaspora($Text,$preserve_nl = false, $fordiaspora = true) { $Text = bbcode($Text, $preserve_nl, false, 4); // Libertree doesn't convert a harizontal rule if there isn't a linefeed - $Text = str_replace(array("


", "
"), array("

", "

"), $Text); + //$Text = str_replace(array("
", "
"), array("

", "

"), $Text); } // Now convert HTML to Markdown - $md = new Markdownify(false, false, false); - $Text = $md->parseString($Text); + $Text = new HTML_To_Markdown($Text); + +/* + //$md = new Markdownify(false, false, false); + //$Text = $md->parseString($Text); // The Markdownify converter converts underscores '_' in URLs to '\_', which // messes up the URL. Manually fix these @@ -123,6 +127,7 @@ function bb2diaspora($Text,$preserve_nl = false, $fordiaspora = true) { // Remove all unconverted tags $Text = strip_tags($Text); +*/ // Remove any leading or trailing whitespace, as this will mess up // the Diaspora signature verification and cause the item to disappear diff --git a/include/bbcode.php b/include/bbcode.php index 9a3563527a..d461b98482 100644 --- a/include/bbcode.php +++ b/include/bbcode.php @@ -168,6 +168,8 @@ function bb_remove_share_information($Text, $plaintext = false, $nolink = false) } function bb_cleanup_share($shared, $plaintext, $nolink) { + $shared[1] = trim($shared[1]); + if (!in_array($shared[2], array("type-link", "type-video"))) return($shared[0]); @@ -178,7 +180,7 @@ function bb_cleanup_share($shared, $plaintext, $nolink) { return($shared[0]); if ($nolink) - return(trim($shared[1])); + return($shared[1]); $title = ""; $link = ""; @@ -189,6 +191,9 @@ function bb_cleanup_share($shared, $plaintext, $nolink) { if (isset($bookmark[1][0])) $link = $bookmark[1][0]; + if (($title != "") AND (strpos($title, $shared[1]) !== false)) + $shared[1] = $title; + if (($title != "") AND ((strpos($shared[1],$title) !== false) OR (similar_text($shared[1],$title) / strlen($title)) > 0.9)) $title = ""; diff --git a/library/html-to-markdown/HTML_To_Markdown.php b/library/html-to-markdown/HTML_To_Markdown.php new file mode 100644 index 0000000000..1cc86505b6 --- /dev/null +++ b/library/html-to-markdown/HTML_To_Markdown.php @@ -0,0 +1,592 @@ + + * @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub. + * @link http://twitter.com/nickcernis Nick on twitter. + * @license http://www.opensource.org/licenses/mit-license.php MIT + */ +class HTML_To_Markdown +{ + /** + * @var DOMDocument The root of the document tree that holds our HTML. + */ + private $document; + + /** + * @var string|boolean The Markdown version of the original HTML, or false if conversion failed + */ + private $output; + + /** + * @var array Class-wide options users can override. + */ + private $options = array( + 'header_style' => 'setext', // Set to "atx" to output H1 and H2 headers as # Header1 and ## Header2 + 'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML + 'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output. + 'bold_style' => '**', // Set to '__' if you prefer the underlined style + 'italic_style' => '*', // Set to '_' if you prefer the underlined style + 'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: "meta style script" + ); + + + /** + * Constructor + * + * Set up a new DOMDocument from the supplied HTML, convert it to Markdown, and store it in $this->$output. + * + * @param string $html The HTML to convert to Markdown. + * @param array $overrides [optional] List of style and error display overrides. + */ + public function __construct($html = null, $overrides = null) + { + if ($overrides) + $this->options = array_merge($this->options, $overrides); + + if ($html) + $this->convert($html); + } + + + /** + * Setter for conversion options + * + * @param $name + * @param $value + */ + public function set_option($name, $value) + { + $this->options[$name] = $value; + } + + + /** + * Convert + * + * Loads HTML and passes to get_markdown() + * + * @param $html + * @return string The Markdown version of the html + */ + public function convert($html) + { + $html = preg_replace('~>\s+<~', '><', $html); // Strip white space between tags to prevent creation of empty #text nodes + + $this->document = new DOMDocument(); + + if ($this->options['suppress_errors']) + libxml_use_internal_errors(true); // Suppress conversion errors (from http://bit.ly/pCCRSX ) + + $this->document->loadHTML('' . $html); // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt ) + $this->document->encoding = 'UTF-8'; + + if ($this->options['suppress_errors']) + libxml_clear_errors(); + + return $this->get_markdown($html); + } + + + /** + * Is Child Of? + * + * Is the node a child of the given parent tag? + * + * @param $parent_name string The name of the parent node to search for (e.g. 'code') + * @param $node + * @return bool + */ + private static function is_child_of($parent_name, $node) + { + for ($p = $node->parentNode; $p != false; $p = $p->parentNode) { + if (is_null($p)) + return false; + + if ($p->nodeName == $parent_name) + return true; + } + return false; + } + + + /** + * Convert Children + * + * Recursive function to drill into the DOM and convert each node into Markdown from the inside out. + * + * Finds children of each node and convert those to #text nodes containing their Markdown equivalent, + * starting with the innermost element and working up to the outermost element. + * + * @param $node + */ + private function convert_children($node) + { + // Don't convert HTML code inside and
 blocks to Markdown - that should stay as HTML
+        if (self::is_child_of('pre', $node) || self::is_child_of('code', $node))
+            return;
+
+        // If the node has children, convert those to Markdown first
+        if ($node->hasChildNodes()) {
+            $length = $node->childNodes->length;
+
+            for ($i = 0; $i < $length; $i++) {
+                $child = $node->childNodes->item($i);
+                $this->convert_children($child);
+            }
+        }
+
+        // Now that child nodes have been converted, convert the original node
+        $markdown = $this->convert_to_markdown($node);
+
+        // Create a DOM text node containing the Markdown equivalent of the original node
+        $markdown_node = $this->document->createTextNode($markdown);
+
+        // Replace the old $node e.g. "

Title

" with the new $markdown_node e.g. "### Title" + $node->parentNode->replaceChild($markdown_node, $node); + } + + + /** + * Get Markdown + * + * Sends the body node to convert_children() to change inner nodes to Markdown #text nodes, then saves and + * returns the resulting converted document as a string in Markdown format. + * + * @return string|boolean The converted HTML as Markdown, or false if conversion failed + */ + private function get_markdown() + { + // Work on the entire DOM tree (including head and body) + $input = $this->document->getElementsByTagName("html")->item(0); + + if (!$input) + return false; + + // Convert all children of this root element. The DOMDocument stored in $this->doc will + // then consist of #text nodes, each containing a Markdown version of the original node + // that it replaced. + $this->convert_children($input); + + // Sanitize and return the body contents as a string. + $markdown = $this->document->saveHTML(); // stores the DOMDocument as a string + $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); + $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); // Double decode to cover cases like &nbsp; http://www.php.net/manual/en/function.htmlentities.php#99984 + $markdown = preg_replace("/]+>/", "", $markdown); // Strip doctype declaration + $unwanted = array('', '', '', '', '', '', '', ' '); + $markdown = str_replace($unwanted, '', $markdown); // Strip unwanted tags + $markdown = trim($markdown, "\n\r\0\x0B"); + + $this->output = $markdown; + + return $markdown; + } + + + /** + * Convert to Markdown + * + * Converts an individual node into a #text node containing a string of its Markdown equivalent. + * + * Example: An

node with text content of "Title" becomes a text node with content of "### Title" + * + * @param $node + * @return string The converted HTML as Markdown + */ + private function convert_to_markdown($node) + { + $tag = $node->nodeName; // the type of element, e.g. h1 + $value = $node->nodeValue; // the value of that element, e.g. The Title + + // Strip nodes named in remove_nodes + $tags_to_remove = explode(' ', $this->options['remove_nodes']); + if ( in_array($tag, $tags_to_remove) ) + return false; + + switch ($tag) { + case "p": + $markdown = (trim($value)) ? rtrim($value) . PHP_EOL . PHP_EOL : ''; + break; + case "pre": + $markdown = PHP_EOL . $this->convert_code($node) . PHP_EOL; + break; + case "h1": + case "h2": + $markdown = $this->convert_header($tag, $node); + break; + case "h3": + $markdown = "### " . $value . PHP_EOL . PHP_EOL; + break; + case "h4": + $markdown = "#### " . $value . PHP_EOL . PHP_EOL; + break; + case "h5": + $markdown = "##### " . $value . PHP_EOL . PHP_EOL; + break; + case "h6": + $markdown = "###### " . $value . PHP_EOL . PHP_EOL; + break; + case "em": + case "i": + case "strong": + case "b": + $markdown = $this->convert_emphasis($tag, $value); + break; + case "hr": + $markdown = "- - - - - -" . PHP_EOL . PHP_EOL; + break; + case "br": + $markdown = " " . PHP_EOL; + break; + case "blockquote": + $markdown = $this->convert_blockquote($node); + break; + case "code": + $markdown = $this->convert_code($node); + break; + case "ol": + case "ul": + $markdown = $value . PHP_EOL; + break; + case "li": + $markdown = $this->convert_list($node); + break; + case "img": + $markdown = $this->convert_image($node); + break; + case "a": + $markdown = $this->convert_anchor($node); + break; + case "#text": + $markdown = preg_replace('~\s+~', ' ', $value); + $markdown = preg_replace('~^#~', '\\\\#', $markdown); + break; + case "#comment": + $markdown = ''; + break; + case "div": + $markdown = ($this->options['strip_tags']) ? $value . PHP_EOL . PHP_EOL : html_entity_decode($node->C14N()); + break; + default: + // If strip_tags is false (the default), preserve tags that don't have Markdown equivalents, + // such as nodes on their own. C14N() canonicalizes the node to a string. + // See: http://www.php.net/manual/en/domnode.c14n.php + $markdown = ($this->options['strip_tags']) ? $value : html_entity_decode($node->C14N()); + } + + return $markdown; + } + + + /** + * Convert Header + * + * Converts h1 and h2 headers to Markdown-style headers in setext style, + * matching the number of underscores with the length of the title. + * + * e.g. Header 1 Header Two + * ======== ---------- + * + * Returns atx headers instead if $this->options['header_style'] is "atx" + * + * e.g. # Header 1 ## Header Two + * + * @param string $level The header level, including the "h". e.g. h1 + * @param string $node The node to convert. + * @return string The Markdown version of the header. + */ + private function convert_header($level, $node) + { + $content = $node->nodeValue; + + if (!$this->is_child_of('blockquote', $node) && $this->options['header_style'] == "setext") { + $length = (function_exists('mb_strlen')) ? mb_strlen($content, 'utf-8') : strlen($content); + $underline = ($level == "h1") ? "=" : "-"; + $markdown = $content . PHP_EOL . str_repeat($underline, $length) . PHP_EOL . PHP_EOL; // setext style + } else { + $prefix = ($level == "h1") ? "# " : "## "; + $markdown = $prefix . $content . PHP_EOL . PHP_EOL; // atx style + } + + return $markdown; + } + + + /** + * Converts inline styles + * This function is used to render strong and em tags + * + * eg bold text becomes **bold text** or __bold text__ + * + * @param string $tag + * @param string $value + * @return string + */ + private function convert_emphasis($tag, $value) + { + if ($tag == 'i' || $tag == 'em') { + $markdown = $this->options['italic_style'] . $value . $this->options['italic_style']; + } else { + $markdown = $this->options['bold_style'] . $value . $this->options['bold_style']; + } + + return $markdown; + } + + + /** + * Convert Image + * + * Converts tags to Markdown. + * + * e.g. alt text + * becomes ![alt text](/path/img.jpg "Title") + * + * @param $node + * @return string + */ + private function convert_image($node) + { + $src = $node->getAttribute('src'); + $alt = $node->getAttribute('alt'); + $title = $node->getAttribute('title'); + + if ($title != "") { + $markdown = '![' . $alt . '](' . $src . ' "' . $title . '")'; // No newlines added. should be in a block-level element. + } else { + $markdown = '![' . $alt . '](' . $src . ')'; + } + + return $markdown; + } + + + /** + * Convert Anchor + * + * Converts tags to Markdown. + * + * e.g. Modern Nerd + * becomes [Modern Nerd](http://modernnerd.net "Title") + * + * @param $node + * @return string + */ + private function convert_anchor($node) + { + $href = $node->getAttribute('href'); + $title = $node->getAttribute('title'); + $text = $node->nodeValue; + + if ($title != "") { + $markdown = '[' . $text . '](' . $href . ' "' . $title . '")'; + } else { + $markdown = '[' . $text . '](' . $href . ')'; + } + + // Append a space if the node after this one is also an anchor + $next_node_name = $this->get_next_node_name($node); + + if ($next_node_name == 'a') + $markdown = $markdown . ' '; + + return $markdown; + } + + + /** + * Convert List + * + * Converts