Friendica Communications Platform (please note that this is a clone of the repository at github, issues are handled there) https://friendi.ca
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

598 lines
18 KiB

7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
  1. <?php
  2. /**
  3. * Class HTML_To_Markdown
  4. *
  5. * A helper class to convert HTML to Markdown.
  6. *
  7. * @version 2.2.1
  8. * @author Nick Cernis <nick@cern.is>
  9. * @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub.
  10. * @link http://twitter.com/nickcernis Nick on twitter.
  11. * @license http://www.opensource.org/licenses/mit-license.php MIT
  12. */
  13. class HTML_To_Markdown
  14. {
  15. /**
  16. * @var DOMDocument The root of the document tree that holds our HTML.
  17. */
  18. private $document;
  19. /**
  20. * @var string|boolean The Markdown version of the original HTML, or false if conversion failed
  21. */
  22. private $output;
  23. /**
  24. * @var array Class-wide options users can override.
  25. */
  26. private $options = array(
  27. 'header_style' => 'setext', // Set to "atx" to output H1 and H2 headers as # Header1 and ## Header2
  28. 'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
  29. 'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
  30. 'bold_style' => '**', // Set to '__' if you prefer the underlined style
  31. 'italic_style' => '*', // Set to '_' if you prefer the underlined style
  32. 'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: "meta style script"
  33. );
  34. /**
  35. * Constructor
  36. *
  37. * Set up a new DOMDocument from the supplied HTML, convert it to Markdown, and store it in $this->$output.
  38. *
  39. * @param string $html The HTML to convert to Markdown.
  40. * @param array $overrides [optional] List of style and error display overrides.
  41. */
  42. public function __construct($html = null, $overrides = null)
  43. {
  44. if ($overrides)
  45. $this->options = array_merge($this->options, $overrides);
  46. if ($html)
  47. $this->convert($html);
  48. }
  49. /**
  50. * Setter for conversion options
  51. *
  52. * @param $name
  53. * @param $value
  54. */
  55. public function set_option($name, $value)
  56. {
  57. $this->options[$name] = $value;
  58. }
  59. /**
  60. * Convert
  61. *
  62. * Loads HTML and passes to get_markdown()
  63. *
  64. * @param $html
  65. * @return string The Markdown version of the html
  66. */
  67. public function convert($html)
  68. {
  69. $html = preg_replace('~>\s+<~', '><', $html); // Strip white space between tags to prevent creation of empty #text nodes
  70. $this->document = new DOMDocument();
  71. if ($this->options['suppress_errors'])
  72. libxml_use_internal_errors(true); // Suppress conversion errors (from http://bit.ly/pCCRSX )
  73. $this->document->loadHTML('<?xml encoding="UTF-8">' . $html); // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt )
  74. $this->document->encoding = 'UTF-8';
  75. if ($this->options['suppress_errors'])
  76. libxml_clear_errors();
  77. return $this->get_markdown($html);
  78. }
  79. /**
  80. * Is Child Of?
  81. *
  82. * Is the node a child of the given parent tag?
  83. *
  84. * @param $parent_name string|array The name of the parent node(s) to search for e.g. 'code' or array('pre', 'code')
  85. * @param $node
  86. * @return bool
  87. */
  88. private static function is_child_of($parent_name, $node)
  89. {
  90. for ($p = $node->parentNode; $p != false; $p = $p->parentNode) {
  91. if (is_null($p))
  92. return false;
  93. if ( is_array($parent_name) && in_array($p->nodeName, $parent_name) )
  94. return true;
  95. if ($p->nodeName == $parent_name)
  96. return true;
  97. }
  98. return false;
  99. }
  100. /**
  101. * Convert Children
  102. *
  103. * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
  104. *
  105. * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
  106. * starting with the innermost element and working up to the outermost element.
  107. *
  108. * @param $node
  109. */
  110. private function convert_children($node)
  111. {
  112. // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
  113. if (self::is_child_of(array('pre', 'code'), $node))
  114. return;
  115. // If the node has children, convert those to Markdown first
  116. if ($node->hasChildNodes()) {
  117. $length = $node->childNodes->length;
  118. for ($i = 0; $i < $length; $i++) {
  119. $child = $node->childNodes->item($i);
  120. $this->convert_children($child);
  121. }
  122. }
  123. // Now that child nodes have been converted, convert the original node
  124. $markdown = $this->convert_to_markdown($node);
  125. // Create a DOM text node containing the Markdown equivalent of the original node
  126. $markdown_node = $this->document->createTextNode($markdown);
  127. // Replace the old $node e.g. "<h3>Title</h3>" with the new $markdown_node e.g. "### Title"
  128. $node->parentNode->replaceChild($markdown_node, $node);
  129. }
  130. /**
  131. * Get Markdown
  132. *
  133. * Sends the body node to convert_children() to change inner nodes to Markdown #text nodes, then saves and
  134. * returns the resulting converted document as a string in Markdown format.
  135. *
  136. * @return string|boolean The converted HTML as Markdown, or false if conversion failed
  137. */
  138. private function get_markdown()
  139. {
  140. // Work on the entire DOM tree (including head and body)
  141. $input = $this->document->getElementsByTagName("html")->item(0);
  142. if (!$input)
  143. return false;
  144. // Convert all children of this root element. The DOMDocument stored in $this->doc will
  145. // then consist of #text nodes, each containing a Markdown version of the original node
  146. // that it replaced.
  147. $this->convert_children($input);
  148. // Sanitize and return the body contents as a string.
  149. $markdown = $this->document->saveHTML(); // stores the DOMDocument as a string
  150. $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
  151. $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); // Double decode to cover cases like &amp;nbsp; http://www.php.net/manual/en/function.htmlentities.php#99984
  152. $markdown = preg_replace("/<!DOCTYPE [^>]+>/", "", $markdown); // Strip doctype declaration
  153. $unwanted = array('<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<?xml encoding="UTF-8">', '&#xD;');
  154. $markdown = str_replace($unwanted, '', $markdown); // Strip unwanted tags
  155. $markdown = trim($markdown, "\n\r\0\x0B");
  156. $this->output = $markdown;
  157. return $markdown;
  158. }
  159. /**
  160. * Convert to Markdown
  161. *
  162. * Converts an individual node into a #text node containing a string of its Markdown equivalent.
  163. *
  164. * Example: An <h3> node with text content of "Title" becomes a text node with content of "### Title"
  165. *
  166. * @param $node
  167. * @return string The converted HTML as Markdown
  168. */
  169. private function convert_to_markdown($node)
  170. {
  171. $tag = $node->nodeName; // the type of element, e.g. h1
  172. $value = $node->nodeValue; // the value of that element, e.g. The Title
  173. // Strip nodes named in remove_nodes
  174. $tags_to_remove = explode(' ', $this->options['remove_nodes']);
  175. if ( in_array($tag, $tags_to_remove) )
  176. return false;
  177. switch ($tag) {
  178. case "p":
  179. $markdown = (trim($value)) ? rtrim($value) . PHP_EOL . PHP_EOL : '';
  180. break;
  181. case "pre":
  182. $markdown = PHP_EOL . $this->convert_code($node) . PHP_EOL;
  183. break;
  184. case "h1":
  185. case "h2":
  186. $markdown = $this->convert_header($tag, $node);
  187. break;
  188. case "h3":
  189. $markdown = "### " . $value . PHP_EOL . PHP_EOL;
  190. break;
  191. case "h4":
  192. $markdown = "#### " . $value . PHP_EOL . PHP_EOL;
  193. break;
  194. case "h5":
  195. $markdown = "##### " . $value . PHP_EOL . PHP_EOL;
  196. break;
  197. case "h6":
  198. $markdown = "###### " . $value . PHP_EOL . PHP_EOL;
  199. break;
  200. case "em":
  201. case "i":
  202. case "strong":
  203. case "b":
  204. $markdown = $this->convert_emphasis($tag, $value);
  205. break;
  206. case "hr":
  207. $markdown = "- - - - - -" . PHP_EOL . PHP_EOL;
  208. break;
  209. case "br":
  210. $markdown = " " . PHP_EOL;
  211. break;
  212. case "blockquote":
  213. $markdown = $this->convert_blockquote($node);
  214. break;
  215. case "code":
  216. $markdown = $this->convert_code($node);
  217. break;
  218. case "ol":
  219. case "ul":
  220. $markdown = $value . PHP_EOL;
  221. break;
  222. case "li":
  223. $markdown = $this->convert_list($node);
  224. break;
  225. case "img":
  226. $markdown = $this->convert_image($node);
  227. break;
  228. case "a":
  229. $markdown = $this->convert_anchor($node);
  230. break;
  231. case "#text":
  232. $markdown = preg_replace('~\s+~', ' ', $value);
  233. $markdown = preg_replace('~^#~', '\\\\#', $markdown);
  234. break;
  235. case "#comment":
  236. $markdown = '';
  237. break;
  238. case "div":
  239. $markdown = ($this->options['strip_tags']) ? $value . PHP_EOL . PHP_EOL : html_entity_decode($node->C14N());
  240. break;
  241. default:
  242. // If strip_tags is false (the default), preserve tags that don't have Markdown equivalents,
  243. // such as <span> nodes on their own. C14N() canonicalizes the node to a string.
  244. // See: http://www.php.net/manual/en/domnode.c14n.php
  245. $markdown = ($this->options['strip_tags']) ? $value : html_entity_decode($node->C14N());
  246. }
  247. return $markdown;
  248. }
  249. /**
  250. * Convert Header
  251. *
  252. * Converts h1 and h2 headers to Markdown-style headers in setext style,
  253. * matching the number of underscores with the length of the title.
  254. *
  255. * e.g. Header 1 Header Two
  256. * ======== ----------
  257. *
  258. * Returns atx headers instead if $this->options['header_style'] is "atx"
  259. *
  260. * e.g. # Header 1 ## Header Two
  261. *
  262. * @param string $level The header level, including the "h". e.g. h1
  263. * @param string $node The node to convert.
  264. * @return string The Markdown version of the header.
  265. */
  266. private function convert_header($level, $node)
  267. {
  268. $content = $node->nodeValue;
  269. if (!$this->is_child_of('blockquote', $node) && $this->options['header_style'] == "setext") {
  270. $length = (function_exists('mb_strlen')) ? mb_strlen($content, 'utf-8') : strlen($content);
  271. $underline = ($level == "h1") ? "=" : "-";
  272. $markdown = $content . PHP_EOL . str_repeat($underline, $length) . PHP_EOL . PHP_EOL; // setext style
  273. } else {
  274. $prefix = ($level == "h1") ? "# " : "## ";
  275. $markdown = $prefix . $content . PHP_EOL . PHP_EOL; // atx style
  276. }
  277. return $markdown;
  278. }
  279. /**
  280. * Converts inline styles
  281. * This function is used to render strong and em tags
  282. *
  283. * eg <strong>bold text</strong> becomes **bold text** or __bold text__
  284. *
  285. * @param string $tag
  286. * @param string $value
  287. * @return string
  288. */
  289. private function convert_emphasis($tag, $value)
  290. {
  291. if ($tag == 'i' || $tag == 'em') {
  292. $markdown = $this->options['italic_style'] . $value . $this->options['italic_style'];
  293. } else {
  294. $markdown = $this->options['bold_style'] . $value . $this->options['bold_style'];
  295. }
  296. return $markdown;
  297. }
  298. /**
  299. * Convert Image
  300. *
  301. * Converts <img /> tags to Markdown.
  302. *
  303. * e.g. <img src="/path/img.jpg" alt="alt text" title="Title" />
  304. * becomes ![alt text](/path/img.jpg "Title")
  305. *
  306. * @param $node
  307. * @return string
  308. */
  309. private function convert_image($node)
  310. {
  311. $src = $node->getAttribute('src');
  312. $alt = $node->getAttribute('alt');
  313. $title = $node->getAttribute('title');
  314. if ($title != "") {
  315. $markdown = '![' . $alt . '](' . $src . ' "' . $title . '")'; // No newlines added. <img> should be in a block-level element.
  316. } else {
  317. $markdown = '![' . $alt . '](' . $src . ')';
  318. }
  319. return $markdown;
  320. }
  321. /**
  322. * Convert Anchor
  323. *
  324. * Converts <a> tags to Markdown.
  325. *
  326. * e.g. <a href="http://modernnerd.net" title="Title">Modern Nerd</a>
  327. * becomes [Modern Nerd](http://modernnerd.net "Title")
  328. *
  329. * @param $node
  330. * @return string
  331. */
  332. private function convert_anchor($node)
  333. {
  334. $href = $node->getAttribute('href');
  335. $title = $node->getAttribute('title');
  336. $text = $node->nodeValue;
  337. if ($title != "") {
  338. $markdown = '[' . $text . '](' . $href . ' "' . $title . '")';
  339. } else {
  340. $markdown = '[' . $text . '](' . $href . ')';
  341. }
  342. if (! $href)
  343. $markdown = html_entity_decode($node->C14N());
  344. // Append a space if the node after this one is also an anchor
  345. $next_node_name = $this->get_next_node_name($node);
  346. if ($next_node_name == 'a')
  347. $markdown = $markdown . ' ';
  348. return $markdown;
  349. }
  350. /**
  351. * Convert List
  352. *
  353. * Converts <ul> and <ol> lists to Markdown.
  354. *
  355. * @param $node
  356. * @return string
  357. */
  358. private function convert_list($node)
  359. {
  360. // If parent is an ol, use numbers, otherwise, use dashes
  361. $list_type = $node->parentNode->nodeName;
  362. $value = $node->nodeValue;
  363. if ($list_type == "ul") {
  364. $markdown = "- " . trim($value) . PHP_EOL;
  365. } else {
  366. $number = $this->get_position($node);
  367. $markdown = $number . ". " . trim($value) . PHP_EOL;
  368. }
  369. return $markdown;
  370. }
  371. /**
  372. * Convert Code
  373. *
  374. * Convert code tags by indenting blocks of code and wrapping single lines in backticks.
  375. *
  376. * @param DOMNode $node
  377. * @return string
  378. */
  379. private function convert_code($node)
  380. {
  381. // Store the content of the code block in an array, one entry for each line
  382. $markdown = '';
  383. $code_content = html_entity_decode($node->C14N());
  384. $code_content = str_replace(array("<code>", "</code>"), "", $code_content);
  385. $code_content = str_replace(array("<pre>", "</pre>"), "", $code_content);
  386. $lines = preg_split('/\r\n|\r|\n/', $code_content);
  387. $total = count($lines);
  388. // If there's more than one line of code, prepend each line with four spaces and no backticks.
  389. if ($total > 1 || $node->nodeName === 'pre') {
  390. // Remove the first and last line if they're empty
  391. $first_line = trim($lines[0]);
  392. $last_line = trim($lines[$total - 1]);
  393. $first_line = trim($first_line, "&#xD;"); //trim XML style carriage returns too
  394. $last_line = trim($last_line, "&#xD;");
  395. if (empty($first_line))
  396. array_shift($lines);
  397. if (empty($last_line))
  398. array_pop($lines);
  399. $count = 1;
  400. foreach ($lines as $line) {
  401. $line = str_replace('&#xD;', '', $line);
  402. $markdown .= " " . $line;
  403. // Add newlines, except final line of the code
  404. if ($count != $total)
  405. $markdown .= PHP_EOL;
  406. $count++;
  407. }
  408. $markdown .= PHP_EOL;
  409. } else { // There's only one line of code. It's a code span, not a block. Just wrap it with backticks.
  410. $markdown .= "`" . $lines[0] . "`";
  411. }
  412. return $markdown;
  413. }
  414. /**
  415. * Convert blockquote
  416. *
  417. * Prepend blockquotes with > chars.
  418. *
  419. * @param $node
  420. * @return string
  421. */
  422. private function convert_blockquote($node)
  423. {
  424. // Contents should have already been converted to Markdown by this point,
  425. // so we just need to add ">" symbols to each line.
  426. $markdown = '';
  427. $quote_content = trim($node->nodeValue);
  428. $lines = preg_split('/\r\n|\r|\n/', $quote_content);
  429. $total_lines = count($lines);
  430. foreach ($lines as $i => $line) {
  431. $markdown .= "> " . $line . PHP_EOL;
  432. if ($i + 1 == $total_lines)
  433. $markdown .= PHP_EOL;
  434. }
  435. return $markdown;
  436. }
  437. /**
  438. * Get Position
  439. *
  440. * Returns the numbered position of a node inside its parent
  441. *
  442. * @param $node
  443. * @return int The numbered position of the node, starting at 1.
  444. */
  445. private function get_position($node)
  446. {
  447. // Get all of the nodes inside the parent
  448. $list_nodes = $node->parentNode->childNodes;
  449. $total_nodes = $list_nodes->length;
  450. $position = 1;
  451. // Loop through all nodes and find the given $node
  452. for ($a = 0; $a < $total_nodes; $a++) {
  453. $current_node = $list_nodes->item($a);
  454. if ($current_node->isSameNode($node))
  455. $position = $a + 1;
  456. }
  457. return $position;
  458. }
  459. /**
  460. * Get Next Node Name
  461. *
  462. * Return the name of the node immediately after the passed one.
  463. *
  464. * @param $node
  465. * @return string|null The node name (e.g. 'h1') or null.
  466. */
  467. private function get_next_node_name($node)
  468. {
  469. $next_node_name = null;
  470. $current_position = $this->get_position($node);
  471. $next_node = $node->parentNode->childNodes->item($current_position);
  472. if ($next_node)
  473. $next_node_name = $next_node->nodeName;
  474. return $next_node_name;
  475. }
  476. /**
  477. * To String
  478. *
  479. * Magic method to return Markdown output when HTML_To_Markdown instance is treated as a string.
  480. *
  481. * @return string
  482. */
  483. public function __toString()
  484. {
  485. return $this->output();
  486. }
  487. /**
  488. * Output
  489. *
  490. * Getter for the converted Markdown contents stored in $this->output
  491. *
  492. * @return string
  493. */
  494. public function output()
  495. {
  496. if (!$this->output) {
  497. return '';
  498. } else {
  499. return $this->output;
  500. }
  501. }
  502. }