489 lines
15 KiB
PHP
489 lines
15 KiB
PHP
|
<?php
|
||
|
/**
|
||
|
* Class to convert HTML to Markdown with PHP Markdown Extra syntax support.
|
||
|
*
|
||
|
* @version 1.0.0 alpha
|
||
|
* @author Milian Wolff (<mail@milianw.de>, <http://milianw.de>)
|
||
|
* @license LGPL, see LICENSE_LGPL.txt and the summary below
|
||
|
* @copyright (C) 2007 Milian Wolff
|
||
|
*
|
||
|
* This library is free software; you can redistribute it and/or
|
||
|
* modify it under the terms of the GNU Lesser General Public
|
||
|
* License as published by the Free Software Foundation; either
|
||
|
* version 2.1 of the License, or (at your option) any later version.
|
||
|
*
|
||
|
* This library is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
* Lesser General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Lesser General Public
|
||
|
* License along with this library; if not, write to the Free Software
|
||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* standard Markdownify class
|
||
|
*/
|
||
|
require_once dirname(__FILE__).'/markdownify.php';
|
||
|
|
||
|
class Markdownify_Extra extends Markdownify {
|
||
|
/**
|
||
|
* table data, including rows with content and the maximum width of each col
|
||
|
*
|
||
|
* @var array
|
||
|
*/
|
||
|
var $table = array();
|
||
|
/**
|
||
|
* current col
|
||
|
*
|
||
|
* @var int
|
||
|
*/
|
||
|
var $col = -1;
|
||
|
/**
|
||
|
* current row
|
||
|
*
|
||
|
* @var int
|
||
|
*/
|
||
|
var $row = 0;
|
||
|
/**
|
||
|
* constructor, see Markdownify::Markdownify() for more information
|
||
|
*/
|
||
|
function Markdownify_Extra($linksAfterEachParagraph = MDFY_LINKS_EACH_PARAGRAPH, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML) {
|
||
|
parent::Markdownify($linksAfterEachParagraph, $bodyWidth, $keepHTML);
|
||
|
|
||
|
### new markdownable tags & attributes
|
||
|
# header ids: # foo {bar}
|
||
|
$this->isMarkdownable['h1']['id'] = 'optional';
|
||
|
$this->isMarkdownable['h2']['id'] = 'optional';
|
||
|
$this->isMarkdownable['h3']['id'] = 'optional';
|
||
|
$this->isMarkdownable['h4']['id'] = 'optional';
|
||
|
$this->isMarkdownable['h5']['id'] = 'optional';
|
||
|
$this->isMarkdownable['h6']['id'] = 'optional';
|
||
|
# tables
|
||
|
$this->isMarkdownable['table'] = array();
|
||
|
$this->isMarkdownable['th'] = array(
|
||
|
'align' => 'optional',
|
||
|
);
|
||
|
$this->isMarkdownable['td'] = array(
|
||
|
'align' => 'optional',
|
||
|
);
|
||
|
$this->isMarkdownable['tr'] = array();
|
||
|
array_push($this->ignore, 'thead');
|
||
|
array_push($this->ignore, 'tbody');
|
||
|
array_push($this->ignore, 'tfoot');
|
||
|
# definition lists
|
||
|
$this->isMarkdownable['dl'] = array();
|
||
|
$this->isMarkdownable['dd'] = array();
|
||
|
$this->isMarkdownable['dt'] = array();
|
||
|
# footnotes
|
||
|
$this->isMarkdownable['fnref'] = array(
|
||
|
'target' => 'required',
|
||
|
);
|
||
|
$this->isMarkdownable['footnotes'] = array();
|
||
|
$this->isMarkdownable['fn'] = array(
|
||
|
'name' => 'required',
|
||
|
);
|
||
|
$this->parser->blockElements['fnref'] = false;
|
||
|
$this->parser->blockElements['fn'] = true;
|
||
|
$this->parser->blockElements['footnotes'] = true;
|
||
|
# abbr
|
||
|
$this->isMarkdownable['abbr'] = array(
|
||
|
'title' => 'required',
|
||
|
);
|
||
|
# build RegEx lookahead to decide wether table can pe parsed or not
|
||
|
$inlineTags = array_keys($this->parser->blockElements, false);
|
||
|
$colContents = '(?:[^<]|<(?:'.implode('|', $inlineTags).'|[^a-z]))+';
|
||
|
$this->tableLookaheadHeader = '{
|
||
|
^\s*(?:<thead\s*>)?\s* # open optional thead
|
||
|
<tr\s*>\s*(?: # start required row with headers
|
||
|
<th(?:\s+align=("|\')(?:left|center|right)\1)?\s*> # header with optional align
|
||
|
\s*'.$colContents.'\s* # contents
|
||
|
</th>\s* # close header
|
||
|
)+</tr> # close row with headers
|
||
|
\s*(?:</thead>)? # close optional thead
|
||
|
}sxi';
|
||
|
$this->tdSubstitute = '\s*'.$colContents.'\s* # contents
|
||
|
</td>\s*';
|
||
|
$this->tableLookaheadBody = '{
|
||
|
\s*(?:<tbody\s*>)?\s* # open optional tbody
|
||
|
(?:<tr\s*>\s* # start row
|
||
|
%s # cols to be substituted
|
||
|
</tr>)+ # close row
|
||
|
\s*(?:</tbody>)? # close optional tbody
|
||
|
\s*</table> # close table
|
||
|
}sxi';
|
||
|
}
|
||
|
/**
|
||
|
* handle header tags (<h1> - <h6>)
|
||
|
*
|
||
|
* @param int $level 1-6
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleHeader($level) {
|
||
|
static $id = null;
|
||
|
if ($this->parser->isStartTag) {
|
||
|
if (isset($this->parser->tagAttributes['id'])) {
|
||
|
$id = $this->parser->tagAttributes['id'];
|
||
|
}
|
||
|
} else {
|
||
|
if (!is_null($id)) {
|
||
|
$this->out(' {#'.$id.'}');
|
||
|
$id = null;
|
||
|
}
|
||
|
}
|
||
|
parent::handleHeader($level);
|
||
|
}
|
||
|
/**
|
||
|
* handle <abbr> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_abbr() {
|
||
|
if ($this->parser->isStartTag) {
|
||
|
$this->stack();
|
||
|
$this->buffer();
|
||
|
} else {
|
||
|
$tag = $this->unstack();
|
||
|
$tag['text'] = $this->unbuffer();
|
||
|
$add = true;
|
||
|
foreach ($this->stack['abbr'] as $stacked) {
|
||
|
if ($stacked['text'] == $tag['text']) {
|
||
|
/** TODO: differing abbr definitions, i.e. different titles for same text **/
|
||
|
$add = false;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
$this->out($tag['text']);
|
||
|
if ($add) {
|
||
|
array_push($this->stack['abbr'], $tag);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* flush stacked abbr tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function flushStacked_abbr() {
|
||
|
$out = array();
|
||
|
foreach ($this->stack['abbr'] as $k => $tag) {
|
||
|
if (!isset($tag['unstacked'])) {
|
||
|
array_push($out, ' *['.$tag['text'].']: '.$tag['title']);
|
||
|
$tag['unstacked'] = true;
|
||
|
$this->stack['abbr'][$k] = $tag;
|
||
|
}
|
||
|
}
|
||
|
if (!empty($out)) {
|
||
|
$this->out("\n\n".implode("\n", $out));
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <table> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_table() {
|
||
|
if ($this->parser->isStartTag) {
|
||
|
# check if upcoming table can be converted
|
||
|
if ($this->keepHTML) {
|
||
|
if (preg_match($this->tableLookaheadHeader, $this->parser->html, $matches)) {
|
||
|
# header seems good, now check body
|
||
|
# get align & number of cols
|
||
|
preg_match_all('#<th(?:\s+align=("|\')(left|right|center)\1)?\s*>#si', $matches[0], $cols);
|
||
|
$regEx = '';
|
||
|
$i = 1;
|
||
|
$aligns = array();
|
||
|
foreach ($cols[2] as $align) {
|
||
|
$align = strtolower($align);
|
||
|
array_push($aligns, $align);
|
||
|
if (empty($align)) {
|
||
|
$align = 'left'; # default value
|
||
|
}
|
||
|
$td = '\s+align=("|\')'.$align.'\\'.$i;
|
||
|
$i++;
|
||
|
if ($align == 'left') {
|
||
|
# look for empty align or left
|
||
|
$td = '(?:'.$td.')?';
|
||
|
}
|
||
|
$td = '<td'.$td.'\s*>';
|
||
|
$regEx .= $td.$this->tdSubstitute;
|
||
|
}
|
||
|
$regEx = sprintf($this->tableLookaheadBody, $regEx);
|
||
|
if (preg_match($regEx, $this->parser->html, $matches, null, strlen($matches[0]))) {
|
||
|
# this is a markdownable table tag!
|
||
|
$this->table = array(
|
||
|
'rows' => array(),
|
||
|
'col_widths' => array(),
|
||
|
'aligns' => $aligns,
|
||
|
);
|
||
|
$this->row = 0;
|
||
|
} else {
|
||
|
# non markdownable table
|
||
|
$this->handleTagToText();
|
||
|
}
|
||
|
} else {
|
||
|
# non markdownable table
|
||
|
$this->handleTagToText();
|
||
|
}
|
||
|
} else {
|
||
|
$this->table = array(
|
||
|
'rows' => array(),
|
||
|
'col_widths' => array(),
|
||
|
'aligns' => array(),
|
||
|
);
|
||
|
$this->row = 0;
|
||
|
}
|
||
|
} else {
|
||
|
# finally build the table in Markdown Extra syntax
|
||
|
$separator = array();
|
||
|
# seperator with correct align identifikators
|
||
|
foreach($this->table['aligns'] as $col => $align) {
|
||
|
if (!$this->keepHTML && !isset($this->table['col_widths'][$col])) {
|
||
|
break;
|
||
|
}
|
||
|
$left = ' ';
|
||
|
$right = ' ';
|
||
|
switch ($align) {
|
||
|
case 'left':
|
||
|
$left = ':';
|
||
|
break;
|
||
|
case 'center':
|
||
|
$right = ':';
|
||
|
$left = ':';
|
||
|
case 'right':
|
||
|
$right = ':';
|
||
|
break;
|
||
|
}
|
||
|
array_push($separator, $left.str_repeat('-', $this->table['col_widths'][$col]).$right);
|
||
|
}
|
||
|
$separator = '|'.implode('|', $separator).'|';
|
||
|
|
||
|
$rows = array();
|
||
|
# add padding
|
||
|
array_walk_recursive($this->table['rows'], array(&$this, 'alignTdContent'));
|
||
|
$header = array_shift($this->table['rows']);
|
||
|
array_push($rows, '| '.implode(' | ', $header).' |');
|
||
|
array_push($rows, $separator);
|
||
|
foreach ($this->table['rows'] as $row) {
|
||
|
array_push($rows, '| '.implode(' | ', $row).' |');
|
||
|
}
|
||
|
$this->out(implode("\n".$this->indent, $rows));
|
||
|
$this->table = array();
|
||
|
$this->setLineBreaks(2);
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* properly pad content so it is aligned as whished
|
||
|
* should be used with array_walk_recursive on $this->table['rows']
|
||
|
*
|
||
|
* @param string &$content
|
||
|
* @param int $col
|
||
|
* @return void
|
||
|
*/
|
||
|
function alignTdContent(&$content, $col) {
|
||
|
switch ($this->table['aligns'][$col]) {
|
||
|
default:
|
||
|
case 'left':
|
||
|
$content .= str_repeat(' ', $this->table['col_widths'][$col] - $this->strlen($content));
|
||
|
break;
|
||
|
case 'right':
|
||
|
$content = str_repeat(' ', $this->table['col_widths'][$col] - $this->strlen($content)).$content;
|
||
|
break;
|
||
|
case 'center':
|
||
|
$paddingNeeded = $this->table['col_widths'][$col] - $this->strlen($content);
|
||
|
$left = floor($paddingNeeded / 2);
|
||
|
$right = $paddingNeeded - $left;
|
||
|
$content = str_repeat(' ', $left).$content.str_repeat(' ', $right);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <tr> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_tr() {
|
||
|
if ($this->parser->isStartTag) {
|
||
|
$this->col = -1;
|
||
|
} else {
|
||
|
$this->row++;
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <td> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_td() {
|
||
|
if ($this->parser->isStartTag) {
|
||
|
$this->col++;
|
||
|
if (!isset($this->table['col_widths'][$this->col])) {
|
||
|
$this->table['col_widths'][$this->col] = 0;
|
||
|
}
|
||
|
$this->buffer();
|
||
|
} else {
|
||
|
$buffer = trim($this->unbuffer());
|
||
|
$this->table['col_widths'][$this->col] = max($this->table['col_widths'][$this->col], $this->strlen($buffer));
|
||
|
$this->table['rows'][$this->row][$this->col] = $buffer;
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <th> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_th() {
|
||
|
if (!$this->keepHTML && !isset($this->table['rows'][1]) && !isset($this->table['aligns'][$this->col+1])) {
|
||
|
if (isset($this->parser->tagAttributes['align'])) {
|
||
|
$this->table['aligns'][$this->col+1] = $this->parser->tagAttributes['align'];
|
||
|
} else {
|
||
|
$this->table['aligns'][$this->col+1] = '';
|
||
|
}
|
||
|
}
|
||
|
$this->handleTag_td();
|
||
|
}
|
||
|
/**
|
||
|
* handle <dl> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_dl() {
|
||
|
if (!$this->parser->isStartTag) {
|
||
|
$this->setLineBreaks(2);
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <dt> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
**/
|
||
|
function handleTag_dt() {
|
||
|
if (!$this->parser->isStartTag) {
|
||
|
$this->setLineBreaks(1);
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <dd> tags
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_dd() {
|
||
|
if ($this->parser->isStartTag) {
|
||
|
if (substr(ltrim($this->parser->html), 0, 3) == '<p>') {
|
||
|
# next comes a paragraph, so we'll need an extra line
|
||
|
$this->out("\n".$this->indent);
|
||
|
} elseif (substr($this->output, -2) == "\n\n") {
|
||
|
$this->output = substr($this->output, 0, -1);
|
||
|
}
|
||
|
$this->out(': ');
|
||
|
$this->indent(' ', false);
|
||
|
} else {
|
||
|
# lookahead for next dt
|
||
|
if (substr(ltrim($this->parser->html), 0, 4) == '<dt>') {
|
||
|
$this->setLineBreaks(2);
|
||
|
} else {
|
||
|
$this->setLineBreaks(1);
|
||
|
}
|
||
|
$this->indent(' ');
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* handle <fnref /> tags (custom footnote references, see markdownify_extra::parseString())
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_fnref() {
|
||
|
$this->out('[^'.$this->parser->tagAttributes['target'].']');
|
||
|
}
|
||
|
/**
|
||
|
* handle <fn> tags (custom footnotes, see markdownify_extra::parseString()
|
||
|
* and markdownify_extra::_makeFootnotes())
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_fn() {
|
||
|
if ($this->parser->isStartTag) {
|
||
|
$this->out('[^'.$this->parser->tagAttributes['name'].']:');
|
||
|
$this->setLineBreaks(1);
|
||
|
} else {
|
||
|
$this->setLineBreaks(2);
|
||
|
}
|
||
|
$this->indent(' ');
|
||
|
}
|
||
|
/**
|
||
|
* handle <footnotes> tag (custom footnotes, see markdownify_extra::parseString()
|
||
|
* and markdownify_extra::_makeFootnotes())
|
||
|
*
|
||
|
* @param void
|
||
|
* @return void
|
||
|
*/
|
||
|
function handleTag_footnotes() {
|
||
|
if (!$this->parser->isStartTag) {
|
||
|
$this->setLineBreaks(2);
|
||
|
}
|
||
|
}
|
||
|
/**
|
||
|
* parse a HTML string, clean up footnotes prior
|
||
|
*
|
||
|
* @param string $HTML input
|
||
|
* @return string Markdown formatted output
|
||
|
*/
|
||
|
function parseString($html) {
|
||
|
/** TODO: custom markdown-extra options, e.g. titles & classes **/
|
||
|
# <sup id="fnref:..."><a href"#fn..." rel="footnote">...</a></sup>
|
||
|
# => <fnref target="..." />
|
||
|
$html = preg_replace('@<sup id="fnref:([^"]+)">\s*<a href="#fn:\1" rel="footnote">\s*\d+\s*</a>\s*</sup>@Us', '<fnref target="$1" />', $html);
|
||
|
# <div class="footnotes">
|
||
|
# <hr />
|
||
|
# <ol>
|
||
|
#
|
||
|
# <li id="fn:...">...</li>
|
||
|
# ...
|
||
|
#
|
||
|
# </ol>
|
||
|
# </div>
|
||
|
# =>
|
||
|
# <footnotes>
|
||
|
# <fn name="...">...</fn>
|
||
|
# ...
|
||
|
# </footnotes>
|
||
|
$html = preg_replace_callback('#<div class="footnotes">\s*<hr />\s*<ol>\s*(.+)\s*</ol>\s*</div>#Us', array(&$this, '_makeFootnotes'), $html);
|
||
|
return parent::parseString($html);
|
||
|
}
|
||
|
/**
|
||
|
* replace HTML representation of footnotes with something more easily parsable
|
||
|
*
|
||
|
* @note this is a callback to be used in parseString()
|
||
|
*
|
||
|
* @param array $matches
|
||
|
* @return string
|
||
|
*/
|
||
|
function _makeFootnotes($matches) {
|
||
|
# <li id="fn:1">
|
||
|
# ...
|
||
|
# <a href="#fnref:block" rev="footnote">↩</a></p>
|
||
|
# </li>
|
||
|
# => <fn name="1">...</fn>
|
||
|
# remove footnote link
|
||
|
$fns = preg_replace('@\s*( \s*)?<a href="#fnref:[^"]+" rev="footnote"[^>]*>↩</a>\s*@s', '', $matches[1]);
|
||
|
# remove empty paragraph
|
||
|
$fns = preg_replace('@<p>\s*</p>@s', '', $fns);
|
||
|
# <li id="fn:1">...</li> -> <footnote nr="1">...</footnote>
|
||
|
$fns = str_replace('<li id="fn:', '<fn name="', $fns);
|
||
|
|
||
|
$fns = '<footnotes>'.$fns.'</footnotes>';
|
||
|
return preg_replace('#</li>\s*(?=(?:<fn|</footnotes>))#s', '</fn>$1', $fns);
|
||
|
}
|
||
|
}
|