1
0
Fork 0

Move HTML to Markdown library to Composer

This commit is contained in:
Hypolite Petovan 2017-04-06 23:33:12 -04:00
commit c9dafe3b4e
43 changed files with 2380 additions and 776 deletions

View file

@ -0,0 +1,44 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class BlockquoteConverter implements ConverterInterface
{
    /**
     * Renders a blockquote by prefixing each line of its content with '> '.
     *
     * @param ElementInterface $element The blockquote element
     *
     * @return string The quoted lines followed by one blank line
     */
    public function convert(ElementInterface $element)
    {
        // The element's value is expected to be Markdown already; only the
        // quote markers are still missing.
        $rows = preg_split('/\r\n|\r|\n/', trim($element->getValue()));

        $quoted = '';
        foreach ($rows as $row) {
            $quoted .= '> ' . $row . "\n";
        }

        // preg_split() without flags always yields at least one row, so the
        // blank line that closes the quote is always appended exactly once.
        return $quoted . "\n";
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('blockquote');
    }
}

View file

@ -0,0 +1,62 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class CodeConverter implements ConverterInterface
{
    /**
     * Converts a code element into an inline code span or a fenced code block.
     *
     * A class starting with 'language-' selects the language shown on the
     * fence line of a multi-line block. Inline (single-line) code never
     * carries the language, because anything placed between the backticks
     * would become part of the code itself.
     *
     * @param ElementInterface $element The code element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $prefix = 'language-';
        $language = '';

        // Tags can carry several classes; pick the first one that starts with the prefix.
        $classes = $element->getAttribute('class');
        if ($classes) {
            foreach (explode(' ', $classes) as $class) {
                // Anchor the match at position 0 so that classes merely
                // containing 'language-' (e.g. 'my-language-x') are ignored.
                if (strpos($class, $prefix) === 0) {
                    // The cast covers PHP versions where substr() returns false
                    // when nothing follows the prefix.
                    $language = (string) substr($class, strlen($prefix));
                    break;
                }
            }
        }

        $code = html_entity_decode($element->getChildrenAsString());

        // Strip the surrounding code tags; the opening tag may carry attributes,
        // hence the regular expression.
        $code = preg_replace('/<code\b[^>]*>/', '', $code);
        $code = str_replace('</code>', '', $code);

        $lines = preg_split('/\r\n|\r|\n/', $code);
        if (count($lines) > 1) {
            // Multiple lines: fenced block, with the language as the info string.
            return '```' . $language . "\n" . $code . "\n" . '```';
        }

        // Single line: plain inline code span.
        return '`' . $code . '`';
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('code');
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class CommentConverter implements ConverterInterface
{
    /**
     * Discards comment nodes: they always convert to an empty string.
     *
     * @param ElementInterface $element The comment node (its content is not read)
     *
     * @return string Always ''
     */
    public function convert(ElementInterface $element)
    {
        $nothing = '';

        return $nothing;
    }

    /**
     * @return string[] Node names handled by this converter
     */
    public function getSupportedTags()
    {
        $handled = array('#comment');

        return $handled;
    }
}

View file

@ -0,0 +1,20 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
/**
 * Contract shared by all converters in this namespace: each one turns a
 * single element into a string and declares which node names it handles.
 */
interface ConverterInterface
{
/**
 * Converts the given element to its string representation.
 *
 * @param ElementInterface $element The element to convert
 *
 * @return string The converted output (may be an empty string)
 */
public function convert(ElementInterface $element);
/**
 * Lists the node names this converter handles, e.g. 'blockquote',
 * '#comment' or '#text' in the implementations of this namespace.
 *
 * @return string[]
 */
public function getSupportedTags();
}

View file

@ -0,0 +1,50 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class DefaultConverter implements ConverterInterface, ConfigurationAwareInterface
{
    /**
     * Name under which this converter registers itself.
     */
    const DEFAULT_CONVERTER = '_default';

    /**
     * @var Configuration
     */
    protected $config;

    /**
     * Stores the configuration consulted by convert().
     *
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Converts an element registered under the default name.
     *
     * With the 'strip_tags' option enabled only the element's value is kept;
     * otherwise (the option defaults to false here) the element's string form
     * is returned with HTML entities decoded.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $stripTags = $this->config->getOption('strip_tags', false);

        if (!$stripTags) {
            // Keep the markup: emit the element's string form, entities decoded.
            $raw = $element->getChildrenAsString();

            return html_entity_decode($raw);
        }

        return $element->getValue();
    }

    /**
     * @return string[] Names handled by this converter
     */
    public function getSupportedTags()
    {
        return array(self::DEFAULT_CONVERTER);
    }
}

View file

@ -0,0 +1,45 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class DivConverter implements ConverterInterface, ConfigurationAwareInterface
{
    /**
     * @var Configuration
     */
    protected $config;

    /**
     * Stores the configuration consulted by convert().
     *
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Converts a div element.
     *
     * With the 'strip_tags' option enabled the element's value is returned
     * followed by a blank line; otherwise the element's string form is
     * returned with HTML entities decoded.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $stripTags = $this->config->getOption('strip_tags', false);

        return $stripTags
            ? $element->getValue() . "\n\n"
            : html_entity_decode($element->getChildrenAsString());
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('div');
    }
}

View file

@ -0,0 +1,57 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class EmphasisConverter implements ConverterInterface, ConfigurationAwareInterface
{
    /**
     * @var Configuration
     */
    protected $config;

    /**
     * Stores the configuration that supplies the emphasis delimiters.
     *
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Wraps the element's value in the configured italic or bold delimiter.
     *
     * 'i' and 'em' use the 'italic_style' option, every other supported tag
     * uses 'bold_style'. Whitespace at either edge of the value is moved
     * outside the delimiters (collapsed to a single space) so the delimiters
     * sit directly against the text.
     *
     * @param ElementInterface $element
     *
     * @return string The emphasised text, or '' when the value is blank
     */
    public function convert(ElementInterface $element)
    {
        $tag = $element->getTagName();
        $value = $element->getValue();
        $text = trim($value);

        // Compare against '' explicitly: a truthiness check would also discard
        // the perfectly valid content "0".
        if ($text === '') {
            return '';
        }

        if ($tag === 'i' || $tag === 'em') {
            $style = $this->config->getOption('italic_style');
        } else {
            $style = $this->config->getOption('bold_style');
        }

        $prefix = ltrim($value) !== $value ? ' ' : '';
        $suffix = rtrim($value) !== $value ? ' ' : '';

        return $prefix . $style . $text . $style . $suffix;
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('em', 'i', 'strong', 'b');
    }
}

View file

@ -0,0 +1,41 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class HardBreakConverter implements ConverterInterface, ConfigurationAwareInterface
{
    /**
     * @var Configuration
     */
    protected $config;

    /**
     * Stores the configuration consulted by convert().
     *
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Converts a line break element.
     *
     * When the 'hard_break' option is enabled a bare newline is emitted.
     * Otherwise the newline is preceded by two spaces: Markdown only treats
     * a line ending as a hard break when at least two trailing spaces precede
     * it, a single space would be rendered as an ordinary soft break.
     *
     * @param ElementInterface $element The br element (its content is not read)
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        return $this->config->getOption('hard_break') ? "\n" : "  \n";
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('br');
    }
}

View file

@ -0,0 +1,78 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\Configuration;
use League\HTMLToMarkdown\ConfigurationAwareInterface;
use League\HTMLToMarkdown\ElementInterface;
class HeaderConverter implements ConverterInterface, ConfigurationAwareInterface
{
    const STYLE_ATX = 'atx';
    const STYLE_SETEXT = 'setext';

    /**
     * @var Configuration
     */
    protected $config;

    /**
     * Stores the configuration that selects the header style.
     *
     * @param Configuration $config
     */
    public function setConfig(Configuration $config)
    {
        $this->config = $config;
    }

    /**
     * Converts an h1-h6 element into a Markdown header.
     *
     * Setext (underlined) headers are produced only for levels 1 and 2,
     * outside of blockquotes, and when the 'header_style' option (default:
     * setext) asks for them. Everything else becomes an ATX ('#') header.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        // The level is the digit that follows the 'h' in the tag name.
        $level = (int) substr($element->getTagName(), 1, 1);
        $style = $this->config->getOption('header_style', self::STYLE_SETEXT);

        $underlinable = $level === 1 || $level === 2;
        if ($underlinable && !$element->isDescendantOf('blockquote') && $style === self::STYLE_SETEXT) {
            return $this->createSetextHeader($level, $element->getValue());
        }

        return $this->createAtxHeader($level, $element->getValue());
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
    }

    /**
     * Builds an underlined header: '=' for level 1, '-' otherwise.
     *
     * @param int    $level
     * @param string $content
     *
     * @return string
     */
    private function createSetextHeader($level, $content)
    {
        // Count characters rather than bytes when mbstring is available, so
        // the underline matches the visible width of multi-byte text.
        if (function_exists('mb_strlen')) {
            $width = mb_strlen($content, 'utf-8');
        } else {
            $width = strlen($content);
        }

        $rule = '-';
        if ($level === 1) {
            $rule = '=';
        }

        return $content . "\n" . str_repeat($rule, $width) . "\n\n";
    }

    /**
     * Builds a '#'-prefixed header with one '#' per level.
     *
     * @param int    $level
     * @param string $content
     *
     * @return string
     */
    private function createAtxHeader($level, $content)
    {
        return str_repeat('#', $level) . ' ' . $content . "\n\n";
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class HorizontalRuleConverter implements ConverterInterface
{
    /**
     * Emits a fixed horizontal rule followed by a blank line.
     *
     * @param ElementInterface $element The hr element (its content is not read)
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $rule = "- - - - - -\n\n";

        return $rule;
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        $handled = array('hr');

        return $handled;
    }
}

View file

@ -0,0 +1,35 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ImageConverter implements ConverterInterface
{
    /**
     * Converts an img element to Markdown image syntax, including the title
     * in double quotes when the 'title' attribute is not an empty string.
     *
     * No newlines are added; an image is expected to sit inside a
     * block-level element.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $source = $element->getAttribute('src');
        $label = $element->getAttribute('alt');
        $caption = $element->getAttribute('title');

        // Optional ' "title"' part that follows the URL inside the parentheses.
        $captionPart = '';
        if ($caption !== '') {
            $captionPart = ' "' . $caption . '"';
        }

        return '![' . $label . '](' . $source . $captionPart . ')';
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('img');
    }
}

View file

@ -0,0 +1,52 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class LinkConverter implements ConverterInterface
{
    /**
     * Converts an anchor element to a Markdown link.
     *
     * - Without a usable href the element's string form is returned with
     *   HTML entities decoded.
     * - With a title, '[text](href "title")' is produced.
     * - When the link text equals the href and the href qualifies as an
     *   autolink, the short '<href>' form is used.
     * - Otherwise '[text](href)' is produced.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $href = $element->getAttribute('href');

        // Nothing to link to: bail out before building a link that would be discarded.
        if (!$href) {
            return html_entity_decode($element->getChildrenAsString());
        }

        $title = $element->getAttribute('title');
        $text = trim($element->getValue());

        if ($title !== '') {
            return '[' . $text . '](' . $href . ' "' . $title . '")';
        }

        if ($href === $text && $this->isValidAutolink($href)) {
            return '<' . $href . '>';
        }

        return '[' . $text . '](' . $href . ')';
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('a');
    }

    /**
     * Tells whether the whole href is a scheme followed by ':' and a run of
     * characters that excludes '<', '>', spaces and control characters.
     *
     * The pattern is anchored at both ends (with the D modifier so '$' cannot
     * match before a trailing newline); without the end anchor the excluded
     * characters would only limit where the match stops, not reject the href.
     *
     * @param string $href
     *
     * @return bool
     */
    private function isValidAutolink($href)
    {
        return preg_match('/^[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*$/iD', $href) === 1;
    }
}

View file

@ -0,0 +1,26 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ListBlockConverter implements ConverterInterface
{
    /**
     * Returns the list's value followed by a single newline.
     *
     * @param ElementInterface $element The ol or ul element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $items = $element->getValue();

        return $items . "\n";
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('ol', 'ul');
    }
}

View file

@ -0,0 +1,47 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ListItemConverter implements ConverterInterface
{
    /**
     * Converts a list item, using '- ' inside a ul parent and the item's
     * sibling position followed by '. ' inside any other parent.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $parentTag = $element->getParent()->getTagName();
        $depth = $element->getListItemLevel($element);

        // Lines after the first are indented according to the nesting depth.
        $continuation = "\n" . str_repeat(' ', $depth + 1);
        $body = trim(str_replace("\n", $continuation, trim($element->getValue())));

        // The first item of a nested list starts on a fresh line.
        $lead = ($depth > 0 && $element->getSiblingPosition() === 1) ? "\n" : '';

        $marker = $parentTag === 'ul'
            ? '- '
            : $element->getSiblingPosition() . '. ';

        return $lead . $marker . $body . "\n";
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('li');
    }
}

View file

@ -0,0 +1,124 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class ParagraphConverter implements ConverterInterface
{
    /**
     * Converts a paragraph: every line of the element's value is escaped
     * where it could be mistaken for Markdown syntax, and the result is
     * terminated by a blank line.
     *
     * @param ElementInterface $element
     *
     * @return string The escaped paragraph, or '' when it contains only whitespace
     */
    public function convert(ElementInterface $element)
    {
        $lines = preg_split('/\r\n|\r|\n/', $element->getValue());

        $markdown = '';
        foreach ($lines as $line) {
            // Some characters only need escaping depending on where they appear in the line.
            $markdown .= $this->escapeSpecialCharacters($line) . "\n";
        }

        return trim($markdown) !== '' ? rtrim($markdown) . "\n\n" : '';
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('p');
    }

    /**
     * Applies all escaping passes to a single line.
     *
     * @param string $line
     *
     * @return string
     */
    private function escapeSpecialCharacters($line)
    {
        $line = $this->escapeFirstCharacters($line);
        $line = $this->escapeOtherCharacters($line);
        $line = $this->escapeOtherCharactersRegex($line);

        return $line;
    }

    /**
     * Prefixes the line with a backslash when, ignoring leading whitespace,
     * it starts with a sequence that Markdown would interpret as block
     * syntax. Leading whitespace is dropped from an escaped line.
     *
     * @param string $line
     *
     * @return string
     */
    private function escapeFirstCharacters($line)
    {
        $escapable = array(
            '>',
            '- ',
            '+ ',
            '--',
            '~~~',
            '---',
            '- - -'
        );

        $trimmed = ltrim($line);
        foreach ($escapable as $marker) {
            if (strpos($trimmed, $marker) === 0) {
                return '\\' . $trimmed;
            }
        }

        return $line;
    }

    /**
     * Escapes sequences that need a backslash wherever they occur in the line.
     *
     * Every occurrence is escaped, not just the first one, so a line holding
     * several '<!--' sequences is handled completely.
     *
     * @param string $line
     *
     * @return string
     */
    private function escapeOtherCharacters($line)
    {
        $escapable = array(
            '<!--'
        );

        foreach ($escapable as $sequence) {
            $line = str_replace($sequence, '\\' . $sequence, $line);
        }

        return $line;
    }

    /**
     * Escapes pattern-based constructs; currently a number at the start of
     * the line that is followed by ')' or '.', by inserting the backslash
     * between the number and that character.
     *
     * @param string $line
     *
     * @return string
     */
    private function escapeOtherCharactersRegex($line)
    {
        $regExs = array(
            // Match numbers ending on ')' or '.' that are at the beginning of the line.
            '/^[0-9]+(?=\)|\.)/'
        );

        foreach ($regExs as $regEx) {
            if (preg_match($regEx, $line, $match)) {
                // The match covers only the digits, so its length is the offset of the offending character.
                $line = substr_replace($line, '\\', strlen($match[0]), 0);
            }
        }

        return $line;
    }
}

View file

@ -0,0 +1,59 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class PreformattedConverter implements ConverterInterface
{
    /**
     * Converts a pre element into inline code or a fenced code block.
     *
     * Content that already starts and ends with a backtick is returned
     * unchanged, since it was produced by an already converted code tag.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $pre_content = html_entity_decode($element->getChildrenAsString());

        // Remove the pre tags themselves. The opening tag may carry attributes
        // (class, style, ...), so a literal '<pre>' match is not enough.
        $pre_content = preg_replace('/<pre(\s[^>]*)?>/', '', $pre_content);
        $pre_content = str_replace('</pre>', '', $pre_content);

        /*
         * Pre tags usually wrap a code tag. If the content begins and ends
         * with a backtick, the code tag has been converted already and there
         * is nothing left to do.
         */
        $trimmed = trim($pre_content);
        $firstBacktick = strpos($trimmed, '`');
        $lastBacktick = strrpos($trimmed, '`');
        if ($firstBacktick === 0 && $lastBacktick === strlen($trimmed) - 1) {
            return $pre_content;
        }

        // Plain pre tag without a nested code tag: normalize the line endings.
        $pre_content = preg_replace('/\r\n|\r|\n/', PHP_EOL, $pre_content);

        $lines = preg_split('/\r\n|\r|\n/', $pre_content);
        if (count($lines) > 1) {
            // Multiple lines: fenced code block.
            return '```' . "\n" . $pre_content . "\n" . '```';
        }

        // Single line: inline code span.
        return '`' . $pre_content . '`';
    }

    /**
     * @return string[] Tag names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('pre');
    }
}

View file

@ -0,0 +1,46 @@
<?php
namespace League\HTMLToMarkdown\Converter;
use League\HTMLToMarkdown\ElementInterface;
class TextConverter implements ConverterInterface
{
    /**
     * Converts a text node: collapses whitespace runs into single spaces and
     * escapes characters that carry meaning in Markdown.
     *
     * @param ElementInterface $element
     *
     * @return string
     */
    public function convert(ElementInterface $element)
    {
        $markdown = $element->getValue();

        // Remove leftover \n at the beginning of the line
        $markdown = ltrim($markdown, "\n");

        // Replace sequences of invisible characters with spaces
        $markdown = $this->replace('~\s+~', ' ', $markdown);

        // Escape the following characters: '*', '_', '[', ']' and '\'
        $markdown = $this->replace('~([*_\\[\\]\\\\])~', '\\\\$1', $markdown);

        // A leading '#' would otherwise start a header
        $markdown = $this->replace('~^#~', '\\\\#', $markdown);

        // A lone space is dropped when nothing follows it or a block element follows
        if ($markdown === ' ') {
            $next = $element->getNext();
            if (!$next || $next->isBlock()) {
                $markdown = '';
            }
        }

        return $markdown;
    }

    /**
     * @return string[] Node names handled by this converter
     */
    public function getSupportedTags()
    {
        return array('#text');
    }

    /**
     * Runs preg_replace() in UTF-8 mode and falls back to byte-wise matching.
     *
     * With the 'u' modifier preg_replace() returns null when the subject is
     * not valid UTF-8, which would silently erase the whole text. In that
     * case the replacement is retried without the modifier, and if that
     * fails too the subject is returned untouched.
     *
     * @param string $pattern     Delimited pattern without trailing modifiers
     * @param string $replacement
     * @param string $subject
     *
     * @return string
     */
    private function replace($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern . 'u', $replacement, $subject);
        if ($result === null) {
            $result = preg_replace($pattern, $replacement, $subject);
        }

        return $result === null ? $subject : $result;
    }
}