618 lines
		
	
	
	
		
			15 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			618 lines
		
	
	
	
		
			15 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
<?php
/**
 * parseHTML is an HTML parser which works with PHP 4 and above.
 * It tries to handle invalid HTML to some degree.
 *
 * @version 1.0 beta
 * @author Milian Wolff (mail@milianw.de, http://milianw.de)
 * @license LGPL, see LICENSE_LGPL.txt and the summary below
 * @copyright (C) 2007  Milian Wolff
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
/**
 * Pull-style tokenizer for (possibly invalid) HTML.
 *
 * Usage: assign the markup to $html, then call nextNode() repeatedly. Each
 * successful call consumes one node from the front of $html and describes it
 * through $nodeType, $node and, for tags, the tag specific properties.
 * Anything that starts with "<" but cannot be parsed as a known tag is
 * escaped ("&lt;") and returned as text.
 */
class parseHTML {
  /**
   * tags which are always empty (<br /> etc.)
   *
   * @var array<string>
   */
  var $emptyTags = array(
    'br',
    'hr',
    'input',
    'img',
    'area',
    'link',
    'meta',
    'param',
  );
  /**
   * tags with preformatted text;
   * whitespace is not touched inside of them
   *
   * @var array<string>
   */
  var $preformattedTags = array(
    'script',
    'style',
    'pre',
    'code',
  );
  /**
   * suppress HTML tags inside <code> (they are escaped and returned as text)
   *
   * @var bool
   */
  var $noTagsInCode = false;
  /**
   * html to be parsed; shrinks from the front with every parsed node
   *
   * @var string
   */
  var $html = '';
  /**
   * node type:
   *
   * - tag (see isStartTag)
   * - text (includes cdata)
   * - comment
   * - doctype
   * - pi (processing instruction)
   *
   * @var string
   */
  var $nodeType = '';
  /**
   * current node content, i.e. either a
   * simple string (text node), or something like
   * <tag attrib="value"...>
   *
   * @var string
   */
  var $node = '';
  /**
   * whether current node is an opening tag (<a>) or not (</a>)
   * set to NULL if current node is not a tag
   * NOTE: empty tags (<br />) set this to true as well!
   *
   * @var bool | null
   */
  var $isStartTag = null;
  /**
   * whether current node is an empty tag (<br />) or not (<a></a>)
   *
   * @var bool | null
   */
  var $isEmptyTag = null;
  /**
   * tag name (lower case)
   *
   * @var string | null
   */
  var $tagName = '';
  /**
   * attributes of current tag
   *
   * @var array (attribName=>value) | null
   */
  var $tagAttributes = null;
  /**
   * whether the current tag is a block element
   *
   * @var bool | null
   */
  var $isBlockElement = null;

  /**
   * nesting depth of preformatted tags; whitespace is kept while > 0
   *
   * @var int
   */
  var $keepWhitespace = 0;
  /**
   * whether a whitespace-only text node at the current position is dropped.
   *
   * Kept per instance: a function-level static would be shared by every
   * parser object and would leak state from one parse into the next.
   *
   * @var bool
   */
  var $skipWhitespace = true;
  /**
   * list of open tags
   * count this to get current depth
   *
   * @var array
   */
  var $openTags = array();
  /**
   * list of known elements; tags not listed here are treated as invalid
   *
   * @var array
   * TODO: what shall we do with <del> and <ins> ?! (currently inline)
   */
  var $blockElements = array (
    # tag name => <bool> is block
    # block elements
    'address' => true,
    'blockquote' => true,
    'center' => true,
    'dir' => true,
    'div' => true,
    'dl' => true,
    'fieldset' => true,
    'form' => true,
    'h1' => true,
    'h2' => true,
    'h3' => true,
    'h4' => true,
    'h5' => true,
    'h6' => true,
    'hr' => true,
    'isindex' => true,
    'menu' => true,
    'noframes' => true,
    'noscript' => true,
    'ol' => true,
    'p' => true,
    'pre' => true,
    'table' => true,
    'ul' => true,
    # set table elements and list items to block as well
    'thead' => true,
    'tbody' => true,
    'tfoot' => true,
    'td' => true,
    'tr' => true,
    'th' => true,
    'li' => true,
    'dd' => true,
    'dt' => true,
    # header items and html / body as well
    'html' => true,
    'body' => true,
    'head' => true,
    'meta' => true,
    'link' => true,
    'style' => true,
    'title' => true,
    # unfancy media tags, when indented should be rendered as block
    'map' => true,
    'object' => true,
    'param' => true,
    'embed' => true,
    'area' => true,
    # inline elements
    'a' => false,
    'abbr' => false,
    'acronym' => false,
    'applet' => false,
    'b' => false,
    'basefont' => false,
    'bdo' => false,
    'big' => false,
    'br' => false,
    'button' => false,
    'cite' => false,
    'code' => false,
    'del' => false,
    'dfn' => false,
    'em' => false,
    'font' => false,
    'i' => false,
    'img' => false,
    'ins' => false,
    'input' => false,
    'iframe' => false,
    'kbd' => false,
    'label' => false,
    'q' => false,
    'samp' => false,
    'script' => false,
    'select' => false,
    'small' => false,
    'span' => false,
    'strong' => false,
    'sub' => false,
    'sup' => false,
    'textarea' => false,
    'tt' => false,
    'var' => false,
  );
  /**
   * get next node, set $this->html prior!
   *
   * @param void
   * @return bool false once $this->html is used up, true otherwise
   */
  function nextNode() {
    # compare against '' explicitly: empty() would also swallow a remaining "0"
    if ((string)$this->html === '') {
      # we are done with parsing the html string
      return false;
    }
    # the previously returned node was an opening tag: register it now
    if ($this->isStartTag && !$this->isEmptyTag) {
      array_push($this->openTags, $this->tagName);
      if (in_array($this->tagName, $this->preformattedTags, true)) {
        # dont truncate whitespaces for <code> or <pre> contents
        $this->keepWhitespace++;
      }
    }

    if ($this->html[0] === '<') {
      if ($this->match('<?')) {
        # xml prolog or other pi's
        /** TODO: a ">" inside the instruction ends it prematurely **/
        $this->setNode('pi', $this->offsetAfter(array('>')));
        return true;
      }
      if ($this->match('<!--')) {
        # comment; without a closing --> the next > is used instead
        # (this is firefox' behaviour)
        $this->setNode('comment', $this->offsetAfter(array('-->', '>')));

        $this->skipWhitespace = true;
        return true;
      }
      if (strncasecmp($this->html, '<!DOCTYPE', 9) === 0) {
        # doctype (the keyword is case insensitive, e.g. <!doctype html>)
        $this->setNode('doctype', $this->offsetAfter(array('>')));

        $this->skipWhitespace = true;
        return true;
      }
      if ($this->match('<![CDATA[')) {
        # cdata, use text node

        # remove leading <![CDATA[
        $this->html = (string)substr($this->html, 9);

        $end = strpos($this->html, ']]>');
        if ($end === false) {
          # unterminated section: everything that is left is its content
          $this->setNode('text', strlen($this->html));
        } else {
          $this->setNode('text', $end + 3);
          # remove trailing ]]>
          $this->node = (string)substr($this->node, 0, -3);
        }
        $this->handleWhitespaces();

        $this->skipWhitespace = true;
        return true;
      }
      if ($this->parseTag()) {
        # seems to be a tag
        # whitespace directly after a block element is insignificant
        $this->skipWhitespace = (bool)$this->isBlockElement;
        return true;
      }
      # parseTag() failed and escaped the leading "<", handle it as text
    }
    if ($this->keepWhitespace) {
      $this->skipWhitespace = false;
    }
    # when we get here it seems to be a text node
    $pos = strpos($this->html, '<');
    if ($pos === false) {
      $pos = strlen($this->html);
    }
    $this->setNode('text', $pos);
    $this->handleWhitespaces();
    if ($this->skipWhitespace && $this->node === ' ') {
      # insignificant whitespace, go on with the node after it
      return $this->nextNode();
    }
    $this->skipWhitespace = false;
    return true;
  }
  /**
   * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
   *
   * On failure the leading "<" of $this->html is escaped (see invalidTag())
   * and no other property is touched.
   *
   * @param void
   * @return bool true if $this->html started with a valid, known tag
   */
  function parseTag() {
    static $whitespace = array(' ', "\t", "\n", "\r");

    $length = strlen($this->html);

    $pos = 1;
    $isStartTag = !($length > 1 && $this->html[1] === '/');
    if (!$isStartTag) {
      $pos++;
    }
    # get tagName: letters, digits are allowed after the first letter (h1)
    $tagName = '';
    while ($pos < $length) {
      $char = $this->html[$pos];
      if ($this->isLetter($char) || ($tagName !== '' && $this->isDigit($char))) {
        $tagName .= $char;
        $pos++;
      } else {
        # step back, $pos has to point at the last char of the name
        $pos--;
        break;
      }
    }

    $tagName = strtolower($tagName);
    if ($tagName === '' || !isset($this->blockElements[$tagName])) {
      # something went wrong => invalid tag
      $this->invalidTag();
      return false;
    }
    if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
      # we suppress all HTML tags inside code tags
      $this->invalidTag();
      return false;
    }

    # get tag attributes
    /** TODO: in html 4 attributes do not need to be quoted **/
    $isEmptyTag = false;
    $attributes = array();
    $currAttrib = '';
    while ($pos + 1 < $length) {
      $pos++;
      $char = $this->html[$pos];
      $pair = (string)substr($this->html, $pos, 2);
      # close tag
      if ($char === '>' || $pair === '/>') {
        if ($char === '/') {
          $isEmptyTag = true;
          $pos++;
        }
        break;
      }

      if ($this->isLetter($char) || $char === ':' || $char === '-') {
        # attribute name (":" for xml:lang, "-" for http-equiv)
        $currAttrib .= $char;
      } elseif (in_array($char, $whitespace, true)) {
        # whitespace ends a valueless attribute (<option selected value="x">)
        # unless the next significant char is the "=" of its value
        if ($currAttrib !== '') {
          $next = $pos + 1 + strspn($this->html, " \t\n\r", $pos + 1);
          if ($next >= $length || $this->html[$next] !== '=') {
            $attributes[$currAttrib] = $currAttrib;
            $currAttrib = '';
          }
        }
      } elseif ($pair === '="' || $pair === "='") {
        # get attribute value, it ends at the matching quote
        $quote = $pair[1];
        $start = $pos + 2;
        $end = strpos($this->html, $quote, $start);
        if ($end === false) {
          # unterminated attribute value
          $this->invalidTag();
          return false;
        }
        $attributes[$currAttrib] = (string)substr($this->html, $start, $end - $start);
        $currAttrib = '';
        $pos = $end;
      } else {
        $this->invalidTag();
        return false;
      }
    }
    if ($pos >= $length || $this->html[$pos] !== '>') {
      # input ended before the tag was closed
      $this->invalidTag();
      return false;
    }

    if ($currAttrib !== '') {
      # html 4 allows something like <option selected> instead of <option selected="selected">
      $attributes[$currAttrib] = $currAttrib;
    }
    if (!$isStartTag) {
      if (!empty($attributes) || $tagName !== end($this->openTags)) {
        # end tags must not contain any attributes
        # or maybe we did not expect a different tag to be closed
        $this->invalidTag();
        return false;
      }
      array_pop($this->openTags);
      if (in_array($tagName, $this->preformattedTags, true)) {
        $this->keepWhitespace--;
      }
    }
    $pos++;
    $this->node = (string)substr($this->html, 0, $pos);
    $this->html = (string)substr($this->html, $pos);
    $this->tagName = $tagName;
    $this->tagAttributes = $attributes;
    $this->isStartTag = $isStartTag;
    $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags, true);
    if ($this->isEmptyTag) {
      # might be not well formed
      $this->node = preg_replace('# */? *>$#', ' />', $this->node);
    }
    $this->nodeType = 'tag';
    $this->isBlockElement = $this->blockElements[$tagName];
    return true;
  }
  /**
   * handle invalid tags: escape the leading "<" so that the markup is
   * consumed as plain text by the next text node
   *
   * @param void
   * @return void
   */
  function invalidTag() {
    $this->html = substr_replace($this->html, '&lt;', 0, 1);
  }
  /**
   * update all vars and make $this->html shorter
   *
   * @param string $type see description for $this->nodeType
   * @param int $pos to which position shall we cut?
   * @return void
   */
  function setNode($type, $pos) {
    if ($this->nodeType == 'tag') {
      # set tag specific vars to null
      # $type == tag should not be called here
      # see this::parseTag() for more
      $this->tagName = null;
      $this->tagAttributes = null;
      $this->isStartTag = null;
      $this->isEmptyTag = null;
      $this->isBlockElement = null;
    }
    $this->nodeType = $type;
    # older PHP versions return false instead of '' for empty results
    $this->node = (string)substr($this->html, 0, $pos);
    $this->html = (string)substr($this->html, $pos);
  }
  /**
   * check if $this->html begins with $str (binary safe, case sensitive)
   *
   * @param string $str
   * @return bool
   */
  function match($str) {
    # strncmp instead of ==, which treats numeric strings like "1e1" and "010" as equal
    return strncmp($this->html, $str, strlen($str)) === 0;
  }
  /**
   * offset right behind the first terminator that occurs in $this->html
   *
   * The terminators are tried in the given order; if none of them is found
   * the length of $this->html is returned, i.e. the rest of the input
   * belongs to the current node.
   *
   * @param array<string> $terminators
   * @return int
   */
  function offsetAfter($terminators) {
    foreach ($terminators as $terminator) {
      $pos = strpos($this->html, $terminator);
      if ($pos !== false) {
        return $pos + strlen($terminator);
      }
    }
    return strlen($this->html);
  }
  /**
   * whether $char is an ASCII letter (a-z, A-Z)
   *
   * @param string $char single character
   * @return bool
   */
  function isLetter($char) {
    $ord = ord($char);
    return ($ord >= 65 && $ord <= 90) || ($ord >= 97 && $ord <= 122);
  }
  /**
   * whether $char is an ASCII digit (0-9)
   *
   * @param string $char single character
   * @return bool
   */
  function isDigit($char) {
    $ord = ord($char);
    return $ord >= 48 && $ord <= 57;
  }
  /**
   * truncate whitespaces of the current node
   *
   * @param void
   * @return void
   */
  function handleWhitespaces() {
    if ($this->keepWhitespace) {
      # <pre> or <code> before...
      return;
    }
    # truncate multiple whitespaces to a single one
    $this->node = preg_replace('#\s+#s', ' ', $this->node);
  }
  /**
   * normalize self::node: rebuild the markup of the current tag from
   * tagName, tagAttributes, isStartTag and isEmptyTag.
   * Only call this while the current node is a tag.
   *
   * @param void
   * @return void
   */
  function normalizeNode() {
    $this->node = '<';
    if (!$this->isStartTag) {
      $this->node .= '/'.$this->tagName.'>';
      return;
    }
    $this->node .= $this->tagName;
    foreach ($this->tagAttributes as $name => $value) {
      # values are always written with double quotes, so escape those
      $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
    }
    if ($this->isEmptyTag) {
      $this->node .= ' /';
    }
    $this->node .= '>';
  }
}

/**
 * Re-indent an HTML string: block level tags are put on lines of their own
 * and nested one $indent deeper per open block element, inline content stays
 * on the current line. Content of preformatted tags is left untouched.
 *
 * @param string $html markup to indent
 * @param string $indent optional, whitespace used for one nesting level
 * @param bool $noTagsInCode optional, passed on to parseHTML::$noTagsInCode
 * @return string
 */
function indentHTML($html, $indent = "  ", $noTagsInCode = false) {
  $parser = new parseHTML;
  $parser->noTagsInCode = $noTagsInCode;
  $parser->html = $html;

  $output = '';
  # true while the output is at the start of a fresh line (after a block tag)
  $atLineStart = true;
  # one $indent entry per currently open block element
  $levels = array();

  while ($parser->nextNode()) {
    $isTag = $parser->nodeType === 'tag';
    if ($isTag) {
      $parser->normalizeNode();
    }

    if ($isTag && $parser->isBlockElement) {
      $isPreOrCode = in_array($parser->tagName, array('code', 'pre'));
      $collapse = !$parser->keepWhitespace;

      if ($collapse && !$atLineStart && !$isPreOrCode) {
        # finish the line with inline content before the block tag
        $output = rtrim($output)."\n";
      }
      if ($parser->isStartTag) {
        $output .= implode($levels);
        if (!$parser->isEmptyTag) {
          $levels[] = $indent;
        }
      } else {
        array_pop($levels);
        if (!$isPreOrCode) {
          $output .= implode($levels);
        }
      }
      $output .= $parser->node;
      # no line break right after an opening <pre>, it would become content
      if ($collapse && (!$isPreOrCode || !$parser->isStartTag)) {
        $output .= "\n";
      }
      $atLineStart = true;
      continue;
    }

    if ($isTag && $parser->tagName == 'br') {
      # a line break in the document is a line break in the source as well
      $output .= $parser->node."\n";
      $atLineStart = true;
      continue;
    }

    if ($atLineStart && !$parser->keepWhitespace) {
      $output .= implode($levels);
      $parser->node = ltrim($parser->node);
    }
    $output .= $parser->node;

    if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'))) {
      $output .= "\n";
    } else {
      $atLineStart = false;
    }
  }
  return $output;
}
/*
# testcase / example
error_reporting(E_ALL);

$html = '<p>Simple block on one line:</p>

<div>foo</div>

<p>And nested without indentation:</p>

<div>
<div>
<div>
foo
</div>
<div style=">"/>
</div>
<div>bar</div>
</div>

<p>And with attributes:</p>

<div>
    <div id="foo">
    </div>
</div>

<p>This was broken in 1.0.2b7:</p>

<div class="inlinepage">
<div class="toggleableend">
foo
</div>
</div>';
#$html = '<a href="asdfasdf"       title=\'asdf\' foo="bar">asdf</a>';
echo indentHTML($html);
die();
*/