etc.) * * @var array */ var $emptyTags = array( 'br', 'hr', 'input', 'img', 'area', 'link', 'meta', 'param', ); /** * tags with preformatted text * whitespaces wont be touched in them * * @var array */ var $preformattedTags = array( 'script', 'style', 'pre', 'code', ); /** * supress HTML tags inside preformatted tags (see above) * * @var bool */ var $noTagsInCode = false; /** * html to be parsed * * @var string */ var $html = ''; /** * node type: * * - tag (see isStartTag) * - text (includes cdata) * - comment * - doctype * - pi (processing instruction) * * @var string */ var $nodeType = ''; /** * current node content, i.e. either a * simple string (text node), or something like * * * @var string */ var $node = ''; /** * wether current node is an opening tag () or not () * set to NULL if current node is not a tag * NOTE: empty tags (
) set this to true as well! * * @var bool | null */ var $isStartTag = null; /** * wether current node is an empty tag (
) or not () * * @var bool | null */ var $isEmptyTag = null; /** * tag name * * @var string | null */ var $tagName = ''; /** * attributes of current tag * * @var array (attribName=>value) | null */ var $tagAttributes = null; /** * wether the current tag is a block element * * @var bool | null */ var $isBlockElement = null; /** * keep whitespace * * @var int */ var $keepWhitespace = 0; /** * list of open tags * count this to get current depth * * @var array */ var $openTags = array(); /** * list of block elements * * @var array * TODO: what shall we do with and ?! */ var $blockElements = array ( # tag name => is block # block elements 'address' => true, 'blockquote' => true, 'center' => true, 'del' => true, 'dir' => true, 'div' => true, 'dl' => true, 'fieldset' => true, 'form' => true, 'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true, 'hr' => true, 'ins' => true, 'isindex' => true, 'menu' => true, 'noframes' => true, 'noscript' => true, 'ol' => true, 'p' => true, 'pre' => true, 'table' => true, 'ul' => true, # set table elements and list items to block as well 'thead' => true, 'tbody' => true, 'tfoot' => true, 'td' => true, 'tr' => true, 'th' => true, 'li' => true, 'dd' => true, 'dt' => true, # header items and html / body as well 'html' => true, 'body' => true, 'head' => true, 'meta' => true, 'link' => true, 'style' => true, 'title' => true, # unfancy media tags, when indented should be rendered as block 'map' => true, 'object' => true, 'param' => true, 'embed' => true, 'area' => true, # inline elements 'a' => false, 'abbr' => false, 'acronym' => false, 'applet' => false, 'b' => false, 'basefont' => false, 'bdo' => false, 'big' => false, 'br' => false, 'button' => false, 'cite' => false, 'code' => false, 'del' => false, 'dfn' => false, 'em' => false, 'font' => false, 'i' => false, 'img' => false, 'ins' => false, 'input' => false, 'iframe' => false, 'kbd' => false, 'label' => false, 'q' => false, 'samp' => false, 'script' => false, 
'select' => false, 'small' => false, 'span' => false, 'strong' => false, 'sub' => false, 'sup' => false, 'textarea' => false, 'tt' => false, 'var' => false, );

  /**
   * Advance the parser to the next node of $this->html.
   *
   * $this->html must be set before the first call. Each call hands the
   * leading node of the remaining input to setNode() (or to parseTag() for
   * tags). Recognised node types: pi, comment, doctype, text (including
   * CDATA sections) and tag.
   *
   * If the previous node was a start tag that is not an empty tag, its name
   * is pushed onto $this->openTags first.
   *
   * A text node that is exactly one space is skipped when the skip flag is
   * set. The flag is set after block elements, comments, doctypes and CDATA
   * sections, cleared after inline tags and text nodes, and forced off while
   * $this->keepWhitespace is non-zero. It is a static local, so its value
   * persists between calls.
   *
   * @param void
   * @return bool true if a node was read, false once the input is exhausted
   */
  function nextNode() {
    # compare against '' instead of using empty(): empty() also reports the
    # remaining input "0" as "nothing left" and would silently drop it
    if ((string)$this->html === '') {
      # we are done with parsing the html string
      return false;
    }
    static $skipWhitespace = true;
    if ($this->isStartTag && !$this->isEmptyTag) {
      array_push($this->openTags, $this->tagName);
      if (in_array($this->tagName, $this->preformattedTags, true)) {
        # dont truncate whitespaces inside preformatted tags (<code>, <pre>, ...)
        # NOTE(review): the statement of this branch was not visible in the
        # reviewed source; incrementing the int counter is inferred from the
        # property's type and from the check further down - confirm
        $this->keepWhitespace++;
      }
    }

    # offset that consumes all remaining input; used when a terminator is missing
    $length = strlen($this->html);

    if ($this->html[0] === '<') {
      $token = substr($this->html, 0, 9);
      if (substr($token, 0, 2) === '<?') {
        # processing instruction (e.g. xml prolog)
        $pos = strpos($this->html, '>');
        # strpos() yields false when there is no '>'; false + 1 would be 1
        $this->setNode('pi', $pos === false ? $length : $pos + 1);
        return true;
      }
      if (substr($token, 0, 4) === '<!--') {
        # comment
        $pos = strpos($this->html, '-->');
        if ($pos === false) {
          # could not find a closing -->, use next gt instead
          # this is firefox' behaviour
          $pos = strpos($this->html, '>');
          $pos = ($pos === false) ? $length : $pos + 1;
        } else {
          $pos += 3;
        }
        $this->setNode('comment', $pos);

        $skipWhitespace = true;
        return true;
      }
      # the doctype keyword is case-insensitive in HTML (<!doctype html>)
      if (strtoupper($token) === '<!DOCTYPE') {
        $pos = strpos($this->html, '>');
        $this->setNode('doctype', $pos === false ? $length : $pos + 1);

        $skipWhitespace = true;
        return true;
      }
      if ($token === '<![CDATA[') {
        # cdata is exposed as a text node

        # remove leading <![CDATA[ (9 bytes)
        $this->html = (string)substr($this->html, 9);

        $pos = strpos($this->html, ']]>');
        if ($pos === false) {
          # unterminated section: everything that is left is its content,
          # and there is no trailing ]]> to strip
          $this->setNode('text', strlen($this->html));
        } else {
          $this->setNode('text', $pos + 3);
          # remove trailing ]]>
          $this->node = substr($this->node, 0, -3);
        }
        # NOTE(review): the original comment here also mentions trimming, but
        # no trim call is visible in this block - confirm whether whitespace
        # normalisation is expected at this point

        $skipWhitespace = true;
        return true;
      }
      if ($this->parseTag()) {
        # seems to be a tag
        # whitespace directly after a block element may be skipped,
        # whitespace after an inline element is significant
        $skipWhitespace = $this->isBlockElement ? true : false;
        return true;
      }
    }
    if ($this->keepWhitespace) {
      $skipWhitespace = false;
    }
    # when we get here it seems to be a text node
    $pos = strpos($this->html, '<');
    if ($pos === 0) {
      # the leading '<' was not accepted as markup above; treat it as literal
      # text and cut at the next '<' instead. Offset 0 would presumably give
      # a zero-length node and make no progress (setNode() is not visible here)
      $pos = strpos($this->html, '<', 1);
    }
    if ($pos === false) {
      $pos = $length;
    }
    $this->setNode('text', $pos);
    if ($skipWhitespace && $this->node === ' ') {
      # insignificant single space: drop it and deliver the following node
      return $this->nextNode();
    }
    $skipWhitespace = false;
    return true;
  }
  /**
   * Parse the tag at the start of $this->html: determine the tag name and
   * its attributes, and whether it is a closing tag.
   *
   * @param void
   * @return bool false if the input does not start with a valid tag
   */
  function parseTag() {
    static $a_ord, $z_ord, $special_ords;
    if (!isset($a_ord)) {
      $a_ord = ord('a');
      $z_ord = ord('z');
      $special_ords = array(
        ord(':'), // for xml:lang
        ord('-'), // for http-equiv

    $tagName = '';

    $pos = 1;
    $isStartTag = $this->html[$pos] != '/';
    if (!$isStartTag) {
    # get tagName
    while (isset($this->html[$pos])) {
      $pos_ord = ord(strtolower($this->html[$pos]));
      if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
        $tagName .= $this->html[$pos];
      } else {

    $tagName = strtolower($tagName);
    if (empty($tagName) || !isset($this->blockElements[$tagName])) {
      # something went wrong => invalid tag
      return false;
    if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
      # we supress all HTML tags inside code tags
      return false;

    # get tag attributes
    /** TODO: in html 4 attributes do not need to be quoted **/
    $isEmptyTag = false;
    $attributes = array();
    $currAttrib = '';
    while (isset($this->html[$pos+1])) {
      # close tag
      if ($this->html[$pos] == '>' || $this->html[$pos].$this->html[$pos+1] == '/>') {
        if ($this->html[$pos] == '/') {
          $isEmptyTag = true;

      $pos_ord = ord(strtolower($this->html[$pos]));
      if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
        # attribute name
        $currAttrib .= $this->html[$pos];
      } elseif (in_array($this->html[$pos], array(' ', "\t", "\n"))) {
        # drop whitespace
      } elseif (in_array($this->html[$pos].$this->html[$pos+1], array('="', "='"))) {
        # get attribute value
        $await = $this->html[$pos]; # single or double quote
        $value = '';
        while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
          $value .= $this->html[$pos];
        $attributes[$currAttrib] = $value;
        $currAttrib = '';
      } else {
        return false;
    if ($this->html[$pos] != '>') {
      return false;

    if (!empty($currAttrib)) {
      # html 4 allows something like