or contents
$this->keepWhitespace++;
}
}
if ($this->html[0] == '<') {
$token = substr($this->html, 0, 9);
if (substr($token, 0, 2) == '') {
# xml prolog or other pi's
/** TODO **/
#trigger_error('this might need some work', E_USER_NOTICE);
$pos = strpos($this->html, '>');
$this->setNode('pi', $pos + 1);
return true;
}
if (substr($token, 0, 4) == '');
if ($pos === false) {
# could not find a closing -->, use next gt instead
# this is firefox' behaviour
$pos = strpos($this->html, '>') + 1;
} else {
$pos += 3;
}
$this->setNode('comment', $pos);
$skipWhitespace = true;
return true;
}
if ($token == 'setNode('doctype', strpos($this->html, '>')+1);
$skipWhitespace = true;
return true;
}
if ($token == 'html = substr($this->html, 9);
$this->setNode('text', strpos($this->html, ']]>')+3);
# remove trailing ]]> and trim
$this->node = substr($this->node, 0, -3);
$this->handleWhitespaces();
$skipWhitespace = true;
return true;
}
if ($this->parseTag()) {
# seems to be a tag
# handle whitespaces
if ($this->isBlockElement) {
$skipWhitespace = true;
} else {
$skipWhitespace = false;
}
return true;
}
}
if ($this->keepWhitespace) {
$skipWhitespace = false;
}
# when we get here it seems to be a text node
$pos = strpos($this->html, '<');
if ($pos === false) {
$pos = strlen($this->html);
}
$this->setNode('text', $pos);
$this->handleWhitespaces();
if ($skipWhitespace && $this->node == ' ') {
return $this->nextNode();
}
$skipWhitespace = false;
return true;
}
/**
* parse tag, set tag name and attributes, see if it's a closing tag and so forth...
*
* @param void
* @return bool
*/
function parseTag() {
static $a_ord, $z_ord, $special_ords;
if (!isset($a_ord)) {
$a_ord = ord('a');
$z_ord = ord('z');
$special_ords = array(
ord(':'), // for xml:lang
ord('-'), // for http-equiv
);
}
$tagName = '';
$pos = 1;
$isStartTag = $this->html[$pos] != '/';
if (!$isStartTag) {
$pos++;
}
# get tagName
while (isset($this->html[$pos])) {
$pos_ord = ord(strtolower($this->html[$pos]));
if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
$tagName .= $this->html[$pos];
$pos++;
} else {
$pos--;
break;
}
}
$tagName = strtolower($tagName);
if (empty($tagName) || !isset($this->blockElements[$tagName])) {
# something went wrong => invalid tag
$this->invalidTag();
return false;
}
if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
# we supress all HTML tags inside code tags
$this->invalidTag();
return false;
}
# get tag attributes
/** TODO: in html 4 attributes do not need to be quoted **/
$isEmptyTag = false;
$attributes = array();
$currAttrib = '';
while (isset($this->html[$pos+1])) {
$pos++;
# close tag
if ($this->html[$pos] == '>' || $this->html[$pos].$this->html[$pos+1] == '/>') {
if ($this->html[$pos] == '/') {
$isEmptyTag = true;
$pos++;
}
break;
}
$pos_ord = ord(strtolower($this->html[$pos]));
if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
# attribute name
$currAttrib .= $this->html[$pos];
} elseif (in_array($this->html[$pos], array(' ', "\t", "\n"))) {
# drop whitespace
} elseif (in_array($this->html[$pos].$this->html[$pos+1], array('="', "='"))) {
# get attribute value
$pos++;
$await = $this->html[$pos]; # single or double quote
$pos++;
$value = '';
while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
$value .= $this->html[$pos];
$pos++;
}
$attributes[$currAttrib] = $value;
$currAttrib = '';
} else {
$this->invalidTag();
return false;
}
}
if ($this->html[$pos] != '>') {
$this->invalidTag();
return false;
}
if (!empty($currAttrib)) {
# html 4 allows something like