mirror of
https://github.com/friendica/friendica
synced 2024-10-13 02:24:34 +02:00
3716 lines
163 KiB
PHP
3716 lines
163 KiB
PHP
<?php
|
|
|
|
/*
|
|
|
|
Copyright 2007 Jeroen van der Meer <http://jero.net/>
|
|
Copyright 2009 Edward Z. Yang <edwardzyang@thewritingpot.com>
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a
|
|
copy of this software and associated documentation files (the
|
|
"Software"), to deal in the Software without restriction, including
|
|
without limitation the rights to use, copy, modify, merge, publish,
|
|
distribute, sublicense, and/or sell copies of the Software, and to
|
|
permit persons to whom the Software is furnished to do so, subject to
|
|
the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included
|
|
in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
// Tags for FIX ME!!!: (in order of priority)
|
|
// XXX - should be fixed NAO!
|
|
// XERROR - with regards to parse errors
|
|
// XSCRIPT - with regards to scripting mode
|
|
// XENCODING - with regards to encoding (for reparsing tests)
|
|
|
|
class HTML5_TreeBuilder {
|
|
public $stack = array();
|
|
public $content_model;
|
|
|
|
private $mode;
|
|
private $original_mode;
|
|
private $secondary_mode;
|
|
private $dom;
|
|
// Whether or not normal insertion of nodes should actually foster
|
|
// parent (used in one case in spec)
|
|
private $foster_parent = false;
|
|
private $a_formatting = array();
|
|
|
|
private $head_pointer = null;
|
|
private $form_pointer = null;
|
|
|
|
private $flag_frameset_ok = true;
|
|
private $flag_force_quirks = false;
|
|
private $ignored = false;
|
|
private $quirks_mode = null;
|
|
// this gets to 2 when we want to ignore the next lf character, and
|
|
// is decremented at the beginning of each processed token (this way,
|
|
// code can check for (bool)$ignore_lf_token, but it phases out
|
|
// appropriately)
|
|
private $ignore_lf_token = 0;
|
|
private $fragment = false;
|
|
private $root;
|
|
|
|
private $scoping = array('applet','button','caption','html','marquee','object','table','td','th', 'svg:foreignObject');
|
|
private $formatting = array('a','b','big','code','em','font','i','nobr','s','small','strike','strong','tt','u');
|
|
private $special = array('address','area','article','aside','base','basefont','bgsound',
|
|
'blockquote','body','br','center','col','colgroup','command','dd','details','dialog','dir','div','dl',
|
|
'dt','embed','fieldset','figure','footer','form','frame','frameset','h1','h2','h3','h4','h5',
|
|
'h6','head','header','hgroup','hr','iframe','img','input','isindex','li','link',
|
|
'listing','menu','meta','nav','noembed','noframes','noscript','ol',
|
|
'p','param','plaintext','pre','script','select','spacer','style',
|
|
'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
|
|
|
|
// Tree construction modes
|
|
const INITIAL = 0;
|
|
const BEFORE_HTML = 1;
|
|
const BEFORE_HEAD = 2;
|
|
const IN_HEAD = 3;
|
|
const IN_HEAD_NOSCRIPT = 4;
|
|
const AFTER_HEAD = 5;
|
|
const IN_BODY = 6;
|
|
const IN_CDATA_RCDATA = 7;
|
|
const IN_TABLE = 8;
|
|
const IN_CAPTION = 9;
|
|
const IN_COLUMN_GROUP = 10;
|
|
const IN_TABLE_BODY = 11;
|
|
const IN_ROW = 12;
|
|
const IN_CELL = 13;
|
|
const IN_SELECT = 14;
|
|
const IN_SELECT_IN_TABLE= 15;
|
|
const IN_FOREIGN_CONTENT= 16;
|
|
const AFTER_BODY = 17;
|
|
const IN_FRAMESET = 18;
|
|
const AFTER_FRAMESET = 19;
|
|
const AFTER_AFTER_BODY = 20;
|
|
const AFTER_AFTER_FRAMESET = 21;
|
|
|
|
/**
 * Converts a magic number to a readable name. Use for debugging.
 *
 * The value-to-name table is built on first use by reflecting over the
 * constants of HTML5_TreeBuilder, and is then kept in a static variable
 * for subsequent calls.
 *
 * @param int|string $number Constant value to look up (e.g. self::IN_BODY).
 * @return string|null Name of the class constant holding that value, or
 *                     null when no constant has it.
 */
private function strConst($number) {
    static $lookup;
    if (!$lookup) {
        $r = new ReflectionClass('HTML5_TreeBuilder');
        $lookup = array();
        foreach ($r->getConstants() as $name => $value) {
            // array_flip() only accepts int and string values and warns
            // on anything else; NS_HTML is null, so build the reverse
            // map by hand and skip values that cannot be array keys.
            if (is_int($value) || is_string($value)) {
                $lookup[$value] = $name;
            }
        }
    }
    // Unknown values yield null without raising an undefined-index notice.
    return isset($lookup[$number]) ? $lookup[$number] : null;
}
|
|
|
|
// The different types of elements.
|
|
const SPECIAL = 100;
|
|
const SCOPING = 101;
|
|
const FORMATTING = 102;
|
|
const PHRASING = 103;
|
|
|
|
// Quirks modes in $quirks_mode
|
|
const NO_QUIRKS = 200;
|
|
const QUIRKS_MODE = 201;
|
|
const LIMITED_QUIRKS_MODE = 202;
|
|
|
|
// Marker to be placed in $a_formatting
|
|
const MARKER = 300;
|
|
|
|
// Namespaces for foreign content
|
|
const NS_HTML = null; // to prevent DOM from requiring NS on everything
|
|
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
|
|
const NS_SVG = 'http://www.w3.org/2000/svg';
|
|
const NS_XLINK = 'http://www.w3.org/1999/xlink';
|
|
const NS_XML = 'http://www.w3.org/XML/1998/namespace';
|
|
const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
|
|
|
|
/**
 * Puts the builder in the INITIAL insertion mode and creates the
 * DOMDocument that emitted tokens are appended to.
 */
public function __construct() {
    // Configure the document through a local handle before storing it.
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->mode = self::INITIAL;
}
|
|
|
|
// Process tag tokens
|
|
public function emitToken($token, $mode = null) {
|
|
// XXX: ignore parse errors... why are we emitting them, again?
|
|
if ($token['type'] === HTML5_Tokenizer::PARSEERROR) return;
|
|
if ($mode === null) $mode = $this->mode;
|
|
|
|
/*
|
|
$backtrace = debug_backtrace();
|
|
if ($backtrace[1]['class'] !== 'HTML5_TreeBuilder') echo "--\n";
|
|
echo $this->strConst($mode);
|
|
if ($this->original_mode) echo " (originally ".$this->strConst($this->original_mode).")";
|
|
echo "\n ";
|
|
token_dump($token);
|
|
$this->printStack();
|
|
$this->printActiveFormattingElements();
|
|
if ($this->foster_parent) echo " -> this is a foster parent mode\n";
|
|
*/
|
|
|
|
if ($this->ignore_lf_token) $this->ignore_lf_token--;
|
|
$this->ignored = false;
|
|
// indenting is a little wonky, this can be changed later on
|
|
switch ($mode) {
|
|
|
|
case self::INITIAL:
|
|
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 SPACE */
|
|
if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
/* Ignore the token. */
|
|
$this->ignored = true;
|
|
} elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
if (
|
|
$token['name'] !== 'html' || !empty($token['public']) ||
|
|
!empty($token['system']) || $token !== 'about:legacy-compat'
|
|
) {
|
|
/* If the DOCTYPE token's name is not a case-sensitive match
|
|
* for the string "html", or if the token's public identifier
|
|
* is not missing, or if the token's system identifier is
|
|
* neither missing nor a case-sensitive match for the string
|
|
* "about:legacy-compat", then there is a parse error (this
|
|
* is the DOCTYPE parse error). */
|
|
// DOCTYPE parse error
|
|
}
|
|
/* Append a DocumentType node to the Document node, with the name
|
|
* attribute set to the name given in the DOCTYPE token, or the
|
|
* empty string if the name was missing; the publicId attribute
|
|
* set to the public identifier given in the DOCTYPE token, or
|
|
* the empty string if the public identifier was missing; the
|
|
* systemId attribute set to the system identifier given in the
|
|
* DOCTYPE token, or the empty string if the system identifier
|
|
* was missing; and the other attributes specific to
|
|
* DocumentType objects set to null and empty lists as
|
|
* appropriate. Associate the DocumentType node with the
|
|
* Document object so that it is returned as the value of the
|
|
* doctype attribute of the Document object. */
|
|
if (!isset($token['public'])) $token['public'] = null;
|
|
if (!isset($token['system'])) $token['system'] = null;
|
|
// Yes this is hacky. I'm kind of annoyed that I can't appendChild
|
|
// a doctype to DOMDocument. Maybe I haven't chanted the right
|
|
// syllables.
|
|
$impl = new DOMImplementation();
|
|
// This call can fail for particularly pathological cases (namely,
|
|
// the qualifiedName parameter ($token['name']) could be missing).
|
|
if ($token['name']) {
|
|
$doctype = $impl->createDocumentType($token['name'], $token['public'], $token['system']);
|
|
$this->dom->appendChild($doctype);
|
|
} else {
|
|
// It looks like libxml's not actually *able* to express this case.
|
|
// So... don't.
|
|
$this->dom->emptyDoctype = true;
|
|
}
|
|
$public = is_null($token['public']) ? false : strtolower($token['public']);
|
|
$system = is_null($token['system']) ? false : strtolower($token['system']);
|
|
$publicStartsWithForQuirks = array(
|
|
"+//silmaril//dtd html pro v0r11 19970101//",
|
|
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
|
|
"-//as//dtd html 3.0 aswedit + extensions//",
|
|
"-//ietf//dtd html 2.0 level 1//",
|
|
"-//ietf//dtd html 2.0 level 2//",
|
|
"-//ietf//dtd html 2.0 strict level 1//",
|
|
"-//ietf//dtd html 2.0 strict level 2//",
|
|
"-//ietf//dtd html 2.0 strict//",
|
|
"-//ietf//dtd html 2.0//",
|
|
"-//ietf//dtd html 2.1e//",
|
|
"-//ietf//dtd html 3.0//",
|
|
"-//ietf//dtd html 3.2 final//",
|
|
"-//ietf//dtd html 3.2//",
|
|
"-//ietf//dtd html 3//",
|
|
"-//ietf//dtd html level 0//",
|
|
"-//ietf//dtd html level 1//",
|
|
"-//ietf//dtd html level 2//",
|
|
"-//ietf//dtd html level 3//",
|
|
"-//ietf//dtd html strict level 0//",
|
|
"-//ietf//dtd html strict level 1//",
|
|
"-//ietf//dtd html strict level 2//",
|
|
"-//ietf//dtd html strict level 3//",
|
|
"-//ietf//dtd html strict//",
|
|
"-//ietf//dtd html//",
|
|
"-//metrius//dtd metrius presentational//",
|
|
"-//microsoft//dtd internet explorer 2.0 html strict//",
|
|
"-//microsoft//dtd internet explorer 2.0 html//",
|
|
"-//microsoft//dtd internet explorer 2.0 tables//",
|
|
"-//microsoft//dtd internet explorer 3.0 html strict//",
|
|
"-//microsoft//dtd internet explorer 3.0 html//",
|
|
"-//microsoft//dtd internet explorer 3.0 tables//",
|
|
"-//netscape comm. corp.//dtd html//",
|
|
"-//netscape comm. corp.//dtd strict html//",
|
|
"-//o'reilly and associates//dtd html 2.0//",
|
|
"-//o'reilly and associates//dtd html extended 1.0//",
|
|
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
|
|
"-//spyglass//dtd html 2.0 extended//",
|
|
"-//sq//dtd html 2.0 hotmetal + extensions//",
|
|
"-//sun microsystems corp.//dtd hotjava html//",
|
|
"-//sun microsystems corp.//dtd hotjava strict html//",
|
|
"-//w3c//dtd html 3 1995-03-24//",
|
|
"-//w3c//dtd html 3.2 draft//",
|
|
"-//w3c//dtd html 3.2 final//",
|
|
"-//w3c//dtd html 3.2//",
|
|
"-//w3c//dtd html 3.2s draft//",
|
|
"-//w3c//dtd html 4.0 frameset//",
|
|
"-//w3c//dtd html 4.0 transitional//",
|
|
"-//w3c//dtd html experimental 19960712//",
|
|
"-//w3c//dtd html experimental 970421//",
|
|
"-//w3c//dtd w3 html//",
|
|
"-//w3o//dtd w3 html 3.0//",
|
|
"-//webtechs//dtd mozilla html 2.0//",
|
|
"-//webtechs//dtd mozilla html//",
|
|
);
|
|
$publicSetToForQuirks = array(
|
|
"-//w3o//dtd w3 html strict 3.0//",
|
|
"-/w3c/dtd html 4.0 transitional/en",
|
|
"html",
|
|
);
|
|
$publicStartsWithAndSystemForQuirks = array(
|
|
"-//w3c//dtd html 4.01 frameset//",
|
|
"-//w3c//dtd html 4.01 transitional//",
|
|
);
|
|
$publicStartsWithForLimitedQuirks = array(
|
|
"-//w3c//dtd xhtml 1.0 frameset//",
|
|
"-//w3c//dtd xhtml 1.0 transitional//",
|
|
);
|
|
$publicStartsWithAndSystemForLimitedQuirks = array(
|
|
"-//w3c//dtd html 4.01 frameset//",
|
|
"-//w3c//dtd html 4.01 transitional//",
|
|
);
|
|
// first, do easy checks
|
|
if (
|
|
!empty($token['force-quirks']) ||
|
|
strtolower($token['name']) !== 'html'
|
|
) {
|
|
$this->quirks_mode = self::QUIRKS_MODE;
|
|
} else {
|
|
do {
|
|
if ($system) {
|
|
foreach ($publicStartsWithAndSystemForQuirks as $x) {
|
|
if (strncmp($public, $x, strlen($x)) === 0) {
|
|
$this->quirks_mode = self::QUIRKS_MODE;
|
|
break;
|
|
}
|
|
}
|
|
if (!is_null($this->quirks_mode)) break;
|
|
foreach ($publicStartsWithAndSystemForLimitedQuirks as $x) {
|
|
if (strncmp($public, $x, strlen($x)) === 0) {
|
|
$this->quirks_mode = self::LIMITED_QUIRKS_MODE;
|
|
break;
|
|
}
|
|
}
|
|
if (!is_null($this->quirks_mode)) break;
|
|
}
|
|
foreach ($publicSetToForQuirks as $x) {
|
|
if ($public === $x) {
|
|
$this->quirks_mode = self::QUIRKS_MODE;
|
|
break;
|
|
}
|
|
}
|
|
if (!is_null($this->quirks_mode)) break;
|
|
foreach ($publicStartsWithForLimitedQuirks as $x) {
|
|
if (strncmp($public, $x, strlen($x)) === 0) {
|
|
$this->quirks_mode = self::LIMITED_QUIRKS_MODE;
|
|
}
|
|
}
|
|
if (!is_null($this->quirks_mode)) break;
|
|
if ($system === "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
|
|
$this->quirks_mode = self::QUIRKS_MODE;
|
|
break;
|
|
}
|
|
foreach ($publicStartsWithForQuirks as $x) {
|
|
if (strncmp($public, $x, strlen($x)) === 0) {
|
|
$this->quirks_mode = self::QUIRKS_MODE;
|
|
break;
|
|
}
|
|
}
|
|
if (is_null($this->quirks_mode)) {
|
|
$this->quirks_mode = self::NO_QUIRKS;
|
|
}
|
|
} while (false);
|
|
}
|
|
$this->mode = self::BEFORE_HTML;
|
|
} else {
|
|
// parse error
|
|
/* Switch the insertion mode to "before html", then reprocess the
|
|
* current token. */
|
|
$this->mode = self::BEFORE_HTML;
|
|
$this->quirks_mode = self::QUIRKS_MODE;
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::BEFORE_HTML:
|
|
|
|
/* A DOCTYPE token */
|
|
if($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// Parse error. Ignore the token.
|
|
$this->ignored = true;
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the Document object with the data
|
|
attribute set to the data given in the comment token. */
|
|
$comment = $this->dom->createComment($token['data']);
|
|
$this->dom->appendChild($comment);
|
|
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
|
or U+0020 SPACE */
|
|
} elseif($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
/* Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
/* A start tag whose tag name is "html" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] == 'html') {
|
|
/* Create an element for the token in the HTML namespace. Append it
|
|
* to the Document object. Put this element in the stack of open
|
|
* elements. */
|
|
$html = $this->insertElement($token, false);
|
|
$this->dom->appendChild($html);
|
|
$this->stack[] = $html;
|
|
|
|
$this->mode = self::BEFORE_HEAD;
|
|
|
|
} else {
|
|
/* Create an html element. Append it to the Document object. Put
|
|
* this element in the stack of open elements. */
|
|
$html = $this->dom->createElementNS(self::NS_HTML, 'html');
|
|
$this->dom->appendChild($html);
|
|
$this->stack[] = $html;
|
|
|
|
/* Switch the insertion mode to "before head", then reprocess the
|
|
* current token. */
|
|
$this->mode = self::BEFORE_HEAD;
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::BEFORE_HEAD:
|
|
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
|
or U+0020 SPACE */
|
|
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
/* Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the current node with the data attribute
|
|
set to the data given in the comment token. */
|
|
$this->insertComment($token['data']);
|
|
|
|
/* A DOCTYPE token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
/* Parse error. Ignore the token */
|
|
$this->ignored = true;
|
|
// parse error
|
|
|
|
/* A start tag token with the tag name "html" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
|
/* Process the token using the rules for the "in body"
|
|
* insertion mode. */
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
|
|
/* A start tag token with the tag name "head" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') {
|
|
/* Insert an HTML element for the token. */
|
|
$element = $this->insertElement($token);
|
|
|
|
/* Set the head element pointer to this new element node. */
|
|
$this->head_pointer = $element;
|
|
|
|
/* Change the insertion mode to "in head". */
|
|
$this->mode = self::IN_HEAD;
|
|
|
|
/* An end tag whose tag name is one of: "head", "body", "html", "br" */
|
|
} elseif(
|
|
$token['type'] === HTML5_Tokenizer::ENDTAG && (
|
|
$token['name'] === 'head' || $token['name'] === 'body' ||
|
|
$token['name'] === 'html' || $token['name'] === 'br'
|
|
)) {
|
|
/* Act as if a start tag token with the tag name "head" and no
|
|
* attributes had been seen, then reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'head',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
$this->emitToken($token);
|
|
|
|
/* Any other end tag */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG) {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
} else {
|
|
/* Act as if a start tag token with the tag name "head" and no
|
|
* attributes had been seen, then reprocess the current token.
|
|
* Note: This will result in an empty head element being
|
|
* generated, with the current token being reprocessed in the
|
|
* "after head" insertion mode. */
|
|
$this->emitToken(array(
|
|
'name' => 'head',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::IN_HEAD:
|
|
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
|
or U+0020 SPACE. */
|
|
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
/* Insert the character into the current node. */
|
|
$this->insertText($token['data']);
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the current node with the data attribute
|
|
set to the data given in the comment token. */
|
|
$this->insertComment($token['data']);
|
|
|
|
/* A DOCTYPE token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
// parse error
|
|
|
|
/* A start tag whose tag name is "html" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'html') {
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
|
|
/* A start tag whose tag name is one of: "base", "command", "link" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
($token['name'] === 'base' || $token['name'] === 'command' ||
|
|
$token['name'] === 'link')) {
|
|
/* Insert an HTML element for the token. Immediately pop the
|
|
* current node off the stack of open elements. */
|
|
$this->insertElement($token);
|
|
array_pop($this->stack);
|
|
|
|
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
|
|
|
/* A start tag whose tag name is "meta" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'meta') {
|
|
/* Insert an HTML element for the token. Immediately pop the
|
|
* current node off the stack of open elements. */
|
|
$this->insertElement($token);
|
|
array_pop($this->stack);
|
|
|
|
// XERROR: Acknowledge the token's self-closing flag, if it is set.
|
|
|
|
// XENCODING: If the element has a charset attribute, and its value is a
|
|
// supported encoding, and the confidence is currently tentative,
|
|
// then change the encoding to the encoding given by the value of
|
|
// the charset attribute.
|
|
//
|
|
// Otherwise, if the element has a content attribute, and applying
|
|
// the algorithm for extracting an encoding from a Content-Type to
|
|
// its value returns a supported encoding encoding, and the
|
|
// confidence is currently tentative, then change the encoding to
|
|
// the encoding encoding.
|
|
|
|
/* A start tag with the tag name "title" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'title') {
|
|
$this->insertRCDATAElement($token);
|
|
|
|
/* A start tag whose tag name is "noscript", if the scripting flag is enabled, or
|
|
* A start tag whose tag name is one of: "noframes", "style" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
($token['name'] === 'noscript' || $token['name'] === 'noframes' || $token['name'] === 'style')) {
|
|
// XSCRIPT: Scripting flag not respected
|
|
$this->insertCDATAElement($token);
|
|
|
|
// XSCRIPT: Scripting flag disable not implemented
|
|
|
|
/* A start tag with the tag name "script" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') {
|
|
/* 1. Create an element for the token in the HTML namespace. */
|
|
$node = $this->insertElement($token, false);
|
|
|
|
/* 2. Mark the element as being "parser-inserted" */
|
|
// Uhhh... XSCRIPT
|
|
|
|
/* 3. If the parser was originally created for the HTML
|
|
* fragment parsing algorithm, then mark the script element as
|
|
* "already executed". (fragment case) */
|
|
// ditto... XSCRIPT
|
|
|
|
/* 4. Append the new element to the current node and push it onto
|
|
* the stack of open elements. */
|
|
end($this->stack)->appendChild($node);
|
|
$this->stack[] = $node;
|
|
// I guess we could squash these together
|
|
|
|
/* 6. Let the original insertion mode be the current insertion mode. */
|
|
$this->original_mode = $this->mode;
|
|
/* 7. Switch the insertion mode to "in CDATA/RCDATA" */
|
|
$this->mode = self::IN_CDATA_RCDATA;
|
|
/* 5. Switch the tokeniser's content model flag to the CDATA state. */
|
|
$this->content_model = HTML5_Tokenizer::CDATA;
|
|
|
|
/* An end tag with the tag name "head" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'head') {
|
|
/* Pop the current node (which will be the head element) off the stack of open elements. */
|
|
array_pop($this->stack);
|
|
|
|
/* Change the insertion mode to "after head". */
|
|
$this->mode = self::AFTER_HEAD;
|
|
|
|
// Slight logic inversion here to minimize duplication
|
|
/* A start tag with the tag name "head". */
|
|
/* An end tag whose tag name is not one of: "body", "html", "br" */
|
|
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') ||
|
|
($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] !== 'html' &&
|
|
$token['name'] !== 'body' && $token['name'] !== 'br')) {
|
|
// Parse error. Ignore the token.
|
|
$this->ignored = true;
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Act as if an end tag token with the tag name "head" had been
|
|
* seen, and reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'head',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
/* Then, reprocess the current token. */
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::IN_HEAD_NOSCRIPT:
|
|
if ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// parse error
|
|
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'noscript') {
|
|
/* Pop the current node (which will be a noscript element) from the
|
|
* stack of open elements; the new current node will be a head
|
|
* element. */
|
|
array_pop($this->stack);
|
|
$this->mode = self::IN_HEAD;
|
|
} elseif (
|
|
($token['type'] === HTML5_Tokenizer::SPACECHARACTER) ||
|
|
($token['type'] === HTML5_Tokenizer::COMMENT) ||
|
|
($token['type'] === HTML5_Tokenizer::STARTTAG && (
|
|
$token['name'] === 'link' || $token['name'] === 'meta' ||
|
|
$token['name'] === 'noframes' || $token['name'] === 'style'))) {
|
|
$this->processWithRulesFor($token, self::IN_HEAD);
|
|
// inverted logic
|
|
} elseif (
|
|
($token['type'] === HTML5_Tokenizer::STARTTAG && (
|
|
$token['name'] === 'head' || $token['name'] === 'noscript')) ||
|
|
($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] !== 'br')) {
|
|
// parse error
|
|
} else {
|
|
// parse error
|
|
$this->emitToken(array(
|
|
'type' => HTML5_Tokenizer::ENDTAG,
|
|
'name' => 'noscript',
|
|
));
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::AFTER_HEAD:
|
|
/* Handle the token as follows: */
|
|
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
|
or U+0020 SPACE */
|
|
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
/* Append the character to the current node. */
|
|
$this->insertText($token['data']);
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the current node with the data attribute
|
|
set to the data given in the comment token. */
|
|
$this->insertComment($token['data']);
|
|
|
|
} elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// parse error
|
|
|
|
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
|
|
/* A start tag token with the tag name "body" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'body') {
|
|
$this->insertElement($token);
|
|
|
|
/* Set the frameset-ok flag to "not ok". */
|
|
$this->flag_frameset_ok = false;
|
|
|
|
/* Change the insertion mode to "in body". */
|
|
$this->mode = self::IN_BODY;
|
|
|
|
/* A start tag token with the tag name "frameset" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'frameset') {
|
|
/* Insert a frameset element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* Change the insertion mode to "in frameset". */
|
|
$this->mode = self::IN_FRAMESET;
|
|
|
|
/* A start tag token whose tag name is one of: "base", "link", "meta",
|
|
"noframes", "script", "style", "title" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
|
array('base', 'link', 'meta', 'noframes', 'script', 'style', 'title'))) {
|
|
// parse error
|
|
/* Push the node pointed to by the head element pointer onto the
|
|
* stack of open elements. */
|
|
$this->stack[] = $this->head_pointer;
|
|
$this->processWithRulesFor($token, self::IN_HEAD);
|
|
array_splice($this->stack, array_search($this->head_pointer, $this->stack, true), 1);
|
|
|
|
// inversion of specification
|
|
} elseif(
|
|
($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') ||
|
|
($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] !== 'body' && $token['name'] !== 'html' &&
|
|
$token['name'] !== 'br')) {
|
|
// parse error
|
|
|
|
/* Anything else */
|
|
} else {
|
|
$this->emitToken(array(
|
|
'name' => 'body',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
$this->flag_frameset_ok = true;
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::IN_BODY:
|
|
/* Handle the token as follows: */
|
|
|
|
switch($token['type']) {
|
|
/* A character token */
|
|
case HTML5_Tokenizer::CHARACTER:
|
|
case HTML5_Tokenizer::SPACECHARACTER:
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Append the token's character to the current node. */
|
|
$this->insertText($token['data']);
|
|
|
|
/* If the token is not one of U+0009 CHARACTER TABULATION,
|
|
* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020
|
|
* SPACE, then set the frameset-ok flag to "not ok". */
|
|
// i.e., if any of the characters is not whitespace
|
|
if (strlen($token['data']) !== strspn($token['data'], HTML5_Tokenizer::WHITESPACE)) {
|
|
$this->flag_frameset_ok = false;
|
|
}
|
|
break;
|
|
|
|
/* A comment token */
|
|
case HTML5_Tokenizer::COMMENT:
|
|
/* Append a Comment node to the current node with the data
|
|
attribute set to the data given in the comment token. */
|
|
$this->insertComment($token['data']);
|
|
break;
|
|
|
|
case HTML5_Tokenizer::DOCTYPE:
|
|
// parse error
|
|
break;
|
|
|
|
case HTML5_Tokenizer::STARTTAG:
|
|
switch($token['name']) {
|
|
case 'html':
|
|
// parse error
|
|
/* For each attribute on the token, check to see if the
|
|
* attribute is already present on the top element of the
|
|
* stack of open elements. If it is not, add the attribute
|
|
* and its corresponding value to that element. */
|
|
foreach($token['attr'] as $attr) {
|
|
if(!$this->stack[0]->hasAttribute($attr['name'])) {
|
|
$this->stack[0]->setAttribute($attr['name'], $attr['value']);
|
|
}
|
|
}
|
|
break;
|
|
|
|
case 'base': case 'command': case 'link': case 'meta': case 'noframes':
|
|
case 'script': case 'style': case 'title':
|
|
/* Process the token as if the insertion mode had been "in
|
|
head". */
|
|
$this->processWithRulesFor($token, self::IN_HEAD);
|
|
break;
|
|
|
|
/* A start tag token with the tag name "body" */
|
|
case 'body':
|
|
/* Parse error. If the second element on the stack of open
|
|
elements is not a body element, or, if the stack of open
|
|
elements has only one node on it, then ignore the token.
|
|
(fragment case) */
|
|
if(count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') {
|
|
$this->ignored = true;
|
|
// Ignore
|
|
|
|
/* Otherwise, for each attribute on the token, check to see
|
|
if the attribute is already present on the body element (the
|
|
second element) on the stack of open elements. If it is not,
|
|
add the attribute and its corresponding value to that
|
|
element. */
|
|
} else {
|
|
foreach($token['attr'] as $attr) {
|
|
if(!$this->stack[1]->hasAttribute($attr['name'])) {
|
|
$this->stack[1]->setAttribute($attr['name'], $attr['value']);
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
case 'frameset':
|
|
// parse error
|
|
/* If the second element on the stack of open elements is
|
|
* not a body element, or, if the stack of open elements
|
|
* has only one node on it, then ignore the token.
|
|
* (fragment case) */
|
|
if(count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') {
|
|
$this->ignored = true;
|
|
// Ignore
|
|
} elseif (!$this->flag_frameset_ok) {
|
|
$this->ignored = true;
|
|
// Ignore
|
|
} else {
|
|
/* 1. Remove the second element on the stack of open
|
|
* elements from its parent node, if it has one. */
|
|
if($this->stack[1]->parentNode) {
|
|
$this->stack[1]->parentNode->removeChild($this->stack[1]);
|
|
}
|
|
|
|
/* 2. Pop all the nodes from the bottom of the stack of
|
|
* open elements, from the current node up to the root
|
|
* html element. */
|
|
array_splice($this->stack, 1);
|
|
|
|
$this->insertElement($token);
|
|
$this->mode = self::IN_FRAMESET;
|
|
}
|
|
break;
|
|
|
|
// in spec, there is a diversion here
|
|
|
|
case 'address': case 'article': case 'aside': case 'blockquote':
|
|
case 'center': case 'datagrid': case 'details': case 'dialog': case 'dir':
|
|
case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer':
|
|
case 'header': case 'hgroup': case 'menu': case 'nav':
|
|
case 'ol': case 'p': case 'section': case 'ul':
|
|
/* If the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been
|
|
seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
break;
|
|
|
|
/* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
|
|
"h5", "h6" */
|
|
case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
|
|
/* If the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* If the current node is an element whose tag name is one
|
|
* of "h1", "h2", "h3", "h4", "h5", or "h6", then this is a
|
|
* parse error; pop the current node off the stack of open
|
|
* elements. */
|
|
$peek = array_pop($this->stack);
|
|
if (in_array($peek->tagName, array("h1", "h2", "h3", "h4", "h5", "h6"))) {
|
|
// parse error
|
|
} else {
|
|
$this->stack[] = $peek;
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
break;
|
|
|
|
case 'pre': case 'listing':
|
|
/* If the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
$this->insertElement($token);
|
|
/* If the next token is a U+000A LINE FEED (LF) character
|
|
* token, then ignore that token and move on to the next
|
|
* one. (Newlines at the start of pre blocks are ignored as
|
|
* an authoring convenience.) */
|
|
$this->ignore_lf_token = 2;
|
|
$this->flag_frameset_ok = false;
|
|
break;
|
|
|
|
/* A start tag whose tag name is "form" */
|
|
case 'form':
|
|
/* If the form element pointer is not null, ignore the
|
|
token with a parse error. */
|
|
if($this->form_pointer !== null) {
|
|
$this->ignored = true;
|
|
// Ignore.
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* If the stack of open elements has a p element in
|
|
scope, then act as if an end tag with the tag name p
|
|
had been seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token, and set the
|
|
form element pointer to point to the element created. */
|
|
$element = $this->insertElement($token);
|
|
$this->form_pointer = $element;
|
|
}
|
|
break;
|
|
|
|
// condensed specification
|
|
case 'li': case 'dd': case 'dt':
|
|
/* 1. Set the frameset-ok flag to "not ok". */
|
|
$this->flag_frameset_ok = false;
|
|
|
|
$stack_length = count($this->stack) - 1;
|
|
for($n = $stack_length; 0 <= $n; $n--) {
|
|
/* 2. Initialise node to be the current node (the
|
|
bottommost node of the stack). */
|
|
$stop = false;
|
|
$node = $this->stack[$n];
|
|
$cat = $this->getElementCategory($node);
|
|
|
|
// for case 'li':
|
|
/* 3. If node is an li element, then act as if an end
|
|
* tag with the tag name "li" had been seen, then jump
|
|
* to the last step. */
|
|
// for case 'dd': case 'dt':
|
|
/* If node is a dd or dt element, then act as if an end
|
|
* tag with the same tag name as node had been seen, then
|
|
* jump to the last step. */
|
|
if(($token['name'] === 'li' && $node->tagName === 'li') ||
|
|
($token['name'] !== 'li' && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { // limited conditional
|
|
$this->emitToken(array(
|
|
'type' => HTML5_Tokenizer::ENDTAG,
|
|
'name' => $node->tagName,
|
|
));
|
|
break;
|
|
}
|
|
|
|
/* 4. If node is not in the formatting category, and is
|
|
not in the phrasing category, and is not an address,
|
|
div or p element, then stop this algorithm. */
|
|
if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
|
|
$node->tagName !== 'address' && $node->tagName !== 'div' &&
|
|
$node->tagName !== 'p') {
|
|
break;
|
|
}
|
|
|
|
/* 5. Otherwise, set node to the previous entry in the
|
|
* stack of open elements and return to step 2. */
|
|
}
|
|
|
|
/* 6. This is the last step. */
|
|
|
|
/* If the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been
|
|
seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Finally, insert an HTML element with the same tag
|
|
name as the token's. */
|
|
$this->insertElement($token);
|
|
break;
|
|
|
|
/* A start tag token whose tag name is "plaintext" */
|
|
case 'plaintext':
|
|
/* If the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been
|
|
seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
$this->content_model = HTML5_Tokenizer::PLAINTEXT;
|
|
break;
|
|
|
|
// more diversions
|
|
|
|
/* A start tag whose tag name is "a" */
|
|
case 'a':
|
|
/* If the list of active formatting elements contains
|
|
an element whose tag name is "a" between the end of the
|
|
list and the last marker on the list (or the start of
|
|
the list if there is no marker on the list), then this
|
|
is a parse error; act as if an end tag with the tag name
|
|
"a" had been seen, then remove that element from the list
|
|
of active formatting elements and the stack of open
|
|
elements if the end tag didn't already remove it (it
|
|
might not have if the element is not in table scope). */
|
|
$leng = count($this->a_formatting);
|
|
|
|
for($n = $leng - 1; $n >= 0; $n--) {
|
|
if($this->a_formatting[$n] === self::MARKER) {
|
|
break;
|
|
|
|
} elseif($this->a_formatting[$n]->tagName === 'a') {
|
|
$a = $this->a_formatting[$n];
|
|
$this->emitToken(array(
|
|
'name' => 'a',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
if (in_array($a, $this->a_formatting)) {
|
|
$a_i = array_search($a, $this->a_formatting, true);
|
|
if($a_i !== false) array_splice($this->a_formatting, $a_i, 1);
|
|
}
|
|
if (in_array($a, $this->stack)) {
|
|
$a_i = array_search($a, $this->stack, true);
|
|
if ($a_i !== false) array_splice($this->stack, $a_i, 1);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$el = $this->insertElement($token);
|
|
|
|
/* Add that element to the list of active formatting
|
|
elements. */
|
|
$this->a_formatting[] = $el;
|
|
break;
|
|
|
|
case 'b': case 'big': case 'code': case 'em': case 'font': case 'i':
|
|
case 's': case 'small': case 'strike':
|
|
case 'strong': case 'tt': case 'u':
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$el = $this->insertElement($token);
|
|
|
|
/* Add that element to the list of active formatting
|
|
elements. */
|
|
$this->a_formatting[] = $el;
|
|
break;
|
|
|
|
case 'nobr':
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* If the stack of open elements has a nobr element in
|
|
* scope, then this is a parse error; act as if an end tag
|
|
* with the tag name "nobr" had been seen, then once again
|
|
* reconstruct the active formatting elements, if any. */
|
|
if ($this->elementInScope('nobr')) {
|
|
$this->emitToken(array(
|
|
'name' => 'nobr',
|
|
'type' => HTML5_Tokenizer::ENDTAG,
|
|
));
|
|
$this->reconstructActiveFormattingElements();
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$el = $this->insertElement($token);
|
|
|
|
/* Add that element to the list of active formatting
|
|
elements. */
|
|
$this->a_formatting[] = $el;
|
|
break;
|
|
|
|
// another diversion
|
|
|
|
/* A start tag token whose tag name is "button" */
|
|
case 'button':
|
|
/* If the stack of open elements has a button element in scope,
|
|
then this is a parse error; act as if an end tag with the tag
|
|
name "button" had been seen, then reprocess the token. (We don't
|
|
do that. Unnecessary.) (I hope you're right! -- ezyang) */
|
|
if($this->elementInScope('button')) {
|
|
$this->emitToken(array(
|
|
'name' => 'button',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* Insert a marker at the end of the list of active
|
|
formatting elements. */
|
|
$this->a_formatting[] = self::MARKER;
|
|
|
|
$this->flag_frameset_ok = false;
|
|
break;
|
|
|
|
case 'applet': case 'marquee': case 'object':
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* Insert a marker at the end of the list of active
|
|
formatting elements. */
|
|
$this->a_formatting[] = self::MARKER;
|
|
|
|
$this->flag_frameset_ok = false;
|
|
break;
|
|
|
|
// spec diversion
|
|
|
|
/* A start tag whose tag name is "table" */
|
|
case 'table':
|
|
/* If the document is not in quirks mode and the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been seen. */
|
|
if($this->quirks_mode !== self::QUIRKS_MODE &&
|
|
$this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
$this->flag_frameset_ok = false;
|
|
|
|
/* Change the insertion mode to "in table". */
|
|
$this->mode = self::IN_TABLE;
|
|
break;
|
|
|
|
/* A start tag whose tag name is one of: "area", "basefont",
|
|
"bgsound", "br", "embed", "img", "input", "keygen", "spacer", "wbr" */
|
|
case 'area': case 'basefont': case 'bgsound': case 'br':
|
|
case 'embed': case 'img': case 'input': case 'keygen': case 'spacer':
|
|
case 'wbr':
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* Immediately pop the current node off the stack of open elements. */
|
|
array_pop($this->stack);
|
|
|
|
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
|
|
|
$this->flag_frameset_ok = false;
|
|
break;
|
|
|
|
case 'param': case 'source':
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* Immediately pop the current node off the stack of open elements. */
|
|
array_pop($this->stack);
|
|
|
|
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
|
break;
|
|
|
|
/* A start tag whose tag name is "hr" */
|
|
case 'hr':
|
|
/* If the stack of open elements has a p element in scope,
|
|
then act as if an end tag with the tag name p had been seen. */
|
|
if($this->elementInScope('p')) {
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* Immediately pop the current node off the stack of open elements. */
|
|
array_pop($this->stack);
|
|
|
|
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
|
|
|
$this->flag_frameset_ok = false;
|
|
break;
|
|
|
|
/* A start tag whose tag name is "image" */
|
|
case 'image':
|
|
/* Parse error. Change the token's tag name to "img" and
|
|
reprocess it. (Don't ask.) */
|
|
$token['name'] = 'img';
|
|
$this->emitToken($token);
|
|
break;
|
|
|
|
/* A start tag whose tag name is "isindex" */
|
|
case 'isindex':
|
|
/* Parse error. */
|
|
|
|
/* If the form element pointer is not null,
|
|
then ignore the token. */
|
|
if($this->form_pointer === null) {
|
|
/* Act as if a start tag token with the tag name "form" had
|
|
been seen. */
|
|
/* If the token has an attribute called "action", set
|
|
* the action attribute on the resulting form
|
|
* element to the value of the "action" attribute of
|
|
* the token. */
|
|
$attr = array();
|
|
$action = $this->getAttr($token, 'action');
|
|
if ($action !== false) {
|
|
$attr[] = array('name' => 'action', 'value' => $action);
|
|
}
|
|
$this->emitToken(array(
|
|
'name' => 'form',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => $attr
|
|
));
|
|
|
|
/* Act as if a start tag token with the tag name "hr" had
|
|
been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'hr',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
|
|
/* Act as if a start tag token with the tag name "p" had
|
|
been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
|
|
/* Act as if a start tag token with the tag name "label"
|
|
had been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'label',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
|
|
/* Act as if a stream of character tokens had been seen. */
|
|
$prompt = $this->getAttr($token, 'prompt');
|
|
if ($prompt === false) {
|
|
$prompt = 'This is a searchable index. '.
|
|
'Insert your search keywords here: ';
|
|
}
|
|
$this->emitToken(array(
|
|
'data' => $prompt,
|
|
'type' => HTML5_Tokenizer::CHARACTER,
|
|
));
|
|
|
|
/* Act as if a start tag token with the tag name "input"
|
|
had been seen, with all the attributes from the "isindex"
|
|
token, except with the "name" attribute set to the value
|
|
"isindex" (ignoring any explicit "name" attribute). */
|
|
$attr = array();
|
|
foreach ($token['attr'] as $keypair) {
|
|
if ($keypair['name'] === 'name' || $keypair['name'] === 'action' ||
|
|
$keypair['name'] === 'prompt') continue;
|
|
$attr[] = $keypair;
|
|
}
|
|
$attr[] = array('name' => 'name', 'value' => 'isindex');
|
|
|
|
$this->emitToken(array(
|
|
'name' => 'input',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => $attr
|
|
));
|
|
|
|
/* Act as if an end tag token with the tag name "label"
|
|
had been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'label',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
/* Act as if an end tag token with the tag name "p" had
|
|
been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
/* Act as if a start tag token with the tag name "hr" had
|
|
been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'hr',
|
|
'type' => HTML5_Tokenizer::STARTTAG
|
|
));
|
|
|
|
/* Act as if an end tag token with the tag name "form" had
|
|
been seen. */
|
|
$this->emitToken(array(
|
|
'name' => 'form',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
} else {
|
|
$this->ignored = true;
|
|
}
|
|
break;
|
|
|
|
/* A start tag whose tag name is "textarea" */
|
|
case 'textarea':
|
|
$this->insertElement($token);
|
|
|
|
/* If the next token is a U+000A LINE FEED (LF)
|
|
* character token, then ignore that token and move on to
|
|
* the next one. (Newlines at the start of textarea
|
|
* elements are ignored as an authoring convenience.)
|
|
* need flag, see also <pre> */
|
|
$this->ignore_lf_token = 2;
|
|
|
|
$this->original_mode = $this->mode;
|
|
$this->flag_frameset_ok = false;
|
|
$this->mode = self::IN_CDATA_RCDATA;
|
|
|
|
/* Switch the tokeniser's content model flag to the
|
|
RCDATA state. */
|
|
$this->content_model = HTML5_Tokenizer::RCDATA;
|
|
break;
|
|
|
|
/* A start tag token whose tag name is "xmp" */
|
|
case 'xmp':
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
$this->flag_frameset_ok = false;
|
|
|
|
$this->insertCDATAElement($token);
|
|
break;
|
|
|
|
case 'iframe':
|
|
$this->flag_frameset_ok = false;
|
|
$this->insertCDATAElement($token);
|
|
break;
|
|
|
|
case 'noembed': case 'noscript':
|
|
// XSCRIPT: should check scripting flag
|
|
$this->insertCDATAElement($token);
|
|
break;
|
|
|
|
/* A start tag whose tag name is "select" */
|
|
case 'select':
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
$this->flag_frameset_ok = false;
|
|
|
|
/* If the insertion mode is one of "in table", "in caption",
|
|
* "in column group", "in table body", "in row", or "in
|
|
* cell", then switch the insertion mode to "in select in
|
|
* table". Otherwise, switch the insertion mode to "in
|
|
* select". */
|
|
if (
|
|
$this->mode === self::IN_TABLE || $this->mode === self::IN_CAPTION ||
|
|
$this->mode === self::IN_COLUMN_GROUP || $this->mode ==+self::IN_TABLE_BODY ||
|
|
$this->mode === self::IN_ROW || $this->mode === self::IN_CELL
|
|
) {
|
|
$this->mode = self::IN_SELECT_IN_TABLE;
|
|
} else {
|
|
$this->mode = self::IN_SELECT;
|
|
}
|
|
break;
|
|
|
|
case 'option': case 'optgroup':
|
|
if ($this->elementInScope('option')) {
|
|
$this->emitToken(array(
|
|
'name' => 'option',
|
|
'type' => HTML5_Tokenizer::ENDTAG,
|
|
));
|
|
}
|
|
$this->reconstructActiveFormattingElements();
|
|
$this->insertElement($token);
|
|
break;
|
|
|
|
case 'rp': case 'rt':
|
|
/* If the stack of open elements has a ruby element in scope, then generate
|
|
* implied end tags. If the current node is not then a ruby element, this is
|
|
* a parse error; pop all the nodes from the current node up to the node
|
|
* immediately before the bottommost ruby element on the stack of open elements.
|
|
*/
|
|
if ($this->elementInScope('ruby')) {
|
|
$this->generateImpliedEndTags();
|
|
}
|
|
$peek = false;
|
|
do {
|
|
if ($peek) {
|
|
// parse error
|
|
}
|
|
$peek = array_pop($this->stack);
|
|
} while ($peek->tagName !== 'ruby');
|
|
$this->stack[] = $peek; // we popped one too many
|
|
$this->insertElement($token);
|
|
break;
|
|
|
|
// spec diversion
|
|
|
|
case 'math':
|
|
$this->reconstructActiveFormattingElements();
|
|
$token = $this->adjustMathMLAttributes($token);
|
|
$token = $this->adjustForeignAttributes($token);
|
|
$this->insertForeignElement($token, self::NS_MATHML);
|
|
if (isset($token['self-closing'])) {
|
|
// XERROR: acknowledge the token's self-closing flag
|
|
array_pop($this->stack);
|
|
}
|
|
if ($this->mode !== self::IN_FOREIGN_CONTENT) {
|
|
$this->secondary_mode = $this->mode;
|
|
$this->mode = self::IN_FOREIGN_CONTENT;
|
|
}
|
|
break;
|
|
|
|
case 'svg':
|
|
$this->reconstructActiveFormattingElements();
|
|
$token = $this->adjustSVGAttributes($token);
|
|
$token = $this->adjustForeignAttributes($token);
|
|
$this->insertForeignElement($token, self::NS_SVG);
|
|
if (isset($token['self-closing'])) {
|
|
// XERROR: acknowledge the token's self-closing flag
|
|
array_pop($this->stack);
|
|
}
|
|
if ($this->mode !== self::IN_FOREIGN_CONTENT) {
|
|
$this->secondary_mode = $this->mode;
|
|
$this->mode = self::IN_FOREIGN_CONTENT;
|
|
}
|
|
break;
|
|
|
|
case 'caption': case 'col': case 'colgroup': case 'frame': case 'head':
|
|
case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr':
|
|
// parse error
|
|
break;
|
|
|
|
/* A start tag token not covered by the previous entries */
|
|
default:
|
|
/* Reconstruct the active formatting elements, if any. */
|
|
$this->reconstructActiveFormattingElements();
|
|
|
|
$this->insertElement($token);
|
|
/* This element will be a phrasing element. */
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case HTML5_Tokenizer::ENDTAG:
|
|
switch($token['name']) {
|
|
/* An end tag with the tag name "body" */
|
|
case 'body':
|
|
/* If the second element in the stack of open elements is
|
|
not a body element, this is a parse error. Ignore the token.
|
|
(innerHTML case) */
|
|
if(count($this->stack) < 2 || $this->stack[1]->tagName !== 'body') {
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise, if there is a node in the stack of open
|
|
* elements that is not either a dd element, a dt
|
|
* element, an li element, an optgroup element, an
|
|
* option element, a p element, an rp element, an rt
|
|
* element, a tbody element, a td element, a tfoot
|
|
* element, a th element, a thead element, a tr element,
|
|
* the body element, or the html element, then this is a
|
|
* parse error. */
|
|
} else {
|
|
// XERROR: implement this check for parse error
|
|
}
|
|
|
|
/* Change the insertion mode to "after body". */
|
|
$this->mode = self::AFTER_BODY;
|
|
break;
|
|
|
|
/* An end tag with the tag name "html" */
|
|
case 'html':
|
|
/* Act as if an end tag with tag name "body" had been seen,
|
|
then, if that token wasn't ignored, reprocess the current
|
|
token. */
|
|
$this->emitToken(array(
|
|
'name' => 'body',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
if (!$this->ignored) $this->emitToken($token);
|
|
break;
|
|
|
|
case 'address': case 'article': case 'aside': case 'blockquote':
|
|
case 'center': case 'datagrid': case 'details': case 'dir':
|
|
case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer':
|
|
case 'header': case 'hgroup': case 'listing': case 'menu':
|
|
case 'nav': case 'ol': case 'pre': case 'section': case 'ul':
|
|
/* If the stack of open elements has an element in scope
|
|
with the same tag name as that of the token, then generate
|
|
implied end tags. */
|
|
if($this->elementInScope($token['name'])) {
|
|
$this->generateImpliedEndTags();
|
|
|
|
/* Now, if the current node is not an element with
|
|
the same tag name as that of the token, then this
|
|
is a parse error. */
|
|
// XERROR: implement parse error logic
|
|
|
|
/* If the stack of open elements has an element in
|
|
scope with the same tag name as that of the token,
|
|
then pop elements from this stack until an element
|
|
with that tag name has been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== $token['name']);
|
|
} else {
|
|
// parse error
|
|
}
|
|
break;
|
|
|
|
/* An end tag whose tag name is "form" */
|
|
case 'form':
|
|
/* Let node be the element that the form element pointer is set to. */
|
|
$node = $this->form_pointer;
|
|
/* Set the form element pointer to null. */
|
|
$this->form_pointer = null;
|
|
/* If node is null or the stack of open elements does not
|
|
* have node in scope, then this is a parse error; ignore the token. */
|
|
if ($node === null || !in_array($node, $this->stack)) {
|
|
// parse error
|
|
$this->ignored = true;
|
|
} else {
|
|
/* 1. Generate implied end tags. */
|
|
$this->generateImpliedEndTags();
|
|
/* 2. If the current node is not node, then this is a parse error. */
|
|
if (end($this->stack) !== $node) {
|
|
// parse error
|
|
}
|
|
/* 3. Remove node from the stack of open elements. */
|
|
array_splice($this->stack, array_search($node, $this->stack, true), 1);
|
|
}
|
|
|
|
break;
|
|
|
|
/* An end tag whose tag name is "p" */
|
|
case 'p':
|
|
/* If the stack of open elements has a p element in scope,
|
|
then generate implied end tags, except for p elements. */
|
|
if($this->elementInScope('p')) {
|
|
/* Generate implied end tags, except for elements with
|
|
* the same tag name as the token. */
|
|
$this->generateImpliedEndTags(array('p'));
|
|
|
|
/* If the current node is not a p element, then this is
|
|
a parse error. */
|
|
// XERROR: implement
|
|
|
|
/* Pop elements from the stack of open elements until
|
|
* an element with the same tag name as the token has
|
|
* been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== 'p');
|
|
|
|
} else {
|
|
// parse error
|
|
$this->emitToken(array(
|
|
'name' => 'p',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
));
|
|
$this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
/* An end tag whose tag name is "dd", "dt", or "li" */
|
|
case 'dd': case 'dt': case 'li':
|
|
if($this->elementInScope($token['name'])) {
|
|
$this->generateImpliedEndTags(array($token['name']));
|
|
|
|
/* If the current node is not an element with the same
|
|
tag name as the token, then this is a parse error. */
|
|
// XERROR: implement parse error
|
|
|
|
/* Pop elements from the stack of open elements until
|
|
* an element with the same tag name as the token has
|
|
* been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== $token['name']);
|
|
|
|
} else {
|
|
// parse error
|
|
}
|
|
break;
|
|
|
|
/* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
|
|
"h5", "h6" */
|
|
case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
|
|
$elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
|
|
|
|
/* If the stack of open elements has in scope an element whose
|
|
tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
|
|
generate implied end tags. */
|
|
if($this->elementInScope($elements)) {
|
|
$this->generateImpliedEndTags();
|
|
|
|
/* Now, if the current node is not an element with the same
|
|
tag name as that of the token, then this is a parse error. */
|
|
// XERROR: implement parse error
|
|
|
|
/* If the stack of open elements has in scope an element
|
|
whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
|
|
"h6", then pop elements from the stack until an element
|
|
with one of those tag names has been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while (!in_array($node->tagName, $elements));
|
|
} else {
|
|
// parse error
|
|
}
|
|
break;
|
|
|
|
/* An end tag whose tag name is one of: "a", "b", "big", "code", "em",
|
|
"font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
|
|
case 'a': case 'b': case 'big': case 'code': case 'em': case 'font':
|
|
case 'i': case 'nobr': case 's': case 'small': case 'strike':
|
|
case 'strong': case 'tt': case 'u':
|
|
// XERROR: generally speaking this needs parse error logic
|
|
/* 1. Let the formatting element be the last element in
|
|
the list of active formatting elements that:
|
|
* is between the end of the list and the last scope
|
|
marker in the list, if any, or the start of the list
|
|
otherwise, and
|
|
* has the same tag name as the token.
|
|
*/
|
|
while(true) {
|
|
for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
|
|
if($this->a_formatting[$a] === self::MARKER) {
|
|
break;
|
|
|
|
} elseif($this->a_formatting[$a]->tagName === $token['name']) {
|
|
$formatting_element = $this->a_formatting[$a];
|
|
$in_stack = in_array($formatting_element, $this->stack, true);
|
|
$fe_af_pos = $a;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* If there is no such node, or, if that node is
|
|
also in the stack of open elements but the element
|
|
is not in scope, then this is a parse error. Abort
|
|
these steps. The token is ignored. */
|
|
if(!isset($formatting_element) || ($in_stack &&
|
|
!$this->elementInScope($token['name']))) {
|
|
$this->ignored = true;
|
|
break;
|
|
|
|
/* Otherwise, if there is such a node, but that node
|
|
is not in the stack of open elements, then this is a
|
|
parse error; remove the element from the list, and
|
|
abort these steps. */
|
|
} elseif(isset($formatting_element) && !$in_stack) {
|
|
unset($this->a_formatting[$fe_af_pos]);
|
|
$this->a_formatting = array_merge($this->a_formatting);
|
|
break;
|
|
}
|
|
|
|
/* Otherwise, there is a formatting element and that
|
|
* element is in the stack and is in scope. If the
|
|
* element is not the current node, this is a parse
|
|
* error. In any case, proceed with the algorithm as
|
|
* written in the following steps. */
|
|
// XERROR: implement me
|
|
|
|
/* 2. Let the furthest block be the topmost node in the
|
|
stack of open elements that is lower in the stack
|
|
than the formatting element, and is not an element in
|
|
the phrasing or formatting categories. There might
|
|
not be one. */
|
|
$fe_s_pos = array_search($formatting_element, $this->stack, true);
|
|
$length = count($this->stack);
|
|
|
|
for($s = $fe_s_pos + 1; $s < $length; $s++) {
|
|
$category = $this->getElementCategory($this->stack[$s]);
|
|
|
|
if($category !== self::PHRASING && $category !== self::FORMATTING) {
|
|
$furthest_block = $this->stack[$s];
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* 3. If there is no furthest block, then the UA must
|
|
skip the subsequent steps and instead just pop all
|
|
the nodes from the bottom of the stack of open
|
|
elements, from the current node up to the formatting
|
|
element, and remove the formatting element from the
|
|
list of active formatting elements. */
|
|
if(!isset($furthest_block)) {
|
|
for($n = $length - 1; $n >= $fe_s_pos; $n--) {
|
|
array_pop($this->stack);
|
|
}
|
|
|
|
unset($this->a_formatting[$fe_af_pos]);
|
|
$this->a_formatting = array_merge($this->a_formatting);
|
|
break;
|
|
}
|
|
|
|
/* 4. Let the common ancestor be the element
|
|
immediately above the formatting element in the stack
|
|
of open elements. */
|
|
$common_ancestor = $this->stack[$fe_s_pos - 1];
|
|
|
|
/* 5. Let a bookmark note the position of the
|
|
formatting element in the list of active formatting
|
|
elements relative to the elements on either side
|
|
of it in the list. */
|
|
$bookmark = $fe_af_pos;
|
|
|
|
/* 6. Let node and last node be the furthest block.
|
|
Follow these steps: */
|
|
$node = $furthest_block;
|
|
$last_node = $furthest_block;
|
|
|
|
while(true) {
|
|
for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
|
|
/* 6.1 Let node be the element immediately
|
|
prior to node in the stack of open elements. */
|
|
$node = $this->stack[$n];
|
|
|
|
/* 6.2 If node is not in the list of active
|
|
formatting elements, then remove node from
|
|
the stack of open elements and then go back
|
|
to step 1. */
|
|
if(!in_array($node, $this->a_formatting, true)) {
|
|
array_splice($this->stack, $n, 1);
|
|
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* 6.3 Otherwise, if node is the formatting
|
|
element, then go to the next step in the overall
|
|
algorithm. */
|
|
if($node === $formatting_element) {
|
|
break;
|
|
|
|
/* 6.4 Otherwise, if last node is the furthest
|
|
block, then move the aforementioned bookmark to
|
|
be immediately after the node in the list of
|
|
active formatting elements. */
|
|
} elseif($last_node === $furthest_block) {
|
|
$bookmark = array_search($node, $this->a_formatting, true) + 1;
|
|
}
|
|
|
|
/* 6.5 Create an element for the token for which
|
|
* the element node was created, replace the entry
|
|
* for node in the list of active formatting
|
|
* elements with an entry for the new element,
|
|
* replace the entry for node in the stack of open
|
|
* elements with an entry for the new element, and
|
|
* let node be the new element. */
|
|
// we don't know what the token is anymore
|
|
$clone = $node->cloneNode();
|
|
$a_pos = array_search($node, $this->a_formatting, true);
|
|
$s_pos = array_search($node, $this->stack, true);
|
|
$this->a_formatting[$a_pos] = $clone;
|
|
$this->stack[$s_pos] = $clone;
|
|
$node = $clone;
|
|
|
|
/* 6.6 Insert last node into node, first removing
|
|
it from its previous parent node if any. */
|
|
if($last_node->parentNode !== null) {
|
|
$last_node->parentNode->removeChild($last_node);
|
|
}
|
|
|
|
$node->appendChild($last_node);
|
|
|
|
/* 6.7 Let last node be node. */
|
|
$last_node = $node;
|
|
|
|
/* 6.8 Return to step 1 of this inner set of steps. */
|
|
}
|
|
|
|
/* 7. If the common ancestor node is a table, tbody,
|
|
* tfoot, thead, or tr element, then, foster parent
|
|
* whatever last node ended up being in the previous
|
|
* step, first removing it from its previous parent
|
|
* node if any. */
|
|
if ($last_node->parentNode) { // common step
|
|
$last_node->parentNode->removeChild($last_node);
|
|
}
|
|
if (in_array($common_ancestor->tagName, array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
|
|
$this->fosterParent($last_node);
|
|
/* Otherwise, append whatever last node ended up being
|
|
* in the previous step to the common ancestor node,
|
|
* first removing it from its previous parent node if
|
|
* any. */
|
|
} else {
|
|
$common_ancestor->appendChild($last_node);
|
|
}
|
|
|
|
/* 8. Create an element for the token for which the
|
|
* formatting element was created. */
|
|
$clone = $formatting_element->cloneNode();
|
|
|
|
/* 9. Take all of the child nodes of the furthest
|
|
block and append them to the element created in the
|
|
last step. */
|
|
while($furthest_block->hasChildNodes()) {
|
|
$child = $furthest_block->firstChild;
|
|
$furthest_block->removeChild($child);
|
|
$clone->appendChild($child);
|
|
}
|
|
|
|
/* 10. Append that clone to the furthest block. */
|
|
$furthest_block->appendChild($clone);
|
|
|
|
/* 11. Remove the formatting element from the list
|
|
of active formatting elements, and insert the new element
|
|
into the list of active formatting elements at the
|
|
position of the aforementioned bookmark. */
|
|
$fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
|
|
array_splice($this->a_formatting, $fe_af_pos, 1);
|
|
|
|
$af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
|
|
$af_part2 = array_slice($this->a_formatting, $bookmark);
|
|
$this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
|
|
|
|
/* 12. Remove the formatting element from the stack
|
|
of open elements, and insert the new element into the stack
|
|
of open elements immediately below the position of the
|
|
furthest block in that stack. */
|
|
$fe_s_pos = array_search($formatting_element, $this->stack, true);
|
|
array_splice($this->stack, $fe_s_pos, 1);
|
|
|
|
$fb_s_pos = array_search($furthest_block, $this->stack, true);
|
|
$s_part1 = array_slice($this->stack, 0, $fb_s_pos + 1);
|
|
$s_part2 = array_slice($this->stack, $fb_s_pos + 1);
|
|
$this->stack = array_merge($s_part1, array($clone), $s_part2);
|
|
|
|
/* 13. Jump back to step 1 in this series of steps. */
|
|
unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
|
|
}
|
|
break;
|
|
|
|
case 'applet': case 'button': case 'marquee': case 'object':
|
|
/* If the stack of open elements has an element in scope whose
|
|
tag name matches the tag name of the token, then generate implied
|
|
end tags. */
|
|
if($this->elementInScope($token['name'])) {
|
|
$this->generateImpliedEndTags();
|
|
|
|
/* Now, if the current node is not an element with the same
|
|
tag name as the token, then this is a parse error. */
|
|
// XERROR: implement logic
|
|
|
|
/* Pop elements from the stack of open elements until
|
|
* an element with the same tag name as the token has
|
|
* been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== $token['name']);
|
|
|
|
/* Clear the list of active formatting elements up to the
|
|
* last marker. */
|
|
$keys = array_keys($this->a_formatting, self::MARKER, true);
|
|
$marker = end($keys);
|
|
|
|
for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
|
|
array_pop($this->a_formatting);
|
|
}
|
|
} else {
|
|
// parse error
|
|
}
|
|
break;
|
|
|
|
case 'br':
|
|
// Parse error
|
|
$this->emitToken(array(
|
|
'name' => 'br',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
));
|
|
break;
|
|
|
|
/* An end tag token not covered by the previous entries */
|
|
default:
|
|
for($n = count($this->stack) - 1; $n >= 0; $n--) {
|
|
/* Initialise node to be the current node (the bottommost
|
|
node of the stack). */
|
|
$node = $this->stack[$n];
|
|
|
|
/* If node has the same tag name as the end tag token,
|
|
then: */
|
|
if($token['name'] === $node->tagName) {
|
|
/* Generate implied end tags. */
|
|
$this->generateImpliedEndTags();
|
|
|
|
/* If the tag name of the end tag token does not
|
|
match the tag name of the current node, this is a
|
|
parse error. */
|
|
// XERROR: implement this
|
|
|
|
/* Pop all the nodes from the current node up to
|
|
node, including node, then stop these steps. */
|
|
// XSKETCHY
|
|
do {
|
|
$pop = array_pop($this->stack);
|
|
} while ($pop !== $node);
|
|
break;
|
|
|
|
} else {
|
|
$category = $this->getElementCategory($node);
|
|
|
|
if($category !== self::FORMATTING && $category !== self::PHRASING) {
|
|
/* Otherwise, if node is in neither the formatting
|
|
category nor the phrasing category, then this is a
|
|
parse error. Stop this algorithm. The end tag token
|
|
is ignored. */
|
|
$this->ignored = true;
|
|
break;
|
|
// parse error
|
|
}
|
|
}
|
|
/* Set node to the previous entry in the stack of open elements. Loop. */
|
|
}
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case self::IN_CDATA_RCDATA:
|
|
if (
|
|
$token['type'] === HTML5_Tokenizer::CHARACTER ||
|
|
$token['type'] === HTML5_Tokenizer::SPACECHARACTER
|
|
) {
|
|
$this->insertText($token['data']);
|
|
} elseif ($token['type'] === HTML5_Tokenizer::EOF) {
|
|
// parse error
|
|
/* If the current node is a script element, mark the script
|
|
* element as "already executed". */
|
|
// probably not necessary
|
|
array_pop($this->stack);
|
|
$this->mode = $this->original_mode;
|
|
$this->emitToken($token);
|
|
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'script') {
|
|
array_pop($this->stack);
|
|
$this->mode = $this->original_mode;
|
|
// we're ignoring all of the execution stuff
|
|
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG) {
|
|
array_pop($this->stack);
|
|
$this->mode = $this->original_mode;
|
|
}
|
|
break;
|
|
|
|
case self::IN_TABLE:
|
|
$clear = array('html', 'table');
|
|
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
|
or U+0020 SPACE */
|
|
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER &&
|
|
/* If the current table is tainted, then act as described in
|
|
* the "anything else" entry below. */
|
|
// Note: hsivonen has a test that fails due to this line
|
|
// because he wants to convince Hixie not to do taint
|
|
!$this->currentTableIsTainted()) {
|
|
/* Append the character to the current node. */
|
|
$this->insertText($token['data']);
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the current node with the data
|
|
attribute set to the data given in the comment token. */
|
|
$this->insertComment($token['data']);
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// parse error
|
|
|
|
/* A start tag whose tag name is "caption" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'caption') {
|
|
/* Clear the stack back to a table context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Insert a marker at the end of the list of active
|
|
formatting elements. */
|
|
$this->a_formatting[] = self::MARKER;
|
|
|
|
/* Insert an HTML element for the token, then switch the
|
|
insertion mode to "in caption". */
|
|
$this->insertElement($token);
|
|
$this->mode = self::IN_CAPTION;
|
|
|
|
/* A start tag whose tag name is "colgroup" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'colgroup') {
|
|
/* Clear the stack back to a table context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Insert an HTML element for the token, then switch the
|
|
insertion mode to "in column group". */
|
|
$this->insertElement($token);
|
|
$this->mode = self::IN_COLUMN_GROUP;
|
|
|
|
/* A start tag whose tag name is "col" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'col') {
|
|
$this->emitToken(array(
|
|
'name' => 'colgroup',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
|
|
$this->emitToken($token);
|
|
|
|
/* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
|
array('tbody', 'tfoot', 'thead'))) {
|
|
/* Clear the stack back to a table context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Insert an HTML element for the token, then switch the insertion
|
|
mode to "in table body". */
|
|
$this->insertElement($token);
|
|
$this->mode = self::IN_TABLE_BODY;
|
|
|
|
/* A start tag whose tag name is one of: "td", "th", "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
in_array($token['name'], array('td', 'th', 'tr'))) {
|
|
/* Act as if a start tag token with the tag name "tbody" had been
|
|
seen, then reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'tbody',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
|
|
$this->emitToken($token);
|
|
|
|
/* A start tag whose tag name is "table" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'table') {
|
|
/* Parse error. Act as if an end tag token with the tag name "table"
|
|
had been seen, then, if that token wasn't ignored, reprocess the
|
|
current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'table',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
if (!$this->ignored) $this->emitToken($token);
|
|
|
|
/* An end tag whose tag name is "table" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'table') {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as the token, this is a parse error.
|
|
Ignore the token. (fragment case) */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== 'table');
|
|
|
|
/* Reset the insertion mode appropriately. */
|
|
$this->resetInsertionMode();
|
|
}
|
|
|
|
/* An end tag whose tag name is one of: "body", "caption", "col",
|
|
"colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
|
array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
|
|
'tfoot', 'th', 'thead', 'tr'))) {
|
|
// Parse error. Ignore the token.
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
($token['name'] === 'style' || $token['name'] === 'script')) {
|
|
$this->processWithRulesFor($token, self::IN_HEAD);
|
|
|
|
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'input' &&
|
|
// assignment is intentional
|
|
/* If the token does not have an attribute with the name "type", or
|
|
* if it does, but that attribute's value is not an ASCII
|
|
* case-insensitive match for the string "hidden", then: act as
|
|
* described in the "anything else" entry below. */
|
|
($type = $this->getAttr($token, 'type')) && strtolower($type) === 'hidden') {
|
|
// I.e., if it's an input with the type attribute == 'hidden'
|
|
/* Otherwise */
|
|
// parse error
|
|
$this->insertElement($token);
|
|
array_pop($this->stack);
|
|
} elseif ($token['type'] === HTML5_Tokenizer::EOF) {
|
|
/* If the current node is not the root html element, then this is a parse error. */
|
|
if (end($this->stack)->tagName !== 'html') {
|
|
// Note: It can only be the current node in the fragment case.
|
|
// parse error
|
|
}
|
|
/* Stop parsing. */
|
|
/* Anything else */
|
|
} else {
|
|
/* Parse error. Process the token as if the insertion mode was "in
|
|
body", with the following exception: */
|
|
|
|
$old = $this->foster_parent;
|
|
$this->foster_parent = true;
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
$this->foster_parent = $old;
|
|
}
|
|
break;
|
|
|
|
case self::IN_CAPTION:
|
|
/* An end tag whose tag name is "caption" */
|
|
if($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'caption') {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as the token, this is a parse error.
|
|
Ignore the token. (fragment case) */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
$this->ignored = true;
|
|
// Ignore
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Generate implied end tags. */
|
|
$this->generateImpliedEndTags();
|
|
|
|
/* Now, if the current node is not a caption element, then this
|
|
is a parse error. */
|
|
// XERROR: implement
|
|
|
|
/* Pop elements from this stack until a caption element has
|
|
been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== 'caption');
|
|
|
|
/* Clear the list of active formatting elements up to the last
|
|
marker. */
|
|
$this->clearTheActiveFormattingElementsUpToTheLastMarker();
|
|
|
|
/* Switch the insertion mode to "in table". */
|
|
$this->mode = self::IN_TABLE;
|
|
}
|
|
|
|
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
|
"tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
|
|
name is "table" */
|
|
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
|
array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
|
|
'thead', 'tr'))) || ($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'table')) {
|
|
/* Parse error. Act as if an end tag with the tag name "caption"
|
|
had been seen, then, if that token wasn't ignored, reprocess the
|
|
current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'caption',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
if (!$this->ignored) $this->emitToken($token);
|
|
|
|
/* An end tag whose tag name is one of: "body", "col", "colgroup",
|
|
"html", "tbody", "td", "tfoot", "th", "thead", "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
|
array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th',
|
|
'thead', 'tr'))) {
|
|
// Parse error. Ignore the token.
|
|
$this->ignored = true;
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Process the token as if the insertion mode was "in body". */
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
}
|
|
break;
|
|
|
|
case self::IN_COLUMN_GROUP:
|
|
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
|
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
|
or U+0020 SPACE */
|
|
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
/* Append the character to the current node. */
|
|
$this->insertText($token['data']);
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the current node with the data
|
|
attribute set to the data given in the comment token. */
|
|
$this->insertToken($token['data']);
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// parse error
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
|
|
/* A start tag whose tag name is "col" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'col') {
|
|
/* Insert a col element for the token. Immediately pop the current
|
|
node off the stack of open elements. */
|
|
$this->insertElement($token);
|
|
array_pop($this->stack);
|
|
// XERROR: Acknowledge the token's self-closing flag, if it is set.
|
|
|
|
/* An end tag whose tag name is "colgroup" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'colgroup') {
|
|
/* If the current node is the root html element, then this is a
|
|
parse error, ignore the token. (fragment case) */
|
|
if(end($this->stack)->tagName === 'html') {
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise, pop the current node (which will be a colgroup
|
|
element) from the stack of open elements. Switch the insertion
|
|
mode to "in table". */
|
|
} else {
|
|
array_pop($this->stack);
|
|
$this->mode = self::IN_TABLE;
|
|
}
|
|
|
|
/* An end tag whose tag name is "col" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'col') {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
/* An end-of-file token */
|
|
/* If the current node is the root html element */
|
|
} elseif($token['type'] === HTML5_Tokenizer::EOF && end($this->stack)->tagName === 'html') {
|
|
/* Stop parsing */
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Act as if an end tag with the tag name "colgroup" had been seen,
|
|
and then, if that token wasn't ignored, reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'colgroup',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
if (!$this->ignored) $this->emitToken($token);
|
|
}
|
|
break;
|
|
|
|
case self::IN_TABLE_BODY:
|
|
$clear = array('tbody', 'tfoot', 'thead', 'html');
|
|
|
|
/* A start tag whose tag name is "tr" */
|
|
if($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'tr') {
|
|
/* Clear the stack back to a table body context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Insert a tr element for the token, then switch the insertion
|
|
mode to "in row". */
|
|
$this->insertElement($token);
|
|
$this->mode = self::IN_ROW;
|
|
|
|
/* A start tag whose tag name is one of: "th", "td" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
($token['name'] === 'th' || $token['name'] === 'td')) {
|
|
/* Parse error. Act as if a start tag with the tag name "tr" had
|
|
been seen, then reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'tr',
|
|
'type' => HTML5_Tokenizer::STARTTAG,
|
|
'attr' => array()
|
|
));
|
|
|
|
$this->emitToken($token);
|
|
|
|
/* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as the token, this is a parse error.
|
|
Ignore the token. */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
// Parse error
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Clear the stack back to a table body context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Pop the current node from the stack of open elements. Switch
|
|
the insertion mode to "in table". */
|
|
array_pop($this->stack);
|
|
$this->mode = self::IN_TABLE;
|
|
}
|
|
|
|
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
|
"tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
|
|
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
|
array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
|
|
($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) {
|
|
/* If the stack of open elements does not have a tbody, thead, or
|
|
tfoot element in table scope, this is a parse error. Ignore the
|
|
token. (fragment case) */
|
|
if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
|
|
// parse error
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Clear the stack back to a table body context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Act as if an end tag with the same tag name as the current
|
|
node ("tbody", "tfoot", or "thead") had been seen, then
|
|
reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => end($this->stack)->tagName,
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
$this->emitToken($token);
|
|
}
|
|
|
|
/* An end tag whose tag name is one of: "body", "caption", "col",
|
|
"colgroup", "html", "td", "th", "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
|
array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Process the token as if the insertion mode was "in table". */
|
|
$this->processWithRulesFor($token, self::IN_TABLE);
|
|
}
|
|
break;
|
|
|
|
case self::IN_ROW:
|
|
$clear = array('tr', 'html');
|
|
|
|
/* A start tag whose tag name is one of: "th", "td" */
|
|
if($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
($token['name'] === 'th' || $token['name'] === 'td')) {
|
|
/* Clear the stack back to a table row context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Insert an HTML element for the token, then switch the insertion
|
|
mode to "in cell". */
|
|
$this->insertElement($token);
|
|
$this->mode = self::IN_CELL;
|
|
|
|
/* Insert a marker at the end of the list of active formatting
|
|
elements. */
|
|
$this->a_formatting[] = self::MARKER;
|
|
|
|
/* An end tag whose tag name is "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'tr') {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as the token, this is a parse error.
|
|
Ignore the token. (fragment case) */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
// Ignore.
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Clear the stack back to a table row context. */
|
|
$this->clearStackToTableContext($clear);
|
|
|
|
/* Pop the current node (which will be a tr element) from the
|
|
stack of open elements. Switch the insertion mode to "in table
|
|
body". */
|
|
array_pop($this->stack);
|
|
$this->mode = self::IN_TABLE_BODY;
|
|
}
|
|
|
|
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
|
"tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
|
|
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
|
array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
|
|
($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) {
|
|
/* Act as if an end tag with the tag name "tr" had been seen, then,
|
|
if that token wasn't ignored, reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'tr',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
if (!$this->ignored) $this->emitToken($token);
|
|
|
|
/* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as the token, this is a parse error.
|
|
Ignore the token. */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Otherwise, act as if an end tag with the tag name "tr" had
|
|
been seen, then reprocess the current token. */
|
|
$this->emitToken(array(
|
|
'name' => 'tr',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
$this->emitToken($token);
|
|
}
|
|
|
|
/* An end tag whose tag name is one of: "body", "caption", "col",
|
|
"colgroup", "html", "td", "th" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
|
array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Process the token as if the insertion mode was "in table". */
|
|
$this->processWithRulesFor($token, self::IN_TABLE);
|
|
}
|
|
break;
|
|
|
|
case self::IN_CELL:
|
|
/* An end tag whose tag name is one of: "td", "th" */
|
|
if($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
($token['name'] === 'td' || $token['name'] === 'th')) {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as that of the token, then this is a
|
|
parse error and the token must be ignored. */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Generate implied end tags, except for elements with the same
|
|
tag name as the token. */
|
|
$this->generateImpliedEndTags(array($token['name']));
|
|
|
|
/* Now, if the current node is not an element with the same tag
|
|
name as the token, then this is a parse error. */
|
|
// XERROR: Implement parse error code
|
|
|
|
/* Pop elements from this stack until an element with the same
|
|
tag name as the token has been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== $token['name']);
|
|
|
|
/* Clear the list of active formatting elements up to the last
|
|
marker. */
|
|
$this->clearTheActiveFormattingElementsUpToTheLastMarker();
|
|
|
|
/* Switch the insertion mode to "in row". (The current node
|
|
will be a tr element at this point.) */
|
|
$this->mode = self::IN_ROW;
|
|
}
|
|
|
|
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
|
"tbody", "td", "tfoot", "th", "thead", "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
|
array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
|
|
'thead', 'tr'))) {
|
|
/* If the stack of open elements does not have a td or th element
|
|
in table scope, then this is a parse error; ignore the token.
|
|
(fragment case) */
|
|
if(!$this->elementInScope(array('td', 'th'), true)) {
|
|
// parse error
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise, close the cell (see below) and reprocess the current
|
|
token. */
|
|
} else {
|
|
$this->closeCell();
|
|
$this->emitToken($token);
|
|
}
|
|
|
|
/* An end tag whose tag name is one of: "body", "caption", "col",
|
|
"colgroup", "html" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
|
array('body', 'caption', 'col', 'colgroup', 'html'))) {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
|
|
/* An end tag whose tag name is one of: "table", "tbody", "tfoot",
|
|
"thead", "tr" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
|
array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
|
|
/* If the stack of open elements does not have a td or th element
|
|
in table scope, then this is a parse error; ignore the token.
|
|
(innerHTML case) */
|
|
if(!$this->elementInScope(array('td', 'th'), true)) {
|
|
// Parse error
|
|
$this->ignored = true;
|
|
|
|
/* Otherwise, close the cell (see below) and reprocess the current
|
|
token. */
|
|
} else {
|
|
$this->closeCell();
|
|
$this->emitToken($token);
|
|
}
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Process the token as if the insertion mode was "in body". */
|
|
$this->processWithRulesFor($token, self::IN_BODY);
|
|
}
|
|
break;
|
|
|
|
case self::IN_SELECT:
|
|
/* Handle the token as follows: */
|
|
|
|
/* A character token */
|
|
if(
|
|
$token['type'] === HTML5_Tokenizer::CHARACTER ||
|
|
$token['type'] === HTML5_Tokenizer::SPACECHARACTER
|
|
) {
|
|
/* Append the token's character to the current node. */
|
|
$this->insertText($token['data']);
|
|
|
|
/* A comment token */
|
|
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
/* Append a Comment node to the current node with the data
|
|
attribute set to the data given in the comment token. */
|
|
$this->insertComment($token['data']);
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// parse error
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
|
$this->processWithRulesFor($token, self::INBODY);
|
|
|
|
/* A start tag token whose tag name is "option" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'option') {
|
|
/* If the current node is an option element, act as if an end tag
|
|
with the tag name "option" had been seen. */
|
|
if(end($this->stack)->tagName === 'option') {
|
|
$this->emitToken(array(
|
|
'name' => 'option',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* A start tag token whose tag name is "optgroup" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
$token['name'] === 'optgroup') {
|
|
/* If the current node is an option element, act as if an end tag
|
|
with the tag name "option" had been seen. */
|
|
if(end($this->stack)->tagName === 'option') {
|
|
$this->emitToken(array(
|
|
'name' => 'option',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* If the current node is an optgroup element, act as if an end tag
|
|
with the tag name "optgroup" had been seen. */
|
|
if(end($this->stack)->tagName === 'optgroup') {
|
|
$this->emitToken(array(
|
|
'name' => 'optgroup',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* Insert an HTML element for the token. */
|
|
$this->insertElement($token);
|
|
|
|
/* An end tag token whose tag name is "optgroup" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'optgroup') {
|
|
/* First, if the current node is an option element, and the node
|
|
immediately before it in the stack of open elements is an optgroup
|
|
element, then act as if an end tag with the tag name "option" had
|
|
been seen. */
|
|
$elements_in_stack = count($this->stack);
|
|
|
|
if($this->stack[$elements_in_stack - 1]->tagName === 'option' &&
|
|
$this->stack[$elements_in_stack - 2]->tagName === 'optgroup') {
|
|
$this->emitToken(array(
|
|
'name' => 'option',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
}
|
|
|
|
/* If the current node is an optgroup element, then pop that node
|
|
from the stack of open elements. Otherwise, this is a parse error,
|
|
ignore the token. */
|
|
if(end($this->stack)->tagName === 'optgroup') {
|
|
array_pop($this->stack);
|
|
} else {
|
|
// parse error
|
|
$this->ignored = true;
|
|
}
|
|
|
|
/* An end tag token whose tag name is "option" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'option') {
|
|
/* If the current node is an option element, then pop that node
|
|
from the stack of open elements. Otherwise, this is a parse error,
|
|
ignore the token. */
|
|
if(end($this->stack)->tagName === 'option') {
|
|
array_pop($this->stack);
|
|
} else {
|
|
// parse error
|
|
$this->ignored = true;
|
|
}
|
|
|
|
/* An end tag whose tag name is "select" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'select') {
|
|
/* If the stack of open elements does not have an element in table
|
|
scope with the same tag name as the token, this is a parse error.
|
|
Ignore the token. (fragment case) */
|
|
if(!$this->elementInScope($token['name'], true)) {
|
|
$this->ignored = true;
|
|
// parse error
|
|
|
|
/* Otherwise: */
|
|
} else {
|
|
/* Pop elements from the stack of open elements until a select
|
|
element has been popped from the stack. */
|
|
do {
|
|
$node = array_pop($this->stack);
|
|
} while ($node->tagName !== 'select');
|
|
|
|
/* Reset the insertion mode appropriately. */
|
|
$this->resetInsertionMode();
|
|
}
|
|
|
|
/* A start tag whose tag name is "select" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'select') {
|
|
/* Parse error. Act as if the token had been an end tag with the
|
|
tag name "select" instead. */
|
|
$this->emitToken(array(
|
|
'name' => 'select',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
($token['name'] === 'input' || $token['name'] === 'textarea')) {
|
|
// parse error
|
|
$this->emitToken(array(
|
|
'name' => 'select',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
$this->emitToken($token);
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') {
|
|
$this->processWithRulesFor($token, self::IN_HEAD);
|
|
|
|
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
|
// XERROR: If the current node is not the root html element, then this is a parse error.
|
|
/* Stop parsing */
|
|
|
|
/* Anything else */
|
|
} else {
|
|
/* Parse error. Ignore the token. */
|
|
$this->ignored = true;
|
|
}
|
|
break;
|
|
|
|
case self::IN_SELECT_IN_TABLE:
|
|
|
|
if($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
in_array($token['name'], array('caption', 'table', 'tbody',
|
|
'tfoot', 'thead', 'tr', 'td', 'th'))) {
|
|
// parse error
|
|
$this->emitToken(array(
|
|
'name' => 'select',
|
|
'type' => HTML5_Tokenizer::ENDTAG,
|
|
));
|
|
$this->emitToken($token);
|
|
|
|
/* An end tag whose tag name is one of: "caption", "table", "tbody",
|
|
"tfoot", "thead", "tr", "td", "th" */
|
|
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
in_array($token['name'], array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
|
|
/* Parse error. */
|
|
// parse error
|
|
|
|
/* If the stack of open elements has an element in table scope with
|
|
the same tag name as that of the token, then act as if an end tag
|
|
with the tag name "select" had been seen, and reprocess the token.
|
|
Otherwise, ignore the token. */
|
|
if($this->elementInScope($token['name'], true)) {
|
|
$this->emitToken(array(
|
|
'name' => 'select',
|
|
'type' => HTML5_Tokenizer::ENDTAG
|
|
));
|
|
|
|
$this->emitToken($token);
|
|
} else {
|
|
$this->ignored = true;
|
|
}
|
|
} else {
|
|
$this->processWithRulesFor($token, self::IN_SELECT);
|
|
}
|
|
break;
|
|
|
|
case self::IN_FOREIGN_CONTENT:
|
|
if ($token['type'] === HTML5_Tokenizer::CHARACTER ||
|
|
$token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
|
$this->insertText($token['data']);
|
|
} elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
|
|
$this->insertComment($token['data']);
|
|
} elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
|
// XERROR: parse error
|
|
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
|
$token['name'] === 'script' && end($this->stack)->tagName === 'script' &&
|
|
end($this->stack)->namespaceURI === self::NS_SVG) {
|
|
array_pop($this->stack);
|
|
// a bunch of script running mumbo jumbo
|
|
} elseif (
|
|
($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
((
|
|
$token['name'] !== 'mglyph' &&
|
|
$token['name'] !== 'malignmark' &&
|
|
end($this->stack)->namespaceURI === self::NS_MATHML &&
|
|
in_array(end($this->stack)->tagName, array('mi', 'mo', 'mn', 'ms', 'mtext'))
|
|
) ||
|
|
(
|
|
$token['name'] === 'svg' &&
|
|
end($this->stack)->namespaceURI === self::NS_MATHML &&
|
|
end($this->stack)->tagName === 'annotation-xml'
|
|
) ||
|
|
(
|
|
end($this->stack)->namespaceURI === self::NS_SVG &&
|
|
in_array(end($this->stack)->tagName, array('foreignObject', 'desc', 'title'))
|
|
) ||
|
|
(
|
|
// XSKETCHY
|
|
end($this->stack)->namespaceURI === self::NS_HTML
|
|
))
|
|
) || $token['type'] === HTML5_Tokenizer::ENDTAG
|
|
) {
|
|
$this->processWithRulesFor($token, $this->secondary_mode);
|
|
/* If, after doing so, the insertion mode is still "in foreign
|
|
* content", but there is no element in scope that has a namespace
|
|
* other than the HTML namespace, switch the insertion mode to the
|
|
* secondary insertion mode. */
|
|
if ($this->mode === self::IN_FOREIGN_CONTENT) {
|
|
$found = false;
|
|
// this basically duplicates elementInScope()
|
|
for ($i = count($this->stack) - 1; $i >= 0; $i--) {
|
|
$node = $this->stack[$i];
|
|
if ($node->namespaceURI !== self::NS_HTML) {
|
|
$found = true;
|
|
break;
|
|
} elseif (in_array($node->tagName, array('table', 'html',
|
|
'applet', 'caption', 'td', 'th', 'button', 'marquee',
|
|
'object')) || ($node->tagName === 'foreignObject' &&
|
|
$node->namespaceURI === self::NS_SVG)) {
|
|
break;
|
|
}
|
|
}
|
|
if (!$found) {
|
|
$this->mode = $this->secondary_mode;
|
|
}
|
|
}
|
|
} elseif ($token['type'] === HTML5_Tokenizer::EOF || (
|
|
$token['type'] === HTML5_Tokenizer::STARTTAG &&
|
|
(in_array($token['name'], array('b', "big", "blockquote", "body", "br",
|
|
"center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2",
|
|
"h3", "h4", "h5", "h6", "head", "hr", "i", " |