Initial checkin
This commit is contained in:
commit
6348e70daa
393 changed files with 59765 additions and 0 deletions
120
library/HTML5/Data.php
Normal file
120
library/HTML5/Data.php
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
<?php
|
||||
|
||||
// warning: this file is encoded in UTF-8!
|
||||
|
||||
/**
 * Static lookup data and small encoding helpers shared by the HTML5 parser.
 */
class HTML5_Data
{

    // Maps the numeric value of a malformed character reference to the
    // Unicode codepoint it should be treated as. Candidate for moving into
    // a .ser file; storing UTF-8 bytes instead of codepoints is another
    // possible optimization.
    protected static $realCodepointTable = array(
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0xFFFD, // REPLACEMENT CHARACTER
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0xFFFD, // REPLACEMENT CHARACTER
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0xFFFD, // REPLACEMENT CHARACTER
        0x90 => 0xFFFD, // REPLACEMENT CHARACTER
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0xFFFD, // REPLACEMENT CHARACTER
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    // Lazily-loaded cache of the unserialized named character reference table.
    protected static $namedCharacterReferences;

    // Lazily-computed cache of the longest key length in the table above.
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Looks up the codepoint a malformed numeric character reference
     * should really stand for.
     *
     * @param $ref Numeric value found in the character reference
     * @return The replacement codepoint, or false when $ref has no entry
     */
    public static function getRealCodepoint($ref) {
        return isset(self::$realCodepointTable[$ref])
            ? self::$realCodepointTable[$ref]
            : false;
    }

    /**
     * Returns the named character reference table, reading and
     * unserializing named-character-references.ser (next to this file)
     * the first time it is needed.
     */
    public static function getNamedCharacterReferences() {
        if (self::$namedCharacterReferences) {
            return self::$namedCharacterReferences;
        }

        $serialized = file_get_contents(dirname(__FILE__) . '/named-character-references.ser');
        self::$namedCharacterReferences = unserialize($serialized);

        return self::$namedCharacterReferences;
    }

    /**
     * Returns the byte length of the longest key in the named character
     * reference table; the value is computed once and then cached.
     */
    public static function getNamedCharacterReferenceMaxLength() {
        if (self::$namedCharacterReferenceMaxLength) {
            return self::$namedCharacterReferenceMaxLength;
        }

        $names = array_keys(self::getNamedCharacterReferences());
        self::$namedCharacterReferenceMaxLength = max(array_map('strlen', $names));

        return self::$namedCharacterReferenceMaxLength;
    }


    /**
     * Encodes a single Unicode codepoint as UTF-8.
     *
     * @note Derived from HTML Purifier, which in turn took it from Feyd
     *       (public domain).
     * @param $code Codepoint to encode
     * @return One to four bytes of UTF-8; U+FFFD (as UTF-8) when $code is
     *         negative, above 0x10FFFF or inside the surrogate range
     */
    public static function utf8chr($code) {
        $isSurrogate = ($code >= 0xD800 and $code <= 0xDFFF);
        if ($code < 0x0 or $code > 0x10FFFF or $isSurrogate) {
            // outside the range of encodable codepoints
            return "\xEF\xBF\xBD";
        }

        if ($code < 0x80) {
            // plain ASCII: a single byte
            return chr($code);
        }

        // every multi-byte form ends with the low six bits
        $last = chr(($code & 0x3F) | 0x80);

        if ($code < 0x800) {
            // two bytes: 110xxxxx 10xxxxxx
            return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
        }

        // three- and four-byte forms share this continuation byte
        $middle = chr((($code & 0xFC0) >> 6) | 0x80);

        if ($code < 0x10000) {
            // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
        }

        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        $lead   = chr((($code >> 18) & 0x07) | 0xF0);
        $second = chr((($code >> 12) & 0x3F) | 0x80);

        return $lead . $second . $middle . $last;
    }

}
|
||||
284
library/HTML5/InputStream.php
Normal file
284
library/HTML5/InputStream.php
Normal file
|
|
@ -0,0 +1,284 @@
|
|||
<?php
|
||||
|
||||
/*
|
||||
|
||||
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included
|
||||
in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
// Some conventions:
|
||||
// /* */ indicates verbatim text from the HTML 5 specification
|
||||
// // indicates regular comments
|
||||
|
||||
/**
 * Byte-oriented cursor over the (UTF-8) document being tokenized.
 *
 * The constructor normalises the input as the HTML 5 specification's input
 * stream pre-processing requires and records parse errors in $errors; the
 * remaining methods consume, peek back over and report positions within the
 * normalised data.
 */
class HTML5_InputStream {
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data
     */
    private $char;

    /**
     * Length of $data; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * @param $data Data to parse
     * @throws Exception when the iconv extension is not loaded
     */
    public function __construct($data) {

        /* Given an encoding, the bytes in the input stream must be
        converted to Unicode characters for the tokeniser, as
        described by the rules for that encoding, except that the
        leading U+FEFF BYTE ORDER MARK character, if any, must not
        be stripped by the encoding layer (it is stripped by the rule below).

        Bytes or sequences of bytes in the original byte stream that
        could not be converted to Unicode characters must be converted
        to U+FFFD REPLACEMENT CHARACTER code points. */

        // XXX currently assuming input data is UTF-8; once we
        // build encoding detection this will no longer be the case
        //
        // We previously had an mbstring implementation here, but that
        // implementation is heavily non-conforming, so it's been
        // omitted.
        if (extension_loaded('iconv')) {
            // non-conforming: invalid sequences are dropped rather than
            // replaced by U+FFFD. If iconv fails it returns false, which
            // the string functions below treat as an empty string.
            $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
        } else {
            // we can make a conforming native implementation
            // NOTE: only iconv is checked above, so mbstring alone does
            // not satisfy this requirement despite the message.
            throw new Exception('Not implemented, please install mbstring or iconv');
        }

        /* One leading U+FEFF BYTE ORDER MARK character must be
        ignored if any are present. */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            $data = substr($data, 3);
        }

        /* All U+0000 NULL characters in the input must be replaced
        by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
        characters is a parse error. */
        for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => 'null-character'
            );
        }
        /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
        (LF) characters are treated specially. Any CR characters
        that are followed by LF characters must be removed, and any
        CR characters not followed by LF characters must be converted
        to LF characters. Thus, newlines in HTML DOMs are represented
        by LF characters, and there are never any CR characters in the
        input to the tokenization stage. */
        $data = str_replace(
            array(
                "\0",
                "\r\n",
                "\r"
            ),
            array(
                "\xEF\xBF\xBD",
                "\n",
                "\n"
            ),
            $data
        );

        /* Any occurrences of any characters in the ranges U+0001 to
        U+0008, U+000B,  U+000E to U+001F,  U+007F  to U+009F,
        U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
        characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
        U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
        U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
        U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
        U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
        U+10FFFF are parse errors. (These are all control characters
        or permanently undefined Unicode characters.) */
        // Check PCRE is loaded.
        if (extension_loaded('pcre')) {
            // The pattern works on raw bytes (no /u modifier). For the
            // supplementary-plane non-characters the second byte is
            // 10xx1111, i.e. exactly 8F, 9F, AF or BF; a wider range would
            // also flag valid code points such as U+11FFF (F0 91 BF BF).
            $count = preg_match_all(
                '/(?:
                    [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
                |
                    \xC2[\x80-\x9F] # U+0080 to U+009F
                |
                    \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
                |
                    \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                |
                    \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                |
                    [\xF0-\xF4][\x8F\x9F\xAF\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                )/x',
                $data,
                $matches
            );
            for ($i = 0; $i < $count; $i++) {
                $this->errors[] = array(
                    'type' => HTML5_Tokenizer::PARSEERROR,
                    'data' => 'invalid-codepoint'
                );
            }
        } else {
            // XXX: Need non-PCRE impl, probably using substr_count
        }

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return 1-based line number: one more than the number of LF bytes
     *         that have been consumed so far
     */
    public function getCurrentLine() {
        // Clamp the cursor into [0, EOF]: it runs past EOF after char()
        // has reported end-of-file and can drop below zero after unget().
        $consumed = max(0, min($this->char, $this->EOF));

        // An empty string, or nothing consumed yet, is the first line.
        // Returning early also avoids handing substr_count a zero length,
        // which older PHP versions reject with a warning.
        if ($consumed === 0) {
            return 1;
        }

        return substr_count($this->data, "\n", 0, $consumed) + 1;
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * @return Number of UTF-8 characters between the last consumed LF (or
     *         the start of the data) and the cursor
     */
    public function getColumnOffset() {
        // Clamp the cursor into [0, EOF] so the strrpos offset computed
        // below always lies inside the string; an out-of-range offset is
        // a warning on PHP 7 and a ValueError on PHP 8.
        $pos = max(0, min($this->char, $this->EOF));

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before the cursor). With a negative
        // offset the match may start no later than byte ($pos - 1), which
        // is the last byte already consumed.
        if ($pos > 0) {
            $lastLine = strrpos($this->data, "\n", $pos - 1 - $this->EOF);
        } else {
            // Nothing consumed yet, so there is no preceding newline.
            $lastLine = false;
        }

        // Take the bytes of the current line that lie before the cursor.
        if ($lastLine !== false) {
            $findLengthOf = substr($this->data, $lastLine + 1, $pos - 1 - $lastLine);
        } else {
            $findLengthOf = substr($this->data, 0, $pos);
        }

        // Get the length for the string we need.
        if (extension_loaded('iconv')) {
            return iconv_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('mbstring')) {
            return mb_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('xml')) {
            return strlen(utf8_decode($findLengthOf));
        } else {
            // Count ASCII bytes plus UTF-8 lead bytes; continuation bytes
            // are skipped so each character is counted once.
            $count = count_chars($findLengthOf);
            // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
            // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
            return array_sum(array_slice($count, 0, 0x80)) +
                   array_sum(array_slice($count, 0xC2, 0x33));
        }
    }

    /**
     * Consumes and returns the next byte.
     * @note This performs bounds checking
     *
     * @return The byte at the cursor, or false at end-of-file. The cursor
     *         is advanced in both cases.
     */
    public function char() {
        return ($this->char++ < $this->EOF)
            ? $this->data[$this->char - 1]
            : false;
    }

    /**
     * Get all characters until EOF.
     * @note This performs bounds checking
     *
     * @return The unconsumed remainder of the data (the cursor moves to
     *         EOF), or false when nothing is left
     */
    public function remainingChars() {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;
            return $data;
        } else {
            return false;
        }
    }

    /**
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring.
     * @param $bytes Bytes to match.
     * @param $max Optional maximum number of bytes to examine
     * @return The consumed substring (possibly empty), or false at
     *         end-of-file
     */
    public function charsUntil($bytes, $max = null) {
        if ($this->char < $this->EOF) {
            // $max may legitimately be 0, so test for it explicitly.
            if ($max === 0 || $max) {
                $len = strcspn($this->data, $bytes, $this->char, $max);
            } else {
                $len = strcspn($this->data, $bytes, $this->char);
            }
            $string = (string) substr($this->data, $this->char, $len);
            $this->char += $len;
            return $string;
        } else {
            return false;
        }
    }

    /**
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring.
     * @param $bytes Bytes to match.
     * @param $max Optional maximum number of bytes to examine
     * @return The consumed substring (possibly empty), or false at
     *         end-of-file
     */
    public function charsWhile($bytes, $max = null) {
        if ($this->char < $this->EOF) {
            // $max may legitimately be 0, so test for it explicitly.
            if ($max === 0 || $max) {
                $len = strspn($this->data, $bytes, $this->char, $max);
            } else {
                $len = strspn($this->data, $bytes, $this->char);
            }
            $string = (string) substr($this->data, $this->char, $len);
            $this->char += $len;
            return $string;
        } else {
            return false;
        }
    }

    /**
     * Unconsume one character.
     *
     * Does nothing once the cursor has moved beyond EOF, which is the
     * state char() leaves it in after returning false.
     */
    public function unget() {
        if ($this->char <= $this->EOF) {
            $this->char--;
        }
    }
}
|
||||
36
library/HTML5/Parser.php
Normal file
36
library/HTML5/Parser.php
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
<?php
|
||||
|
||||
require_once dirname(__FILE__) . '/Data.php';
|
||||
require_once dirname(__FILE__) . '/InputStream.php';
|
||||
require_once dirname(__FILE__) . '/TreeBuilder.php';
|
||||
require_once dirname(__FILE__) . '/Tokenizer.php';
|
||||
|
||||
/**
|
||||
* Outwards facing interface for HTML5.
|
||||
*/
|
||||
class HTML5_Parser
{
    /**
     * Parses a complete HTML document.
     *
     * @param $text HTML text to parse
     * @param $builder Custom builder implementation, handed to the tokenizer
     * @return Result of the tokenizer's save() call (described upstream as
     *         a DOMDocument; save() is not visible from this file)
     */
    public static function parse($text, $builder = null) {
        $engine = self::tokenizerFor($text, $builder);
        $engine->parse();
        return $engine->save();
    }

    /**
     * Parses a fragment of HTML.
     *
     * @param $text HTML text to parse
     * @param $context String name of context element to pretend parsing is in.
     * @param $builder Custom builder implementation, handed to the tokenizer
     * @return Result of the tokenizer's save() call (described upstream as
     *         a DOMDocument; save() is not visible from this file)
     */
    public static function parseFragment($text, $context = null, $builder = null) {
        $engine = self::tokenizerFor($text, $builder);
        $engine->parseFragment($context);
        return $engine->save();
    }

    /**
     * Builds the tokenizer both entry points drive.
     */
    private static function tokenizerFor($text, $builder) {
        return new HTML5_Tokenizer($text, $builder);
    }
}
|
||||
2307
library/HTML5/Tokenizer.php
Normal file
2307
library/HTML5/Tokenizer.php
Normal file
|
|
@ -0,0 +1,2307 @@
|
|||
<?php
|
||||
|
||||
/*
|
||||
|
||||
Copyright 2007 Jeroen van der Meer <http://jero.net/>
|
||||
Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
|
||||
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included
|
||||
in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
// Some conventions:
|
||||
// /* */ indicates verbatim text from the HTML 5 specification
|
||||
// // indicates regular comments
|
||||
|
||||
// all flags are in hyphenated form
|
||||
|
||||
class HTML5_Tokenizer {
|
||||
/**
|
||||
* Points to an InputStream object.
|
||||
*/
|
||||
protected $stream;
|
||||
|
||||
/**
|
||||
* Tree builder that the tokenizer emits token to.
|
||||
*/
|
||||
private $tree;
|
||||
|
||||
/**
|
||||
* Current content model we are parsing as.
|
||||
*/
|
||||
protected $content_model;
|
||||
|
||||
/**
|
||||
* Current token that is being built, but not yet emitted. Also
|
||||
* is the last token emitted, if applicable.
|
||||
*/
|
||||
protected $token;
|
||||
|
||||
// These are constants describing the content model
|
||||
const PCDATA = 0;
|
||||
const RCDATA = 1;
|
||||
const CDATA = 2;
|
||||
const PLAINTEXT = 3;
|
||||
|
||||
// These are constants describing tokens
|
||||
// XXX should probably be moved somewhere else, probably the
|
||||
// HTML5 class.
|
||||
const DOCTYPE = 0;
|
||||
const STARTTAG = 1;
|
||||
const ENDTAG = 2;
|
||||
const COMMENT = 3;
|
||||
const CHARACTER = 4;
|
||||
const SPACECHARACTER = 5;
|
||||
const EOF = 6;
|
||||
const PARSEERROR = 7;
|
||||
|
||||
// These are constants representing bunches of characters.
|
||||
const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
|
||||
const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
|
||||
const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
|
||||
const DIGIT = '0123456789';
|
||||
const HEX = '0123456789ABCDEFabcdef';
|
||||
const WHITESPACE = "\t\n\x0c ";
|
||||
|
||||
/**
 * Wraps the input in an HTML5_InputStream, selects the tree builder that
 * will receive tokens and starts in the PCDATA content model.
 *
 * @param $data Data to parse
 * @param $builder Optional tree builder to emit tokens to; a new
 *                 HTML5_TreeBuilder is created when none is given
 */
public function __construct($data, $builder = null) {
    $this->stream = new HTML5_InputStream($data);
    // Always assign $this->tree. Assigning it only in the "no builder"
    // case left it null whenever a caller supplied its own builder, so the
    // custom builder was silently ignored.
    $this->tree = $builder ? $builder : new HTML5_TreeBuilder;
    $this->content_model = self::PCDATA;
}
|
||||
|
||||
/**
 * Parses the input as a fragment inside the given context element.
 *
 * The tree builder is asked to set up the context first; if that leaves a
 * content model on the builder, the tokenizer adopts it and clears it on
 * the builder before parsing starts.
 *
 * @param $context Context passed through to the tree builder's
 *                 setupContext()
 */
public function parseFragment($context = null) {
    $builder = $this->tree;
    $builder->setupContext($context);

    $inherited = $builder->content_model;
    if ($inherited) {
        $this->content_model = $inherited;
        $builder->content_model = null;
    }

    $this->parse();
}
|
||||
|
||||
// XXX maybe convert this into an iterator? regardless, this function
|
||||
// and the save function should go into a Parser facade of some sort
|
||||
/**
|
||||
* Performs the actual parsing of the document.
|
||||
*/
|
||||
public function parse() {
|
||||
// Current state
|
||||
$state = 'data';
|
||||
// This is used to avoid having to have look-behind in the data state.
|
||||
$lastFourChars = '';
|
||||
/**
|
||||
* Escape flag as specified by the HTML5 specification: "used to
|
||||
* control the behavior of the tokeniser. It is either true or
|
||||
* false, and initially must be set to the false state."
|
||||
*/
|
||||
$escape = false;
|
||||
//echo "\n\n";
|
||||
while($state !== null) {
|
||||
|
||||
/*echo $state . ' ';
|
||||
switch ($this->content_model) {
|
||||
case self::PCDATA: echo 'PCDATA'; break;
|
||||
case self::RCDATA: echo 'RCDATA'; break;
|
||||
case self::CDATA: echo 'CDATA'; break;
|
||||
case self::PLAINTEXT: echo 'PLAINTEXT'; break;
|
||||
}
|
||||
if ($escape) echo " escape";
|
||||
echo "\n";*/
|
||||
|
||||
switch($state) {
|
||||
case 'data':
|
||||
|
||||
/* Consume the next input character */
|
||||
$char = $this->stream->char();
|
||||
$lastFourChars .= $char;
|
||||
if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
|
||||
|
||||
// see below for meaning
|
||||
$hyp_cond =
|
||||
!$escape &&
|
||||
(
|
||||
$this->content_model === self::RCDATA ||
|
||||
$this->content_model === self::CDATA
|
||||
);
|
||||
$amp_cond =
|
||||
!$escape &&
|
||||
(
|
||||
$this->content_model === self::PCDATA ||
|
||||
$this->content_model === self::RCDATA
|
||||
);
|
||||
$lt_cond =
|
||||
$this->content_model === self::PCDATA ||
|
||||
(
|
||||
(
|
||||
$this->content_model === self::RCDATA ||
|
||||
$this->content_model === self::CDATA
|
||||
) &&
|
||||
!$escape
|
||||
);
|
||||
$gt_cond =
|
||||
$escape &&
|
||||
(
|
||||
$this->content_model === self::RCDATA ||
|
||||
$this->content_model === self::CDATA
|
||||
);
|
||||
|
||||
if($char === '&' && $amp_cond) {
|
||||
/* U+0026 AMPERSAND (&)
|
||||
When the content model flag is set to one of the PCDATA or RCDATA
|
||||
states and the escape flag is false: switch to the
|
||||
character reference data state. Otherwise: treat it as per
|
||||
the "anything else" entry below. */
|
||||
$state = 'characterReferenceData';
|
||||
|
||||
} elseif(
|
||||
$char === '-' &&
|
||||
$hyp_cond &&
|
||||
$lastFourChars === '<!--'
|
||||
) {
|
||||
/*
|
||||
U+002D HYPHEN-MINUS (-)
|
||||
If the content model flag is set to either the RCDATA state or
|
||||
the CDATA state, and the escape flag is false, and there are at
|
||||
least three characters before this one in the input stream, and the
|
||||
last four characters in the input stream, including this one, are
|
||||
U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
|
||||
and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
|
||||
$escape = true;
|
||||
|
||||
/* In any case, emit the input character as a character token. Stay
|
||||
in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '-'
|
||||
));
|
||||
// We do the "any case" part as part of "anything else".
|
||||
|
||||
/* U+003C LESS-THAN SIGN (<) */
|
||||
} elseif($char === '<' && $lt_cond) {
|
||||
/* When the content model flag is set to the PCDATA state: switch
|
||||
to the tag open state.
|
||||
|
||||
When the content model flag is set to either the RCDATA state or
|
||||
the CDATA state and the escape flag is false: switch to the tag
|
||||
open state.
|
||||
|
||||
Otherwise: treat it as per the "anything else" entry below. */
|
||||
$state = 'tagOpen';
|
||||
|
||||
/* U+003E GREATER-THAN SIGN (>) */
|
||||
} elseif(
|
||||
$char === '>' &&
|
||||
$gt_cond &&
|
||||
substr($lastFourChars, 1) === '-->'
|
||||
) {
|
||||
/* If the content model flag is set to either the RCDATA state or
|
||||
the CDATA state, and the escape flag is true, and the last three
|
||||
characters in the input stream including this one are U+002D
|
||||
HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
|
||||
set the escape flag to false. */
|
||||
$escape = false;
|
||||
|
||||
/* In any case, emit the input character as a character token.
|
||||
Stay in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '>'
|
||||
));
|
||||
// We do the "any case" part as part of "anything else".
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Emit an end-of-file token. */
|
||||
$state = null;
|
||||
$this->tree->emitToken(array(
|
||||
'type' => self::EOF
|
||||
));
|
||||
|
||||
} elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
// Directly after emitting a token you switch back to the "data
|
||||
// state". At that point spaceCharacters are important so they are
|
||||
// emitted separately.
|
||||
$chars = $this->stream->charsWhile(self::WHITESPACE);
|
||||
$this->emitToken(array(
|
||||
'type' => self::SPACECHARACTER,
|
||||
'data' => $char . $chars
|
||||
));
|
||||
$lastFourChars .= $chars;
|
||||
if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
THIS IS AN OPTIMIZATION: Get as many character that
|
||||
otherwise would also be treated as a character token and emit it
|
||||
as a single character token. Stay in the data state. */
|
||||
|
||||
$mask = '';
|
||||
if ($hyp_cond) $mask .= '-';
|
||||
if ($amp_cond) $mask .= '&';
|
||||
if ($lt_cond) $mask .= '<';
|
||||
if ($gt_cond) $mask .= '>';
|
||||
|
||||
if ($mask === '') {
|
||||
$chars = $this->stream->remainingChars();
|
||||
} else {
|
||||
$chars = $this->stream->charsUntil($mask);
|
||||
}
|
||||
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => $char . $chars
|
||||
));
|
||||
|
||||
$lastFourChars .= $chars;
|
||||
if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
|
||||
|
||||
$state = 'data';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'characterReferenceData':
|
||||
/* (This cannot happen if the content model flag
|
||||
is set to the CDATA state.) */
|
||||
|
||||
/* Attempt to consume a character reference, with no
|
||||
additional allowed character. */
|
||||
$entity = $this->consumeCharacterReference();
|
||||
|
||||
/* If nothing is returned, emit a U+0026 AMPERSAND
|
||||
character token. Otherwise, emit the character token that
|
||||
was returned. */
|
||||
// This is all done when consuming the character reference.
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => $entity
|
||||
));
|
||||
|
||||
/* Finally, switch to the data state. */
|
||||
$state = 'data';
|
||||
break;
|
||||
|
||||
case 'tagOpen':
|
||||
$char = $this->stream->char();
|
||||
|
||||
switch($this->content_model) {
|
||||
case self::RCDATA:
|
||||
case self::CDATA:
|
||||
/* Consume the next input character. If it is a
|
||||
U+002F SOLIDUS (/) character, switch to the close
|
||||
tag open state. Otherwise, emit a U+003C LESS-THAN
|
||||
SIGN character token and reconsume the current input
|
||||
character in the data state. */
|
||||
// We consumed above.
|
||||
|
||||
if($char === '/') {
|
||||
$state = 'closeTagOpen';
|
||||
|
||||
} else {
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '<'
|
||||
));
|
||||
|
||||
$this->stream->unget();
|
||||
|
||||
$state = 'data';
|
||||
}
|
||||
break;
|
||||
|
||||
case self::PCDATA:
|
||||
/* If the content model flag is set to the PCDATA state
|
||||
Consume the next input character: */
|
||||
// We consumed above.
|
||||
|
||||
if($char === '!') {
|
||||
/* U+0021 EXCLAMATION MARK (!)
|
||||
Switch to the markup declaration open state. */
|
||||
$state = 'markupDeclarationOpen';
|
||||
|
||||
} elseif($char === '/') {
|
||||
/* U+002F SOLIDUS (/)
|
||||
Switch to the close tag open state. */
|
||||
$state = 'closeTagOpen';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
|
||||
Create a new start tag token, set its tag name to the lowercase
|
||||
version of the input character (add 0x0020 to the character's code
|
||||
point), then switch to the tag name state. (Don't emit the token
|
||||
yet; further details will be filled in before it is emitted.) */
|
||||
$this->token = array(
|
||||
'name' => strtolower($char),
|
||||
'type' => self::STARTTAG,
|
||||
'attr' => array()
|
||||
);
|
||||
|
||||
$state = 'tagName';
|
||||
|
||||
} elseif('a' <= $char && $char <= 'z') {
|
||||
/* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
|
||||
Create a new start tag token, set its tag name to the input
|
||||
character, then switch to the tag name state. (Don't emit
|
||||
the token yet; further details will be filled in before it
|
||||
is emitted.) */
|
||||
$this->token = array(
|
||||
'name' => $char,
|
||||
'type' => self::STARTTAG,
|
||||
'attr' => array()
|
||||
);
|
||||
|
||||
$state = 'tagName';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Emit a U+003C LESS-THAN SIGN character token and a
|
||||
U+003E GREATER-THAN SIGN character token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-tag-name-but-got-right-bracket'
|
||||
));
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '<>'
|
||||
));
|
||||
|
||||
$state = 'data';
|
||||
|
||||
} elseif($char === '?') {
|
||||
/* U+003F QUESTION MARK (?)
|
||||
Parse error. Switch to the bogus comment state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-tag-name-but-got-question-mark'
|
||||
));
|
||||
$this->token = array(
|
||||
'data' => '?',
|
||||
'type' => self::COMMENT
|
||||
);
|
||||
$state = 'bogusComment';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Emit a U+003C LESS-THAN SIGN character token and
|
||||
reconsume the current input character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-tag-name'
|
||||
));
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '<'
|
||||
));
|
||||
|
||||
$state = 'data';
|
||||
$this->stream->unget();
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'closeTagOpen':
|
||||
if (
|
||||
$this->content_model === self::RCDATA ||
|
||||
$this->content_model === self::CDATA
|
||||
) {
|
||||
/* If the content model flag is set to the RCDATA or CDATA
|
||||
states... */
|
||||
$name = strtolower($this->stream->charsWhile(self::ALPHA));
|
||||
$following = $this->stream->char();
|
||||
$this->stream->unget();
|
||||
if (
|
||||
!$this->token ||
|
||||
$this->token['name'] !== $name ||
|
||||
$this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
|
||||
) {
|
||||
/* if no start tag token has ever been emitted by this instance
|
||||
of the tokenizer (fragment case), or, if the next few
|
||||
characters do not match the tag name of the last start tag
|
||||
token emitted (compared in an ASCII case-insensitive manner),
|
||||
or if they do but they are not immediately followed by one of
|
||||
the following characters:
|
||||
|
||||
* U+0009 CHARACTER TABULATION
|
||||
* U+000A LINE FEED (LF)
|
||||
* U+000C FORM FEED (FF)
|
||||
* U+0020 SPACE
|
||||
* U+003E GREATER-THAN SIGN (>)
|
||||
* U+002F SOLIDUS (/)
|
||||
* EOF
|
||||
|
||||
...then emit a U+003C LESS-THAN SIGN character token, a
|
||||
U+002F SOLIDUS character token, and switch to the data
|
||||
state to process the next input character. */
|
||||
// XXX: Probably ought to replace in_array with $following === x ||...
|
||||
|
||||
// We also need to emit $name now we've consumed that, as we
|
||||
// know it'll just be emitted as a character token.
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '</' . $name
|
||||
));
|
||||
|
||||
$state = 'data';
|
||||
} else {
|
||||
// This matches what would happen if we actually did the
|
||||
// otherwise below (but we can't because we've consumed too
|
||||
// much).
|
||||
|
||||
// Start the end tag token with the name we already have.
|
||||
$this->token = array(
|
||||
'name' => $name,
|
||||
'type' => self::ENDTAG
|
||||
);
|
||||
|
||||
// Change to tag name state.
|
||||
$state = 'tagName';
|
||||
}
|
||||
} elseif ($this->content_model === self::PCDATA) {
|
||||
/* Otherwise, if the content model flag is set to the PCDATA
|
||||
state [...]: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Create a new end tag token, set its tag name to the lowercase version
|
||||
of the input character (add 0x0020 to the character's code point), then
|
||||
switch to the tag name state. (Don't emit the token yet; further details
|
||||
will be filled in before it is emitted.) */
|
||||
$this->token = array(
|
||||
'name' => strtolower($char),
|
||||
'type' => self::ENDTAG
|
||||
);
|
||||
|
||||
$state = 'tagName';
|
||||
|
||||
} elseif ('a' <= $char && $char <= 'z') {
|
||||
/* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
|
||||
Create a new end tag token, set its tag name to the
|
||||
input character, then switch to the tag name state.
|
||||
(Don't emit the token yet; further details will be
|
||||
filled in before it is emitted.) */
|
||||
$this->token = array(
|
||||
'name' => $char,
|
||||
'type' => self::ENDTAG
|
||||
);
|
||||
|
||||
$state = 'tagName';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-closing-tag-but-got-right-bracket'
|
||||
));
|
||||
$state = 'data';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
|
||||
SOLIDUS character token. Reconsume the EOF character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-closing-tag-but-got-eof'
|
||||
));
|
||||
$this->emitToken(array(
|
||||
'type' => self::CHARACTER,
|
||||
'data' => '</'
|
||||
));
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Parse error. Switch to the bogus comment state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-closing-tag-but-got-char'
|
||||
));
|
||||
$this->token = array(
|
||||
'data' => $char,
|
||||
'type' => self::COMMENT
|
||||
);
|
||||
$state = 'bogusComment';
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'tagName':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Switch to the before attribute name state. */
|
||||
$state = 'beforeAttributeName';
|
||||
|
||||
} elseif($char === '/') {
|
||||
/* U+002F SOLIDUS (/)
|
||||
Switch to the self-closing start tag state. */
|
||||
$state = 'selfClosingStartTag';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Append the lowercase version of the current input
|
||||
character (add 0x0020 to the character's code point) to
|
||||
the current tag token's tag name. Stay in the tag name state. */
|
||||
$chars = $this->stream->charsWhile(self::UPPER_ALPHA);
|
||||
|
||||
$this->token['name'] .= strtolower($char . $chars);
|
||||
$state = 'tagName';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-tag-name'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current tag token's tag name.
|
||||
Stay in the tag name state. */
|
||||
$chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
|
||||
|
||||
$this->token['name'] .= $char . $chars;
|
||||
$state = 'tagName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'beforeAttributeName':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
// this conditional is optimized, check bottom
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the before attribute name state. */
|
||||
$state = 'beforeAttributeName';
|
||||
|
||||
} elseif($char === '/') {
|
||||
/* U+002F SOLIDUS (/)
|
||||
Switch to the self-closing start tag state. */
|
||||
$state = 'selfClosingStartTag';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Start a new attribute in the current tag token. Set that
|
||||
attribute's name to the lowercase version of the current
|
||||
input character (add 0x0020 to the character's code
|
||||
point), and its value to the empty string. Switch to the
|
||||
attribute name state.*/
|
||||
$this->token['attr'][] = array(
|
||||
'name' => strtolower($char),
|
||||
'value' => ''
|
||||
);
|
||||
|
||||
$state = 'attributeName';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-attribute-name-but-got-eof'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
U+0027 APOSTROPHE (')
|
||||
U+003D EQUALS SIGN (=)
|
||||
Parse error. Treat it as per the "anything else" entry
|
||||
below. */
|
||||
if($char === '"' || $char === "'" || $char === '=') {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'invalid-character-in-attribute-name'
|
||||
));
|
||||
}
|
||||
|
||||
/* Anything else
|
||||
Start a new attribute in the current tag token. Set that attribute's
|
||||
name to the current input character, and its value to the empty string.
|
||||
Switch to the attribute name state. */
|
||||
$this->token['attr'][] = array(
|
||||
'name' => $char,
|
||||
'value' => ''
|
||||
);
|
||||
|
||||
$state = 'attributeName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'attributeName':
|
||||
// Consume the next input character:
|
||||
$char = $this->stream->char();
|
||||
|
||||
// this conditional is optimized, check bottom
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Switch to the after attribute name state. */
|
||||
$state = 'afterAttributeName';
|
||||
|
||||
} elseif($char === '/') {
|
||||
/* U+002F SOLIDUS (/)
|
||||
Switch to the self-closing start tag state. */
|
||||
$state = 'selfClosingStartTag';
|
||||
|
||||
} elseif($char === '=') {
|
||||
/* U+003D EQUALS SIGN (=)
|
||||
Switch to the before attribute value state. */
|
||||
$state = 'beforeAttributeValue';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Append the lowercase version of the current input
|
||||
character (add 0x0020 to the character's code point) to
|
||||
the current attribute's name. Stay in the attribute name
|
||||
state. */
|
||||
$chars = $this->stream->charsWhile(self::UPPER_ALPHA);
|
||||
|
||||
$last = count($this->token['attr']) - 1;
|
||||
$this->token['attr'][$last]['name'] .= strtolower($char . $chars);
|
||||
|
||||
$state = 'attributeName';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-attribute-name'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
U+0027 APOSTROPHE (')
|
||||
Parse error. Treat it as per the "anything else"
|
||||
entry below. */
|
||||
if($char === '"' || $char === "'") {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'invalid-character-in-attribute-name'
|
||||
));
|
||||
}
|
||||
|
||||
/* Anything else
|
||||
Append the current input character to the current attribute's name.
|
||||
Stay in the attribute name state. */
|
||||
$chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
|
||||
|
||||
$last = count($this->token['attr']) - 1;
|
||||
$this->token['attr'][$last]['name'] .= $char . $chars;
|
||||
|
||||
$state = 'attributeName';
|
||||
}
|
||||
|
||||
/* When the user agent leaves the attribute name state
|
||||
(and before emitting the tag token, if appropriate), the
|
||||
complete attribute's name must be compared to the other
|
||||
attributes on the same token; if there is already an
|
||||
attribute on the token with the exact same name, then this
|
||||
is a parse error and the new attribute must be dropped, along
|
||||
with the value that gets associated with it (if any). */
|
||||
// this might be implemented in the emitToken method
|
||||
break;
|
||||
|
||||
case 'afterAttributeName':
|
||||
// Consume the next input character:
|
||||
$char = $this->stream->char();
|
||||
|
||||
// this is an optimized conditional, check the bottom
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the after attribute name state. */
|
||||
$state = 'afterAttributeName';
|
||||
|
||||
} elseif($char === '/') {
|
||||
/* U+002F SOLIDUS (/)
|
||||
Switch to the self-closing start tag state. */
|
||||
$state = 'selfClosingStartTag';
|
||||
|
||||
} elseif($char === '=') {
|
||||
/* U+003D EQUALS SIGN (=)
|
||||
Switch to the before attribute value state. */
|
||||
$state = 'beforeAttributeValue';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Start a new attribute in the current tag token. Set that
|
||||
attribute's name to the lowercase version of the current
|
||||
input character (add 0x0020 to the character's code
|
||||
point), and its value to the empty string. Switch to the
|
||||
attribute name state. */
|
||||
$this->token['attr'][] = array(
|
||||
'name' => strtolower($char),
|
||||
'value' => ''
|
||||
);
|
||||
|
||||
$state = 'attributeName';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-end-of-tag-but-got-eof'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
U+0027 APOSTROPHE (')
|
||||
Parse error. Treat it as per the "anything else"
|
||||
entry below. */
|
||||
if($char === '"' || $char === "'") {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'invalid-character-after-attribute-name'
|
||||
));
|
||||
}
|
||||
|
||||
/* Anything else
|
||||
Start a new attribute in the current tag token. Set that attribute's
|
||||
name to the current input character, and its value to the empty string.
|
||||
Switch to the attribute name state. */
|
||||
$this->token['attr'][] = array(
|
||||
'name' => $char,
|
||||
'value' => ''
|
||||
);
|
||||
|
||||
$state = 'attributeName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'beforeAttributeValue':
|
||||
// Consume the next input character:
|
||||
$char = $this->stream->char();
|
||||
|
||||
// this is an optimized conditional
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the before attribute value state. */
|
||||
$state = 'beforeAttributeValue';
|
||||
|
||||
} elseif($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Switch to the attribute value (double-quoted) state. */
|
||||
$state = 'attributeValueDoubleQuoted';
|
||||
|
||||
} elseif($char === '&') {
|
||||
/* U+0026 AMPERSAND (&)
|
||||
Switch to the attribute value (unquoted) state and reconsume
|
||||
this input character. */
|
||||
$this->stream->unget();
|
||||
$state = 'attributeValueUnquoted';
|
||||
|
||||
} elseif($char === '\'') {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Switch to the attribute value (single-quoted) state. */
|
||||
$state = 'attributeValueSingleQuoted';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-attribute-value-but-got-right-bracket'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume
|
||||
the character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-attribute-value-but-got-eof'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* U+003D EQUALS SIGN (=)
|
||||
Parse error. Treat it as per the "anything else" entry below. */
|
||||
if($char === '=') {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'equals-in-unquoted-attribute-value'
|
||||
));
|
||||
}
|
||||
|
||||
/* Anything else
|
||||
Append the current input character to the current attribute's value.
|
||||
Switch to the attribute value (unquoted) state. */
|
||||
$last = count($this->token['attr']) - 1;
|
||||
$this->token['attr'][$last]['value'] .= $char;
|
||||
|
||||
$state = 'attributeValueUnquoted';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'attributeValueDoubleQuoted':
|
||||
// Consume the next input character:
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Switch to the after attribute value (quoted) state. */
|
||||
$state = 'afterAttributeValueQuoted';
|
||||
|
||||
} elseif($char === '&') {
|
||||
/* U+0026 AMPERSAND (&)
|
||||
Switch to the character reference in attribute value
|
||||
state, with the additional allowed character
|
||||
being U+0022 QUOTATION MARK ("). */
|
||||
$this->characterReferenceInAttributeValue('"');
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the character
|
||||
in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-attribute-value-double-quote'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current attribute's value.
|
||||
Stay in the attribute value (double-quoted) state. */
|
||||
$chars = $this->stream->charsUntil('"&');
|
||||
|
||||
$last = count($this->token['attr']) - 1;
|
||||
$this->token['attr'][$last]['value'] .= $char . $chars;
|
||||
|
||||
$state = 'attributeValueDoubleQuoted';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'attributeValueSingleQuoted':
|
||||
// Consume the next input character:
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "'") {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Switch to the after attribute value (quoted) state. */
|
||||
$state = 'afterAttributeValueQuoted';
|
||||
|
||||
} elseif($char === '&') {
|
||||
/* U+0026 AMPERSAND (&)
|
||||
Switch to the character reference in attribute value state, with the additional allowed character being U+0027 APOSTROPHE ('). */
|
||||
$this->characterReferenceInAttributeValue("'");
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the character
|
||||
in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-attribute-value-single-quote'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current attribute's value.
|
||||
Stay in the attribute value (single-quoted) state. */
|
||||
$chars = $this->stream->charsUntil("'&");
|
||||
|
||||
$last = count($this->token['attr']) - 1;
|
||||
$this->token['attr'][$last]['value'] .= $char . $chars;
|
||||
|
||||
$state = 'attributeValueSingleQuoted';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'attributeValueUnquoted':
|
||||
// Consume the next input character:
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Switch to the before attribute name state. */
|
||||
$state = 'beforeAttributeName';
|
||||
|
||||
} elseif($char === '&') {
|
||||
/* U+0026 AMPERSAND (&)
|
||||
Switch to the character reference in attribute value state (no additional allowed character is passed here). */
|
||||
$this->characterReferenceInAttributeValue();
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume
|
||||
the character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-attribute-value-no-quotes'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
U+0027 APOSTROPHE (')
|
||||
U+003D EQUALS SIGN (=)
|
||||
Parse error. Treat it as per the "anything else"
|
||||
entry below. */
|
||||
if($char === '"' || $char === "'" || $char === '=') {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-character-in-unquoted-attribute-value'
|
||||
));
|
||||
}
|
||||
|
||||
/* Anything else
|
||||
Append the current input character to the current attribute's value.
|
||||
Stay in the attribute value (unquoted) state. */
|
||||
$chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
|
||||
|
||||
$last = count($this->token['attr']) - 1;
|
||||
$this->token['attr'][$last]['value'] .= $char . $chars;
|
||||
|
||||
$state = 'attributeValueUnquoted';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'afterAttributeValueQuoted':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Switch to the before attribute name state. */
|
||||
$state = 'beforeAttributeName';
|
||||
|
||||
} elseif ($char === '/') {
|
||||
/* U+002F SOLIDUS (/)
|
||||
Switch to the self-closing start tag state. */
|
||||
$state = 'selfClosingStartTag';
|
||||
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-EOF-after-attribute-value'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Reconsume the character in the before attribute
|
||||
name state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-character-after-attribute-value'
|
||||
));
|
||||
$this->stream->unget();
|
||||
$state = 'beforeAttributeName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'selfClosingStartTag':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Set the self-closing flag of the current tag token.
|
||||
Emit the current tag token. Switch to the data state. */
|
||||
// not sure if this is the name we want
|
||||
$this->token['self-closing'] = true;
|
||||
/* When an end tag token is emitted with its self-closing flag set,
|
||||
that is a parse error. */
|
||||
if ($this->token['type'] === self::ENDTAG) {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'self-closing-end-tag'
|
||||
));
|
||||
}
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the current tag token. Reconsume the
|
||||
EOF character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-eof-after-self-closing'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Reconsume the character in the before attribute name state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-character-after-self-closing'
|
||||
));
|
||||
$this->stream->unget();
|
||||
$state = 'beforeAttributeName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'bogusComment':
|
||||
/* (This can only happen if the content model flag is set to the PCDATA state.) */
|
||||
/* Consume every character up to the first U+003E GREATER-THAN SIGN
|
||||
character (>) or the end of the file (EOF), whichever comes first. Emit
|
||||
a comment token whose data is the concatenation of all the characters
|
||||
starting from and including the character that caused the state machine
|
||||
to switch into the bogus comment state, up to and including the last
|
||||
consumed character before the U+003E character, if any, or up to the
|
||||
end of the file otherwise. (If the comment was started by the end of
|
||||
the file (EOF), the token is empty.) */
|
||||
$this->token['data'] .= (string) $this->stream->charsUntil('>');
|
||||
$this->stream->char();
|
||||
|
||||
$this->emitToken($this->token);
|
||||
|
||||
/* Switch to the data state. */
|
||||
$state = 'data';
|
||||
break;
|
||||
|
||||
case 'markupDeclarationOpen':
|
||||
// Consume for below
|
||||
$hyphens = $this->stream->charsWhile('-', 2);
|
||||
if ($hyphens === '-') {
|
||||
$this->stream->unget();
|
||||
}
|
||||
if ($hyphens !== '--') {
|
||||
$alpha = $this->stream->charsWhile(self::ALPHA, 7);
|
||||
}
|
||||
|
||||
/* If the next two characters are both U+002D HYPHEN-MINUS (-)
|
||||
characters, consume those two characters, create a comment token whose
|
||||
data is the empty string, and switch to the comment state. */
|
||||
if($hyphens === '--') {
|
||||
$state = 'commentStart';
|
||||
$this->token = array(
|
||||
'data' => '',
|
||||
'type' => self::COMMENT
|
||||
);
|
||||
|
||||
/* Otherwise if the next seven characters are a case-insensitive match
|
||||
for the word "DOCTYPE", then consume those characters and switch to the
|
||||
DOCTYPE state. */
|
||||
} elseif(strtoupper($alpha) === 'DOCTYPE') {
|
||||
$state = 'doctype';
|
||||
|
||||
// XXX not implemented
|
||||
/* Otherwise, if the insertion mode is "in foreign content"
|
||||
and the current node is not an element in the HTML namespace
|
||||
and the next seven characters are an ASCII case-sensitive
|
||||
match for the string "[CDATA[" (the five uppercase letters
|
||||
"CDATA" with a U+005B LEFT SQUARE BRACKET character before
|
||||
and after), then consume those characters and switch to the
|
||||
CDATA section state (which is unrelated to the content model
|
||||
flag's CDATA state). */
|
||||
|
||||
/* Otherwise, it is a parse error. Switch to the bogus comment state.
|
||||
The next character that is consumed, if any, is the first character
|
||||
that will be in the comment. */
|
||||
} else {
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-dashes-or-doctype'
|
||||
));
|
||||
$this->token = array(
|
||||
'data' => (string) $alpha,
|
||||
'type' => self::COMMENT
|
||||
);
|
||||
$state = 'bogusComment';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'commentStart':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === '-') {
|
||||
/* U+002D HYPHEN-MINUS (-)
|
||||
Switch to the comment start dash state. */
|
||||
$state = 'commentStartDash';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Emit the comment token. Switch to the
|
||||
data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'incorrect-comment'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the comment token. Reconsume the
|
||||
EOF character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-comment'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the input character to the comment token's
|
||||
data. Switch to the comment state. */
|
||||
$this->token['data'] .= $char;
|
||||
$state = 'comment';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'commentStartDash':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
if ($char === '-') {
|
||||
/* U+002D HYPHEN-MINUS (-)
|
||||
Switch to the comment end state */
|
||||
$state = 'commentEnd';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Emit the comment token. Switch to the
|
||||
data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'incorrect-comment'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* Parse error. Emit the comment token. Reconsume the
|
||||
EOF character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-comment'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
$this->token['data'] .= '-' . $char;
|
||||
$state = 'comment';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'comment':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === '-') {
|
||||
/* U+002D HYPHEN-MINUS (-)
|
||||
Switch to the comment end dash state */
|
||||
$state = 'commentEndDash';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the comment token. Reconsume the EOF character
|
||||
in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-comment'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the input character to the comment token's data. Stay in
|
||||
the comment state. */
|
||||
$chars = $this->stream->charsUntil('-');
|
||||
|
||||
$this->token['data'] .= $char . $chars;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'commentEndDash':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === '-') {
|
||||
/* U+002D HYPHEN-MINUS (-)
|
||||
Switch to the comment end state */
|
||||
$state = 'commentEnd';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the comment token. Reconsume the EOF character
|
||||
in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-comment-end-dash'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Append a U+002D HYPHEN-MINUS (-) character and the input
|
||||
character to the comment token's data. Switch to the comment state. */
|
||||
$this->token['data'] .= '-'.$char;
|
||||
$state = 'comment';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'commentEnd':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the comment token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif($char === '-') {
|
||||
/* U+002D HYPHEN-MINUS (-)
|
||||
Parse error. Append a U+002D HYPHEN-MINUS (-) character
|
||||
to the comment token's data. Stay in the comment end
|
||||
state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-dash-after-double-dash-in-comment'
|
||||
));
|
||||
$this->token['data'] .= '-';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Emit the comment token. Reconsume the
|
||||
EOF character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-comment-double-dash'
|
||||
));
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Append two U+002D HYPHEN-MINUS (-)
|
||||
characters and the input character to the comment token's
|
||||
data. Switch to the comment state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-char-in-comment'
|
||||
));
|
||||
$this->token['data'] .= '--'.$char;
|
||||
$state = 'comment';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'doctype':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Switch to the before DOCTYPE name state. */
|
||||
$state = 'beforeDoctypeName';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Reconsume the current character in the
|
||||
before DOCTYPE name state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'need-space-after-doctype'
|
||||
));
|
||||
$this->stream->unget();
|
||||
$state = 'beforeDoctypeName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'beforeDoctypeName':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the before DOCTYPE name state. */
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Create a new DOCTYPE token. Set its
|
||||
force-quirks flag to on. Emit the token. Switch to the
|
||||
data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-doctype-name-but-got-right-bracket'
|
||||
));
|
||||
$this->emitToken(array(
|
||||
'name' => '',
|
||||
'type' => self::DOCTYPE,
|
||||
'force-quirks' => true,
|
||||
'error' => true
|
||||
));
|
||||
|
||||
$state = 'data';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Create a new DOCTYPE token. Set the token's name to the
|
||||
lowercase version of the input character (add 0x0020 to
|
||||
the character's code point). Switch to the DOCTYPE name
|
||||
state. */
|
||||
$this->token = array(
|
||||
'name' => strtolower($char),
|
||||
'type' => self::DOCTYPE,
|
||||
'error' => true
|
||||
);
|
||||
|
||||
$state = 'doctypeName';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Create a new DOCTYPE token. Set its
|
||||
force-quirks flag to on. Emit the token. Reconsume the
|
||||
EOF character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-doctype-name-but-got-eof'
|
||||
));
|
||||
$this->emitToken(array(
|
||||
'name' => '',
|
||||
'type' => self::DOCTYPE,
|
||||
'force-quirks' => true,
|
||||
'error' => true
|
||||
));
|
||||
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Create a new DOCTYPE token. Set the token's name to the
|
||||
current input character. Switch to the DOCTYPE name state. */
|
||||
$this->token = array(
|
||||
'name' => $char,
|
||||
'type' => self::DOCTYPE,
|
||||
'error' => true
|
||||
);
|
||||
|
||||
$state = 'doctypeName';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'doctypeName':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Switch to the after DOCTYPE name state. */
|
||||
$state = 'afterDoctypeName';
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif('A' <= $char && $char <= 'Z') {
|
||||
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
|
||||
Append the lowercase version of the input character
|
||||
(add 0x0020 to the character's code point) to the current
|
||||
DOCTYPE token's name. Stay in the DOCTYPE name state. */
|
||||
$this->token['name'] .= strtolower($char);
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype-name'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current
|
||||
DOCTYPE token's name. Stay in the DOCTYPE name state. */
|
||||
$this->token['name'] .= $char;
|
||||
}
|
||||
|
||||
// XXX this is probably some sort of quirks mode designation,
|
||||
// check tree-builder to be sure. In general 'error' needs
|
||||
// to be specc'ified, this probably means removing it at the end
|
||||
$this->token['error'] = ($this->token['name'] === 'HTML')
|
||||
? false
|
||||
: true;
|
||||
break;
|
||||
|
||||
case 'afterDoctypeName':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the after DOCTYPE name state. */
|
||||
|
||||
} elseif($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else */
|
||||
|
||||
$nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
|
||||
if ($nextSix === 'PUBLIC') {
|
||||
/* If the next six characters are an ASCII
|
||||
case-insensitive match for the word "PUBLIC", then
|
||||
consume those characters and switch to the before
|
||||
DOCTYPE public identifier state. */
|
||||
$state = 'beforeDoctypePublicIdentifier';
|
||||
|
||||
} elseif ($nextSix === 'SYSTEM') {
|
||||
/* Otherwise, if the next six characters are an ASCII
|
||||
case-insensitive match for the word "SYSTEM", then
|
||||
consume those characters and switch to the before
|
||||
DOCTYPE system identifier state. */
|
||||
$state = 'beforeDoctypeSystemIdentifier';
|
||||
|
||||
} else {
|
||||
/* Otherwise, this is the parse error. Set the DOCTYPE
|
||||
token's force-quirks flag to on. Switch to the bogus
|
||||
DOCTYPE state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'expected-space-or-right-bracket-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->token['error'] = true;
|
||||
$state = 'bogusDoctype';
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'beforeDoctypePublicIdentifier':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the before DOCTYPE public identifier state. */
|
||||
} elseif ($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Set the DOCTYPE token's public identifier to the empty
|
||||
string (not missing), then switch to the DOCTYPE public
|
||||
identifier (double-quoted) state. */
|
||||
$this->token['public'] = '';
|
||||
$state = 'doctypePublicIdentifierDoubleQuoted';
|
||||
} elseif ($char === "'") {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Set the DOCTYPE token's public identifier to the empty
|
||||
string (not missing), then switch to the DOCTYPE public
|
||||
identifier (single-quoted) state. */
|
||||
$this->token['public'] = '';
|
||||
$state = 'doctypePublicIdentifierSingleQuoted';
|
||||
} elseif ($char === '>') {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-end-of-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks
|
||||
flag to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Switch to the bogus DOCTYPE state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-char-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$state = 'bogusDoctype';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'doctypePublicIdentifierDoubleQuoted':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Switch to the after DOCTYPE public identifier state. */
|
||||
$state = 'afterDoctypePublicIdentifier';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-end-of-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current
|
||||
DOCTYPE token's public identifier. Stay in the DOCTYPE
|
||||
public identifier (double-quoted) state. */
|
||||
$this->token['public'] .= $char;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'doctypePublicIdentifierSingleQuoted':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === "'") {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Switch to the after DOCTYPE public identifier state. */
|
||||
$state = 'afterDoctypePublicIdentifier';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-end-of-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current
|
||||
DOCTYPE token's public identifier. Stay in the DOCTYPE
|
||||
public identifier (double-quoted) state. */
|
||||
$this->token['public'] .= $char;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'afterDoctypePublicIdentifier':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the after DOCTYPE public identifier state. */
|
||||
} elseif ($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Set the DOCTYPE token's system identifier to the
|
||||
empty string (not missing), then switch to the DOCTYPE
|
||||
system identifier (double-quoted) state. */
|
||||
$this->token['system'] = '';
|
||||
$state = 'doctypeSystemIdentifierDoubleQuoted';
|
||||
} elseif ($char === "'") {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Set the DOCTYPE token's system identifier to the
|
||||
empty string (not missing), then switch to the DOCTYPE
|
||||
system identifier (single-quoted) state. */
|
||||
$this->token['system'] = '';
|
||||
$state = 'doctypeSystemIdentifierSingleQuoted';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks
|
||||
flag to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Switch to the bogus DOCTYPE state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-char-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$state = 'bogusDoctype';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'beforeDoctypeSystemIdentifier':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the before DOCTYPE system identifier state. */
|
||||
} elseif ($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Set the DOCTYPE token's system identifier to the empty
|
||||
string (not missing), then switch to the DOCTYPE system
|
||||
identifier (double-quoted) state. */
|
||||
$this->token['system'] = '';
|
||||
$state = 'doctypeSystemIdentifierDoubleQuoted';
|
||||
} elseif ($char === "'") {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Set the DOCTYPE token's system identifier to the empty
|
||||
string (not missing), then switch to the DOCTYPE system
|
||||
identifier (single-quoted) state. */
|
||||
$this->token['system'] = '';
|
||||
$state = 'doctypeSystemIdentifierSingleQuoted';
|
||||
} elseif ($char === '>') {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-char-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks
|
||||
flag to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Switch to the bogus DOCTYPE state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-char-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$state = 'bogusDoctype';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'doctypeSystemIdentifierDoubleQuoted':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === '"') {
|
||||
/* U+0022 QUOTATION MARK (")
|
||||
Switch to the after DOCTYPE system identifier state. */
|
||||
$state = 'afterDoctypeSystemIdentifier';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-end-of-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current
|
||||
DOCTYPE token's system identifier. Stay in the DOCTYPE
|
||||
system identifier (double-quoted) state. */
|
||||
$this->token['system'] .= $char;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'doctypeSystemIdentifierSingleQuoted':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === "'") {
|
||||
/* U+0027 APOSTROPHE (')
|
||||
Switch to the after DOCTYPE system identifier state. */
|
||||
$state = 'afterDoctypeSystemIdentifier';
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-end-of-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* EOF
|
||||
Parse error. Set the DOCTYPE token's force-quirks flag
|
||||
to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Append the current input character to the current
|
||||
DOCTYPE token's system identifier. Stay in the DOCTYPE
|
||||
system identifier (double-quoted) state. */
|
||||
$this->token['system'] .= $char;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'afterDoctypeSystemIdentifier':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
/* U+0009 CHARACTER TABULATION
|
||||
U+000A LINE FEED (LF)
|
||||
U+000C FORM FEED (FF)
|
||||
U+0020 SPACE
|
||||
Stay in the after DOCTYPE system identifier state. */
|
||||
} elseif ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the current DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
} elseif ($char === false) {
|
||||
/* Parse error. Set the DOCTYPE token's force-quirks
|
||||
flag to on. Emit that DOCTYPE token. Reconsume the EOF
|
||||
character in the data state. */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'eof-in-doctype'
|
||||
));
|
||||
$this->token['force-quirks'] = true;
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
} else {
|
||||
/* Anything else
|
||||
Parse error. Switch to the bogus DOCTYPE state.
|
||||
(This does not set the DOCTYPE token's force-quirks
|
||||
flag to on.) */
|
||||
$this->emitToken(array(
|
||||
'type' => self::PARSEERROR,
|
||||
'data' => 'unexpected-char-in-doctype'
|
||||
));
|
||||
$state = 'bogusDoctype';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'bogusDoctype':
|
||||
/* Consume the next input character: */
|
||||
$char = $this->stream->char();
|
||||
|
||||
if ($char === '>') {
|
||||
/* U+003E GREATER-THAN SIGN (>)
|
||||
Emit the DOCTYPE token. Switch to the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$state = 'data';
|
||||
|
||||
} elseif($char === false) {
|
||||
/* EOF
|
||||
Emit the DOCTYPE token. Reconsume the EOF character in
|
||||
the data state. */
|
||||
$this->emitToken($this->token);
|
||||
$this->stream->unget();
|
||||
$state = 'data';
|
||||
|
||||
} else {
|
||||
/* Anything else
|
||||
Stay in the bogus DOCTYPE state. */
|
||||
}
|
||||
break;
|
||||
|
||||
// case 'cdataSection':
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Asks the tree builder to save itself and hands the result back to
 * the caller unchanged.
 *
 * @return mixed Whatever the tree builder's save() produces.
 */
public function save() {
    $serialized = $this->tree->save();

    return $serialized;
}
|
||||
|
||||
/**
 * Accessor for the input stream this tokenizer reads from.
 *
 * @return mixed The object held in $this->stream.
 */
public function stream() {
    $input = $this->stream;

    return $input;
}
|
||||
|
||||
/**
 * Attempts to consume a character reference from the input stream; the
 * leading U+0026 AMPERSAND has already been consumed by the caller.
 *
 * This goes quite far against spec, and is far closer to the Python
 * impl., mainly because we don't do the large unconsuming the spec
 * requires: instead of returning "nothing", the literal text that was
 * consumed is returned with the ampersand put back in front of it.
 *
 * @param string|false $allowed The additional allowed character, if there
 *                              is one; false otherwise.
 * @param bool         $inattr  True when the reference is being consumed
 *                              as part of an attribute value.
 *
 * @return string '&' alone when the next character cannot start a
 *                reference, '&' followed by the consumed characters when
 *                no reference could be decoded, or the decoded character
 *                (followed by any surplus consumed characters for a named
 *                reference).
 */
private function consumeCharacterReference($allowed = false, $inattr = false) {
    // All consumed characters.
    $chars = $this->stream->char();

    /* This section defines how to consume a character
    reference. This definition is used when parsing character
    references in text and in attributes.

    The behavior depends on the identity of the next character
    (the one immediately after the U+0026 AMPERSAND character): */

    if (
        // EOF must be tested first: $chars is false in that case, and
        // reading offset 0 of a boolean raises a notice/warning on
        // PHP >= 7.4. Short-circuiting skips the offset reads below.
        $chars === false ||
        $chars[0] === "\x09" ||
        $chars[0] === "\x0A" ||
        $chars[0] === "\x0C" ||
        $chars[0] === "\x20" ||
        $chars[0] === '<' ||
        $chars[0] === '&' ||
        $chars[0] === $allowed
    ) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000C FORM FEED (FF)
        U+0020 SPACE
        U+003C LESS-THAN SIGN
        U+0026 AMPERSAND
        EOF
        The additional allowed character, if there is one
        Not a character reference. No characters are consumed,
        and nothing is returned. (This is not an error, either.) */
        // We already consumed, so unconsume.
        $this->stream->unget();
        return '&';
    } elseif ($chars[0] === '#') {
        /* Consume the U+0023 NUMBER SIGN. */
        // Um, yeah, we already did that.
        /* The behavior further depends on the character after
        the U+0023 NUMBER SIGN: */
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            /* U+0078 LATIN SMALL LETTER X
            U+0058 LATIN CAPITAL LETTER X */
            /* Consume the X. */
            // Um, yeah, we already did that.
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE, U+0061 LATIN SMALL LETTER A through to U+0066
            LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
            A, through to U+0046 LATIN CAPITAL LETTER F (in other
            words, 0123456789, ABCDEF, abcdef). */
            $char_class = self::HEX;
            /* When it comes to interpreting the
            number, interpret it as a hexadecimal number. */
            $hex = true;
        } else {
            /* Anything else */
            // Unconsume because we shouldn't have consumed this.
            $chars = $chars[0];
            $this->stream->unget();
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE (i.e. just 0123456789). */
            $char_class = self::DIGIT;
            /* When it comes to interpreting the number,
            interpret it as a decimal number. */
            $hex = false;
        }

        /* Consume as many characters as match the range of characters given above. */
        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            /* If no characters match the range, then don't consume
            any characters (and unconsume the U+0023 NUMBER SIGN
            character and, if appropriate, the X character). This
            is a parse error; nothing is returned. */
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            // Rather than unconsuming, hand the literal text back.
            return '&' . $chars;
        } else {
            /* Otherwise, if the next character is a U+003B SEMICOLON,
            consume that too. If it isn't, there is a parse error. */
            if ($this->stream->char() !== ';') {
                $this->stream->unget();
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'numeric-entity-without-semicolon'
                ));
            }

            /* If one or more characters match the range, then take
            them all and interpret the string of characters as a number
            (either hexadecimal or decimal as appropriate). */
            $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

            /* If that number is one of the numbers in the first column
            of the following table, then this is a parse error. Find the
            row with that number in the first column, and return a
            character token for the Unicode character given in the
            second column of that row. */
            $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
            if ($new_codepoint) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'illegal-windows-1252-entity'
                ));
                $codepoint = $new_codepoint;
            } else {
                /* Otherwise, if the number is in the range 0x0000 to 0x0008,
                U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF ,
                0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
                0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
                0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
                0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
                is a parse error; return a character token for the U+FFFD
                REPLACEMENT CHARACTER character instead. */
                // && has higher precedence than ||
                // The 0xFFFE mask matches every value whose low 16 bits
                // are FFFE or FFFF, which covers the long list above.
                if (
                    $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
                    $codepoint === 0x000B ||
                    $codepoint >= 0x000E && $codepoint <= 0x001F ||
                    $codepoint >= 0x007F && $codepoint <= 0x009F ||
                    $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
                    $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
                    ($codepoint & 0xFFFE) === 0xFFFE ||
                    $codepoint > 0x10FFFF
                ) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'illegal-codepoint-for-numeric-entity'
                    ));
                    $codepoint = 0xFFFD;
                }
            }

            /* Otherwise, return a character token for the Unicode
            character whose code point is that number. */
            return HTML5_Data::utf8chr($codepoint);
        }

    } else {
        /* Anything else */

        /* Consume the maximum number of characters possible,
        with the consumed characters matching one of the
        identifiers in the first column of the named character
        references table (in a case-sensitive manner). */

        // we will implement this by matching the longest
        // alphanumeric + semicolon string, and then working
        // our way backwards
        $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
        $len = strlen($chars);

        $refs = HTML5_Data::getNamedCharacterReferences();
        $codepoint = false;
        // Try the longest prefix first; on a hit, $c is the length of
        // the matched identifier and $id is the identifier itself.
        for($c = $len; $c > 0; $c--) {
            $id = substr($chars, 0, $c);
            if(isset($refs[$id])) {
                $codepoint = $refs[$id];
                break;
            }
        }

        /* If no match can be made, then this is a parse error.
        No characters are consumed, and nothing is returned. */
        if (!$codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-named-entity'
            ));
            return '&' . $chars;
        }

        /* If the last character matched is not a U+003B SEMICOLON
        (;), there is a parse error. */
        $semicolon = true;
        if (substr($id, -1) !== ';') {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'named-entity-without-semicolon'
            ));
            $semicolon = false;
        }

        /* If the character reference is being consumed as part of
        an attribute, and the last character matched is not a
        U+003B SEMICOLON (;), and the next character is in the
        range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
        LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
        or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
        then, for historical reasons, all the characters that were
        matched after the U+0026 AMPERSAND (&) must be unconsumed,
        and nothing is returned. */
        if (
            $inattr && !$semicolon &&
            strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
        ) {
            return '&' . $chars;
        }

        /* Otherwise, return a character token for the character
        corresponding to the character reference name (as given
        by the second column of the named character references table). */
        // Anything consumed beyond the matched identifier is appended
        // verbatim so that no input is lost.
        return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
    }
}
|
||||
|
||||
/**
 * Consumes a character reference inside an attribute value and appends
 * the result to the value of the last attribute of the current token.
 *
 * @param string|false $allowed The additional allowed character, if there
 *                              is one; passed through unchanged to
 *                              consumeCharacterReference().
 *
 * @return void
 */
private function characterReferenceInAttributeValue($allowed = false) {
    /* Attempt to consume a character reference. */
    $entity = $this->consumeCharacterReference($allowed, true);

    /* If nothing is returned, append a U+0026 AMPERSAND
    character to the current attribute's value.

    Otherwise, append the returned character token to the
    current attribute's value. */
    // "Nothing" is tested explicitly. A loose truthiness test would also
    // discard the string "0", so a reference that decodes to that single
    // character (such as &#48;) would wrongly be replaced by '&'.
    $char = ($entity === false || $entity === null || $entity === '')
        ? '&'
        : $entity;

    // The attribute currently being built is the last one in the list.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;

    /* Finally, switch back to the attribute value state that you
    were in when were switched into this state. */
}
|
||||
|
||||
/**
 * Hands a token to the tree builder and then updates the tokenizer's
 * content model based on what the tree builder reports back.
 *
 * @param array $token       The token to emit; its 'type' key is read here.
 * @param bool  $checkStream When true, errors queued on the input stream
 *                           are emitted ahead of $token.
 *
 * @return void
 */
protected function emitToken($token, $checkStream = true) {
    // Drain the input stream's error queue first. Each queued error is
    // emitted with $checkStream off so it does not re-enter this loop.
    while ($checkStream && $this->stream->errors) {
        $queued = array_shift($this->stream->errors);
        $this->emitToken($queued, false);
    }

    // the current structure of attributes is not a terribly good one
    $this->tree->emitToken($token);

    // An integer content model on the tree builder is copied to the
    // tokenizer and then cleared on the tree builder.
    $requested = $this->tree->content_model;
    if (is_int($requested)) {
        $this->content_model = $requested;
        $this->tree->content_model = null;
        return;
    }

    // Otherwise an end tag sets the content model to PCDATA.
    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
|
||||
}
|
||||
|
||||
3715
library/HTML5/TreeBuilder.php
Normal file
3715
library/HTML5/TreeBuilder.php
Normal file
|
|
@ -0,0 +1,3715 @@
|
|||
<?php
|
||||
|
||||
/*
|
||||
|
||||
Copyright 2007 Jeroen van der Meer <http://jero.net/>
|
||||
Copyright 2009 Edward Z. Yang <edwardzyang@thewritingpot.com>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included
|
||||
in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
// Tags for FIX ME!!!: (in order of priority)
|
||||
// XXX - should be fixed NAO!
|
||||
// XERROR - with regards to parse errors
|
||||
// XSCRIPT - with regards to scripting mode
|
||||
// XENCODING - with regards to encoding (for reparsing tests)
|
||||
|
||||
class HTML5_TreeBuilder {
|
||||
public $stack = array();
|
||||
public $content_model;
|
||||
|
||||
private $mode;
|
||||
private $original_mode;
|
||||
private $secondary_mode;
|
||||
private $dom;
|
||||
// Whether or not normal insertion of nodes should actually foster
|
||||
// parent (used in one case in spec)
|
||||
private $foster_parent = false;
|
||||
private $a_formatting = array();
|
||||
|
||||
private $head_pointer = null;
|
||||
private $form_pointer = null;
|
||||
|
||||
private $flag_frameset_ok = true;
|
||||
private $flag_force_quirks = false;
|
||||
private $ignored = false;
|
||||
private $quirks_mode = null;
|
||||
// this gets to 2 when we want to ignore the next lf character, and
|
||||
// is decremented at the beginning of each processed token (this way,
|
||||
// code can check for (bool)$ignore_lf_token, but it phases out
|
||||
// appropriately)
|
||||
private $ignore_lf_token = 0;
|
||||
private $fragment = false;
|
||||
private $root;
|
||||
|
||||
private $scoping = array('applet','button','caption','html','marquee','object','table','td','th', 'svg:foreignObject');
|
||||
private $formatting = array('a','b','big','code','em','font','i','nobr','s','small','strike','strong','tt','u');
|
||||
private $special = array('address','area','article','aside','base','basefont','bgsound',
|
||||
'blockquote','body','br','center','col','colgroup','command','dd','details','dialog','dir','div','dl',
|
||||
'dt','embed','fieldset','figure','footer','form','frame','frameset','h1','h2','h3','h4','h5',
|
||||
'h6','head','header','hgroup','hr','iframe','img','input','isindex','li','link',
|
||||
'listing','menu','meta','nav','noembed','noframes','noscript','ol',
|
||||
'p','param','plaintext','pre','script','select','spacer','style',
|
||||
'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
|
||||
|
||||
// Tree construction modes
|
||||
const INITIAL = 0;
|
||||
const BEFORE_HTML = 1;
|
||||
const BEFORE_HEAD = 2;
|
||||
const IN_HEAD = 3;
|
||||
const IN_HEAD_NOSCRIPT = 4;
|
||||
const AFTER_HEAD = 5;
|
||||
const IN_BODY = 6;
|
||||
const IN_CDATA_RCDATA = 7;
|
||||
const IN_TABLE = 8;
|
||||
const IN_CAPTION = 9;
|
||||
const IN_COLUMN_GROUP = 10;
|
||||
const IN_TABLE_BODY = 11;
|
||||
const IN_ROW = 12;
|
||||
const IN_CELL = 13;
|
||||
const IN_SELECT = 14;
|
||||
const IN_SELECT_IN_TABLE= 15;
|
||||
const IN_FOREIGN_CONTENT= 16;
|
||||
const AFTER_BODY = 17;
|
||||
const IN_FRAMESET = 18;
|
||||
const AFTER_FRAMESET = 19;
|
||||
const AFTER_AFTER_BODY = 20;
|
||||
const AFTER_AFTER_FRAMESET = 21;
|
||||
|
||||
/**
 * Converts a magic number to a readable name. Use for debugging.
 *
 * The lookup table is built once from this class's constants and kept
 * in a static local for later calls.
 *
 * @param int|string $number Value of one of this class's constants.
 * @return string|null Name of the constant holding that value, or null
 *                     when no constant matches.
 */
private function strConst($number) {
    static $lookup;
    if (!$lookup) {
        $r = new ReflectionClass('HTML5_TreeBuilder');
        $lookup = array();
        foreach ($r->getConstants() as $name => $value) {
            // Only ints and strings are valid array keys. array_flip()
            // emits a warning for anything else, and NS_HTML is null,
            // so the table is built by hand and such values skipped.
            if (is_int($value) || is_string($value)) {
                $lookup[$value] = $name;
            }
        }
    }
    // Unknown values yield null without an undefined-index notice.
    return isset($lookup[$number]) ? $lookup[$number] : null;
}
|
||||
|
||||
// The different types of elements.
|
||||
const SPECIAL = 100;
|
||||
const SCOPING = 101;
|
||||
const FORMATTING = 102;
|
||||
const PHRASING = 103;
|
||||
|
||||
// Quirks modes in $quirks_mode
|
||||
const NO_QUIRKS = 200;
|
||||
const QUIRKS_MODE = 201;
|
||||
const LIMITED_QUIRKS_MODE = 202;
|
||||
|
||||
// Marker to be placed in $a_formatting
|
||||
const MARKER = 300;
|
||||
|
||||
// Namespaces for foreign content
|
||||
const NS_HTML = null; // to prevent DOM from requiring NS on everything
|
||||
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
|
||||
const NS_SVG = 'http://www.w3.org/2000/svg';
|
||||
const NS_XLINK = 'http://www.w3.org/1999/xlink';
|
||||
const NS_XML = 'http://www.w3.org/XML/1998/namespace';
|
||||
const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
|
||||
|
||||
/**
 * Creates the DOM document the builder writes into and puts the
 * builder in its initial insertion mode.
 */
public function __construct() {
    // Configure the document fully before storing it on the builder.
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->mode = self::INITIAL;
}
|
||||
|
||||
// Process tag tokens
|
||||
public function emitToken($token, $mode = null) {
|
||||
// XXX: ignore parse errors... why are we emitting them, again?
|
||||
if ($token['type'] === HTML5_Tokenizer::PARSEERROR) return;
|
||||
if ($mode === null) $mode = $this->mode;
|
||||
|
||||
/*
|
||||
$backtrace = debug_backtrace();
|
||||
if ($backtrace[1]['class'] !== 'HTML5_TreeBuilder') echo "--\n";
|
||||
echo $this->strConst($mode);
|
||||
if ($this->original_mode) echo " (originally ".$this->strConst($this->original_mode).")";
|
||||
echo "\n ";
|
||||
token_dump($token);
|
||||
$this->printStack();
|
||||
$this->printActiveFormattingElements();
|
||||
if ($this->foster_parent) echo " -> this is a foster parent mode\n";
|
||||
*/
|
||||
|
||||
if ($this->ignore_lf_token) $this->ignore_lf_token--;
|
||||
$this->ignored = false;
|
||||
// indenting is a little wonky, this can be changed later on
|
||||
switch ($mode) {
|
||||
|
||||
case self::INITIAL:
|
||||
|
||||
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
||||
* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 SPACE */
|
||||
if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Ignore the token. */
|
||||
$this->ignored = true;
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
if (
|
||||
$token['name'] !== 'html' || !empty($token['public']) ||
|
||||
!empty($token['system']) || $token !== 'about:legacy-compat'
|
||||
) {
|
||||
/* If the DOCTYPE token's name is not a case-sensitive match
|
||||
* for the string "html", or if the token's public identifier
|
||||
* is not missing, or if the token's system identifier is
|
||||
* neither missing nor a case-sensitive match for the string
|
||||
* "about:legacy-compat", then there is a parse error (this
|
||||
* is the DOCTYPE parse error). */
|
||||
// DOCTYPE parse error
|
||||
}
|
||||
/* Append a DocumentType node to the Document node, with the name
|
||||
* attribute set to the name given in the DOCTYPE token, or the
|
||||
* empty string if the name was missing; the publicId attribute
|
||||
* set to the public identifier given in the DOCTYPE token, or
|
||||
* the empty string if the public identifier was missing; the
|
||||
* systemId attribute set to the system identifier given in the
|
||||
* DOCTYPE token, or the empty string if the system identifier
|
||||
* was missing; and the other attributes specific to
|
||||
* DocumentType objects set to null and empty lists as
|
||||
* appropriate. Associate the DocumentType node with the
|
||||
* Document object so that it is returned as the value of the
|
||||
* doctype attribute of the Document object. */
|
||||
if (!isset($token['public'])) $token['public'] = null;
|
||||
if (!isset($token['system'])) $token['system'] = null;
|
||||
// Yes this is hacky. I'm kind of annoyed that I can't appendChild
|
||||
// a doctype to DOMDocument. Maybe I haven't chanted the right
|
||||
// syllables.
|
||||
$impl = new DOMImplementation();
|
||||
// This call can fail for particularly pathological cases (namely,
|
||||
// the qualifiedName parameter ($token['name']) could be missing.
|
||||
if ($token['name']) {
|
||||
$doctype = $impl->createDocumentType($token['name'], $token['public'], $token['system']);
|
||||
$this->dom->appendChild($doctype);
|
||||
} else {
|
||||
// It looks like libxml's not actually *able* to express this case.
|
||||
// So... don't.
|
||||
$this->dom->emptyDoctype = true;
|
||||
}
|
||||
$public = is_null($token['public']) ? false : strtolower($token['public']);
|
||||
$system = is_null($token['system']) ? false : strtolower($token['system']);
|
||||
$publicStartsWithForQuirks = array(
|
||||
"+//silmaril//dtd html pro v0r11 19970101//",
|
||||
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
|
||||
"-//as//dtd html 3.0 aswedit + extensions//",
|
||||
"-//ietf//dtd html 2.0 level 1//",
|
||||
"-//ietf//dtd html 2.0 level 2//",
|
||||
"-//ietf//dtd html 2.0 strict level 1//",
|
||||
"-//ietf//dtd html 2.0 strict level 2//",
|
||||
"-//ietf//dtd html 2.0 strict//",
|
||||
"-//ietf//dtd html 2.0//",
|
||||
"-//ietf//dtd html 2.1e//",
|
||||
"-//ietf//dtd html 3.0//",
|
||||
"-//ietf//dtd html 3.2 final//",
|
||||
"-//ietf//dtd html 3.2//",
|
||||
"-//ietf//dtd html 3//",
|
||||
"-//ietf//dtd html level 0//",
|
||||
"-//ietf//dtd html level 1//",
|
||||
"-//ietf//dtd html level 2//",
|
||||
"-//ietf//dtd html level 3//",
|
||||
"-//ietf//dtd html strict level 0//",
|
||||
"-//ietf//dtd html strict level 1//",
|
||||
"-//ietf//dtd html strict level 2//",
|
||||
"-//ietf//dtd html strict level 3//",
|
||||
"-//ietf//dtd html strict//",
|
||||
"-//ietf//dtd html//",
|
||||
"-//metrius//dtd metrius presentational//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html//",
|
||||
"-//microsoft//dtd internet explorer 2.0 tables//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html//",
|
||||
"-//microsoft//dtd internet explorer 3.0 tables//",
|
||||
"-//netscape comm. corp.//dtd html//",
|
||||
"-//netscape comm. corp.//dtd strict html//",
|
||||
"-//o'reilly and associates//dtd html 2.0//",
|
||||
"-//o'reilly and associates//dtd html extended 1.0//",
|
||||
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
|
||||
"-//spyglass//dtd html 2.0 extended//",
|
||||
"-//sq//dtd html 2.0 hotmetal + extensions//",
|
||||
"-//sun microsystems corp.//dtd hotjava html//",
|
||||
"-//sun microsystems corp.//dtd hotjava strict html//",
|
||||
"-//w3c//dtd html 3 1995-03-24//",
|
||||
"-//w3c//dtd html 3.2 draft//",
|
||||
"-//w3c//dtd html 3.2 final//",
|
||||
"-//w3c//dtd html 3.2//",
|
||||
"-//w3c//dtd html 3.2s draft//",
|
||||
"-//w3c//dtd html 4.0 frameset//",
|
||||
"-//w3c//dtd html 4.0 transitional//",
|
||||
"-//w3c//dtd html experimental 19960712//",
|
||||
"-//w3c//dtd html experimental 970421//",
|
||||
"-//w3c//dtd w3 html//",
|
||||
"-//w3o//dtd w3 html 3.0//",
|
||||
"-//webtechs//dtd mozilla html 2.0//",
|
||||
"-//webtechs//dtd mozilla html//",
|
||||
);
|
||||
$publicSetToForQuirks = array(
|
||||
"-//w3o//dtd w3 html strict 3.0//",
|
||||
"-/w3c/dtd html 4.0 transitional/en",
|
||||
"html",
|
||||
);
|
||||
$publicStartsWithAndSystemForQuirks = array(
|
||||
"-//w3c//dtd html 4.01 frameset//",
|
||||
"-//w3c//dtd html 4.01 transitional//",
|
||||
);
|
||||
$publicStartsWithForLimitedQuirks = array(
|
||||
"-//w3c//dtd xhtml 1.0 frameset//",
|
||||
"-//w3c//dtd xhtml 1.0 transitional//",
|
||||
);
|
||||
$publicStartsWithAndSystemForLimitedQuirks = array(
|
||||
"-//w3c//dtd html 4.01 frameset//",
|
||||
"-//w3c//dtd html 4.01 transitional//",
|
||||
);
|
||||
// first, do easy checks
|
||||
if (
|
||||
!empty($token['force-quirks']) ||
|
||||
strtolower($token['name']) !== 'html'
|
||||
) {
|
||||
$this->quirks_mode = self::QUIRKS_MODE;
|
||||
} else {
|
||||
do {
|
||||
if ($system) {
|
||||
foreach ($publicStartsWithAndSystemForQuirks as $x) {
|
||||
if (strncmp($public, $x, strlen($x)) === 0) {
|
||||
$this->quirks_mode = self::QUIRKS_MODE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!is_null($this->quirks_mode)) break;
|
||||
foreach ($publicStartsWithAndSystemForLimitedQuirks as $x) {
|
||||
if (strncmp($public, $x, strlen($x)) === 0) {
|
||||
$this->quirks_mode = self::LIMITED_QUIRKS_MODE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!is_null($this->quirks_mode)) break;
|
||||
}
|
||||
foreach ($publicSetToForQuirks as $x) {
|
||||
if ($public === $x) {
|
||||
$this->quirks_mode = self::QUIRKS_MODE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!is_null($this->quirks_mode)) break;
|
||||
foreach ($publicStartsWithForLimitedQuirks as $x) {
|
||||
if (strncmp($public, $x, strlen($x)) === 0) {
|
||||
$this->quirks_mode = self::LIMITED_QUIRKS_MODE;
|
||||
}
|
||||
}
|
||||
if (!is_null($this->quirks_mode)) break;
|
||||
if ($system === "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
|
||||
$this->quirks_mode = self::QUIRKS_MODE;
|
||||
break;
|
||||
}
|
||||
foreach ($publicStartsWithForQuirks as $x) {
|
||||
if (strncmp($public, $x, strlen($x)) === 0) {
|
||||
$this->quirks_mode = self::QUIRKS_MODE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (is_null($this->quirks_mode)) {
|
||||
$this->quirks_mode = self::NO_QUIRKS;
|
||||
}
|
||||
} while (false);
|
||||
}
|
||||
$this->mode = self::BEFORE_HTML;
|
||||
} else {
|
||||
// parse error
|
||||
/* Switch the insertion mode to "before html", then reprocess the
|
||||
* current token. */
|
||||
$this->mode = self::BEFORE_HTML;
|
||||
$this->quirks_mode = self::QUIRKS_MODE;
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::BEFORE_HTML:
|
||||
|
||||
/* A DOCTYPE token */
|
||||
if($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// Parse error. Ignore the token.
|
||||
$this->ignored = true;
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the Document object with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$comment = $this->dom->createComment($token['data']);
|
||||
$this->dom->appendChild($comment);
|
||||
|
||||
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
/* A start tag whose tag name is "html" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] == 'html') {
|
||||
/* Create an element for the token in the HTML namespace. Append it
|
||||
* to the Document object. Put this element in the stack of open
|
||||
* elements. */
|
||||
$html = $this->insertElement($token, false);
|
||||
$this->dom->appendChild($html);
|
||||
$this->stack[] = $html;
|
||||
|
||||
$this->mode = self::BEFORE_HEAD;
|
||||
|
||||
} else {
|
||||
/* Create an html element. Append it to the Document object. Put
|
||||
* this element in the stack of open elements. */
|
||||
$html = $this->dom->createElementNS(self::NS_HTML, 'html');
|
||||
$this->dom->appendChild($html);
|
||||
$this->stack[] = $html;
|
||||
|
||||
/* Switch the insertion mode to "before head", then reprocess the
|
||||
* current token. */
|
||||
$this->mode = self::BEFORE_HEAD;
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::BEFORE_HEAD:
|
||||
|
||||
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data attribute
|
||||
set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
/* A DOCTYPE token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
/* Parse error. Ignore the token */
|
||||
$this->ignored = true;
|
||||
// parse error
|
||||
|
||||
/* A start tag token with the tag name "html" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
/* Process the token using the rules for the "in body"
|
||||
* insertion mode. */
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* A start tag token with the tag name "head" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') {
|
||||
/* Insert an HTML element for the token. */
|
||||
$element = $this->insertElement($token);
|
||||
|
||||
/* Set the head element pointer to this new element node. */
|
||||
$this->head_pointer = $element;
|
||||
|
||||
/* Change the insertion mode to "in head". */
|
||||
$this->mode = self::IN_HEAD;
|
||||
|
||||
/* An end tag whose tag name is one of: "head", "body", "html", "br" */
|
||||
} elseif(
|
||||
$token['type'] === HTML5_Tokenizer::ENDTAG && (
|
||||
$token['name'] === 'head' || $token['name'] === 'body' ||
|
||||
$token['name'] === 'html' || $token['name'] === 'br'
|
||||
)) {
|
||||
/* Act as if a start tag token with the tag name "head" and no
|
||||
* attributes had been seen, then reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'head',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
$this->emitToken($token);
|
||||
|
||||
/* Any other end tag */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG) {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
} else {
|
||||
/* Act as if a start tag token with the tag name "head" and no
|
||||
* attributes had been seen, then reprocess the current token.
|
||||
* Note: This will result in an empty head element being
|
||||
* generated, with the current token being reprocessed in the
|
||||
* "after head" insertion mode. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'head',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_HEAD:
|
||||
|
||||
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE. */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Insert the character into the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data attribute
|
||||
set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
/* A DOCTYPE token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
// parse error
|
||||
|
||||
/* A start tag whose tag name is "html" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* A start tag whose tag name is one of: "base", "command", "link" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
($token['name'] === 'base' || $token['name'] === 'command' ||
|
||||
$token['name'] === 'link')) {
|
||||
/* Insert an HTML element for the token. Immediately pop the
|
||||
* current node off the stack of open elements. */
|
||||
$this->insertElement($token);
|
||||
array_pop($this->stack);
|
||||
|
||||
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
||||
|
||||
/* A start tag whose tag name is "meta" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'meta') {
|
||||
/* Insert an HTML element for the token. Immediately pop the
|
||||
* current node off the stack of open elements. */
|
||||
$this->insertElement($token);
|
||||
array_pop($this->stack);
|
||||
|
||||
// XERROR: Acknowledge the token's self-closing flag, if it is set.
|
||||
|
||||
// XENCODING: If the element has a charset attribute, and its value is a
|
||||
// supported encoding, and the confidence is currently tentative,
|
||||
// then change the encoding to the encoding given by the value of
|
||||
// the charset attribute.
|
||||
//
|
||||
// Otherwise, if the element has a content attribute, and applying
|
||||
// the algorithm for extracting an encoding from a Content-Type to
|
||||
// its value returns a supported encoding encoding, and the
|
||||
// confidence is currently tentative, then change the encoding to
|
||||
// the encoding encoding.
|
||||
|
||||
/* A start tag with the tag name "title" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'title') {
|
||||
$this->insertRCDATAElement($token);
|
||||
|
||||
/* A start tag whose tag name is "noscript", if the scripting flag is enabled, or
|
||||
* A start tag whose tag name is one of: "noframes", "style" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
($token['name'] === 'noscript' || $token['name'] === 'noframes' || $token['name'] === 'style')) {
|
||||
// XSCRIPT: Scripting flag not respected
|
||||
$this->insertCDATAElement($token);
|
||||
|
||||
// XSCRIPT: Scripting flag disable not implemented
|
||||
|
||||
/* A start tag with the tag name "script" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') {
|
||||
/* 1. Create an element for the token in the HTML namespace. */
|
||||
$node = $this->insertElement($token, false);
|
||||
|
||||
/* 2. Mark the element as being "parser-inserted" */
|
||||
// Uhhh... XSCRIPT
|
||||
|
||||
/* 3. If the parser was originally created for the HTML
|
||||
* fragment parsing algorithm, then mark the script element as
|
||||
* "already executed". (fragment case) */
|
||||
// ditto... XSCRIPT
|
||||
|
||||
/* 4. Append the new element to the current node and push it onto
|
||||
* the stack of open elements. */
|
||||
end($this->stack)->appendChild($node);
|
||||
$this->stack[] = $node;
|
||||
// I guess we could squash these together
|
||||
|
||||
/* 6. Let the original insertion mode be the current insertion mode. */
|
||||
$this->original_mode = $this->mode;
|
||||
/* 7. Switch the insertion mode to "in CDATA/RCDATA" */
|
||||
$this->mode = self::IN_CDATA_RCDATA;
|
||||
/* 5. Switch the tokeniser's content model flag to the CDATA state. */
|
||||
$this->content_model = HTML5_Tokenizer::CDATA;
|
||||
|
||||
/* An end tag with the tag name "head" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'head') {
|
||||
/* Pop the current node (which will be the head element) off the stack of open elements. */
|
||||
array_pop($this->stack);
|
||||
|
||||
/* Change the insertion mode to "after head". */
|
||||
$this->mode = self::AFTER_HEAD;
|
||||
|
||||
// Slight logic inversion here to minimize duplication
|
||||
/* A start tag with the tag name "head". */
|
||||
/* An end tag whose tag name is not one of: "body", "html", "br" */
|
||||
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') ||
|
||||
($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] !== 'html' &&
|
||||
$token['name'] !== 'body' && $token['name'] !== 'br')) {
|
||||
// Parse error. Ignore the token.
|
||||
$this->ignored = true;
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Act as if an end tag token with the tag name "head" had been
|
||||
* seen, and reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'head',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
/* Then, reprocess the current token. */
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_HEAD_NOSCRIPT:
|
||||
if ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'noscript') {
|
||||
/* Pop the current node (which will be a noscript element) from the
|
||||
* stack of open elements; the new current node will be a head
|
||||
* element. */
|
||||
array_pop($this->stack);
|
||||
$this->mode = self::IN_HEAD;
|
||||
} elseif (
|
||||
($token['type'] === HTML5_Tokenizer::SPACECHARACTER) ||
|
||||
($token['type'] === HTML5_Tokenizer::COMMENT) ||
|
||||
($token['type'] === HTML5_Tokenizer::STARTTAG && (
|
||||
$token['name'] === 'link' || $token['name'] === 'meta' ||
|
||||
$token['name'] === 'noframes' || $token['name'] === 'style'))) {
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
// inverted logic
|
||||
} elseif (
|
||||
($token['type'] === HTML5_Tokenizer::STARTTAG && (
|
||||
$token['name'] === 'head' || $token['name'] === 'noscript')) ||
|
||||
($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] !== 'br')) {
|
||||
// parse error
|
||||
} else {
|
||||
// parse error
|
||||
$this->emitToken(array(
|
||||
'type' => HTML5_Tokenizer::ENDTAG,
|
||||
'name' => 'noscript',
|
||||
));
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::AFTER_HEAD:
|
||||
/* Handle the token as follows: */
|
||||
|
||||
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Append the character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data attribute
|
||||
set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* A start tag token with the tag name "body" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'body') {
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Set the frameset-ok flag to "not ok". */
|
||||
$this->flag_frameset_ok = false;
|
||||
|
||||
/* Change the insertion mode to "in body". */
|
||||
$this->mode = self::IN_BODY;
|
||||
|
||||
/* A start tag token with the tag name "frameset" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'frameset') {
|
||||
/* Insert a frameset element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Change the insertion mode to "in frameset". */
|
||||
$this->mode = self::IN_FRAMESET;
|
||||
|
||||
/* A start tag token whose tag name is one of: "base", "link", "meta",
|
||||
"script", "style", "title" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
||||
array('base', 'link', 'meta', 'noframes', 'script', 'style', 'title'))) {
|
||||
// parse error
|
||||
/* Push the node pointed to by the head element pointer onto the
|
||||
* stack of open elements. */
|
||||
$this->stack[] = $this->head_pointer;
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
array_splice($this->stack, array_search($this->head_pointer, $this->stack, true), 1);
|
||||
|
||||
// inversion of specification
|
||||
} elseif(
|
||||
($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') ||
|
||||
($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] !== 'body' && $token['name'] !== 'html' &&
|
||||
$token['name'] !== 'br')) {
|
||||
// parse error
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
$this->emitToken(array(
|
||||
'name' => 'body',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
$this->flag_frameset_ok = true;
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_BODY:
|
||||
/* Handle the token as follows: */
|
||||
|
||||
switch($token['type']) {
|
||||
/* A character token */
|
||||
case HTML5_Tokenizer::CHARACTER:
|
||||
case HTML5_Tokenizer::SPACECHARACTER:
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Append the token's character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* If the token is not one of U+0009 CHARACTER TABULATION,
|
||||
* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020
|
||||
* SPACE, then set the frameset-ok flag to "not ok". */
|
||||
// i.e., if any of the characters is not whitespace
|
||||
if (strlen($token['data']) !== strspn($token['data'], HTML5_Tokenizer::WHITESPACE)) {
|
||||
$this->flag_frameset_ok = false;
|
||||
}
|
||||
break;
|
||||
|
||||
/* A comment token */
|
||||
case HTML5_Tokenizer::COMMENT:
|
||||
/* Append a Comment node to the current node with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
break;
|
||||
|
||||
case HTML5_Tokenizer::DOCTYPE:
|
||||
// parse error
|
||||
break;
|
||||
|
||||
case HTML5_Tokenizer::STARTTAG:
|
||||
switch($token['name']) {
|
||||
case 'html':
|
||||
// parse error
|
||||
/* For each attribute on the token, check to see if the
|
||||
* attribute is already present on the top element of the
|
||||
* stack of open elements. If it is not, add the attribute
|
||||
* and its corresponding value to that element. */
|
||||
foreach($token['attr'] as $attr) {
|
||||
if(!$this->stack[0]->hasAttribute($attr['name'])) {
|
||||
$this->stack[0]->setAttribute($attr['name'], $attr['value']);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'base': case 'command': case 'link': case 'meta': case 'noframes':
|
||||
case 'script': case 'style': case 'title':
|
||||
/* Process the token as if the insertion mode had been "in
|
||||
head". */
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
break;
|
||||
|
||||
/* A start tag token with the tag name "body" */
|
||||
case 'body':
|
||||
/* Parse error. If the second element on the stack of open
|
||||
elements is not a body element, or, if the stack of open
|
||||
elements has only one node on it, then ignore the token.
|
||||
(fragment case) */
|
||||
if(count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') {
|
||||
$this->ignored = true;
|
||||
// Ignore
|
||||
|
||||
/* Otherwise, for each attribute on the token, check to see
|
||||
if the attribute is already present on the body element (the
|
||||
second element) on the stack of open elements. If it is not,
|
||||
add the attribute and its corresponding value to that
|
||||
element. */
|
||||
} else {
|
||||
foreach($token['attr'] as $attr) {
|
||||
if(!$this->stack[1]->hasAttribute($attr['name'])) {
|
||||
$this->stack[1]->setAttribute($attr['name'], $attr['value']);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'frameset':
|
||||
// parse error
|
||||
/* If the second element on the stack of open elements is
|
||||
* not a body element, or, if the stack of open elements
|
||||
* has only one node on it, then ignore the token.
|
||||
* (fragment case) */
|
||||
if(count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') {
|
||||
$this->ignored = true;
|
||||
// Ignore
|
||||
} elseif (!$this->flag_frameset_ok) {
|
||||
$this->ignored = true;
|
||||
// Ignore
|
||||
} else {
|
||||
/* 1. Remove the second element on the stack of open
|
||||
* elements from its parent node, if it has one. */
|
||||
if($this->stack[1]->parentNode) {
|
||||
$this->stack[1]->parentNode->removeChild($this->stack[1]);
|
||||
}
|
||||
|
||||
/* 2. Pop all the nodes from the bottom of the stack of
|
||||
* open elements, from the current node up to the root
|
||||
* html element. */
|
||||
array_splice($this->stack, 1);
|
||||
|
||||
$this->insertElement($token);
|
||||
$this->mode = self::IN_FRAMESET;
|
||||
}
|
||||
break;
|
||||
|
||||
// in spec, there is a diversion here
|
||||
|
||||
case 'address': case 'article': case 'aside': case 'blockquote':
|
||||
case 'center': case 'datagrid': case 'details': case 'dialog': case 'dir':
|
||||
case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer':
|
||||
case 'header': case 'hgroup': case 'menu': case 'nav':
|
||||
case 'ol': case 'p': case 'section': case 'ul':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been
|
||||
seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
|
||||
"h5", "h6" */
|
||||
case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* If the current node is an element whose tag name is one
|
||||
* of "h1", "h2", "h3", "h4", "h5", or "h6", then this is a
|
||||
* parse error; pop the current node off the stack of open
|
||||
* elements. */
|
||||
$peek = array_pop($this->stack);
|
||||
if (in_array($peek->tagName, array("h1", "h2", "h3", "h4", "h5", "h6"))) {
|
||||
// parse error
|
||||
} else {
|
||||
$this->stack[] = $peek;
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
break;
|
||||
|
||||
case 'pre': case 'listing':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
$this->insertElement($token);
|
||||
/* If the next token is a U+000A LINE FEED (LF) character
|
||||
* token, then ignore that token and move on to the next
|
||||
* one. (Newlines at the start of pre blocks are ignored as
|
||||
* an authoring convenience.) */
|
||||
$this->ignore_lf_token = 2;
|
||||
$this->flag_frameset_ok = false;
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is "form" */
|
||||
case 'form':
|
||||
/* If the form element pointer is not null, ignore the
|
||||
token with a parse error. */
|
||||
if($this->form_pointer !== null) {
|
||||
$this->ignored = true;
|
||||
// Ignore.
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* If the stack of open elements has a p element in
|
||||
scope, then act as if an end tag with the tag name p
|
||||
had been seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token, and set the
|
||||
form element pointer to point to the element created. */
|
||||
$element = $this->insertElement($token);
|
||||
$this->form_pointer = $element;
|
||||
}
|
||||
break;
|
||||
|
||||
// condensed specification
|
||||
case 'li': case 'dd': case 'dt':
|
||||
/* 1. Set the frameset-ok flag to "not ok". */
|
||||
$this->flag_frameset_ok = false;
|
||||
|
||||
$stack_length = count($this->stack) - 1;
|
||||
for($n = $stack_length; 0 <= $n; $n--) {
|
||||
/* 2. Initialise node to be the current node (the
|
||||
bottommost node of the stack). */
|
||||
$stop = false;
|
||||
$node = $this->stack[$n];
|
||||
$cat = $this->getElementCategory($node);
|
||||
|
||||
// for case 'li':
|
||||
/* 3. If node is an li element, then act as if an end
|
||||
* tag with the tag name "li" had been seen, then jump
|
||||
* to the last step. */
|
||||
// for case 'dd': case 'dt':
|
||||
/* If node is a dd or dt element, then act as if an end
|
||||
* tag with the same tag name as node had been seen, then
|
||||
* jump to the last step. */
|
||||
if(($token['name'] === 'li' && $node->tagName === 'li') ||
|
||||
($token['name'] !== 'li' && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { // limited conditional
|
||||
$this->emitToken(array(
|
||||
'type' => HTML5_Tokenizer::ENDTAG,
|
||||
'name' => $node->tagName,
|
||||
));
|
||||
break;
|
||||
}
|
||||
|
||||
/* 4. If node is not in the formatting category, and is
|
||||
not in the phrasing category, and is not an address,
|
||||
div or p element, then stop this algorithm. */
|
||||
if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
|
||||
$node->tagName !== 'address' && $node->tagName !== 'div' &&
|
||||
$node->tagName !== 'p') {
|
||||
break;
|
||||
}
|
||||
|
||||
/* 5. Otherwise, set node to the previous entry in the
|
||||
* stack of open elements and return to step 2. */
|
||||
}
|
||||
|
||||
/* 6. This is the last step. */
|
||||
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been
|
||||
seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Finally, insert an HTML element with the same tag
|
||||
name as the token's. */
|
||||
$this->insertElement($token);
|
||||
break;
|
||||
|
||||
/* A start tag token whose tag name is "plaintext" */
|
||||
case 'plaintext':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been
|
||||
seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
$this->content_model = HTML5_Tokenizer::PLAINTEXT;
|
||||
break;
|
||||
|
||||
// more diversions
|
||||
|
||||
/* A start tag whose tag name is "a" */
|
||||
case 'a':
|
||||
/* If the list of active formatting elements contains
|
||||
an element whose tag name is "a" between the end of the
|
||||
list and the last marker on the list (or the start of
|
||||
the list if there is no marker on the list), then this
|
||||
is a parse error; act as if an end tag with the tag name
|
||||
"a" had been seen, then remove that element from the list
|
||||
of active formatting elements and the stack of open
|
||||
elements if the end tag didn't already remove it (it
|
||||
might not have if the element is not in table scope). */
|
||||
$leng = count($this->a_formatting);
|
||||
|
||||
for($n = $leng - 1; $n >= 0; $n--) {
|
||||
if($this->a_formatting[$n] === self::MARKER) {
|
||||
break;
|
||||
|
||||
} elseif($this->a_formatting[$n]->tagName === 'a') {
|
||||
$a = $this->a_formatting[$n];
|
||||
$this->emitToken(array(
|
||||
'name' => 'a',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
if (in_array($a, $this->a_formatting)) {
|
||||
$a_i = array_search($a, $this->a_formatting, true);
|
||||
if($a_i !== false) array_splice($this->a_formatting, $a_i, 1);
|
||||
}
|
||||
if (in_array($a, $this->stack)) {
|
||||
$a_i = array_search($a, $this->stack, true);
|
||||
if ($a_i !== false) array_splice($this->stack, $a_i, 1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$el = $this->insertElement($token);
|
||||
|
||||
/* Add that element to the list of active formatting
|
||||
elements. */
|
||||
$this->a_formatting[] = $el;
|
||||
break;
|
||||
|
||||
case 'b': case 'big': case 'code': case 'em': case 'font': case 'i':
|
||||
case 's': case 'small': case 'strike':
|
||||
case 'strong': case 'tt': case 'u':
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$el = $this->insertElement($token);
|
||||
|
||||
/* Add that element to the list of active formatting
|
||||
elements. */
|
||||
$this->a_formatting[] = $el;
|
||||
break;
|
||||
|
||||
case 'nobr':
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* If the stack of open elements has a nobr element in
|
||||
* scope, then this is a parse error; act as if an end tag
|
||||
* with the tag name "nobr" had been seen, then once again
|
||||
* reconstruct the active formatting elements, if any. */
|
||||
if ($this->elementInScope('nobr')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'nobr',
|
||||
'type' => HTML5_Tokenizer::ENDTAG,
|
||||
));
|
||||
$this->reconstructActiveFormattingElements();
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$el = $this->insertElement($token);
|
||||
|
||||
/* Add that element to the list of active formatting
|
||||
elements. */
|
||||
$this->a_formatting[] = $el;
|
||||
break;
|
||||
|
||||
// another diversion
|
||||
|
||||
/* A start tag token whose tag name is "button" */
|
||||
case 'button':
|
||||
/* If the stack of open elements has a button element in scope,
|
||||
then this is a parse error; act as if an end tag with the tag
|
||||
name "button" had been seen, then reprocess the token. (We don't
|
||||
do that. Unnecessary.) (I hope you're right! -- ezyang) */
|
||||
if($this->elementInScope('button')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'button',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Insert a marker at the end of the list of active
|
||||
formatting elements. */
|
||||
$this->a_formatting[] = self::MARKER;
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
break;
|
||||
|
||||
case 'applet': case 'marquee': case 'object':
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Insert a marker at the end of the list of active
|
||||
formatting elements. */
|
||||
$this->a_formatting[] = self::MARKER;
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
break;
|
||||
|
||||
// spec diversion
|
||||
|
||||
/* A start tag whose tag name is "table" */
|
||||
case 'table':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been seen. */
|
||||
if($this->quirks_mode !== self::QUIRKS_MODE &&
|
||||
$this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
|
||||
/* Change the insertion mode to "in table". */
|
||||
$this->mode = self::IN_TABLE;
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is one of: "area", "basefont",
|
||||
"bgsound", "br", "embed", "img", "input", "keygen", "spacer", "wbr" */
|
||||
case 'area': case 'basefont': case 'bgsound': case 'br':
|
||||
case 'embed': case 'img': case 'input': case 'keygen': case 'spacer':
|
||||
case 'wbr':
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Immediately pop the current node off the stack of open elements. */
|
||||
array_pop($this->stack);
|
||||
|
||||
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
break;
|
||||
|
||||
case 'param': case 'source':
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Immediately pop the current node off the stack of open elements. */
|
||||
array_pop($this->stack);
|
||||
|
||||
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is "hr" */
|
||||
case 'hr':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then act as if an end tag with the tag name p had been seen. */
|
||||
if($this->elementInScope('p')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Immediately pop the current node off the stack of open elements. */
|
||||
array_pop($this->stack);
|
||||
|
||||
// YYY: Acknowledge the token's self-closing flag, if it is set.
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is "image" */
|
||||
case 'image':
|
||||
/* Parse error. Change the token's tag name to "img" and
|
||||
reprocess it. (Don't ask.) */
|
||||
$token['name'] = 'img';
|
||||
$this->emitToken($token);
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is "isindex" */
|
||||
case 'isindex':
|
||||
/* Parse error. */
|
||||
|
||||
/* If the form element pointer is not null,
|
||||
then ignore the token. */
|
||||
if($this->form_pointer === null) {
|
||||
/* Act as if a start tag token with the tag name "form" had
|
||||
been seen. */
|
||||
/* If the token has an attribute called "action", set
|
||||
* the action attribute on the resulting form
|
||||
* element to the value of the "action" attribute of
|
||||
* the token. */
|
||||
$attr = array();
|
||||
$action = $this->getAttr($token, 'action');
|
||||
if ($action !== false) {
|
||||
$attr[] = array('name' => 'action', 'value' => $action);
|
||||
}
|
||||
$this->emitToken(array(
|
||||
'name' => 'form',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => $attr
|
||||
));
|
||||
|
||||
/* Act as if a start tag token with the tag name "hr" had
|
||||
been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'hr',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
|
||||
/* Act as if a start tag token with the tag name "p" had
|
||||
been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
|
||||
/* Act as if a start tag token with the tag name "label"
|
||||
had been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'label',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
|
||||
/* Act as if a stream of character tokens had been seen. */
|
||||
$prompt = $this->getAttr($token, 'prompt');
|
||||
if ($prompt === false) {
|
||||
$prompt = 'This is a searchable index. '.
|
||||
'Insert your search keywords here: ';
|
||||
}
|
||||
$this->emitToken(array(
|
||||
'data' => $prompt,
|
||||
'type' => HTML5_Tokenizer::CHARACTER,
|
||||
));
|
||||
|
||||
/* Act as if a start tag token with the tag name "input"
|
||||
had been seen, with all the attributes from the "isindex"
|
||||
token, except with the "name" attribute set to the value
|
||||
"isindex" (ignoring any explicit "name" attribute). */
|
||||
$attr = array();
|
||||
foreach ($token['attr'] as $keypair) {
|
||||
if ($keypair['name'] === 'name' || $keypair['name'] === 'action' ||
|
||||
$keypair['name'] === 'prompt') continue;
|
||||
$attr[] = $keypair;
|
||||
}
|
||||
$attr[] = array('name' => 'name', 'value' => 'isindex');
|
||||
|
||||
$this->emitToken(array(
|
||||
'name' => 'input',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => $attr
|
||||
));
|
||||
|
||||
/* Act as if an end tag token with the tag name "label"
|
||||
had been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'label',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
/* Act as if an end tag token with the tag name "p" had
|
||||
been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
/* Act as if a start tag token with the tag name "hr" had
|
||||
been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'hr',
|
||||
'type' => HTML5_Tokenizer::STARTTAG
|
||||
));
|
||||
|
||||
/* Act as if an end tag token with the tag name "form" had
|
||||
been seen. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'form',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
} else {
|
||||
$this->ignored = true;
|
||||
}
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is "textarea" */
|
||||
case 'textarea':
|
||||
$this->insertElement($token);
|
||||
|
||||
/* If the next token is a U+000A LINE FEED (LF)
|
||||
* character token, then ignore that token and move on to
|
||||
* the next one. (Newlines at the start of textarea
|
||||
* elements are ignored as an authoring convenience.)
|
||||
* need flag, see also <pre> */
|
||||
$this->ignore_lf_token = 2;
|
||||
|
||||
$this->original_mode = $this->mode;
|
||||
$this->flag_frameset_ok = false;
|
||||
$this->mode = self::IN_CDATA_RCDATA;
|
||||
|
||||
/* Switch the tokeniser's content model flag to the
|
||||
RCDATA state. */
|
||||
$this->content_model = HTML5_Tokenizer::RCDATA;
|
||||
break;
|
||||
|
||||
/* A start tag token whose tag name is "xmp" */
|
||||
case 'xmp':
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
|
||||
$this->insertCDATAElement($token);
|
||||
break;
|
||||
|
||||
case 'iframe':
|
||||
$this->flag_frameset_ok = false;
|
||||
$this->insertCDATAElement($token);
|
||||
break;
|
||||
|
||||
case 'noembed': case 'noscript':
|
||||
// XSCRIPT: should check scripting flag
|
||||
$this->insertCDATAElement($token);
|
||||
break;
|
||||
|
||||
/* A start tag whose tag name is "select" */
|
||||
case 'select':
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
$this->flag_frameset_ok = false;
|
||||
|
||||
/* If the insertion mode is one of "in table", "in caption",
|
||||
* "in column group", "in table body", "in row", or "in
|
||||
* cell", then switch the insertion mode to "in select in
|
||||
* table". Otherwise, switch the insertion mode to "in
|
||||
* select". */
|
||||
if (
|
||||
$this->mode === self::IN_TABLE || $this->mode === self::IN_CAPTION ||
|
||||
$this->mode === self::IN_COLUMN_GROUP || $this->mode ==+self::IN_TABLE_BODY ||
|
||||
$this->mode === self::IN_ROW || $this->mode === self::IN_CELL
|
||||
) {
|
||||
$this->mode = self::IN_SELECT_IN_TABLE;
|
||||
} else {
|
||||
$this->mode = self::IN_SELECT;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'option': case 'optgroup':
|
||||
if ($this->elementInScope('option')) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'option',
|
||||
'type' => HTML5_Tokenizer::ENDTAG,
|
||||
));
|
||||
}
|
||||
$this->reconstructActiveFormattingElements();
|
||||
$this->insertElement($token);
|
||||
break;
|
||||
|
||||
case 'rp': case 'rt':
|
||||
/* If the stack of open elements has a ruby element in scope, then generate
|
||||
* implied end tags. If the current node is not then a ruby element, this is
|
||||
* a parse error; pop all the nodes from the current node up to the node
|
||||
* immediately before the bottommost ruby element on the stack of open elements.
|
||||
*/
|
||||
if ($this->elementInScope('ruby')) {
|
||||
$this->generateImpliedEndTags();
|
||||
}
|
||||
$peek = false;
|
||||
do {
|
||||
if ($peek) {
|
||||
// parse error
|
||||
}
|
||||
$peek = array_pop($this->stack);
|
||||
} while ($peek->tagName !== 'ruby');
|
||||
$this->stack[] = $peek; // we popped one too many
|
||||
$this->insertElement($token);
|
||||
break;
|
||||
|
||||
// spec diversion
|
||||
|
||||
case 'math':
|
||||
$this->reconstructActiveFormattingElements();
|
||||
$token = $this->adjustMathMLAttributes($token);
|
||||
$token = $this->adjustForeignAttributes($token);
|
||||
$this->insertForeignElement($token, self::NS_MATHML);
|
||||
if (isset($token['self-closing'])) {
|
||||
// XERROR: acknowledge the token's self-closing flag
|
||||
array_pop($this->stack);
|
||||
}
|
||||
if ($this->mode !== self::IN_FOREIGN_CONTENT) {
|
||||
$this->secondary_mode = $this->mode;
|
||||
$this->mode = self::IN_FOREIGN_CONTENT;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'svg':
|
||||
$this->reconstructActiveFormattingElements();
|
||||
$token = $this->adjustSVGAttributes($token);
|
||||
$token = $this->adjustForeignAttributes($token);
|
||||
$this->insertForeignElement($token, self::NS_SVG);
|
||||
if (isset($token['self-closing'])) {
|
||||
// XERROR: acknowledge the token's self-closing flag
|
||||
array_pop($this->stack);
|
||||
}
|
||||
if ($this->mode !== self::IN_FOREIGN_CONTENT) {
|
||||
$this->secondary_mode = $this->mode;
|
||||
$this->mode = self::IN_FOREIGN_CONTENT;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'caption': case 'col': case 'colgroup': case 'frame': case 'head':
|
||||
case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr':
|
||||
// parse error
|
||||
break;
|
||||
|
||||
/* A start tag token not covered by the previous entries */
|
||||
default:
|
||||
/* Reconstruct the active formatting elements, if any. */
|
||||
$this->reconstructActiveFormattingElements();
|
||||
|
||||
$this->insertElement($token);
|
||||
/* This element will be a phrasing element. */
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case HTML5_Tokenizer::ENDTAG:
|
||||
switch($token['name']) {
|
||||
/* An end tag with the tag name "body" */
|
||||
case 'body':
|
||||
/* If the second element in the stack of open elements is
|
||||
not a body element, this is a parse error. Ignore the token.
|
||||
(innerHTML case) */
|
||||
if(count($this->stack) < 2 || $this->stack[1]->tagName !== 'body') {
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise, if there is a node in the stack of open
|
||||
* elements that is not either a dd element, a dt
|
||||
* element, an li element, an optgroup element, an
|
||||
* option element, a p element, an rp element, an rt
|
||||
* element, a tbody element, a td element, a tfoot
|
||||
* element, a th element, a thead element, a tr element,
|
||||
* the body element, or the html element, then this is a
|
||||
* parse error. */
|
||||
} else {
|
||||
// XERROR: implement this check for parse error
|
||||
}
|
||||
|
||||
/* Change the insertion mode to "after body". */
|
||||
$this->mode = self::AFTER_BODY;
|
||||
break;
|
||||
|
||||
/* An end tag with the tag name "html" */
|
||||
case 'html':
|
||||
/* Act as if an end tag with tag name "body" had been seen,
|
||||
then, if that token wasn't ignored, reprocess the current
|
||||
token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'body',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
if (!$this->ignored) $this->emitToken($token);
|
||||
break;
|
||||
|
||||
case 'address': case 'article': case 'aside': case 'blockquote':
|
||||
case 'center': case 'datagrid': case 'details': case 'dir':
|
||||
case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer':
|
||||
case 'header': case 'hgroup': case 'listing': case 'menu':
|
||||
case 'nav': case 'ol': case 'pre': case 'section': case 'ul':
|
||||
/* If the stack of open elements has an element in scope
|
||||
with the same tag name as that of the token, then generate
|
||||
implied end tags. */
|
||||
if($this->elementInScope($token['name'])) {
|
||||
$this->generateImpliedEndTags();
|
||||
|
||||
/* Now, if the current node is not an element with
|
||||
the same tag name as that of the token, then this
|
||||
is a parse error. */
|
||||
// XERROR: implement parse error logic
|
||||
|
||||
/* If the stack of open elements has an element in
|
||||
scope with the same tag name as that of the token,
|
||||
then pop elements from this stack until an element
|
||||
with that tag name has been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== $token['name']);
|
||||
} else {
|
||||
// parse error
|
||||
}
|
||||
break;
|
||||
|
||||
/* An end tag whose tag name is "form" */
|
||||
case 'form':
|
||||
/* Let node be the element that the form element pointer is set to. */
|
||||
$node = $this->form_pointer;
|
||||
/* Set the form element pointer to null. */
|
||||
$this->form_pointer = null;
|
||||
/* If node is null or the stack of open elements does not
|
||||
* have node in scope, then this is a parse error; ignore the token. */
|
||||
if ($node === null || !in_array($node, $this->stack)) {
|
||||
// parse error
|
||||
$this->ignored = true;
|
||||
} else {
|
||||
/* 1. Generate implied end tags. */
|
||||
$this->generateImpliedEndTags();
|
||||
/* 2. If the current node is not node, then this is a parse error. */
|
||||
if (end($this->stack) !== $node) {
|
||||
// parse error
|
||||
}
|
||||
/* 3. Remove node from the stack of open elements. */
|
||||
array_splice($this->stack, array_search($node, $this->stack, true), 1);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
/* An end tag whose tag name is "p" */
|
||||
case 'p':
|
||||
/* If the stack of open elements has a p element in scope,
|
||||
then generate implied end tags, except for p elements. */
|
||||
if($this->elementInScope('p')) {
|
||||
/* Generate implied end tags, except for elements with
|
||||
* the same tag name as the token. */
|
||||
$this->generateImpliedEndTags(array('p'));
|
||||
|
||||
/* If the current node is not a p element, then this is
|
||||
a parse error. */
|
||||
// XERROR: implement
|
||||
|
||||
/* Pop elements from the stack of open elements until
|
||||
* an element with the same tag name as the token has
|
||||
* been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== 'p');
|
||||
|
||||
} else {
|
||||
// parse error
|
||||
$this->emitToken(array(
|
||||
'name' => 'p',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
));
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
/* An end tag whose tag name is "dd", "dt", or "li" */
|
||||
case 'dd': case 'dt': case 'li':
|
||||
if($this->elementInScope($token['name'])) {
|
||||
$this->generateImpliedEndTags(array($token['name']));
|
||||
|
||||
/* If the current node is not an element with the same
|
||||
tag name as the token, then this is a parse error. */
|
||||
// XERROR: implement parse error
|
||||
|
||||
/* Pop elements from the stack of open elements until
|
||||
* an element with the same tag name as the token has
|
||||
* been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== $token['name']);
|
||||
|
||||
} else {
|
||||
// parse error
|
||||
}
|
||||
break;
|
||||
|
||||
/* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
|
||||
"h5", "h6" */
|
||||
case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
|
||||
$elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
|
||||
|
||||
/* If the stack of open elements has in scope an element whose
|
||||
tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
|
||||
generate implied end tags. */
|
||||
if($this->elementInScope($elements)) {
|
||||
$this->generateImpliedEndTags();
|
||||
|
||||
/* Now, if the current node is not an element with the same
|
||||
tag name as that of the token, then this is a parse error. */
|
||||
// XERROR: implement parse error
|
||||
|
||||
/* If the stack of open elements has in scope an element
|
||||
whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
|
||||
"h6", then pop elements from the stack until an element
|
||||
with one of those tag names has been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while (!in_array($node->tagName, $elements));
|
||||
} else {
|
||||
// parse error
|
||||
}
|
||||
break;
|
||||
|
||||
/* An end tag whose tag name is one of: "a", "b", "big", "code", "em",
|
||||
"font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
|
||||
case 'a': case 'b': case 'big': case 'code': case 'em': case 'font':
|
||||
case 'i': case 'nobr': case 's': case 'small': case 'strike':
|
||||
case 'strong': case 'tt': case 'u':
|
||||
// XERROR: generally speaking this needs parse error logic
|
||||
/* 1. Let the formatting element be the last element in
|
||||
the list of active formatting elements that:
|
||||
* is between the end of the list and the last scope
|
||||
marker in the list, if any, or the start of the list
|
||||
otherwise, and
|
||||
* has the same tag name as the token.
|
||||
*/
|
||||
while(true) {
|
||||
for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
|
||||
if($this->a_formatting[$a] === self::MARKER) {
|
||||
break;
|
||||
|
||||
} elseif($this->a_formatting[$a]->tagName === $token['name']) {
|
||||
$formatting_element = $this->a_formatting[$a];
|
||||
$in_stack = in_array($formatting_element, $this->stack, true);
|
||||
$fe_af_pos = $a;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If there is no such node, or, if that node is
|
||||
also in the stack of open elements but the element
|
||||
is not in scope, then this is a parse error. Abort
|
||||
these steps. The token is ignored. */
|
||||
if(!isset($formatting_element) || ($in_stack &&
|
||||
!$this->elementInScope($token['name']))) {
|
||||
$this->ignored = true;
|
||||
break;
|
||||
|
||||
/* Otherwise, if there is such a node, but that node
|
||||
is not in the stack of open elements, then this is a
|
||||
parse error; remove the element from the list, and
|
||||
abort these steps. */
|
||||
} elseif(isset($formatting_element) && !$in_stack) {
|
||||
unset($this->a_formatting[$fe_af_pos]);
|
||||
$this->a_formatting = array_merge($this->a_formatting);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Otherwise, there is a formatting element and that
|
||||
* element is in the stack and is in scope. If the
|
||||
* element is not the current node, this is a parse
|
||||
* error. In any case, proceed with the algorithm as
|
||||
* written in the following steps. */
|
||||
// XERROR: implement me
|
||||
|
||||
/* 2. Let the furthest block be the topmost node in the
|
||||
stack of open elements that is lower in the stack
|
||||
than the formatting element, and is not an element in
|
||||
the phrasing or formatting categories. There might
|
||||
not be one. */
|
||||
$fe_s_pos = array_search($formatting_element, $this->stack, true);
|
||||
$length = count($this->stack);
|
||||
|
||||
for($s = $fe_s_pos + 1; $s < $length; $s++) {
|
||||
$category = $this->getElementCategory($this->stack[$s]);
|
||||
|
||||
if($category !== self::PHRASING && $category !== self::FORMATTING) {
|
||||
$furthest_block = $this->stack[$s];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* 3. If there is no furthest block, then the UA must
|
||||
skip the subsequent steps and instead just pop all
|
||||
the nodes from the bottom of the stack of open
|
||||
elements, from the current node up to the formatting
|
||||
element, and remove the formatting element from the
|
||||
list of active formatting elements. */
|
||||
if(!isset($furthest_block)) {
|
||||
for($n = $length - 1; $n >= $fe_s_pos; $n--) {
|
||||
array_pop($this->stack);
|
||||
}
|
||||
|
||||
unset($this->a_formatting[$fe_af_pos]);
|
||||
$this->a_formatting = array_merge($this->a_formatting);
|
||||
break;
|
||||
}
|
||||
|
||||
/* 4. Let the common ancestor be the element
|
||||
immediately above the formatting element in the stack
|
||||
of open elements. */
|
||||
$common_ancestor = $this->stack[$fe_s_pos - 1];
|
||||
|
||||
/* 5. Let a bookmark note the position of the
|
||||
formatting element in the list of active formatting
|
||||
elements relative to the elements on either side
|
||||
of it in the list. */
|
||||
$bookmark = $fe_af_pos;
|
||||
|
||||
/* 6. Let node and last node be the furthest block.
|
||||
Follow these steps: */
|
||||
$node = $furthest_block;
|
||||
$last_node = $furthest_block;
|
||||
|
||||
while(true) {
|
||||
for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
|
||||
/* 6.1 Let node be the element immediately
|
||||
prior to node in the stack of open elements. */
|
||||
$node = $this->stack[$n];
|
||||
|
||||
/* 6.2 If node is not in the list of active
|
||||
formatting elements, then remove node from
|
||||
the stack of open elements and then go back
|
||||
to step 1. */
|
||||
if(!in_array($node, $this->a_formatting, true)) {
|
||||
array_splice($this->stack, $n, 1);
|
||||
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* 6.3 Otherwise, if node is the formatting
|
||||
element, then go to the next step in the overall
|
||||
algorithm. */
|
||||
if($node === $formatting_element) {
|
||||
break;
|
||||
|
||||
/* 6.4 Otherwise, if last node is the furthest
|
||||
block, then move the aforementioned bookmark to
|
||||
be immediately after the node in the list of
|
||||
active formatting elements. */
|
||||
} elseif($last_node === $furthest_block) {
|
||||
$bookmark = array_search($node, $this->a_formatting, true) + 1;
|
||||
}
|
||||
|
||||
/* 6.5 Create an element for the token for which
|
||||
* the element node was created, replace the entry
|
||||
* for node in the list of active formatting
|
||||
* elements with an entry for the new element,
|
||||
* replace the entry for node in the stack of open
|
||||
* elements with an entry for the new element, and
|
||||
* let node be the new element. */
|
||||
// we don't know what the token is anymore
|
||||
$clone = $node->cloneNode();
|
||||
$a_pos = array_search($node, $this->a_formatting, true);
|
||||
$s_pos = array_search($node, $this->stack, true);
|
||||
$this->a_formatting[$a_pos] = $clone;
|
||||
$this->stack[$s_pos] = $clone;
|
||||
$node = $clone;
|
||||
|
||||
/* 6.6 Insert last node into node, first removing
|
||||
it from its previous parent node if any. */
|
||||
if($last_node->parentNode !== null) {
|
||||
$last_node->parentNode->removeChild($last_node);
|
||||
}
|
||||
|
||||
$node->appendChild($last_node);
|
||||
|
||||
/* 6.7 Let last node be node. */
|
||||
$last_node = $node;
|
||||
|
||||
/* 6.8 Return to step 1 of this inner set of steps. */
|
||||
}
|
||||
|
||||
/* 7. If the common ancestor node is a table, tbody,
|
||||
* tfoot, thead, or tr element, then, foster parent
|
||||
* whatever last node ended up being in the previous
|
||||
* step, first removing it from its previous parent
|
||||
* node if any. */
|
||||
if ($last_node->parentNode) { // common step
|
||||
$last_node->parentNode->removeChild($last_node);
|
||||
}
|
||||
if (in_array($common_ancestor->tagName, array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
|
||||
$this->fosterParent($last_node);
|
||||
/* Otherwise, append whatever last node ended up being
|
||||
* in the previous step to the common ancestor node,
|
||||
* first removing it from its previous parent node if
|
||||
* any. */
|
||||
} else {
|
||||
$common_ancestor->appendChild($last_node);
|
||||
}
|
||||
|
||||
/* 8. Create an element for the token for which the
|
||||
* formatting element was created. */
|
||||
$clone = $formatting_element->cloneNode();
|
||||
|
||||
/* 9. Take all of the child nodes of the furthest
|
||||
block and append them to the element created in the
|
||||
last step. */
|
||||
while($furthest_block->hasChildNodes()) {
|
||||
$child = $furthest_block->firstChild;
|
||||
$furthest_block->removeChild($child);
|
||||
$clone->appendChild($child);
|
||||
}
|
||||
|
||||
/* 10. Append that clone to the furthest block. */
|
||||
$furthest_block->appendChild($clone);
|
||||
|
||||
/* 11. Remove the formatting element from the list
|
||||
of active formatting elements, and insert the new element
|
||||
into the list of active formatting elements at the
|
||||
position of the aforementioned bookmark. */
|
||||
$fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
|
||||
array_splice($this->a_formatting, $fe_af_pos, 1);
|
||||
|
||||
$af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
|
||||
$af_part2 = array_slice($this->a_formatting, $bookmark);
|
||||
$this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
|
||||
|
||||
/* 12. Remove the formatting element from the stack
|
||||
of open elements, and insert the new element into the stack
|
||||
of open elements immediately below the position of the
|
||||
furthest block in that stack. */
|
||||
$fe_s_pos = array_search($formatting_element, $this->stack, true);
|
||||
array_splice($this->stack, $fe_s_pos, 1);
|
||||
|
||||
$fb_s_pos = array_search($furthest_block, $this->stack, true);
|
||||
$s_part1 = array_slice($this->stack, 0, $fb_s_pos + 1);
|
||||
$s_part2 = array_slice($this->stack, $fb_s_pos + 1);
|
||||
$this->stack = array_merge($s_part1, array($clone), $s_part2);
|
||||
|
||||
/* 13. Jump back to step 1 in this series of steps. */
|
||||
unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
|
||||
}
|
||||
break;
|
||||
|
||||
case 'applet': case 'button': case 'marquee': case 'object':
|
||||
/* If the stack of open elements has an element in scope whose
|
||||
tag name matches the tag name of the token, then generate implied
|
||||
end tags. */
|
||||
if($this->elementInScope($token['name'])) {
|
||||
$this->generateImpliedEndTags();
|
||||
|
||||
/* Now, if the current node is not an element with the same
|
||||
tag name as the token, then this is a parse error. */
|
||||
// XERROR: implement logic
|
||||
|
||||
/* Pop elements from the stack of open elements until
|
||||
* an element with the same tag name as the token has
|
||||
* been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== $token['name']);
|
||||
|
||||
/* Clear the list of active formatting elements up to the
|
||||
* last marker. */
|
||||
$keys = array_keys($this->a_formatting, self::MARKER, true);
|
||||
$marker = end($keys);
|
||||
|
||||
for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
|
||||
array_pop($this->a_formatting);
|
||||
}
|
||||
} else {
|
||||
// parse error
|
||||
}
|
||||
break;
|
||||
|
||||
case 'br':
|
||||
// Parse error
|
||||
$this->emitToken(array(
|
||||
'name' => 'br',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
));
|
||||
break;
|
||||
|
||||
/* An end tag token not covered by the previous entries */
|
||||
default:
|
||||
for($n = count($this->stack) - 1; $n >= 0; $n--) {
|
||||
/* Initialise node to be the current node (the bottommost
|
||||
node of the stack). */
|
||||
$node = $this->stack[$n];
|
||||
|
||||
/* If node has the same tag name as the end tag token,
|
||||
then: */
|
||||
if($token['name'] === $node->tagName) {
|
||||
/* Generate implied end tags. */
|
||||
$this->generateImpliedEndTags();
|
||||
|
||||
/* If the tag name of the end tag token does not
|
||||
match the tag name of the current node, this is a
|
||||
parse error. */
|
||||
// XERROR: implement this
|
||||
|
||||
/* Pop all the nodes from the current node up to
|
||||
node, including node, then stop these steps. */
|
||||
// XSKETCHY
|
||||
do {
|
||||
$pop = array_pop($this->stack);
|
||||
} while ($pop !== $node);
|
||||
break;
|
||||
|
||||
} else {
|
||||
$category = $this->getElementCategory($node);
|
||||
|
||||
if($category !== self::FORMATTING && $category !== self::PHRASING) {
|
||||
/* Otherwise, if node is in neither the formatting
|
||||
category nor the phrasing category, then this is a
|
||||
parse error. Stop this algorithm. The end tag token
|
||||
is ignored. */
|
||||
$this->ignored = true;
|
||||
break;
|
||||
// parse error
|
||||
}
|
||||
}
|
||||
/* Set node to the previous entry in the stack of open elements. Loop. */
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_CDATA_RCDATA:
|
||||
if (
|
||||
$token['type'] === HTML5_Tokenizer::CHARACTER ||
|
||||
$token['type'] === HTML5_Tokenizer::SPACECHARACTER
|
||||
) {
|
||||
$this->insertText($token['data']);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
// parse error
|
||||
/* If the current node is a script element, mark the script
|
||||
* element as "already executed". */
|
||||
// probably not necessary
|
||||
array_pop($this->stack);
|
||||
$this->mode = $this->original_mode;
|
||||
$this->emitToken($token);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'script') {
|
||||
array_pop($this->stack);
|
||||
$this->mode = $this->original_mode;
|
||||
// we're ignoring all of the execution stuff
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG) {
|
||||
array_pop($this->stack);
|
||||
$this->mode = $this->original_mode;
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_TABLE:
|
||||
$clear = array('html', 'table');
|
||||
|
||||
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER &&
|
||||
/* If the current table is tainted, then act as described in
|
||||
* the "anything else" entry below. */
|
||||
// Note: hsivonen has a test that fails due to this line
|
||||
// because he wants to convince Hixie not to do taint
|
||||
!$this->currentTableIsTainted()) {
|
||||
/* Append the character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
/* A start tag whose tag name is "caption" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'caption') {
|
||||
/* Clear the stack back to a table context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Insert a marker at the end of the list of active
|
||||
formatting elements. */
|
||||
$this->a_formatting[] = self::MARKER;
|
||||
|
||||
/* Insert an HTML element for the token, then switch the
|
||||
insertion mode to "in caption". */
|
||||
$this->insertElement($token);
|
||||
$this->mode = self::IN_CAPTION;
|
||||
|
||||
/* A start tag whose tag name is "colgroup" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'colgroup') {
|
||||
/* Clear the stack back to a table context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Insert an HTML element for the token, then switch the
|
||||
insertion mode to "in column group". */
|
||||
$this->insertElement($token);
|
||||
$this->mode = self::IN_COLUMN_GROUP;
|
||||
|
||||
/* A start tag whose tag name is "col" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'col') {
|
||||
$this->emitToken(array(
|
||||
'name' => 'colgroup',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
|
||||
$this->emitToken($token);
|
||||
|
||||
/* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
||||
array('tbody', 'tfoot', 'thead'))) {
|
||||
/* Clear the stack back to a table context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Insert an HTML element for the token, then switch the insertion
|
||||
mode to "in table body". */
|
||||
$this->insertElement($token);
|
||||
$this->mode = self::IN_TABLE_BODY;
|
||||
|
||||
/* A start tag whose tag name is one of: "td", "th", "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
in_array($token['name'], array('td', 'th', 'tr'))) {
|
||||
/* Act as if a start tag token with the tag name "tbody" had been
|
||||
seen, then reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'tbody',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
|
||||
$this->emitToken($token);
|
||||
|
||||
/* A start tag whose tag name is "table" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'table') {
|
||||
/* Parse error. Act as if an end tag token with the tag name "table"
|
||||
had been seen, then, if that token wasn't ignored, reprocess the
|
||||
current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'table',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
if (!$this->ignored) $this->emitToken($token);
|
||||
|
||||
/* An end tag whose tag name is "table" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'table') {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as the token, this is a parse error.
|
||||
Ignore the token. (fragment case) */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== 'table');
|
||||
|
||||
/* Reset the insertion mode appropriately. */
|
||||
$this->resetInsertionMode();
|
||||
}
|
||||
|
||||
/* An end tag whose tag name is one of: "body", "caption", "col",
|
||||
"colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
||||
array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
|
||||
'tfoot', 'th', 'thead', 'tr'))) {
|
||||
// Parse error. Ignore the token.
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
($token['name'] === 'style' || $token['name'] === 'script')) {
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'input' &&
|
||||
// assignment is intentional
|
||||
/* If the token does not have an attribute with the name "type", or
|
||||
* if it does, but that attribute's value is not an ASCII
|
||||
* case-insensitive match for the string "hidden", then: act as
|
||||
* described in the "anything else" entry below. */
|
||||
($type = $this->getAttr($token, 'type')) && strtolower($type) === 'hidden') {
|
||||
// I.e., if it's an input with the type attribute == 'hidden'
|
||||
/* Otherwise */
|
||||
// parse error
|
||||
$this->insertElement($token);
|
||||
array_pop($this->stack);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
/* If the current node is not the root html element, then this is a parse error. */
|
||||
if (end($this->stack)->tagName !== 'html') {
|
||||
// Note: It can only be the current node in the fragment case.
|
||||
// parse error
|
||||
}
|
||||
/* Stop parsing. */
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Parse error. Process the token as if the insertion mode was "in
|
||||
body", with the following exception: */
|
||||
|
||||
$old = $this->foster_parent;
|
||||
$this->foster_parent = true;
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
$this->foster_parent = $old;
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_CAPTION:
|
||||
/* An end tag whose tag name is "caption" */
|
||||
if($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'caption') {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as the token, this is a parse error.
|
||||
Ignore the token. (fragment case) */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
$this->ignored = true;
|
||||
// Ignore
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Generate implied end tags. */
|
||||
$this->generateImpliedEndTags();
|
||||
|
||||
/* Now, if the current node is not a caption element, then this
|
||||
is a parse error. */
|
||||
// XERROR: implement
|
||||
|
||||
/* Pop elements from this stack until a caption element has
|
||||
been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== 'caption');
|
||||
|
||||
/* Clear the list of active formatting elements up to the last
|
||||
marker. */
|
||||
$this->clearTheActiveFormattingElementsUpToTheLastMarker();
|
||||
|
||||
/* Switch the insertion mode to "in table". */
|
||||
$this->mode = self::IN_TABLE;
|
||||
}
|
||||
|
||||
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
||||
"tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
|
||||
name is "table" */
|
||||
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
||||
array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
|
||||
'thead', 'tr'))) || ($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'table')) {
|
||||
/* Parse error. Act as if an end tag with the tag name "caption"
|
||||
had been seen, then, if that token wasn't ignored, reprocess the
|
||||
current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'caption',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
if (!$this->ignored) $this->emitToken($token);
|
||||
|
||||
/* An end tag whose tag name is one of: "body", "col", "colgroup",
|
||||
"html", "tbody", "td", "tfoot", "th", "thead", "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
||||
array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th',
|
||||
'thead', 'tr'))) {
|
||||
// Parse error. Ignore the token.
|
||||
$this->ignored = true;
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Process the token as if the insertion mode was "in body". */
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_COLUMN_GROUP:
|
||||
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Append the character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$this->insertToken($token['data']);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* A start tag whose tag name is "col" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'col') {
|
||||
/* Insert a col element for the token. Immediately pop the current
|
||||
node off the stack of open elements. */
|
||||
$this->insertElement($token);
|
||||
array_pop($this->stack);
|
||||
// XERROR: Acknowledge the token's self-closing flag, if it is set.
|
||||
|
||||
/* An end tag whose tag name is "colgroup" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'colgroup') {
|
||||
/* If the current node is the root html element, then this is a
|
||||
parse error, ignore the token. (fragment case) */
|
||||
if(end($this->stack)->tagName === 'html') {
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise, pop the current node (which will be a colgroup
|
||||
element) from the stack of open elements. Switch the insertion
|
||||
mode to "in table". */
|
||||
} else {
|
||||
array_pop($this->stack);
|
||||
$this->mode = self::IN_TABLE;
|
||||
}
|
||||
|
||||
/* An end tag whose tag name is "col" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'col') {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
/* An end-of-file token */
|
||||
/* If the current node is the root html element */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF && end($this->stack)->tagName === 'html') {
|
||||
/* Stop parsing */
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Act as if an end tag with the tag name "colgroup" had been seen,
|
||||
and then, if that token wasn't ignored, reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'colgroup',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
if (!$this->ignored) $this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_TABLE_BODY:
|
||||
$clear = array('tbody', 'tfoot', 'thead', 'html');
|
||||
|
||||
/* A start tag whose tag name is "tr" */
|
||||
if($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'tr') {
|
||||
/* Clear the stack back to a table body context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Insert a tr element for the token, then switch the insertion
|
||||
mode to "in row". */
|
||||
$this->insertElement($token);
|
||||
$this->mode = self::IN_ROW;
|
||||
|
||||
/* A start tag whose tag name is one of: "th", "td" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
($token['name'] === 'th' || $token['name'] === 'td')) {
|
||||
/* Parse error. Act as if a start tag with the tag name "tr" had
|
||||
been seen, then reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'tr',
|
||||
'type' => HTML5_Tokenizer::STARTTAG,
|
||||
'attr' => array()
|
||||
));
|
||||
|
||||
$this->emitToken($token);
|
||||
|
||||
/* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as the token, this is a parse error.
|
||||
Ignore the token. */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
// Parse error
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Clear the stack back to a table body context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Pop the current node from the stack of open elements. Switch
|
||||
the insertion mode to "in table". */
|
||||
array_pop($this->stack);
|
||||
$this->mode = self::IN_TABLE;
|
||||
}
|
||||
|
||||
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
||||
"tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
|
||||
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
||||
array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
|
||||
($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) {
|
||||
/* If the stack of open elements does not have a tbody, thead, or
|
||||
tfoot element in table scope, this is a parse error. Ignore the
|
||||
token. (fragment case) */
|
||||
if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
|
||||
// parse error
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Clear the stack back to a table body context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Act as if an end tag with the same tag name as the current
|
||||
node ("tbody", "tfoot", or "thead") had been seen, then
|
||||
reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => end($this->stack)->tagName,
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
$this->emitToken($token);
|
||||
}
|
||||
|
||||
/* An end tag whose tag name is one of: "body", "caption", "col",
|
||||
"colgroup", "html", "td", "th", "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
||||
array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Process the token as if the insertion mode was "in table". */
|
||||
$this->processWithRulesFor($token, self::IN_TABLE);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_ROW:
|
||||
$clear = array('tr', 'html');
|
||||
|
||||
/* A start tag whose tag name is one of: "th", "td" */
|
||||
if($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
($token['name'] === 'th' || $token['name'] === 'td')) {
|
||||
/* Clear the stack back to a table row context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Insert an HTML element for the token, then switch the insertion
|
||||
mode to "in cell". */
|
||||
$this->insertElement($token);
|
||||
$this->mode = self::IN_CELL;
|
||||
|
||||
/* Insert a marker at the end of the list of active formatting
|
||||
elements. */
|
||||
$this->a_formatting[] = self::MARKER;
|
||||
|
||||
/* An end tag whose tag name is "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'tr') {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as the token, this is a parse error.
|
||||
Ignore the token. (fragment case) */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
// Ignore.
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Clear the stack back to a table row context. */
|
||||
$this->clearStackToTableContext($clear);
|
||||
|
||||
/* Pop the current node (which will be a tr element) from the
|
||||
stack of open elements. Switch the insertion mode to "in table
|
||||
body". */
|
||||
array_pop($this->stack);
|
||||
$this->mode = self::IN_TABLE_BODY;
|
||||
}
|
||||
|
||||
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
||||
"tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
|
||||
} elseif(($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
||||
array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
|
||||
($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) {
|
||||
/* Act as if an end tag with the tag name "tr" had been seen, then,
|
||||
if that token wasn't ignored, reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'tr',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
if (!$this->ignored) $this->emitToken($token);
|
||||
|
||||
/* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as the token, this is a parse error.
|
||||
Ignore the token. */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Otherwise, act as if an end tag with the tag name "tr" had
|
||||
been seen, then reprocess the current token. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'tr',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
$this->emitToken($token);
|
||||
}
|
||||
|
||||
/* An end tag whose tag name is one of: "body", "caption", "col",
|
||||
"colgroup", "html", "td", "th" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
||||
array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Process the token as if the insertion mode was "in table". */
|
||||
$this->processWithRulesFor($token, self::IN_TABLE);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_CELL:
|
||||
/* An end tag whose tag name is one of: "td", "th" */
|
||||
if($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
($token['name'] === 'td' || $token['name'] === 'th')) {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as that of the token, then this is a
|
||||
parse error and the token must be ignored. */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Generate implied end tags, except for elements with the same
|
||||
tag name as the token. */
|
||||
$this->generateImpliedEndTags(array($token['name']));
|
||||
|
||||
/* Now, if the current node is not an element with the same tag
|
||||
name as the token, then this is a parse error. */
|
||||
// XERROR: Implement parse error code
|
||||
|
||||
/* Pop elements from this stack until an element with the same
|
||||
tag name as the token has been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== $token['name']);
|
||||
|
||||
/* Clear the list of active formatting elements up to the last
|
||||
marker. */
|
||||
$this->clearTheActiveFormattingElementsUpToTheLastMarker();
|
||||
|
||||
/* Switch the insertion mode to "in row". (The current node
|
||||
will be a tr element at this point.) */
|
||||
$this->mode = self::IN_ROW;
|
||||
}
|
||||
|
||||
/* A start tag whose tag name is one of: "caption", "col", "colgroup",
|
||||
"tbody", "td", "tfoot", "th", "thead", "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
|
||||
array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
|
||||
'thead', 'tr'))) {
|
||||
/* If the stack of open elements does not have a td or th element
|
||||
in table scope, then this is a parse error; ignore the token.
|
||||
(fragment case) */
|
||||
if(!$this->elementInScope(array('td', 'th'), true)) {
|
||||
// parse error
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise, close the cell (see below) and reprocess the current
|
||||
token. */
|
||||
} else {
|
||||
$this->closeCell();
|
||||
$this->emitToken($token);
|
||||
}
|
||||
|
||||
/* An end tag whose tag name is one of: "body", "caption", "col",
|
||||
"colgroup", "html" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
||||
array('body', 'caption', 'col', 'colgroup', 'html'))) {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
|
||||
/* An end tag whose tag name is one of: "table", "tbody", "tfoot",
|
||||
"thead", "tr" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
|
||||
array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
|
||||
/* If the stack of open elements does not have a td or th element
|
||||
in table scope, then this is a parse error; ignore the token.
|
||||
(innerHTML case) */
|
||||
if(!$this->elementInScope(array('td', 'th'), true)) {
|
||||
// Parse error
|
||||
$this->ignored = true;
|
||||
|
||||
/* Otherwise, close the cell (see below) and reprocess the current
|
||||
token. */
|
||||
} else {
|
||||
$this->closeCell();
|
||||
$this->emitToken($token);
|
||||
}
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Process the token as if the insertion mode was "in body". */
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_SELECT:
|
||||
/* Handle the token as follows: */
|
||||
|
||||
/* A character token */
|
||||
if(
|
||||
$token['type'] === HTML5_Tokenizer::CHARACTER ||
|
||||
$token['type'] === HTML5_Tokenizer::SPACECHARACTER
|
||||
) {
|
||||
/* Append the token's character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::INBODY);
|
||||
|
||||
/* A start tag token whose tag name is "option" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'option') {
|
||||
/* If the current node is an option element, act as if an end tag
|
||||
with the tag name "option" had been seen. */
|
||||
if(end($this->stack)->tagName === 'option') {
|
||||
$this->emitToken(array(
|
||||
'name' => 'option',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* A start tag token whose tag name is "optgroup" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'optgroup') {
|
||||
/* If the current node is an option element, act as if an end tag
|
||||
with the tag name "option" had been seen. */
|
||||
if(end($this->stack)->tagName === 'option') {
|
||||
$this->emitToken(array(
|
||||
'name' => 'option',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* If the current node is an optgroup element, act as if an end tag
|
||||
with the tag name "optgroup" had been seen. */
|
||||
if(end($this->stack)->tagName === 'optgroup') {
|
||||
$this->emitToken(array(
|
||||
'name' => 'optgroup',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* An end tag token whose tag name is "optgroup" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'optgroup') {
|
||||
/* First, if the current node is an option element, and the node
|
||||
immediately before it in the stack of open elements is an optgroup
|
||||
element, then act as if an end tag with the tag name "option" had
|
||||
been seen. */
|
||||
$elements_in_stack = count($this->stack);
|
||||
|
||||
if($this->stack[$elements_in_stack - 1]->tagName === 'option' &&
|
||||
$this->stack[$elements_in_stack - 2]->tagName === 'optgroup') {
|
||||
$this->emitToken(array(
|
||||
'name' => 'option',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
}
|
||||
|
||||
/* If the current node is an optgroup element, then pop that node
|
||||
from the stack of open elements. Otherwise, this is a parse error,
|
||||
ignore the token. */
|
||||
if(end($this->stack)->tagName === 'optgroup') {
|
||||
array_pop($this->stack);
|
||||
} else {
|
||||
// parse error
|
||||
$this->ignored = true;
|
||||
}
|
||||
|
||||
/* An end tag token whose tag name is "option" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'option') {
|
||||
/* If the current node is an option element, then pop that node
|
||||
from the stack of open elements. Otherwise, this is a parse error,
|
||||
ignore the token. */
|
||||
if(end($this->stack)->tagName === 'option') {
|
||||
array_pop($this->stack);
|
||||
} else {
|
||||
// parse error
|
||||
$this->ignored = true;
|
||||
}
|
||||
|
||||
/* An end tag whose tag name is "select" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'select') {
|
||||
/* If the stack of open elements does not have an element in table
|
||||
scope with the same tag name as the token, this is a parse error.
|
||||
Ignore the token. (fragment case) */
|
||||
if(!$this->elementInScope($token['name'], true)) {
|
||||
$this->ignored = true;
|
||||
// parse error
|
||||
|
||||
/* Otherwise: */
|
||||
} else {
|
||||
/* Pop elements from the stack of open elements until a select
|
||||
element has been popped from the stack. */
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->tagName !== 'select');
|
||||
|
||||
/* Reset the insertion mode appropriately. */
|
||||
$this->resetInsertionMode();
|
||||
}
|
||||
|
||||
/* A start tag whose tag name is "select" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'select') {
|
||||
/* Parse error. Act as if the token had been an end tag with the
|
||||
tag name "select" instead. */
|
||||
$this->emitToken(array(
|
||||
'name' => 'select',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
($token['name'] === 'input' || $token['name'] === 'textarea')) {
|
||||
// parse error
|
||||
$this->emitToken(array(
|
||||
'name' => 'select',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
$this->emitToken($token);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') {
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
// XERROR: If the current node is not the root html element, then this is a parse error.
|
||||
/* Stop parsing */
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_SELECT_IN_TABLE:
|
||||
|
||||
if($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
in_array($token['name'], array('caption', 'table', 'tbody',
|
||||
'tfoot', 'thead', 'tr', 'td', 'th'))) {
|
||||
// parse error
|
||||
$this->emitToken(array(
|
||||
'name' => 'select',
|
||||
'type' => HTML5_Tokenizer::ENDTAG,
|
||||
));
|
||||
$this->emitToken($token);
|
||||
|
||||
/* An end tag whose tag name is one of: "caption", "table", "tbody",
|
||||
"tfoot", "thead", "tr", "td", "th" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
in_array($token['name'], array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
|
||||
/* Parse error. */
|
||||
// parse error
|
||||
|
||||
/* If the stack of open elements has an element in table scope with
|
||||
the same tag name as that of the token, then act as if an end tag
|
||||
with the tag name "select" had been seen, and reprocess the token.
|
||||
Otherwise, ignore the token. */
|
||||
if($this->elementInScope($token['name'], true)) {
|
||||
$this->emitToken(array(
|
||||
'name' => 'select',
|
||||
'type' => HTML5_Tokenizer::ENDTAG
|
||||
));
|
||||
|
||||
$this->emitToken($token);
|
||||
} else {
|
||||
$this->ignored = true;
|
||||
}
|
||||
} else {
|
||||
$this->processWithRulesFor($token, self::IN_SELECT);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_FOREIGN_CONTENT:
|
||||
if ($token['type'] === HTML5_Tokenizer::CHARACTER ||
|
||||
$token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
$this->insertText($token['data']);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
$this->insertComment($token['data']);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// XERROR: parse error
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'script' && end($this->stack)->tagName === 'script' &&
|
||||
end($this->stack)->namespaceURI === self::NS_SVG) {
|
||||
array_pop($this->stack);
|
||||
// a bunch of script running mumbo jumbo
|
||||
} elseif (
|
||||
($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
((
|
||||
$token['name'] !== 'mglyph' &&
|
||||
$token['name'] !== 'malignmark' &&
|
||||
end($this->stack)->namespaceURI === self::NS_MATHML &&
|
||||
in_array(end($this->stack)->tagName, array('mi', 'mo', 'mn', 'ms', 'mtext'))
|
||||
) ||
|
||||
(
|
||||
$token['name'] === 'svg' &&
|
||||
end($this->stack)->namespaceURI === self::NS_MATHML &&
|
||||
end($this->stack)->tagName === 'annotation-xml'
|
||||
) ||
|
||||
(
|
||||
end($this->stack)->namespaceURI === self::NS_SVG &&
|
||||
in_array(end($this->stack)->tagName, array('foreignObject', 'desc', 'title'))
|
||||
) ||
|
||||
(
|
||||
// XSKETCHY
|
||||
end($this->stack)->namespaceURI === self::NS_HTML
|
||||
))
|
||||
) || $token['type'] === HTML5_Tokenizer::ENDTAG
|
||||
) {
|
||||
$this->processWithRulesFor($token, $this->secondary_mode);
|
||||
/* If, after doing so, the insertion mode is still "in foreign
|
||||
* content", but there is no element in scope that has a namespace
|
||||
* other than the HTML namespace, switch the insertion mode to the
|
||||
* secondary insertion mode. */
|
||||
if ($this->mode === self::IN_FOREIGN_CONTENT) {
|
||||
$found = false;
|
||||
// this basically duplicates elementInScope()
|
||||
for ($i = count($this->stack) - 1; $i >= 0; $i--) {
|
||||
$node = $this->stack[$i];
|
||||
if ($node->namespaceURI !== self::NS_HTML) {
|
||||
$found = true;
|
||||
break;
|
||||
} elseif (in_array($node->tagName, array('table', 'html',
|
||||
'applet', 'caption', 'td', 'th', 'button', 'marquee',
|
||||
'object')) || ($node->tagName === 'foreignObject' &&
|
||||
$node->namespaceURI === self::NS_SVG)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!$found) {
|
||||
$this->mode = $this->secondary_mode;
|
||||
}
|
||||
}
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::EOF || (
|
||||
$token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
(in_array($token['name'], array('b', "big", "blockquote", "body", "br",
|
||||
"center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2",
|
||||
"h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing",
|
||||
"menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small",
|
||||
"span", "strong", "strike", "sub", "sup", "table", "tt", "u", "ul",
|
||||
"var")) || ($token['name'] === 'font' && ($this->getAttr($token, 'color') ||
|
||||
$this->getAttr($token, 'face') || $this->getAttr($token, 'size')))))) {
|
||||
// XERROR: parse error
|
||||
do {
|
||||
$node = array_pop($this->stack);
|
||||
} while ($node->namespaceURI !== self::NS_HTML);
|
||||
$this->stack[] = $node;
|
||||
$this->mode = $this->secondary_mode;
|
||||
$this->emitToken($token);
|
||||
} elseif ($token['type'] === HTML5_Tokenizer::STARTTAG) {
|
||||
static $svg_lookup = array(
|
||||
'altglyph' => 'altGlyph',
|
||||
'altglyphdef' => 'altGlyphDef',
|
||||
'altglyphitem' => 'altGlyphItem',
|
||||
'animatecolor' => 'animateColor',
|
||||
'animatemotion' => 'animateMotion',
|
||||
'animatetransform' => 'animateTransform',
|
||||
'clippath' => 'clipPath',
|
||||
'feblend' => 'feBlend',
|
||||
'fecolormatrix' => 'feColorMatrix',
|
||||
'fecomponenttransfer' => 'feComponentTransfer',
|
||||
'fecomposite' => 'feComposite',
|
||||
'feconvolvematrix' => 'feConvolveMatrix',
|
||||
'fediffuselighting' => 'feDiffuseLighting',
|
||||
'fedisplacementmap' => 'feDisplacementMap',
|
||||
'fedistantlight' => 'feDistantLight',
|
||||
'feflood' => 'feFlood',
|
||||
'fefunca' => 'feFuncA',
|
||||
'fefuncb' => 'feFuncB',
|
||||
'fefuncg' => 'feFuncG',
|
||||
'fefuncr' => 'feFuncR',
|
||||
'fegaussianblur' => 'feGaussianBlur',
|
||||
'feimage' => 'feImage',
|
||||
'femerge' => 'feMerge',
|
||||
'femergenode' => 'feMergeNode',
|
||||
'femorphology' => 'feMorphology',
|
||||
'feoffset' => 'feOffset',
|
||||
'fepointlight' => 'fePointLight',
|
||||
'fespecularlighting' => 'feSpecularLighting',
|
||||
'fespotlight' => 'feSpotLight',
|
||||
'fetile' => 'feTile',
|
||||
'feturbulence' => 'feTurbulence',
|
||||
'foreignobject' => 'foreignObject',
|
||||
'glyphref' => 'glyphRef',
|
||||
'lineargradient' => 'linearGradient',
|
||||
'radialgradient' => 'radialGradient',
|
||||
'textpath' => 'textPath',
|
||||
);
|
||||
$current = end($this->stack);
|
||||
if ($current->namespaceURI === self::NS_MATHML) {
|
||||
$token = $this->adjustMathMLAttributes($token);
|
||||
}
|
||||
if ($current->namespaceURI === self::NS_SVG &&
|
||||
isset($svg_lookup[$token['name']])) {
|
||||
$token['name'] = $svg_lookup[$token['name']];
|
||||
}
|
||||
if ($current->namespaceURI === self::NS_SVG) {
|
||||
$token = $this->adjustSVGAttributes($token);
|
||||
}
|
||||
$token = $this->adjustForeignAttributes($token);
|
||||
$this->insertForeignElement($token, $current->namespaceURI);
|
||||
if (isset($token['self-closing'])) {
|
||||
array_pop($this->stack);
|
||||
// XERROR: acknowledge self-closing flag
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case self::AFTER_BODY:
|
||||
/* Handle the token as follows: */
|
||||
|
||||
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Process the token as it would be processed if the insertion mode
|
||||
was "in body". */
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the first element in the stack of open
|
||||
elements (the html element), with the data attribute set to the
|
||||
data given in the comment token. */
|
||||
$comment = $this->dom->createComment($token['data']);
|
||||
$this->stack[0]->appendChild($comment);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* An end tag with the tag name "html" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'html') {
|
||||
/* If the parser was originally created as part of the HTML
|
||||
* fragment parsing algorithm, this is a parse error; ignore
|
||||
* the token. (fragment case) */
|
||||
$this->ignored = true;
|
||||
// XERROR: implement this
|
||||
|
||||
$this->mode = self::AFTER_AFTER_BODY;
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
/* Stop parsing */
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Parse error. Set the insertion mode to "in body" and reprocess
|
||||
the token. */
|
||||
$this->mode = self::IN_BODY;
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::IN_FRAMESET:
|
||||
/* Handle the token as follows: */
|
||||
|
||||
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Append the character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
/* A start tag with the tag name "frameset" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'frameset') {
|
||||
$this->insertElement($token);
|
||||
|
||||
/* An end tag with the tag name "frameset" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'frameset') {
|
||||
/* If the current node is the root html element, then this is a
|
||||
parse error; ignore the token. (fragment case) */
|
||||
if(end($this->stack)->tagName === 'html') {
|
||||
$this->ignored = true;
|
||||
// Parse error
|
||||
|
||||
} else {
|
||||
/* Otherwise, pop the current node from the stack of open
|
||||
elements. */
|
||||
array_pop($this->stack);
|
||||
|
||||
/* If the parser was not originally created as part of the HTML
|
||||
* fragment parsing algorithm (fragment case), and the current
|
||||
* node is no longer a frameset element, then switch the
|
||||
* insertion mode to "after frameset". */
|
||||
$this->mode = self::AFTER_FRAMESET;
|
||||
}
|
||||
|
||||
/* A start tag with the tag name "frame" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'frame') {
|
||||
/* Insert an HTML element for the token. */
|
||||
$this->insertElement($token);
|
||||
|
||||
/* Immediately pop the current node off the stack of open elements. */
|
||||
array_pop($this->stack);
|
||||
|
||||
// XERROR: Acknowledge the token's self-closing flag, if it is set.
|
||||
|
||||
/* A start tag with the tag name "noframes" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'noframes') {
|
||||
/* Process the token using the rules for the "in head" insertion mode. */
|
||||
$this->processwithRulesFor($token, self::IN_HEAD);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
// XERROR: If the current node is not the root html element, then this is a parse error.
|
||||
/* Stop parsing */
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case self::AFTER_FRAMESET:
|
||||
/* Handle the token as follows: */
|
||||
|
||||
/* A character token that is one of U+0009 CHARACTER TABULATION,
|
||||
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
|
||||
U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
|
||||
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
|
||||
/* Append the character to the current node. */
|
||||
$this->insertText($token['data']);
|
||||
|
||||
/* A comment token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the current node with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$this->insertComment($token['data']);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE) {
|
||||
// parse error
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* An end tag with the tag name "html" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::ENDTAG &&
|
||||
$token['name'] === 'html') {
|
||||
$this->mode = self::AFTER_AFTER_FRAMESET;
|
||||
|
||||
/* A start tag with the tag name "noframes" */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG &&
|
||||
$token['name'] === 'noframes') {
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
/* Stop parsing */
|
||||
|
||||
/* Anything else */
|
||||
} else {
|
||||
/* Parse error. Ignore the token. */
|
||||
$this->ignored = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case self::AFTER_AFTER_BODY:
|
||||
/* A comment token */
|
||||
if($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the Document object with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$comment = $this->dom->createComment($token['data']);
|
||||
$this->dom->appendChild($comment);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE ||
|
||||
$token['type'] === HTML5_Tokenizer::SPACECHARACTER ||
|
||||
($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html')) {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* An end-of-file token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
/* OMG DONE!! */
|
||||
} else {
|
||||
// parse error
|
||||
$this->mode = self::IN_BODY;
|
||||
$this->emitToken($token);
|
||||
}
|
||||
break;
|
||||
|
||||
case self::AFTER_AFTER_FRAMESET:
|
||||
/* A comment token */
|
||||
if($token['type'] === HTML5_Tokenizer::COMMENT) {
|
||||
/* Append a Comment node to the Document object with the data
|
||||
attribute set to the data given in the comment token. */
|
||||
$comment = $this->dom->createComment($token['data']);
|
||||
$this->dom->appendChild($comment);
|
||||
|
||||
} elseif($token['type'] === HTML5_Tokenizer::DOCTYPE ||
|
||||
$token['type'] === HTML5_Tokenizer::SPACECHARACTER ||
|
||||
($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html')) {
|
||||
$this->processWithRulesFor($token, self::IN_BODY);
|
||||
|
||||
/* An end-of-file token */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::EOF) {
|
||||
/* OMG DONE!! */
|
||||
} elseif($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'nofrmaes') {
|
||||
$this->processWithRulesFor($token, self::IN_HEAD);
|
||||
} else {
|
||||
// parse error
|
||||
}
|
||||
break;
|
||||
}
|
||||
// end funky indenting
|
||||
}
|
||||
|
||||
/**
 * Builds an element in the HTML namespace from a tag token and, unless
 * $append is false, inserts it into the tree and pushes it onto the stack
 * of open elements.
 *
 * @param array $token  Tag token; 'name' is read, and 'attr' (if non-empty)
 *                      is iterated as a list of arrays with 'name' and
 *                      'value' keys.
 * @param bool  $append When true the element is passed to
 *                      appendToRealParent() and becomes the current node.
 * @return mixed The element created by $this->dom->createElementNS().
 */
private function insertElement($token, $append = true) {
    $element = $this->dom->createElementNS(self::NS_HTML, $token['name']);

    $attributes = empty($token['attr']) ? array() : $token['attr'];
    foreach ($attributes as $attribute) {
        // When an attribute name repeats, the first occurrence is kept.
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }
        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    if (!$append) {
        return $element;
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
|
||||
|
||||
/**
 * Appends character data as a text node at the current insertion point.
 *
 * Nothing is inserted for an empty string. While $this->ignore_lf_token is
 * truthy, one leading line feed is dropped first; if that leaves no data,
 * nothing is inserted.
 *
 * @param string $data Character data taken from the token.
 * @return void
 */
private function insertText($data) {
    if ($data === '') {
        return;
    }

    if ($this->ignore_lf_token && $data[0] === "\n") {
        $data = substr($data, 1);
        // substr() reports "nothing left" as false on old PHP versions but
        // as '' on PHP 7+/8; both must be treated as empty, otherwise an
        // empty text node would be appended to the tree.
        if ($data === false || $data === '') {
            return;
        }
    }

    $text = $this->dom->createTextNode($data);
    $this->appendToRealParent($text);
}
|
||||
|
||||
/**
 * Creates a comment node holding $data and hands it to
 * appendToRealParent() for insertion.
 *
 * @param string $data Comment text taken from the token.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
|
||||
|
||||
/**
 * Inserts $node at the current insertion point.
 *
 * Normally the node becomes the last child of the current node (the last
 * entry of the stack of open elements). Only when $this->foster_parent is
 * set AND the current node is a table, tbody, tfoot, thead or tr element
 * is the node handed to fosterParent() instead.
 *
 * @param mixed $node Node to insert.
 * @return void
 */
private function appendToRealParent($node) {
    $table_context = array('table', 'tbody', 'tfoot', 'thead', 'tr');

    // Foster-parenting case: content may not go directly into table structure.
    if ($this->foster_parent && in_array(end($this->stack)->tagName, $table_context)) {
        $this->fosterParent($node);
        return;
    }

    end($this->stack)->appendChild($node);
}
|
||||
|
||||
/**
 * Tells whether the stack of open elements has a given element in scope.
 *
 * The stack is walked from the current node towards the root. The walk
 * succeeds on the first node whose tag name equals $el and fails on the
 * first scope boundary: "table" and "html" always, plus applet, caption,
 * td, th, button, marquee, object and SVG foreignObject when $table is
 * false.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (true if any of them is in scope).
 * @param bool         $table True to test "in table scope" instead of the
 *                            ordinary "in scope".
 * @return bool Always a boolean; false when the stack is exhausted (or
 *              empty) without a match.
 */
private function elementInScope($el, $table = false) {
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* Start at the current node (the bottommost node of the stack) and
    move towards the first entry. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if ($node->tagName === $el) {
            /* The node is the target node: match. */
            return true;

        // boundaries shared by "in scope" and "in table scope"
        } elseif ($node->tagName === 'table' || $node->tagName === 'html') {
            return false;

        // boundaries that only apply to "in scope"
        } elseif (!$table &&
        (in_array($node->tagName, array('applet', 'caption', 'td',
            'th', 'button', 'marquee', 'object')) ||
            ($node->tagName === 'foreignObject' && $node->namespaceURI === self::NS_SVG))) {
            return false;
        }
    }

    // Stack exhausted without meeting the target or a boundary: report an
    // explicit failure instead of falling off the end with null.
    return false;
}
|
||||
|
||||
/**
 * Reconstructs the active formatting elements: every entry at the end of
 * $this->a_formatting that follows the last marker / last still-open
 * element is shallow-cloned, inserted at the current insertion point,
 * pushed onto the stack of open elements, and replaces its list entry.
 *
 * @return false|null false when there was nothing to reconstruct; no
 *                    explicit value otherwise.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    // Step 7 (advance) must run after the rewind loop stops on a marker or
    // an open element; it is skipped only when the rewind reached the very
    // first entry. Leaving this flag unset for the marker / open-element
    // case would make step 8 call cloneNode() on that entry itself.
    $step_seven = true;

    for ($a = $formatting_elements - 1; $a >= 0; true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if ($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven === true) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        $this->appendToRealParent($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) !== $clone) {
            $step_seven = true;
        } else {
            break;
        }
    }
}
|
||||
|
||||
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of $this->a_formatting until a marker
 * has been removed.
 *
 * If the list contains no marker at all, it is simply emptied; the loop
 * stops once nothing is left instead of spinning forever.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    while (count($this->a_formatting) > 0) {
        /* 1-2. Take the last (most recently added) entry and remove it from
        the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
|
||||
|
||||
/**
 * Generates implied end tags: while the current node is a dd, dt, li, p,
 * td, th or tr element whose tag name is not listed in $exclude, it is
 * popped from the stack of open elements.
 *
 * Note that the nodes are popped directly; no end tag token is emitted.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array()) {
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // The emptiness check keeps end() from returning false, which would
    // otherwise be dereferenced as an object.
    while (count($this->stack) > 0 &&
        in_array(end($this->stack)->tagName, $elements, true)) {
        array_pop($this->stack);
    }
}
|
||||
|
||||
/**
 * Classifies a node by tag name, checking $this->special, $this->scoping
 * and $this->formatting in that order.
 *
 * @param mixed $node Node whose tagName is inspected.
 * @return mixed self::SPECIAL, self::SCOPING or self::FORMATTING for a tag
 *               name found in the corresponding list, self::PHRASING
 *               otherwise.
 */
private function getElementCategory($node) {
    if (!is_object($node)) {
        // Report the misuse through the regular error channel rather than
        // dumping a backtrace into the program's output.
        // NOTE(review): PHRASING is returned because a missing tag name is
        // presumed to match none of the three lists - confirm none of them
        // contains an empty or null entry.
        trigger_error(
            'getElementCategory() expects a node object, ' . gettype($node) . ' given',
            E_USER_WARNING
        );
        return self::PHRASING;
    }

    $name = $node->tagName;

    if (in_array($name, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($name, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($name, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
|
||||
|
||||
/**
 * Clears the stack of open elements back to a table context: nodes are
 * popped until the current node's tag name is one of $elements.
 *
 * The caller decides which tag names delimit the context. If none of them
 * is on the stack, the stack ends up empty and the loop stops there
 * instead of running forever.
 *
 * @param array $elements Tag names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements) {
    while (count($this->stack) > 0 &&
        !in_array(end($this->stack)->tagName, $elements)) {
        array_pop($this->stack);
    }
}
|
||||
|
||||
/**
 * Resets the insertion mode by walking the stack of open elements from the
 * current node down to the root and switching $this->mode according to the
 * first node that matches one of the rules below.
 *
 * @param object|null $context Context element (fragment case). When the
 *                             walk reaches the first stack entry, this
 *                             element is inspected in its place; if it is
 *                             null the mode falls back to "in body".
 */
private function resetInsertionMode($context = null) {
    /* 1. Let last be false. */
    $last = false;

    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        /* 2. Let node be the last node in the stack of open elements. */
        $node = $this->stack[$n];

        /* 3. If node is the first node in the stack of open elements, then
         * set last to true and set node to the context element. (fragment
         * case) */
        if($this->stack[0]->isSameNode($node)) {
            $last = true;
            $node = $context;
        }

        // Without a context element there is nothing to inspect at the
        // root; skip straight to step 16 instead of reading properties
        // off null.
        if($node !== null) {
            $tag = $node->tagName;

            /* 4. If node is a select element, then switch the insertion mode to
            "in select" and abort these steps. (fragment case) */
            if($tag === 'select') {
                $this->mode = self::IN_SELECT;
                return;

            /* 5. If node is a td or th element, then switch the insertion mode
            to "in cell" and abort these steps. */
            } elseif($tag === 'td' || $tag === 'th') {
                $this->mode = self::IN_CELL;
                return;

            /* 6. If node is a tr element, then switch the insertion mode to
            "in row" and abort these steps. */
            } elseif($tag === 'tr') {
                $this->mode = self::IN_ROW;
                return;

            /* 7. If node is a tbody, thead, or tfoot element, then switch the
            insertion mode to "in table body" and abort these steps. */
            } elseif(in_array($tag, array('tbody', 'thead', 'tfoot'), true)) {
                $this->mode = self::IN_TABLE_BODY;
                return;

            /* 8. If node is a caption element, then switch the insertion mode
            to "in caption" and abort these steps. */
            } elseif($tag === 'caption') {
                $this->mode = self::IN_CAPTION;
                return;

            /* 9. If node is a colgroup element, then switch the insertion mode
            to "in column group" and abort these steps. (innerHTML case) */
            } elseif($tag === 'colgroup') {
                $this->mode = self::IN_COLUMN_GROUP;
                return;

            /* 10. If node is a table element, then switch the insertion mode
            to "in table" and abort these steps. */
            } elseif($tag === 'table') {
                $this->mode = self::IN_TABLE;
                return;

            /* 11. If node is an element from the MathML namespace or the SVG
             * namespace, then switch the insertion mode to "in foreign
             * content", let the secondary insertion mode be "in body", and
             * abort these steps. */
            } elseif($node->namespaceURI === self::NS_SVG ||
            $node->namespaceURI === self::NS_MATHML) {
                $this->mode = self::IN_FOREIGN_CONTENT;
                $this->secondary_mode = self::IN_BODY;
                return;

            /* 12. If node is a head element, then switch the insertion mode
            to "in body" ("in body"! not "in head"!) and abort these steps.
            (fragment case) */
            } elseif($tag === 'head') {
                $this->mode = self::IN_BODY;
                return;

            /* 13. If node is a body element, then switch the insertion mode to
            "in body" and abort these steps. */
            } elseif($tag === 'body') {
                $this->mode = self::IN_BODY;
                return;

            /* 14. If node is a frameset element, then switch the insertion
            mode to "in frameset" and abort these steps. (fragment case) */
            } elseif($tag === 'frameset') {
                $this->mode = self::IN_FRAMESET;
                return;

            /* 15. If node is an html element, then: if the head element
            pointer is null, switch the insertion mode to "before head",
            otherwise, switch the insertion mode to "after head". In either
            case, abort these steps. (fragment case) */
            } elseif($tag === 'html') {
                $this->mode = ($this->head_pointer === null)
                    ? self::BEFORE_HEAD
                    : self::AFTER_HEAD;
                return;
            }
        }

        /* 16. If last is true, then set the insertion mode to "in body"
        and abort these steps. (fragment case) */
        if($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
|
||||
|
||||
/**
 * Closes the currently open table cell by emitting a synthetic end tag for
 * the first of td / th that elementInScope() reports in table scope.
 */
private function closeCell() {
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    $candidates = array('td', 'th');

    for ($i = 0; $i < 2; $i++) {
        $cell = $candidates[$i];

        if (!$this->elementInScope($cell, true)) {
            continue;
        }

        $endTag = array(
            'name' => $cell,
            'type' => HTML5_Tokenizer::ENDTAG
        );
        $this->emitToken($endTag);

        // Only one cell is ever closed per call.
        return;
    }
}
|
||||
|
||||
/**
 * Processes a token with the rules of a given insertion mode by forwarding
 * both to emitToken().
 *
 * @param mixed $token Token to process.
 * @param mixed $mode  Insertion mode whose rules should be applied.
 * @return mixed Whatever emitToken() returns.
 */
private function processWithRulesFor($token, $mode) {
    /* "using the rules for the m insertion mode", where m is one of these
     * modes, the user agent must use the rules described under the m
     * insertion mode's section, but must leave the insertion mode
     * unchanged unless the rules in m themselves switch the insertion mode
     * to a new value. */
    $result = $this->emitToken($token, $mode);

    return $result;
}
|
||||
|
||||
/**
 * Inserts an element for $token, then switches to the CDATA/RCDATA
 * insertion mode with the CDATA content model, remembering the mode that
 * was active so it can be restored later.
 *
 * @param mixed $token Start tag token handed to insertElement().
 */
private function insertCDATAElement($token) {
    $this->insertElement($token);

    // Capture the mode only after insertion, exactly as it stands now.
    $previousMode = $this->mode;

    $this->content_model = HTML5_Tokenizer::CDATA;
    $this->mode = self::IN_CDATA_RCDATA;
    $this->original_mode = $previousMode;
}
|
||||
|
||||
/**
 * Inserts an element for $token, then switches to the CDATA/RCDATA
 * insertion mode with the RCDATA content model, remembering the mode that
 * was active so it can be restored later.
 *
 * @param mixed $token Start tag token handed to insertElement().
 */
private function insertRCDATAElement($token) {
    $this->insertElement($token);

    // Capture the mode only after insertion, exactly as it stands now.
    $previousMode = $this->mode;

    $this->content_model = HTML5_Tokenizer::RCDATA;
    $this->mode = self::IN_CDATA_RCDATA;
    $this->original_mode = $previousMode;
}
|
||||
|
||||
/**
 * Looks up an attribute value on a token.
 *
 * @param array  $token Token whose optional 'attr' entry is a list of
 *                      arrays with 'name' and 'value' keys.
 * @param string $key   Attribute name to look for.
 * @return mixed The value of the LAST attribute named $key, or false when
 *               the token has no 'attr' entry or no attribute matches.
 */
private function getAttr($token, $key) {
    $value = false;

    if (isset($token['attr'])) {
        // No early exit: a later duplicate overrides an earlier one.
        foreach ($token['attr'] as $pair) {
            if ($pair['name'] === $key) {
                $value = $pair['value'];
            }
        }
    }

    return $value;
}
|
||||
|
||||
/**
 * Returns the current table.
 *
 * @return object The last element on the stack of open elements whose tag
 *                name is 'table', or the first stack entry when there is
 *                none.
 */
private function getCurrentTable() {
    /* The current table is the last table element in the stack of open
     * elements, if there is one. If there is no table element in the stack
     * of open elements (fragment case), then the current table is the
     * first element in the stack of open elements (the html element). */
    $index = count($this->stack);

    // Scan from the top of the stack downwards.
    while (--$index >= 0) {
        $candidate = $this->stack[$index];

        if ($candidate->tagName === 'table') {
            return $candidate;
        }
    }

    return $this->stack[0];
}
|
||||
|
||||
/**
 * Determines the foster parent element for content that is misplaced
 * inside a table.
 *
 * @return object The parent of the last table on the stack when that
 *                parent is an element node; the stack entry just before
 *                that table when the table has no parent or its parent is
 *                not an element; the first stack entry when no table is
 *                open.
 */
private function getFosterParent() {
    /* The foster parent element is the parent element of the last
    table element in the stack of open elements, if there is a
    table element and it has such a parent element. If there is no
    table element in the stack of open elements (innerHTML case),
    then the foster parent element is the first element in the
    stack of open elements (the html element). Otherwise, if there
    is a table element in the stack of open elements, but the last
    table element in the stack of open elements has no parent, or
    its parent node is not an element, then the foster parent
    element is the element before the last table element in the
    stack of open elements. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        if($this->stack[$n]->tagName !== 'table') {
            continue;
        }

        $parent = $this->stack[$n]->parentNode;

        // The parent only qualifies when it is a real element; a document
        // or fragment parent falls through to the previous stack entry.
        if($parent !== null && $parent->nodeType === XML_ELEMENT_NODE) {
            return $parent;
        }

        return $this->stack[$n - 1];
    }

    // No table element is open.
    return $this->stack[0];
}
|
||||
|
||||
/**
 * Foster-parents a node: marks the current table as tainted and inserts
 * $node either immediately before that table or at the end of the foster
 * parent element.
 *
 * @param object $node DOM node to insert.
 */
public function fosterParent($node) {
    $foster_parent = $this->getFosterParent();
    $table = $this->getCurrentTable(); // almost equivalent to last table element, except it can be html

    /* When a node node is to be foster parented, the node node must be
     * inserted into the foster parent element, and the current table must
     * be marked as tainted. (Once the current table has been tainted,
     * whitespace characters are inserted into the foster parent element
     * instead of the current node.) */
    $table->tainted = true;

    /* If the foster parent element is the parent element of the last table
     * element in the stack of open elements, then node must be inserted
     * immediately before the last table element in the stack of open
     * elements in the foster parent element; otherwise, node must be
     * appended to the foster parent element. */
    $table_parent = $table->parentNode;

    // A table without a parent cannot be a child of the foster parent, so
    // it takes the append path instead of calling isSameNode() on null.
    if ($table->tagName === 'table' && $table_parent !== null && $table_parent->isSameNode($foster_parent)) {
        $foster_parent->insertBefore($node, $table);
    } else {
        $foster_parent->appendChild($node);
    }
}
|
||||
|
||||
/**
 * Debugging helper: echoes the tag names on the stack of open elements,
 * bottom first, as a single " -> stack [...]" line.
 */
private function printStack() {
    $tags = array();

    foreach ($this->stack as $openElement) {
        $tags[] = $openElement->tagName;
    }

    $listing = implode(', ', $tags);

    echo " -> stack [", $listing, "]\n";
}
|
||||
|
||||
/**
 * Debugging helper: echoes the list of active formatting elements as a
 * single " -> active formatting [...]" line, showing markers as 'MARKER'.
 * Prints nothing when the list is empty.
 */
private function printActiveFormattingElements() {
    if (!$this->a_formatting) {
        return;
    }

    $labels = array();

    foreach ($this->a_formatting as $entry) {
        $labels[] = ($entry === self::MARKER) ? 'MARKER' : $entry->tagName;
    }

    $listing = implode(', ', $labels);

    echo " -> active formatting [", $listing, "]\n";
}
|
||||
|
||||
/**
 * Reports whether the current table carries a non-empty 'tainted'
 * property.
 *
 * @return bool
 */
public function currentTableIsTainted() {
    $table = $this->getCurrentTable();

    return !empty($table->tainted);
}
|
||||
|
||||
/**
 * Sets up the tree constructor for building a fragment.
 *
 * Always flags the builder as being in fragment mode. When a context tag
 * name is given, it also picks the tokenizer content model for that tag,
 * creates an html root, resets the insertion mode and sets the form
 * pointer.
 *
 * @param string|null $context Tag name of the context element, if any.
 */
public function setupContext($context = null) {
    $this->fragment = true;

    if (!$context) {
        return;
    }

    $context = $this->dom->createElementNS(self::NS_HTML, $context);
    $tag = $context->tagName;

    /* 4.1. Set the HTML parser's tokenization stage's content model
     * flag according to the context element, as follows: */
    if ($tag === 'title' || $tag === 'textarea') {
        $this->content_model = HTML5_Tokenizer::RCDATA;
    } elseif (in_array($tag, array('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes'), true)) {
        $this->content_model = HTML5_Tokenizer::CDATA;
    } elseif ($tag === 'noscript') {
        // XSCRIPT: assuming scripting is enabled
        $this->content_model = HTML5_Tokenizer::CDATA;
    } elseif ($tag === 'plaintext') {
        $this->content_model = HTML5_Tokenizer::PLAINTEXT;
    }

    /* 4.2. Let root be a new html element with no attributes. */
    $root = $this->dom->createElementNS(self::NS_HTML, 'html');
    $this->root = $root;

    /* 4.3 Append the element root to the Document node created above. */
    $this->dom->appendChild($root);

    /* 4.4 Set up the parser's stack of open elements so that it
     * contains just the single element root. */
    $this->stack = array($root);

    /* 4.5 Reset the parser's insertion mode appropriately. */
    $this->resetInsertionMode($context);

    /* 4.6 Set the parser's form element pointer to the nearest node
     * to the context element that is a form element (going straight up
     * the ancestor chain, and including the element itself, if it is a
     * form element), or, if there is no such form element, to null. */
    $ancestor = $context;

    while (true) {
        if ($ancestor->tagName === 'form') {
            $this->form_pointer = $ancestor;
            break;
        }

        $ancestor = $ancestor->parentNode;

        if (!$ancestor) {
            break;
        }
    }
}
|
||||
|
||||
/**
 * Adjusts MathML attributes on a token: restores the mixed-case spelling
 * of 'definitionURL'.
 *
 * @param array $token Token whose optional 'attr' entry is a list of
 *                     arrays with a 'name' key.
 * @return array The token with adjusted attribute names; returned
 *               untouched when it has no attributes.
 */
public function adjustMathMLAttributes($token) {
    if (empty($token['attr'])) {
        return $token;
    }

    // Index-based writes avoid a by-reference foreach and the dangling
    // reference it would leave behind.
    foreach ($token['attr'] as $i => $kp) {
        if ($kp['name'] === 'definitionurl') {
            $token['attr'][$i]['name'] = 'definitionURL';
        }
    }

    return $token;
}
|
||||
|
||||
/**
 * Adjusts SVG attributes on a token: maps all-lowercase attribute names
 * back to their mixed-case SVG spelling.
 *
 * @param array $token Token whose optional 'attr' entry is a list of
 *                     arrays with a 'name' key.
 * @return array The token with adjusted attribute names; returned
 *               untouched when it has no attributes.
 */
public function adjustSVGAttributes($token) {
    static $lookup = array(
        'attributename' => 'attributeName',
        'attributetype' => 'attributeType',
        'basefrequency' => 'baseFrequency',
        'baseprofile' => 'baseProfile',
        'calcmode' => 'calcMode',
        'clippathunits' => 'clipPathUnits',
        'contentscripttype' => 'contentScriptType',
        'contentstyletype' => 'contentStyleType',
        'diffuseconstant' => 'diffuseConstant',
        'edgemode' => 'edgeMode',
        'externalresourcesrequired' => 'externalResourcesRequired',
        'filterres' => 'filterRes',
        'filterunits' => 'filterUnits',
        'glyphref' => 'glyphRef',
        'gradienttransform' => 'gradientTransform',
        'gradientunits' => 'gradientUnits',
        'kernelmatrix' => 'kernelMatrix',
        'kernelunitlength' => 'kernelUnitLength',
        'keypoints' => 'keyPoints',
        'keysplines' => 'keySplines',
        'keytimes' => 'keyTimes',
        'lengthadjust' => 'lengthAdjust',
        'limitingconeangle' => 'limitingConeAngle',
        'markerheight' => 'markerHeight',
        'markerunits' => 'markerUnits',
        'markerwidth' => 'markerWidth',
        'maskcontentunits' => 'maskContentUnits',
        'maskunits' => 'maskUnits',
        'numoctaves' => 'numOctaves',
        'pathlength' => 'pathLength',
        'patterncontentunits' => 'patternContentUnits',
        'patterntransform' => 'patternTransform',
        'patternunits' => 'patternUnits',
        'pointsatx' => 'pointsAtX',
        'pointsaty' => 'pointsAtY',
        'pointsatz' => 'pointsAtZ',
        'preservealpha' => 'preserveAlpha',
        'preserveaspectratio' => 'preserveAspectRatio',
        'primitiveunits' => 'primitiveUnits',
        'refx' => 'refX',
        'refy' => 'refY',
        'repeatcount' => 'repeatCount',
        'repeatdur' => 'repeatDur',
        'requiredextensions' => 'requiredExtensions',
        'requiredfeatures' => 'requiredFeatures',
        'specularconstant' => 'specularConstant',
        'specularexponent' => 'specularExponent',
        'spreadmethod' => 'spreadMethod',
        'startoffset' => 'startOffset',
        'stddeviation' => 'stdDeviation',
        'stitchtiles' => 'stitchTiles',
        'surfacescale' => 'surfaceScale',
        'systemlanguage' => 'systemLanguage',
        'tablevalues' => 'tableValues',
        'targetx' => 'targetX',
        'targety' => 'targetY',
        'textlength' => 'textLength',
        'viewbox' => 'viewBox',
        'viewtarget' => 'viewTarget',
        'xchannelselector' => 'xChannelSelector',
        'ychannelselector' => 'yChannelSelector',
        'zoomandpan' => 'zoomAndPan',
    );

    if (empty($token['attr'])) {
        return $token;
    }

    // Index-based writes avoid a by-reference foreach and the dangling
    // reference it would leave behind.
    foreach ($token['attr'] as $i => $kp) {
        // Only plain string names can be table keys; anything else is
        // left alone.
        if (is_string($kp['name']) && isset($lookup[$kp['name']])) {
            $token['attr'][$i]['name'] = $lookup[$kp['name']];
        }
    }

    return $token;
}
|
||||
|
||||
/**
 * Adjusts foreign attributes on a token: replaces the names of known
 * xlink:, xml: and xmlns attributes with a three-element array of
 * (prefix, local name, namespace URI).
 *
 * @param array $token Token whose optional 'attr' entry is a list of
 *                     arrays with a 'name' key.
 * @return array The token with adjusted attribute names; returned
 *               untouched when it has no attributes.
 */
public function adjustForeignAttributes($token) {
    static $lookup = array(
        'xlink:actuate' => array('xlink', 'actuate', self::NS_XLINK),
        'xlink:arcrole' => array('xlink', 'arcrole', self::NS_XLINK),
        'xlink:href' => array('xlink', 'href', self::NS_XLINK),
        'xlink:role' => array('xlink', 'role', self::NS_XLINK),
        'xlink:show' => array('xlink', 'show', self::NS_XLINK),
        'xlink:title' => array('xlink', 'title', self::NS_XLINK),
        'xlink:type' => array('xlink', 'type', self::NS_XLINK),
        'xml:base' => array('xml', 'base', self::NS_XML),
        'xml:lang' => array('xml', 'lang', self::NS_XML),
        'xml:space' => array('xml', 'space', self::NS_XML),
        'xmlns' => array(null, 'xmlns', self::NS_XMLNS),
        'xmlns:xlink' => array('xmlns', 'xlink', self::NS_XMLNS),
    );

    if (empty($token['attr'])) {
        return $token;
    }

    // Index-based writes avoid a by-reference foreach and the dangling
    // reference it would leave behind.
    foreach ($token['attr'] as $i => $kp) {
        // Names already converted to an array (e.g. by an earlier call)
        // are skipped rather than used as an array offset.
        if (is_string($kp['name']) && isset($lookup[$kp['name']])) {
            $token['attr'][$i]['name'] = $lookup[$kp['name']];
        }
    }

    return $token;
}
|
||||
|
||||
/**
 * Creates an element in the given namespace from a token, copies the
 * token's attributes onto it, appends it via appendToRealParent() and
 * pushes it onto the stack of open elements.
 *
 * Attribute names may be plain strings or the (prefix, local name,
 * namespace URI) arrays produced by adjustForeignAttributes().
 *
 * @param array  $token        Token with a 'name' and optional 'attr' list.
 * @param string $namespaceURI Namespace to create the element in.
 */
public function insertForeignElement($token, $namespaceURI) {
    $el = $this->dom->createElementNS($namespaceURI, $token['name']);

    $pairs = empty($token['attr']) ? array() : $token['attr'];

    foreach ($pairs as $pair) {
        $name = $pair['name'];
        $ns = self::NS_HTML;

        if (is_array($name)) {
            $ns = $name[2];
            $name = $name[1];
        }

        // Attributes already present on the element are left as they are.
        if ($el->hasAttributeNS($ns, $name)) {
            continue;
        }

        // XSKETCHY: work around godawful libxml bug
        if ($ns === self::NS_XLINK) {
            $el->setAttribute('xlink:'.$name, $pair['value']);
        } elseif ($ns === self::NS_HTML) {
            // Another godawful libxml bug
            $el->setAttribute($name, $pair['value']);
        } else {
            $el->setAttributeNS($ns, $name, $pair['value']);
        }
    }

    $this->appendToRealParent($el);
    $this->stack[] = $el;

    // XERROR: see below
    /* If the newly created element has an xmlns attribute in the XMLNS
     * namespace whose value is not exactly the same as the element's
     * namespace, that is a parse error. Similarly, if the newly created
     * element has an xmlns:xlink attribute in the XMLNS namespace whose
     * value is not the XLink Namespace, that is a parse error. */
}
|
||||
|
||||
/**
 * Normalizes the document and returns the parse result.
 *
 * @return mixed The document itself when not in fragment mode; otherwise
 *               the child nodes of the fragment root, or of the document
 *               when no root was set.
 */
public function save() {
    $this->dom->normalize();

    if (!$this->fragment) {
        return $this->dom;
    }

    $container = $this->root ? $this->root : $this->dom;

    return $container->childNodes;
}
|
||||
}
|
||||
|
||||
1
library/HTML5/named-character-references.ser
Normal file
1
library/HTML5/named-character-references.ser
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue