2307 lines
		
	
	
	
		
			106 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			2307 lines
		
	
	
	
		
			106 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| /*
 | |
| 
 | |
| Copyright 2007 Jeroen van der Meer <http://jero.net/>
 | |
| Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
 | |
| Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
 | |
| 
 | |
| Permission is hereby granted, free of charge, to any person obtaining a
 | |
| copy of this software and associated documentation files (the
 | |
| "Software"), to deal in the Software without restriction, including
 | |
| without limitation the rights to use, copy, modify, merge, publish,
 | |
| distribute, sublicense, and/or sell copies of the Software, and to
 | |
| permit persons to whom the Software is furnished to do so, subject to
 | |
| the following conditions:
 | |
| 
 | |
| The above copyright notice and this permission notice shall be included
 | |
| in all copies or substantial portions of the Software.
 | |
| 
 | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 | |
| OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | |
| IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | |
| CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | |
| TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | |
| SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | |
| 
 | |
| */
 | |
| 
 | |
| // Some conventions:
 | |
| // /* */ indicates verbatim text from the HTML 5 specification
 | |
| // // indicates regular comments
 | |
| 
 | |
| // all flags are in hyphenated form
 | |
| 
 | |
| class HTML5_Tokenizer {
 | |
|     /**
 | |
|      * Points to an InputStream object.
 | |
|      */
 | |
|     protected $stream;
 | |
| 
 | |
|     /**
 | |
|      * Tree builder that the tokenizer emits token to.
 | |
|      */
 | |
|     private $tree;
 | |
| 
 | |
|     /**
 | |
|      * Current content model we are parsing as.
 | |
|      */
 | |
|     protected $content_model;
 | |
| 
 | |
|     /**
 | |
|      * Current token that is being built, but not yet emitted. Also
 | |
|      * is the last token emitted, if applicable.
 | |
|      */
 | |
|     protected $token;
 | |
| 
 | |
|     // These are constants describing the content model
 | |
|     const PCDATA    = 0;
 | |
|     const RCDATA    = 1;
 | |
|     const CDATA     = 2;
 | |
|     const PLAINTEXT = 3;
 | |
| 
 | |
|     // These are constants describing tokens
 | |
|     // XXX should probably be moved somewhere else, probably the
 | |
|     // HTML5 class.
 | |
|     const DOCTYPE        = 0;
 | |
|     const STARTTAG       = 1;
 | |
|     const ENDTAG         = 2;
 | |
|     const COMMENT        = 3;
 | |
|     const CHARACTER      = 4;
 | |
|     const SPACECHARACTER = 5;
 | |
|     const EOF            = 6;
 | |
|     const PARSEERROR     = 7;
 | |
| 
 | |
|     // These are constants representing bunches of characters.
 | |
|     const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
 | |
|     const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
 | |
|     const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
 | |
|     const DIGIT       = '0123456789';
 | |
|     const HEX         = '0123456789ABCDEFabcdef';
 | |
|     const WHITESPACE  = "\t\n\x0c ";
 | |
| 
 | |
|     /**
 | |
|      * @param $data Data to parse
 | |
|      */
 | |
|     public function __construct($data, $builder = null) {
 | |
|         $this->stream = new HTML5_InputStream($data);
 | |
|         if (!$builder) $this->tree = new HTML5_TreeBuilder;
 | |
|         $this->content_model = self::PCDATA;
 | |
|     }
 | |
| 
 | |
|     public function parseFragment($context = null) {
 | |
|         $this->tree->setupContext($context);
 | |
|         if ($this->tree->content_model) {
 | |
|             $this->content_model = $this->tree->content_model;
 | |
|             $this->tree->content_model = null;
 | |
|         }
 | |
|         $this->parse();
 | |
|     }
 | |
| 
 | |
|     // XXX maybe convert this into an iterator? regardless, this function
 | |
|     // and the save function should go into a Parser facade of some sort
 | |
|     /**
 | |
|      * Performs the actual parsing of the document.
 | |
|      */
 | |
|     public function parse() {
 | |
|         // Current state
 | |
|         $state = 'data';
 | |
|         // This is used to avoid having to have look-behind in the data state.
 | |
|         $lastFourChars = '';
 | |
|         /**
 | |
|          * Escape flag as specified by the HTML5 specification: "used to
 | |
|          * control the behavior of the tokeniser. It is either true or
 | |
|          * false, and initially must be set to the false state."
 | |
|          */
 | |
|         $escape = false;
 | |
|         //echo "\n\n";
 | |
|         while($state !== null) {
 | |
|             
 | |
|             /*echo $state . ' ';
 | |
|             switch ($this->content_model) {
 | |
|                 case self::PCDATA: echo 'PCDATA'; break;
 | |
|                 case self::RCDATA: echo 'RCDATA'; break;
 | |
|                 case self::CDATA: echo 'CDATA'; break;
 | |
|                 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
 | |
|             }
 | |
|             if ($escape) echo " escape";
 | |
|             echo "\n";*/
 | |
|             
 | |
|             switch($state) {
 | |
|                 case 'data':
 | |
| 
 | |
|                     /* Consume the next input character */
 | |
|                     $char = $this->stream->char();
 | |
|                     $lastFourChars .= $char;
 | |
|                     if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 | |
| 
 | |
|                     // see below for meaning
 | |
|                     $hyp_cond = 
 | |
|                         !$escape &&
 | |
|                         (
 | |
|                             $this->content_model === self::RCDATA ||
 | |
|                             $this->content_model === self::CDATA
 | |
|                         );
 | |
|                     $amp_cond =
 | |
|                         !$escape &&
 | |
|                         (
 | |
|                             $this->content_model === self::PCDATA ||
 | |
|                             $this->content_model === self::RCDATA
 | |
|                         );
 | |
|                     $lt_cond =
 | |
|                         $this->content_model === self::PCDATA ||
 | |
|                         (
 | |
|                             (
 | |
|                                 $this->content_model === self::RCDATA ||
 | |
|                                 $this->content_model === self::CDATA
 | |
|                              ) &&
 | |
|                              !$escape
 | |
|                         );
 | |
|                     $gt_cond = 
 | |
|                         $escape &&
 | |
|                         (
 | |
|                             $this->content_model === self::RCDATA ||
 | |
|                             $this->content_model === self::CDATA
 | |
|                         );
 | |
| 
 | |
|                     if($char === '&' && $amp_cond) {
 | |
|                         /* U+0026 AMPERSAND (&)
 | |
|                         When the content model flag is set to one of the PCDATA or RCDATA
 | |
|                         states and the escape flag is false: switch to the
 | |
|                         character reference data state. Otherwise: treat it as per
 | |
|                         the "anything else" entry below. */
 | |
|                         $state = 'characterReferenceData';
 | |
| 
 | |
|                     } elseif(
 | |
|                         $char === '-' &&
 | |
|                         $hyp_cond &&
 | |
|                         $lastFourChars === '<!--'
 | |
|                     ) {
 | |
|                         /*
 | |
|                         U+002D HYPHEN-MINUS (-)
 | |
|                         If the content model flag is set to either the RCDATA state or
 | |
|                         the CDATA state, and the escape flag is false, and there are at
 | |
|                         least three characters before this one in the input stream, and the
 | |
|                         last four characters in the input stream, including this one, are
 | |
|                         U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
 | |
|                         and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 | |
|                         $escape = true;
 | |
| 
 | |
|                         /* In any case, emit the input character as a character token. Stay
 | |
|                         in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::CHARACTER,
 | |
|                             'data' => '-'
 | |
|                         ));
 | |
|                         // We do the "any case" part as part of "anything else".
 | |
| 
 | |
|                     /* U+003C LESS-THAN SIGN (<) */
 | |
|                     } elseif($char === '<' && $lt_cond) {
 | |
|                         /* When the content model flag is set to the PCDATA state: switch
 | |
|                         to the tag open state.
 | |
| 
 | |
|                         When the content model flag is set to either the RCDATA state or
 | |
|                         the CDATA state and the escape flag is false: switch to the tag
 | |
|                         open state.
 | |
| 
 | |
|                         Otherwise: treat it as per the "anything else" entry below. */
 | |
|                         $state = 'tagOpen';
 | |
| 
 | |
|                     /* U+003E GREATER-THAN SIGN (>) */
 | |
|                     } elseif(
 | |
|                         $char === '>' &&
 | |
|                         $gt_cond &&
 | |
|                         substr($lastFourChars, 1) === '-->'
 | |
|                     ) {
 | |
|                         /* If the content model flag is set to either the RCDATA state or
 | |
|                         the CDATA state, and the escape flag is true, and the last three
 | |
|                         characters in the input stream including this one are U+002D
 | |
|                         HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
 | |
|                         set the escape flag to false. */
 | |
|                         $escape = false;
 | |
| 
 | |
|                         /* In any case, emit the input character as a character token.
 | |
|                         Stay in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::CHARACTER,
 | |
|                             'data' => '>'
 | |
|                         ));
 | |
|                         // We do the "any case" part as part of "anything else".
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Emit an end-of-file token. */
 | |
|                         $state = null;
 | |
|                         $this->tree->emitToken(array(
 | |
|                             'type' => self::EOF
 | |
|                         ));
 | |
|                     
 | |
|                     } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         // Directly after emitting a token you switch back to the "data
 | |
|                         // state". At that point spaceCharacters are important so they are
 | |
|                         // emitted separately.
 | |
|                         $chars = $this->stream->charsWhile(self::WHITESPACE);
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::SPACECHARACTER,
 | |
|                             'data' => $char . $chars
 | |
|                         ));
 | |
|                         $lastFourChars .= $chars;
 | |
|                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         THIS IS AN OPTIMIZATION: Get as many character that
 | |
|                         otherwise would also be treated as a character token and emit it
 | |
|                         as a single character token. Stay in the data state. */
 | |
|                         
 | |
|                         $mask = '';
 | |
|                         if ($hyp_cond) $mask .= '-';
 | |
|                         if ($amp_cond) $mask .= '&';
 | |
|                         if ($lt_cond)  $mask .= '<';
 | |
|                         if ($gt_cond)  $mask .= '>';
 | |
| 
 | |
|                         if ($mask === '') {
 | |
|                             $chars = $this->stream->remainingChars();
 | |
|                         } else {
 | |
|                             $chars = $this->stream->charsUntil($mask);
 | |
|                         }
 | |
| 
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::CHARACTER,
 | |
|                             'data' => $char . $chars
 | |
|                         ));
 | |
| 
 | |
|                         $lastFourChars .= $chars;
 | |
|                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 | |
| 
 | |
|                         $state = 'data';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'characterReferenceData':
 | |
|                     /* (This cannot happen if the content model flag
 | |
|                     is set to the CDATA state.) */
 | |
| 
 | |
|                     /* Attempt to consume a character reference, with no
 | |
|                     additional allowed character. */
 | |
|                     $entity = $this->consumeCharacterReference();
 | |
| 
 | |
|                     /* If nothing is returned, emit a U+0026 AMPERSAND
 | |
|                     character token. Otherwise, emit the character token that
 | |
|                     was returned. */
 | |
|                     // This is all done when consuming the character reference.
 | |
|                     $this->emitToken(array(
 | |
|                         'type' => self::CHARACTER,
 | |
|                         'data' => $entity
 | |
|                     ));
 | |
| 
 | |
|                     /* Finally, switch to the data state. */
 | |
|                     $state = 'data';
 | |
|                 break;
 | |
| 
 | |
|                 case 'tagOpen':
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     switch($this->content_model) {
 | |
|                         case self::RCDATA:
 | |
|                         case self::CDATA:
 | |
|                             /* Consume the next input character. If it is a
 | |
|                             U+002F SOLIDUS (/) character, switch to the close
 | |
|                             tag open state. Otherwise, emit a U+003C LESS-THAN
 | |
|                             SIGN character token and reconsume the current input
 | |
|                             character in the data state. */
 | |
|                             // We consumed above.
 | |
| 
 | |
|                             if($char === '/') {
 | |
|                                 $state = 'closeTagOpen';
 | |
| 
 | |
|                             } else {
 | |
|                                 $this->emitToken(array(
 | |
|                                     'type' => self::CHARACTER,
 | |
|                                     'data' => '<'
 | |
|                                 ));
 | |
| 
 | |
|                                 $this->stream->unget();
 | |
| 
 | |
|                                 $state = 'data';
 | |
|                             }
 | |
|                         break;
 | |
| 
 | |
|                         case self::PCDATA:
 | |
|                             /* If the content model flag is set to the PCDATA state
 | |
|                             Consume the next input character: */
 | |
|                             // We consumed above.
 | |
| 
 | |
|                             if($char === '!') {
 | |
|                                 /* U+0021 EXCLAMATION MARK (!)
 | |
|                                 Switch to the markup declaration open state. */
 | |
|                                 $state = 'markupDeclarationOpen';
 | |
| 
 | |
|                             } elseif($char === '/') {
 | |
|                                 /* U+002F SOLIDUS (/)
 | |
|                                 Switch to the close tag open state. */
 | |
|                                 $state = 'closeTagOpen';
 | |
| 
 | |
|                             } elseif('A' <= $char && $char <= 'Z') {
 | |
|                                 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 | |
|                                 Create a new start tag token, set its tag name to the lowercase
 | |
|                                 version of the input character (add 0x0020 to the character's code
 | |
|                                 point), then switch to the tag name state. (Don't emit the token
 | |
|                                 yet; further details will be filled in before it is emitted.) */
 | |
|                                 $this->token = array(
 | |
|                                     'name'  => strtolower($char),
 | |
|                                     'type'  => self::STARTTAG,
 | |
|                                     'attr'  => array()
 | |
|                                 );
 | |
| 
 | |
|                                 $state = 'tagName';
 | |
| 
 | |
|                             } elseif('a' <= $char && $char <= 'z') {
 | |
|                                 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 | |
|                                 Create a new start tag token, set its tag name to the input
 | |
|                                 character, then switch to the tag name state. (Don't emit
 | |
|                                 the token yet; further details will be filled in before it
 | |
|                                 is emitted.) */
 | |
|                                 $this->token = array(
 | |
|                                     'name'  => $char,
 | |
|                                     'type'  => self::STARTTAG,
 | |
|                                     'attr'  => array()
 | |
|                                 );
 | |
| 
 | |
|                                 $state = 'tagName';
 | |
| 
 | |
|                             } elseif($char === '>') {
 | |
|                                 /* U+003E GREATER-THAN SIGN (>)
 | |
|                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
 | |
|                                 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 | |
|                                 $this->emitToken(array(
 | |
|                                     'type' => self::PARSEERROR,
 | |
|                                     'data' => 'expected-tag-name-but-got-right-bracket'
 | |
|                                 ));
 | |
|                                 $this->emitToken(array(
 | |
|                                     'type' => self::CHARACTER,
 | |
|                                     'data' => '<>'
 | |
|                                 ));
 | |
| 
 | |
|                                 $state = 'data';
 | |
| 
 | |
|                             } elseif($char === '?') {
 | |
|                                 /* U+003F QUESTION MARK (?)
 | |
|                                 Parse error. Switch to the bogus comment state. */
 | |
|                                 $this->emitToken(array(
 | |
|                                     'type' => self::PARSEERROR,
 | |
|                                     'data' => 'expected-tag-name-but-got-question-mark'
 | |
|                                 ));
 | |
|                                 $this->token = array(
 | |
|                                     'data' => '?',
 | |
|                                     'type' => self::COMMENT
 | |
|                                 );
 | |
|                                 $state = 'bogusComment';
 | |
| 
 | |
|                             } else {
 | |
|                                 /* Anything else
 | |
|                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and
 | |
|                                 reconsume the current input character in the data state. */
 | |
|                                 $this->emitToken(array(
 | |
|                                     'type' => self::PARSEERROR,
 | |
|                                     'data' => 'expected-tag-name'
 | |
|                                 ));
 | |
|                                 $this->emitToken(array(
 | |
|                                     'type' => self::CHARACTER,
 | |
|                                     'data' => '<'
 | |
|                                 ));
 | |
| 
 | |
|                                 $state = 'data';
 | |
|                                 $this->stream->unget();
 | |
|                             }
 | |
|                         break;
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'closeTagOpen':
 | |
|                     if (
 | |
|                         $this->content_model === self::RCDATA ||
 | |
|                         $this->content_model === self::CDATA
 | |
|                     ) {
 | |
|                         /* If the content model flag is set to the RCDATA or CDATA
 | |
|                         states... */
 | |
|                         $name = strtolower($this->stream->charsWhile(self::ALPHA));
 | |
|                         $following = $this->stream->char();
 | |
|                         $this->stream->unget();
 | |
|                         if (
 | |
|                             !$this->token ||
 | |
|                             $this->token['name'] !== $name ||
 | |
|                             $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
 | |
|                         ) {
 | |
|                             /* if no start tag token has ever been emitted by this instance
 | |
|                             of the tokenizer (fragment case), or, if the next few
 | |
|                             characters do not match the tag name of the last start tag
 | |
|                             token emitted (compared in an ASCII case-insensitive manner),
 | |
|                             or if they do but they are not immediately followed by one of
 | |
|                             the following characters:
 | |
| 
 | |
|                                 * U+0009 CHARACTER TABULATION
 | |
|                                 * U+000A LINE FEED (LF)
 | |
|                                 * U+000C FORM FEED (FF)
 | |
|                                 * U+0020 SPACE
 | |
|                                 * U+003E GREATER-THAN SIGN (>)
 | |
|                                 * U+002F SOLIDUS (/)
 | |
|                                 * EOF
 | |
| 
 | |
|                             ...then emit a U+003C LESS-THAN SIGN character token, a
 | |
|                             U+002F SOLIDUS character token, and switch to the data
 | |
|                             state to process the next input character. */
 | |
|                             // XXX: Probably ought to replace in_array with $following === x ||...
 | |
| 
 | |
|                             // We also need to emit $name now we've consumed that, as we
 | |
|                             // know it'll just be emitted as a character token.
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::CHARACTER,
 | |
|                                 'data' => '</' . $name
 | |
|                             ));
 | |
| 
 | |
|                             $state = 'data';
 | |
|                         } else {
 | |
|                             // This matches what would happen if we actually did the
 | |
|                             // otherwise below (but we can't because we've consumed too
 | |
|                             // much).
 | |
| 
 | |
|                             // Start the end tag token with the name we already have.
 | |
|                             $this->token = array(
 | |
|                                 'name'  => $name,
 | |
|                                 'type'  => self::ENDTAG
 | |
|                             );
 | |
| 
 | |
|                             // Change to tag name state.
 | |
|                             $state = 'tagName';
 | |
|                         }
 | |
|                     } elseif ($this->content_model === self::PCDATA) {
 | |
|                         /* Otherwise, if the content model flag is set to the PCDATA
 | |
|                         state [...]: */
 | |
|                         $char = $this->stream->char();
 | |
| 
 | |
|                         if ('A' <= $char && $char <= 'Z') {
 | |
|                             /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 | |
|                             Create a new end tag token, set its tag name to the lowercase version
 | |
|                             of the input character (add 0x0020 to the character's code point), then
 | |
|                             switch to the tag name state. (Don't emit the token yet; further details
 | |
|                             will be filled in before it is emitted.) */
 | |
|                             $this->token = array(
 | |
|                                 'name'  => strtolower($char),
 | |
|                                 'type'  => self::ENDTAG
 | |
|                             );
 | |
| 
 | |
|                             $state = 'tagName';
 | |
| 
 | |
|                         } elseif ('a' <= $char && $char <= 'z') {
 | |
|                             /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 | |
|                             Create a new end tag token, set its tag name to the
 | |
|                             input character, then switch to the tag name state.
 | |
|                             (Don't emit the token yet; further details will be
 | |
|                             filled in before it is emitted.) */
 | |
|                             $this->token = array(
 | |
|                                 'name'  => $char,
 | |
|                                 'type'  => self::ENDTAG
 | |
|                             );
 | |
| 
 | |
|                             $state = 'tagName';
 | |
| 
 | |
|                         } elseif($char === '>') {
 | |
|                             /* U+003E GREATER-THAN SIGN (>)
 | |
|                             Parse error. Switch to the data state. */
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'expected-closing-tag-but-got-right-bracket'
 | |
|                             ));
 | |
|                             $state = 'data';
 | |
| 
 | |
|                         } elseif($char === false) {
 | |
|                             /* EOF
 | |
|                             Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
 | |
|                             SOLIDUS character token. Reconsume the EOF character in the data state. */
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'expected-closing-tag-but-got-eof'
 | |
|                             ));
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::CHARACTER,
 | |
|                                 'data' => '</'
 | |
|                             ));
 | |
| 
 | |
|                             $this->stream->unget();
 | |
|                             $state = 'data';
 | |
| 
 | |
|                         } else {
 | |
|                             /* Parse error. Switch to the bogus comment state. */
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'expected-closing-tag-but-got-char'
 | |
|                             ));
 | |
|                             $this->token = array(
 | |
|                                 'data' => $char,
 | |
|                                 'type' => self::COMMENT
 | |
|                             );
 | |
|                             $state = 'bogusComment';
 | |
|                         }
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'tagName':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                         U+000A LINE FEED (LF)
 | |
|                         U+000C FORM FEED (FF)
 | |
|                         U+0020 SPACE
 | |
|                         Switch to the before attribute name state. */
 | |
|                         $state = 'beforeAttributeName';
 | |
| 
 | |
|                     } elseif($char === '/') {
 | |
|                         /* U+002F SOLIDUS (/)
 | |
|                         Switch to the self-closing start tag state. */
 | |
|                         $state = 'selfClosingStartTag';
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif('A' <= $char && $char <= 'Z') {
 | |
|                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 | |
|                         Append the lowercase version of the current input
 | |
|                         character (add 0x0020 to the character's code point) to
 | |
|                         the current tag token's tag name. Stay in the tag name state. */
 | |
|                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 | |
| 
 | |
|                         $this->token['name'] .= strtolower($char . $chars);
 | |
|                         $state = 'tagName';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-tag-name'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current tag token's tag name.
 | |
|                         Stay in the tag name state. */
 | |
|                         $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
 | |
| 
 | |
|                         $this->token['name'] .= $char . $chars;
 | |
|                         $state = 'tagName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'beforeAttributeName':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     // this conditional is optimized, check bottom
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                         U+000A LINE FEED (LF)
 | |
|                         U+000C FORM FEED (FF)
 | |
|                         U+0020 SPACE
 | |
|                         Stay in the before attribute name state. */
 | |
|                         $state = 'beforeAttributeName';
 | |
| 
 | |
|                     } elseif($char === '/') {
 | |
|                         /* U+002F SOLIDUS (/)
 | |
|                         Switch to the self-closing start tag state. */
 | |
|                         $state = 'selfClosingStartTag';
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif('A' <= $char && $char <= 'Z') {
 | |
|                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 | |
|                         Start a new attribute in the current tag token. Set that
 | |
|                         attribute's name to the lowercase version of the current
 | |
|                         input character (add 0x0020 to the character's code
 | |
|                         point), and its value to the empty string. Switch to the
 | |
|                         attribute name state.*/
 | |
|                         $this->token['attr'][] = array(
 | |
|                             'name'  => strtolower($char),
 | |
|                             'value' => ''
 | |
|                         );
 | |
| 
 | |
|                         $state = 'attributeName';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-attribute-name-but-got-eof'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                            U+0027 APOSTROPHE (')
 | |
|                            U+003D EQUALS SIGN (=)
 | |
|                         Parse error. Treat it as per the "anything else" entry
 | |
|                         below. */
 | |
|                         if($char === '"' || $char === "'" || $char === '=') {
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'invalid-character-in-attribute-name'
 | |
|                             ));
 | |
|                         }
 | |
| 
 | |
|                         /* Anything else
 | |
|                         Start a new attribute in the current tag token. Set that attribute's
 | |
|                         name to the current input character, and its value to the empty string.
 | |
|                         Switch to the attribute name state. */
 | |
|                         $this->token['attr'][] = array(
 | |
|                             'name'  => $char,
 | |
|                             'value' => ''
 | |
|                         );
 | |
| 
 | |
|                         $state = 'attributeName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'attributeName':
 | |
|                     // Consume the next input character:
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     // this conditional is optimized, check bottom
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                         U+000A LINE FEED (LF)
 | |
|                         U+000C FORM FEED (FF)
 | |
|                         U+0020 SPACE
 | |
|                         Switch to the after attribute name state. */
 | |
|                         $state = 'afterAttributeName';
 | |
| 
 | |
|                     } elseif($char === '/') {
 | |
|                         /* U+002F SOLIDUS (/)
 | |
|                         Switch to the self-closing start tag state. */
 | |
|                         $state = 'selfClosingStartTag';
 | |
| 
 | |
|                     } elseif($char === '=') {
 | |
|                         /* U+003D EQUALS SIGN (=)
 | |
|                         Switch to the before attribute value state. */
 | |
|                         $state = 'beforeAttributeValue';
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif('A' <= $char && $char <= 'Z') {
 | |
|                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 | |
|                         Append the lowercase version of the current input
 | |
|                         character (add 0x0020 to the character's code point) to
 | |
|                         the current attribute's name. Stay in the attribute name
 | |
|                         state. */
 | |
|                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 | |
| 
 | |
|                         $last = count($this->token['attr']) - 1;
 | |
|                         $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
 | |
| 
 | |
|                         $state = 'attributeName';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-attribute-name'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                            U+0027 APOSTROPHE (')
 | |
|                         Parse error. Treat it as per the "anything else"
 | |
|                         entry below. */
 | |
|                         if($char === '"' || $char === "'") {
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'invalid-character-in-attribute-name'
 | |
|                             ));
 | |
|                         }
 | |
| 
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current attribute's name.
 | |
|                         Stay in the attribute name state. */
 | |
|                         $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
 | |
| 
 | |
|                         $last = count($this->token['attr']) - 1;
 | |
|                         $this->token['attr'][$last]['name'] .= $char . $chars;
 | |
| 
 | |
|                         $state = 'attributeName';
 | |
|                     }
 | |
| 
 | |
|                     /* When the user agent leaves the attribute name state
 | |
|                     (and before emitting the tag token, if appropriate), the
 | |
|                     complete attribute's name must be compared to the other
 | |
|                     attributes on the same token; if there is already an
 | |
|                     attribute on the token with the exact same name, then this
 | |
|                     is a parse error and the new attribute must be dropped, along
 | |
|                     with the value that gets associated with it (if any). */
 | |
|                     // this might be implemented in the emitToken method
 | |
|                 break;
 | |
| 
 | |
|                 case 'afterAttributeName':
 | |
|                     // Consume the next input character:
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     // this is an optimized conditional, check the bottom
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                         U+000A LINE FEED (LF)
 | |
|                         U+000C FORM FEED (FF)
 | |
|                         U+0020 SPACE
 | |
|                         Stay in the after attribute name state. */
 | |
|                         $state = 'afterAttributeName';
 | |
| 
 | |
|                     } elseif($char === '/') {
 | |
|                         /* U+002F SOLIDUS (/)
 | |
|                         Switch to the self-closing start tag state. */
 | |
|                         $state = 'selfClosingStartTag';
 | |
| 
 | |
|                     } elseif($char === '=') {
 | |
|                         /* U+003D EQUALS SIGN (=)
 | |
|                         Switch to the before attribute value state. */
 | |
|                         $state = 'beforeAttributeValue';
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif('A' <= $char && $char <= 'Z') {
 | |
|                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 | |
|                         Start a new attribute in the current tag token. Set that
 | |
|                         attribute's name to the lowercase version of the current
 | |
|                         input character (add 0x0020 to the character's code
 | |
|                         point), and its value to the empty string. Switch to the
 | |
|                         attribute name state. */
 | |
|                         $this->token['attr'][] = array(
 | |
|                             'name'  => strtolower($char),
 | |
|                             'value' => ''
 | |
|                         );
 | |
| 
 | |
|                         $state = 'attributeName';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-end-of-tag-but-got-eof'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                            U+0027 APOSTROPHE (')
 | |
|                         Parse error. Treat it as per the "anything else"
 | |
|                         entry below. */
 | |
|                         if($char === '"' || $char === "'") {
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'invalid-character-after-attribute-name'
 | |
|                             ));
 | |
|                         }
 | |
| 
 | |
|                         /* Anything else
 | |
|                         Start a new attribute in the current tag token. Set that attribute's
 | |
|                         name to the current input character, and its value to the empty string.
 | |
|                         Switch to the attribute name state. */
 | |
|                         $this->token['attr'][] = array(
 | |
|                             'name'  => $char,
 | |
|                             'value' => ''
 | |
|                         );
 | |
| 
 | |
|                         $state = 'attributeName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'beforeAttributeValue':
 | |
|                     // Consume the next input character:
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     // this is an optimized conditional
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                         U+000A LINE FEED (LF)
 | |
|                         U+000C FORM FEED (FF)
 | |
|                         U+0020 SPACE
 | |
|                         Stay in the before attribute value state. */
 | |
|                         $state = 'beforeAttributeValue';
 | |
| 
 | |
|                     } elseif($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Switch to the attribute value (double-quoted) state. */
 | |
|                         $state = 'attributeValueDoubleQuoted';
 | |
| 
 | |
|                     } elseif($char === '&') {
 | |
|                         /* U+0026 AMPERSAND (&)
 | |
|                         Switch to the attribute value (unquoted) state and reconsume
 | |
|                         this input character. */
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'attributeValueUnquoted';
 | |
| 
 | |
|                     } elseif($char === '\'') {
 | |
|                         /* U+0027 APOSTROPHE (')
 | |
|                         Switch to the attribute value (single-quoted) state. */
 | |
|                         $state = 'attributeValueSingleQuoted';
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-attribute-value-but-got-right-bracket'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume
 | |
|                         the character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-attribute-value-but-got-eof'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* U+003D EQUALS SIGN (=)
 | |
|                         Parse error. Treat it as per the "anything else" entry below. */
 | |
|                         if($char === '=') {
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'equals-in-unquoted-attribute-value'
 | |
|                             ));
 | |
|                         }
 | |
| 
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current attribute's value.
 | |
|                         Switch to the attribute value (unquoted) state. */
 | |
|                         $last = count($this->token['attr']) - 1;
 | |
|                         $this->token['attr'][$last]['value'] .= $char;
 | |
| 
 | |
|                         $state = 'attributeValueUnquoted';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'attributeValueDoubleQuoted':
 | |
|                     // Consume the next input character:
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Switch to the after attribute value (quoted) state. */
 | |
|                         $state = 'afterAttributeValueQuoted';
 | |
| 
 | |
|                     } elseif($char === '&') {
 | |
|                         /* U+0026 AMPERSAND (&)
 | |
|                         Switch to the character reference in attribute value
 | |
|                         state, with the additional allowed character
 | |
|                         being U+0022 QUOTATION MARK ("). */
 | |
|                         $this->characterReferenceInAttributeValue('"');
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the character
 | |
|                         in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-attribute-value-double-quote'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current attribute's value.
 | |
|                         Stay in the attribute value (double-quoted) state. */
 | |
|                         $chars = $this->stream->charsUntil('"&');
 | |
| 
 | |
|                         $last = count($this->token['attr']) - 1;
 | |
|                         $this->token['attr'][$last]['value'] .= $char . $chars;
 | |
| 
 | |
|                         $state = 'attributeValueDoubleQuoted';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'attributeValueSingleQuoted':
 | |
|                     // Consume the next input character:
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "'") {
 | |
|                         /* U+0022 QUOTATION MARK (')
 | |
|                         Switch to the after attribute value state. */
 | |
|                         $state = 'afterAttributeValueQuoted';
 | |
| 
 | |
|                     } elseif($char === '&') {
 | |
|                         /* U+0026 AMPERSAND (&)
 | |
|                         Switch to the entity in attribute value state. */
 | |
|                         $this->characterReferenceInAttributeValue("'");
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the character
 | |
|                         in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-attribute-value-single-quote'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current attribute's value.
 | |
|                         Stay in the attribute value (single-quoted) state. */
 | |
|                         $chars = $this->stream->charsUntil("'&");
 | |
| 
 | |
|                         $last = count($this->token['attr']) - 1;
 | |
|                         $this->token['attr'][$last]['value'] .= $char . $chars;
 | |
| 
 | |
|                         $state = 'attributeValueSingleQuoted';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'attributeValueUnquoted':
 | |
|                     // Consume the next input character:
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                         U+000A LINE FEED (LF)
 | |
|                         U+000C FORM FEED (FF)
 | |
|                         U+0020 SPACE
 | |
|                         Switch to the before attribute name state. */
 | |
|                         $state = 'beforeAttributeName';
 | |
| 
 | |
|                     } elseif($char === '&') {
 | |
|                         /* U+0026 AMPERSAND (&)
 | |
|                         Switch to the entity in attribute value state. */
 | |
|                         $this->characterReferenceInAttributeValue();
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume
 | |
|                         the character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-attribute-value-no-quotes'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                            U+0027 APOSTROPHE (')
 | |
|                            U+003D EQUALS SIGN (=)
 | |
|                         Parse error. Treat it as per the "anything else"
 | |
|                         entry below. */
 | |
|                         if($char === '"' || $char === "'" || $char === '=') {
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'unexpected-character-in-unquoted-attribute-value'
 | |
|                             ));
 | |
|                         }
 | |
| 
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current attribute's value.
 | |
|                         Stay in the attribute value (unquoted) state. */
 | |
|                         $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
 | |
| 
 | |
|                         $last = count($this->token['attr']) - 1;
 | |
|                         $this->token['attr'][$last]['value'] .= $char . $chars;
 | |
| 
 | |
|                         $state = 'attributeValueUnquoted';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'afterAttributeValueQuoted':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Switch to the before attribute name state. */
 | |
|                         $state = 'beforeAttributeName';
 | |
| 
 | |
|                     } elseif ($char === '/') {
 | |
|                         /* U+002F SOLIDUS (/)
 | |
|                         Switch to the self-closing start tag state. */
 | |
|                         $state = 'selfClosingStartTag';
 | |
| 
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-EOF-after-attribute-value'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Parse error. Reconsume the character in the before attribute
 | |
|                         name state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-character-after-attribute-value'
 | |
|                         ));
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'beforeAttributeName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'selfClosingStartTag':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Set the self-closing flag of the current tag token.
 | |
|                         Emit the current tag token. Switch to the data state. */
 | |
|                         // not sure if this is the name we want
 | |
|                         $this->token['self-closing'] = true;
 | |
|                         /* When an end tag token is emitted with its self-closing flag set,
 | |
|                         that is a parse error. */
 | |
|                         if ($this->token['type'] === self::ENDTAG) {
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'self-closing-end-tag'
 | |
|                             ));
 | |
|                         }
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the current tag token. Reconsume the
 | |
|                         EOF character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-eof-after-self-closing'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Parse error. Reconsume the character in the before attribute name state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-character-after-self-closing'
 | |
|                         ));
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'beforeAttributeName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'bogusComment':
 | |
|                     /* (This can only happen if the content model flag is set to the PCDATA state.) */
 | |
|                     /* Consume every character up to the first U+003E GREATER-THAN SIGN
 | |
|                     character (>) or the end of the file (EOF), whichever comes first. Emit
 | |
|                     a comment token whose data is the concatenation of all the characters
 | |
|                     starting from and including the character that caused the state machine
 | |
|                     to switch into the bogus comment state, up to and including the last
 | |
|                     consumed character before the U+003E character, if any, or up to the
 | |
|                     end of the file otherwise. (If the comment was started by the end of
 | |
|                     the file (EOF), the token is empty.) */
 | |
|                     $this->token['data'] .= (string) $this->stream->charsUntil('>');
 | |
|                     $this->stream->char();
 | |
| 
 | |
|                     $this->emitToken($this->token);
 | |
| 
 | |
|                     /* Switch to the data state. */
 | |
|                     $state = 'data';
 | |
|                 break;
 | |
| 
 | |
|                 case 'markupDeclarationOpen':
 | |
|                     // Consume for below
 | |
|                     $hyphens = $this->stream->charsWhile('-', 2);
 | |
|                     if ($hyphens === '-') {
 | |
|                         $this->stream->unget();
 | |
|                     }
 | |
|                     if ($hyphens !== '--') {
 | |
|                         $alpha = $this->stream->charsWhile(self::ALPHA, 7);
 | |
|                     }
 | |
| 
 | |
|                     /* If the next two characters are both U+002D HYPHEN-MINUS (-)
 | |
|                     characters, consume those two characters, create a comment token whose
 | |
|                     data is the empty string, and switch to the comment state. */
 | |
|                     if($hyphens === '--') {
 | |
|                         $state = 'commentStart';
 | |
|                         $this->token = array(
 | |
|                             'data' => '',
 | |
|                             'type' => self::COMMENT
 | |
|                         );
 | |
| 
 | |
|                     /* Otherwise if the next seven characters are a case-insensitive match
 | |
|                     for the word "DOCTYPE", then consume those characters and switch to the
 | |
|                     DOCTYPE state. */
 | |
|                     } elseif(strtoupper($alpha) === 'DOCTYPE') {
 | |
|                         $state = 'doctype';
 | |
| 
 | |
|                     // XXX not implemented
 | |
|                     /* Otherwise, if the insertion mode is "in foreign content"
 | |
|                     and the current node is not an element in the HTML namespace
 | |
|                     and the next seven characters are an ASCII case-sensitive
 | |
|                     match for the string "[CDATA[" (the five uppercase letters
 | |
|                     "CDATA" with a U+005B LEFT SQUARE BRACKET character before
 | |
|                     and after), then consume those characters and switch to the
 | |
|                     CDATA section state (which is unrelated to the content model
 | |
|                     flag's CDATA state). */
 | |
| 
 | |
|                     /* Otherwise, is is a parse error. Switch to the bogus comment state.
 | |
|                     The next character that is consumed, if any, is the first character
 | |
|                     that will be in the comment. */
 | |
|                     } else {
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-dashes-or-doctype'
 | |
|                         ));
 | |
|                         $this->token = array(
 | |
|                             'data' => (string) $alpha,
 | |
|                             'type' => self::COMMENT
 | |
|                         );
 | |
|                         $state = 'bogusComment';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'commentStart':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === '-') {
 | |
|                         /* U+002D HYPHEN-MINUS (-)
 | |
|                         Switch to the comment start dash state. */
 | |
|                         $state = 'commentStartDash';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Emit the comment token. Switch to the
 | |
|                         data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'incorrect-comment'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the comment token. Reconsume the
 | |
|                         EOF character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-comment'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the input character to the comment token's
 | |
|                         data. Switch to the comment state. */
 | |
|                         $this->token['data'] .= $char;
 | |
|                         $state = 'comment';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'commentStartDash':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
|                     if ($char === '-') {
 | |
|                         /* U+002D HYPHEN-MINUS (-)
 | |
|                         Switch to the comment end state */
 | |
|                         $state = 'commentEnd';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Emit the comment token. Switch to the
 | |
|                         data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'incorrect-comment'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* Parse error. Emit the comment token. Reconsume the
 | |
|                         EOF character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-comment'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         $this->token['data'] .= '-' . $char;
 | |
|                         $state = 'comment';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'comment':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === '-') {
 | |
|                         /* U+002D HYPHEN-MINUS (-)
 | |
|                         Switch to the comment end dash state */
 | |
|                         $state = 'commentEndDash';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the comment token. Reconsume the EOF character
 | |
|                         in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-comment'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the input character to the comment token's data. Stay in
 | |
|                         the comment state. */
 | |
|                         $chars = $this->stream->charsUntil('-');
 | |
| 
 | |
|                         $this->token['data'] .= $char . $chars;
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'commentEndDash':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === '-') {
 | |
|                         /* U+002D HYPHEN-MINUS (-)
 | |
|                         Switch to the comment end state  */
 | |
|                         $state = 'commentEnd';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the comment token. Reconsume the EOF character
 | |
|                         in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-comment-end-dash'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append a U+002D HYPHEN-MINUS (-) character and the input
 | |
|                         character to the comment token's data. Switch to the comment state. */
 | |
|                         $this->token['data'] .= '-'.$char;
 | |
|                         $state = 'comment';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'commentEnd':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the comment token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif($char === '-') {
 | |
|                         /* U+002D HYPHEN-MINUS (-)
 | |
|                         Parse error. Append a U+002D HYPHEN-MINUS (-) character
 | |
|                         to the comment token's data. Stay in the comment end
 | |
|                         state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-dash-after-double-dash-in-comment'
 | |
|                         ));
 | |
|                         $this->token['data'] .= '-';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Emit the comment token. Reconsume the
 | |
|                         EOF character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-comment-double-dash'
 | |
|                         ));
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Parse error. Append two U+002D HYPHEN-MINUS (-)
 | |
|                         characters and the input character to the comment token's
 | |
|                         data. Switch to the comment state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-char-in-comment'
 | |
|                         ));
 | |
|                         $this->token['data'] .= '--'.$char;
 | |
|                         $state = 'comment';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'doctype':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Switch to the before DOCTYPE name state. */
 | |
|                         $state = 'beforeDoctypeName';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Parse error. Reconsume the current character in the
 | |
|                         before DOCTYPE name state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'need-space-after-doctype'
 | |
|                         ));
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'beforeDoctypeName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'beforeDoctypeName':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Stay in the before DOCTYPE name state. */
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Create a new DOCTYPE token. Set its
 | |
|                         force-quirks flag to on. Emit the token. Switch to the
 | |
|                         data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-doctype-name-but-got-right-bracket'
 | |
|                         ));
 | |
|                         $this->emitToken(array(
 | |
|                             'name' => '',
 | |
|                             'type' => self::DOCTYPE,
 | |
|                             'force-quirks' => true,
 | |
|                             'error' => true
 | |
|                         ));
 | |
| 
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif('A' <= $char && $char <= 'Z') {
 | |
|                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 | |
|                         Create a new DOCTYPE token. Set the token's name to the
 | |
|                         lowercase version of the input character (add 0x0020 to
 | |
|                         the character's code point). Switch to the DOCTYPE name
 | |
|                         state. */
 | |
|                         $this->token = array(
 | |
|                             'name' => strtolower($char),
 | |
|                             'type' => self::DOCTYPE,
 | |
|                             'error' => true
 | |
|                         );
 | |
| 
 | |
|                         $state = 'doctypeName';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Create a new DOCTYPE token. Set its
 | |
|                         force-quirks flag to on. Emit the token. Reconsume the
 | |
|                         EOF character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'expected-doctype-name-but-got-eof'
 | |
|                         ));
 | |
|                         $this->emitToken(array(
 | |
|                             'name' => '',
 | |
|                             'type' => self::DOCTYPE,
 | |
|                             'force-quirks' => true,
 | |
|                             'error' => true
 | |
|                         ));
 | |
| 
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Create a new DOCTYPE token. Set the token's name to the
 | |
|                         current input character. Switch to the DOCTYPE name state. */
 | |
|                         $this->token = array(
 | |
|                             'name' => $char,
 | |
|                             'type' => self::DOCTYPE,
 | |
|                             'error' => true
 | |
|                         );
 | |
| 
 | |
|                         $state = 'doctypeName';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'doctypeName':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Switch to the after DOCTYPE name state. */
 | |
|                         $state = 'afterDoctypeName';
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif('A' <= $char && $char <= 'Z') {
 | |
|                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 | |
|                         Append the lowercase version of the input character
 | |
|                         (add 0x0020 to the character's code point) to the current
 | |
|                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
 | |
|                         $this->token['name'] .= strtolower($char);
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype-name'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current
 | |
|                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
 | |
|                         $this->token['name'] .= $char;
 | |
|                     }
 | |
| 
 | |
|                     // XXX this is probably some sort of quirks mode designation,
 | |
|                     // check tree-builder to be sure. In general 'error' needs
 | |
|                     // to be specc'ified, this probably means removing it at the end
 | |
|                     $this->token['error'] = ($this->token['name'] === 'HTML')
 | |
|                         ? false
 | |
|                         : true;
 | |
|                 break;
 | |
| 
 | |
|                 case 'afterDoctypeName':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Stay in the after DOCTYPE name state. */
 | |
| 
 | |
|                     } elseif($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else */
 | |
| 
 | |
|                         $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
 | |
|                         if ($nextSix === 'PUBLIC') {
 | |
|                             /* If the next six characters are an ASCII
 | |
|                             case-insensitive match for the word "PUBLIC", then
 | |
|                             consume those characters and switch to the before
 | |
|                             DOCTYPE public identifier state. */
 | |
|                             $state = 'beforeDoctypePublicIdentifier';
 | |
| 
 | |
|                         } elseif ($nextSix === 'SYSTEM') {
 | |
|                             /* Otherwise, if the next six characters are an ASCII
 | |
|                             case-insensitive match for the word "SYSTEM", then
 | |
|                             consume those characters and switch to the before
 | |
|                             DOCTYPE system identifier state. */
 | |
|                             $state = 'beforeDoctypeSystemIdentifier';
 | |
| 
 | |
|                         } else {
 | |
|                             /* Otherwise, this is the parse error. Set the DOCTYPE
 | |
|                             token's force-quirks flag to on. Switch to the bogus
 | |
|                             DOCTYPE state. */
 | |
|                             $this->emitToken(array(
 | |
|                                 'type' => self::PARSEERROR,
 | |
|                                 'data' => 'expected-space-or-right-bracket-in-doctype'
 | |
|                             ));
 | |
|                             $this->token['force-quirks'] = true;
 | |
|                             $this->token['error'] = true;
 | |
|                             $state = 'bogusDoctype';
 | |
|                         }
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'beforeDoctypePublicIdentifier':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Stay in the before DOCTYPE public identifier state. */
 | |
|                     } elseif ($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Set the DOCTYPE token's public identifier to the empty
 | |
|                         string (not missing), then switch to the DOCTYPE public
 | |
|                         identifier (double-quoted) state. */
 | |
|                         $this->token['public'] = '';
 | |
|                         $state = 'doctypePublicIdentifierDoubleQuoted';
 | |
|                     } elseif ($char === "'") {
 | |
|                         /* U+0027 APOSTROPHE (')
 | |
|                         Set the DOCTYPE token's public identifier to the empty
 | |
|                         string (not missing), then switch to the DOCTYPE public
 | |
|                         identifier (single-quoted) state. */
 | |
|                         $this->token['public'] = '';
 | |
|                         $state = 'doctypePublicIdentifierSingleQuoted';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-end-of-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks
 | |
|                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Switch to the bogus DOCTYPE state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-char-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $state = 'bogusDoctype';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'doctypePublicIdentifierDoubleQuoted':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Switch to the after DOCTYPE public identifier state. */
 | |
|                         $state = 'afterDoctypePublicIdentifier';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-end-of-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current
 | |
|                         DOCTYPE token's public identifier. Stay in the DOCTYPE
 | |
|                         public identifier (double-quoted) state. */
 | |
|                         $this->token['public'] .= $char;
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'doctypePublicIdentifierSingleQuoted':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === "'") {
 | |
|                         /* U+0027 APOSTROPHE (')
 | |
|                         Switch to the after DOCTYPE public identifier state. */
 | |
|                         $state = 'afterDoctypePublicIdentifier';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-end-of-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current
 | |
|                         DOCTYPE token's public identifier. Stay in the DOCTYPE
 | |
|                         public identifier (double-quoted) state. */
 | |
|                         $this->token['public'] .= $char;
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'afterDoctypePublicIdentifier':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Stay in the after DOCTYPE public identifier state. */
 | |
|                     } elseif ($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Set the DOCTYPE token's system identifier to the
 | |
|                         empty string (not missing), then switch to the DOCTYPE
 | |
|                         system identifier (double-quoted) state. */
 | |
|                         $this->token['system'] = '';
 | |
|                         $state = 'doctypeSystemIdentifierDoubleQuoted';
 | |
|                     } elseif ($char === "'") {
 | |
|                         /* U+0027 APOSTROPHE (')
 | |
|                         Set the DOCTYPE token's system identifier to the
 | |
|                         empty string (not missing), then switch to the DOCTYPE
 | |
|                         system identifier (single-quoted) state. */
 | |
|                         $this->token['system'] = '';
 | |
|                         $state = 'doctypeSystemIdentifierSingleQuoted';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks
 | |
|                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Switch to the bogus DOCTYPE state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-char-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $state = 'bogusDoctype';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'beforeDoctypeSystemIdentifier':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Stay in the before DOCTYPE system identifier state. */
 | |
|                     } elseif ($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Set the DOCTYPE token's system identifier to the empty
 | |
|                         string (not missing), then switch to the DOCTYPE system
 | |
|                         identifier (double-quoted) state. */
 | |
|                         $this->token['system'] = '';
 | |
|                         $state = 'doctypeSystemIdentifierDoubleQuoted';
 | |
|                     } elseif ($char === "'") {
 | |
|                         /* U+0027 APOSTROPHE (')
 | |
|                         Set the DOCTYPE token's system identifier to the empty
 | |
|                         string (not missing), then switch to the DOCTYPE system
 | |
|                         identifier (single-quoted) state. */
 | |
|                         $this->token['system'] = '';
 | |
|                         $state = 'doctypeSystemIdentifierSingleQuoted';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-char-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks
 | |
|                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Switch to the bogus DOCTYPE state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-char-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $state = 'bogusDoctype';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'doctypeSystemIdentifierDoubleQuoted':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === '"') {
 | |
|                         /* U+0022 QUOTATION MARK (")
 | |
|                         Switch to the after DOCTYPE system identifier state. */
 | |
|                         $state = 'afterDoctypeSystemIdentifier';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-end-of-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current
 | |
|                         DOCTYPE token's system identifier. Stay in the DOCTYPE
 | |
|                         system identifier (double-quoted) state. */
 | |
|                         $this->token['system'] .= $char;
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'doctypeSystemIdentifierSingleQuoted':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === "'") {
 | |
|                         /* U+0027 APOSTROPHE (')
 | |
|                         Switch to the after DOCTYPE system identifier state. */
 | |
|                         $state = 'afterDoctypeSystemIdentifier';
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-end-of-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* EOF
 | |
|                         Parse error. Set the DOCTYPE token's force-quirks flag
 | |
|                         to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Append the current input character to the current
 | |
|                         DOCTYPE token's system identifier. Stay in the DOCTYPE
 | |
|                         system identifier (double-quoted) state. */
 | |
|                         $this->token['system'] .= $char;
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'afterDoctypeSystemIdentifier':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 | |
|                         /* U+0009 CHARACTER TABULATION
 | |
|                            U+000A LINE FEED (LF)
 | |
|                            U+000C FORM FEED (FF)
 | |
|                            U+0020 SPACE
 | |
|                         Stay in the after DOCTYPE system identifier state. */
 | |
|                     } elseif ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the current DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
|                     } elseif ($char === false) {
 | |
|                         /* Parse error. Set the DOCTYPE token's force-quirks
 | |
|                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
 | |
|                         character in the data state. */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'eof-in-doctype'
 | |
|                         ));
 | |
|                         $this->token['force-quirks'] = true;
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Parse error. Switch to the bogus DOCTYPE state.
 | |
|                         (This does not set the DOCTYPE token's force-quirks
 | |
|                         flag to on.) */
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'unexpected-char-in-doctype'
 | |
|                         ));
 | |
|                         $state = 'bogusDoctype';
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 case 'bogusDoctype':
 | |
|                     /* Consume the next input character: */
 | |
|                     $char = $this->stream->char();
 | |
| 
 | |
|                     if ($char === '>') {
 | |
|                         /* U+003E GREATER-THAN SIGN (>)
 | |
|                         Emit the DOCTYPE token. Switch to the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } elseif($char === false) {
 | |
|                         /* EOF
 | |
|                         Emit the DOCTYPE token. Reconsume the EOF character in
 | |
|                         the data state. */
 | |
|                         $this->emitToken($this->token);
 | |
|                         $this->stream->unget();
 | |
|                         $state = 'data';
 | |
| 
 | |
|                     } else {
 | |
|                         /* Anything else
 | |
|                         Stay in the bogus DOCTYPE state. */
 | |
|                     }
 | |
|                 break;
 | |
| 
 | |
|                 // case 'cdataSection':
 | |
| 
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Returns a serialized representation of the tree.
 | |
|      */
 | |
|     public function save() {
 | |
|         return $this->tree->save();
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Returns the input stream.
 | |
|      */
 | |
|     public function stream() {
 | |
|         return $this->stream;
 | |
|     }
 | |
| 
 | |
|     private function consumeCharacterReference($allowed = false, $inattr = false) {
 | |
|         // This goes quite far against spec, and is far closer to the Python
 | |
|         // impl., mainly because we don't do the large unconsuming the spec
 | |
|         // requires.
 | |
| 
 | |
|         // All consumed characters.
 | |
|         $chars = $this->stream->char();
 | |
| 
 | |
|         /* This section defines how to consume a character
 | |
|         reference. This definition is used when parsing character
 | |
|         references in text and in attributes.
 | |
| 
 | |
|         The behavior depends on the identity of the next character
 | |
|         (the one immediately after the U+0026 AMPERSAND character): */
 | |
| 
 | |
|         if (
 | |
|             $chars[0] === "\x09" ||
 | |
|             $chars[0] === "\x0A" ||
 | |
|             $chars[0] === "\x0C" ||
 | |
|             $chars[0] === "\x20" ||
 | |
|             $chars[0] === '<' ||
 | |
|             $chars[0] === '&' ||
 | |
|             $chars === false ||
 | |
|             $chars[0] === $allowed
 | |
|         ) {
 | |
|             /* U+0009 CHARACTER TABULATION
 | |
|                U+000A LINE FEED (LF)
 | |
|                U+000C FORM FEED (FF)
 | |
|                U+0020 SPACE
 | |
|                U+003C LESS-THAN SIGN
 | |
|                U+0026 AMPERSAND
 | |
|                EOF
 | |
|                The additional allowed character, if there is one
 | |
|             Not a character reference. No characters are consumed,
 | |
|             and nothing is returned. (This is not an error, either.) */
 | |
|             // We already consumed, so unconsume.
 | |
|             $this->stream->unget();
 | |
|             return '&';
 | |
|         } elseif ($chars[0] === '#') {
 | |
|             /* Consume the U+0023 NUMBER SIGN. */
 | |
|             // Um, yeah, we already did that.
 | |
|             /* The behavior further depends on the character after
 | |
|             the U+0023 NUMBER SIGN: */
 | |
|             $chars .= $this->stream->char();
 | |
|             if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
 | |
|                 /* U+0078 LATIN SMALL LETTER X
 | |
|                    U+0058 LATIN CAPITAL LETTER X */
 | |
|                 /* Consume the X. */
 | |
|                 // Um, yeah, we already did that.
 | |
|                 /* Follow the steps below, but using the range of
 | |
|                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
 | |
|                 NINE, U+0061 LATIN SMALL LETTER A through to U+0066
 | |
|                 LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
 | |
|                 A, through to U+0046 LATIN CAPITAL LETTER F (in other
 | |
|                 words, 0123456789, ABCDEF, abcdef). */
 | |
|                 $char_class = self::HEX;
 | |
|                 /* When it comes to interpreting the
 | |
|                 number, interpret it as a hexadecimal number. */
 | |
|                 $hex = true;
 | |
|             } else {
 | |
|                 /* Anything else */
 | |
|                 // Unconsume because we shouldn't have consumed this.
 | |
|                 $chars = $chars[0];
 | |
|                 $this->stream->unget();
 | |
|                 /* Follow the steps below, but using the range of
 | |
|                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
 | |
|                 NINE (i.e. just 0123456789). */
 | |
|                 $char_class = self::DIGIT;
 | |
|                 /* When it comes to interpreting the number,
 | |
|                 interpret it as a decimal number. */
 | |
|                 $hex = false;
 | |
|             }
 | |
| 
 | |
|             /* Consume as many characters as match the range of characters given above. */
 | |
|             $consumed = $this->stream->charsWhile($char_class);
 | |
|             if ($consumed === '' || $consumed === false) {
 | |
|                 /* If no characters match the range, then don't consume
 | |
|                 any characters (and unconsume the U+0023 NUMBER SIGN
 | |
|                 character and, if appropriate, the X character). This
 | |
|                 is a parse error; nothing is returned. */
 | |
|                 $this->emitToken(array(
 | |
|                     'type' => self::PARSEERROR,
 | |
|                     'data' => 'expected-numeric-entity'
 | |
|                 ));
 | |
|                 return '&' . $chars;
 | |
|             } else {
 | |
|                 /* Otherwise, if the next character is a U+003B SEMICOLON,
 | |
|                 consume that too. If it isn't, there is a parse error. */
 | |
|                 if ($this->stream->char() !== ';') {
 | |
|                     $this->stream->unget();
 | |
|                     $this->emitToken(array(
 | |
|                         'type' => self::PARSEERROR,
 | |
|                         'data' => 'numeric-entity-without-semicolon'
 | |
|                     ));
 | |
|                 }
 | |
| 
 | |
|                 /* If one or more characters match the range, then take
 | |
|                 them all and interpret the string of characters as a number
 | |
|                 (either hexadecimal or decimal as appropriate). */
 | |
|                 $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
 | |
| 
 | |
|                 /* If that number is one of the numbers in the first column
 | |
|                 of the following table, then this is a parse error. Find the
 | |
|                 row with that number in the first column, and return a
 | |
|                 character token for the Unicode character given in the
 | |
|                 second column of that row. */
 | |
|                 $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
 | |
|                 if ($new_codepoint) {
 | |
|                     $this->emitToken(array(
 | |
|                         'type' => self::PARSEERROR,
 | |
|                         'data' => 'illegal-windows-1252-entity'
 | |
|                     ));
 | |
|                     $codepoint = $new_codepoint;
 | |
|                 } else {
 | |
|                     /* Otherwise, if the number is in the range 0x0000 to 0x0008,
 | |
|                     U+000B,  U+000E to 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF ,
 | |
|                     0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
 | |
|                     0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
 | |
|                     0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
 | |
|                     0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
 | |
|                     0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
 | |
|                     0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
 | |
|                     is a parse error; return a character token for the U+FFFD
 | |
|                     REPLACEMENT CHARACTER character instead. */
 | |
|                     // && has higher precedence than ||
 | |
|                     if (
 | |
|                         $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
 | |
|                         $codepoint === 0x000B ||
 | |
|                         $codepoint >= 0x000E && $codepoint <= 0x001F ||
 | |
|                         $codepoint >= 0x007F && $codepoint <= 0x009F ||
 | |
|                         $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
 | |
|                         $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
 | |
|                         ($codepoint & 0xFFFE) === 0xFFFE ||
 | |
|                         $codepoint > 0x10FFFF
 | |
|                     ) {
 | |
|                         $this->emitToken(array(
 | |
|                             'type' => self::PARSEERROR,
 | |
|                             'data' => 'illegal-codepoint-for-numeric-entity'
 | |
|                         ));
 | |
|                         $codepoint = 0xFFFD;
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|                 /* Otherwise, return a character token for the Unicode
 | |
|                 character whose code point is that number. */
 | |
|                 return HTML5_Data::utf8chr($codepoint);
 | |
|             }
 | |
| 
 | |
|         } else {
 | |
|             /* Anything else */
 | |
| 
 | |
|             /* Consume the maximum number of characters possible,
 | |
|             with the consumed characters matching one of the
 | |
|             identifiers in the first column of the named character
 | |
|             references table (in a case-sensitive manner). */
 | |
| 
 | |
|             // we will implement this by matching the longest
 | |
|             // alphanumeric + semicolon string, and then working
 | |
|             // our way backwards
 | |
|             $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
 | |
|             $len = strlen($chars);
 | |
| 
 | |
|             $refs = HTML5_Data::getNamedCharacterReferences();
 | |
|             $codepoint = false;
 | |
|             for($c = $len; $c > 0; $c--) {
 | |
|                 $id = substr($chars, 0, $c);
 | |
|                 if(isset($refs[$id])) {
 | |
|                     $codepoint = $refs[$id];
 | |
|                     break;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             /* If no match can be made, then this is a parse error.
 | |
|             No characters are consumed, and nothing is returned. */
 | |
|             if (!$codepoint) {
 | |
|                 $this->emitToken(array(
 | |
|                     'type' => self::PARSEERROR,
 | |
|                     'data' => 'expected-named-entity'
 | |
|                 ));
 | |
|                 return '&' . $chars;
 | |
|             }
 | |
| 
 | |
|             /* If the last character matched is not a U+003B SEMICOLON
 | |
|             (;), there is a parse error. */
 | |
|             $semicolon = true;
 | |
|             if (substr($id, -1) !== ';') {
 | |
|                 $this->emitToken(array(
 | |
|                     'type' => self::PARSEERROR,
 | |
|                     'data' => 'named-entity-without-semicolon'
 | |
|                 ));
 | |
|                 $semicolon = false;
 | |
|             }
 | |
| 
 | |
| 
 | |
|             /* If the character reference is being consumed as part of
 | |
|             an attribute, and the last character matched is not a
 | |
|             U+003B SEMICOLON (;), and the next character is in the
 | |
|             range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
 | |
|             LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
 | |
|             or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
 | |
|             then, for historical reasons, all the characters that were
 | |
|             matched after the U+0026 AMPERSAND (&) must be unconsumed,
 | |
|             and nothing is returned. */
 | |
|             if (
 | |
|                 $inattr && !$semicolon &&
 | |
|                 strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
 | |
|             ) {
 | |
|                 return '&' . $chars;
 | |
|             }
 | |
| 
 | |
|             /* Otherwise, return a character token for the character
 | |
|             corresponding to the character reference name (as given
 | |
|             by the second column of the named character references table). */
 | |
|             return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     private function characterReferenceInAttributeValue($allowed = false) {
 | |
|         /* Attempt to consume a character reference. */
 | |
|         $entity = $this->consumeCharacterReference($allowed, true);
 | |
| 
 | |
|         /* If nothing is returned, append a U+0026 AMPERSAND
 | |
|         character to the current attribute's value.
 | |
| 
 | |
|         Otherwise, append the returned character token to the
 | |
|         current attribute's value. */
 | |
|         $char = (!$entity)
 | |
|             ? '&'
 | |
|             : $entity;
 | |
| 
 | |
|         $last = count($this->token['attr']) - 1;
 | |
|         $this->token['attr'][$last]['value'] .= $char;
 | |
| 
 | |
|         /* Finally, switch back to the attribute value state that you
 | |
|         were in when were switched into this state. */
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Emits a token, passing it on to the tree builder.
 | |
|      */
 | |
|     protected function emitToken($token, $checkStream = true) {
 | |
|         if ($checkStream) {
 | |
|             // Emit errors from input stream.
 | |
|             while ($this->stream->errors) {
 | |
|                 $this->emitToken(array_shift($this->stream->errors), false);
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // the current structure of attributes is not a terribly good one
 | |
|         $this->tree->emitToken($token);
 | |
| 
 | |
|         if(is_int($this->tree->content_model)) {
 | |
|             $this->content_model = $this->tree->content_model;
 | |
|             $this->tree->content_model = null;
 | |
| 
 | |
|         } elseif($token['type'] === self::ENDTAG) {
 | |
|             $this->content_model = self::PCDATA;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 |