]> git.wh0rd.org - tt-rss.git/blob - lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php
faf00b829136bab16c5772ae2743193bb75649ad
[tt-rss.git] / lib / htmlpurifier / library / HTMLPurifier / Lexer / PH5P.php
1 <?php
2
3 /**
4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
6 *
7 * @note
8 * Recent changes to PHP's DOM extension have resulted in some fatal
9 * error conditions with the original version of PH5P. Pending changes,
10 * this lexer will punt to DirectLex if DOM throws an exception.
11 */
12
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes HTML by running it through the HTML5 (PH5P) parser and then
     * walking the resulting DOM with tokenizeDOM().
     *
     * The input is first passed through normalize() and wrapHTML(). Tokens
     * are collected from the first <div> inside the first <body> of the
     * first <html> element of the parsed document.
     *
     * If the parser raises a DOMException, the exception is registered in
     * the context under the key 'PH5PError' and the ORIGINAL (un-normalized)
     * HTML is tokenized by HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html    Markup to tokenize.
     * @param mixed  $config  Forwarded to normalize(), wrapHTML() and DirectLex.
     * @param mixed  $context Forwarded likewise; also receives 'PH5PError'.
     * @return array Tokens produced by tokenizeDOM(), or by DirectLex on fallback.
     */
    public function tokenizeHTML($html, $config, $context) {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5 = new HTML5($prepared);
            $document = $html5->save();
        } catch (DOMException $failure) {
            // The HTML5 parser could not cope with the DOM extension; fall
            // back to DirectLex and keep the exception around so that the
            // fallback can be detected by the caller.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $failure);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Descend html -> body -> div, taking the first match at each level.
        $html_node = $document->getElementsByTagName('html')->item(0);
        $body_node = $html_node->getElementsByTagName('body')->item(0);
        $div_node  = $body_node->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($div_node, $tokens);
        return $tokens;
    }

}
37
38 /*
39
40 Copyright 2007 Jeroen van der Meer <http://jero.net/>
41
42 Permission is hereby granted, free of charge, to any person obtaining a
43 copy of this software and associated documentation files (the
44 "Software"), to deal in the Software without restriction, including
45 without limitation the rights to use, copy, modify, merge, publish,
46 distribute, sublicense, and/or sell copies of the Software, and to
47 permit persons to whom the Software is furnished to do so, subject to
48 the following conditions:
49
50 The above copyright notice and this permission notice shall be included
51 in all copies or substantial portions of the Software.
52
53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
54 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
60
61 */
62
63 class HTML5 {
64 private $data;
65 private $char;
66 private $EOF;
67 private $state;
68 private $tree;
69 private $token;
70 private $content_model;
71 private $escape = false;
72 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
73 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
74 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
75 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
76 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
77 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
78 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
79 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
80 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
81 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
82 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
83 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
84 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
85 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
86 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
87 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
88 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
89 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
90 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
91 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
92 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
93 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
94 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
95 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
96 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
97 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
98 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
99 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
100 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
101 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
102 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
103 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
104 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
105 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
106 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
107 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
108 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
109 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
110 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
111 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
112 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
113 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
114
115 const PCDATA = 0;
116 const RCDATA = 1;
117 const CDATA = 2;
118 const PLAINTEXT = 3;
119
120 const DOCTYPE = 0;
121 const STARTTAG = 1;
122 const ENDTAG = 2;
123 const COMMENT = 3;
124 const CHARACTR = 4;
125 const EOF = 5;
126
/**
 * Sets up the tokenizer for the given markup and immediately runs it.
 *
 * The state machine starts in the 'data' state and keeps dispatching to
 * the method named "<state>State" until some state handler sets the
 * state to null.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data) {
    $this->data = $data;
    $this->char = -1;               // nothing consumed yet
    $this->EOF = strlen($data);     // index one past the last byte
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    for($this->state = 'data'; $this->state !== null; ) {
        $handler = $this->state.'State';
        $this->$handler();
    }
}
141
/**
 * Returns whatever the tree constructer's save() method returns.
 *
 * @return mixed Result of HTML5TreeConstructer::save().
 */
public function save() {
    $result = $this->tree->save();
    return $result;
}
145
/**
 * Returns the byte at the current read position.
 *
 * @return string|false The current byte, or false once the position is
 *                      at or beyond the end of the input.
 */
private function char() {
    if($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
151
/**
 * Reads from the input without moving the read position.
 *
 * With $l === 0 a single byte at offset $s is returned; otherwise the
 * $l bytes starting at $s are returned.
 *
 * Fixes over the previous version:
 *  - a multi-byte read that ends exactly at the end of the input is now
 *    allowed (the old "$s + $l < EOF" test rejected it, so e.g. a "--"
 *    at the very end of the input could never be matched);
 *  - a negative offset yields null instead of silently reading from the
 *    end of the string.
 *
 * @param int $s Zero-based start offset.
 * @param int $l Number of bytes to read, or 0 for a single byte.
 * @return string|null The requested bytes, or null when the request does
 *                     not fit inside the input.
 */
private function character($s, $l = 0) {
    if($s < 0) {
        return null;
    }

    if($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    if($s + $l <= $this->EOF) {
        return substr($this->data, $s, $l);
    }

    return null;
}
161
/**
 * Returns the longest run of bytes, starting at $start, that all belong
 * to the given character class.
 *
 * Fix: when the byte at $start is NOT in the class, the previous
 * preg_replace() based version found no match and therefore returned the
 * whole remainder of the input. An empty string is returned in that case
 * now, which is what "run of matching characters" implies.
 *
 * @param string $char_class Body of a PCRE character class (the part
 *                           between "[" and "]"), e.g. 'A-Za-z' or '^>'.
 * @param int    $start      Zero-based offset at which the run must begin.
 * @return string The matching run, possibly empty.
 */
private function characters($char_class, $start) {
    $rest = substr($this->data, $start);

    // substr() may yield false (older PHP) or '' when $start is at/after
    // the end of the input; there is nothing to match in either case.
    if($rest === false || $rest === '') {
        return '';
    }

    if(preg_match('#^['.$char_class.']+#', $rest, $match) === 1) {
        return $match[0];
    }

    return '';
}
165
/**
 * Data state: consumes the next input character and either switches to
 * another tokenizer state or emits character tokens.
 *
 * Fixes over the previous version:
 *  - The "<!--" test looked at the four bytes BEFORE the current "-"
 *    (offset char - 4) instead of the four bytes ending at it, and the
 *    "-->" test looked at the three bytes STARTING at the current ">",
 *    which can never equal "-->". The escape flag was therefore set late
 *    and never cleared. Both tests now inspect the bytes ending at the
 *    current character, as described in the comments below.
 *  - In the RCDATA/CDATA content models the "anything else" run now stops
 *    at "-" and ">" as well, so those characters actually reach the
 *    escape-flag checks instead of being swallowed by the run.
 *  - The "anything else" run always consumes at least the current
 *    character. Previously a "&" in CDATA, or a "<" while the escape flag
 *    was set, produced a zero-length run and stepped the read position
 *    back, looping forever.
 */
private function dataState() {
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    // RCDATA and CDATA share the escape-flag handling.
    $raw_text = ($this->content_model === self::RCDATA ||
        $this->content_model === self::CDATA);

    if($char === '&' && ($this->content_model === self::PCDATA ||
    $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        if($raw_text && $this->escape === false && $this->char >= 3 &&
        substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    /* U+003C LESS-THAN SIGN (<) */
    } elseif($char === '<' && ($this->content_model === self::PCDATA ||
    ($raw_text && $this->escape === false))) {
        /* PCDATA, or RCDATA/CDATA with the escape flag false: switch to
        the tag open state. Otherwise: treat it as per the "anything else"
        entry below. */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif($char === '>') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->", set
        the escape flag to false. */
        if($raw_text && $this->escape === true && $this->char >= 2 &&
        substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
        otherwise would also be treated as a character token and emit them
        as a single character token. Stay in the data state. */

        // In RCDATA/CDATA the run must also stop before "-" and ">" so the
        // escape-flag branches above get to see them.
        $stops = $raw_text ? '<&->' : '<&';

        // The current character is always part of the run (it was not
        // handled by any branch above); scanning starts after it, which
        // guarantees forward progress.
        $len = 1 + strcspn($this->data, $stops, $this->char + 1);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
265
/**
 * Entity data state: tries to consume an entity via entity() and emits
 * the result as a character token, then returns to the data state.
 *
 * When entity() yields a falsy value, a literal "&" is emitted instead.
 */
private function entityDataState() {
    $consumed = $this->entity();

    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $consumed ? $consumed : '&'
    ));

    $this->state = 'data';
}
281
/**
 * Tag open state: decides what follows a "<".
 *
 * RCDATA/CDATA: only "</" is meaningful (switch to the close tag open
 * state); anything else emits a literal "<" and returns to the data
 * state without consuming the following character.
 *
 * PCDATA: the next character is consumed and selects the markup
 * declaration, close tag, tag name or bogus comment state, or results in
 * literal character tokens.
 *
 * Any other content model leaves the tokenizer untouched.
 */
private function tagOpenState() {
    $model = $this->content_model;

    if($model == self::RCDATA || $model == self::CDATA) {
        if($this->character($this->char + 1) === '/') {
            // Consume the solidus and look for a matching end tag.
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        // Not an end tag: the "<" is plain text.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<'
        ));
        $this->state = 'data';
        return;
    }

    if($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    // Characters that simply select another state.
    $transitions = array(
        '!' => 'markupDeclarationOpen',
        '/' => 'closeTagOpen',
        '?' => 'bogusComment'   // parse error
    );

    if(is_string($char) && isset($transitions[$char])) {
        $this->state = $transitions[$char];
        return;
    }

    if(preg_match('/^[A-Za-z]$/', $char)) {
        // A letter starts a new start tag token; it is not emitted yet
        // because the tag name and attributes are still to be collected.
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if($char === '>') {
        // Parse error: "<>" is emitted as literal text.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<>'
        ));
        $this->state = 'data';
        return;
    }

    // Anything else is a parse error: emit "<" and reconsume the current
    // character in the data state.
    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => '<'
    ));
    $this->char--;
    $this->state = 'data';
}
366
/**
 * Close tag open state: handles the input following "</".
 *
 * In RCDATA/CDATA the upcoming letters must match the node name at the
 * top of the tree constructer's stack and be followed by whitespace,
 * ">" or "/"; otherwise "</" is emitted as text and the tokenizer
 * returns to the data state. In all other cases the next character is
 * consumed and starts an end tag token, is dropped (">"), is treated as
 * EOF, or opens a bogus comment.
 */
private function closeTagOpenState() {
    $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
    $the_same = count($this->tree->stack) > 0 &&
        $next_node === end($this->tree->stack)->nodeName;

    $in_raw_text = ($this->content_model === self::RCDATA ||
        $this->content_model === self::CDATA);

    // Work out whether, inside RCDATA/CDATA, this "</" fails to close the
    // current element.
    $not_a_close_tag = false;
    if($in_raw_text) {
        if(!$the_same) {
            $not_a_close_tag = true;
        } else {
            $follower = $this->character($this->char + 1 + strlen($next_node));
            $not_a_close_tag = !preg_match('/[\t\n\x0b\x0c >\/]/', $follower)
                || $this->EOF === $this->char;
        }
    }

    if($not_a_close_tag) {
        // Parse error: "</" is ordinary text here.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->state = 'data';
        return;
    }

    // PCDATA, or a matching end tag: consume the next input character.
    $this->char++;
    $char = $this->char();

    if(preg_match('/^[A-Za-z]$/', $char)) {
        // Start a new end tag token; it is emitted once the tag is complete.
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if($char === '>') {
        // "</>" - parse error, nothing is emitted.
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: emit "</" and reconsume EOF in the data state.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Parse error: everything up to the next ">" becomes a bogus comment.
    $this->state = 'bogusComment';
}
439
/**
 * Tag name state: consumes one character of a tag name.
 *
 * Whitespace and "/" move on to the before attribute name state, ">"
 * emits the current tag token, EOF emits it and is reconsumed in the
 * data state, and any other character is lower-cased and appended to
 * the tag name.
 */
private function tagNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: emit what we have and let the data state see EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    if($char === '/') {
        // Parse error unless this is a permitted slash.
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else extends the tag name.
    $this->token['name'] .= strtolower($char);
    $this->state = 'tagName';
}
483
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * until an attribute name, ">" or EOF is found.
 *
 * A new attribute is appended to the current tag token with the
 * lower-cased character as the start of its name and a null value.
 */
private function beforeAttributeNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    $is_space = preg_match('/^[\t\n\x0b\x0c ]$/', $char);

    if($is_space || $char === '/') {
        // Whitespace, or a slash (parse error unless permitted): keep
        // looking for an attribute name.
        $this->state = 'beforeAttributeName';
        return;
    }

    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: emit the tag and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute.
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->state = 'attributeName';
}
532
/**
 * Attribute name state: consumes one character of an attribute name.
 *
 * Whitespace ends the name (after attribute name state), "=" introduces
 * a value, ">" emits the tag, a "/" that is not directly followed by ">"
 * goes back to the before attribute name state, EOF emits the tag and
 * is reconsumed in the data state. Every other character - including a
 * "/" that IS directly followed by ">" - is lower-cased and appended to
 * the name of the most recently added attribute.
 */
private function attributeNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'afterAttributeName';
        return;
    }

    if($char === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($char === '/' && $this->character($this->char + 1) !== '>') {
        // Parse error unless this is a permitted slash.
        $this->state = 'beforeAttributeName';
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: emit the tag and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else extends the current (last) attribute's name.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['name'] .= strtolower($char);
    $this->state = 'attributeName';
}
583
/**
 * After attribute name state: decides what follows a completed
 * attribute name.
 *
 * Whitespace is skipped, "=" introduces a value, ">" emits the tag, a
 * "/" not directly followed by ">" returns to the before attribute name
 * state, EOF emits the tag and is reconsumed in the data state. Any
 * other character starts a new attribute (lower-cased name, null value).
 */
private function afterAttributeNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'afterAttributeName';
        return;
    }

    if($char === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($char === '/' && $this->character($this->char + 1) !== '>') {
        // Parse error unless this is a permitted slash.
        $this->state = 'beforeAttributeName';
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: emit the tag and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the previous attribute had no value and a new
    // attribute begins here.
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->state = 'attributeName';
}
637
/**
 * Before attribute value state: decides how the value after "=" is
 * delimited.
 *
 * Whitespace is skipped; a double or single quote selects the matching
 * quoted-value state; "&" is reconsumed in the unquoted-value state;
 * ">" emits the current tag token; any other character becomes the
 * first character of an unquoted value.
 *
 * Fix: end of input is now handled explicitly. Previously EOF fell into
 * the "anything else" branch and pushed the tokenizer into the
 * unquoted-value state with the read position already at the end of the
 * input. EOF is now treated like in the neighbouring tag states: the
 * current tag token is emitted and EOF is reconsumed in the data state.
 */
private function beforeAttributeValueState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        // Step back so that the data state's increment lands on EOF.
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
685
/**
 * Attribute value (double-quoted) state: consumes one character of a
 * value delimited by double quotes.
 *
 * The closing quote returns to the before attribute name state, "&"
 * triggers entity handling (the state itself is left unchanged), EOF
 * emits the current tag token and is reconsumed in the data state, and
 * anything else is appended to the value of the last attribute.
 */
private function attributeValueDoubleQuotedState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if($char === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if($char === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: unterminated attribute value.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is part of the value.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['value'] .= $char;
    $this->state = 'attributeValueDoubleQuoted';
}
720
/**
 * Attribute value (single-quoted) state: consumes one character of a
 * value delimited by apostrophes.
 *
 * The closing apostrophe returns to the before attribute name state,
 * "&" triggers entity handling (the state itself is left unchanged),
 * EOF emits the current tag token and is reconsumed in the data state,
 * and anything else is appended to the value of the last attribute.
 */
private function attributeValueSingleQuotedState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if($char === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if($char === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    if($this->char === $this->EOF) {
        // Parse error: unterminated attribute value.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is part of the value.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['value'] .= $char;
    $this->state = 'attributeValueSingleQuoted';
}
755
/**
 * Attribute value (unquoted) state: consumes one character of an
 * unquoted attribute value.
 *
 * Whitespace ends the value (before attribute name state), "&" triggers
 * entity handling, ">" emits the current tag token, and anything else
 * is appended to the value of the last attribute.
 *
 * Fix: end of input is now handled. Previously there was no EOF branch,
 * so an unterminated unquoted value (e.g. input ending in "<a href=x")
 * kept appending nothing and advancing the read position forever. EOF
 * now emits the current tag token and is reconsumed in the data state,
 * matching the quoted attribute value states. The check uses ">="
 * because this state can be entered with the read position already at
 * the end of the input.
 */
private function attributeValueUnquotedState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the entity in attribute value state. */
        $this->entityInAttributeValueState();

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        // Step back so that the data state's increment lands on EOF.
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
791
/**
 * Entity in attribute value: tries to consume an entity via entity() and
 * appends the result to the value of the last attribute of the current
 * tag token. A literal "&" is appended when entity() yields a falsy
 * value.
 *
 * The tokenizer state is not changed here, so processing continues in
 * whichever attribute value state made the call.
 */
private function entityInAttributeValueState() {
    $consumed = $this->entity();

    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['value'] .= $consumed ? $consumed : '&';
}
806
/**
 * Bogus comment state: turns everything from the current character up
 * to (but not including) the next ">" - or up to the end of the input -
 * into a comment token, then returns to the data state.
 *
 * Fix: the comment text is now measured with strcspn() instead of
 * characters('^>', ...). When the current character already is ">"
 * (e.g. for the input "<!>"), the old lookup found no match and handed
 * back the entire remaining input, so the rest of the document was
 * swallowed into the comment. The comment is empty in that case now.
 */
private function bogusCommentState() {
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. */
    $len = strcspn($this->data, '>', $this->char);

    // substr() may return false for an empty tail on older PHP versions.
    $data = (string) substr($this->data, $this->char, $len);

    $this->emitToken(array(
        'data' => $data,
        'type' => self::COMMENT
    ));

    // Leave the read position on the ">" (or at EOF); the data state
    // advances past it.
    $this->char += $len;

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if($this->char >= $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
832
/**
 * Markup declaration open state: handles the input following "<!".
 *
 * "--" starts a comment token with null data; a case-insensitive
 * "doctype" switches to the DOCTYPE state; anything else is a parse
 * error and becomes a bogus comment beginning with the next character.
 */
private function markupDeclarationOpenState() {
    $lookahead = $this->char + 1;

    if($this->character($lookahead, 2) === '--') {
        // Consume both hyphens and open a comment.
        $this->char += 2;
        $this->state = 'comment';
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        return;
    }

    if(strtolower($this->character($lookahead, 7)) === 'doctype') {
        // Consume the seven letters of "DOCTYPE".
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Parse error: the next character, if any, is the first character of
    // the bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
860
/**
 * Comment state: consumes one character inside a comment.
 *
 * "-" switches to the comment dash state, EOF emits the comment token
 * and is reconsumed in the data state, and anything else is appended to
 * the comment data.
 */
private function commentState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    switch(true) {
        case $char === '-':
            $this->state = 'commentDash';
            break;

        case $this->char === $this->EOF:
            // Parse error: unterminated comment.
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        default:
            $this->token['data'] .= $char;
            break;
    }
}
886
/**
 * Comment dash state: a single "-" has been seen inside a comment.
 *
 * A second "-" switches to the comment end state, EOF emits the comment
 * token and is reconsumed in the data state, and anything else appends
 * "-" plus the character to the comment data and returns to the comment
 * state.
 */
private function commentDashState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    switch(true) {
        case $char === '-':
            $this->state = 'commentEnd';
            break;

        case $this->char === $this->EOF:
            // Parse error: unterminated comment.
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        default:
            // The lone dash was ordinary comment text after all.
            $this->token['data'] .= '-'.$char;
            $this->state = 'comment';
            break;
    }
}
913
/**
 * Comment end state: "--" has been seen inside a comment.
 *
 * ">" emits the comment token and returns to the data state; a further
 * "-" appends one "-" to the comment data and stays in this state; EOF
 * emits the comment token and is reconsumed in the data state; anything
 * else appends "--" plus the character and returns to the comment state.
 */
private function commentEndState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    switch(true) {
        case $char === '>':
            $this->emitToken($this->token);
            $this->state = 'data';
            break;

        case $char === '-':
            $this->token['data'] .= '-';
            break;

        case $this->char === $this->EOF:
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        default:
            $this->token['data'] .= '--'.$char;
            $this->state = 'comment';
            break;
    }
}
936
/**
 * DOCTYPE state: consumes a single whitespace character after
 * "DOCTYPE" if there is one, then switches to the before DOCTYPE name
 * state. A non-whitespace character is left to be reconsumed there.
 */
private function doctypeState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    if(!preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Not whitespace: put it back for the next state.
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
950
/**
 * Tokenizer state that skips whitespace in front of the DOCTYPE name and
 * starts the DOCTYPE token on the first name character.
 *
 * Every DOCTYPE token created here starts out flagged as erroneous.
 */
private function beforeDoctypeNameState() {
    $this->char++;
    $next = $this->char();

    // Whitespace: nothing to record, stay in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    // Template shared by all remaining branches.
    $doctype = array(
        'name' => null,
        'type' => self::DOCTYPE,
        'error' => true
    );

    if (preg_match('/^[a-z]$/', $next)) {
        // Lowercase ASCII letters are stored uppercased.
        $doctype['name'] = strtoupper($next);
        $this->token = $doctype;
        $this->state = 'doctypeName';

    } elseif ($next === '>') {
        // "<!DOCTYPE>" without a name: emit a nameless token.
        $this->emitToken($doctype);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        // End of input: emit a nameless token and let the data state
        // reconsume the end-of-input position.
        $this->emitToken($doctype);
        $this->char--;
        $this->state = 'data';

    } else {
        // Any other character starts the name unchanged.
        $doctype['name'] = $next;
        $this->token = $doctype;
        $this->state = 'doctypeName';
    }
}
997
/**
 * Tokenizer state that accumulates the DOCTYPE name.
 *
 * Advances the cursor by one character, appends it to the current
 * DOCTYPE token's name (lowercase ASCII letters are uppercased), and
 * refreshes the token's error flag, which is clear only while the name
 * collected so far is exactly "HTML".
 */
private function doctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name. The state identifier is spelled in
        // lower camel case like every other state name in this class.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        // Emit what was collected and let the data state reconsume the
        // end-of-input position.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $char;
    }

    // Re-evaluated after every character so that the flag is already
    // correct when a later ">" or EOF emits the token.
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
1026
/**
 * Tokenizer state for the input between the DOCTYPE name and the
 * closing ">".
 *
 * Whitespace is skipped; anything other than ">" or end of input marks
 * the DOCTYPE as erroneous and hands over to the bogus DOCTYPE state.
 */
private function afterDoctypeNameState() {
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in the after DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Emit the token and let the data state reconsume the
        // end-of-input position.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Unexpected content after the name.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1049
/**
 * Tokenizer state that discards the remainder of a malformed DOCTYPE.
 *
 * Characters are skipped one per call until ">" or the end of input is
 * reached, at which point the current token is emitted.
 */
private function bogusDoctypeState() {
    $this->char++;

    if ($this->char() === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Let the data state reconsume the end-of-input position.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
    }

    // Any other character is ignored; the state does not change.
}
1068
/**
 * Consume a character reference starting at the current cursor position.
 *
 * On entry the cursor ($this->char) sits on the U+0026 AMPERSAND. On
 * success the cursor is left on the last consumed character of the
 * reference and the decoded text is returned. On failure the cursor is
 * restored and false is returned.
 *
 * Compared with the previous implementation this version:
 *  - inspects the character AFTER "#" when looking for the hex marker
 *    (the old code re-tested the "#" itself, so "x"/"X" never matched);
 *  - builds numeric references from the digits actually read and moves
 *    the cursor past them, including an optional trailing ";";
 *  - picks the LONGEST identifier found in the entities table, as the
 *    algorithm description requires, instead of the shortest prefix;
 *  - never appends a second ";" to an identifier that already ends in one.
 *
 * NOTE(review): relies on character($pos) returning the single character
 * at $pos and on characters($class, $pos) returning the text starting at
 * $pos (neither helper is visible here) - confirm against their
 * definitions. Only the leading run of allowed characters is used, so
 * any extra text the helper may return is ignored.
 *
 * @return string|false decoded text, or false if nothing was consumed
 */
private function entity() {
    $start = $this->char;
    $entity = null;

    // The behaviour depends on the identity of the next character (the
    // one immediately after the U+0026 AMPERSAND character):
    switch ($this->character($start + 1)) {
        // U+0023 NUMBER SIGN (#)
        case '#':
            // The behaviour further depends on the character after the
            // U+0023 NUMBER SIGN:
            switch ($this->character($start + 2)) {
                // U+0078 LATIN SMALL LETTER X
                // U+0058 LATIN CAPITAL LETTER X
                case 'x':
                case 'X':
                    // Hexadecimal reference: 0-9, A-F, a-f.
                    $char = 1;
                    $char_class = '0-9A-Fa-f';
                    break;

                // Anything else
                default:
                    // Decimal reference: just 0-9.
                    $char = 0;
                    $char_class = '0-9';
                    break;
            }

            // Position of the first digit: past "&", "#" and, for hex
            // references, the "x".
            $digits_at = $start + 2 + $char;
            $e_name = (string) $this->characters($char_class, $digits_at);

            // Consume as many characters as match the range of characters
            // given above.
            if (preg_match('/^['.$char_class.']+/', $e_name, $run)) {
                $digits = $run[0];

                // Leave the cursor on the last digit ...
                $this->char = $digits_at + strlen($digits) - 1;

                // ... or on the terminating semicolon when there is one.
                if ($this->character($this->char + 1) === ';') {
                    $this->char++;
                }

                // The hex marker is normalised to a lowercase "x".
                $entity = '#'.($char === 1 ? 'x' : '').$digits;
            }
            break;

        // Anything else
        default:
            // Consume the maximum number of characters possible, with the
            // consumed characters case-sensitively matching one of the
            // identifiers in the first column of the entities table.
            $e_name = (string) $this->characters('0-9A-Za-z;', $start + 1);

            if (preg_match('/^[0-9A-Za-z;]+/', $e_name, $run)) {
                $e_name = $run[0];
                $len = strlen($e_name);

                // Longest candidate first, so that a longer identifier
                // wins over any shorter identifier that is its prefix.
                for ($c = $len; $c >= 1; $c--) {
                    $id = substr($e_name, 0, $c);

                    if (!in_array($id, $this->entities, true)) {
                        continue;
                    }

                    // Cursor on the last character of the identifier.
                    $this->char = $start + $c;

                    if ($e_name[$c - 1] !== ';' && $c < $len && $e_name[$c] === ';') {
                        $this->char++; // consume extra semicolon
                    }

                    $entity = $id;
                    break;
                }
            }
            break;
    }

    if ($entity === null) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return the text corresponding to the reference. The identifier may
    // already carry its own ";", so strip it before adding exactly one.
    return html_entity_decode('&'.rtrim($entity, ';').';', ENT_QUOTES, 'UTF-8');
}
1158
/**
 * Hand a token to the tree builder and update the tokenizer's content
 * model from the outcome.
 *
 * An integer returned by the tree builder becomes the new content model;
 * otherwise an end tag token resets the content model to PCDATA.
 *
 * @param array $token token to forward; its 'type' key is read here
 */
private function emitToken($token) {
    $result = $this->tree->emitToken($token);

    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1169
/**
 * Finish tokenizing: clear the current state and send an end-of-file
 * token straight to the tree builder.
 */
private function EOF() {
    $eof_token = array(
        'type' => self::EOF
    );

    $this->state = null;
    $this->tree->emitToken($eof_token);
}
1176 }
1177
1178 class HTML5TreeConstructer {
// Stack of open elements; the last entry is the current node.
public $stack = array();

// Current phase (one of the *_PHASE constants below).
private $phase;
// Current insertion mode used during the main phase.
private $mode;
// DOMDocument that is being built.
private $dom;
private $foster_parent = null;
// List of active formatting elements; may also contain self::MARKER.
private $a_formatting = array();

// The head element once it has been created, null before that.
private $head_pointer = null;
// The form element once one has been inserted, null before that.
private $form_pointer = null;

// Tag names belonging to the scoping, formatting and special groups.
private $scoping = array('button','caption','html','marquee','object','table','td','th');
private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
private $special = array('address','area','base','basefont','bgsound',
'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
'h6','head','hr','iframe','image','img','input','isindex','li','link',
'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
'option','p','param','plaintext','pre','script','select','spacer','style',
'tbody','textarea','tfoot','thead','title','tr','ul','wbr');

// The different phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
// Note: END_PHASE has the same numeric value as the IN_BODY insertion
// mode below, so the two must never be compared against the same variable.
const END_PHASE = 3;

// The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD = 1;
const AFTER_HEAD = 2;
const IN_BODY = 3;
const IN_TABLE = 4;
const IN_CAPTION = 5;
const IN_CGROUP = 6;
const IN_TBODY = 7;
const IN_ROW = 8;
const IN_CELL = 9;
const IN_SELECT = 10;
const AFTER_BODY = 11;
const IN_FRAME = 12;
const AFTR_FRAME = 13;

// The different types of elements.
const SPECIAL = 0;
const SCOPING = 1;
const FORMATTING = 2;
const PHRASING = 3;

// Sentinel stored in the list of active formatting elements.
const MARKER = 0;
1229
/**
 * Start in the initial phase with the "before head" insertion mode and
 * an empty UTF-8 DOM document to build into.
 */
public function __construct() {
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;

    $document = new DOMDocument();
    $this->dom = $document;

    // Document-level settings; strict error checking is switched off.
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;
}
1240
1241 // Process tag tokens
/**
 * Entry point for tokens coming from the tokenizer: forwards the token
 * to the handler of the current phase.
 *
 * @param array $token token to process
 * @return mixed whatever the phase handler returns; null when the
 *               current phase has no handler
 */
public function emitToken($token) {
    if ($this->phase == self::INIT_PHASE) {
        return $this->initPhase($token);
    }

    if ($this->phase == self::ROOT_PHASE) {
        return $this->rootElementPhase($token);
    }

    if ($this->phase == self::MAIN_PHASE) {
        return $this->mainPhase($token);
    }

    if ($this->phase == self::END_PHASE) {
        return $this->trailingEndPhase($token);
    }
}
1250
/**
 * Initial phase of tree construction: everything seen before the root
 * element phase starts.
 *
 * @param array $token token emitted by the tokenizer
 * @return mixed result of the root element phase when the token is
 *               passed on, null otherwise
 */
private function initPhase($token) {
    $whitespace_only = '/^[\t\n\x0b\x0c ]+$/';

    /* Tokens for which the specification defines no handling in this
    phase: a DOCTYPE token marked as being in error, a comment, a start
    tag, an end tag, an end-of-file token, or a character token that is
    not purely U+0009, U+000A, U+000B, U+000C or U+0020. */
    $hand_over =
        (isset($token['error']) && $token['error']) ||
        in_array($token['type'], array(
            HTML5::COMMENT,
            HTML5::STARTTAG,
            HTML5::ENDTAG,
            HTML5::EOF
        ), true) ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
            !preg_match($whitespace_only, $token['data']));

    if ($hand_over) {
        /* Switch to the root element phase and let it process this same
        token. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if (isset($token['error']) && !$token['error']) {
        /* A DocumentType object is created here, but this method does
        not attach it to the document. */
        $doctype = new DOMDocumentType(null, null, 'HTML');

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A character token consisting only of U+0009, U+000A, U+000B,
    U+000C or U+0020: append it to the Document node as text. */
    if (isset($token['data']) && preg_match($whitespace_only, $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
1300
/**
 * Root element phase: handles tokens until the html element has been
 * created, then hands over to the main phase.
 *
 * @param array $token token emitted by the tokenizer
 * @return mixed result of the main phase when the token is passed on,
 *               null otherwise
 */
private function rootElementPhase($token) {
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    /* A comment token: append a Comment node carrying the token's data
    to the Document object. */
    if ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    $is_text = ($type === HTML5::CHARACTR);

    /* A character token made up only of U+0009, U+000A, U+000B, U+000C
    or U+0020: append it to the Document node. */
    if ($is_text && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
        return;
    }

    /* Any other character token (whitespace-only ones were handled
    above), a start tag, an end tag or an end-of-file token. */
    if ($is_text || in_array($type, array(HTML5::STARTTAG, HTML5::ENDTAG, HTML5::EOF), true)) {
        /* Create the html element, append it to the Document object and
        make it the first entry of the stack of open elements. */
        $root = $this->dom->createElement('html');
        $this->dom->appendChild($root);
        $this->stack[] = $root;

        /* Switch to the main phase and reprocess the current token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }
}
1347
/**
 * Main phase of tree construction.
 *
 * DOCTYPE tokens, "html" start tags and end-of-file tokens are handled
 * here; every other token is forwarded to the handler of the current
 * insertion mode.
 *
 * The previous mode switch also listed self::END_PHASE, which is a phase
 * constant rather than an insertion mode and has the same value as
 * self::IN_BODY. Because IN_BODY was listed first, that case could never
 * be selected; it has been removed.
 *
 * @param array $token token emitted by the tokenizer
 * @return mixed result of the insertion mode handler, or null
 */
private function mainPhase($token) {
    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.
        return;
    }

    /* A start tag token with the tag name "html" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }
        return;
    }

    /* An end-of-file token */
    if ($token['type'] === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();
        return;
    }

    /* Anything else: depends on the insertion mode. */
    $handlers = array(
        self::BEFOR_HEAD => 'beforeHead',
        self::IN_HEAD    => 'inHead',
        self::AFTER_HEAD => 'afterHead',
        self::IN_BODY    => 'inBody',
        self::IN_TABLE   => 'inTable',
        self::IN_CAPTION => 'inCaption',
        self::IN_CGROUP  => 'inColumnGroup',
        self::IN_TBODY   => 'inTableBody',
        self::IN_ROW     => 'inRow',
        self::IN_CELL    => 'inCell',
        self::IN_SELECT  => 'inSelect',
        self::AFTER_BODY => 'afterBody',
        self::IN_FRAME   => 'inFrameset',
        self::AFTR_FRAME => 'afterFrameset'
    );

    if (isset($handlers[$this->mode])) {
        $handler = $handlers[$this->mode];
        return $this->$handler($token);
    }
}
1397
/**
 * "Before head" insertion mode.
 *
 * The whitespace test on character tokens now uses the same pattern in
 * both places; previously the second test lacked the "+" quantifier that
 * every other whitespace check in this class uses.
 *
 * @param array $token token to process
 * @return mixed result of the "in head" handler when the token is
 *               reprocessed there, null otherwise
 */
private function beforeHead($token) {
    $whitespace_only = '/^[\t\n\x0b\x0c ]+$/';

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
    preg_match($whitespace_only, $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token with the tag name "head" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
        /* Create an element for the token, append the new element to the
        current node and push it onto the stack of open elements. */
        $element = $this->insertElement($token);

        /* Set the head element pointer to this new element node. */
        $this->head_pointer = $element;

        /* Change the insertion mode to "in head". */
        $this->mode = self::IN_HEAD;

    /* Any other start tag token. Or an end tag with the tag name "html".
    Or a character token that is not made up solely of U+0009 CHARACTER
    TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM
    FEED (FF), or U+0020 SPACE. */
    } elseif ($token['type'] === HTML5::STARTTAG ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
    ($token['type'] === HTML5::CHARACTR &&
    !preg_match($whitespace_only, $token['data']))) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $this->beforeHead(array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inHead($token);

    /* Any other end tag */
    } elseif ($token['type'] === HTML5::ENDTAG) {
        /* Parse error. Ignore the token. */
    }
}
1451
/**
 * "In head" insertion mode.
 *
 * Changes compared with the previous implementation:
 *  - a "script" start tag no longer dereferences a null head element
 *    pointer; it falls back to the current node exactly as "title" and
 *    "style" already did;
 *  - "base", "link" and "meta" elements are removed from the stack of
 *    open elements on both paths, not only when the head element
 *    pointer is set.
 *
 * @param array $token token to process
 * @return mixed a content model constant for the tokenizer (PCDATA,
 *               RCDATA or CDATA), the result of the "after head"
 *               handler when the token is reprocessed there, or null
 */
private function inHead($token) {
    $raw_text_tags = array('title', 'style', 'script');

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if ($token['type'] === HTML5::CHARACTR && (
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
    in_array(end($this->stack)->nodeName, $raw_text_tags, true))) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag for "title", "style" or "script": close the element and
    put the tokenizer back into the PCDATA content model. */
    } elseif ($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], $raw_text_tags, true)) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        $this->insertHeadChild($token);

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" or "script" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'style' || $token['name'] === 'script')) {
        $this->insertHeadChild($token);

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('base', 'link', 'meta'), true)) {
        $this->insertHeadChild($token);

        /* These elements have no content, so they must not stay on the
        stack of open elements. */
        array_pop($this->stack);

    /* An end tag with the tag name "head" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise, this is a parse error. */
        if ($this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            ));

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}

/**
 * Create an element for the token and append it to the node pointed to
 * by the head element pointer, or, if that is null (innerHTML case), to
 * the current node.
 *
 * @param array $token start tag token
 * @return mixed the value returned by insertElement()
 */
private function insertHeadChild($token) {
    if ($this->head_pointer !== null) {
        $element = $this->insertElement($token, false);
        $this->head_pointer->appendChild($element);

    } else {
        $element = $this->insertElement($token);
    }

    return $element;
}
1575
/**
 * "After head" insertion mode.
 *
 * @param array $token token to process
 * @return mixed result of the "in head" or "in body" handler when the
 *               token is reprocessed there, null otherwise
 */
private function afterHead($token) {
    $type = $token['type'];

    /* A character token made up only of U+0009, U+000A, U+000B, U+000C
    or U+0020: append it to the current node. */
    if ($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node with the token's data to the
    current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if ($type === HTML5::STARTTAG) {
        /* "body": insert the element and continue "in body". */
        if ($token['name'] === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        /* "frameset": insert the element and continue "in frameset". */
        if ($token['name'] === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        /* "base", "link", "meta", "script", "style", "title": parse
        error. Switch the insertion mode back to "in head" and reprocess
        the token. */
        if (in_array($token['name'],
        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current
    token. */
    $implied_body = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($implied_body);

    return $this->inBody($token);
}
1631
1632 private function inBody($token) {
1633 /* Handle the token as follows: */
1634
1635 switch($token['type']) {
1636 /* A character token */
1637 case HTML5::CHARACTR:
1638 /* Reconstruct the active formatting elements, if any. */
1639 $this->reconstructActiveFormattingElements();
1640
1641 /* Append the token's character to the current node. */
1642 $this->insertText($token['data']);
1643 break;
1644
1645 /* A comment token */
1646 case HTML5::COMMENT:
1647 /* Append a Comment node to the current node with the data
1648 attribute set to the data given in the comment token. */
1649 $this->insertComment($token['data']);
1650 break;
1651
1652 case HTML5::STARTTAG:
1653 switch($token['name']) {
1654 /* A start tag token whose tag name is one of: "script",
1655 "style" */
1656 case 'script': case 'style':
1657 /* Process the token as if the insertion mode had been "in
1658 head". */
1659 return $this->inHead($token);
1660 break;
1661
1662 /* A start tag token whose tag name is one of: "base", "link",
1663 "meta", "title" */
1664 case 'base': case 'link': case 'meta': case 'title':
1665 /* Parse error. Process the token as if the insertion mode
1666 had been "in head". */
1667 return $this->inHead($token);
1668 break;
1669
1670 /* A start tag token with the tag name "body" */
1671 case 'body':
1672 /* Parse error. If the second element on the stack of open
1673 elements is not a body element, or, if the stack of open
1674 elements has only one node on it, then ignore the token.
1675 (innerHTML case) */
1676 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1677 // Ignore
1678
1679 /* Otherwise, for each attribute on the token, check to see
1680 if the attribute is already present on the body element (the
1681 second element) on the stack of open elements. If it is not,
1682 add the attribute and its corresponding value to that
1683 element. */
1684 } else {
1685 foreach($token['attr'] as $attr) {
1686 if(!$this->stack[1]->hasAttribute($attr['name'])) {
1687 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1688 }
1689 }
1690 }
1691 break;
1692
1693 /* A start tag whose tag name is one of: "address",
1694 "blockquote", "center", "dir", "div", "dl", "fieldset",
1695 "listing", "menu", "ol", "p", "ul" */
1696 case 'address': case 'blockquote': case 'center': case 'dir':
1697 case 'div': case 'dl': case 'fieldset': case 'listing':
1698 case 'menu': case 'ol': case 'p': case 'ul':
1699 /* If the stack of open elements has a p element in scope,
1700 then act as if an end tag with the tag name p had been
1701 seen. */
1702 if($this->elementInScope('p')) {
1703 $this->emitToken(array(
1704 'name' => 'p',
1705 'type' => HTML5::ENDTAG
1706 ));
1707 }
1708
1709 /* Insert an HTML element for the token. */
1710 $this->insertElement($token);
1711 break;
1712
1713 /* A start tag whose tag name is "form" */
1714 case 'form':
1715 /* If the form element pointer is not null, ignore the
1716 token with a parse error. */
1717 if($this->form_pointer !== null) {
1718 // Ignore.
1719
1720 /* Otherwise: */
1721 } else {
1722 /* If the stack of open elements has a p element in
1723 scope, then act as if an end tag with the tag name p
1724 had been seen. */
1725 if($this->elementInScope('p')) {
1726 $this->emitToken(array(
1727 'name' => 'p',
1728 'type' => HTML5::ENDTAG
1729 ));
1730 }
1731
1732 /* Insert an HTML element for the token, and set the
1733 form element pointer to point to the element created. */
1734 $element = $this->insertElement($token);
1735 $this->form_pointer = $element;
1736 }
1737 break;
1738
1739 /* A start tag whose tag name is "li", "dd" or "dt" */
1740 case 'li': case 'dd': case 'dt':
1741 /* If the stack of open elements has a p element in scope,
1742 then act as if an end tag with the tag name p had been
1743 seen. */
1744 if($this->elementInScope('p')) {
1745 $this->emitToken(array(
1746 'name' => 'p',
1747 'type' => HTML5::ENDTAG
1748 ));
1749 }
1750
1751 $stack_length = count($this->stack) - 1;
1752
1753 for($n = $stack_length; 0 <= $n; $n--) {
1754 /* 1. Initialise node to be the current node (the
1755 bottommost node of the stack). */
1756 $stop = false;
1757 $node = $this->stack[$n];
1758 $cat = $this->getElementCategory($node->tagName);
1759
1760 /* 2. If node is an li, dd or dt element, then pop all
1761 the nodes from the current node up to node, including
1762 node, then stop this algorithm. */
1763 if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1764 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1765 for($x = $stack_length; $x >= $n ; $x--) {
1766 array_pop($this->stack);
1767 }
1768
1769 break;
1770 }
1771
1772 /* 3. If node is not in the formatting category, and is
1773 not in the phrasing category, and is not an address or
1774 div element, then stop this algorithm. */
1775 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1776 $node->tagName !== 'address' && $node->tagName !== 'div') {
1777 break;
1778 }
1779 }
1780
1781 /* Finally, insert an HTML element with the same tag
1782 name as the token's. */
1783 $this->insertElement($token);
1784 break;
1785
1786 /* A start tag token whose tag name is "plaintext" */
1787 case 'plaintext':
1788 /* If the stack of open elements has a p element in scope,
1789 then act as if an end tag with the tag name p had been
1790 seen. */
1791 if($this->elementInScope('p')) {
1792 $this->emitToken(array(
1793 'name' => 'p',
1794 'type' => HTML5::ENDTAG
1795 ));
1796 }
1797
1798 /* Insert an HTML element for the token. */
1799 $this->insertElement($token);
1800
1801 return HTML5::PLAINTEXT;
1802 break;
1803
1804 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1805 "h5", "h6" */
1806 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1807 /* If the stack of open elements has a p element in scope,
1808 then act as if an end tag with the tag name p had been seen. */
1809 if($this->elementInScope('p')) {
1810 $this->emitToken(array(
1811 'name' => 'p',
1812 'type' => HTML5::ENDTAG
1813 ));
1814 }
1815
1816 /* If the stack of open elements has in scope an element whose
1817 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1818 this is a parse error; pop elements from the stack until an
1819 element with one of those tag names has been popped from the
1820 stack. */
1821 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1822 array_pop($this->stack);
1823 }
1824
1825 /* Insert an HTML element for the token. */
1826 $this->insertElement($token);
1827 break;
1828
1829 /* A start tag whose tag name is "a" */
1830 case 'a':
1831 /* If the list of active formatting elements contains
1832 an element whose tag name is "a" between the end of the
1833 list and the last marker on the list (or the start of
1834 the list if there is no marker on the list), then this
1835 is a parse error; act as if an end tag with the tag name
1836 "a" had been seen, then remove that element from the list
1837 of active formatting elements and the stack of open
1838 elements if the end tag didn't already remove it (it
1839 might not have if the element is not in table scope). */
1840 $leng = count($this->a_formatting);
1841
1842 for($n = $leng - 1; $n >= 0; $n--) {
1843 if($this->a_formatting[$n] === self::MARKER) {
1844 break;
1845
1846 } elseif($this->a_formatting[$n]->nodeName === 'a') {
1847 $this->emitToken(array(
1848 'name' => 'a',
1849 'type' => HTML5::ENDTAG
1850 ));
1851 break;
1852 }
1853 }
1854
1855 /* Reconstruct the active formatting elements, if any. */
1856 $this->reconstructActiveFormattingElements();
1857
1858 /* Insert an HTML element for the token. */
1859 $el = $this->insertElement($token);
1860
1861 /* Add that element to the list of active formatting
1862 elements. */
1863 $this->a_formatting[] = $el;
1864 break;
1865
1866 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1867 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1868 case 'b': case 'big': case 'em': case 'font': case 'i':
1869 case 'nobr': case 's': case 'small': case 'strike':
1870 case 'strong': case 'tt': case 'u':
1871 /* Reconstruct the active formatting elements, if any. */
1872 $this->reconstructActiveFormattingElements();
1873
1874 /* Insert an HTML element for the token. */
1875 $el = $this->insertElement($token);
1876
1877 /* Add that element to the list of active formatting
1878 elements. */
1879 $this->a_formatting[] = $el;
1880 break;
1881
1882 /* A start tag token whose tag name is "button" */
1883 case 'button':
1884 /* If the stack of open elements has a button element in scope,
1885 then this is a parse error; act as if an end tag with the tag
1886 name "button" had been seen, then reprocess the token. (We don't
1887 do that. Unnecessary.) */
1888 if($this->elementInScope('button')) {
1889 $this->inBody(array(
1890 'name' => 'button',
1891 'type' => HTML5::ENDTAG
1892 ));
1893 }
1894
1895 /* Reconstruct the active formatting elements, if any. */
1896 $this->reconstructActiveFormattingElements();
1897
1898 /* Insert an HTML element for the token. */
1899 $this->insertElement($token);
1900
1901 /* Insert a marker at the end of the list of active
1902 formatting elements. */
1903 $this->a_formatting[] = self::MARKER;
1904 break;
1905
1906 /* A start tag token whose tag name is one of: "marquee", "object" */
1907 case 'marquee': case 'object':
1908 /* Reconstruct the active formatting elements, if any. */
1909 $this->reconstructActiveFormattingElements();
1910
1911 /* Insert an HTML element for the token. */
1912 $this->insertElement($token);
1913
1914 /* Insert a marker at the end of the list of active
1915 formatting elements. */
1916 $this->a_formatting[] = self::MARKER;
1917 break;
1918
1919 /* A start tag token whose tag name is "xmp" */
1920 case 'xmp':
1921 /* Reconstruct the active formatting elements, if any. */
1922 $this->reconstructActiveFormattingElements();
1923
1924 /* Insert an HTML element for the token. */
1925 $this->insertElement($token);
1926
1927 /* Switch the content model flag to the CDATA state. */
1928 return HTML5::CDATA;
1929 break;
1930
1931 /* A start tag whose tag name is "table" */
1932 case 'table':
1933 /* If the stack of open elements has a p element in scope,
1934 then act as if an end tag with the tag name p had been seen. */
1935 if($this->elementInScope('p')) {
1936 $this->emitToken(array(
1937 'name' => 'p',
1938 'type' => HTML5::ENDTAG
1939 ));
1940 }
1941
1942 /* Insert an HTML element for the token. */
1943 $this->insertElement($token);
1944
1945 /* Change the insertion mode to "in table". */
1946 $this->mode = self::IN_TABLE;
1947 break;
1948
1949 /* A start tag whose tag name is one of: "area", "basefont",
1950 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1951 case 'area': case 'basefont': case 'bgsound': case 'br':
1952 case 'embed': case 'img': case 'param': case 'spacer':
1953 case 'wbr':
1954 /* Reconstruct the active formatting elements, if any. */
1955 $this->reconstructActiveFormattingElements();
1956
1957 /* Insert an HTML element for the token. */
1958 $this->insertElement($token);
1959
1960 /* Immediately pop the current node off the stack of open elements. */
1961 array_pop($this->stack);
1962 break;
1963
1964 /* A start tag whose tag name is "hr" */
1965 case 'hr':
1966 /* If the stack of open elements has a p element in scope,
1967 then act as if an end tag with the tag name p had been seen. */
1968 if($this->elementInScope('p')) {
1969 $this->emitToken(array(
1970 'name' => 'p',
1971 'type' => HTML5::ENDTAG
1972 ));
1973 }
1974
1975 /* Insert an HTML element for the token. */
1976 $this->insertElement($token);
1977
1978 /* Immediately pop the current node off the stack of open elements. */
1979 array_pop($this->stack);
1980 break;
1981
1982 /* A start tag whose tag name is "image" */
1983 case 'image':
1984 /* Parse error. Change the token's tag name to "img" and
1985 reprocess it. (Don't ask.) */
1986 $token['name'] = 'img';
1987 return $this->inBody($token);
1988 break;
1989
1990 /* A start tag whose tag name is "input" */
1991 case 'input':
1992 /* Reconstruct the active formatting elements, if any. */
1993 $this->reconstructActiveFormattingElements();
1994
1995 /* Insert an input element for the token. */
1996 $element = $this->insertElement($token, false);
1997
1998 /* If the form element pointer is not null, then associate the
1999 input element with the form element pointed to by the form
2000 element pointer. */
2001 $this->form_pointer !== null
2002 ? $this->form_pointer->appendChild($element)
2003 : end($this->stack)->appendChild($element);
2004
2005 /* Pop that input element off the stack of open elements. */
2006 array_pop($this->stack);
2007 break;
2008
2009 /* A start tag whose tag name is "isindex" */
2010 case 'isindex':
2011 /* Parse error. */
2012 // w/e
2013
2014 /* If the form element pointer is not null,
2015 then ignore the token. */
2016 if($this->form_pointer === null) {
2017 /* Act as if a start tag token with the tag name "form" had
2018 been seen. */
2019 $this->inBody(array(
2020 'name' => 'body',
2021 'type' => HTML5::STARTTAG,
2022 'attr' => array()
2023 ));
2024
2025 /* Act as if a start tag token with the tag name "hr" had
2026 been seen. */
2027 $this->inBody(array(
2028 'name' => 'hr',
2029 'type' => HTML5::STARTTAG,
2030 'attr' => array()
2031 ));
2032
2033 /* Act as if a start tag token with the tag name "p" had
2034 been seen. */
2035 $this->inBody(array(
2036 'name' => 'p',
2037 'type' => HTML5::STARTTAG,
2038 'attr' => array()
2039 ));
2040
2041 /* Act as if a start tag token with the tag name "label"
2042 had been seen. */
2043 $this->inBody(array(
2044 'name' => 'label',
2045 'type' => HTML5::STARTTAG,
2046 'attr' => array()
2047 ));
2048
2049 /* Act as if a stream of character tokens had been seen. */
2050 $this->insertText('This is a searchable index. '.
2051 'Insert your search keywords here: ');
2052
2053 /* Act as if a start tag token with the tag name "input"
2054 had been seen, with all the attributes from the "isindex"
2055 token, except with the "name" attribute set to the value
2056 "isindex" (ignoring any explicit "name" attribute). */
2057 $attr = $token['attr'];
2058 $attr[] = array('name' => 'name', 'value' => 'isindex');
2059
2060 $this->inBody(array(
2061 'name' => 'input',
2062 'type' => HTML5::STARTTAG,
2063 'attr' => $attr
2064 ));
2065
2066 /* Act as if a stream of character tokens had been seen
2067 (see below for what they should say). */
2068 $this->insertText('This is a searchable index. '.
2069 'Insert your search keywords here: ');
2070
2071 /* Act as if an end tag token with the tag name "label"
2072 had been seen. */
2073 $this->inBody(array(
2074 'name' => 'label',
2075 'type' => HTML5::ENDTAG
2076 ));
2077
2078 /* Act as if an end tag token with the tag name "p" had
2079 been seen. */
2080 $this->inBody(array(
2081 'name' => 'p',
2082 'type' => HTML5::ENDTAG
2083 ));
2084
2085 /* Act as if a start tag token with the tag name "hr" had
2086 been seen. */
2087 $this->inBody(array(
2088 'name' => 'hr',
2089 'type' => HTML5::ENDTAG
2090 ));
2091
2092 /* Act as if an end tag token with the tag name "form" had
2093 been seen. */
2094 $this->inBody(array(
2095 'name' => 'form',
2096 'type' => HTML5::ENDTAG
2097 ));
2098 }
2099 break;
2100
2101 /* A start tag whose tag name is "textarea" */
2102 case 'textarea':
2103 $this->insertElement($token);
2104
2105 /* Switch the tokeniser's content model flag to the
2106 RCDATA state. */
2107 return HTML5::RCDATA;
2108 break;
2109
2110 /* A start tag whose tag name is one of: "iframe", "noembed",
2111 "noframes" */
2112 case 'iframe': case 'noembed': case 'noframes':
2113 $this->insertElement($token);
2114
2115 /* Switch the tokeniser's content model flag to the CDATA state. */
2116 return HTML5::CDATA;
2117 break;
2118
2119 /* A start tag whose tag name is "select" */
2120 case 'select':
2121 /* Reconstruct the active formatting elements, if any. */
2122 $this->reconstructActiveFormattingElements();
2123
2124 /* Insert an HTML element for the token. */
2125 $this->insertElement($token);
2126
2127 /* Change the insertion mode to "in select". */
2128 $this->mode = self::IN_SELECT;
2129 break;
2130
2131 /* A start or end tag whose tag name is one of: "caption", "col",
2132 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2133 "tbody", "td", "tfoot", "th", "thead", "tr". */
2134 case 'caption': case 'col': case 'colgroup': case 'frame':
2135 case 'frameset': case 'head': case 'option': case 'optgroup':
2136 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2137 case 'tr':
2138 // Parse error. Ignore the token.
2139 break;
2140
2141 /* A start or end tag whose tag name is one of: "event-source",
2142 "section", "nav", "article", "aside", "header", "footer",
2143 "datagrid", "command" */
2144 case 'event-source': case 'section': case 'nav': case 'article':
2145 case 'aside': case 'header': case 'footer': case 'datagrid':
2146 case 'command':
2147 // Work in progress!
2148 break;
2149
2150 /* A start tag token not covered by the previous entries */
2151 default:
2152 /* Reconstruct the active formatting elements, if any. */
2153 $this->reconstructActiveFormattingElements();
2154
2155 $this->insertElement($token, true, true);
2156 break;
2157 }
2158 break;
2159
2160 case HTML5::ENDTAG:
2161 switch($token['name']) {
2162 /* An end tag with the tag name "body" */
2163 case 'body':
2164 /* If the second element in the stack of open elements is
2165 not a body element, this is a parse error. Ignore the token.
2166 (innerHTML case) */
2167 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2168 // Ignore.
2169
2170 /* If the current node is not the body element, then this
2171 is a parse error. */
2172 } elseif(end($this->stack)->nodeName !== 'body') {
2173 // Parse error.
2174 }
2175
2176 /* Change the insertion mode to "after body". */
2177 $this->mode = self::AFTER_BODY;
2178 break;
2179
2180 /* An end tag with the tag name "html" */
2181 case 'html':
2182 /* Act as if an end tag with tag name "body" had been seen,
2183 then, if that token wasn't ignored, reprocess the current
2184 token. */
2185 $this->inBody(array(
2186 'name' => 'body',
2187 'type' => HTML5::ENDTAG
2188 ));
2189
2190 return $this->afterBody($token);
2191 break;
2192
2193 /* An end tag whose tag name is one of: "address", "blockquote",
2194 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2195 "ol", "pre", "ul" */
2196 case 'address': case 'blockquote': case 'center': case 'dir':
2197 case 'div': case 'dl': case 'fieldset': case 'listing':
2198 case 'menu': case 'ol': case 'pre': case 'ul':
2199 /* If the stack of open elements has an element in scope
2200 with the same tag name as that of the token, then generate
2201 implied end tags. */
2202 if($this->elementInScope($token['name'])) {
2203 $this->generateImpliedEndTags();
2204
2205 /* Now, if the current node is not an element with
2206 the same tag name as that of the token, then this
2207 is a parse error. */
2208 // w/e
2209
2210 /* If the stack of open elements has an element in
2211 scope with the same tag name as that of the token,
2212 then pop elements from this stack until an element
2213 with that tag name has been popped from the stack. */
2214 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2215 if($this->stack[$n]->nodeName === $token['name']) {
2216 $n = -1;
2217 }
2218
2219 array_pop($this->stack);
2220 }
2221 }
2222 break;
2223
2224 /* An end tag whose tag name is "form" */
2225 case 'form':
2226 /* If the stack of open elements has an element in scope
2227 with the same tag name as that of the token, then generate
2228 implied end tags. */
2229 if($this->elementInScope($token['name'])) {
2230 $this->generateImpliedEndTags();
2231
2232 }
2233
2234 if(end($this->stack)->nodeName !== $token['name']) {
2235 /* Now, if the current node is not an element with the
2236 same tag name as that of the token, then this is a parse
2237 error. */
2238 // w/e
2239
2240 } else {
2241 /* Otherwise, if the current node is an element with
2242 the same tag name as that of the token pop that element
2243 from the stack. */
2244 array_pop($this->stack);
2245 }
2246
2247 /* In any case, set the form element pointer to null. */
2248 $this->form_pointer = null;
2249 break;
2250
2251 /* An end tag whose tag name is "p" */
2252 case 'p':
2253 /* If the stack of open elements has a p element in scope,
2254 then generate implied end tags, except for p elements. */
2255 if($this->elementInScope('p')) {
2256 $this->generateImpliedEndTags(array('p'));
2257
2258 /* If the current node is not a p element, then this is
2259 a parse error. */
2260 // k
2261
2262 /* If the stack of open elements has a p element in
2263 scope, then pop elements from this stack until the stack
2264 no longer has a p element in scope. */
2265 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2266 if($this->elementInScope('p')) {
2267 array_pop($this->stack);
2268
2269 } else {
2270 break;
2271 }
2272 }
2273 }
2274 break;
2275
2276 /* An end tag whose tag name is "dd", "dt", or "li" */
2277 case 'dd': case 'dt': case 'li':
2278 /* If the stack of open elements has an element in scope
2279 whose tag name matches the tag name of the token, then
2280 generate implied end tags, except for elements with the
2281 same tag name as the token. */
2282 if($this->elementInScope($token['name'])) {
2283 $this->generateImpliedEndTags(array($token['name']));
2284
2285 /* If the current node is not an element with the same
2286 tag name as the token, then this is a parse error. */
2287 // w/e
2288
2289 /* If the stack of open elements has an element in scope
2290 whose tag name matches the tag name of the token, then
2291 pop elements from this stack until an element with that
2292 tag name has been popped from the stack. */
2293 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2294 if($this->stack[$n]->nodeName === $token['name']) {
2295 $n = -1;
2296 }
2297
2298 array_pop($this->stack);
2299 }
2300 }
2301 break;
2302
2303 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2304 "h5", "h6" */
2305 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2306 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2307
2308 /* If the stack of open elements has in scope an element whose
2309 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2310 generate implied end tags. */
2311 if($this->elementInScope($elements)) {
2312 $this->generateImpliedEndTags();
2313
2314 /* Now, if the current node is not an element with the same
2315 tag name as that of the token, then this is a parse error. */
2316 // w/e
2317
2318 /* If the stack of open elements has in scope an element
2319 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2320 "h6", then pop elements from the stack until an element
2321 with one of those tag names has been popped from the stack. */
2322 while($this->elementInScope($elements)) {
2323 array_pop($this->stack);
2324 }
2325 }
2326 break;
2327
2328 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2329 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2330 case 'a': case 'b': case 'big': case 'em': case 'font':
2331 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2332 case 'strong': case 'tt': case 'u':
2333 /* 1. Let the formatting element be the last element in
2334 the list of active formatting elements that:
2335 * is between the end of the list and the last scope
2336 marker in the list, if any, or the start of the list
2337 otherwise, and
2338 * has the same tag name as the token.
2339 */
2340 while(true) {
2341 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2342 if($this->a_formatting[$a] === self::MARKER) {
2343 break;
2344
2345 } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2346 $formatting_element = $this->a_formatting[$a];
2347 $in_stack = in_array($formatting_element, $this->stack, true);
2348 $fe_af_pos = $a;
2349 break;
2350 }
2351 }
2352
2353 /* If there is no such node, or, if that node is
2354 also in the stack of open elements but the element
2355 is not in scope, then this is a parse error. Abort
2356 these steps. The token is ignored. */
2357 if(!isset($formatting_element) || ($in_stack &&
2358 !$this->elementInScope($token['name']))) {
2359 break;
2360
2361 /* Otherwise, if there is such a node, but that node
2362 is not in the stack of open elements, then this is a
2363 parse error; remove the element from the list, and
2364 abort these steps. */
2365 } elseif(isset($formatting_element) && !$in_stack) {
2366 unset($this->a_formatting[$fe_af_pos]);
2367 $this->a_formatting = array_merge($this->a_formatting);
2368 break;
2369 }
2370
2371 /* 2. Let the furthest block be the topmost node in the
2372 stack of open elements that is lower in the stack
2373 than the formatting element, and is not an element in
2374 the phrasing or formatting categories. There might
2375 not be one. */
2376 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2377 $length = count($this->stack);
2378
2379 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2380 $category = $this->getElementCategory($this->stack[$s]->nodeName);
2381
2382 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2383 $furthest_block = $this->stack[$s];
2384 }
2385 }
2386
2387 /* 3. If there is no furthest block, then the UA must
2388 skip the subsequent steps and instead just pop all
2389 the nodes from the bottom of the stack of open
2390 elements, from the current node up to the formatting
2391 element, and remove the formatting element from the
2392 list of active formatting elements. */
2393 if(!isset($furthest_block)) {
2394 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2395 array_pop($this->stack);
2396 }
2397
2398 unset($this->a_formatting[$fe_af_pos]);
2399 $this->a_formatting = array_merge($this->a_formatting);
2400 break;
2401 }
2402
2403 /* 4. Let the common ancestor be the element
2404 immediately above the formatting element in the stack
2405 of open elements. */
2406 $common_ancestor = $this->stack[$fe_s_pos - 1];
2407
2408 /* 5. If the furthest block has a parent node, then
2409 remove the furthest block from its parent node. */
2410 if($furthest_block->parentNode !== null) {
2411 $furthest_block->parentNode->removeChild($furthest_block);
2412 }
2413
2414 /* 6. Let a bookmark note the position of the
2415 formatting element in the list of active formatting
2416 elements relative to the elements on either side
2417 of it in the list. */
2418 $bookmark = $fe_af_pos;
2419
2420 /* 7. Let node and last node be the furthest block.
2421 Follow these steps: */
2422 $node = $furthest_block;
2423 $last_node = $furthest_block;
2424
2425 while(true) {
2426 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2427 /* 7.1 Let node be the element immediately
2428 prior to node in the stack of open elements. */
2429 $node = $this->stack[$n];
2430
2431 /* 7.2 If node is not in the list of active
2432 formatting elements, then remove node from
2433 the stack of open elements and then go back
2434 to step 1. */
2435 if(!in_array($node, $this->a_formatting, true)) {
2436 unset($this->stack[$n]);
2437 $this->stack = array_merge($this->stack);
2438
2439 } else {
2440 break;
2441 }
2442 }
2443
2444 /* 7.3 Otherwise, if node is the formatting
2445 element, then go to the next step in the overall
2446 algorithm. */
2447 if($node === $formatting_element) {
2448 break;
2449
2450 /* 7.4 Otherwise, if last node is the furthest
2451 block, then move the aforementioned bookmark to
2452 be immediately after the node in the list of
2453 active formatting elements. */
2454 } elseif($last_node === $furthest_block) {
2455 $bookmark = array_search($node, $this->a_formatting, true) + 1;
2456 }
2457
2458 /* 7.5 If node has any children, perform a
2459 shallow clone of node, replace the entry for
2460 node in the list of active formatting elements
2461 with an entry for the clone, replace the entry
2462 for node in the stack of open elements with an
2463 entry for the clone, and let node be the clone. */
2464 if($node->hasChildNodes()) {
2465 $clone = $node->cloneNode();
2466 $s_pos = array_search($node, $this->stack, true);
2467 $a_pos = array_search($node, $this->a_formatting, true);
2468
2469 $this->stack[$s_pos] = $clone;
2470 $this->a_formatting[$a_pos] = $clone;
2471 $node = $clone;
2472 }
2473
2474 /* 7.6 Insert last node into node, first removing
2475 it from its previous parent node if any. */
2476 if($last_node->parentNode !== null) {
2477 $last_node->parentNode->removeChild($last_node);
2478 }
2479
2480 $node->appendChild($last_node);
2481
2482 /* 7.7 Let last node be node. */
2483 $last_node = $node;
2484 }
2485
2486 /* 8. Insert whatever last node ended up being in
2487 the previous step into the common ancestor node,
2488 first removing it from its previous parent node if
2489 any. */
2490 if($last_node->parentNode !== null) {
2491 $last_node->parentNode->removeChild($last_node);
2492 }
2493
2494 $common_ancestor->appendChild($last_node);
2495
2496 /* 9. Perform a shallow clone of the formatting
2497 element. */
2498 $clone = $formatting_element->cloneNode();
2499
2500 /* 10. Take all of the child nodes of the furthest
2501 block and append them to the clone created in the
2502 last step. */
2503 while($furthest_block->hasChildNodes()) {
2504 $child = $furthest_block->firstChild;
2505 $furthest_block->removeChild($child);
2506 $clone->appendChild($child);
2507 }
2508
2509 /* 11. Append that clone to the furthest block. */
2510 $furthest_block->appendChild($clone);
2511
2512 /* 12. Remove the formatting element from the list
2513 of active formatting elements, and insert the clone
2514 into the list of active formatting elements at the
2515 position of the aforementioned bookmark. */
2516 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2517 unset($this->a_formatting[$fe_af_pos]);
2518 $this->a_formatting = array_merge($this->a_formatting);
2519
2520 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2521 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2522 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2523
2524 /* 13. Remove the formatting element from the stack
2525 of open elements, and insert the clone into the stack
2526 of open elements immediately after (i.e. in a more
2527 deeply nested position than) the position of the
2528 furthest block in that stack. */
2529 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2530 $fb_s_pos = array_search($furthest_block, $this->stack, true);
2531 unset($this->stack[$fe_s_pos]);
2532
2533 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2534 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2535 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2536
2537 /* 14. Jump back to step 1 in this series of steps. */
2538 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2539 }
2540 break;
2541
2542 /* An end tag token whose tag name is one of: "button",
2543 "marquee", "object" */
2544 case 'button': case 'marquee': case 'object':
2545 /* If the stack of open elements has an element in scope whose
2546 tag name matches the tag name of the token, then generate implied
2547 tags. */
2548 if($this->elementInScope($token['name'])) {
2549 $this->generateImpliedEndTags();
2550
2551 /* Now, if the current node is not an element with the same
2552 tag name as the token, then this is a parse error. */
2553 // k
2554
2555 /* Now, if the stack of open elements has an element in scope
2556 whose tag name matches the tag name of the token, then pop
2557 elements from the stack until that element has been popped from
2558 the stack, and clear the list of active formatting elements up
2559 to the last marker. */
2560 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2561 if($this->stack[$n]->nodeName === $token['name']) {
2562 $n = -1;
2563 }
2564
2565 array_pop($this->stack);
2566 }
2567
2568 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2569
2570 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2571 array_pop($this->a_formatting);
2572 }
2573 }
2574 break;
2575
2576 /* Or an end tag whose tag name is one of: "area", "basefont",
2577 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2578 "input", "isindex", "noembed", "noframes", "param", "select",
2579 "spacer", "table", "textarea", "wbr" */
2580 case 'area': case 'basefont': case 'bgsound': case 'br':
2581 case 'embed': case 'hr': case 'iframe': case 'image':
2582 case 'img': case 'input': case 'isindex': case 'noembed':
2583 case 'noframes': case 'param': case 'select': case 'spacer':
2584 case 'table': case 'textarea': case 'wbr':
2585 // Parse error. Ignore the token.
2586 break;
2587
2588 /* An end tag token not covered by the previous entries */
2589 default:
2590 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2591 /* Initialise node to be the current node (the bottommost
2592 node of the stack). */
2593 $node = end($this->stack);
2594
2595 /* If node has the same tag name as the end tag token,
2596 then: */
2597 if($token['name'] === $node->nodeName) {
2598 /* Generate implied end tags. */
2599 $this->generateImpliedEndTags();
2600
2601 /* If the tag name of the end tag token does not
2602 match the tag name of the current node, this is a
2603 parse error. */
2604 // k
2605
2606 /* Pop all the nodes from the current node up to
2607 node, including node, then stop this algorithm. */
2608 for($x = count($this->stack) - $n; $x >= $n; $x--) {
2609 array_pop($this->stack);
2610 }
2611
2612 } else {
2613 $category = $this->getElementCategory($node);
2614
2615 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2616 /* Otherwise, if node is in neither the formatting
2617 category nor the phrasing category, then this is a
2618 parse error. Stop this algorithm. The end tag token
2619 is ignored. */
2620 return false;
2621 }
2622 }
2623 }
2624 break;
2625 }
2626 break;
2627 }
2628 }
2629
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Table-structure tokens (caption, colgroup, col, tbody/tfoot/thead,
 * td/th/tr, table) are handled here or handed to the matching table
 * sub-mode. Every other token is a parse error and is processed with the
 * "in body" rules, after selecting a foster parent when the current node
 * is a table-structure element.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start/end tags and 'data' for character and
 *                     comment tokens.
 * @return mixed false when an unmatched "table" end tag (or a nested
 *               "table" start tag that depends on it) is ignored;
 *               otherwise the result of the handler the token was
 *               delegated to (inBody() may return a content-model flag
 *               such as HTML5::CDATA or HTML5::RCDATA), or null.
 */
private function inTable($token) {
    /* Element names that stop "clear the stack back to a table context". */
    $clear = array('html', 'table');

    $type     = $token['type'];
    $is_start = ($type === HTML5::STARTTAG);
    $is_end   = ($type === HTML5::ENDTAG);

    /* Only tag tokens are read for a name; other token types are not
    assumed to carry one. */
    $name = ($is_start || $is_end) ? $token['name'] : null;

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character data to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($is_start && $name === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($is_start && $name === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif ($is_start && $name === 'col') {
        /* Act as if a "colgroup" start tag had been seen, then process
        the "col" token with the column-group rules. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($is_start &&
    in_array($name, array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($is_start && in_array($name, array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($is_start && $name === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $closed = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        ));

        /* The end-tag branch below returns false only when it ignored the
        token. In that case nothing was popped and the mode is unchanged,
        so reprocessing would just arrive here again; drop the token. */
        if ($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($is_end && $name === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($name, true)) {
            return false;
        }

        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error (not reported). */

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        do {
            $popped = array_pop($this->stack);
        } while ($popped->nodeName !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($is_end && in_array($name, array('body', 'caption', 'col',
    'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Locate the last (deepest) table element on the stack of
            open elements; $n is left at its index when one is found. */
            $table = null;

            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if ($table === null) {
                /* No table element on the stack (innerHTML case): the
                foster parent is the first element in the stack of open
                elements (the html element). */
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The table has a parent element: that parent is the
                foster parent. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent node is not an
                element: the foster parent is the element before the table
                in the stack of open elements. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        /* Hand back inBody()'s result so that a content-model switch it
        requests (e.g. HTML5::CDATA / HTML5::RCDATA) is not lost. */
        return $this->inBody($token);
    }
}
2806
/**
 * Processes a token while the insertion mode is "in caption".
 *
 * @param array $token Token array with a 'type' key and, for tag tokens,
 *                     a 'name' key.
 * @return mixed Whatever inTable() returns when the token is reprocessed
 *               there; nothing otherwise.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // Parse errors are not reported by this implementation.

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        // NOTE: whether the synthetic end tag was ignored is not checked
        // here; the current token is always handed on to inTable().
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        // Parse error. Ignore the token.
        // ("td" was missing from this list, so a stray </td> used to be
        // processed by the "in body" rules instead of being ignored.)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
2874
/**
 * Processes a token while the insertion mode is "in column group".
 *
 * Each recognised token kind is handled by its own guard clause; anything
 * that matches none of them implicitly closes the colgroup and is passed
 * on to inTable().
 *
 * @param array $token Token array with a 'type' key plus 'data' (character
 *                     and comment tokens) or 'name' (tag tokens).
 * @return mixed Whatever inTable() returns for the fall-through case;
 *               nothing otherwise.
 */
private function inColumnGroup($token) {
    $type = $token['type'];

    /* Whitespace-only character data (tab, LF, VT, FF, space): append it
    to the current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node to the current node. */
    if($type === HTML5::COMMENT) {
        $node = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($node);
        return;
    }

    $is_start = ($type === HTML5::STARTTAG);
    $is_end = ($type === HTML5::ENDTAG);

    /* <col>: insert the element and pop it straight off the stack again. */
    if($is_start && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* </colgroup>: ignored when the current node is the root html element
    (innerHTML case); otherwise pop the colgroup and go back to
    "in table". */
    if($is_end && $token['name'] === 'colgroup') {
        if(end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* </col>: parse error, ignore the token. */
    if($is_end && $token['name'] === 'col') {
        return;
    }

    /* Anything else: act as if </colgroup> had been seen, then hand the
    current token to the "in table" rules. */
    $this->inColumnGroup(array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    ));

    return $this->inTable($token);
}
2931
/**
 * Processes a token while the insertion mode is "in table body".
 *
 * @param array $token Token array with a 'type' key and, for tag tokens,
 *                     a 'name' key.
 * @return mixed Whatever inRow() / mainPhase() returns when the token is
 *               reprocessed; nothing otherwise.
 */
private function inTableBody($token) {
    // Node names at which "clear the stack back to a table body context"
    // stops popping.
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    // Fixed: the list used to contain the typo 'tfoor', and the "table"
    // clause tested for a start tag although it is an *end* tag that
    // belongs here.
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3017
/**
 * Processes a token while the insertion mode is "in row".
 *
 * @param array $token Token array with a 'type' key and, for tag tokens,
 *                     a 'name' key.
 * @return mixed Whatever inTableBody() returns when the token is
 *               reprocessed there; nothing otherwise.
 */
private function inRow($token) {
    // Node names at which "clear the stack back to a table row context"
    // stops popping.
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    // Fixed: the "</table>" half of this rule was missing from the
    // condition.
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. */
        if(!$this->elementInScope('tr', true)) {
            // The implied </tr> would be ignored (same test as the "tr"
            // end-tag branch above), so the current token is ignored too.

        } else {
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // The implied </tr> switched the mode to "in table body", so
            // the token is reprocessed there. It used to be sent to
            // inCell(), which has no open cell to work with at this point.
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // Reprocess with the "in table body" rules (was inCell(),
            // which bounces the token back to inRow()).
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3102
/**
 * Processes a token while the insertion mode is "in cell".
 *
 * @param array $token Token array with a 'type' key and, for tag tokens,
 *                     a 'name' key.
 * @return mixed Whatever inRow() returns when the token is reprocessed
 *               there; nothing otherwise.
 */
private function inCell($token) {
    /* An end tag whose tag name is one of: "td", "th" */
    if($token['type'] === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // Parse errors are not reported by this implementation.

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    // (A second, byte-identical copy of this branch used to follow; it
    // could never be reached and has been removed.)
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if(!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell and reprocess the current token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell and reprocess the current token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3209
/**
 * Processes a token while the insertion mode is "in select".
 *
 * @param array $token Token array with a 'type' key plus 'data' (character
 *                     and comment tokens) or 'name' (tag tokens).
 */
private function inSelect($token) {
    /* Handle the token as follows: */

    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        // The count guard keeps the [-2] lookup inside the array.
        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        // Fixed: this used to compare the node object itself with the
        // string 'optgroup' (never true) and indexed the stack with the
        // count taken before the option may have been popped above.
        if(end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    // The type is tested before the name here and below so that tokens
    // which may not carry a 'name' key never have it read.
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */
        // Parse errors are not reported by this implementation.

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3358
/**
 * Processes a token while the insertion mode is "after body".
 *
 * @param array $token Token array with a 'type' key plus 'data' (character
 *                     and comment tokens) or 'name' (tag tokens).
 * @return mixed Whatever inBody() returns for the fall-through case;
 *               nothing otherwise.
 */
private function afterBody($token) {
    $kind = $token['type'];

    /* Whitespace-only character data (tab, LF, VT, FF, space) is processed
    as it would be in the "in body" insertion mode. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* A comment is appended to the first element in the stack of open
    elements (the html element). */
    if($kind === HTML5::COMMENT) {
        $remark = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($remark);
        return;
    }

    /* </html>: switch to the trailing end phase. (The innerHTML case, in
    which the token would be ignored instead, is not distinguished.) */
    if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body" and
    reprocess the token. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
3397
/**
 * Processes a token while the insertion mode is "in frameset".
 *
 * @param array $token Token array with a 'type' key plus 'data' (character
 *                     and comment tokens) or 'name' (tag tokens).
 */
private function inFrameset($token) {
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
    // NOTE(review): the pattern has no \r; presumably carriage returns are
    // normalised before tokens reach this point — confirm.
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    // In every tag branch the type is tested before the name, so tokens
    // which may not carry a 'name' key never have it read.
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the parser was not originally created in order to handle
            the setting of an element's innerHTML attribute (innerHTML case),
            and the current node is no longer a frameset element, then change
            the insertion mode to "after frameset". */
            // Fixed: the mode used to change unconditionally, so closing
            // an inner frameset left the parser in "after frameset" while
            // the outer frameset was still open.
            if(end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3460
/**
 * Processes a token while the insertion mode is "after frameset".
 *
 * @param array $token Token array with a 'type' key plus 'data' (character
 *                     and comment tokens) or 'name' (tag tokens).
 */
private function afterFrameset($token) {
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    // The type is tested before the name (here and below) so that tokens
    // which may not carry a 'name' key — e.g. non-whitespace character
    // tokens reaching this point — never have it read.
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3495
/**
 * Processes a token emitted after the main phase has finished (the
 * "trailing end" phase).
 *
 * @param array $token Token array with a 'type' key plus 'data' for
 *                     character and comment tokens.
 * @return mixed Whatever mainPhase() returns when the parser switches back
 *               to the main phase; nothing otherwise.
 */
private function trailingEndPhase($token) {
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token. */
    // Fixed: the character test was missing its negation, which made it
    // identical to the branch above and therefore unreachable, so
    // non-whitespace text after the main phase was silently discarded.
    } elseif(($token['type'] === HTML5::CHARACTR &&
    !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
    $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
3535
/**
 * Creates an element for a start-tag token, attaches it to the tree and
 * pushes it onto the stack of open elements.
 *
 * @param array $token  Start-tag token; 'name' is the tag name and 'attr'
 *                      a list of arrays with 'name' and 'value' keys.
 * @param bool  $append Not used by this method.
 * @param bool  $check  When true, the tag name is sanitised before the
 *                      element is created.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false) {
    $tag_name = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if($check) {
        // Keep only ASCII letters, digits and hyphens, then strip any
        // leading hyphens and digits.
        $tag_name = preg_replace('/[^a-z0-9-]/i', '', $tag_name);
        $tag_name = ltrim($tag_name, '-0..9');

        // In theory this should never be needed, but just in case fall
        // back to an arbitrary generic element.
        if($tag_name === '') {
            $tag_name = 'span';
        }
    }

    $element = $this->dom->createElement($tag_name);

    // The first occurrence of an attribute name wins; later duplicates
    // are skipped.
    foreach($token['attr'] as $attribute) {
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    array_push($this->stack, $element);

    return $element;
}
3561
/**
 * Wraps the given character data in a text node and attaches it to the
 * tree via appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data) {
    $this->appendToRealParent($this->dom->createTextNode($data));
}
3566
/**
 * Wraps the given data in a comment node and attaches it to the tree via
 * appendToRealParent().
 *
 * @param string $data Contents of the new comment node.
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
3571
/**
 * Attaches a node to the tree: to the current node normally, or to the
 * pending foster parent when one has been set.
 *
 * The foster parent is consumed (reset to null) by the insertion.
 *
 * @param DOMNode $node Node to insert.
 */
private function appendToRealParent($node) {
    // Ordinary case: no foster parenting requested, append to the current
    // node (the last entry of the stack of open elements).
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last
    table element in the stack of open elements, then the new node must be
    inserted immediately before that table element in the foster parent
    element; otherwise, the new node must be appended to the foster parent
    element. */
    $last_table = null;
    $index = count($this->stack);

    // Walk the stack from the most recently opened element downwards,
    // looking for a table element that is attached to a parent.
    while(--$index >= 0) {
        $candidate = $this->stack[$index];

        if($candidate->nodeName === 'table' &&
        $candidate->parentNode !== null) {
            $last_table = $candidate;
            break;
        }
    }

    if($last_table !== null &&
    $this->foster_parent->isSameNode($last_table->parentNode)) {
        $this->foster_parent->insertBefore($node, $last_table);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
3599
/**
 * Tells whether the stack of open elements has a given element "in scope"
 * or, with $table set, "in table scope".
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (a match on any one of them is enough).
 * @param bool         $table True for the "has an element in table scope"
 *                            variant, false for "has an element in scope".
 * @return bool True when a matching element is found before a scope
 *              boundary is reached, false otherwise.
 */
private function elementInScope($el, $table = false) {
    if(is_array($el)) {
        foreach($el as $element) {
            if($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a failure
            state. */
            return false;

        } elseif($table !== true && in_array($node->tagName, array('caption', 'td',
        'th', 'button', 'marquee', 'object'))) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state. */
            // Fixed: the flag test was inverted, so these elements acted
            // as boundaries for the table-scope variant only — the
            // opposite of what this step describes.
            return false;

        } elseif($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element), terminate
            in a failure state. (This can only happen if the node is the topmost
            node of the stack of open elements, and prevents the next step from
            being invoked if there are no more elements in the stack.) */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. */
    }

    // The whole stack was walked (or it was empty) without a match; report
    // an explicit failure instead of falling off the end with null.
    return false;
}
3648
/**
 * Reconstructs the active formatting elements: every entry at the end of
 * the list of active formatting elements that is neither a marker nor
 * already on the stack of open elements is shallow-cloned, the clone is
 * appended to the current node and pushed onto the stack, and the list
 * entry is replaced by the clone.
 *
 * @return false|null False when there was nothing to reconstruct; no value
 *                    is returned after a reconstruction.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* Rewind phase. By default the rewind stops ON a marker or an open
    element, which must not be cloned itself, so step 7 (advance to the
    following entry) has to run before the first clone. Only when the
    rewind reaches the very first list entry is step 7 skipped. */
    $a = $formatting_elements - 1;
    $step_seven = true;

    while(true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. */
        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while(true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if($step_seven) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if(end($this->a_formatting) === $clone) {
            break;
        }

        $step_seven = true;
    }
}
3719
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of the list until a marker has been
 * removed. If the list holds no marker it is simply emptied.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    /* Bounding the loop by the list's size guarantees termination even
    when no marker is present; popping an empty array is a no-op, so an
    unconditional loop would spin forever in that situation. */
    while(count($this->a_formatting) > 0) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
        2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if($entry === self::MARKER) {
            break;
        }
    }
}
3740
/**
 * Generates implied end tags: while the current node is a dd, dt, li, p,
 * td, th or tr element (minus any names listed in $exclude), it is popped
 * from the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array()) {
    /* When the steps below require the UA to generate implied end tags,
    then, if the current node is a dd element, a dt element, an li element,
    a p element, a td element, a th element, or a tr element, the UA must
    act as if an end tag with the respective tag name had been seen and
    then generate implied end tags again. */
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* The emptiness check keeps end() from yielding false, on which a
    property read would be invalid. */
    while(count($this->stack) > 0 &&
    in_array(end($this->stack)->nodeName, $elements, true)) {
        array_pop($this->stack);
    }
}
3754
/**
 * Classifies a node by its tag name against the special, scoping and
 * formatting name lists held on this object, checked in that order.
 *
 * @param object $node Node whose tagName property is inspected.
 * @return mixed self::SPECIAL, self::SCOPING or self::FORMATTING for a
 *               name found in the corresponding list, otherwise
 *               self::PHRASING.
 */
private function getElementCategory($node) {
    $tag = $node->tagName;

    if(in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if(in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    // Anything that is not a formatting element counts as phrasing.
    return in_array($tag, $this->formatting)
        ? self::FORMATTING
        : self::PHRASING;
}
3769
/**
 * Clears the stack of open elements back to a table context: elements are
 * popped until the current node's name is one of $elements. Should the
 * stack run empty first, the method stops instead of looping forever.
 *
 * @param array $elements Node names at which popping must stop.
 * @return void
 */
private function clearStackToTableContext($elements) {
    /* When the steps above require the UA to clear the stack back to a
    table context, it means that the UA must, while the current node is not
    a table element or an html element, pop elements from the stack of open
    elements. If this causes any elements to be popped from the stack, then
    this is a parse error. */
    while(count($this->stack) > 0) {
        if(in_array(end($this->stack)->nodeName, $elements)) {
            break;
        }

        array_pop($this->stack);
    }
}
3786
/**
 * Resets the insertion mode by scanning the stack of open elements from
 * the current node back to the first entry and switching $this->mode
 * according to the first node name that has a dedicated mode. If the scan
 * reaches the first stack entry without such a name, the mode becomes
 * "in body".
 *
 * @return void
 */
private function resetInsertionMode() {
    /* 1. Let last be false. */
    $last = false;

    /* 2. Let node be the last node in the stack of open elements; each
    further pass moves one entry towards the start of the stack. */
    for($index = count($this->stack) - 1; $index >= 0; $index--) {
        $node = $this->stack[$index];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        switch($node->nodeName) {
            /* 4. select: switch to "in select". (innerHTML case) */
            case 'select':
                $this->mode = self::IN_SELECT;
                return;

            /* 5. td or th: switch to "in cell". */
            case 'td':
            case 'th':
                $this->mode = self::IN_CELL;
                return;

            /* 6. tr: switch to "in row". */
            case 'tr':
                $this->mode = self::IN_ROW;
                return;

            /* 7. tbody, thead or tfoot: switch to "in table body". */
            case 'tbody':
            case 'thead':
            case 'tfoot':
                $this->mode = self::IN_TBODY;
                return;

            /* 8. caption: switch to "in caption". */
            case 'caption':
                $this->mode = self::IN_CAPTION;
                return;

            /* 9. colgroup: switch to "in column group". (innerHTML case) */
            case 'colgroup':
                $this->mode = self::IN_CGROUP;
                return;

            /* 10. table: switch to "in table". */
            case 'table':
                $this->mode = self::IN_TABLE;
                return;

            /* 11. head: switch to "in body" ("in body"! not "in head"!).
            (innerHTML case)
            12. body: switch to "in body". */
            case 'head':
            case 'body':
                $this->mode = self::IN_BODY;
                return;

            /* 13. frameset: switch to "in frameset". (innerHTML case) */
            case 'frameset':
                $this->mode = self::IN_FRAME;
                return;

            /* 14. html: "before head" when the head element pointer is
            null, otherwise "after head". (innerHTML case) */
            case 'html':
                if($this->head_pointer === null) {
                    $this->mode = self::BEFOR_HEAD;
                } else {
                    $this->mode = self::AFTER_HEAD;
                }
                return;

            /* 15. If last is true, then set the insertion mode to "in body"
            and abort these steps. (innerHTML case) */
            default:
                if($last) {
                    $this->mode = self::IN_BODY;
                    return;
                }
                break;
        }
    }
}
3884
/**
 * Closes the currently open table cell: if a td element, or failing that
 * a th element, is reported by elementInScope() with the table flag set,
 * an end tag token carrying that name is passed to inCell().
 *
 * @return void
 */
private function closeCell() {
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    $open_cell = null;

    if($this->elementInScope('td', true)) {
        $open_cell = 'td';
    } elseif($this->elementInScope('th', true)) {
        $open_cell = 'th';
    }

    if($open_cell === null) {
        return;
    }

    $end_tag = array(
        'name' => $open_cell,
        'type' => HTML5::ENDTAG
    );

    $this->inCell($end_tag);
}
3899
/**
 * Hands back the value stored in $this->dom.
 *
 * @return mixed The contents of $this->dom (presumably the DOMDocument
 *               assembled by the tree builder; confirm against the
 *               constructor).
 */
public function save() {
    $document = $this->dom;

    return $document;
}
3903 }
3904 ?>