]>
Commit | Line | Data |
---|---|---|
1 | <?php | |
2 | ||
/**
 * Takes tokens and makes them well-formed (balance end tags, etc.)
 *
 * Specification of the armor attributes this strategy uses:
 *
 *      - MakeWellFormed_TagClosedError: This armor field is used to
 *        suppress tag closed errors for certain tokens [TagClosedSuppress],
 *        in particular, if a tag was generated automatically by HTML
 *        Purifier, we may rely on our infrastructure to close it for us
 *        and shouldn't report an error to the user [TagClosedAuto].
 */
class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
{

    /**
     * Array stream of tokens being processed.
     */
    protected $tokens;

    /**
     * Current index in $tokens.
     */
    protected $t;

    /**
     * Current nesting of elements.
     */
    protected $stack;

    /**
     * Injectors active in this stream processing.
     */
    protected $injectors;

    /**
     * Current instance of HTMLPurifier_Config.
     */
    protected $config;

    /**
     * Current instance of HTMLPurifier_Context.
     */
    protected $context;

    /**
     * Walks the token stream once, rewriting it in place so that every
     * start tag is balanced by an end tag, and gives each active injector
     * a chance to transform text, element and end tokens along the way.
     *
     * While running, the method registers the 'CurrentNesting',
     * 'InputIndex', 'InputTokens' and 'CurrentToken' context variables
     * (by reference to its locals) and destroys them before returning.
     *
     * @param $tokens  Array of tokens to process
     * @param $config  Configuration object; queried for the HTML definition,
     *                 'Core.EscapeInvalidTags' and the 'AutoFormat' batch
     * @param $context Context object; queried for 'ErrorCollector'
     * @return Array of tokens after processing
     * @throws HTMLPurifier_Exception if an unexpected tag token is found
     */
    public function execute($tokens, $config, $context) {

        $definition = $config->getHTMLDefinition();

        // local variables
        $generator = new HTMLPurifier_Generator($config, $context);
        $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
        // used for autoclose early abortion
        $global_parent_allowed_elements = array();
        if (isset($definition->info[$definition->info_parent])) {
            // may be unset under testing circumstances
            $global_parent_allowed_elements = $definition->info[$definition->info_parent]->child->getAllowedElements($config);
        }
        $e = $context->get('ErrorCollector', true);
        $t = false; // token index
        $i = false; // injector index
        $token      = false; // the current token
        $reprocess  = false; // whether or not to reprocess the same token
        $stack = array();

        // member variables (aliases of the locals above, so helper methods
        // and this loop always see the same state)
        $this->stack   =& $stack;
        $this->t       =& $t;
        $this->tokens  =& $tokens;
        $this->config  = $config;
        $this->context = $context;

        // context variables
        $context->register('CurrentNesting', $stack);
        $context->register('InputIndex', $t);
        $context->register('InputTokens', $tokens);
        $context->register('CurrentToken', $token);

        // -- begin INJECTOR --

        $this->injectors = array();

        $injectors = $config->getBatch('AutoFormat');
        $def_injectors = $definition->info_injector;
        $custom_injectors = $injectors['Custom'];
        unset($injectors['Custom']); // special case
        foreach ($injectors as $injector => $b) {
            // XXX: Fix with a legitimate lookup table of enabled filters
            if (strpos($injector, '.') !== false) continue;
            $injector = "HTMLPurifier_Injector_$injector";
            if (!$b) continue;
            $this->injectors[] = new $injector;
        }
        foreach ($def_injectors as $injector) {
            // assumed to be objects
            $this->injectors[] = $injector;
        }
        foreach ($custom_injectors as $injector) {
            if (!$injector) continue;
            if (is_string($injector)) {
                $injector = "HTMLPurifier_Injector_$injector";
                $injector = new $injector;
            }
            $this->injectors[] = $injector;
        }

        // give the injectors references to the definition and context
        // variables for performance reasons; injectors whose prepare()
        // reports a problem are dropped with a warning.
        //
        // The list of survivors is rebuilt instead of splicing the member
        // array by index: foreach walks a by-value copy whose keys go stale
        // after the first removal, so a second failing injector would
        // otherwise cause the wrong entry to be removed.
        $prepared_injectors = array();
        foreach ($this->injectors as $injector) {
            $error = $injector->prepare($config, $context);
            if (!$error) {
                $prepared_injectors[] = $injector;
                continue;
            }
            trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
        }
        $this->injectors = $prepared_injectors;

        // -- end INJECTOR --

        // a note on reprocessing:
        //      In order to reduce code duplication, whenever some code needs
        //      to make HTML changes in order to make things "correct", the
        //      new HTML gets sent through the purifier, regardless of its
        //      status. This means that if we add a start token, because it
        //      was totally necessary, we don't have to update nesting; we just
        //      punt ($reprocess = true; continue;) and it does that for us.

        // isset is in loop because $tokens size changes during loop exec
        for (
            $t = 0;
            $t == 0 || isset($tokens[$t - 1]);
            // only increment if we don't need to reprocess
            $reprocess ? $reprocess = false : $t++
        ) {

            // check for a rewind ($i still holds the index of the injector
            // visited last by one of the foreach loops further down)
            if (is_int($i) && $i >= 0) {
                // possibility: disable rewinding if the current token has a
                // rewind set on it already. This would offer protection from
                // infinite loop, but might hinder some advanced rewinding.
                $rewind_to = $this->injectors[$i]->getRewind();
                if (is_int($rewind_to) && $rewind_to < $t) {
                    if ($rewind_to < 0) $rewind_to = 0;
                    while ($t > $rewind_to) {
                        $t--;
                        $prev = $tokens[$t];
                        // indicate that other injectors should not process this token,
                        // but we need to reprocess it
                        unset($prev->skip[$i]);
                        $prev->rewind = $i;
                        // undo the effect the token had on the nesting stack
                        if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
                        elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
                    }
                }
                $i = false;
            }

            // handle case of document end
            if (!isset($tokens[$t])) {
                // kill processing if stack is empty
                if (empty($this->stack)) break;

                // peek
                $top_nesting = array_pop($this->stack);
                $this->stack[] = $top_nesting;

                // send error [TagClosedSuppress]
                if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
                    $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
                }

                // append, don't splice, since this is the end
                $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);

                // punt!
                $reprocess = true;
                continue;
            }

            $token = $tokens[$t];

            //echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
            //flush();

            // quick-check: if it's not a tag, no need to process
            if (empty($token->is_tag)) {
                if ($token instanceof HTMLPurifier_Token_Text) {
                    foreach ($this->injectors as $i => $injector) {
                        if (isset($token->skip[$i])) continue;
                        if ($token->rewind !== null && $token->rewind !== $i) continue;
                        $injector->handleText($token);
                        $this->processToken($token, $i);
                        $reprocess = true;
                        break;
                    }
                }
                // another possibility is a comment
                continue;
            }

            if (isset($definition->info[$token->name])) {
                $type = $definition->info[$token->name]->child->type;
            } else {
                $type = false; // Type is unknown, treat accordingly
            }

            // quick tag checks: anything that's *not* an end tag
            $ok = false;
            if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
                // claims to be a start tag but is empty
                $token = new HTMLPurifier_Token_Empty($token->name, $token->attr, $token->line, $token->col, $token->armor);
                $ok = true;
            } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
                // claims to be empty but really is a start tag
                $this->swap(new HTMLPurifier_Token_End($token->name));
                $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr, $token->line, $token->col, $token->armor));
                // punt (since we had to modify the input stream in a non-trivial way)
                $reprocess = true;
                continue;
            } elseif ($token instanceof HTMLPurifier_Token_Empty) {
                // real empty token
                $ok = true;
            } elseif ($token instanceof HTMLPurifier_Token_Start) {
                // start tag

                // ...unless they also have to close their parent
                if (!empty($this->stack)) {

                    // Performance note: you might think that it's rather
                    // inefficient, recalculating the autoclose information
                    // for every tag that a token closes (since when we
                    // do an autoclose, we push a new token into the
                    // stream and then /process/ that, before
                    // re-processing this token.)  But this is
                    // necessary, because an injector can make
                    // arbitrary transformations to the autoclosing
                    // tokens we introduce, so things may have changed
                    // in the meantime.  Also, doing the inefficient thing is
                    // "easy" to reason about (for certain perverse definitions
                    // of "easy")

                    $parent = array_pop($this->stack);
                    $this->stack[] = $parent;

                    if (isset($definition->info[$parent->name])) {
                        $elements = $definition->info[$parent->name]->child->getAllowedElements($config);
                        $autoclose = !isset($elements[$token->name]);
                    } else {
                        $autoclose = false;
                    }

                    // Definition of the incoming element, or null when the
                    // element is unknown; guarded so that an unknown element
                    // is simply treated as having no wrapper.
                    $token_def = isset($definition->info[$token->name]) ? $definition->info[$token->name] : null;

                    if ($autoclose && $token_def && $token_def->wrap) {
                        // Check if an element can be wrapped by another
                        // element to make it valid in a context (for
                        // example, <ul><ul> needs a <li> in between)
                        $wrapname = $token_def->wrap;
                        $wrapdef = $definition->info[$wrapname];
                        $elements = $wrapdef->child->getAllowedElements($config);
                        $parent_elements = $definition->info[$parent->name]->child->getAllowedElements($config);
                        if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
                            $newtoken = new HTMLPurifier_Token_Start($wrapname);
                            $this->insertBefore($newtoken);
                            $reprocess = true;
                            continue;
                        }
                    }

                    // $autoclose implies that the parent is defined (see above)
                    $carryover = false;
                    if ($autoclose && $definition->info[$parent->name]->formatting) {
                        $carryover = true;
                    }

                    if ($autoclose) {
                        // check if this autoclose is doomed to fail
                        // (this rechecks $parent, which is harmless)
                        $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
                        if (!$autoclose_ok) {
                            foreach ($this->stack as $ancestor) {
                                $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
                                if (isset($elements[$token->name])) {
                                    $autoclose_ok = true;
                                    break;
                                }
                                if ($token_def && $token_def->wrap) {
                                    $wrapname = $token_def->wrap;
                                    $wrapdef = $definition->info[$wrapname];
                                    $wrap_elements = $wrapdef->child->getAllowedElements($config);
                                    if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
                                        $autoclose_ok = true;
                                        break;
                                    }
                                }
                            }
                        }
                        if ($autoclose_ok) {
                            // errors need to be updated
                            $new_token = new HTMLPurifier_Token_End($parent->name);
                            $new_token->start = $parent;
                            if ($carryover) {
                                // reopen a copy of the formatting element
                                // after the token that forced it closed
                                $element = clone $parent;
                                // [TagClosedAuto]
                                $element->armor['MakeWellFormed_TagClosedError'] = true;
                                $element->carryover = true;
                                $this->processToken(array($new_token, $token, $element));
                            } else {
                                $this->insertBefore($new_token);
                            }
                            // [TagClosedSuppress]
                            if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
                                if (!$carryover) {
                                    $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
                                } else {
                                    $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
                                }
                            }
                        } else {
                            $this->remove();
                        }
                        $reprocess = true;
                        continue;
                    }

                }
                $ok = true;
            }

            if ($ok) {
                foreach ($this->injectors as $i => $injector) {
                    if (isset($token->skip[$i])) continue;
                    if ($token->rewind !== null && $token->rewind !== $i) continue;
                    $injector->handleElement($token);
                    $this->processToken($token, $i);
                    $reprocess = true;
                    break;
                }
                if (!$reprocess) {
                    // ah, nothing interesting happened; do normal processing
                    $this->swap($token);
                    if ($token instanceof HTMLPurifier_Token_Start) {
                        $this->stack[] = $token;
                    } elseif ($token instanceof HTMLPurifier_Token_End) {
                        throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
                    }
                }
                continue;
            }

            // sanity check: we should be dealing with a closing tag
            if (!$token instanceof HTMLPurifier_Token_End) {
                throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
            }

            // make sure that we have something open
            if (empty($this->stack)) {
                if ($escape_invalid_tags) {
                    if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
                    $this->swap(new HTMLPurifier_Token_Text(
                        $generator->generateFromToken($token)
                    ));
                } else {
                    $this->remove();
                    if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
                }
                $reprocess = true;
                continue;
            }

            // first, check for the simplest case: everything closes neatly.
            // Eventually, everything passes through here; if there are problems
            // we modify the input stream accordingly and then punt, so that
            // the tokens get processed again.
            $current_parent = array_pop($this->stack);
            if ($current_parent->name == $token->name) {
                $token->start = $current_parent;
                foreach ($this->injectors as $i => $injector) {
                    if (isset($token->skip[$i])) continue;
                    if ($token->rewind !== null && $token->rewind !== $i) continue;
                    $injector->handleEnd($token);
                    $this->processToken($token, $i);
                    // the token will be reprocessed, so restore the stack
                    $this->stack[] = $current_parent;
                    $reprocess = true;
                    break;
                }
                continue;
            }

            // okay, so we're trying to close the wrong tag

            // undo the previous pop
            $this->stack[] = $current_parent;

            // scroll back the entire nest, trying to find our tag.
            // (feature could be to specify how far you'd like to go)
            $size = count($this->stack);
            // -2 because -1 is the last element, but we already checked that
            $skipped_tags = false;
            for ($j = $size - 2; $j >= 0; $j--) {
                if ($this->stack[$j]->name == $token->name) {
                    $skipped_tags = array_slice($this->stack, $j);
                    break;
                }
            }

            // we didn't find the tag, so remove
            if ($skipped_tags === false) {
                if ($escape_invalid_tags) {
                    $this->swap(new HTMLPurifier_Token_Text(
                        $generator->generateFromToken($token)
                    ));
                    if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
                } else {
                    $this->remove();
                    if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
                }
                $reprocess = true;
                continue;
            }

            // do errors, in REVERSE $j order: a,b,c with </a></b></c>
            $c = count($skipped_tags);
            if ($e) {
                for ($j = $c - 1; $j > 0; $j--) {
                    // notice we exclude $j == 0, i.e. the current ending tag, from
                    // the errors... [TagClosedSuppress]
                    if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
                        $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
                    }
                }
            }

            // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
            $replace = array($token);
            for ($j = 1; $j < $c; $j++) {
                // ...as well as from the insertions
                $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
                $new_token->start = $skipped_tags[$j];
                array_unshift($replace, $new_token);
                if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
                    // [TagClosedAuto]
                    $element = clone $skipped_tags[$j];
                    $element->carryover = true;
                    $element->armor['MakeWellFormed_TagClosedError'] = true;
                    $replace[] = $element;
                }
            }
            $this->processToken($replace);
            $reprocess = true;
            continue;
        }

        $context->destroy('CurrentNesting');
        $context->destroy('InputTokens');
        $context->destroy('InputIndex');
        $context->destroy('CurrentToken');

        unset($this->injectors, $this->stack, $this->tokens, $this->t);
        return $tokens;
    }

    /**
     * Processes arbitrary token values for complicated substitution patterns.
     * In general:
     *
     * If $token is an array, it is a list of tokens to substitute for the
     * current token. These tokens then get individually processed. If there
     * is a leading integer in the list, that integer determines how many
     * tokens from the stream should be removed.
     *
     * If $token is a regular token, it is swapped with the current token.
     *
     * If $token is false, the current token is deleted.
     *
     * If $token is an integer, that number of tokens (with the first token
     * being the current one) will be deleted.
     *
     * @param $token Token substitution value
     * @param $injector Injector that performed the substitution; default is if
     *        this is not an injector related operation.
     * @throws HTMLPurifier_Exception if $token has an unsupported type or
     *         asks for zero tokens to be deleted
     */
    protected function processToken($token, $injector = -1) {

        // normalize forms of token
        if (is_object($token)) $token = array(1, $token);
        if (is_int($token))    $token = array($token);
        if ($token === false)  $token = array(1);
        if (!is_array($token)) throw new HTMLPurifier_Exception('Invalid token type from injector');
        // isset() keeps an empty list from raising an undefined offset
        // notice; such a list still means "delete the current token"
        if (!isset($token[0]) || !is_int($token[0])) array_unshift($token, 1);
        if ($token[0] === 0) throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');

        // $token is now an array with the following form:
        //      array(number nodes to delete, new node 1, new node 2, ...)

        $delete = array_shift($token);
        $old = array_splice($this->tokens, $this->t, $delete, $token);

        if ($injector > -1) {
            // determine appropriate skips: the replacements inherit the
            // skips of the first removed token and are additionally marked
            // as already handled by the injector that produced them
            $oldskip = isset($old[0]) ? $old[0]->skip : array();
            foreach ($token as $object) {
                $object->skip = $oldskip;
                $object->skip[$injector] = true;
            }
        }

    }

    /**
     * Inserts a token before the current token. Cursor now points to
     * this token.  You must reprocess after this.
     */
    private function insertBefore($token) {
        array_splice($this->tokens, $this->t, 0, array($token));
    }

    /**
     * Removes current token. Cursor now points to new token occupying previously
     * occupied space.  You must reprocess after this.
     */
    private function remove() {
        array_splice($this->tokens, $this->t, 1);
    }

    /**
     * Swap current token with new token. Cursor points to new token (no
     * change).  You must reprocess after this.
     */
    private function swap($token) {
        $this->tokens[$this->t] = $token;
    }

}
531 | ||
532 | // vim: et sw=4 sts=4 |