]>
Commit | Line | Data |
---|---|---|
f45a286b AD |
1 | <?php |
2 | ||
3 | /** | |
4 | * Takes a well formed list of tokens and fixes their nesting. | |
5 | * | |
6 | * HTML elements dictate which elements are allowed to be their children, | |
7 | * for example, you can't have a p tag in a span tag. Other elements have | |
8 | * much more rigorous definitions: tables, for instance, require a specific | |
9 | * order for their elements. There are also constraints not expressible by | |
10 | * document type definitions, such as the chameleon nature of ins/del | |
11 | * tags and global child exclusions. | |
12 | * | |
13 | * The first major objective of this strategy is to iterate through all the | |
14 | * nodes (not tokens) of the list of tokens and determine whether or not | |
15 | * their children conform to the element's definition. If they do not, the | |
16 | * child definition may optionally supply an amended list of elements that | |
17 | * is valid or require that the entire node be deleted (and the previous | |
18 | * node rescanned). | |
19 | * | |
20 | * The second objective is to ensure that explicitly excluded elements of | |
21 | * an element do not appear in its children. Code that accomplishes this | |
22 | * task is pervasive through the strategy, though the two are distinct tasks | |
23 | * and could, theoretically, be seperated (although it's not recommended). | |
24 | * | |
25 | * @note Whether or not unrecognized children are silently dropped or | |
26 | * translated into text depends on the child definitions. | |
27 | * | |
28 | * @todo Enable nodes to be bubbled out of the structure. | |
29 | */ | |
30 | ||
31 | class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy | |
32 | { | |
33 | ||
34 | public function execute($tokens, $config, $context) { | |
35 | //####################################################################// | |
36 | // Pre-processing | |
37 | ||
38 | // get a copy of the HTML definition | |
39 | $definition = $config->getHTMLDefinition(); | |
40 | ||
41 | // insert implicit "parent" node, will be removed at end. | |
42 | // DEFINITION CALL | |
43 | $parent_name = $definition->info_parent; | |
44 | array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name)); | |
45 | $tokens[] = new HTMLPurifier_Token_End($parent_name); | |
46 | ||
47 | // setup the context variable 'IsInline', for chameleon processing | |
48 | // is 'false' when we are not inline, 'true' when it must always | |
49 | // be inline, and an integer when it is inline for a certain | |
50 | // branch of the document tree | |
51 | $is_inline = $definition->info_parent_def->descendants_are_inline; | |
52 | $context->register('IsInline', $is_inline); | |
53 | ||
54 | // setup error collector | |
55 | $e =& $context->get('ErrorCollector', true); | |
56 | ||
57 | //####################################################################// | |
58 | // Loop initialization | |
59 | ||
60 | // stack that contains the indexes of all parents, | |
61 | // $stack[count($stack)-1] being the current parent | |
62 | $stack = array(); | |
63 | ||
64 | // stack that contains all elements that are excluded | |
65 | // it is organized by parent elements, similar to $stack, | |
66 | // but it is only populated when an element with exclusions is | |
67 | // processed, i.e. there won't be empty exclusions. | |
68 | $exclude_stack = array(); | |
69 | ||
70 | // variable that contains the start token while we are processing | |
71 | // nodes. This enables error reporting to do its job | |
72 | $start_token = false; | |
73 | $context->register('CurrentToken', $start_token); | |
74 | ||
75 | //####################################################################// | |
76 | // Loop | |
77 | ||
78 | // iterate through all start nodes. Determining the start node | |
79 | // is complicated so it has been omitted from the loop construct | |
80 | for ($i = 0, $size = count($tokens) ; $i < $size; ) { | |
81 | ||
82 | //################################################################// | |
83 | // Gather information on children | |
84 | ||
85 | // child token accumulator | |
86 | $child_tokens = array(); | |
87 | ||
88 | // scroll to the end of this node, report number, and collect | |
89 | // all children | |
90 | for ($j = $i, $depth = 0; ; $j++) { | |
91 | if ($tokens[$j] instanceof HTMLPurifier_Token_Start) { | |
92 | $depth++; | |
93 | // skip token assignment on first iteration, this is the | |
94 | // token we currently are on | |
95 | if ($depth == 1) continue; | |
96 | } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) { | |
97 | $depth--; | |
98 | // skip token assignment on last iteration, this is the | |
99 | // end token of the token we're currently on | |
100 | if ($depth == 0) break; | |
101 | } | |
102 | $child_tokens[] = $tokens[$j]; | |
103 | } | |
104 | ||
105 | // $i is index of start token | |
106 | // $j is index of end token | |
107 | ||
108 | $start_token = $tokens[$i]; // to make token available via CurrentToken | |
109 | ||
110 | //################################################################// | |
111 | // Gather information on parent | |
112 | ||
113 | // calculate parent information | |
114 | if ($count = count($stack)) { | |
115 | $parent_index = $stack[$count-1]; | |
116 | $parent_name = $tokens[$parent_index]->name; | |
117 | if ($parent_index == 0) { | |
118 | $parent_def = $definition->info_parent_def; | |
119 | } else { | |
120 | $parent_def = $definition->info[$parent_name]; | |
121 | } | |
122 | } else { | |
123 | // processing as if the parent were the "root" node | |
124 | // unknown info, it won't be used anyway, in the future, | |
125 | // we may want to enforce one element only (this is | |
126 | // necessary for HTML Purifier to clean entire documents | |
127 | $parent_index = $parent_name = $parent_def = null; | |
128 | } | |
129 | ||
130 | // calculate context | |
131 | if ($is_inline === false) { | |
132 | // check if conditions make it inline | |
133 | if (!empty($parent_def) && $parent_def->descendants_are_inline) { | |
134 | $is_inline = $count - 1; | |
135 | } | |
136 | } else { | |
137 | // check if we're out of inline | |
138 | if ($count === $is_inline) { | |
139 | $is_inline = false; | |
140 | } | |
141 | } | |
142 | ||
143 | //################################################################// | |
144 | // Determine whether element is explicitly excluded SGML-style | |
145 | ||
146 | // determine whether or not element is excluded by checking all | |
147 | // parent exclusions. The array should not be very large, two | |
148 | // elements at most. | |
149 | $excluded = false; | |
150 | if (!empty($exclude_stack)) { | |
151 | foreach ($exclude_stack as $lookup) { | |
152 | if (isset($lookup[$tokens[$i]->name])) { | |
153 | $excluded = true; | |
154 | // no need to continue processing | |
155 | break; | |
156 | } | |
157 | } | |
158 | } | |
159 | ||
160 | //################################################################// | |
161 | // Perform child validation | |
162 | ||
163 | if ($excluded) { | |
164 | // there is an exclusion, remove the entire node | |
165 | $result = false; | |
166 | $excludes = array(); // not used, but good to initialize anyway | |
167 | } else { | |
168 | // DEFINITION CALL | |
169 | if ($i === 0) { | |
170 | // special processing for the first node | |
171 | $def = $definition->info_parent_def; | |
172 | } else { | |
173 | $def = $definition->info[$tokens[$i]->name]; | |
174 | ||
175 | } | |
176 | ||
177 | if (!empty($def->child)) { | |
178 | // have DTD child def validate children | |
179 | $result = $def->child->validateChildren( | |
180 | $child_tokens, $config, $context); | |
181 | } else { | |
182 | // weird, no child definition, get rid of everything | |
183 | $result = false; | |
184 | } | |
185 | ||
186 | // determine whether or not this element has any exclusions | |
187 | $excludes = $def->excludes; | |
188 | } | |
189 | ||
190 | // $result is now a bool or array | |
191 | ||
192 | //################################################################// | |
193 | // Process result by interpreting $result | |
194 | ||
195 | if ($result === true || $child_tokens === $result) { | |
196 | // leave the node as is | |
197 | ||
198 | // register start token as a parental node start | |
199 | $stack[] = $i; | |
200 | ||
201 | // register exclusions if there are any | |
202 | if (!empty($excludes)) $exclude_stack[] = $excludes; | |
203 | ||
204 | // move cursor to next possible start node | |
205 | $i++; | |
206 | ||
207 | } elseif($result === false) { | |
208 | // remove entire node | |
209 | ||
210 | if ($e) { | |
211 | if ($excluded) { | |
212 | $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded'); | |
213 | } else { | |
214 | $e->send(E_ERROR, 'Strategy_FixNesting: Node removed'); | |
215 | } | |
216 | } | |
217 | ||
218 | // calculate length of inner tokens and current tokens | |
219 | $length = $j - $i + 1; | |
220 | ||
221 | // perform removal | |
222 | array_splice($tokens, $i, $length); | |
223 | ||
224 | // update size | |
225 | $size -= $length; | |
226 | ||
227 | // there is no start token to register, | |
228 | // current node is now the next possible start node | |
229 | // unless it turns out that we need to do a double-check | |
230 | ||
231 | // this is a rought heuristic that covers 100% of HTML's | |
232 | // cases and 99% of all other cases. A child definition | |
233 | // that would be tricked by this would be something like: | |
234 | // ( | a b c) where it's all or nothing. Fortunately, | |
235 | // our current implementation claims that that case would | |
236 | // not allow empty, even if it did | |
237 | if (!$parent_def->child->allow_empty) { | |
238 | // we need to do a double-check | |
239 | $i = $parent_index; | |
240 | array_pop($stack); | |
241 | } | |
242 | ||
243 | // PROJECTED OPTIMIZATION: Process all children elements before | |
244 | // reprocessing parent node. | |
245 | ||
246 | } else { | |
247 | // replace node with $result | |
248 | ||
249 | // calculate length of inner tokens | |
250 | $length = $j - $i - 1; | |
251 | ||
252 | if ($e) { | |
253 | if (empty($result) && $length) { | |
254 | $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed'); | |
255 | } else { | |
256 | $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized'); | |
257 | } | |
258 | } | |
259 | ||
260 | // perform replacement | |
261 | array_splice($tokens, $i + 1, $length, $result); | |
262 | ||
263 | // update size | |
264 | $size -= $length; | |
265 | $size += count($result); | |
266 | ||
267 | // register start token as a parental node start | |
268 | $stack[] = $i; | |
269 | ||
270 | // register exclusions if there are any | |
271 | if (!empty($excludes)) $exclude_stack[] = $excludes; | |
272 | ||
273 | // move cursor to next possible start node | |
274 | $i++; | |
275 | ||
276 | } | |
277 | ||
278 | //################################################################// | |
279 | // Scroll to next start node | |
280 | ||
281 | // We assume, at this point, that $i is the index of the token | |
282 | // that is the first possible new start point for a node. | |
283 | ||
284 | // Test if the token indeed is a start tag, if not, move forward | |
285 | // and test again. | |
286 | $size = count($tokens); | |
287 | while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) { | |
288 | if ($tokens[$i] instanceof HTMLPurifier_Token_End) { | |
289 | // pop a token index off the stack if we ended a node | |
290 | array_pop($stack); | |
291 | // pop an exclusion lookup off exclusion stack if | |
292 | // we ended node and that node had exclusions | |
293 | if ($i == 0 || $i == $size - 1) { | |
294 | // use specialized var if it's the super-parent | |
295 | $s_excludes = $definition->info_parent_def->excludes; | |
296 | } else { | |
297 | $s_excludes = $definition->info[$tokens[$i]->name]->excludes; | |
298 | } | |
299 | if ($s_excludes) { | |
300 | array_pop($exclude_stack); | |
301 | } | |
302 | } | |
303 | $i++; | |
304 | } | |
305 | ||
306 | } | |
307 | ||
308 | //####################################################################// | |
309 | // Post-processing | |
310 | ||
311 | // remove implicit parent tokens at the beginning and end | |
312 | array_shift($tokens); | |
313 | array_pop($tokens); | |
314 | ||
315 | // remove context variables | |
316 | $context->destroy('IsInline'); | |
317 | $context->destroy('CurrentToken'); | |
318 | ||
319 | //####################################################################// | |
320 | // Return | |
321 | ||
322 | return $tokens; | |
323 | ||
324 | } | |
325 | ||
326 | } | |
327 | ||
328 | // vim: et sw=4 sts=4 |