]>
Commit | Line | Data |
---|---|---|
f45a286b AD |
1 | <?php |
2 | ||
3 | /** | |
4 | * Injector that auto paragraphs text in the root node based on | |
5 | * double-spacing. | |
6 | * @todo Ensure all states are unit tested, including variations as well. | |
7 | * @todo Make a graph of the flow control for this Injector. | |
8 | */ | |
9 | class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector | |
10 | { | |
11 | ||
12 | public $name = 'AutoParagraph'; | |
13 | public $needed = array('p'); | |
14 | ||
/**
 * Creates the opening <p> token that this injector inserts.
 *
 * The token is flagged in its armor array under the key
 * 'MakeWellFormed_TagClosedError'.
 * NOTE(review): presumably this stops MakeWellFormed from reporting an
 * error when it auto-closes the paragraph -- confirm against that strategy.
 *
 * @return HTMLPurifier_Token_Start start token for a 'p' element
 */
private function _pStart() {
    $start = new HTMLPurifier_Token_Start('p');
    $start->armor['MakeWellFormed_TagClosedError'] = true;
    return $start;
}
20 | ||
/**
 * Handles a text token, opening and/or splitting paragraphs around it
 * depending on where the text sits in the document.
 *
 * @param $token Text token, passed by reference. It is left untouched when
 *               no paragraph handling is required; otherwise it is replaced
 *               by an array of tokens.
 */
public function handleText(&$token) {
    $text = $token->data;

    // --- The current parent does not allow <p> tags ----------------------
    if (!$this->allowsElement('p')) {
        if (empty($this->currentNesting)) {
            return;
        }
        $parent = $this->currentNesting[count($this->currentNesting) - 1];
        if ($parent->name == 'p') {
            // State 3.1: ...<p>PAR1
            // State 3.2: ...<p>PAR1\n\nPAR2
            // We are already inside a paragraph: start from an empty
            // replacement list so _splitText() knows no start tag was added.
            $token = array();
            $this->_splitText($text, $token);
        }
        // State 4.1: ...<b>PAR1
        // State 4.2: ...<b>PAR1\n\nPAR2
        // Any other parent: leave the text alone.
        return;
    }

    // --- Inside an element, and the text has no double-newline -----------
    // Text in the anonymous root node is treated differently from text
    // inside the document: if the text has a double-newline the treatment
    // is the same, otherwise (and only inside the document) we land here.
    if (!empty($this->currentNesting) && strpos($text, "\n\n") === false) {
        // State 2: <div>PAR1... (similar to 1.4)
        // We're in an element that allows paragraph tags, but we're not
        // sure yet whether we are going to need them.
        if ($this->_pLookAhead()) {
            // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
            // Note: This will always be the first child, since any previous
            // inline element would have triggered this very same routine
            // and found the double newline. One possible exception would
            // be a comment.
            $token = array($this->_pStart(), $token);
        }
        // State 2.2.1: <div>PAR1<div>
        // State 2.2.2: <div>PAR1<b>PAR1</b></div>
        return;
    }

    // --- Root node, or text containing a double-newline ------------------
    $i = $nesting = null;
    // forwardUntilEndToken() is inherited and not shown here; its boolean
    // result is used below as "there is a following token", and it fills
    // in $current with that token.
    $hasNext = $this->forwardUntilEndToken($i, $current, $nesting);

    if ($token->is_whitespace) {
        if (!$hasNext) {
            // State 1.1: ... ^ (whitespace, then document end)
            // This is a degenerate case.
            return;
        }
        if (!$this->_isInline($current)) {
            // State 1.5: \n<hr />
            return;
        }
    }

    // State 1.2: PAR1
    // State 1.3: PAR1\n\nPAR2
    // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
    $token = array($this->_pStart());
    $this->_splitText($text, $token);
}
96 | ||
/**
 * Handles an element token, inserting a <p> start tag (and, in the root
 * node, a separating double-newline) in front of it where appropriate.
 *
 * @param $token Element token, passed by reference. It is left untouched
 *               when nothing needs inserting; otherwise it is replaced by
 *               an array of tokens ending with the original token.
 */
public function handleElement(&$token) {
    // We don't have to check if we're already in a <p> tag for block
    // tokens, because the tag would have been autoclosed by MakeWellFormed.
    if (!$this->allowsElement('p')) {
        // State 2.2: <ul><li>
        // State 2.4: <p><b>
        return;
    }

    if (empty($this->currentNesting)) {
        // ----- Root node -------------------------------------------------
        if ($this->_isInline($token)) {
            // State 3.1: <b>
            // This is where the {p} tag is inserted; it is not reflected
            // in inputTokens yet, however.
            $token = array($this->_pStart(), $token);
        }
        // State 3.2: <div> (block element, no paragraph needed)

        $i = null;
        if ($this->backward($i, $prev) && !($prev instanceof HTMLPurifier_Token_Text)) {
            // State 3.1.1: ...</p>{p}<b>
            // State 3.2.1: ...</p><div>
            // The previous token is not text, so put a double-newline in
            // front of whatever we are emitting.
            if (!is_array($token)) {
                $token = array($token);
            }
            array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
        }
        // State 3.1.2: ...</p>\n\n{p}<b>
        // State 3.2.2: ...</p>\n\n<div>
        // Note: PAR<ELEM> cannot occur because PAR would have been
        // wrapped in <p> tags.
        return;
    }

    // ----- Inside an element that allows <p> -----------------------------
    if (!$this->_isInline($token)) {
        // State 2.3: ...<div>
        return;
    }

    // State 1: <div>...<b>
    // Check if this token is adjacent to the parent token
    // (seek backwards until token isn't whitespace).
    $i = null;
    $this->backward($i, $prev);

    if ($prev instanceof HTMLPurifier_Token_Start) {
        // State 1.2.1: <div><b>
        // Adjacent to the parent's start tag: look ahead to see if <p>
        // is needed.
        //   State 1.3.1: <div><b>PAR1\n\nPAR2           -> needed
        //   State 1.3.2: <div><b>PAR1</b></div>         -> not needed
        //   State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div> -> not needed
        $needsParagraph = $this->_pLookAhead();
    } else {
        // Token wasn't adjacent.
        //   State 1.1.4: <div><p>PAR1</p>\n\n<b>        -> needed
        //     (quite frankly, this should be handled by splitText)
        //   State 1.1.1: <div><p>PAR1</p><b>            -> not needed
        //   State 1.1.2: <div><br /><b>                 -> not needed
        //   State 1.1.3: <div>PAR<b>                    -> not needed
        $needsParagraph =
            $prev instanceof HTMLPurifier_Token_Text &&
            substr($prev->data, -2) === "\n\n";
    }

    if ($needsParagraph) {
        $token = array($this->_pStart(), $token);
    }
}
200 | ||
/**
 * Splits text on double-newlines into paragraph tokens and adds them to
 * the token list that will replace the original text token.
 *
 * @param $data   String text data that will be processed into paragraphs
 * @param $result Reference to array of tokens that the new tokens are
 *                added to. An empty array on entry is taken to mean the
 *                caller did not add a start-paragraph token, i.e. we are
 *                already inside a paragraph.
 */
private function _splitText($data, &$result) {
    $chunks = explode("\n\n", $data);
    $total  = count($chunks);

    if ($total == 1) {
        // There were no double-newlines, abort quickly. In theory this
        // should never happen.
        $result[] = new HTMLPurifier_Token_Text($data);
        return;
    }

    $kept        = array(); // chunks that are not pure whitespace
    $needs_start = false;
    $needs_end   = false;
    $last        = $total - 1;

    foreach ($chunks as $index => $chunk) {
        if (trim($chunk) !== '') {
            $kept[] = $chunk;
            continue;
        }

        if ($index == 0) {
            // Double newline at the front
            if (empty($result)) {
                // The empty result indicates that the AutoParagraph
                // injector did not add any start paragraph tokens. This
                // means that we have been in a paragraph for a while, and
                // the newline means we should start a new one.
                $result[] = new HTMLPurifier_Token_End('p');
                $result[] = new HTMLPurifier_Token_Text("\n\n");
                // However, the start token should only be added if there
                // is more processing to be done (i.e. there are real
                // paragraphs in here). If there are none, the next start
                // paragraph tag will be handled by the next call to the
                // injector.
                $needs_start = true;
            } else {
                // We just started a new paragraph! Reinstate a
                // double-newline for presentation's sake, since it was in
                // the source code.
                array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
            }
            continue;
        }

        if ($index == $last) {
            // Double newline at the end: there should be a trailing </p>
            // when we're finally done.
            $needs_end = true;
        }
    }

    // This was just a giant blob of whitespace; nothing more to emit.
    if (empty($kept)) {
        return;
    }

    // Add the start tag indicated by \n\n at the beginning of $data
    if ($needs_start) {
        $result[] = $this->_pStart();
    }

    // Append every paragraph as: text, </p>, "\n\n", <p>
    foreach ($kept as $chunk) {
        $result[] = new HTMLPurifier_Token_Text($chunk);
        $result[] = new HTMLPurifier_Token_End('p');
        $result[] = new HTMLPurifier_Token_Text("\n\n");
        $result[] = $this->_pStart();
    }

    // Always drop the trailing start token; the injector will handle this
    // later if it was indeed needed. This avoids a lookahead here, at the
    // cost of a lookbehind later. If no end tag is needed either, also
    // drop the trailing "\n\n" and </p> and let MakeWellFormed close the
    // paragraph later.
    $drop = $needs_end ? 1 : 3;
    while ($drop > 0) {
        array_pop($result);
        $drop--;
    }
}
290 | ||
/**
 * Tells whether the passed token is inline, i.e. whether its name is
 * listed among the child elements of 'p' in the HTML definition (and,
 * ergo, it is allowed in paragraph tags).
 *
 * @param $token Token whose name is looked up
 * @return bool true if the token's name is an allowed child of <p>
 */
private function _isInline($token) {
    $name = $token->name;
    // Keep the whole chain inside isset() so a missing intermediate entry
    // simply yields false.
    return isset($this->htmlDefinition->info['p']->child->elements[$name]);
}
298 | ||
/**
 * Looks ahead in the token list and determines whether or not we need
 * to insert a <p> tag.
 *
 * Tokens are scanned with forwardUntilEndToken() until _checkNeedsP()
 * gives a definite answer or the scan stops.
 *
 * @return bool true if a <p> tag is needed, false otherwise (including
 *              when the scan ends without a definite answer)
 */
private function _pLookAhead() {
    $this->current($i, $current);

    // Seed the nesting counter with 1 when the current token is a start
    // tag, 0 otherwise.
    $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;

    while ($this->forwardUntilEndToken($i, $current, $nesting)) {
        $verdict = $this->_checkNeedsP($current);
        if ($verdict !== null) {
            // Definite yes/no: stop scanning.
            return $verdict;
        }
    }

    return false;
}
317 | ||
/**
 * Determines if a particular token requires an earlier inline token
 * to get a paragraph. Intended to be used together with
 * forwardUntilEndToken().
 *
 * @param $current Token being inspected during the lookahead
 * @return bool|null true if a paragraph is needed, false if it is
 *                   definitely not needed, null if this token does not
 *                   settle the question
 */
private function _checkNeedsP($current) {
    if ($current instanceof HTMLPurifier_Token_Start) {
        if ($this->_isInline($current)) {
            // Inline start tag: undecided, keep scanning.
            return null;
        }
        // <div>PAR1<div>
        // Terminate early, since we hit a block element.
        return false;
    }

    if (
        $current instanceof HTMLPurifier_Token_Text &&
        strpos($current->data, "\n\n") !== false
    ) {
        // <div>PAR1<b>PAR1\n\nPAR2
        return true;
    }

    // <div>PAR1<b>PAR1... (text without a double-newline), or any other
    // kind of token: undecided.
    return null;
}
342 | ||
343 | } | |
344 | ||
345 | // vim: et sw=4 sts=4 |