]>
Commit | Line | Data |
---|---|---|
f45a286b AD |
1 | <?php |
2 | ||
/**
 * Generates HTML from tokens.
 * @todo Refactor interface so that configuration/context is determined
 *       upon instantiation, no need for messy generateFromTokens() calls
 * @todo Make some of the more internal functions protected, and have
 *       unit tests work around that
 */
class HTMLPurifier_Generator
{

    /**
     * Whether or not generator should produce XML output
     */
    private $_xhtml = true;

    /**
     * :HACK: Whether or not generator should comment the insides of <script> tags
     */
    private $_scriptFix = false;

    /**
     * Cache of HTMLDefinition during HTML output to determine whether or
     * not attributes should be minimized.
     */
    private $_def;

    /**
     * Cache of %Output.SortAttr
     */
    private $_sortAttr;

    /**
     * Cache of %Output.FlashCompat
     */
    private $_flashCompat;

    /**
     * Cache of %Output.FixInnerHTML
     */
    private $_innerHTMLFix;

    /**
     * Stack for keeping track of object information when outputting IE
     * compatibility code. One entry is pushed per open <object> and popped
     * again when the matching </object> is generated.
     */
    private $_flashStack = array();

    /**
     * Configuration for the generator
     */
    protected $config;

    /**
     * Caches the output-related configuration directives and the HTML
     * definition so they do not have to be looked up per token.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (currently unused,
     *        kept so existing callers keep working)
     */
    public function __construct($config, $context) {
        $this->config = $config;
        $this->_scriptFix    = $config->get('Output.CommentScriptContents');
        $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
        $this->_sortAttr     = $config->get('Output.SortAttr');
        $this->_flashCompat  = $config->get('Output.FlashCompat');
        $this->_def = $config->getHTMLDefinition();
        $this->_xhtml = $this->_def->doctype->xml;
    }

    /**
     * Generates HTML from an array of tokens.
     * @param $tokens Array of HTMLPurifier_Token
     * @return Generated HTML
     */
    public function generateFromTokens($tokens) {
        if (!$tokens) return '';

        // Basic algorithm
        $html = '';
        for ($i = 0, $size = count($tokens); $i < $size; $i++) {
            // Script special case: only an *opening* script tag may start
            // it. Matching on the name alone would also fire on </script>,
            // which would push whatever text follows the closing tag
            // through generateScriptFromToken() and emit it unescaped.
            if ($this->_scriptFix
                && $tokens[$i] instanceof HTMLPurifier_Token_Start
                && $tokens[$i]->name === 'script'
                && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
                // the contents of the script block must be ONE token
                // for this to work.
                $html .= $this->generateFromToken($tokens[$i++]);
                $html .= $this->generateScriptFromToken($tokens[$i++]);
            }
            $html .= $this->generateFromToken($tokens[$i]);
        }

        // Tidy cleanup
        if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
            $tidy = new Tidy;
            $tidy->parseString($html, array(
                'indent'=> true,
                'output-xhtml' => $this->_xhtml,
                'show-body-only' => true,
                'indent-spaces' => 2,
                'wrap' => 68,
            ), 'utf8');
            $tidy->cleanRepair();
            $html = (string) $tidy; // explicit cast necessary
        }

        // Normalize newlines to the configured value, falling back to the
        // system default when %Output.Newline is not set
        if ($this->config->get('Core.NormalizeNewlines')) {
            $nl = $this->config->get('Output.Newline');
            if ($nl === null) $nl = PHP_EOL;
            if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
        }
        return $html;
    }

    /**
     * Generates HTML from a single token.
     * @note Anything that is not an HTMLPurifier_Token triggers an
     *       E_USER_WARNING and yields an empty string; token types not
     *       handled below also yield an empty string.
     * @param $token HTMLPurifier_Token object.
     * @return Generated HTML
     */
    public function generateFromToken($token) {
        if (!$token instanceof HTMLPurifier_Token) {
            trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
            return '';

        } elseif ($token instanceof HTMLPurifier_Token_Start) {
            $attr = $this->generateAttributes($token->attr, $token->name);
            if ($this->_flashCompat && $token->name === 'object') {
                // remember the object so nested <param> tags can be
                // recorded against it
                $flash = new stdClass();
                $flash->attr = $token->attr;
                $flash->param = array();
                $this->_flashStack[] = $flash;
            }
            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';

        } elseif ($token instanceof HTMLPurifier_Token_End) {
            if ($this->_flashCompat && $token->name === 'object' && !empty($this->_flashStack)) {
                // No extra markup is emitted for now, but the entry must
                // be discarded: otherwise the stack grows forever and
                // later <param> tags attach to an already closed object.
                array_pop($this->_flashStack);
            }
            return '</' . $token->name . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
            if ($this->_flashCompat && $token->name === 'param' && !empty($this->_flashStack)
                && isset($token->attr['name'], $token->attr['value'])) {
                // only record complete name/value pairs; a <param> missing
                // either attribute would otherwise raise an undefined
                // index notice
                $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
            }
            $attr = $this->generateAttributes($token->attr, $token->name);
            return '<' . $token->name . ($attr ? ' ' : '') . $attr .
                ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
                . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Text) {
            return $this->escape($token->data, ENT_NOQUOTES);

        } elseif ($token instanceof HTMLPurifier_Token_Comment) {
            return '<!--' . $token->data . '-->';

        } else {
            return '';

        }
    }

    /**
     * Special case processor for the contents of script tags.
     * Non-text tokens are passed through generateFromToken() unchanged.
     * @warning This runs into problems if there's already a literal
     *          --> somewhere inside the script contents.
     * @param $token HTMLPurifier_Token holding the script contents
     * @return Script contents wrapped in comment/CDATA guards
     */
    public function generateScriptFromToken($token) {
        if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
        // Thanks <http://lachy.id.au/log/2005/05/script-comments>
        // strip a trailing "//" so it does not double up with the guard
        $data = preg_replace('#//\s*$#', '', $token->data);
        return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
    }

    /**
     * Generates attribute declarations from attribute array.
     * @note This does not include the leading or trailing space.
     * @param $assoc_array_of_attributes Attribute array
     * @param $element Name of element attributes are for, used to check
     *        attribute minimization.
     * @return Generate HTML fragment for insertion.
     */
    public function generateAttributes($assoc_array_of_attributes, $element = false) {
        $html = '';
        if ($this->_sortAttr) ksort($assoc_array_of_attributes);
        foreach ($assoc_array_of_attributes as $key => $value) {
            if (!$this->_xhtml) {
                // Remove namespaced attributes
                if (strpos($key, ':') !== false) continue;
                // Check if we should minimize the attribute: val="val" -> val
                if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                    $html .= $key . ' ';
                    continue;
                }
            }
            // Workaround for Internet Explorer innerHTML bug.
            // Essentially, Internet Explorer, when calculating
            // innerHTML, omits quotes if there are no instances of
            // angled brackets, quotes or spaces.  However, when parsing
            // HTML (for example, when you assign to innerHTML), it
            // treats backticks as quotes.  Thus,
            //      <img alt="``" />
            // becomes
            //      <img alt=`` />
            // becomes
            //      <img alt='' />
            // Fortunately, all we need to do is trigger an appropriate
            // quoting style, which we do by adding an extra space.
            // This also is consistent with the W3C spec, which states
            // that user agents may ignore leading or trailing
            // whitespace (in fact, most don't, at least for attributes
            // like alt, but an extra space at the end is barely
            // noticeable).  Still, we have a configuration knob for
            // this, since this transformation is not necessary if you
            // don't process user input with innerHTML or you don't plan
            // on supporting Internet Explorer.
            if ($this->_innerHTMLFix) {
                if (strpos($value, '`') !== false) {
                    // check if correct quoting style would not already be
                    // triggered
                    if (strcspn($value, '"\' <>') === strlen($value)) {
                        // protect!
                        $value .= ' ';
                    }
                }
            }
            $html .= $key.'="'.$this->escape($value).'" ';
        }
        return rtrim($html);
    }

    /**
     * Escapes raw text data.
     * @todo This really ought to be protected, but until we have a facility
     *       for properly generating HTML here w/o using tokens, it stays
     *       public.
     * @param $string String data to escape for HTML.
     * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
     *               permissible for non-attribute output. Defaults to
     *               ENT_COMPAT when null.
     * @return String escaped data.
     */
    public function escape($string, $quote = null) {
        // Workaround for APC bug on Mac Leopard reported by sidepodcast
        // http://htmlpurifier.org/phorum/read.php?3,4823,4846
        if ($quote === null) $quote = ENT_COMPAT;
        return htmlspecialchars($string, $quote, 'UTF-8');
    }

}
253 | ||
254 | // vim: et sw=4 sts=4 |