]>
Commit | Line | Data |
---|---|---|
f45a286b AD |
1 | <?php |
2 | ||
/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{

    /**
     * Individual URI components. A component that is absent from the URI
     * is represented by null.
     */
    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * Stores the URI components on the object.
     * @note Automatically normalizes scheme and port: a non-null scheme is
     *       lowercased and a non-null port is cast to an integer.
     * @param $scheme   Scheme without the trailing colon, or null
     * @param $userinfo Userinfo without the trailing '@', or null
     * @param $host     Host, or null
     * @param $port     Port (cast to int when not null), or null
     * @param $path     Path
     * @param $query    Query without the leading '?', or null
     * @param $fragment Fragment without the leading '#', or null
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        // ctype_lower() lets already-lowercase schemes skip the strtolower() call
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if no scheme object could be retrieved
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $def->getDefaultScheme($config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure
     *         (this generic implementation always returns true)
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            // a host that fails validation is dropped entirely
            if ($this->host === false) $this->host = null;
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        // The parentheses are required: && binds tighter than ||, so
        // without them an empty host alone would satisfy the condition
        // even when there is no scheme to drop.
        if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            // ports outside 1..65535 are discarded rather than clamped
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This can happen when the host gets stripped out:
                    // without a host, a path starting with "//" would be
                    // re-read as an authority, so the path is emptied.
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // The first segment uses an encoder built without ':' so
                // that it cannot be mistaken for a scheme.
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if (!is_null($this->port)) $authority .= ':' . $this->port;
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme)) $result .= $this->scheme . ':';
        if (!is_null($authority)) $result .= '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query)) $result .= '?' . $this->query;
        if (!is_null($this->fragment)) $result .= '#' . $this->fragment;

        return $result;
    }

    /**
     * Returns true if this URL might be considered a 'local' URL given
     * the current context.  This is true when the host is null, or
     * when it matches the host supplied to the configuration.
     *
     * Note that this does not do any scheme checking, so it is mostly
     * only appropriate for metadata that doesn't care about protocol
     * security.  isBenign is probably what you actually want.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if the URI is local, false otherwise
     */
    public function isLocal($config, $context) {
        if ($this->host === null) return true;
        $uri_def = $config->getDefinition('URI');
        if ($uri_def->host === $this->host) return true;
        return false;
    }

    /**
     * Returns true if this URL should be considered a 'benign' URL,
     * that is:
     *
     *      - It is a local URL (isLocal), and
     *      - It has an equal or better level of security
     *
     * If either this URI's scheme object or the default scheme object
     * cannot be retrieved, the security comparison is impossible and the
     * URI is reported as not benign.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if the URI is benign, false otherwise
     */
    public function isBenign($config, $context) {
        if (!$this->isLocal($config, $context)) return false;

        $scheme_obj = $this->getSchemeObj($config, $context);
        if (!$scheme_obj) return false; // conservative approach

        $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context);
        // the default scheme may be unreadable; without it we cannot
        // compare security levels, so stay conservative here as well
        if (!$current_scheme_obj) return false;

        if ($current_scheme_obj->secure) {
            if (!$scheme_obj->secure) {
                return false;
            }
        }
        return true;
    }

}
241 | ||
242 | // vim: et sw=4 sts=4 |