]>
git.wh0rd.org - tt-rss.git/blob - lib/htmlpurifier/library/HTMLPurifier/Encoder.php
9fa76bd18c84bd133c5799556360efb5e8900cf5
4 * A UTF-8 specific character encoder that handles cleaning and transforming.
5 * @note All functions in this class should be static.
7 class HTMLPurifier_Encoder
11 * Constructor throws fatal error if you attempt to instantiate class
13 private function __construct () {
14 trigger_error ( 'Cannot instantiate encoder, call methods statically' , E_USER_ERROR
);
18 * Error-handler that mutes errors, alternative to shut-up operator.
20 public static function muteErrorHandler () {}
23 * iconv wrapper which mutes errors, but doesn't work around bugs.
25 public static function unsafeIconv ( $in , $out , $text ) {
26 set_error_handler ( array ( 'HTMLPurifier_Encoder' , 'muteErrorHandler' ));
27 $r = iconv ( $in , $out , $text );
28 restore_error_handler ();
33 * iconv wrapper which mutes errors and works around bugs.
35 public static function iconv ( $in , $out , $text , $max_chunk_size = 8000 ) {
36 $code = self
:: testIconvTruncateBug ();
37 if ( $code == self
:: ICONV_OK
) {
38 return self
:: unsafeIconv ( $in , $out , $text );
39 } elseif ( $code == self
:: ICONV_TRUNCATES
) {
40 // we can only work around this if the input character set
43 if ( $max_chunk_size < 4 ) {
44 trigger_error ( 'max_chunk_size is too small' , E_USER_WARNING
);
47 // split into chunks of at most $max_chunk_size bytes (8000 by default), but be careful to handle
48 // multibyte boundaries properly
49 if (( $c = strlen ( $text )) <= $max_chunk_size ) {
50 return self
:: unsafeIconv ( $in , $out , $text );
55 if ( $i +
$max_chunk_size >= $c ) {
56 $r .= self
:: unsafeIconv ( $in , $out , substr ( $text , $i ));
59 // wibble the boundary
60 if ( 0x80 != ( 0xC0 & ord ( $text [ $i +
$max_chunk_size ]))) {
61 $chunk_size = $max_chunk_size ;
62 } elseif ( 0x80 != ( 0xC0 & ord ( $text [ $i +
$max_chunk_size - 1 ]))) {
63 $chunk_size = $max_chunk_size - 1 ;
64 } elseif ( 0x80 != ( 0xC0 & ord ( $text [ $i +
$max_chunk_size - 2 ]))) {
65 $chunk_size = $max_chunk_size - 2 ;
66 } elseif ( 0x80 != ( 0xC0 & ord ( $text [ $i +
$max_chunk_size - 3 ]))) {
67 $chunk_size = $max_chunk_size - 3 ;
69 return false ; // rather confusing UTF-8...
71 $chunk = substr ( $text , $i , $chunk_size ); // substr doesn't mind overlong lengths
72 $r .= self
:: unsafeIconv ( $in , $out , $chunk );
85 * Cleans a UTF-8 string for well-formedness and SGML validity
87 * It will parse according to UTF-8 and return a valid UTF8 string, with
88 * non-SGML codepoints excluded.
90 * @note Just for reference, the non-SGML code points are 0 to 31 and
91 * 127 to 159, inclusive. However, we allow code points 9, 10
92 * and 13, which are the tab, line feed and carriage return
93 * respectively. 128 and above the code points map to multibyte
94 * UTF-8 representations.
96 * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
97 * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
98 * LGPL license. Notes on what changed are inside, but in general,
99 * the original code transformed UTF-8 text into an array of integer
100 * Unicode codepoints. Understandably, transforming that back to
101 * a string would be somewhat expensive, so the function was modded to
102 * directly operate on the string. However, this discourages code
103 * reuse, and the logic enumerated here would be useful for any
104 * function that needs to be able to understand UTF-8 characters.
105 * As of right now, only smart lossless character encoding converters
106 * would need that, and I'm probably not going to implement them.
107 * Once again, PHP 6 should solve all our problems.
109 public static function cleanUTF8 ( $str , $force_php = false ) {
111 // UTF-8 validity is checked since PHP 4.3.5
112 // This is an optimization: if the string is already valid UTF-8, no
113 // need to do PHP stuff. 99% of the time, this will be the case.
114 // The regexp matches the XML char production, as well as excluding
115 // non-SGML codepoints U+007F to U+009F
116 if ( preg_match ( '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du' , $str )) {
120 $mState = 0 ; // cached expected number of octets after the current octet
121 // until the beginning of the next UTF8 character sequence
122 $mUcs4 = 0 ; // cached Unicode character
123 $mBytes = 1 ; // cached expected number of octets in the current sequence
125 // original code involved an $out that was an array of Unicode
126 // codepoints. Instead of having to convert back into UTF-8, we've
127 // decided to directly append valid UTF-8 characters onto a string
128 // $out once they're done. $char accumulates raw bytes, while $mUcs4
129 // turns into the Unicode code point, so there's some redundancy.
135 for ( $i = 0 ; $i < $len ; $i ++
) {
137 $char .= $str [ $i ]; // append byte to char
139 // When mState is zero we expect either a US-ASCII character
140 // or a multi-octet sequence.
141 if ( 0 == ( 0x80 & ( $in ))) {
142 // US-ASCII, pass straight through.
143 if (( $in <= 31 ||
$in == 127 ) &&
144 !( $in == 9 ||
$in == 13 ||
$in == 10 ) // save \r\t\n
146 // control characters, remove
153 } elseif ( 0xC0 == ( 0xE0 & ( $in ))) {
154 // First octet of 2 octet sequence
156 $mUcs4 = ( $mUcs4 & 0x1F ) << 6 ;
159 } elseif ( 0xE0 == ( 0xF0 & ( $in ))) {
160 // First octet of 3 octet sequence
162 $mUcs4 = ( $mUcs4 & 0x0F ) << 12 ;
165 } elseif ( 0xF0 == ( 0xF8 & ( $in ))) {
166 // First octet of 4 octet sequence
168 $mUcs4 = ( $mUcs4 & 0x07 ) << 18 ;
171 } elseif ( 0xF8 == ( 0xFC & ( $in ))) {
172 // First octet of 5 octet sequence.
174 // This is illegal because the encoded codepoint must be
176 // (a) not the shortest form or
177 // (b) outside the Unicode range of 0-0x10FFFF.
178 // Rather than trying to resynchronize, we will carry on
179 // until the end of the sequence and let the later error
180 // handling code catch it.
182 $mUcs4 = ( $mUcs4 & 0x03 ) << 24 ;
185 } elseif ( 0xFC == ( 0xFE & ( $in ))) {
186 // First octet of 6 octet sequence, see comments for 5
189 $mUcs4 = ( $mUcs4 & 1 ) << 30 ;
193 // Current octet is neither in the US-ASCII range nor a
194 // legal first octet of a multi-octet sequence.
201 // When mState is non-zero, we expect a continuation of the
202 // multi-octet sequence
203 if ( 0x80 == ( 0xC0 & ( $in ))) {
204 // Legal continuation.
205 $shift = ( $mState - 1 ) * 6 ;
207 $tmp = ( $tmp & 0x0000003F ) << $shift ;
210 if ( 0 == -- $mState ) {
211 // End of the multi-octet sequence. mUcs4 now contains
212 // the final Unicode codepoint to be output
214 // Check for illegal sequences and codepoints.
216 // From Unicode 3.1, non-shortest form is illegal
217 if ((( 2 == $mBytes ) && ( $mUcs4 < 0x0080 )) ||
218 (( 3 == $mBytes ) && ( $mUcs4 < 0x0800 )) ||
219 (( 4 == $mBytes ) && ( $mUcs4 < 0x10000 )) ||
221 // From Unicode 3.2, surrogate characters = illegal
222 (( $mUcs4 & 0xFFFFF800 ) == 0xD800 ) ||
223 // Codepoints outside the Unicode range are illegal
227 } elseif ( 0xFEFF != $mUcs4 && // omit BOM
228 // check for valid Char unicode codepoints
233 ( 0x20 <= $mUcs4 && 0x7E >= $mUcs4 ) ||
234 // 7F-9F is not strictly prohibited by XML,
235 // but it is non-SGML, and thus we don't allow it
236 ( 0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4 ) ||
237 ( 0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4 )
242 // initialize UTF8 cache (reset)
249 // ((0xC0 & (*in) != 0x80) && (mState != 0))
250 // Incomplete multi-octet sequence.
251 // used to result in complete fail, but we'll reset
263 * Translates a Unicode codepoint into its corresponding UTF-8 character.
264 * @note Based on Feyd's function at
265 * <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
266 * which is in public domain.
267 * @note While we're going to do code point parsing anyway, a good
268 * optimization would be to refuse to translate code points that
269 * are non-SGML characters. However, this could lead to duplication.
270 * @note This is very similar to the unichr function in
271 * maintenance/generate-entity-file.php (although this is superior,
272 * due to its sanity checks).
275 // +----------+----------+----------+----------+
276 // | 33222222 | 22221111 | 111111 | |
277 // | 10987654 | 32109876 | 54321098 | 76543210 | bit
278 // +----------+----------+----------+----------+
279 // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
280 // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
281 // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
282 // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
283 // +----------+----------+----------+----------+
284 // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
285 // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
286 // +----------+----------+----------+----------+
288 public static function unichr ( $code ) {
289 if ( $code > 1114111 or $code < 0 or
290 ( $code >= 55296 and $code <= 57343 ) ) {
291 // bits are set outside the "valid" range as defined
296 $x = $y = $z = $w = 0 ;
298 // regular ASCII character
301 // set up bits for UTF-8
302 $x = ( $code & 63 ) |
128 ;
304 $y = (( $code & 2047 ) >> 6 ) |
192 ;
306 $y = (( $code & 4032 ) >> 6 ) |
128 ;
308 $z = (( $code >> 12 ) & 15 ) |
224 ;
310 $z = (( $code >> 12 ) & 63 ) |
128 ;
311 $w = (( $code >> 18 ) & 7 ) |
240 ;
315 // set up the actual character
317 if ( $w ) $ret .= chr ( $w );
318 if ( $z ) $ret .= chr ( $z );
319 if ( $y ) $ret .= chr ( $y );
325 public static function iconvAvailable () {
326 static $iconv = null ;
327 if ( $iconv === null ) {
328 $iconv = function_exists ( 'iconv' ) && self
:: testIconvTruncateBug () != self
:: ICONV_UNUSABLE
;
334 * Converts a string to UTF-8 based on configuration.
336 public static function convertToUTF8 ( $str , $config , $context ) {
337 $encoding = $config -> get ( 'Core.Encoding' );
338 if ( $encoding === 'utf-8' ) return $str ;
339 static $iconv = null ;
340 if ( $iconv === null ) $iconv = self
:: iconvAvailable ();
341 if ( $iconv && ! $config -> get ( 'Test.ForceNoIconv' )) {
342 // unaffected by bugs, since UTF-8 supports all characters
343 $str = self
:: unsafeIconv ( $encoding , 'utf-8//IGNORE' , $str );
344 if ( $str === false ) {
345 // $encoding is not a valid encoding
346 trigger_error ( 'Invalid encoding ' . $encoding , E_USER_ERROR
);
349 // If the string is bjorked by Shift_JIS or a similar encoding
350 // that doesn't support all of ASCII, convert the naughty
351 // characters to their true byte-wise ASCII/UTF-8 equivalents.
352 $str = strtr ( $str , self
:: testEncodingSupportsASCII ( $encoding ));
354 } elseif ( $encoding === 'iso-8859-1' ) {
355 $str = utf8_encode ( $str );
358 trigger_error ( 'Encoding not supported, please install iconv' , E_USER_ERROR
);
362 * Converts a string from UTF-8 based on configuration.
363 * @note Currently, this is a lossy conversion, with inexpressible
364 * characters being omitted.
366 public static function convertFromUTF8 ( $str , $config , $context ) {
367 $encoding = $config -> get ( 'Core.Encoding' );
368 if ( $escape = $config -> get ( 'Core.EscapeNonASCIICharacters' )) {
369 $str = self
:: convertToASCIIDumbLossless ( $str );
371 if ( $encoding === 'utf-8' ) return $str ;
372 static $iconv = null ;
373 if ( $iconv === null ) $iconv = self
:: iconvAvailable ();
374 if ( $iconv && ! $config -> get ( 'Test.ForceNoIconv' )) {
375 // Undo our previous fix in convertToUTF8, otherwise iconv will barf
376 $ascii_fix = self
:: testEncodingSupportsASCII ( $encoding );
377 if (! $escape && ! empty ( $ascii_fix )) {
378 $clear_fix = array ();
379 foreach ( $ascii_fix as $utf8 => $native ) $clear_fix [ $utf8 ] = '' ;
380 $str = strtr ( $str , $clear_fix );
382 $str = strtr ( $str , array_flip ( $ascii_fix ));
384 $str = self
:: iconv ( 'utf-8' , $encoding . '//IGNORE' , $str );
386 } elseif ( $encoding === 'iso-8859-1' ) {
387 $str = utf8_decode ( $str );
390 trigger_error ( 'Encoding not supported' , E_USER_ERROR
);
391 // You might be tempted to assume that the ASCII representation
392 // might be OK, however, this is *not* universally true over all
393 // encodings. So we take the conservative route here, rather
394 // than forcibly turn on %Core.EscapeNonASCIICharacters
398 * Lossless (character-wise) conversion of HTML to ASCII
399 * @param $str UTF-8 string to be converted to ASCII
400 * @return ASCII encoded string with non-ASCII characters entity-ized
401 * @warning Adapted from MediaWiki, claiming fair use: this is a common
402 * algorithm. If you disagree with this license fudgery,
403 * implement it yourself.
404 * @note Uses decimal numeric entities since they are best supported.
405 * @note This is a DUMB function: it has no concept of keeping
406 * character entities that the projected character encoding
407 * can allow. We could possibly implement a smart version
408 * but that would require it to also know which Unicode
409 * codepoints the charset supported (not an easy task).
410 * @note Similar to cleanUTF8(), but it assumes that $str is
413 public static function convertToASCIIDumbLossless ( $str ) {
418 for ( $i = 0 ; $i < $len ; $i ++
) {
419 $bytevalue = ord ( $str [ $i ] );
420 if ( $bytevalue <= 0x7F ) { //0xxx xxxx
421 $result .= chr ( $bytevalue );
423 } elseif ( $bytevalue <= 0xBF ) { //10xx xxxx
424 $working = $working << 6 ;
425 $working +
= ( $bytevalue & 0x3F );
427 if ( $bytesleft <= 0 ) {
428 $result .= "&#" . $working . ";" ;
430 } elseif ( $bytevalue <= 0xDF ) { //110x xxxx
431 $working = $bytevalue & 0x1F ;
433 } elseif ( $bytevalue <= 0xEF ) { //1110 xxxx
434 $working = $bytevalue & 0x0F ;
437 $working = $bytevalue & 0x07 ;
444 /** No bugs detected in iconv. */
447 /** Iconv truncates output if converting from UTF-8 to another
448 * character set with //IGNORE, and a non-encodable character is found */
449 const ICONV_TRUNCATES
= 1 ;
451 /** Iconv does not support //IGNORE, making it unusable for
452 * transcoding purposes */
453 const ICONV_UNUSABLE
= 2 ;
456 * glibc iconv has a known bug where it doesn't handle the magic
457 * //IGNORE stanza correctly. In particular, rather than ignore
458 * characters, it will return an EILSEQ after consuming some number
459 * of characters, and expect you to restart iconv as if it were
460 * an E2BIG. Old versions of PHP did not respect the errno, and
461 * returned the fragment, so as a result you would see iconv
462 * mysteriously truncating output. We can work around this by
463 * manually chopping our input into segments of about 8000
464 * characters, as long as PHP ignores the error code. If PHP starts
465 * paying attention to the error code, iconv becomes unusable.
467 * @return Error code indicating severity of bug.
469 public static function testIconvTruncateBug () {
471 if ( $code === null ) {
472 // better not use iconv, otherwise infinite loop!
473 $r = self
:: unsafeIconv ( 'utf-8' , 'ascii//IGNORE' , " \xCE\xB1 " . str_repeat ( 'a' , 9000 ));
475 $code = self
:: ICONV_UNUSABLE
;
476 } elseif (( $c = strlen ( $r )) < 9000 ) {
477 $code = self
:: ICONV_TRUNCATES
;
478 } elseif ( $c > 9000 ) {
479 trigger_error ( 'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: include your iconv version as per phpversion()' , E_USER_ERROR
);
481 $code = self
:: ICONV_OK
;
488 * This expensive function tests whether or not a given character
489 * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
490 * fail this test, and require special processing. Variable width
491 * encodings shouldn't ever fail.
493 * @param string $encoding Encoding name to test, as per iconv format
494 * @param bool $bypass Whether or not to bypass the precompiled arrays.
495 * @return Array of UTF-8 characters to their corresponding ASCII,
496 * which can be used to "undo" any overzealous iconv action.
498 public static function testEncodingSupportsASCII ( $encoding , $bypass = false ) {
499 // All calls to iconv here are unsafe, proof by case analysis:
500 // If ICONV_OK, no difference.
501 // If ICONV_TRUNCATES, all calls involve one character inputs,
502 // so bug is not triggered.
503 // If ICONV_UNUSABLE, this call is irrelevant
504 static $encodings = array ();
506 if ( isset ( $encodings [ $encoding ])) return $encodings [ $encoding ];
507 $lenc = strtolower ( $encoding );
510 return array ( " \xC2\xA5 " => ' \\ ' , " \xE2\x80\xBE " => '~' );
512 return array ( " \xE2\x82\xA9 " => ' \\ ' );
514 if ( strpos ( $lenc , 'iso-8859-' ) === 0 ) return array ();
517 if ( self
:: unsafeIconv ( 'UTF-8' , $encoding , 'a' ) === false ) return false ;
518 for ( $i = 0x20 ; $i <= 0x7E ; $i ++
) { // all printable ASCII chars
519 $c = chr ( $i ); // UTF-8 char
520 $r = self
:: unsafeIconv ( 'UTF-8' , " $encoding //IGNORE" , $c ); // initial conversion
523 // This line is needed for iconv implementations that do not
524 // omit characters that do not exist in the target character set
525 ( $r === $c && self
:: unsafeIconv ( $encoding , 'UTF-8//IGNORE' , $r ) !== $c )
527 // Reverse engineer: what's the UTF-8 equiv of this byte
528 // sequence? This assumes that there's no variable width
529 // encoding that doesn't support ASCII.
530 $ret [ self
:: unsafeIconv ( $encoding , 'UTF-8//IGNORE' , $c )] = $c ;
533 $encodings [ $encoding ] = $ret ;
540 // vim: et sw=4 sts=4