]>
Commit | Line | Data |
---|---|---|
f45a286b AD |
1 | <?php |
2 | ||
3 | // if we want to implement error collecting here, we'll need to use some sort | |
4 | // of global data (probably trigger_error) because it's impossible to pass | |
5 | // $config or $context to the callback functions. | |
6 | ||
/**
 * Handles referencing and dereferencing character entities.
 *
 * Entities are split into two groups: the "special" ones (", &, ', <, >),
 * which have syntactic meaning in markup, and all others. Each group has its
 * own substitution entry point so callers can decide when each kind is
 * decoded.
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table. Populated lazily, on the first named
     * entity that is not one of the special ones; its ->table property is
     * indexed by entity name (without the leading & and trailing ;).
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for parsing entities. The trailing semicolon is
     * optional, so unterminated entities are matched as well.
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
    //     1. hex             2. dec      3. string (XML style)


    /**
     * Decimal to parsed string conversion table for special entities.
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * Every value in this table is also a key of $_special_dec2str.
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp'  => 38,
            'lt'   => 60,
            'gt'   => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this whenever you have parsed character data is expensive, we
     * run it before everything else.
     *
     * @param $string String to have non-special entities parsed.
     * @returns Parsed string.
     */
    public function substituteNonSpecialEntities($string) {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * Special entities (numeric or named) are returned untouched; unknown
     * named entities are returned untouched as well.
     *
     * @param $matches  PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @returns Replacement string.
     */
    protected function nonSpecialEntityCallback($matches) {
        // replaces all but big five
        $entity = $matches[0];
        // explicit offset checks instead of error suppression
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];

            // abort for special characters
            if (isset($this->_special_dec2str[$code])) return $entity;

            return HTMLPurifier_Encoder::unichr($code);
        } else {
            // named special entities are left for substituteSpecialEntities()
            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
            if (!$this->_entity_lookup) {
                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
            }
            if (isset($this->_entity_lookup->table[$matches[3]])) {
                return $this->_entity_lookup->table[$matches[3]];
            } else {
                return $entity;
            }
        }
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param $string String to have special entities parsed.
     * @returns Parsed string.
     */
    public function substituteSpecialEntities($string) {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param $matches  PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @returns Replacement string: the literal character for a special
     *          entity, otherwise the matched entity text unchanged.
     */
    protected function specialEntityCallback($matches) {
        $entity = $matches[0];
        // explicit offset checks instead of error suppression
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            return isset($this->_special_dec2str[$int]) ?
                $this->_special_dec2str[$int] :
                $entity;
        } else {
            if (!isset($this->_special_ent2dec[$matches[3]])) {
                return $entity;
            }
            // _special_ent2dec only yields the code point; translate it to
            // the actual character, otherwise "&quot;" would become "34".
            return $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]];
        }
    }

}
143 | ||
144 | // vim: et sw=4 sts=4 |