]>
Commit | Line | Data |
---|---|---|
/*
 * Locale item identifiers.  The visible code masks an identifier with 0xffff
 * and uses the result as an index into locale_data.values[].  Only
 * _NL_CTYPE_TOLOWER, _NL_CTYPE_CODESET_NAME and _NL_CTYPE_MAP_TO_NONASCII are
 * referenced in the visible part of this file.
 */
45516216 | 1 | enum { |
2 | _NL_COLLATE_NRULES, _NL_COLLATE_TABLEMB, _NL_CTYPE_TOLOWER, | |
3 | _NL_CTYPE_CODESET_NAME, _NL_CTYPE_MAP_TO_NONASCII | |
4 | }; | |
/*
 * Data of one locale category: `nstrings` followed by a flexible array of
 * values.  Each value is read either as a string pointer or as an unsigned
 * word, depending on the item.
 */
5 | struct locale_data { | |
6 | unsigned int nstrings; | |
7 | union locale_data_value { | |
8 | const char *string; | |
9 | unsigned int word; | |
10 | } values[]; | |
11 | }; | |
/*
 * Thread-local pointer to the (const) pointer to the current LC_CTYPE data,
 * declared with the initial-exec TLS model.  Defined outside this file.
 */
12 | extern __thread struct locale_data *const *_nl_current_LC_CTYPE | |
13 | __attribute__ ((tls_model("initial-exec"))); | |
/* Bit set of syntax option flags; tested with `&` against single-bit masks. */
14 | typedef unsigned long int reg_syntax_t; | |
/* Syntax options passed by __re_compile_pattern to re_compile_internal. */
15 | extern reg_syntax_t re_syntax_options; | |
/*
 * Error codes.  REG_ENOSYS is -1, REG_NOERROR is 0, and the remaining
 * enumerators count upwards from REG_NOMATCH == 1.
 */
16 | typedef enum { | |
17 | REG_ENOSYS = -1, REG_NOERROR = | |
18 | 0, REG_NOMATCH, REG_BADPAT, REG_ECOLLATE, REG_ECTYPE, REG_EESCAPE, | |
19 | REG_ESUBREG, REG_EBRACK, REG_EPAREN, REG_EBRACE, REG_BADBR, | |
20 | REG_ERANGE, REG_ESPACE, REG_BADRPT, REG_EEND, REG_ESIZE, REG_ERPAREN | |
21 | } reg_errcode_t; | |
/*
 * Compiled pattern object.  Functions in this file cast `buffer` to
 * `re_dfa_t *`, read `syntax` and `translate`, write the three one-bit
 * flags, and fill `fastmap`.
 */
22 | typedef struct { | |
23 | unsigned char *buffer; | |
24 | reg_syntax_t syntax; | |
25 | char *fastmap; | |
26 | unsigned char *translate; | |
27 | unsigned int re_nsub; | |
28 | unsigned fastmap_accurate:1; | |
29 | unsigned not_bol:1; | |
30 | unsigned not_eol:1; | |
31 | } regex_t; | |
/*
 * Thread-local slot used by __ctype_tolower_loc to cache the pointer to the
 * lower-case conversion table (initial-exec TLS model).
 */
32 | extern __thread void *__libc_tsd_CTYPE_TOLOWER | |
33 | __attribute__ ((tls_model("initial-exec"))); | |
34 | extern inline const int ** __attribute__ ((const))__ctype_tolower_loc(void) | |
35 | { | |
36 | union { | |
37 | void **ptr; | |
38 | const int **tablep; | |
39 | } u; | |
40 | u.ptr = (&__libc_tsd_CTYPE_TOLOWER); | |
41 | if (__builtin_expect(*u.tablep == 0, 0)) | |
42 | *u.tablep = | |
43 | ((int *)((*_nl_current_LC_CTYPE)-> | |
44 | values[((int)(_NL_CTYPE_TOLOWER) & 0xffff)]. | |
45 | string) + 128); | |
46 | return u.tablep; | |
47 | } | |
48 | extern __inline int __attribute__ ((__nothrow__)) tolower(int __c) | |
49 | { | |
50 | return __c >= -128 && __c < 256 ? (*__ctype_tolower_loc())[__c] : __c; | |
51 | } | |
/*
 * Conversion state placeholder.  The struct has no members, which is a GNU
 * extension (ISO C requires at least one member).
 */
52 | typedef struct { | |
53 | } mbstate_t; | |
/*
 * Both bitset types are plain pointers to unsigned long words in this file.
 * NOTE(review): code below sizes an allocation with sizeof(bitset_t), which
 * is therefore the size of a pointer - confirm that is intended.
 */
54 | typedef unsigned long int *bitset_t; | |
55 | typedef unsigned long int *re_bitset_ptr_t; | |
/* A set of node indices: `nelem` entries stored in `elems`. */
56 | typedef struct { | |
57 | int nelem; | |
58 | int *elems; | |
59 | } re_node_set; | |
/*
 * Token / node kinds.  Values 0-7 are assigned explicitly, 8|0 .. 8|4 form a
 * second group, CONCAT is 16, SUBEXP is 17, and everything from OP_DUP_PLUS
 * (18) onwards counts up implicitly.
 */
60 | typedef enum { | |
61 | NON_TYPE = 0, CHARACTER = 1, END_OF_RE = 2, SIMPLE_BRACKET = | |
62 | 3, OP_BACK_REF = 4, OP_PERIOD = 5, COMPLEX_BRACKET = | |
63 | 6, OP_UTF8_PERIOD = 7, OP_OPEN_SUBEXP = 8 | 0, OP_CLOSE_SUBEXP = | |
64 | 8 | 1, OP_ALT = 8 | 2, OP_DUP_ASTERISK = 8 | 3, ANCHOR = | |
65 | 8 | 4, CONCAT = 16, SUBEXP = 17, OP_DUP_PLUS = | |
66 | 18, OP_DUP_QUESTION, OP_OPEN_BRACKET, OP_CLOSE_BRACKET, | |
67 | OP_CHARSET_RANGE, OP_OPEN_DUP_NUM, OP_CLOSE_DUP_NUM, | |
68 | OP_NON_MATCH_LIST, OP_OPEN_COLL_ELEM, OP_CLOSE_COLL_ELEM, | |
69 | OP_OPEN_EQUIV_CLASS, OP_CLOSE_EQUIV_CLASS, OP_OPEN_CHAR_CLASS, | |
70 | OP_CLOSE_CHAR_CLASS, OP_WORD, OP_NOTWORD, OP_SPACE, OP_NOTSPACE, | |
71 | BACK_SLASH | |
72 | } re_token_type_t; | |
/* Multibyte character set: `nmbchars` wide-character values in `mbchars`. */
73 | typedef struct { | |
74 | long int *mbchars; | |
75 | int nmbchars; | |
76 | } re_charset_t; | |
/*
 * One token / DFA node.  `opr` is a union, so only the member matching
 * `type` is meaningful; `type` is stored in an 8-bit bit-field.
 */
77 | typedef struct { | |
78 | union { | |
79 | unsigned char c; | |
80 | re_bitset_ptr_t sbcset; | |
81 | re_charset_t *mbcset; | |
82 | int idx; | |
83 | } opr; | |
84 | re_token_type_t type:8; | |
85 | } re_token_t; | |
86 | typedef struct re_dfa_t re_dfa_t; | |
/* bin_tree_t is only used through pointers here; its definition is not visible. */
87 | typedef struct bin_tree_t bin_tree_t; | |
/* A DFA state; only its node set is present in this file. */
88 | struct re_dfastate_t { | |
89 | re_node_set nodes; | |
90 | }; | |
91 | typedef struct re_dfastate_t re_dfastate_t; | |
/*
 * Compiled automaton, reached by casting regex_t.buffer.  `nodes` holds
 * `nodes_len` tokens; the has_mb_node / is_utf8 / map_notascii flags and
 * `sb_char` are written by re_compile_internal.
 */
92 | struct re_dfa_t { | |
93 | re_token_t *nodes; | |
94 | unsigned int nodes_len; | |
95 | int *nexts; | |
96 | int *org_indices; | |
97 | re_node_set *edests; | |
98 | re_node_set *eclosures; | |
99 | re_dfastate_t *init_state; | |
100 | bin_tree_t *str_tree; | |
101 | re_bitset_ptr_t sb_char; | |
102 | int nbackref; | |
103 | unsigned int has_mb_node:1; | |
104 | unsigned int is_utf8:1; | |
105 | unsigned int map_notascii:1; | |
106 | int mb_cur_max; | |
107 | int *subexp_map; | |
108 | }; | |
/* Forward declarations of the file-local helpers used before their definitions. */
/* Compiles `pattern` (of `length` bytes) into `preg` under the given syntax. */
109 | static reg_errcode_t re_compile_internal(regex_t * preg, const char *pattern, | |
110 | unsigned int length, | |
111 | reg_syntax_t syntax); | |
/* Fills `fastmap` from the nodes of `init_state`. */
112 | static void re_compile_fastmap_iter(regex_t * bufp, | |
113 | const re_dfastate_t * init_state, | |
114 | char *fastmap); | |
115 | static reg_errcode_t analyze(regex_t * preg); | |
/*
 * Passed as a callback to preorder() by analyze(); no definition is visible
 * in this part of the file.
 */
116 | static reg_errcode_t optimize_subexps(void *extra, bin_tree_t * node); | |
/*
 * Compile `pattern` (`length` bytes) into `bufp` using the global
 * re_syntax_options, by delegating to re_compile_internal().
 *
 * NOTE(review): this is an old-style (K&R) definition and only `pattern` has
 * a declared type; `length` and `bufp` have none, although `bufp` is passed
 * on where a `regex_t *` is expected.  Confirm the intended parameter types.
 */
117 | const char *__re_compile_pattern(pattern, length, bufp) | |
118 | const char *pattern; | |
119 | { | |
120 | reg_errcode_t ret; | |
121 | ret = re_compile_internal(bufp, pattern, length, re_syntax_options); | |
/*
 * NOTE(review): `ret` is never inspected and the function ends without
 * returning a `const char *`; a caller that uses the result reads an
 * indeterminate value.  The intended success/error return is not visible here.
 */
122 | } | |
123 | ||
124 | int __re_compile_fastmap(bufp) | |
125 | regex_t *bufp; | |
126 | { | |
127 | re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; | |
128 | char *fastmap = bufp->fastmap; | |
129 | re_compile_fastmap_iter(bufp, dfa->init_state, fastmap); | |
130 | } | |
/*
 * Mark the lower-case form of a character in the fastmap.
 *
 * fastmap: table indexed by character value; the selected entry is set to 1.
 * icase:   when zero the function writes nothing.
 * ch:      character value whose lower-case mapping is marked.
 *
 * The lower-case value is now computed on every path.  Previously it was
 * only assigned when `ch` was not a compile-time constant, so after
 * inlining with a constant argument the index was read uninitialized.
 *
 * NOTE(review): with icase == 0 this is a no-op, yet a caller in this file
 * passes a literal 0.  An unconditional `fastmap[ch] = 1` may be missing;
 * confirm against the intended contract before relying on that call.
 */
static inline void
__attribute__ ((always_inline)) re_set_fastmap(char *fastmap, int icase,
					       int ch)
{
	if (icase)
		fastmap[tolower(ch)] = 1;
}
/*
 * Fill `fastmap` from the nodes listed in `init_state`.
 *
 * bufp:       compiled pattern; bufp->buffer is interpreted as the re_dfa_t
 *             and bufp->syntax supplies the option bits.
 * init_state: state whose node set (nodes.elems[0 .. nodes.nelem-1]) is walked.
 * fastmap:    table handed to re_set_fastmap() for marking.
 *
 * CHARACTER nodes walk a 256-bit bitmap and call re_set_fastmap() for every
 * set bit; COMPLEX_BRACKET nodes convert each wide character of the set with
 * __wcrtomb() and mark the first byte of the result.  Other node types are
 * ignored.  __wcrtomb() and towlower() are not declared in the visible part
 * of this file.
 */
146 | static void re_compile_fastmap_iter(regex_t * bufp, | |
147 | const re_dfastate_t * init_state, | |
148 | char *fastmap) | |
149 | { | |
150 | re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; | |
151 | int node_cnt; | |
/*
 * icase is non-zero only for single-byte locales (mb_cur_max == 1) with one
 * specific syntax bit set.  The bit is spelled as a chain of `<< 1`,
 * presumably an expanded flag macro for case-insensitive matching - confirm.
 */
152 | int icase = (dfa->mb_cur_max == 1 | |
153 | && (bufp-> | |
154 | syntax & | |
155 | ((((((((((((((((((((((((unsigned long int)1) << 1) << | |
156 | 1) << 1) << 1) << 1) << 1) << 1) | |
157 | << 1) << 1) << 1) << 1) << 1) << 1) << | |
158 | 1) << 1) << 1) << 1) << 1) << 1) << 1) << 1) | |
159 | << 1))); | |
160 | for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) { | |
161 | int node = init_state->nodes.elems[node_cnt]; | |
162 | re_token_type_t type = dfa->nodes[node].type; | |
163 | if (type == CHARACTER) { | |
/*
 * NOTE(review): `buf` is allocated with alloca but never used, and `p` is
 * never assigned; this inner block has no observable effect.
 */
164 | if ((bufp-> | |
165 | syntax & | |
166 | ((((((((((((((((((((((((unsigned long int)1) << 1) | |
167 | << 1) << 1) << 1) << 1) << 1) | |
168 | << 1) << 1) << 1) << 1) << 1) << 1) | |
169 | << 1) << 1) << 1) << 1) << 1) << 1) << 1) | |
170 | << 1) << 1) << 1)) && dfa->mb_cur_max > 1) { | |
171 | unsigned char *buf = | |
172 | __builtin_alloca(dfa->mb_cur_max), *p; | |
173 | } | |
/*
 * Walk 256 bits, one unsigned long word at a time; `ch` counts the bit
 * position across words and is the character value passed on.
 * NOTE(review): this reads opr.sbcset for a CHARACTER node, whereas
 * re_compile_internal reads opr.c for the same node type - confirm which
 * union member is valid here.
 */
174 | int i; | |
175 | int ch; | |
176 | for (i = 0, ch = 0; | |
177 | i < (256 / (sizeof(unsigned long int) * 8)); ++i) { | |
178 | int j; | |
179 | unsigned long int w = | |
180 | dfa->nodes[node].opr.sbcset[i]; | |
181 | for (j = 0; j < (sizeof(unsigned long int) * 8); | |
182 | ++j, ++ch) | |
183 | if (w & ((unsigned long int)1 << j)) | |
184 | re_set_fastmap(fastmap, icase, | |
185 | ch); | |
186 | } | |
187 | } else if (type == COMPLEX_BRACKET) { | |
188 | int i; | |
189 | re_charset_t *cset = dfa->nodes[node].opr.mbcset; | |
190 | for (i = 0; i < cset->nmbchars; ++i) { | |
191 | char buf[256]; | |
/* NOTE(review): `state` is passed to __wcrtomb() without being initialized. */
192 | mbstate_t state; | |
/* Mark the first byte of the multibyte encoding when conversion succeeds. */
193 | if (__wcrtomb(buf, cset->mbchars[i], &state) != | |
194 | (unsigned int)-1) | |
195 | re_set_fastmap(fastmap, icase, | |
196 | *(unsigned char *)buf); | |
/*
 * With the syntax bit set in a multibyte locale, also convert the
 * towlower() form of the character and mark its first byte, this time
 * passing icase == 0 to re_set_fastmap().
 */
197 | if ((bufp-> | |
198 | syntax & | |
199 | ((((((((((((((((((((((((unsigned long int) | |
200 | 1) << 1) << 1) << 1) | |
201 | << 1) << 1) << 1) << 1) | |
202 | << 1) << 1) << 1) << 1) << | |
203 | 1) << 1) << 1) << 1) << 1) << 1) | |
204 | << 1) << 1) << 1) << 1) << 1)) | |
205 | && dfa->mb_cur_max > 1) { | |
206 | if (__wcrtomb | |
207 | (buf, towlower(cset->mbchars[i]), | |
208 | &state) != (unsigned int)-1) | |
209 | re_set_fastmap(fastmap, 0, | |
210 | *(unsigned char | |
211 | *)buf); | |
212 | } | |
213 | } | |
214 | } | |
215 | } | |
216 | } | |
/*
 * File-local const bitset with no initializer.  bitset_t is a pointer type in
 * this file, so this object is a null pointer; re_compile_internal() assigns
 * it to dfa->sb_char for UTF-8 locales.
 */
217 | static const bitset_t utf8_sb_map; | |
/*
 * Core of pattern compilation as far as it is visible here: resets the
 * status bits of `preg`, runs analyze(), detects a UTF-8 locale, sets up the
 * single-byte character bitmap (dfa->sb_char) for multibyte locales and
 * finally derives dfa->has_mb_node from the node list.
 *
 * preg:    pattern object being compiled.
 * pattern: pattern text (not referenced in the visible body).
 * length:  pattern length (not referenced in the visible body).
 * syntax:  syntax option bits.
 *
 * Returns REG_ESPACE when the sb_char allocation fails.
 *
 * NOTE(review): several defects are visible and should be confirmed against
 * the intended code before this function is relied on:
 *  - `dfa` is declared but never assigned before it is dereferenced;
 *  - the result of analyze() stored in `err` is never checked;
 *  - one path uses `return;` without a value and the function ends without a
 *    return statement, although it returns reg_errcode_t;
 *  - optimize_utf8(), strlen(), calloc() and __btowc() are not declared in
 *    the visible part of this file.
 */
218 | static reg_errcode_t re_compile_internal(regex_t * preg, const char *pattern, | |
219 | unsigned int length, | |
220 | reg_syntax_t syntax) | |
221 | { | |
222 | reg_errcode_t err = REG_NOERROR; | |
223 | re_dfa_t *dfa; | |
224 | preg->fastmap_accurate = 0; | |
225 | preg->not_bol = preg->not_eol = 0; | |
226 | err = analyze(preg); | |
227 | if (dfa->is_utf8 | |
228 | && !(syntax & | |
229 | ((((((((((((((((((((((((unsigned long int)1) << 1) << 1) << 1) | |
230 | << 1) << 1) << 1) << 1) << 1) << 1) << 1) << | |
231 | 1) << 1) << 1) << 1) << 1) << 1) << 1) << 1) << 1) | |
232 | << 1) << 1) << 1)) && preg->translate == 0) | |
233 | optimize_utf8(dfa); | |
/*
 * The statement expression below is a fully expanded string comparison of
 * the locale's codeset name against "UTF-8"; when mb_cur_max is 6 and the
 * comparison yields 0, the DFA is flagged as UTF-8.
 */
234 | if (dfa->mb_cur_max == 6 && __extension__( { | |
235 | unsigned int __s1_len, | |
236 | __s2_len; | |
237 | (__builtin_constant_p | |
238 | (((*_nl_current_LC_CTYPE)-> | |
239 | values[((int) | |
240 | (_NL_CTYPE_CODESET_NAME) | |
241 | & 0xffff)].string)) | |
242 | && | |
243 | __builtin_constant_p("UTF-8") | |
244 | && (__s1_len = | |
245 | strlen(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)), __s2_len = strlen("UTF-8"), (!((unsigned int)(const void *)((((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) + 1) - (unsigned int)(const void *)(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) == 1) || __s1_len >= 4) && (!((unsigned int)(const void *)(("UTF-8") + 1) - (unsigned int)(const void *)("UTF-8") == 1) || __s2_len >= 4)) ? __builtin_strcmp(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string), "UTF-8") : (__builtin_constant_p(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) && ((unsigned int)(const void *)((((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) + 1) - (unsigned int)(const void *)(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) == 1) && (__s1_len = strlen(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)), __s1_len < 4) ? (__builtin_constant_p("UTF-8") && ((unsigned int)(const void *)(("UTF-8") + 1) - (unsigned int)(const void *)("UTF-8") == 1) ? __builtin_strcmp(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string), "UTF-8") : (__extension__( { | |
246 | __const | |
247 | unsigned | |
248 | char | |
249 | *__s2 | |
250 | = | |
251 | (__const | |
252 | unsigned | |
253 | char | |
254 | *) | |
255 | (__const | |
256 | char | |
257 | *) | |
258 | ("UTF-8"); | |
259 | register | |
260 | int | |
261 | __result | |
262 | = | |
263 | (((__const unsigned char *)(__const char *)(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)))[0] - __s2[0]); __result;} | |
264 | ))): (__builtin_constant_p("UTF-8") && ((unsigned int)(const void *)(("UTF-8") + 1) - (unsigned int)(const void *)("UTF-8") == 1) && (__s2_len = strlen("UTF-8"), __s2_len < 4) ? (__builtin_constant_p(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) && ((unsigned int)(const void *)((((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) + 1) - (unsigned int)(const void *)(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)) == 1) ? __builtin_strcmp(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string), "UTF-8") : (__extension__( { | |
265 | __const | |
266 | unsigned | |
267 | char | |
268 | *__s1 | |
269 | = | |
270 | (__const | |
271 | unsigned | |
272 | char | |
273 | *) | |
274 | (__const | |
275 | char | |
276 | *) | |
277 | (((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string)); register int __result = __s1[0] - ((__const unsigned char *)(__const char *)("UTF-8"))[0]; if (__s2_len > 0 && __result == 0) { | |
278 | } | |
279 | __result;} | |
280 | ))): __builtin_strcmp(((*_nl_current_LC_CTYPE)->values[((int)(_NL_CTYPE_CODESET_NAME) & 0xffff)].string), "UTF-8"))));} | |
281 | ) == 0) | |
282 | dfa->is_utf8 = 1; | |
/* map_notascii mirrors whether the locale's MAP_TO_NONASCII word is non-zero. */
283 | dfa->map_notascii = | |
284 | (((unsigned int)(*_nl_current_LC_CTYPE)-> | |
285 | values[((int)(_NL_CTYPE_MAP_TO_NONASCII) & 0xffff)].word) != 0); | |
286 | if (dfa->mb_cur_max > 1) { | |
287 | if (dfa->is_utf8) | |
288 | dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; | |
289 | else { | |
290 | int i, j, ch; | |
/*
 * NOTE(review): the allocation is sizeof(bitset_t) bytes, i.e. the size of a
 * pointer, while the loop below writes 256 / (bits per unsigned long) words
 * into it - on common ABIs that is more than was allocated.  Confirm the
 * intended size.
 */
291 | dfa->sb_char = | |
292 | (re_bitset_ptr_t) calloc(sizeof(bitset_t), 1); | |
293 | if (__builtin_expect(dfa->sb_char == 0, 0)) | |
294 | return REG_ESPACE; | |
/* Set bit `ch` for every byte value that __btowc() does not map to 0xffffffff. */
295 | for (i = 0, ch = 0; | |
296 | i < (256 / (sizeof(unsigned long int) * 8)); ++i) | |
297 | for (j = 0; | |
298 | j < (sizeof(unsigned long int) * 8); | |
299 | ++j, ++ch) { | |
300 | unsigned int wch = __btowc(ch); | |
301 | if (wch != (0xffffffffu)) | |
302 | dfa->sb_char[i] |= | |
303 | (unsigned long int)1 << j; | |
304 | } | |
305 | } | |
306 | } | |
/*
 * Scan every node.  NOTE(review): the switch has no `break` statements, so
 * CHARACTER falls through ANCHOR into OP_PERIOD; `mb_chars` is set but never
 * read; and the `return;` below yields no value from a non-void function.
 */
307 | int node, i, mb_chars = 0, has_period = 0; | |
308 | for (node = 0; node < dfa->nodes_len; ++node) | |
309 | switch (dfa->nodes[node].type) { | |
310 | case CHARACTER: | |
311 | if (dfa->nodes[node].opr.c >= 0x80) | |
312 | mb_chars = 1; | |
313 | case ANCHOR: | |
314 | switch (dfa->nodes[node].opr.idx) { | |
315 | } | |
316 | case OP_PERIOD: | |
317 | has_period = 1; | |
318 | for (i = 0x80 / (sizeof(unsigned long int) * 8); | |
319 | i < (256 / (sizeof(unsigned long int) * 8)); ++i) | |
320 | if (dfa->nodes[node].opr.sbcset[i]) | |
321 | return; | |
322 | } | |
323 | dfa->has_mb_node = dfa->nbackref > 0 || has_period; | |
324 | } | |
325 | static reg_errcode_t analyze(regex_t * preg) | |
326 | { | |
327 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
328 | if (__builtin_expect | |
329 | (dfa->nexts == 0 || dfa->org_indices == 0 || dfa->edests == 0 | |
330 | || dfa->eclosures == 0, 0)) | |
331 | return REG_ESPACE; | |
332 | if (dfa->subexp_map != 0) { | |
333 | int i; | |
334 | for (i = 0; i < preg->re_nsub; i++) | |
335 | dfa->subexp_map[i] = i; | |
336 | preorder(dfa->str_tree, optimize_subexps, dfa); | |
337 | for (i = 0; i < preg->re_nsub; i++) | |
338 | if (dfa->subexp_map[i] != i) | |
339 | break; | |
340 | } | |
341 | } |