1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <assert.h>
21 #include <ctype.h>
22 #include <limits.h>
23 #include <locale.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27
28 #if defined(_WIN32) && !defined (MB_CUR_MAX)
29 #define MB_CUR_MAX 2
30 #endif
31
32 #if defined HAVE_WCHAR_H || defined _LIBC
33 # include <wchar.h>
34 #endif /* HAVE_WCHAR_H || _LIBC */
35 #if defined HAVE_WCTYPE_H || defined _LIBC
36 # include <wctype.h>
37 #endif /* HAVE_WCTYPE_H || _LIBC */
38
39 /* In case that the system doesn't have isblank(). */
40 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
41 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 #endif
43
44 #ifdef _LIBC
45 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
46 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
47 # include <locale/localeinfo.h>
48 # include <locale/elem-hash.h>
49 # include <locale/coll-lookup.h>
50 # endif
51 #endif
52
53 /* This is for other GNU distributions with internationalized messages. */
54 #if HAVE_LIBINTL_H || defined _LIBC
55 # include <libintl.h>
56 # ifdef _LIBC
57 # undef gettext
58 # define gettext(msgid) \
59 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
60 # endif
61 #else
62 # define gettext(msgid) (msgid)
63 #endif
64
65 #ifndef gettext_noop
66 /* This define is so xgettext can find the internationalizable
67 strings. */
68 # define gettext_noop(String) String
69 #endif
70
71 #include "_regex.h" /* gnupg */
72 #include "regex_internal.h"
73
74 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
75 int length, reg_syntax_t syntax);
76 static void re_compile_fastmap_iter (regex_t *bufp,
77 const re_dfastate_t *init_state,
78 char *fastmap);
79 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
80 static reg_errcode_t init_word_char (re_dfa_t *dfa);
81 #ifdef RE_ENABLE_I18N
82 static void free_charset (re_charset_t *cset);
83 #endif /* RE_ENABLE_I18N */
84 static void free_workarea_compile (regex_t *preg);
85 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
86 static reg_errcode_t analyze (re_dfa_t *dfa);
87 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
88 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
89 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
90 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92 unsigned int constraint);
93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
95 int node, int root);
96 static void calc_inveclosure (re_dfa_t *dfa);
97 static int fetch_number (re_string_t *input, re_token_t *token,
98 reg_syntax_t syntax);
99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
100 static int peek_token (re_token_t *token, re_string_t *input,
101 reg_syntax_t syntax);
102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
103 reg_syntax_t syntax);
104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
105 reg_syntax_t syntax, reg_errcode_t *err);
106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
116 re_token_t *token, reg_syntax_t syntax,
117 int nest, reg_errcode_t *err);
118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
119 re_dfa_t *dfa, re_token_t *token,
120 reg_syntax_t syntax, reg_errcode_t *err);
121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
122 re_token_t *token, reg_syntax_t syntax,
123 reg_errcode_t *err);
124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
125 re_string_t *regexp,
126 re_token_t *token, int token_len,
127 re_dfa_t *dfa,
128 reg_syntax_t syntax);
129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
130 re_string_t *regexp,
131 re_token_t *token);
132 #ifndef _LIBC
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
135 re_charset_t *mbcset, int *range_alloc,
136 bracket_elem_t *start_elem,
137 bracket_elem_t *end_elem);
138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139 re_charset_t *mbcset,
140 int *coll_syxmalloc,
141 const unsigned char *name);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
144 bracket_elem_t *start_elem,
145 bracket_elem_t *end_elem);
146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
147 const unsigned char *name);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
152 re_charset_t *mbcset,
153 int *equiv_class_alloc,
154 const unsigned char *name);
155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156 re_charset_t *mbcset,
157 int *char_class_alloc,
158 const unsigned char *class_name,
159 reg_syntax_t syntax);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
162 const unsigned char *name);
163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
164 const unsigned char *class_name,
165 reg_syntax_t syntax);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
168 static void free_bin_tree (bin_tree_t *tree);
169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
170 re_token_type_t type, int index);
171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
172
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
177
178 const char __re_error_msgid[] attribute_hidden =
179 {
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
182 "\0"
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
185 "\0"
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
188 "\0"
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
191 "\0"
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
194 "\0"
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
197 "\0"
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
200 "\0"
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
203 "\0"
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
206 "\0"
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
209 "\0"
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
212 "\0"
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
215 "\0"
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
218 "\0"
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
221 "\0"
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
224 "\0"
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
227 "\0"
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
230 };
231
232 const size_t __re_error_msgid_idx[] attribute_hidden =
233 {
234 REG_NOERROR_IDX,
235 REG_NOMATCH_IDX,
236 REG_BADPAT_IDX,
237 REG_ECOLLATE_IDX,
238 REG_ECTYPE_IDX,
239 REG_EESCAPE_IDX,
240 REG_ESUBREG_IDX,
241 REG_EBRACK_IDX,
242 REG_EPAREN_IDX,
243 REG_EBRACE_IDX,
244 REG_BADBR_IDX,
245 REG_ERANGE_IDX,
246 REG_ESPACE_IDX,
247 REG_BADRPT_IDX,
248 REG_EEND_IDX,
249 REG_ESIZE_IDX,
250 REG_ERPAREN_IDX
251 };
252
253 /* Entry points for GNU code. */
254
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
258
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
261
262 const char *
re_compile_pattern(pattern,length,bufp)263 re_compile_pattern (pattern, length, bufp)
264 const char *pattern;
265 size_t length;
266 struct re_pattern_buffer *bufp;
267 {
268 reg_errcode_t ret;
269
270 /* GNU code is written to assume at least RE_NREGS registers will be set
271 (and at least one extra will be -1). */
272 bufp->regs_allocated = REGS_UNALLOCATED;
273
274 /* And GNU code determines whether or not to get register information
275 by passing null for the REGS argument to re_match, etc., not by
276 setting no_sub. */
277 bufp->no_sub = 0;
278
279 /* Match anchors at newline. */
280 bufp->newline_anchor = 1;
281
282 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
283
284 if (!ret)
285 return NULL;
286 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
287 }
288 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)289 weak_alias (__re_compile_pattern, re_compile_pattern)
290 #endif
291
292 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
293 also be assigned to arbitrarily: each pattern buffer stores its own
294 syntax, so it can be changed between regex compilations. */
295 /* This has no initializer because initialized variables in Emacs
296 become read-only after dumping. */
297 reg_syntax_t re_syntax_options;
298
299
300 /* Specify the precise syntax of regexps for compilation. This provides
301 for compatibility for various utilities which historically have
302 different, incompatible syntaxes.
303
304 The argument SYNTAX is a bit mask comprised of the various bits
305 defined in regex.h. We return the old syntax. */
306
307 reg_syntax_t
308 re_set_syntax (syntax)
309 reg_syntax_t syntax;
310 {
311 reg_syntax_t ret = re_syntax_options;
312
313 re_syntax_options = syntax;
314 return ret;
315 }
316 #ifdef _LIBC
317 weak_alias (__re_set_syntax, re_set_syntax)
318 #endif
319
320 int
321 re_compile_fastmap (bufp)
322 struct re_pattern_buffer *bufp;
323 {
324 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 char *fastmap = bufp->fastmap;
326
327 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
328 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
329 if (dfa->init_state != dfa->init_state_word)
330 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
331 if (dfa->init_state != dfa->init_state_nl)
332 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
333 if (dfa->init_state != dfa->init_state_begbuf)
334 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
335 bufp->fastmap_accurate = 1;
336 return 0;
337 }
338 #ifdef _LIBC
339 weak_alias (__re_compile_fastmap, re_compile_fastmap)
340 #endif
341
342 /* Helper function for re_compile_fastmap.
343 Compile fastmap for the initial_state INIT_STATE. */
344
345 static void
346 re_compile_fastmap_iter (bufp, init_state, fastmap)
347 regex_t *bufp;
348 const re_dfastate_t *init_state;
349 char *fastmap;
350 {
351 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
352 int node_cnt;
353 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
354 {
355 int node = init_state->nodes.elems[node_cnt];
356 re_token_type_t type = dfa->nodes[node].type;
357 if (type == OP_CONTEXT_NODE)
358 {
359 node = dfa->nodes[node].opr.ctx_info->entity;
360 type = dfa->nodes[node].type;
361 }
362
363 if (type == CHARACTER)
364 fastmap[dfa->nodes[node].opr.c] = 1;
365 else if (type == SIMPLE_BRACKET)
366 {
367 int i, j, ch;
368 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
369 for (j = 0; j < UINT_BITS; ++j, ++ch)
370 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
371 fastmap[ch] = 1;
372 }
373 #ifdef RE_ENABLE_I18N
374 else if (type == COMPLEX_BRACKET)
375 {
376 int i;
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
379 || cset->nranges || cset->nchar_classes)
380 {
381 # ifdef _LIBC
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
383 {
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
390 int j, ch;
391 const int32_t *table = (const int32_t *)
392 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
393 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
394 for (j = 0; j < UINT_BITS; ++j, ++ch)
395 if (table[ch] < 0)
396 fastmap[ch] = 1;
397 }
398 # else
399 if (MB_CUR_MAX > 1)
400 for (i = 0; i < SBC_MAX; ++i)
401 if (__btowc (i) == WEOF)
402 fastmap[i] = 1;
403 # endif /* not _LIBC */
404 }
405 for (i = 0; i < cset->nmbchars; ++i)
406 {
407 char buf[256];
408 wctomb (buf, cset->mbchars[i]);
409 fastmap[*(unsigned char *) buf] = 1;
410 }
411 }
412 #endif /* RE_ENABLE_I18N */
413 else if (type == END_OF_RE || type == OP_PERIOD
414 #ifdef RE_ENABLE_I18N
415 || type == COMPLEX_BRACKET
416 #endif /* RE_ENABLE_I18N */
417 )
418 {
419 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
420 if (type == END_OF_RE)
421 bufp->can_be_null = 1;
422 return;
423 }
424 }
425 }
426
427 /* Entry point for POSIX code. */
428 /* regcomp takes a regular expression as a string and compiles it.
429
430 PREG is a regex_t *. We do not expect any fields to be initialized,
431 since POSIX says we shouldn't. Thus, we set
432
433 `buffer' to the compiled pattern;
434 `used' to the length of the compiled pattern;
435 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436 REG_EXTENDED bit in CFLAGS is set; otherwise, to
437 RE_SYNTAX_POSIX_BASIC;
438 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
439 `fastmap' to an allocated space for the fastmap;
440 `fastmap_accurate' to zero;
441 `re_nsub' to the number of subexpressions in PATTERN.
442
443 PATTERN is the address of the pattern string.
444
445 CFLAGS is a series of bits which affect compilation.
446
447 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448 use POSIX basic syntax.
449
450 If REG_NEWLINE is set, then . and [^...] don't match newline.
451 Also, regexec will try a match beginning after every newline.
452
453 If REG_ICASE is set, then we considers upper- and lowercase
454 versions of letters to be equivalent when matching.
455
456 If REG_NOSUB is set, then when PREG is passed to regexec, that
457 routine will report only success or failure, and nothing about the
458 registers.
459
460 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461 the return codes and their meanings.) */
462
463 int
regcomp(preg,pattern,cflags)464 regcomp (preg, pattern, cflags)
465 regex_t *__restrict preg;
466 const char *__restrict pattern;
467 int cflags;
468 {
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
472
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
476
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
481
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
491 }
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
496
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
503
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 {
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. */
509 if (BE (re_compile_fastmap (preg) == -2, 0))
510 {
511 /* Some error occurred while computing the fastmap, just forget
512 about it. */
513 re_free (preg->fastmap);
514 preg->fastmap = NULL;
515 }
516 }
517
518 return (int) ret;
519 }
520 #ifdef _LIBC
521 weak_alias (__regcomp, regcomp)
522 #endif
523
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
526
527 size_t
528 regerror (errcode, preg, errbuf, errbuf_size)
529 int errcode;
530 const regex_t *preg;
531 char *errbuf;
532 size_t errbuf_size;
533 {
534 const char *msg;
535 size_t msg_size;
536
537 if (BE (errcode < 0
538 || errcode >= (int) (sizeof (__re_error_msgid_idx)
539 / sizeof (__re_error_msgid_idx[0])), 0))
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
544 abort ();
545
546 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
547
548 msg_size = strlen (msg) + 1; /* Includes the null. */
549
550 if (BE (errbuf_size != 0, 1))
551 {
552 if (BE (msg_size > errbuf_size, 0))
553 {
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556 #else
557 memcpy (errbuf, msg, errbuf_size - 1);
558 errbuf[errbuf_size - 1] = 0;
559 #endif
560 }
561 else
562 memcpy (errbuf, msg, msg_size);
563 }
564
565 return msg_size;
566 }
567 #ifdef _LIBC
568 weak_alias (__regerror, regerror)
569 #endif
570
571 /* Free dynamically allocated space used by PREG. */
572
573 void
574 regfree (preg)
575 regex_t *preg;
576 {
577 int i, j;
578 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
579 if (BE (dfa != NULL, 1))
580 {
581 re_free (dfa->subexps);
582
583 for (i = 0; i < dfa->nodes_len; ++i)
584 {
585 re_token_t *node = dfa->nodes + i;
586 #ifdef RE_ENABLE_I18N
587 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
588 free_charset (node->opr.mbcset);
589 else
590 #endif /* RE_ENABLE_I18N */
591 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
592 re_free (node->opr.sbcset);
593 else if (node->type == OP_CONTEXT_NODE)
594 {
595 if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
596 {
597 if (node->opr.ctx_info->bkref_eclosure != NULL)
598 re_node_set_free (node->opr.ctx_info->bkref_eclosure);
599 re_free (node->opr.ctx_info->bkref_eclosure);
600 }
601 re_free (node->opr.ctx_info);
602 }
603 }
604 re_free (dfa->firsts);
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
607 {
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
614 }
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
619
620 for (i = 0; i <= dfa->state_hash_mask; ++i)
621 {
622 struct re_state_table_entry *entry = dfa->state_table + i;
623 for (j = 0; j < entry->num; ++j)
624 {
625 re_dfastate_t *state = entry->array[j];
626 if (state->entrance_nodes != &state->nodes)
627 {
628 re_node_set_free (state->entrance_nodes);
629 re_free (state->entrance_nodes);
630 }
631 re_node_set_free (&state->nodes);
632 re_free (state->trtable);
633 re_free (state->trtable_search);
634 re_free (state);
635 }
636 re_free (entry->array);
637 }
638 re_free (dfa->state_table);
639
640 if (dfa->word_char != NULL)
641 re_free (dfa->word_char);
642 #ifdef DEBUG
643 re_free (dfa->re_str);
644 #endif
645 re_free (dfa);
646 }
647 re_free (preg->fastmap);
648 }
649 #ifdef _LIBC
650 weak_alias (__regfree, regfree)
651 #endif
652
653 /* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
655
656 #if defined _REGEX_RE_COMP || defined _LIBC
657
658 /* BSD has one and only one pattern buffer. */
659 static struct re_pattern_buffer re_comp_buf;
660
661 char *
662 # ifdef _LIBC
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666 weak_function
667 # endif
re_comp(s)668 re_comp (s)
669 const char *s;
670 {
671 reg_errcode_t ret;
672
673 if (!s)
674 {
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
678 }
679
680 if (!re_comp_buf.buffer)
681 {
682 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
683 if (re_comp_buf.fastmap == NULL)
684 return (char *) gettext (__re_error_msgid
685 + __re_error_msgid_idx[(int) REG_ESPACE]);
686 }
687
688 /* Since `re_exec' always passes NULL for the `regs' argument, we
689 don't need to initialize the pattern buffer fields which affect it. */
690
691 /* Match anchors at newlines. */
692 re_comp_buf.newline_anchor = 1;
693
694 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
695
696 if (!ret)
697 return NULL;
698
699 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
700 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
701 }
702 #endif /* _REGEX_RE_COMP */
703
704 /* Internal entry point.
705 Compile the regular expression PATTERN, whose length is LENGTH.
706 SYNTAX indicate regular expression's syntax. */
707
708 static reg_errcode_t
re_compile_internal(preg,pattern,length,syntax)709 re_compile_internal (preg, pattern, length, syntax)
710 regex_t *preg;
711 const char * pattern;
712 int length;
713 reg_syntax_t syntax;
714 {
715 reg_errcode_t err = REG_NOERROR;
716 re_dfa_t *dfa;
717 re_string_t regexp;
718
719 /* Initialize the pattern buffer. */
720 preg->fastmap_accurate = 0;
721 preg->syntax = syntax;
722 preg->not_bol = preg->not_eol = 0;
723 preg->used = 0;
724 preg->re_nsub = 0;
725
726 /* Initialize the dfa. */
727 dfa = (re_dfa_t *) preg->buffer;
728 if (preg->allocated < sizeof (re_dfa_t))
729 {
730 /* If zero allocated, but buffer is non-null, try to realloc
731 enough space. This loses if buffer's address is bogus, but
732 that is the user's responsibility. If ->buffer is NULL this
733 is a simple allocation. */
734 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
735 if (dfa == NULL)
736 return REG_ESPACE;
737 preg->allocated = sizeof (re_dfa_t);
738 }
739 preg->buffer = (unsigned char *) dfa;
740 preg->used = sizeof (re_dfa_t);
741
742 err = init_dfa (dfa, length);
743 if (BE (err != REG_NOERROR, 0))
744 {
745 re_free (dfa);
746 preg->buffer = NULL;
747 return err;
748 }
749 #ifdef DEBUG
750 dfa->re_str = re_malloc (char, length + 1);
751 strncpy (dfa->re_str, pattern, length + 1);
752 #endif
753
754 err = re_string_construct (®exp, pattern, length, preg->translate,
755 syntax & RE_ICASE);
756 if (BE (err != REG_NOERROR, 0))
757 {
758 re_free (dfa);
759 preg->buffer = NULL;
760 return err;
761 }
762
763 /* Parse the regular expression, and build a structure tree. */
764 preg->re_nsub = 0;
765 dfa->str_tree = parse (®exp, preg, syntax, &err);
766 if (BE (dfa->str_tree == NULL, 0))
767 goto re_compile_internal_free_return;
768
769 /* Analyze the tree and collect information which is necessary to
770 create the dfa. */
771 err = analyze (dfa);
772 if (BE (err != REG_NOERROR, 0))
773 goto re_compile_internal_free_return;
774
775 /* Then create the initial state of the dfa. */
776 err = create_initial_state (dfa);
777 if (BE (err != REG_NOERROR, 0))
778 goto re_compile_internal_free_return;
779
780 re_compile_internal_free_return:
781 /* Release work areas. */
782 free_workarea_compile (preg);
783 re_string_destruct (®exp);
784
785 return err;
786 }
787
788 /* Initialize DFA. We use the length of the regular expression PAT_LEN
789 as the initial length of some arrays. */
790
791 static reg_errcode_t
init_dfa(dfa,pat_len)792 init_dfa (dfa, pat_len)
793 re_dfa_t *dfa;
794 int pat_len;
795 {
796 int table_size;
797
798 memset (dfa, '\0', sizeof (re_dfa_t));
799
800 dfa->nodes_alloc = pat_len + 1;
801 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
802
803 dfa->states_alloc = pat_len + 1;
804
805 /* table_size = 2 ^ ceil(log pat_len) */
806 for (table_size = 1; table_size > 0; table_size <<= 1)
807 if (table_size > pat_len)
808 break;
809
810 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
811 dfa->state_hash_mask = table_size - 1;
812
813 dfa->subexps_alloc = 1;
814 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
815 dfa->word_char = NULL;
816
817 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
818 || dfa->subexps == NULL, 0))
819 {
820 /* We don't bother to free anything which was allocated. Very
821 soon the process will go down anyway. */
822 dfa->subexps = NULL;
823 dfa->state_table = NULL;
824 dfa->nodes = NULL;
825 return REG_ESPACE;
826 }
827 return REG_NOERROR;
828 }
829
830 /* Initialize WORD_CHAR table, which indicate which character is
831 "word". In this case "word" means that it is the word construction
832 character used by some operators like "\<", "\>", etc. */
833
834 static reg_errcode_t
init_word_char(dfa)835 init_word_char (dfa)
836 re_dfa_t *dfa;
837 {
838 int i, j, ch;
839 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
840 if (BE (dfa->word_char == NULL, 0))
841 return REG_ESPACE;
842 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
843 for (j = 0; j < UINT_BITS; ++j, ++ch)
844 if (isalnum (ch) || ch == '_')
845 dfa->word_char[i] |= 1 << j;
846 return REG_NOERROR;
847 }
848
849 /* Free the work area which are only used while compiling. */
850
851 static void
free_workarea_compile(preg)852 free_workarea_compile (preg)
853 regex_t *preg;
854 {
855 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
856 free_bin_tree (dfa->str_tree);
857 dfa->str_tree = NULL;
858 }
859
860 /* Create initial states for all contexts. */
861
862 static reg_errcode_t
create_initial_state(dfa)863 create_initial_state (dfa)
864 re_dfa_t *dfa;
865 {
866 int first, i;
867 reg_errcode_t err;
868 re_node_set init_nodes;
869
870 /* Initial states have the epsilon closure of the node which is
871 the first node of the regular expression. */
872 first = dfa->str_tree->first;
873 dfa->init_node = first;
874 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
875 if (BE (err != REG_NOERROR, 0))
876 return err;
877
878 /* The back-references which are in initial states can epsilon transit,
879 since in this case all of the subexpressions can be null.
880 Then we add epsilon closures of the nodes which are the next nodes of
881 the back-references. */
882 if (dfa->nbackref > 0)
883 for (i = 0; i < init_nodes.nelem; ++i)
884 {
885 int node_idx = init_nodes.elems[i];
886 re_token_type_t type = dfa->nodes[node_idx].type;
887
888 int clexp_idx;
889 int entity = (type != OP_CONTEXT_NODE ? node_idx
890 : dfa->nodes[node_idx].opr.ctx_info->entity);
891 if ((type != OP_CONTEXT_NODE
892 || (dfa->nodes[entity].type != OP_BACK_REF))
893 && (type != OP_BACK_REF))
894 continue;
895 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
896 {
897 re_token_t *clexp_node;
898 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
899 if (clexp_node->type == OP_CLOSE_SUBEXP
900 && clexp_node->opr.idx + 1 == dfa->nodes[entity].opr.idx)
901 break;
902 }
903 if (clexp_idx == init_nodes.nelem)
904 continue;
905
906 if (type == OP_CONTEXT_NODE
907 && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
908 == OP_BACK_REF))
909 {
910 int prev_nelem = init_nodes.nelem;
911 re_node_set_merge (&init_nodes,
912 dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
913 if (prev_nelem < init_nodes.nelem)
914 i = 0;
915 }
916 else if (type == OP_BACK_REF)
917 {
918 int next_idx = dfa->nexts[node_idx];
919 if (!re_node_set_contains (&init_nodes, next_idx))
920 {
921 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
922 i = 0;
923 }
924 }
925 }
926
927 /* It must be the first time to invoke acquire_state. */
928 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
929 /* We don't check ERR here, since the initial state must not be NULL. */
930 if (BE (dfa->init_state == NULL, 0))
931 return err;
932 if (dfa->init_state->has_constraint)
933 {
934 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
935 CONTEXT_WORD);
936 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
937 CONTEXT_NEWLINE);
938 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
939 &init_nodes,
940 CONTEXT_NEWLINE
941 | CONTEXT_BEGBUF);
942 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
943 || dfa->init_state_begbuf == NULL, 0))
944 return err;
945 }
946 else
947 dfa->init_state_word = dfa->init_state_nl
948 = dfa->init_state_begbuf = dfa->init_state;
949
950 re_node_set_free (&init_nodes);
951 return REG_NOERROR;
952 }
953
954 /* Analyze the structure tree, and calculate "first", "next", "edest",
955 "eclosure", and "inveclosure". */
956
957 static reg_errcode_t
analyze(dfa)958 analyze (dfa)
959 re_dfa_t *dfa;
960 {
961 int i;
962 reg_errcode_t ret;
963
964 /* Allocate arrays. */
965 dfa->firsts = re_malloc (int, dfa->nodes_alloc);
966 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
967 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
968 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
969 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
970 if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
971 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
972 return REG_ESPACE;
973 /* Initialize them. */
974 for (i = 0; i < dfa->nodes_len; ++i)
975 {
976 dfa->firsts[i] = -1;
977 dfa->nexts[i] = -1;
978 re_node_set_init_empty (dfa->edests + i);
979 re_node_set_init_empty (dfa->eclosures + i);
980 re_node_set_init_empty (dfa->inveclosures + i);
981 }
982
983 ret = analyze_tree (dfa, dfa->str_tree);
984 if (BE (ret == REG_NOERROR, 1))
985 {
986 ret = calc_eclosure (dfa);
987 if (ret == REG_NOERROR)
988 calc_inveclosure (dfa);
989 }
990 return ret;
991 }
992
993 /* Helper functions for analyze.
994 This function calculate "first", "next", and "edest" for the subtree
995 whose root is NODE. */
996
997 static reg_errcode_t
analyze_tree(dfa,node)998 analyze_tree (dfa, node)
999 re_dfa_t *dfa;
1000 bin_tree_t *node;
1001 {
1002 reg_errcode_t ret;
1003 if (node->first == -1)
1004 calc_first (dfa, node);
1005 if (node->next == -1)
1006 calc_next (dfa, node);
1007 if (node->eclosure.nelem == 0)
1008 calc_epsdest (dfa, node);
1009 /* Calculate "first" etc. for the left child. */
1010 if (node->left != NULL)
1011 {
1012 ret = analyze_tree (dfa, node->left);
1013 if (BE (ret != REG_NOERROR, 0))
1014 return ret;
1015 }
1016 /* Calculate "first" etc. for the right child. */
1017 if (node->right != NULL)
1018 {
1019 ret = analyze_tree (dfa, node->right);
1020 if (BE (ret != REG_NOERROR, 0))
1021 return ret;
1022 }
1023 return REG_NOERROR;
1024 }
1025
1026 /* Calculate "first" for the node NODE. */
1027 static void
calc_first(dfa,node)1028 calc_first (dfa, node)
1029 re_dfa_t *dfa;
1030 bin_tree_t *node;
1031 {
1032 int idx, type;
1033 idx = node->node_idx;
1034 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1035
1036 switch (type)
1037 {
1038 #ifdef DEBUG
1039 case OP_OPEN_BRACKET:
1040 case OP_CLOSE_BRACKET:
1041 case OP_OPEN_DUP_NUM:
1042 case OP_CLOSE_DUP_NUM:
1043 case OP_NON_MATCH_LIST:
1044 case OP_OPEN_COLL_ELEM:
1045 case OP_CLOSE_COLL_ELEM:
1046 case OP_OPEN_EQUIV_CLASS:
1047 case OP_CLOSE_EQUIV_CLASS:
1048 case OP_OPEN_CHAR_CLASS:
1049 case OP_CLOSE_CHAR_CLASS:
1050 /* These must not be appeared here. */
1051 assert (0);
1052 #endif
1053 case END_OF_RE:
1054 case CHARACTER:
1055 case OP_PERIOD:
1056 case OP_DUP_ASTERISK:
1057 case OP_DUP_QUESTION:
1058 #ifdef RE_ENABLE_I18N
1059 case COMPLEX_BRACKET:
1060 #endif /* RE_ENABLE_I18N */
1061 case SIMPLE_BRACKET:
1062 case OP_BACK_REF:
1063 case ANCHOR:
1064 case OP_OPEN_SUBEXP:
1065 case OP_CLOSE_SUBEXP:
1066 node->first = idx;
1067 break;
1068 case OP_DUP_PLUS:
1069 #ifdef DEBUG
1070 assert (node->left != NULL);
1071 #endif
1072 if (node->left->first == -1)
1073 calc_first (dfa, node->left);
1074 node->first = node->left->first;
1075 break;
1076 case OP_ALT:
1077 node->first = idx;
1078 break;
1079 /* else fall through */
1080 default:
1081 #ifdef DEBUG
1082 assert (node->left != NULL);
1083 #endif
1084 if (node->left->first == -1)
1085 calc_first (dfa, node->left);
1086 node->first = node->left->first;
1087 break;
1088 }
1089 if (node->type == 0)
1090 dfa->firsts[idx] = node->first;
1091 }
1092
1093 /* Calculate "next" for the node NODE. */
1094
1095 static void
calc_next(dfa,node)1096 calc_next (dfa, node)
1097 re_dfa_t *dfa;
1098 bin_tree_t *node;
1099 {
1100 int idx, type;
1101 bin_tree_t *parent = node->parent;
1102 if (parent == NULL)
1103 {
1104 node->next = -1;
1105 idx = node->node_idx;
1106 if (node->type == 0)
1107 dfa->nexts[idx] = node->next;
1108 return;
1109 }
1110
1111 idx = parent->node_idx;
1112 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1113
1114 switch (type)
1115 {
1116 case OP_DUP_ASTERISK:
1117 case OP_DUP_PLUS:
1118 node->next = idx;
1119 break;
1120 case CONCAT:
1121 if (parent->left == node)
1122 {
1123 if (parent->right->first == -1)
1124 calc_first (dfa, parent->right);
1125 node->next = parent->right->first;
1126 break;
1127 }
1128 /* else fall through */
1129 default:
1130 if (parent->next == -1)
1131 calc_next (dfa, parent);
1132 node->next = parent->next;
1133 break;
1134 }
1135 idx = node->node_idx;
1136 if (node->type == 0)
1137 dfa->nexts[idx] = node->next;
1138 }
1139
1140 /* Calculate "edest" for the node NODE. */
1141
1142 static void
calc_epsdest(dfa,node)1143 calc_epsdest (dfa, node)
1144 re_dfa_t *dfa;
1145 bin_tree_t *node;
1146 {
1147 int idx;
1148 idx = node->node_idx;
1149 if (node->type == 0)
1150 {
1151 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1152 || dfa->nodes[idx].type == OP_DUP_PLUS
1153 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1154 {
1155 if (node->left->first == -1)
1156 calc_first (dfa, node->left);
1157 if (node->next == -1)
1158 calc_next (dfa, node);
1159 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1160 node->next);
1161 }
1162 else if (dfa->nodes[idx].type == OP_ALT)
1163 {
1164 int left, right;
1165 if (node->left != NULL)
1166 {
1167 if (node->left->first == -1)
1168 calc_first (dfa, node->left);
1169 left = node->left->first;
1170 }
1171 else
1172 {
1173 if (node->next == -1)
1174 calc_next (dfa, node);
1175 left = node->next;
1176 }
1177 if (node->right != NULL)
1178 {
1179 if (node->right->first == -1)
1180 calc_first (dfa, node->right);
1181 right = node->right->first;
1182 }
1183 else
1184 {
1185 if (node->next == -1)
1186 calc_next (dfa, node);
1187 right = node->next;
1188 }
1189 re_node_set_init_2 (dfa->edests + idx, left, right);
1190 }
1191 else if (dfa->nodes[idx].type == ANCHOR
1192 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1193 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
1194 re_node_set_init_1 (dfa->edests + idx, node->next);
1195 }
1196 }
1197
1198 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1199 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1200 otherwise return the error code. */
1201
1202 static reg_errcode_t
duplicate_node(new_idx,dfa,org_idx,constraint)1203 duplicate_node (new_idx, dfa, org_idx, constraint)
1204 re_dfa_t *dfa;
1205 int *new_idx, org_idx;
1206 unsigned int constraint;
1207 {
1208 re_token_t dup;
1209 int dup_idx;
1210 reg_errcode_t err;
1211
1212 dup.type = OP_CONTEXT_NODE;
1213 if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1214 {
1215 /* If the node whose index is ORG_IDX is the same as the intended
1216 node, use it. */
1217 if (dfa->nodes[org_idx].constraint == constraint)
1218 {
1219 *new_idx = org_idx;
1220 return REG_NOERROR;
1221 }
1222 dup.constraint = constraint |
1223 dfa->nodes[org_idx].constraint;
1224 }
1225 else
1226 dup.constraint = constraint;
1227
1228 /* In case that `entity' points OP_CONTEXT_NODE,
1229 we correct `entity' to real entity in calc_inveclosures(). */
1230 dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
1231 dup_idx = re_dfa_add_node (dfa, dup, 1);
1232 if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
1233 return REG_ESPACE;
1234 dup.opr.ctx_info->entity = org_idx;
1235 dup.opr.ctx_info->bkref_eclosure = NULL;
1236
1237 dfa->nodes[dup_idx].duplicated = 1;
1238 dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1239 dfa->nexts[dup_idx] = dfa->nexts[org_idx];
1240 err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
1241 if (BE (err != REG_NOERROR, 0))
1242 return err;
1243 /* Since we don't duplicate epsilon nodes, epsilon closure have
1244 only itself. */
1245 err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
1246 if (BE (err != REG_NOERROR, 0))
1247 return err;
1248 err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
1249 if (BE (err != REG_NOERROR, 0))
1250 return err;
1251 /* Then we must update inveclosure for this node.
1252 We process them at last part of calc_eclosure(),
1253 since we don't complete to calculate them here. */
1254
1255 *new_idx = dup_idx;
1256 return REG_NOERROR;
1257 }
1258
1259 static void
calc_inveclosure(dfa)1260 calc_inveclosure (dfa)
1261 re_dfa_t *dfa;
1262 {
1263 int src, idx, dest, entity;
1264 for (src = 0; src < dfa->nodes_len; ++src)
1265 {
1266 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1267 {
1268 dest = dfa->eclosures[src].elems[idx];
1269 re_node_set_insert (dfa->inveclosures + dest, src);
1270 }
1271
1272 entity = src;
1273 while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1274 {
1275 entity = dfa->nodes[entity].opr.ctx_info->entity;
1276 re_node_set_merge (dfa->inveclosures + src,
1277 dfa->inveclosures + entity);
1278 dfa->nodes[src].opr.ctx_info->entity = entity;
1279 }
1280 }
1281 }
1282
1283 /* Calculate "eclosure" for all the node in DFA. */
1284
1285 static reg_errcode_t
calc_eclosure(dfa)1286 calc_eclosure (dfa)
1287 re_dfa_t *dfa;
1288 {
1289 int idx, node_idx, max, incomplete = 0;
1290 #ifdef DEBUG
1291 assert (dfa->nodes_len > 0);
1292 #endif
1293 /* For each nodes, calculate epsilon closure. */
1294 for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1295 {
1296 reg_errcode_t err;
1297 re_node_set eclosure_elem;
1298 if (node_idx == max)
1299 {
1300 if (!incomplete)
1301 break;
1302 incomplete = 0;
1303 node_idx = 0;
1304 }
1305
1306 #ifdef DEBUG
1307 assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1308 assert (dfa->eclosures[node_idx].nelem != -1);
1309 #endif
1310 /* If we have already calculated, skip it. */
1311 if (dfa->eclosures[node_idx].nelem != 0)
1312 continue;
1313 /* Calculate epsilon closure of `node_idx'. */
1314 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1315 if (BE (err != REG_NOERROR, 0))
1316 return err;
1317
1318 if (dfa->eclosures[node_idx].nelem == 0)
1319 {
1320 incomplete = 1;
1321 re_node_set_free (&eclosure_elem);
1322 }
1323 }
1324
1325 /* for duplicated nodes. */
1326 for (idx = max; idx < dfa->nodes_len; ++idx)
1327 {
1328 int entity, i, constraint;
1329 re_node_set *bkref_eclosure;
1330 entity = dfa->nodes[idx].opr.ctx_info->entity;
1331 re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1332 if (dfa->nodes[entity].type != OP_BACK_REF)
1333 continue;
1334
1335 /* If the node is backreference, duplicate the epsilon closure of
1336 the next node. Since it may epsilon transit. */
1337 /* Note: duplicate_node() may realloc dfa->eclosures, etc. */
1338 bkref_eclosure = re_malloc (re_node_set, 1);
1339 if (BE (bkref_eclosure == NULL, 0))
1340 return REG_ESPACE;
1341 re_node_set_init_empty (bkref_eclosure);
1342 constraint = dfa->nodes[idx].constraint;
1343 for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1344 {
1345 int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1346 if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
1347 {
1348 reg_errcode_t err;
1349 err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1350 constraint);
1351 if (BE (err != REG_NOERROR, 0))
1352 return err;
1353 }
1354 re_node_set_insert (bkref_eclosure, dest_node_idx);
1355 }
1356 dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1357 }
1358
1359 return REG_NOERROR;
1360 }
1361
1362 /* Calculate epsilon closure of NODE. */
1363
1364 static reg_errcode_t
calc_eclosure_iter(new_set,dfa,node,root)1365 calc_eclosure_iter (new_set, dfa, node, root)
1366 re_node_set *new_set;
1367 re_dfa_t *dfa;
1368 int node, root;
1369 {
1370 reg_errcode_t err;
1371 unsigned int constraint;
1372 int i, max, incomplete = 0;
1373 re_node_set eclosure;
1374 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1375 if (BE (err != REG_NOERROR, 0))
1376 return err;
1377
1378 /* This indicates that we are calculating this node now.
1379 We reference this value to avoid infinite loop. */
1380 dfa->eclosures[node].nelem = -1;
1381
1382 constraint = ((dfa->nodes[node].type == ANCHOR)
1383 ? dfa->nodes[node].opr.ctx_type : 0);
1384
1385 /* Expand each epsilon destination nodes. */
1386 if (dfa->edests[node].nelem != 0)
1387 for (i = 0; i < dfa->edests[node].nelem; ++i)
1388 {
1389 re_node_set eclosure_elem;
1390 int edest = dfa->edests[node].elems[i];
1391 /* If calculating the epsilon closure of `edest' is in progress,
1392 return intermediate result. */
1393 if (dfa->eclosures[edest].nelem == -1)
1394 {
1395 incomplete = 1;
1396 continue;
1397 }
1398 /* If we haven't calculated the epsilon closure of `edest' yet,
1399 calculate now. Otherwise use calculated epsilon closure. */
1400 if (dfa->eclosures[edest].nelem == 0)
1401 {
1402 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1403 if (BE (err != REG_NOERROR, 0))
1404 return err;
1405 }
1406 else
1407 eclosure_elem = dfa->eclosures[edest];
1408 /* Merge the epsilon closure of `edest'. */
1409 re_node_set_merge (&eclosure, &eclosure_elem);
1410 /* If the epsilon closure of `edest' is incomplete,
1411 the epsilon closure of this node is also incomplete. */
1412 if (dfa->eclosures[edest].nelem == 0)
1413 {
1414 incomplete = 1;
1415 re_node_set_free (&eclosure_elem);
1416 }
1417 }
1418
1419 /* If the current node has constraints, duplicate all non-epsilon nodes.
1420 Since they must inherit the constraints. */
1421 if (constraint)
1422 for (i = 0, max = eclosure.nelem; i < max; ++i)
1423 {
1424 int dest = eclosure.elems[i];
1425 if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1426 {
1427 int dup_dest;
1428 reg_errcode_t err;
1429 err = duplicate_node (&dup_dest, dfa, dest, constraint);
1430 if (BE (err != REG_NOERROR, 0))
1431 return err;
1432 if (dest != dup_dest)
1433 {
1434 re_node_set_remove_at (&eclosure, i--);
1435 re_node_set_insert (&eclosure, dup_dest);
1436 --max;
1437 }
1438 }
1439 }
1440
1441 /* Epsilon closures include itself. */
1442 re_node_set_insert (&eclosure, node);
1443 if (incomplete && !root)
1444 dfa->eclosures[node].nelem = 0;
1445 else
1446 dfa->eclosures[node] = eclosure;
1447 *new_set = eclosure;
1448 return REG_NOERROR;
1449 }
1450
1451 /* Functions for token which are used in the parser. */
1452
1453 /* Fetch a token from INPUT.
1454 We must not use this function inside bracket expressions. */
1455
1456 static re_token_t
fetch_token(input,syntax)1457 fetch_token (input, syntax)
1458 re_string_t *input;
1459 reg_syntax_t syntax;
1460 {
1461 re_token_t token;
1462 int consumed_byte;
1463 consumed_byte = peek_token (&token, input, syntax);
1464 re_string_skip_bytes (input, consumed_byte);
1465 return token;
1466 }
1467
1468 /* Peek a token from INPUT, and return the length of the token.
1469 We must not use this function inside bracket expressions. */
1470
1471 static int
peek_token(token,input,syntax)1472 peek_token (token, input, syntax)
1473 re_token_t *token;
1474 re_string_t *input;
1475 reg_syntax_t syntax;
1476 {
1477 unsigned char c;
1478
1479 if (re_string_eoi (input))
1480 {
1481 token->type = END_OF_RE;
1482 return 0;
1483 }
1484
1485 c = re_string_peek_byte (input, 0);
1486 token->opr.c = c;
1487
1488 #ifdef RE_ENABLE_I18N
1489 token->mb_partial = 0;
1490 if (MB_CUR_MAX > 1 &&
1491 !re_string_first_byte (input, re_string_cur_idx (input)))
1492 {
1493 token->type = CHARACTER;
1494 token->mb_partial = 1;
1495 return 1;
1496 }
1497 #endif
1498 if (c == '\\')
1499 {
1500 unsigned char c2;
1501 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1502 {
1503 token->type = BACK_SLASH;
1504 return 1;
1505 }
1506
1507 c2 = re_string_peek_byte_case (input, 1);
1508 token->opr.c = c2;
1509 token->type = CHARACTER;
1510 switch (c2)
1511 {
1512 case '|':
1513 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514 token->type = OP_ALT;
1515 break;
1516 case '1': case '2': case '3': case '4': case '5':
1517 case '6': case '7': case '8': case '9':
1518 if (!(syntax & RE_NO_BK_REFS))
1519 {
1520 token->type = OP_BACK_REF;
1521 token->opr.idx = c2 - '0';
1522 }
1523 break;
1524 case '<':
1525 if (!(syntax & RE_NO_GNU_OPS))
1526 {
1527 token->type = ANCHOR;
1528 token->opr.idx = WORD_FIRST;
1529 }
1530 break;
1531 case '>':
1532 if (!(syntax & RE_NO_GNU_OPS))
1533 {
1534 token->type = ANCHOR;
1535 token->opr.idx = WORD_LAST;
1536 }
1537 break;
1538 case 'b':
1539 if (!(syntax & RE_NO_GNU_OPS))
1540 {
1541 token->type = ANCHOR;
1542 token->opr.idx = WORD_DELIM;
1543 }
1544 break;
1545 case 'B':
1546 if (!(syntax & RE_NO_GNU_OPS))
1547 {
1548 token->type = ANCHOR;
1549 token->opr.idx = INSIDE_WORD;
1550 }
1551 break;
1552 case 'w':
1553 if (!(syntax & RE_NO_GNU_OPS))
1554 token->type = OP_WORD;
1555 break;
1556 case 'W':
1557 if (!(syntax & RE_NO_GNU_OPS))
1558 token->type = OP_NOTWORD;
1559 break;
1560 case '`':
1561 if (!(syntax & RE_NO_GNU_OPS))
1562 {
1563 token->type = ANCHOR;
1564 token->opr.idx = BUF_FIRST;
1565 }
1566 break;
1567 case '\'':
1568 if (!(syntax & RE_NO_GNU_OPS))
1569 {
1570 token->type = ANCHOR;
1571 token->opr.idx = BUF_LAST;
1572 }
1573 break;
1574 case '(':
1575 if (!(syntax & RE_NO_BK_PARENS))
1576 token->type = OP_OPEN_SUBEXP;
1577 break;
1578 case ')':
1579 if (!(syntax & RE_NO_BK_PARENS))
1580 token->type = OP_CLOSE_SUBEXP;
1581 break;
1582 case '+':
1583 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584 token->type = OP_DUP_PLUS;
1585 break;
1586 case '?':
1587 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588 token->type = OP_DUP_QUESTION;
1589 break;
1590 case '{':
1591 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592 token->type = OP_OPEN_DUP_NUM;
1593 break;
1594 case '}':
1595 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596 token->type = OP_CLOSE_DUP_NUM;
1597 break;
1598 default:
1599 break;
1600 }
1601 return 2;
1602 }
1603
1604 token->type = CHARACTER;
1605 switch (c)
1606 {
1607 case '\n':
1608 if (syntax & RE_NEWLINE_ALT)
1609 token->type = OP_ALT;
1610 break;
1611 case '|':
1612 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613 token->type = OP_ALT;
1614 break;
1615 case '*':
1616 token->type = OP_DUP_ASTERISK;
1617 break;
1618 case '+':
1619 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620 token->type = OP_DUP_PLUS;
1621 break;
1622 case '?':
1623 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624 token->type = OP_DUP_QUESTION;
1625 break;
1626 case '{':
1627 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628 token->type = OP_OPEN_DUP_NUM;
1629 break;
1630 case '}':
1631 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632 token->type = OP_CLOSE_DUP_NUM;
1633 break;
1634 case '(':
1635 if (syntax & RE_NO_BK_PARENS)
1636 token->type = OP_OPEN_SUBEXP;
1637 break;
1638 case ')':
1639 if (syntax & RE_NO_BK_PARENS)
1640 token->type = OP_CLOSE_SUBEXP;
1641 break;
1642 case '[':
1643 token->type = OP_OPEN_BRACKET;
1644 break;
1645 case '.':
1646 token->type = OP_PERIOD;
1647 break;
1648 case '^':
1649 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650 re_string_cur_idx (input) != 0)
1651 {
1652 char prev = re_string_peek_byte (input, -1);
1653 if (prev != '|' && prev != '(' &&
1654 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1655 break;
1656 }
1657 token->type = ANCHOR;
1658 token->opr.idx = LINE_FIRST;
1659 break;
1660 case '$':
1661 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662 re_string_cur_idx (input) + 1 != re_string_length (input))
1663 {
1664 re_token_t next;
1665 re_string_skip_bytes (input, 1);
1666 peek_token (&next, input, syntax);
1667 re_string_skip_bytes (input, -1);
1668 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1669 break;
1670 }
1671 token->type = ANCHOR;
1672 token->opr.idx = LINE_LAST;
1673 break;
1674 default:
1675 break;
1676 }
1677 return 1;
1678 }
1679
1680 /* Peek a token from INPUT, and return the length of the token.
1681 We must not use this function out of bracket expressions. */
1682
1683 static int
peek_token_bracket(token,input,syntax)1684 peek_token_bracket (token, input, syntax)
1685 re_token_t *token;
1686 re_string_t *input;
1687 reg_syntax_t syntax;
1688 {
1689 unsigned char c;
1690 if (re_string_eoi (input))
1691 {
1692 token->type = END_OF_RE;
1693 return 0;
1694 }
1695 c = re_string_peek_byte (input, 0);
1696 token->opr.c = c;
1697
1698 #ifdef RE_ENABLE_I18N
1699 if (MB_CUR_MAX > 1 &&
1700 !re_string_first_byte (input, re_string_cur_idx (input)))
1701 {
1702 token->type = CHARACTER;
1703 return 1;
1704 }
1705 #endif /* RE_ENABLE_I18N */
1706
1707 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1708 {
1709 /* In this case, '\' escape a character. */
1710 unsigned char c2;
1711 c2 = re_string_peek_byte (input, 1);
1712 token->opr.c = c2;
1713 token->type = CHARACTER;
1714 return 1;
1715 }
1716 if (c == '[') /* '[' is a special char in a bracket exps. */
1717 {
1718 unsigned char c2;
1719 int token_len;
1720 c2 = re_string_peek_byte (input, 1);
1721 token->opr.c = c2;
1722 token_len = 2;
1723 switch (c2)
1724 {
1725 case '.':
1726 token->type = OP_OPEN_COLL_ELEM;
1727 break;
1728 case '=':
1729 token->type = OP_OPEN_EQUIV_CLASS;
1730 break;
1731 case ':':
1732 if (syntax & RE_CHAR_CLASSES)
1733 {
1734 token->type = OP_OPEN_CHAR_CLASS;
1735 break;
1736 }
1737 /* else fall through. */
1738 default:
1739 token->type = CHARACTER;
1740 token->opr.c = c;
1741 token_len = 1;
1742 break;
1743 }
1744 return token_len;
1745 }
1746 switch (c)
1747 {
1748 case '-':
1749 token->type = OP_CHARSET_RANGE;
1750 break;
1751 case ']':
1752 token->type = OP_CLOSE_BRACKET;
1753 break;
1754 case '^':
1755 token->type = OP_NON_MATCH_LIST;
1756 break;
1757 default:
1758 token->type = CHARACTER;
1759 }
1760 return 1;
1761 }
1762
1763 /* Functions for parser. */
1764
1765 /* Entry point of the parser.
1766 Parse the regular expression REGEXP and return the structure tree.
1767 If an error is occurred, ERR is set by error code, and return NULL.
1768 This function build the following tree, from regular expression <reg_exp>:
1769 CAT
1770 / \
1771 / \
1772 <reg_exp> EOR
1773
1774 CAT means concatenation.
1775 EOR means end of regular expression. */
1776
1777 static bin_tree_t *
parse(regexp,preg,syntax,err)1778 parse (regexp, preg, syntax, err)
1779 re_string_t *regexp;
1780 regex_t *preg;
1781 reg_syntax_t syntax;
1782 reg_errcode_t *err;
1783 {
1784 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1785 bin_tree_t *tree, *eor, *root;
1786 re_token_t current_token;
1787 int new_idx;
1788 current_token = fetch_token (regexp, syntax);
1789 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
1790 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1791 return NULL;
1792 new_idx = re_dfa_add_node (dfa, current_token, 0);
1793 eor = create_tree (NULL, NULL, 0, new_idx);
1794 if (tree != NULL)
1795 root = create_tree (tree, eor, CONCAT, 0);
1796 else
1797 root = eor;
1798 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1799 return *err = REG_ESPACE, NULL;
1800 return root;
1801 }
1802
1803 /* This function build the following tree, from regular expression
1804 <branch1>|<branch2>:
1805 ALT
1806 / \
1807 / \
1808 <branch1> <branch2>
1809
1810 ALT means alternative, which represents the operator `|'. */
1811
1812 static bin_tree_t *
parse_reg_exp(regexp,preg,token,syntax,nest,err)1813 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1814 re_string_t *regexp;
1815 regex_t *preg;
1816 re_token_t *token;
1817 reg_syntax_t syntax;
1818 int nest;
1819 reg_errcode_t *err;
1820 {
1821 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1822 bin_tree_t *tree, *branch = NULL;
1823 int new_idx;
1824 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1825 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1826 return NULL;
1827
1828 while (token->type == OP_ALT)
1829 {
1830 re_token_t alt_token = *token;
1831 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1832 *token = fetch_token (regexp, syntax);
1833 if (token->type != OP_ALT && token->type != END_OF_RE
1834 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1835 {
1836 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1837 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1838 {
1839 free_bin_tree (tree);
1840 return NULL;
1841 }
1842 }
1843 else
1844 branch = NULL;
1845 tree = create_tree (tree, branch, 0, new_idx);
1846 if (BE (new_idx == -1 || tree == NULL, 0))
1847 return *err = REG_ESPACE, NULL;
1848 dfa->has_plural_match = 1;
1849 }
1850 return tree;
1851 }
1852
1853 /* This function build the following tree, from regular expression
1854 <exp1><exp2>:
1855 CAT
1856 / \
1857 / \
1858 <exp1> <exp2>
1859
1860 CAT means concatenation. */
1861
1862 static bin_tree_t *
parse_branch(regexp,preg,token,syntax,nest,err)1863 parse_branch (regexp, preg, token, syntax, nest, err)
1864 re_string_t *regexp;
1865 regex_t *preg;
1866 re_token_t *token;
1867 reg_syntax_t syntax;
1868 int nest;
1869 reg_errcode_t *err;
1870 {
1871 bin_tree_t *tree, *exp;
1872 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1873 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1874 return NULL;
1875
1876 while (token->type != OP_ALT && token->type != END_OF_RE
1877 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1878 {
1879 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1880 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1881 {
1882 free_bin_tree (tree);
1883 return NULL;
1884 }
1885 if (tree != NULL && exp != NULL)
1886 {
1887 tree = create_tree (tree, exp, CONCAT, 0);
1888 if (tree == NULL)
1889 return *err = REG_ESPACE, NULL;
1890 }
1891 else if (tree == NULL)
1892 tree = exp;
1893 /* Otherwise exp == NULL, we don't need to create new tree. */
1894 }
1895 return tree;
1896 }
1897
1898 /* This function build the following tree, from regular expression a*:
1899 *
1900 |
1901 a
1902 */
1903
1904 static bin_tree_t *
parse_expression(regexp,preg,token,syntax,nest,err)1905 parse_expression (regexp, preg, token, syntax, nest, err)
1906 re_string_t *regexp;
1907 regex_t *preg;
1908 re_token_t *token;
1909 reg_syntax_t syntax;
1910 int nest;
1911 reg_errcode_t *err;
1912 {
1913 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1914 bin_tree_t *tree;
1915 int new_idx;
1916 switch (token->type)
1917 {
1918 case CHARACTER:
1919 new_idx = re_dfa_add_node (dfa, *token, 0);
1920 tree = create_tree (NULL, NULL, 0, new_idx);
1921 if (BE (new_idx == -1 || tree == NULL, 0))
1922 return *err = REG_ESPACE, NULL;
1923 #ifdef RE_ENABLE_I18N
1924 if (MB_CUR_MAX > 1)
1925 {
1926 while (!re_string_eoi (regexp)
1927 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1928 {
1929 bin_tree_t *mbc_remain;
1930 *token = fetch_token (regexp, syntax);
1931 new_idx = re_dfa_add_node (dfa, *token, 0);
1932 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1933 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1934 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1935 return *err = REG_ESPACE, NULL;
1936 }
1937 }
1938 #endif
1939 break;
1940 case OP_OPEN_SUBEXP:
1941 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1942 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1943 return NULL;
1944 break;
1945 case OP_OPEN_BRACKET:
1946 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1947 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1948 return NULL;
1949 break;
1950 case OP_BACK_REF:
1951 if (BE (preg->re_nsub < token->opr.idx
1952 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1953 {
1954 *err = REG_ESUBREG;
1955 return NULL;
1956 }
1957 new_idx = re_dfa_add_node (dfa, *token, 0);
1958 tree = create_tree (NULL, NULL, 0, new_idx);
1959 if (BE (new_idx == -1 || tree == NULL, 0))
1960 return *err = REG_ESPACE, NULL;
1961 ++dfa->nbackref;
1962 dfa->has_mb_node = 1;
1963 break;
1964 case OP_DUP_ASTERISK:
1965 case OP_DUP_PLUS:
1966 case OP_DUP_QUESTION:
1967 case OP_OPEN_DUP_NUM:
1968 if (syntax & RE_CONTEXT_INVALID_OPS)
1969 return *err = REG_BADRPT, NULL;
1970 else if (syntax & RE_CONTEXT_INDEP_OPS)
1971 {
1972 *token = fetch_token (regexp, syntax);
1973 return parse_expression (regexp, preg, token, syntax, nest, err);
1974 }
1975 /* else fall through */
1976 case OP_CLOSE_SUBEXP:
1977 if ((token->type == OP_CLOSE_SUBEXP) &&
1978 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1979 return *err = REG_ERPAREN, NULL;
1980 /* else fall through */
1981 case OP_CLOSE_DUP_NUM:
1982 /* We treat it as a normal character. */
1983
1984 /* Then we can these characters as normal characters. */
1985 token->type = CHARACTER;
1986 new_idx = re_dfa_add_node (dfa, *token, 0);
1987 tree = create_tree (NULL, NULL, 0, new_idx);
1988 if (BE (new_idx == -1 || tree == NULL, 0))
1989 return *err = REG_ESPACE, NULL;
1990 break;
1991 case ANCHOR:
1992 if (dfa->word_char == NULL)
1993 {
1994 *err = init_word_char (dfa);
1995 if (BE (*err != REG_NOERROR, 0))
1996 return NULL;
1997 }
1998 if (token->opr.ctx_type == WORD_DELIM)
1999 {
2000 bin_tree_t *tree_first, *tree_last;
2001 int idx_first, idx_last;
2002 token->opr.ctx_type = WORD_FIRST;
2003 idx_first = re_dfa_add_node (dfa, *token, 0);
2004 tree_first = create_tree (NULL, NULL, 0, idx_first);
2005 token->opr.ctx_type = WORD_LAST;
2006 idx_last = re_dfa_add_node (dfa, *token, 0);
2007 tree_last = create_tree (NULL, NULL, 0, idx_last);
2008 token->type = OP_ALT;
2009 new_idx = re_dfa_add_node (dfa, *token, 0);
2010 tree = create_tree (tree_first, tree_last, 0, new_idx);
2011 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2012 || tree_first == NULL || tree_last == NULL
2013 || tree == NULL, 0))
2014 return *err = REG_ESPACE, NULL;
2015 }
2016 else
2017 {
2018 new_idx = re_dfa_add_node (dfa, *token, 0);
2019 tree = create_tree (NULL, NULL, 0, new_idx);
2020 if (BE (new_idx == -1 || tree == NULL, 0))
2021 return *err = REG_ESPACE, NULL;
2022 }
2023 /* We must return here, since ANCHORs can't be followed
2024 by repetition operators.
2025 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2026 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2027 *token = fetch_token (regexp, syntax);
2028 return tree;
2029 case OP_PERIOD:
2030 new_idx = re_dfa_add_node (dfa, *token, 0);
2031 tree = create_tree (NULL, NULL, 0, new_idx);
2032 if (BE (new_idx == -1 || tree == NULL, 0))
2033 return *err = REG_ESPACE, NULL;
2034 if (MB_CUR_MAX > 1)
2035 dfa->has_mb_node = 1;
2036 break;
2037 case OP_WORD:
2038 tree = build_word_op (dfa, 0, err);
2039 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2040 return NULL;
2041 break;
2042 case OP_NOTWORD:
2043 tree = build_word_op (dfa, 1, err);
2044 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2045 return NULL;
2046 break;
2047 case OP_ALT:
2048 case END_OF_RE:
2049 return NULL;
2050 case BACK_SLASH:
2051 *err = REG_EESCAPE;
2052 return NULL;
2053 default:
2054 /* Must not happen? */
2055 #ifdef DEBUG
2056 assert (0);
2057 #endif
2058 return NULL;
2059 }
2060 *token = fetch_token (regexp, syntax);
2061
2062 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2063 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2064 {
2065 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2066 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067 return NULL;
2068 dfa->has_plural_match = 1;
2069 }
2070
2071 return tree;
2072 }
2073
2074 /* This function build the following tree, from regular expression
2075 (<reg_exp>):
2076 SUBEXP
2077 |
2078 <reg_exp>
2079 */
2080
2081 static bin_tree_t *
parse_sub_exp(regexp,preg,token,syntax,nest,err)2082 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2083 re_string_t *regexp;
2084 regex_t *preg;
2085 re_token_t *token;
2086 reg_syntax_t syntax;
2087 int nest;
2088 reg_errcode_t *err;
2089 {
2090 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2091 bin_tree_t *tree, *left_par, *right_par;
2092 size_t cur_nsub;
2093 int new_idx;
2094 cur_nsub = preg->re_nsub++;
2095 if (dfa->subexps_alloc < preg->re_nsub)
2096 {
2097 re_subexp_t *new_array;
2098 dfa->subexps_alloc *= 2;
2099 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2100 if (BE (new_array == NULL, 0))
2101 {
2102 dfa->subexps_alloc /= 2;
2103 *err = REG_ESPACE;
2104 return NULL;
2105 }
2106 dfa->subexps = new_array;
2107 }
2108 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2109 dfa->subexps[cur_nsub].end = -1;
2110
2111 new_idx = re_dfa_add_node (dfa, *token, 0);
2112 left_par = create_tree (NULL, NULL, 0, new_idx);
2113 if (BE (new_idx == -1 || left_par == NULL, 0))
2114 return *err = REG_ESPACE, NULL;
2115 dfa->nodes[new_idx].opr.idx = cur_nsub;
2116 *token = fetch_token (regexp, syntax);
2117
2118 /* The subexpression may be a null string. */
2119 if (token->type == OP_CLOSE_SUBEXP)
2120 tree = NULL;
2121 else
2122 {
2123 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2124 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2125 return NULL;
2126 }
2127 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2128 {
2129 free_bin_tree (tree);
2130 *err = REG_BADPAT;
2131 return NULL;
2132 }
2133 new_idx = re_dfa_add_node (dfa, *token, 0);
2134 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2135 right_par = create_tree (NULL, NULL, 0, new_idx);
2136 tree = ((tree == NULL) ? right_par
2137 : create_tree (tree, right_par, CONCAT, 0));
2138 tree = create_tree (left_par, tree, CONCAT, 0);
2139 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2140 return *err = REG_ESPACE, NULL;
2141 dfa->nodes[new_idx].opr.idx = cur_nsub;
2142
2143 return tree;
2144 }
2145
2146 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2147
2148 static bin_tree_t *
parse_dup_op(dup_elem,regexp,dfa,token,syntax,err)2149 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2150 bin_tree_t *dup_elem;
2151 re_string_t *regexp;
2152 re_dfa_t *dfa;
2153 re_token_t *token;
2154 reg_syntax_t syntax;
2155 reg_errcode_t *err;
2156 {
2157 re_token_t dup_token;
2158 bin_tree_t *tree = dup_elem, *work_tree;
2159 int new_idx, start_idx = re_string_cur_idx (regexp);
2160 re_token_t start_token = *token;
2161 if (token->type == OP_OPEN_DUP_NUM)
2162 {
2163 int i;
2164 int end = 0;
2165 int start = fetch_number (regexp, token, syntax);
2166 bin_tree_t *elem;
2167 if (start == -1)
2168 {
2169 if (token->type == CHARACTER && token->opr.c == ',')
2170 start = 0; /* We treat "{,m}" as "{0,m}". */
2171 else
2172 {
2173 *err = REG_BADBR; /* <re>{} is invalid. */
2174 return NULL;
2175 }
2176 }
2177 if (BE (start != -2, 1))
2178 {
2179 /* We treat "{n}" as "{n,n}". */
2180 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2181 : ((token->type == CHARACTER && token->opr.c == ',')
2182 ? fetch_number (regexp, token, syntax) : -2));
2183 }
2184 if (BE (start == -2 || end == -2, 0))
2185 {
2186 /* Invalid sequence. */
2187 if (token->type == OP_CLOSE_DUP_NUM)
2188 goto parse_dup_op_invalid_interval;
2189 else
2190 goto parse_dup_op_ebrace;
2191 }
2192 if (BE (start == 0 && end == 0, 0))
2193 {
2194 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2195 *token = fetch_token (regexp, syntax);
2196 free_bin_tree (dup_elem);
2197 return NULL;
2198 }
2199
2200 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2201 elem = tree;
2202 for (i = 0; i < start; ++i)
2203 if (i != 0)
2204 {
2205 work_tree = duplicate_tree (elem, dfa);
2206 tree = create_tree (tree, work_tree, CONCAT, 0);
2207 if (BE (work_tree == NULL || tree == NULL, 0))
2208 goto parse_dup_op_espace;
2209 }
2210
2211 if (end == -1)
2212 {
2213 /* We treat "<re>{0,}" as "<re>*". */
2214 dup_token.type = OP_DUP_ASTERISK;
2215 if (start > 0)
2216 {
2217 elem = duplicate_tree (elem, dfa);
2218 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2219 work_tree = create_tree (elem, NULL, 0, new_idx);
2220 tree = create_tree (tree, work_tree, CONCAT, 0);
2221 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2222 || tree == NULL, 0))
2223 goto parse_dup_op_espace;
2224 }
2225 else
2226 {
2227 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2228 tree = create_tree (elem, NULL, 0, new_idx);
2229 if (BE (new_idx == -1 || tree == NULL, 0))
2230 goto parse_dup_op_espace;
2231 }
2232 }
2233 else if (end - start > 0)
2234 {
2235 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2236 dup_token.type = OP_DUP_QUESTION;
2237 if (start > 0)
2238 {
2239 elem = duplicate_tree (elem, dfa);
2240 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2241 elem = create_tree (elem, NULL, 0, new_idx);
2242 tree = create_tree (tree, elem, CONCAT, 0);
2243 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2244 goto parse_dup_op_espace;
2245 }
2246 else
2247 {
2248 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2249 tree = elem = create_tree (elem, NULL, 0, new_idx);
2250 if (BE (new_idx == -1 || tree == NULL, 0))
2251 goto parse_dup_op_espace;
2252 }
2253 for (i = 1; i < end - start; ++i)
2254 {
2255 work_tree = duplicate_tree (elem, dfa);
2256 tree = create_tree (tree, work_tree, CONCAT, 0);
2257 if (BE (work_tree == NULL || tree == NULL, 0))
2258 return *err = REG_ESPACE, NULL;
2259 }
2260 }
2261 }
2262 else
2263 {
2264 new_idx = re_dfa_add_node (dfa, *token, 0);
2265 tree = create_tree (tree, NULL, 0, new_idx);
2266 if (BE (new_idx == -1 || tree == NULL, 0))
2267 return *err = REG_ESPACE, NULL;
2268 }
2269 *token = fetch_token (regexp, syntax);
2270 return tree;
2271
2272 parse_dup_op_espace:
2273 free_bin_tree (tree);
2274 *err = REG_ESPACE;
2275 return NULL;
2276
2277 parse_dup_op_ebrace:
2278 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2279 {
2280 *err = REG_EBRACE;
2281 return NULL;
2282 }
2283 goto parse_dup_op_rollback;
2284 parse_dup_op_invalid_interval:
2285 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2286 {
2287 *err = REG_BADBR;
2288 return NULL;
2289 }
2290 parse_dup_op_rollback:
2291 re_string_set_index (regexp, start_idx);
2292 *token = start_token;
2293 token->type = CHARACTER;
2294 return dup_elem;
2295 }
2296
2297 /* Size of the names for collating symbol/equivalence_class/character_class.
2298 I'm not sure, but maybe enough. */
2299 #define BRACKET_NAME_BUF_SIZE 32
2300
2301 #ifndef _LIBC
2302 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2303 Build the range expression which starts from START_ELEM, and ends
2304 at END_ELEM. The result are written to MBCSET and SBCSET.
2305 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2306 mbcset->range_ends, is a pointer argument sinse we may
2307 update it. */
2308
2309 static reg_errcode_t
2310 # ifdef RE_ENABLE_I18N
build_range_exp(sbcset,mbcset,range_alloc,start_elem,end_elem)2311 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2312 re_charset_t *mbcset;
2313 int *range_alloc;
2314 # else /* not RE_ENABLE_I18N */
2315 build_range_exp (sbcset, start_elem, end_elem)
2316 # endif /* not RE_ENABLE_I18N */
2317 re_bitset_ptr_t sbcset;
2318 bracket_elem_t *start_elem, *end_elem;
2319 {
2320 unsigned int start_ch, end_ch;
2321 /* Equivalence Classes and Character Classes can't be a range start/end. */
2322 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2323 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2324 0))
2325 return REG_ERANGE;
2326
2327 /* We can handle no multi character collating elements without libc
2328 support. */
2329 if (BE ((start_elem->type == COLL_SYM
2330 && strlen ((char *) start_elem->opr.name) > 1)
2331 || (end_elem->type == COLL_SYM
2332 && strlen ((char *) end_elem->opr.name) > 1), 0))
2333 return REG_ECOLLATE;
2334
2335 # ifdef RE_ENABLE_I18N
2336 {
2337 wchar_t wc, start_wc, end_wc;
2338 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2339
2340 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2341 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2342 : 0));
2343 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2344 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2345 : 0));
2346 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2347 ? __btowc (start_ch) : start_elem->opr.wch);
2348 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2349 ? __btowc (end_ch) : end_elem->opr.wch);
2350 cmp_buf[0] = start_wc;
2351 cmp_buf[4] = end_wc;
2352 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2353 return REG_ERANGE;
2354
2355 /* Check the space of the arrays. */
2356 if (*range_alloc == mbcset->nranges)
2357 {
2358 /* There are not enough space, need realloc. */
2359 wchar_t *new_array_start, *new_array_end;
2360 int new_nranges;
2361
2362 /* +1 in case of mbcset->nranges is 0. */
2363 new_nranges = 2 * mbcset->nranges + 1;
2364 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2365 are NULL if *range_alloc == 0. */
2366 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2367 new_nranges);
2368 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2369 new_nranges);
2370
2371 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2372 return REG_ESPACE;
2373
2374 mbcset->range_starts = new_array_start;
2375 mbcset->range_ends = new_array_end;
2376 *range_alloc = new_nranges;
2377 }
2378
2379 mbcset->range_starts[mbcset->nranges] = start_wc;
2380 mbcset->range_ends[mbcset->nranges++] = end_wc;
2381
2382 /* Build the table for single byte characters. */
2383 for (wc = 0; wc <= SBC_MAX; ++wc)
2384 {
2385 cmp_buf[2] = wc;
2386 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2387 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2388 bitset_set (sbcset, wc);
2389 }
2390 }
2391 # else /* not RE_ENABLE_I18N */
2392 {
2393 unsigned int ch;
2394 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2395 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2396 : 0));
2397 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2398 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2399 : 0));
2400 if (start_ch > end_ch)
2401 return REG_ERANGE;
2402 /* Build the table for single byte characters. */
2403 for (ch = 0; ch <= SBC_MAX; ++ch)
2404 if (start_ch <= ch && ch <= end_ch)
2405 bitset_set (sbcset, ch);
2406 }
2407 # endif /* not RE_ENABLE_I18N */
2408 return REG_NOERROR;
2409 }
2410 #endif /* not _LIBC */
2411
2412 #ifndef _LIBC
2413 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2414 Build the collating element which is represented by NAME.
2415 The result are written to MBCSET and SBCSET.
2416 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2417 pointer argument since we may update it. */
2418
2419 static reg_errcode_t
2420 # ifdef RE_ENABLE_I18N
build_collating_symbol(sbcset,mbcset,coll_syxmalloc,name)2421 build_collating_symbol (sbcset, mbcset, coll_syxmalloc, name)
2422 re_charset_t *mbcset;
2423 int *coll_syxmalloc;
2424 # else /* not RE_ENABLE_I18N */
2425 build_collating_symbol (sbcset, name)
2426 # endif /* not RE_ENABLE_I18N */
2427 re_bitset_ptr_t sbcset;
2428 const unsigned char *name;
2429 {
2430 size_t name_len = strlen ((const char *) name);
2431 if (BE (name_len != 1, 0))
2432 return REG_ECOLLATE;
2433 else
2434 {
2435 bitset_set (sbcset, name[0]);
2436 return REG_NOERROR;
2437 }
2438 }
2439 #endif /* not _LIBC */
2440
2441 /* This function parse bracket expression like "[abc]", "[a-c]",
2442 "[[.a-a.]]" etc. */
2443
2444 static bin_tree_t *
parse_bracket_exp(regexp,dfa,token,syntax,err)2445 parse_bracket_exp (regexp, dfa, token, syntax, err)
2446 re_string_t *regexp;
2447 re_dfa_t *dfa;
2448 re_token_t *token;
2449 reg_syntax_t syntax;
2450 reg_errcode_t *err;
2451 {
2452 #ifdef _LIBC
2453 const unsigned char *collseqmb;
2454 const char *collseqwc;
2455 uint32_t nrules;
2456 int32_t table_size;
2457 const int32_t *symb_table;
2458 const unsigned char *extra;
2459
2460 /* Local function for parse_bracket_exp used in _LIBC environement.
2461 Seek the collating symbol entry correspondings to NAME.
2462 Return the index of the symbol in the SYMB_TABLE. */
2463
2464 static inline int32_t
2465 seek_collating_symbol_entry (name, name_len)
2466 const unsigned char *name;
2467 size_t name_len;
2468 {
2469 int32_t hash = elem_hash ((const char *) name, name_len);
2470 int32_t elem = hash % table_size;
2471 int32_t second = hash % (table_size - 2);
2472 while (symb_table[2 * elem] != 0)
2473 {
2474 /* First compare the hashing value. */
2475 if (symb_table[2 * elem] == hash
2476 /* Compare the length of the name. */
2477 && name_len == extra[symb_table[2 * elem + 1]]
2478 /* Compare the name. */
2479 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2480 name_len) == 0)
2481 {
2482 /* Yep, this is the entry. */
2483 break;
2484 }
2485
2486 /* Next entry. */
2487 elem += second;
2488 }
2489 return elem;
2490 }
2491
2492 /* Local function for parse_bracket_exp used in _LIBC environement.
2493 Look up the collation sequence value of BR_ELEM.
2494 Return the value if succeeded, UINT_MAX otherwise. */
2495
2496 static inline unsigned int
2497 lookup_collation_sequence_value (br_elem)
2498 bracket_elem_t *br_elem;
2499 {
2500 if (br_elem->type == SB_CHAR)
2501 {
2502 /*
2503 if (MB_CUR_MAX == 1)
2504 */
2505 if (nrules == 0)
2506 return collseqmb[br_elem->opr.ch];
2507 else
2508 {
2509 wint_t wc = __btowc (br_elem->opr.ch);
2510 return collseq_table_lookup (collseqwc, wc);
2511 }
2512 }
2513 else if (br_elem->type == MB_CHAR)
2514 {
2515 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2516 }
2517 else if (br_elem->type == COLL_SYM)
2518 {
2519 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2520 if (nrules != 0)
2521 {
2522 int32_t elem, idx;
2523 elem = seek_collating_symbol_entry (br_elem->opr.name,
2524 sym_name_len);
2525 if (symb_table[2 * elem] != 0)
2526 {
2527 /* We found the entry. */
2528 idx = symb_table[2 * elem + 1];
2529 /* Skip the name of collating element name. */
2530 idx += 1 + extra[idx];
2531 /* Skip the byte sequence of the collating element. */
2532 idx += 1 + extra[idx];
2533 /* Adjust for the alignment. */
2534 idx = (idx + 3) & ~3;
2535 /* Skip the multibyte collation sequence value. */
2536 idx += sizeof (unsigned int);
2537 /* Skip the wide char sequence of the collating element. */
2538 idx += sizeof (unsigned int) *
2539 (1 + *(unsigned int *) (extra + idx));
2540 /* Return the collation sequence value. */
2541 return *(unsigned int *) (extra + idx);
2542 }
2543 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2544 {
2545 /* No valid character. Match it as a single byte
2546 character. */
2547 return collseqmb[br_elem->opr.name[0]];
2548 }
2549 }
2550 else if (sym_name_len == 1)
2551 return collseqmb[br_elem->opr.name[0]];
2552 }
2553 return UINT_MAX;
2554 }
2555
2556 /* Local function for parse_bracket_exp used in _LIBC environement.
2557 Build the range expression which starts from START_ELEM, and ends
2558 at END_ELEM. The result are written to MBCSET and SBCSET.
2559 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2560 mbcset->range_ends, is a pointer argument sinse we may
2561 update it. */
2562
2563 static inline reg_errcode_t
2564 # ifdef RE_ENABLE_I18N
2565 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2566 re_charset_t *mbcset;
2567 int *range_alloc;
2568 # else /* not RE_ENABLE_I18N */
2569 build_range_exp (sbcset, start_elem, end_elem)
2570 # endif /* not RE_ENABLE_I18N */
2571 re_bitset_ptr_t sbcset;
2572 bracket_elem_t *start_elem, *end_elem;
2573 {
2574 unsigned int ch;
2575 uint32_t start_collseq;
2576 uint32_t end_collseq;
2577
2578 # ifdef RE_ENABLE_I18N
2579 /* Check the space of the arrays. */
2580 if (*range_alloc == mbcset->nranges)
2581 {
2582 /* There are not enough space, need realloc. */
2583 uint32_t *new_array_start;
2584 uint32_t *new_array_end;
2585 int new_nranges;
2586
2587 /* +1 in case of mbcset->nranges is 0. */
2588 new_nranges = 2 * mbcset->nranges + 1;
2589 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2590 are NULL if *range_alloc == 0. */
2591 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2592 new_nranges);
2593 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2594 new_nranges);
2595
2596 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2597 return REG_ESPACE;
2598
2599 mbcset->range_starts = new_array_start;
2600 mbcset->range_ends = new_array_end;
2601 *range_alloc = new_nranges;
2602 }
2603 # endif /* RE_ENABLE_I18N */
2604
2605 /* Equivalence Classes and Character Classes can't be a range
2606 start/end. */
2607 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2608 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2609 0))
2610 return REG_ERANGE;
2611
2612 start_collseq = lookup_collation_sequence_value (start_elem);
2613 end_collseq = lookup_collation_sequence_value (end_elem);
2614 /* Check start/end collation sequence values. */
2615 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2616 return REG_ECOLLATE;
2617 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2618 return REG_ERANGE;
2619
2620 # ifdef RE_ENABLE_I18N
2621 /* Got valid collation sequence values, add them as a new entry. */
2622 mbcset->range_starts[mbcset->nranges] = start_collseq;
2623 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2624 # endif /* RE_ENABLE_I18N */
2625
2626 /* Build the table for single byte characters. */
2627 for (ch = 0; ch <= SBC_MAX; ch++)
2628 {
2629 uint32_t ch_collseq;
2630 /*
2631 if (MB_CUR_MAX == 1)
2632 */
2633 if (nrules == 0)
2634 ch_collseq = collseqmb[ch];
2635 else
2636 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2637 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2638 bitset_set (sbcset, ch);
2639 }
2640 return REG_NOERROR;
2641 }
2642
2643 /* Local function for parse_bracket_exp used in _LIBC environement.
2644 Build the collating element which is represented by NAME.
2645 The result are written to MBCSET and SBCSET.
2646 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2647 pointer argument sinse we may update it. */
2648
2649 static inline reg_errcode_t
2650 # ifdef RE_ENABLE_I18N
2651 build_collating_symbol (sbcset, mbcset, coll_syxmalloc, name)
2652 re_charset_t *mbcset;
2653 int *coll_syxmalloc;
2654 # else /* not RE_ENABLE_I18N */
2655 build_collating_symbol (sbcset, name)
2656 # endif /* not RE_ENABLE_I18N */
2657 re_bitset_ptr_t sbcset;
2658 const unsigned char *name;
2659 {
2660 int32_t elem, idx;
2661 size_t name_len = strlen ((const char *) name);
2662 if (nrules != 0)
2663 {
2664 elem = seek_collating_symbol_entry (name, name_len);
2665 if (symb_table[2 * elem] != 0)
2666 {
2667 /* We found the entry. */
2668 idx = symb_table[2 * elem + 1];
2669 /* Skip the name of collating element name. */
2670 idx += 1 + extra[idx];
2671 }
2672 else if (symb_table[2 * elem] == 0 && name_len == 1)
2673 {
2674 /* No valid character, treat it as a normal
2675 character. */
2676 bitset_set (sbcset, name[0]);
2677 return REG_NOERROR;
2678 }
2679 else
2680 return REG_ECOLLATE;
2681
2682 # ifdef RE_ENABLE_I18N
2683 /* Got valid collation sequence, add it as a new entry. */
2684 /* Check the space of the arrays. */
2685 if (*coll_syxmalloc == mbcset->ncoll_syms)
2686 {
2687 /* Not enough, realloc it. */
2688 /* +1 in case of mbcset->ncoll_syms is 0. */
2689 *coll_syxmalloc = 2 * mbcset->ncoll_syms + 1;
2690 /* Use realloc since mbcset->coll_syms is NULL
2691 if *alloc == 0. */
2692 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2693 *coll_syxmalloc);
2694 if (BE (mbcset->coll_syms == NULL, 0))
2695 return REG_ESPACE;
2696 }
2697 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2698 # endif /* RE_ENABLE_I18N */
2699 return REG_NOERROR;
2700 }
2701 else
2702 {
2703 if (BE (name_len != 1, 0))
2704 return REG_ECOLLATE;
2705 else
2706 {
2707 bitset_set (sbcset, name[0]);
2708 return REG_NOERROR;
2709 }
2710 }
2711 }
2712 #endif
2713
2714 re_token_t br_token;
2715 re_bitset_ptr_t sbcset;
2716 #ifdef RE_ENABLE_I18N
2717 re_charset_t *mbcset;
2718 int coll_syxmalloc = 0, range_alloc = 0, mbchar_alloc = 0;
2719 int equiv_class_alloc = 0, char_class_alloc = 0;
2720 #else /* not RE_ENABLE_I18N */
2721 int non_match = 0;
2722 #endif /* not RE_ENABLE_I18N */
2723 bin_tree_t *work_tree;
2724 int token_len, new_idx;
2725 #ifdef _LIBC
2726 collseqmb = (const unsigned char *)
2727 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2728 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2729 if (nrules)
2730 {
2731 /*
2732 if (MB_CUR_MAX > 1)
2733 */
2734 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2735 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2736 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2737 _NL_COLLATE_SYMB_TABLEMB);
2738 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2739 _NL_COLLATE_SYMB_EXTRAMB);
2740 }
2741 #endif
2742 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2743 #ifdef RE_ENABLE_I18N
2744 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2745 #endif /* RE_ENABLE_I18N */
2746 #ifdef RE_ENABLE_I18N
2747 if (BE (sbcset == NULL || mbcset == NULL, 0))
2748 #else
2749 if (BE (sbcset == NULL, 0))
2750 #endif /* RE_ENABLE_I18N */
2751 {
2752 *err = REG_ESPACE;
2753 return NULL;
2754 }
2755
2756 token_len = peek_token_bracket (token, regexp, syntax);
2757 if (BE (token->type == END_OF_RE, 0))
2758 {
2759 *err = REG_BADPAT;
2760 goto parse_bracket_exp_free_return;
2761 }
2762 if (token->type == OP_NON_MATCH_LIST)
2763 {
2764 #ifdef RE_ENABLE_I18N
2765 int i;
2766 mbcset->non_match = 1;
2767 #else /* not RE_ENABLE_I18N */
2768 non_match = 1;
2769 #endif /* not RE_ENABLE_I18N */
2770 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2771 bitset_set (sbcset, '\0');
2772 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2773 token_len = peek_token_bracket (token, regexp, syntax);
2774 if (BE (token->type == END_OF_RE, 0))
2775 {
2776 *err = REG_BADPAT;
2777 goto parse_bracket_exp_free_return;
2778 }
2779 #ifdef RE_ENABLE_I18N
2780 if (MB_CUR_MAX > 1)
2781 for (i = 0; i < SBC_MAX; ++i)
2782 if (__btowc (i) == WEOF)
2783 bitset_set (sbcset, i);
2784 #endif /* RE_ENABLE_I18N */
2785 }
2786
2787 /* We treat the first ']' as a normal character. */
2788 if (token->type == OP_CLOSE_BRACKET)
2789 token->type = CHARACTER;
2790
2791 while (1)
2792 {
2793 bracket_elem_t start_elem, end_elem;
2794 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2795 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2796 reg_errcode_t ret;
2797 int token_len2 = 0, is_range_exp = 0;
2798 re_token_t token2;
2799
2800 start_elem.opr.name = start_name_buf;
2801 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2802 syntax);
2803 if (BE (ret != REG_NOERROR, 0))
2804 {
2805 *err = ret;
2806 goto parse_bracket_exp_free_return;
2807 }
2808
2809 token_len = peek_token_bracket (token, regexp, syntax);
2810 if (BE (token->type == END_OF_RE, 0))
2811 {
2812 *err = REG_BADPAT;
2813 goto parse_bracket_exp_free_return;
2814 }
2815 if (token->type == OP_CHARSET_RANGE)
2816 {
2817 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2818 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2819 if (BE (token->type == END_OF_RE, 0))
2820 {
2821 *err = REG_BADPAT;
2822 goto parse_bracket_exp_free_return;
2823 }
2824 if (token2.type == OP_CLOSE_BRACKET)
2825 {
2826 /* We treat the last '-' as a normal character. */
2827 re_string_skip_bytes (regexp, -token_len);
2828 token->type = CHARACTER;
2829 }
2830 else
2831 is_range_exp = 1;
2832 }
2833
2834 if (is_range_exp == 1)
2835 {
2836 end_elem.opr.name = end_name_buf;
2837 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2838 dfa, syntax);
2839 if (BE (ret != REG_NOERROR, 0))
2840 {
2841 *err = ret;
2842 goto parse_bracket_exp_free_return;
2843 }
2844
2845 token_len = peek_token_bracket (token, regexp, syntax);
2846 if (BE (token->type == END_OF_RE, 0))
2847 {
2848 *err = REG_BADPAT;
2849 goto parse_bracket_exp_free_return;
2850 }
2851 *err = build_range_exp (sbcset,
2852 #ifdef RE_ENABLE_I18N
2853 mbcset, &range_alloc,
2854 #endif /* RE_ENABLE_I18N */
2855 &start_elem, &end_elem);
2856 if (BE (*err != REG_NOERROR, 0))
2857 goto parse_bracket_exp_free_return;
2858 }
2859 else
2860 {
2861 switch (start_elem.type)
2862 {
2863 case SB_CHAR:
2864 bitset_set (sbcset, start_elem.opr.ch);
2865 break;
2866 #ifdef RE_ENABLE_I18N
2867 case MB_CHAR:
2868 /* Check whether the array has enough space. */
2869 if (mbchar_alloc == mbcset->nmbchars)
2870 {
2871 /* Not enough, realloc it. */
2872 /* +1 in case of mbcset->nmbchars is 0. */
2873 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2874 /* Use realloc since array is NULL if *alloc == 0. */
2875 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2876 mbchar_alloc);
2877 if (BE (mbcset->mbchars == NULL, 0))
2878 goto parse_bracket_exp_espace;
2879 }
2880 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2881 break;
2882 #endif /* RE_ENABLE_I18N */
2883 case EQUIV_CLASS:
2884 *err = build_equiv_class (sbcset,
2885 #ifdef RE_ENABLE_I18N
2886 mbcset, &equiv_class_alloc,
2887 #endif /* RE_ENABLE_I18N */
2888 start_elem.opr.name);
2889 if (BE (*err != REG_NOERROR, 0))
2890 goto parse_bracket_exp_free_return;
2891 break;
2892 case COLL_SYM:
2893 *err = build_collating_symbol (sbcset,
2894 #ifdef RE_ENABLE_I18N
2895 mbcset, &coll_syxmalloc,
2896 #endif /* RE_ENABLE_I18N */
2897 start_elem.opr.name);
2898 if (BE (*err != REG_NOERROR, 0))
2899 goto parse_bracket_exp_free_return;
2900 break;
2901 case CHAR_CLASS:
2902 ret = build_charclass (sbcset,
2903 #ifdef RE_ENABLE_I18N
2904 mbcset, &char_class_alloc,
2905 #endif /* RE_ENABLE_I18N */
2906 start_elem.opr.name, syntax);
2907 if (BE (ret != REG_NOERROR, 0))
2908 goto parse_bracket_exp_espace;
2909 break;
2910 default:
2911 assert (0);
2912 break;
2913 }
2914 }
2915 if (token->type == OP_CLOSE_BRACKET)
2916 break;
2917 }
2918
2919 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2920
2921 /* If it is non-matching list. */
2922 #ifdef RE_ENABLE_I18N
2923 if (mbcset->non_match)
2924 #else /* not RE_ENABLE_I18N */
2925 if (non_match)
2926 #endif /* not RE_ENABLE_I18N */
2927 bitset_not (sbcset);
2928
2929 /* Build a tree for simple bracket. */
2930 br_token.type = SIMPLE_BRACKET;
2931 br_token.opr.sbcset = sbcset;
2932 new_idx = re_dfa_add_node (dfa, br_token, 0);
2933 work_tree = create_tree (NULL, NULL, 0, new_idx);
2934 if (BE (new_idx == -1 || work_tree == NULL, 0))
2935 goto parse_bracket_exp_espace;
2936
2937 #ifdef RE_ENABLE_I18N
2938 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2939 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2940 || mbcset->non_match)))
2941 {
2942 re_token_t alt_token;
2943 bin_tree_t *mbc_tree;
2944 /* Build a tree for complex bracket. */
2945 br_token.type = COMPLEX_BRACKET;
2946 br_token.opr.mbcset = mbcset;
2947 dfa->has_mb_node = 1;
2948 new_idx = re_dfa_add_node (dfa, br_token, 0);
2949 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2950 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2951 goto parse_bracket_exp_espace;
2952 /* Then join them by ALT node. */
2953 dfa->has_plural_match = 1;
2954 alt_token.type = OP_ALT;
2955 new_idx = re_dfa_add_node (dfa, alt_token, 0);
2956 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
2957 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
2958 return work_tree;
2959 }
2960 else
2961 {
2962 free_charset (mbcset);
2963 return work_tree;
2964 }
2965 #else /* not RE_ENABLE_I18N */
2966 return work_tree;
2967 #endif /* not RE_ENABLE_I18N */
2968
2969 parse_bracket_exp_espace:
2970 *err = REG_ESPACE;
2971 parse_bracket_exp_free_return:
2972 re_free (sbcset);
2973 #ifdef RE_ENABLE_I18N
2974 free_charset (mbcset);
2975 #endif /* RE_ENABLE_I18N */
2976 return NULL;
2977 }
2978
2979 /* Parse an element in the bracket expression. */
2980
2981 static reg_errcode_t
parse_bracket_element(elem,regexp,token,token_len,dfa,syntax)2982 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2983 bracket_elem_t *elem;
2984 re_string_t *regexp;
2985 re_token_t *token;
2986 int token_len;
2987 re_dfa_t *dfa;
2988 reg_syntax_t syntax;
2989 {
2990 #ifdef RE_ENABLE_I18N
2991 int cur_char_size;
2992 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2993 if (cur_char_size > 1)
2994 {
2995 elem->type = MB_CHAR;
2996 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2997 re_string_skip_bytes (regexp, cur_char_size);
2998 return REG_NOERROR;
2999 }
3000 #endif /* RE_ENABLE_I18N */
3001 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3002 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3003 || token->type == OP_OPEN_EQUIV_CLASS)
3004 return parse_bracket_symbol (elem, regexp, token);
3005 elem->type = SB_CHAR;
3006 elem->opr.ch = token->opr.c;
3007 return REG_NOERROR;
3008 }
3009
3010 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3011 such as [:<character_class>:], [.<collating_element>.], and
3012 [=<equivalent_class>=]. */
3013
3014 static reg_errcode_t
parse_bracket_symbol(elem,regexp,token)3015 parse_bracket_symbol (elem, regexp, token)
3016 bracket_elem_t *elem;
3017 re_string_t *regexp;
3018 re_token_t *token;
3019 {
3020 unsigned char ch, delim = token->opr.c;
3021 int i = 0;
3022 for (;; ++i)
3023 {
3024 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3025 return REG_EBRACK;
3026 if (token->type == OP_OPEN_CHAR_CLASS)
3027 ch = re_string_fetch_byte_case (regexp);
3028 else
3029 ch = re_string_fetch_byte (regexp);
3030 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3031 break;
3032 elem->opr.name[i] = ch;
3033 }
3034 re_string_skip_bytes (regexp, 1);
3035 elem->opr.name[i] = '\0';
3036 switch (token->type)
3037 {
3038 case OP_OPEN_COLL_ELEM:
3039 elem->type = COLL_SYM;
3040 break;
3041 case OP_OPEN_EQUIV_CLASS:
3042 elem->type = EQUIV_CLASS;
3043 break;
3044 case OP_OPEN_CHAR_CLASS:
3045 elem->type = CHAR_CLASS;
3046 break;
3047 default:
3048 break;
3049 }
3050 return REG_NOERROR;
3051 }
3052
3053 /* Helper function for parse_bracket_exp.
3054 Build the equivalence class which is represented by NAME.
3055 The result are written to MBCSET and SBCSET.
3056 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3057 is a pointer argument sinse we may update it. */
3058
3059 static reg_errcode_t
3060 #ifdef RE_ENABLE_I18N
build_equiv_class(sbcset,mbcset,equiv_class_alloc,name)3061 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3062 re_charset_t *mbcset;
3063 int *equiv_class_alloc;
3064 #else /* not RE_ENABLE_I18N */
3065 build_equiv_class (sbcset, name)
3066 #endif /* not RE_ENABLE_I18N */
3067 re_bitset_ptr_t sbcset;
3068 const unsigned char *name;
3069 {
3070 #if defined _LIBC && defined RE_ENABLE_I18N
3071 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3072 if (nrules != 0)
3073 {
3074 const int32_t *table, *indirect;
3075 const unsigned char *weights, *extra, *cp;
3076 unsigned char char_buf[2];
3077 int32_t idx1, idx2;
3078 unsigned int ch;
3079 size_t len;
3080 /* This #include defines a local function! */
3081 # include <locale/weight.h>
3082 /* Calculate the index for equivalence class. */
3083 cp = name;
3084 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3085 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3086 _NL_COLLATE_WEIGHTMB);
3087 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3088 _NL_COLLATE_EXTRAMB);
3089 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3090 _NL_COLLATE_INDIRECTMB);
3091 idx1 = findidx (&cp);
3092 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3093 /* This isn't a valid character. */
3094 return REG_ECOLLATE;
3095
3096 /* Build single byte matcing table for this equivalence class. */
3097 char_buf[1] = (unsigned char) '\0';
3098 len = weights[idx1];
3099 for (ch = 0; ch < SBC_MAX; ++ch)
3100 {
3101 char_buf[0] = ch;
3102 cp = char_buf;
3103 idx2 = findidx (&cp);
3104 /*
3105 idx2 = table[ch];
3106 */
3107 if (idx2 == 0)
3108 /* This isn't a valid character. */
3109 continue;
3110 if (len == weights[idx2])
3111 {
3112 int cnt = 0;
3113 while (cnt <= len &&
3114 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3115 ++cnt;
3116
3117 if (cnt > len)
3118 bitset_set (sbcset, ch);
3119 }
3120 }
3121 /* Check whether the array has enough space. */
3122 if (*equiv_class_alloc == mbcset->nequiv_classes)
3123 {
3124 /* Not enough, realloc it. */
3125 /* +1 in case of mbcset->nequiv_classes is 0. */
3126 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3127 /* Use realloc since the array is NULL if *alloc == 0. */
3128 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3129 *equiv_class_alloc);
3130 if (BE (mbcset->equiv_classes == NULL, 0))
3131 return REG_ESPACE;
3132 }
3133 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3134 }
3135 else
3136 #endif /* _LIBC && RE_ENABLE_I18N */
3137 {
3138 if (BE (strlen ((const char *) name) != 1, 0))
3139 return REG_ECOLLATE;
3140 bitset_set (sbcset, *name);
3141 }
3142 return REG_NOERROR;
3143 }
3144
3145 /* Helper function for parse_bracket_exp.
3146 Build the character class which is represented by NAME.
3147 The result are written to MBCSET and SBCSET.
3148 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3149 is a pointer argument sinse we may update it. */
3150
3151 static reg_errcode_t
3152 #ifdef RE_ENABLE_I18N
build_charclass(sbcset,mbcset,char_class_alloc,class_name,syntax)3153 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3154 re_charset_t *mbcset;
3155 int *char_class_alloc;
3156 #else /* not RE_ENABLE_I18N */
3157 build_charclass (sbcset, class_name, syntax)
3158 #endif /* not RE_ENABLE_I18N */
3159 re_bitset_ptr_t sbcset;
3160 const unsigned char *class_name;
3161 reg_syntax_t syntax;
3162 {
3163 int i;
3164 const char *name = (const char *) class_name;
3165
3166 /* In case of REG_ICASE "upper" and "lower" match the both of
3167 upper and lower cases. */
3168 if ((syntax & RE_ICASE)
3169 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3170 name = "alpha";
3171
3172 #ifdef RE_ENABLE_I18N
3173 /* Check the space of the arrays. */
3174 if (*char_class_alloc == mbcset->nchar_classes)
3175 {
3176 /* Not enough, realloc it. */
3177 /* +1 in case of mbcset->nchar_classes is 0. */
3178 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3179 /* Use realloc since array is NULL if *alloc == 0. */
3180 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3181 *char_class_alloc);
3182 if (BE (mbcset->char_classes == NULL, 0))
3183 return REG_ESPACE;
3184 }
3185 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3186 #endif /* RE_ENABLE_I18N */
3187
3188 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3189 for (i = 0; i < SBC_MAX; ++i) \
3190 { \
3191 if (ctype_func (i)) \
3192 bitset_set (sbcset, i); \
3193 }
3194
3195 if (strcmp (name, "alnum") == 0)
3196 BUILD_CHARCLASS_LOOP (isalnum)
3197 else if (strcmp (name, "cntrl") == 0)
3198 BUILD_CHARCLASS_LOOP (iscntrl)
3199 else if (strcmp (name, "lower") == 0)
3200 BUILD_CHARCLASS_LOOP (islower)
3201 else if (strcmp (name, "space") == 0)
3202 BUILD_CHARCLASS_LOOP (isspace)
3203 else if (strcmp (name, "alpha") == 0)
3204 BUILD_CHARCLASS_LOOP (isalpha)
3205 else if (strcmp (name, "digit") == 0)
3206 BUILD_CHARCLASS_LOOP (isdigit)
3207 else if (strcmp (name, "print") == 0)
3208 BUILD_CHARCLASS_LOOP (isprint)
3209 else if (strcmp (name, "upper") == 0)
3210 BUILD_CHARCLASS_LOOP (isupper)
3211 else if (strcmp (name, "blank") == 0)
3212 BUILD_CHARCLASS_LOOP (isblank)
3213 else if (strcmp (name, "graph") == 0)
3214 BUILD_CHARCLASS_LOOP (isgraph)
3215 else if (strcmp (name, "punct") == 0)
3216 BUILD_CHARCLASS_LOOP (ispunct)
3217 else if (strcmp (name, "xdigit") == 0)
3218 BUILD_CHARCLASS_LOOP (isxdigit)
3219 else
3220 return REG_ECTYPE;
3221
3222 return REG_NOERROR;
3223 }
3224
3225 static bin_tree_t *
build_word_op(dfa,not,err)3226 build_word_op (dfa, not, err)
3227 re_dfa_t *dfa;
3228 int not;
3229 reg_errcode_t *err;
3230 {
3231 re_bitset_ptr_t sbcset;
3232 #ifdef RE_ENABLE_I18N
3233 re_charset_t *mbcset;
3234 int alloc = 0;
3235 #else /* not RE_ENABLE_I18N */
3236 int non_match = 0;
3237 #endif /* not RE_ENABLE_I18N */
3238 reg_errcode_t ret;
3239 re_token_t br_token;
3240 bin_tree_t *tree;
3241 int new_idx;
3242
3243 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3244 #ifdef RE_ENABLE_I18N
3245 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3246 #endif /* RE_ENABLE_I18N */
3247
3248 #ifdef RE_ENABLE_I18N
3249 if (BE (sbcset == NULL || mbcset == NULL, 0))
3250 #else /* not RE_ENABLE_I18N */
3251 if (BE (sbcset == NULL, 0))
3252 #endif /* not RE_ENABLE_I18N */
3253 {
3254 *err = REG_ESPACE;
3255 return NULL;
3256 }
3257
3258 if (not)
3259 {
3260 #ifdef RE_ENABLE_I18N
3261 int i;
3262 /*
3263 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3264 bitset_set(cset->sbcset, '\0');
3265 */
3266 mbcset->non_match = 1;
3267 if (MB_CUR_MAX > 1)
3268 for (i = 0; i < SBC_MAX; ++i)
3269 if (__btowc (i) == WEOF)
3270 bitset_set (sbcset, i);
3271 #else /* not RE_ENABLE_I18N */
3272 non_match = 1;
3273 #endif /* not RE_ENABLE_I18N */
3274 }
3275
3276 /* We don't care the syntax in this case. */
3277 ret = build_charclass (sbcset,
3278 #ifdef RE_ENABLE_I18N
3279 mbcset, &alloc,
3280 #endif /* RE_ENABLE_I18N */
3281 (const unsigned char *) "alpha", 0);
3282
3283 if (BE (ret != REG_NOERROR, 0))
3284 {
3285 re_free (sbcset);
3286 #ifdef RE_ENABLE_I18N
3287 free_charset (mbcset);
3288 #endif /* RE_ENABLE_I18N */
3289 *err = REG_ESPACE;
3290 return NULL;
3291 }
3292 /* \w match '_' also. */
3293 bitset_set (sbcset, '_');
3294
3295 /* If it is non-matching list. */
3296 #ifdef RE_ENABLE_I18N
3297 if (mbcset->non_match)
3298 #else /* not RE_ENABLE_I18N */
3299 if (non_match)
3300 #endif /* not RE_ENABLE_I18N */
3301 bitset_not (sbcset);
3302
3303 /* Build a tree for simple bracket. */
3304 br_token.type = SIMPLE_BRACKET;
3305 br_token.opr.sbcset = sbcset;
3306 new_idx = re_dfa_add_node (dfa, br_token, 0);
3307 tree = create_tree (NULL, NULL, 0, new_idx);
3308 if (BE (new_idx == -1 || tree == NULL, 0))
3309 goto build_word_op_espace;
3310
3311 #ifdef RE_ENABLE_I18N
3312 if (MB_CUR_MAX > 1)
3313 {
3314 re_token_t alt_token;
3315 bin_tree_t *mbc_tree;
3316 /* Build a tree for complex bracket. */
3317 br_token.type = COMPLEX_BRACKET;
3318 br_token.opr.mbcset = mbcset;
3319 dfa->has_mb_node = 1;
3320 new_idx = re_dfa_add_node (dfa, br_token, 0);
3321 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3322 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3323 goto build_word_op_espace;
3324 /* Then join them by ALT node. */
3325 alt_token.type = OP_ALT;
3326 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3327 tree = create_tree (tree, mbc_tree, 0, new_idx);
3328 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3329 return tree;
3330 }
3331 else
3332 {
3333 free_charset (mbcset);
3334 return tree;
3335 }
3336 #else /* not RE_ENABLE_I18N */
3337 return tree;
3338 #endif /* not RE_ENABLE_I18N */
3339
3340 build_word_op_espace:
3341 re_free (sbcset);
3342 #ifdef RE_ENABLE_I18N
3343 free_charset (mbcset);
3344 #endif /* RE_ENABLE_I18N */
3345 *err = REG_ESPACE;
3346 return NULL;
3347 }
3348
3349 /* This is intended for the expressions like "a{1,3}".
3350 Fetch a number from `input', and return the number.
3351 Return -1, if the number field is empty like "{,1}".
3352 Return -2, If an error is occurred. */
3353
3354 static int
fetch_number(input,token,syntax)3355 fetch_number (input, token, syntax)
3356 re_string_t *input;
3357 re_token_t *token;
3358 reg_syntax_t syntax;
3359 {
3360 int num = -1;
3361 unsigned char c;
3362 while (1)
3363 {
3364 *token = fetch_token (input, syntax);
3365 c = token->opr.c;
3366 if (BE (token->type == END_OF_RE, 0))
3367 return -2;
3368 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3369 break;
3370 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3371 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3372 num = (num > RE_DUP_MAX) ? -2 : num;
3373 }
3374 return num;
3375 }
3376
3377 #ifdef RE_ENABLE_I18N
3378 static void
free_charset(re_charset_t * cset)3379 free_charset (re_charset_t *cset)
3380 {
3381 re_free (cset->mbchars);
3382 # ifdef _LIBC
3383 re_free (cset->coll_syms);
3384 re_free (cset->equiv_classes);
3385 re_free (cset->range_starts);
3386 re_free (cset->range_ends);
3387 # endif
3388 re_free (cset->char_classes);
3389 re_free (cset);
3390 }
3391 #endif /* RE_ENABLE_I18N */
3392
3393 /* Functions for binary tree operation. */
3394
3395 /* Create a node of tree.
3396 Note: This function automatically free left and right if malloc fails. */
3397
3398 static bin_tree_t *
create_tree(left,right,type,index)3399 create_tree (left, right, type, index)
3400 bin_tree_t *left;
3401 bin_tree_t *right;
3402 re_token_type_t type;
3403 int index;
3404 {
3405 bin_tree_t *tree;
3406 tree = re_malloc (bin_tree_t, 1);
3407 if (BE (tree == NULL, 0))
3408 {
3409 free_bin_tree (left);
3410 free_bin_tree (right);
3411 return NULL;
3412 }
3413 tree->parent = NULL;
3414 tree->left = left;
3415 tree->right = right;
3416 tree->type = type;
3417 tree->node_idx = index;
3418 tree->first = -1;
3419 tree->next = -1;
3420 re_node_set_init_empty (&tree->eclosure);
3421
3422 if (left != NULL)
3423 left->parent = tree;
3424 if (right != NULL)
3425 right->parent = tree;
3426 return tree;
3427 }
3428
3429 /* Free the sub tree pointed by TREE. */
3430
3431 static void
free_bin_tree(tree)3432 free_bin_tree (tree)
3433 bin_tree_t *tree;
3434 {
3435 if (tree == NULL)
3436 return;
3437 /*re_node_set_free (&tree->eclosure);*/
3438 free_bin_tree (tree->left);
3439 free_bin_tree (tree->right);
3440 re_free (tree);
3441 }
3442
3443 /* Duplicate the node SRC, and return new node. */
3444
3445 static bin_tree_t *
duplicate_tree(src,dfa)3446 duplicate_tree (src, dfa)
3447 const bin_tree_t *src;
3448 re_dfa_t *dfa;
3449 {
3450 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3451 int new_node_idx;
3452 /* Since node indies must be according to Post-order of the tree,
3453 we must duplicate the left at first. */
3454 if (src->left != NULL)
3455 {
3456 left = duplicate_tree (src->left, dfa);
3457 if (left == NULL)
3458 return NULL;
3459 }
3460
3461 /* Secondaly, duplicate the right. */
3462 if (src->right != NULL)
3463 {
3464 right = duplicate_tree (src->right, dfa);
3465 if (right == NULL)
3466 {
3467 free_bin_tree (left);
3468 return NULL;
3469 }
3470 }
3471
3472 /* At last, duplicate itself. */
3473 if (src->type == NON_TYPE)
3474 {
3475 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3476 dfa->nodes[new_node_idx].duplicated = 1;
3477 if (BE (new_node_idx == -1, 0))
3478 {
3479 free_bin_tree (left);
3480 free_bin_tree (right);
3481 return NULL;
3482 }
3483 }
3484 else
3485 new_node_idx = src->type;
3486
3487 new_tree = create_tree (left, right, src->type, new_node_idx);
3488 if (BE (new_tree == NULL, 0))
3489 {
3490 free_bin_tree (left);
3491 free_bin_tree (right);
3492 }
3493 return new_tree;
3494 }
3495