1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
/* Dummy external definition whose only purpose is to silence a compiler
   warning.  NOTE(review): presumably the warning is the "empty translation
   unit" diagnostic issued when USE_GKREGEX is undefined and everything
   below is compiled out -- confirm.
   Declared with an explicit (void) parameter list so that this is a real
   prototype rather than an obsolescent old-style declarator.  */
void gkfooo(void) { return; }
23
24 #ifdef USE_GKREGEX
25
26 #ifdef HAVE_CONFIG_H
27 #include "config.h"
28 #endif
29
#ifdef _LIBC
/* We have to keep the namespace clean.  When built as part of glibc
   (_LIBC), each public regex entry point named below is rewritten to its
   `__'-prefixed counterpart with the same argument list.  */
# define regfree(preg) __regfree (preg)
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
# define regerror(errcode, preg, errbuf, errbuf_size) \
        __regerror(errcode, preg, errbuf, errbuf_size)
# define re_set_registers(bu, re, nu, st, en) \
        __re_set_registers (bu, re, nu, st, en)
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
        __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
# define re_match(bufp, string, size, pos, regs) \
        __re_match (bufp, string, size, pos, regs)
# define re_search(bufp, string, size, startpos, range, regs) \
        __re_search (bufp, string, size, startpos, range, regs)
# define re_compile_pattern(pattern, length, bufp) \
        __re_compile_pattern (pattern, length, bufp)
# define re_set_syntax(syntax) __re_set_syntax (syntax)
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
        __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)

/* Locale internals are only available inside the glibc source tree.  */
# include "../locale/localeinfo.h"
#endif
54
55 #include "GKlib.h"
56
57
58 /******************************************************************************/
59 /******************************************************************************/
60 /******************************************************************************/
61 /* GKINCLUDE #include "regex_internal.h" */
62 /******************************************************************************/
63 /******************************************************************************/
64 /******************************************************************************/
65 /* Extended regular expression matching and search library.
66 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
67 This file is part of the GNU C Library.
68 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
69
70 The GNU C Library is free software; you can redistribute it and/or
71 modify it under the terms of the GNU Lesser General Public
72 License as published by the Free Software Foundation; either
73 version 2.1 of the License, or (at your option) any later version.
74
75 The GNU C Library is distributed in the hope that it will be useful,
76 but WITHOUT ANY WARRANTY; without even the implied warranty of
77 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
78 Lesser General Public License for more details.
79
80 You should have received a copy of the GNU Lesser General Public
81 License along with the GNU C Library; if not, write to the Free
82 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
83 02111-1307 USA. */
84
85 #ifndef _REGEX_INTERNAL_H
86 #define _REGEX_INTERNAL_H 1
87
88 #include <assert.h>
89 #include <ctype.h>
90 #include <stdio.h>
91 #include <stdlib.h>
92 #include <string.h>
93
/* On MinGW and MSVC builds, route strcasecmp calls to stricmp.  */
#if defined(__MINGW32_VERSION) || defined(_MSC_VER)
#define strcasecmp stricmp
#endif
97
98 #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
99 # include <langinfo.h>
100 #endif
101 #if defined HAVE_LOCALE_H || defined _LIBC
102 # include <locale.h>
103 #endif
104 #if defined HAVE_WCHAR_H || defined _LIBC
105 # include <wchar.h>
106 #endif /* HAVE_WCHAR_H || _LIBC */
107 #if defined HAVE_WCTYPE_H || defined _LIBC
108 # include <wctype.h>
109 #endif /* HAVE_WCTYPE_H || _LIBC */
/* Use <stdbool.h> when it is available (or inside glibc); otherwise provide
   a minimal substitute enum in which false == 0 and true == 1.  */
#if defined HAVE_STDBOOL_H || defined _LIBC
# include <stdbool.h>
#else
typedef enum { false, true } bool;
#endif /* HAVE_STDBOOL_H || _LIBC */
115 #if defined HAVE_STDINT_H || defined _LIBC
116 # include <stdint.h>
117 #endif /* HAVE_STDINT_H || _LIBC */
/* Locking primitives.  Inside glibc the real definitions come from
   <bits/libc-lock.h>.  Elsewhere __libc_lock_define expands to nothing and
   the init/lock/unlock macros expand to an empty statement, so no locking
   is performed at all.  */
#if defined _LIBC
# include <bits/libc-lock.h>
#else
# define __libc_lock_define(CLASS,NAME)
# define __libc_lock_init(NAME) do { } while (0)
# define __libc_lock_lock(NAME) do { } while (0)
# define __libc_lock_unlock(NAME) do { } while (0)
#endif
126
/* In case the system doesn't have isblank(): this fallback treats only
   space and horizontal tab as blank.  Note that CH is evaluated twice, so
   it must not have side effects.  */
#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
#endif
131
132 #ifdef _LIBC
133 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
134 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
135 # include <locale/localeinfo.h>
136 # include <locale/elem-hash.h>
137 # include <locale/coll-lookup.h>
138 # endif
139 #endif
140
/* This is for other GNU distributions with internationalized messages.
   Three configurations are handled:
     - inside glibc, gettext is redefined to look messages up in glibc's own
       domain via __dcgettext;
     - with libintl and NLS enabled, the gettext from <libintl.h> is used;
     - otherwise gettext is the identity macro and messages stay
       untranslated.  */
#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
# include <libintl.h>
# ifdef _LIBC
#  undef gettext
#  define gettext(msgid) \
  INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
# endif
#else
# define gettext(msgid) (msgid)
#endif

#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings.  It expands to its argument unchanged.  */
# define gettext_noop(String) String
#endif
158
/* For systems whose headers lack the definition: the largest size_t
   value.  */
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
#endif

/* Multibyte / wide-character support is compiled in only when every
   listed locale and wide-character facility is available, or always inside
   glibc.  Everything guarded by RE_ENABLE_I18N below depends on this.  */
#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
# define RE_ENABLE_I18N
#endif

/* BE (EXPR, VAL) evaluates to EXPR.  With GCC 3 or later it additionally
   tells the compiler that EXPR is expected to equal VAL.  On every other
   compiler the `inline' keyword is also defined away.  */
#if __GNUC__ >= 3
# define BE(expr, val) __builtin_expect (expr, val)
#else
# define BE(expr, val) (expr)
# define inline
#endif
174
/* Number of distinct single-byte character values.  It also fixes the
   number of bits in a bitset_t.  */
#define SBC_MAX 256

/* NOTE(review): no use of this constant is visible in this part of the
   file; presumably the maximum byte length of a collating element --
   confirm.  */
#define COLL_ELEM_LEN_MAX 8

/* The character which represents newline, in narrow and wide form.  */
#define NEWLINE_CHAR '\n'
#define WIDE_NEWLINE_CHAR L'\n'

/* Rename to standard API for using out of glibc: the `__'-prefixed internal
   names used by this code map back to the public functions, and
   attribute_hidden expands to nothing.  */
#ifndef _LIBC
# define __wctype wctype
# define __iswctype iswctype
# define __btowc btowc
# define __mempcpy mempcpy
# define __wcrtomb wcrtomb
# define __regfree regfree
# define attribute_hidden
#endif /* not _LIBC */

/* __attribute (ARG) applies a GCC attribute when compiling with a GNU C
   compatible compiler and vanishes otherwise.  ARG must carry its own
   parentheses, e.g. __attribute ((pure)).  */
#ifdef __GNUC__
# define __attribute(arg) __attribute__ (arg)
#else
# define __attribute(arg)
#endif
200
201 extern const char __re_error_msgid[] attribute_hidden;
202 extern const size_t __re_error_msgid_idx[] attribute_hidden;
203
/* An integer used to represent a set of bits.  It must be unsigned,
   and must be at least as wide as unsigned int.  */
typedef unsigned long int bitset_word_t;
/* All bits set in a bitset_word_t.
   NOTE(review): ULONG_MAX and CHAR_BIT come from <limits.h>, which is not
   among the headers included directly above; presumably GKlib.h pulls it
   in -- confirm.  */
#define BITSET_WORD_MAX ULONG_MAX
/* Number of bits in a bitset_word_t.  */
#define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)
/* Number of bitset_word_t in a bit_set: one bit per single-byte character
   value.  */
#define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)
/* A full set of SBC_MAX bits, plus pointer types to the first word of such
   a set (mutable and read-only).  */
typedef bitset_word_t bitset_t[BITSET_WORDS];
typedef bitset_word_t *re_bitset_ptr_t;
typedef const bitset_word_t *re_const_bitset_ptr_t;
216
/* Bit-set primitives.  SET is a bitset_t (or a pointer to its first word)
   and I is a bit index.

   bitset_set / bitset_clear  turn bit I on / off and yield the updated word;
   bitset_contain             yields nonzero iff bit I is on;
   bitset_empty / bitset_set_all  clear / set every bit of the whole set;
   bitset_copy                copies one whole set over another.

   Every macro argument is parenthesized so that compound expressions such
   as `base + off' expand with the intended precedence.  I is still
   evaluated twice by the first three macros, so it must not have side
   effects.  No range check is performed on I.  */
#define bitset_set(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   |= (bitset_word_t) 1 << ((i) % BITSET_WORD_BITS))
#define bitset_clear(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   &= ~((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_contain(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   & ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_empty(set) memset ((set), '\0', sizeof (bitset_t))
#define bitset_set_all(set) memset ((set), '\xff', sizeof (bitset_t))
#define bitset_copy(dest,src) memcpy ((dest), (src), sizeof (bitset_t))
226
/* Context-constraint flag bits.  The PREV_* flags are tested against the
   context preceding a position by NOT_SATISFY_PREV_CONSTRAINT, and the
   NEXT_* flags against the following context by
   NOT_SATISFY_NEXT_CONSTRAINT.  The ten flags fit the 10-bit `constraint'
   field of re_token_t.  */
#define PREV_WORD_CONSTRAINT 0x0001
#define PREV_NOTWORD_CONSTRAINT 0x0002
#define NEXT_WORD_CONSTRAINT 0x0004
#define NEXT_NOTWORD_CONSTRAINT 0x0008
#define PREV_NEWLINE_CONSTRAINT 0x0010
#define NEXT_NEWLINE_CONSTRAINT 0x0020
#define PREV_BEGBUF_CONSTRAINT 0x0040
#define NEXT_ENDBUF_CONSTRAINT 0x0080
#define WORD_DELIM_CONSTRAINT 0x0100
#define NOT_WORD_DELIM_CONSTRAINT 0x0200

/* Named combinations of the flags above; stored in re_token_t.opr.ctx_type
   for ANCHOR tokens.  */
typedef enum
{
  INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
  WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
  WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
  INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
  LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
  LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
  BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
  BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
  WORD_DELIM = WORD_DELIM_CONSTRAINT,
  NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
} re_context_type;
251
/* A set of ints held in a heap array (used below for sets of nodes, e.g.
   re_dfastate_t.nodes and re_dfa_t.eclosures).  */
typedef struct
{
  int alloc;   /* presumably the allocated capacity of ELEMS -- confirm */
  int nelem;   /* element count; re_node_set_empty resets it to 0 */
  int *elems;  /* element storage; released by re_node_set_free */
} re_node_set;
258
/* Type tag shared by tokens, nodes and parse-tree nodes.  The values are
   laid out in ranges: 1..7 are ordinary node types, 8..12 have EPSILON_BIT
   set (and are what IS_EPSILON_NODE recognizes), 16..17 are tree-only and
   18 onwards are token-only.  */
typedef enum
{
  NON_TYPE = 0,

  /* Node type, These are used by token, node, tree.  */
  CHARACTER = 1,
  END_OF_RE = 2,
  SIMPLE_BRACKET = 3,
  OP_BACK_REF = 4,
  OP_PERIOD = 5,
#ifdef RE_ENABLE_I18N
  COMPLEX_BRACKET = 6,
  OP_UTF8_PERIOD = 7,
#endif /* RE_ENABLE_I18N */

  /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
     when the debugger shows values of this enum type.  */
#define EPSILON_BIT 8
  OP_OPEN_SUBEXP = EPSILON_BIT | 0,
  OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
  OP_ALT = EPSILON_BIT | 2,
  OP_DUP_ASTERISK = EPSILON_BIT | 3,
  ANCHOR = EPSILON_BIT | 4,

  /* Tree type, these are used only by tree.  */
  CONCAT = 16,
  SUBEXP = 17,

  /* Token type, these are used only by token.  The enumerators after
     OP_DUP_PLUS take consecutive values starting at 19.  */
  OP_DUP_PLUS = 18,
  OP_DUP_QUESTION,
  OP_OPEN_BRACKET,
  OP_CLOSE_BRACKET,
  OP_CHARSET_RANGE,
  OP_OPEN_DUP_NUM,
  OP_CLOSE_DUP_NUM,
  OP_NON_MATCH_LIST,
  OP_OPEN_COLL_ELEM,
  OP_CLOSE_COLL_ELEM,
  OP_OPEN_EQUIV_CLASS,
  OP_CLOSE_EQUIV_CLASS,
  OP_OPEN_CHAR_CLASS,
  OP_CLOSE_CHAR_CLASS,
  OP_WORD,
  OP_NOTWORD,
  OP_SPACE,
  OP_NOTSPACE,
  BACK_SLASH

} re_token_type_t;
309
#ifdef RE_ENABLE_I18N
/* Contents of a bracket expression that needs multibyte handling;
   referenced through re_token_t.opr.mbcset for COMPLEX_BRACKET tokens.
   Each array below has a matching element count further down.  Collating
   symbols and equivalence classes are stored only inside glibc, although
   their counters exist in every build.  */
typedef struct
{
  /* Multibyte characters.  */
  wchar_t *mbchars;

  /* Collating symbols.  */
# ifdef _LIBC
  int32_t *coll_syms;
# endif

  /* Equivalence classes.  */
# ifdef _LIBC
  int32_t *equiv_classes;
# endif

  /* Range expressions.  The element type differs between glibc and
     stand-alone builds.  */
# ifdef _LIBC
  uint32_t *range_starts;
  uint32_t *range_ends;
# else /* not _LIBC */
  wchar_t *range_starts;
  wchar_t *range_ends;
# endif /* not _LIBC */

  /* Character classes.  */
  wctype_t *char_classes;

  /* If this character set is the non-matching list.  */
  unsigned int non_match : 1;

  /* # of multibyte characters.  */
  int nmbchars;

  /* # of collating symbols.  */
  int ncoll_syms;

  /* # of equivalence classes.  */
  int nequiv_classes;

  /* # of range expressions.  */
  int nranges;

  /* # of character classes.  */
  int nchar_classes;
} re_charset_t;
#endif /* RE_ENABLE_I18N */
357
/* One token / node.  TYPE selects which member of the OPR union is
   meaningful, as noted on each member.  */
typedef struct
{
  union
  {
    unsigned char c;            /* for CHARACTER */
    re_bitset_ptr_t sbcset;     /* for SIMPLE_BRACKET */
#ifdef RE_ENABLE_I18N
    re_charset_t *mbcset;       /* for COMPLEX_BRACKET */
#endif /* RE_ENABLE_I18N */
    int idx;                    /* for BACK_REF */
    re_context_type ctx_type;   /* for ANCHOR */
  } opr;
  /* With GCC 2 or later the type tag is packed into an 8-bit field so that
     it shares storage with the flags below.  */
#if __GNUC__ >= 2
  re_token_type_t type : 8;
#else
  re_token_type_t type;
#endif
  unsigned int constraint : 10; /* context constraint */
  unsigned int duplicated : 1;
  unsigned int opt_subexp : 1;
#ifdef RE_ENABLE_I18N
  unsigned int accept_mb : 1;
  /* These 2 bits can be moved into the union if needed (e.g. if running out
     of bits; move opr.c to opr.c.c and move the flags to opr.c.flags).  */
  unsigned int mb_partial : 1;
#endif
  unsigned int word_char : 1;
} re_token_t;

/* Nonzero iff TYPE is one of the token types that have EPSILON_BIT set
   (OP_OPEN_SUBEXP, OP_CLOSE_SUBEXP, OP_ALT, OP_DUP_ASTERISK, ANCHOR).  */
#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
388
/* A view over the string being compiled or matched, together with the
   optional converted copies (translated / case-folded bytes and wide
   characters) and the bookkeeping that relates them to the raw input.
   RE_TRANSLATE_TYPE is not declared in this part of the file.  */
struct re_string_t
{
  /* Indicate the raw buffer which is the original string passed as an
     argument of regexec(), re_search(), etc..  */
  const unsigned char *raw_mbs;
  /* Store the multibyte string.  In case of "case insensitive mode" like
     REG_ICASE, upper cases of the string are stored, otherwise MBS points
     the same address that RAW_MBS points.  */
  unsigned char *mbs;
#ifdef RE_ENABLE_I18N
  /* Store the wide character string which is corresponding to MBS.  */
  wint_t *wcs;
  /* Resized together with WCS by re_string_realloc_buffers, but only when
     already non-NULL.  */
  int *offsets;
  mbstate_t cur_state;
#endif
  /* Index in RAW_MBS.  Each character mbs[i] corresponds to
     raw_mbs[raw_mbs_idx + i].  */
  int raw_mbs_idx;
  /* The length of the valid characters in the buffers.  */
  int valid_len;
  /* The corresponding number of bytes in raw_mbs array.  */
  int valid_raw_len;
  /* The length of the buffers MBS and WCS.  */
  int bufs_len;
  /* The index in MBS, which is updated by re_string_fetch_byte.  */
  int cur_idx;
  /* length of RAW_MBS array.  */
  int raw_len;
  /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN.  */
  int len;
  /* End of the buffer may be shorter than its length in the cases such
     as re_match_2, re_search_2.  Then, we use STOP for end of the buffer
     instead of LEN.  */
  int raw_stop;
  /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS.  */
  int stop;

  /* The context of mbs[0].  We store the context independently, since
     the context of mbs[0] may be different from raw_mbs[0], which is
     the beginning of the input string.  */
  unsigned int tip_context;
  /* The translation passed as a part of an argument of re_compile_pattern.  */
  RE_TRANSLATE_TYPE trans;
  /* Copy of re_dfa_t's word_char.  */
  re_const_bitset_ptr_t word_char;
  /* 1 if REG_ICASE.  */
  unsigned char icase;
  /* Copies of the re_dfa_t fields of the same name.  */
  unsigned char is_utf8;
  unsigned char map_notascii;
  /* Nonzero when MBS is a separately allocated buffer; set by
     re_string_construct_common when a translation table or case folding is
     in use.  */
  unsigned char mbs_allocated;
  unsigned char offsets_needed;
  unsigned char newline_anchor;
  /* Copy of re_dfa_t's word_ops_used.  */
  unsigned char word_ops_used;
  /* Copy of re_dfa_t's mb_cur_max.  */
  int mb_cur_max;
};
typedef struct re_string_t re_string_t;
445
446
/* Forward declaration; the structure itself is defined further down.  */
struct re_dfa_t;
typedef struct re_dfa_t re_dfa_t;

/* Calling-convention marker for file-local functions.  Outside glibc it
   requests regparm (3) + stdcall on i386 and is empty on every other
   target.  Since __attribute vanishes on non-GNU compilers, it is empty
   there as well.  */
#ifndef _LIBC
# ifdef __i386__
#  define internal_function   __attribute ((regparm (3), stdcall))
# else
#  define internal_function
# endif
#endif
457
458 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
459 int new_buf_len)
460 internal_function;
461 #ifdef RE_ENABLE_I18N
462 static void build_wcs_buffer (re_string_t *pstr) internal_function;
463 static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;
464 #endif /* RE_ENABLE_I18N */
465 static void build_upper_buffer (re_string_t *pstr) internal_function;
466 static void re_string_translate_buffer (re_string_t *pstr) internal_function;
467 static unsigned int re_string_context_at (const re_string_t *input, int idx,
468 int eflags)
469 internal_function __attribute ((pure));
/* Accessor macros for re_string_t.  PSTR is a pointer to the string object.
   None of them performs any bounds checking.

   re_string_peek_byte   byte at OFFSET from the current index (no advance);
   re_string_fetch_byte  byte at the current index, then advance by one;
   re_string_first_byte  nonzero iff IDX is the end of the valid data or
                         wcs[IDX] is not WEOF;
   re_string_is_single_byte_char
                         nonzero iff neither wcs[IDX] nor the following
                         position (if any) is WEOF;
   re_string_eoi         nonzero iff the current index has reached STOP.

   The two macros that read WCS are usable only when RE_ENABLE_I18N is
   defined, because the field exists only then.  OFFSET is parenthesized so
   that compound arguments (for instance a conditional expression) are
   added to the current index as a whole.  PSTR and IDX may be evaluated
   more than once.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[(idx)] != WEOF)
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[(idx)] != WEOF && ((pstr)->valid_len == (idx) + 1 \
                                  || (pstr)->wcs[(idx) + 1] != WEOF))
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[(idx)])
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
486
/* Make alloca available: the GCC builtin, or _alloca from <malloc.h> with
   MSVC.  Any other compiler is rejected at compile time.  */
#ifdef __GNUC__
# define alloca(size)   __builtin_alloca (size)
# define HAVE_ALLOCA 1
#elif defined(_MSC_VER)
# include <malloc.h>
# define alloca _alloca
# define HAVE_ALLOCA 1
#else
# error No alloca()
#endif

/* __libc_use_alloca (N): nonzero when an N-byte object may be placed on the
   stack.  Both branches above define HAVE_ALLOCA as 1, so the `# else' arm
   below is not selected by the definitions made in this file.  */
#ifndef _LIBC
# if HAVE_ALLOCA
/* The OS usually guarantees only one guard page at the bottom of the stack,
   and a page size can be as small as 4096 bytes.  So we cannot safely
   allocate anything larger than 4096 bytes.  Also care for the possibility
   of a few compiler-allocated temporary stack slots.  */
#  define __libc_use_alloca(n) ((n) < 4032)
# else
/* alloca is implemented with malloc, so just use malloc.  */
#  define __libc_use_alloca(n) 0
# endif
#endif

/* Typed allocation helpers: allocate or resize an array of N objects of
   type T, or free P.  The product N * sizeof (T) is not checked for
   overflow here; callers are responsible for passing a sane N.  */
#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
#define re_free(p) free (p)
514
/* A node of the binary parse tree.  Besides the parent / left / right
   links it carries two extra links, FIRST and NEXT, whose use is not shown
   in this part of the file.  */
struct bin_tree_t
{
  struct bin_tree_t *parent;
  struct bin_tree_t *left;
  struct bin_tree_t *right;
  struct bin_tree_t *first;
  struct bin_tree_t *next;

  re_token_t token;

  /* `node_idx' is the index in dfa->nodes, if `type' == 0.
     Otherwise `type' indicate the type of this node.  */
  int node_idx;
};
typedef struct bin_tree_t bin_tree_t;

/* Number of tree nodes per storage chunk, chosen so that one
   bin_tree_storage_t (link pointer plus node array) is about 1024 bytes.  */
#define BIN_TREE_STORAGE_SIZE \
    ((1024 - sizeof (void *)) / sizeof (bin_tree_t))

/* A chunk of tree nodes; chunks are chained through NEXT.  re_dfa_t keeps
   the chain in str_tree_storage and an index in str_tree_storage_idx.  */
struct bin_tree_storage_t
{
  struct bin_tree_storage_t *next;
  bin_tree_t data[BIN_TREE_STORAGE_SIZE];
};
typedef struct bin_tree_storage_t bin_tree_storage_t;
540
/* Context flags describing the surroundings of a string position.  The
   four flags are distinct bits (1, 2, 4, 8); a context of 0 is
   "ordinary".  */
#define CONTEXT_WORD 1
#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)

/* Predicates on a context value C.  */
#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
#define IS_ORDINARY_CONTEXT(c) ((c) == 0)

/* Character classification.  A word character is an alphanumeric (per
   isalnum / iswalnum) or an underscore.  CH is evaluated more than once.  */
#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)

/* Nonzero iff CONTEXT (the context before a position) violates at least
   one of the PREV_* flags present in CONSTRAINT.  CONSTRAINT is
   parenthesized in every clause, matching NOT_SATISFY_NEXT_CONSTRAINT, so
   that an argument such as `a | b' is masked as a whole.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
  ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
   || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
   || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
   || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* Nonzero iff CONTEXT (the context after a position) violates at least one
   of the NEXT_* flags present in CONSTRAINT.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
  ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
   || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
   || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
   || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
568
/* One DFA state: a set of nodes plus cached information derived from it.
   NOTE(review): the roles of the individual node sets and transition
   tables are not shown in this part of the file; the names suggest the
   full node set, its non-epsilon subset, the inverse epsilon closure and
   per-byte transition tables -- confirm against the users.  */
struct re_dfastate_t
{
  unsigned int hash;
  re_node_set nodes;
  re_node_set non_eps_nodes;
  re_node_set inveclosure;
  re_node_set *entrance_nodes;
  struct re_dfastate_t **trtable, **word_trtable;
  /* Wide enough for the four CONTEXT_* flag bits.  */
  unsigned int context : 4;
  unsigned int halt : 1;
  /* If this state can accept `multi byte'.
     Note that we refer to multibyte characters, and multi character
     collating elements as `multi byte'.  */
  unsigned int accept_mb : 1;
  /* If this state has backreference node(s).  */
  unsigned int has_backref : 1;
  unsigned int has_constraint : 1;
};
typedef struct re_dfastate_t re_dfastate_t;
588
/* One entry of re_dfa_t.state_table: an array of state pointers with its
   element count and (presumably) allocated capacity.  */
struct re_state_table_entry
{
  int num;
  int alloc;
  re_dfastate_t **array;
};

/* Array type used in re_sub_match_last_t and re_sub_match_top_t.  */

typedef struct
{
  int next_idx;
  int alloc;
  re_dfastate_t **array;
} state_array_t;
604
/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP.  */

typedef struct
{
  int node;
  int str_idx; /* The position NODE match at.  */
  state_array_t path;
} re_sub_match_last_t;

/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
   And information about the node, whose type is OP_CLOSE_SUBEXP,
   corresponding to NODE is stored in LASTS.  Unlike re_sub_match_last_t,
   PATH is held by pointer here rather than embedded.  */

typedef struct
{
  int str_idx;
  int node;
  state_array_t *path;
  int alasts; /* Allocation size of LASTS.  */
  int nlasts; /* The number of LASTS.  */
  re_sub_match_last_t **lasts;
} re_sub_match_top_t;
627
/* One entry of the back reference cache kept in
   re_match_context_t.bkref_ents.
   NOTE(review): field semantics are not shown in this part of the file;
   presumably NODE / STR_IDX identify the back reference and where it was
   tried, and SUBEXP_FROM / SUBEXP_TO delimit the referenced match --
   confirm.  */
struct re_backref_cache_entry
{
  int node;
  int str_idx;
  int subexp_from;
  int subexp_to;
  char more;
  char unused;
  unsigned short int eps_reachable_subexps_map;
};
638
/* Per-match working state: the input string object, the automaton being
   run, and the growable tables (state log, back reference cache,
   sub-match tops) built up while matching.  */
typedef struct
{
  /* The string object corresponding to the input string.  */
  re_string_t input;
  /* The pointer itself is const only inside glibc or under C99 and later,
     where the object can be set up with an initializer.  */
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
  const re_dfa_t *const dfa;
#else
  const re_dfa_t *dfa;
#endif
  /* EFLAGS of the argument of regexec.  */
  int eflags;
  /* Where the matching ends.  */
  int match_last;
  int last_node;
  /* The state log used by the matcher.  */
  re_dfastate_t **state_log;
  int state_log_top;
  /* Back reference cache.  The n- / a- prefixes presumably mean "number
     used" and "number allocated", as with nlasts / alasts above.  */
  int nbkref_ents;
  int abkref_ents;
  struct re_backref_cache_entry *bkref_ents;
  int max_mb_elem_len;
  int nsub_tops;
  int asub_tops;
  re_sub_match_top_t **sub_tops;
} re_match_context_t;
665
/* Working state for "sifting" the state log.
   NOTE(review): none of the users is visible in this part of the file;
   presumably the two arrays are indexed by string position and LAST_NODE /
   LAST_STR_IDX mark where sifting starts -- confirm.  */
typedef struct
{
  re_dfastate_t **sifted_states;
  re_dfastate_t **limited_states;
  int last_node;
  int last_str_idx;
  re_node_set limits;
} re_sift_context_t;
674
/* One saved point on the fail stack: a string index, a node, a register
   array and a node set.  regmatch_t is not declared in this part of the
   file.  */
struct re_fail_stack_ent_t
{
  int idx;
  int node;
  regmatch_t *regs;
  re_node_set eps_via_nodes;
};

/* The fail stack itself: an array of entries with its used count and
   (presumably) allocated capacity.  */
struct re_fail_stack_t
{
  int num;
  int alloc;
  struct re_fail_stack_ent_t *stack;
};
689
/* The compiled pattern.  It holds the node array with its per-node side
   tables, the table of DFA states, the parse tree with its chunked
   storage, and global properties of the pattern and locale.
   MB_CUR_MAX, IS_UTF8 and MAP_NOTASCII are copied into every re_string_t
   by re_string_construct_common; WORD_CHAR and WORD_OPS_USED are copied by
   re_string_allocate.  */
struct re_dfa_t
{
  re_token_t *nodes;
  size_t nodes_alloc;
  size_t nodes_len;
  /* Per-node side tables.  NOTE(review): presumably each is indexed by
     node number, parallel to NODES -- confirm.  */
  int *nexts;
  int *org_indices;
  re_node_set *edests;
  re_node_set *eclosures;
  re_node_set *inveclosures;
  struct re_state_table_entry *state_table;
  /* Initial states; the suffixes mirror the CONTEXT_WORD, CONTEXT_NEWLINE
     and CONTEXT_BEGBUF flags.  */
  re_dfastate_t *init_state;
  re_dfastate_t *init_state_word;
  re_dfastate_t *init_state_nl;
  re_dfastate_t *init_state_begbuf;
  bin_tree_t *str_tree;
  bin_tree_storage_t *str_tree_storage;
  re_bitset_ptr_t sb_char;
  int str_tree_storage_idx;

  /* number of subexpressions `re_nsub' is in regex_t.  */
  unsigned int state_hash_mask;
  int init_node;
  int nbackref; /* The number of backreference in this dfa.  */

  /* Bitmap expressing which backreference is used.  */
  bitset_word_t used_bkref_map;
  bitset_word_t completed_bkref_map;

  unsigned int has_plural_match : 1;
  /* If this dfa has "multibyte node", which is a backreference or
     a node which can accept multibyte character or multi character
     collating element.  */
  unsigned int has_mb_node : 1;
  unsigned int is_utf8 : 1;
  unsigned int map_notascii : 1;
  unsigned int word_ops_used : 1;
  int mb_cur_max;
  bitset_t word_char;
  reg_syntax_t syntax;
  int *subexp_map;
#ifdef DEBUG
  char* re_str;
#endif
  /* Expands to nothing outside glibc, so no lock member exists there; the
     macro supplies its own terminator, hence no semicolon.  */
  __libc_lock_define (, lock)
};
736
/* Convenience macros for re_node_set.

   re_node_set_init_empty  zeroes the whole object (no storage, no
                           elements);
   re_node_set_remove      removes ID by position.  The `- 1' implies that
                           re_node_set_contains yields a 1-based position;
                           NOTE(review): if it yields 0 for "absent" this
                           passes -1 to re_node_set_remove_at -- confirm
                           how that case is handled;
   re_node_set_empty       drops all elements but keeps the storage;
   re_node_set_free        releases the storage (the set object itself is
                           left untouched).  */
#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
#define re_node_set_remove(set,id) \
  (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
#define re_node_set_empty(p) ((p)->nelem = 0)
#define re_node_set_free(set) re_free ((set)->elems)
742
743
/* Kinds of element that can appear inside a bracket expression.  */
typedef enum
{
  SB_CHAR,
  MB_CHAR,
  EQUIV_CLASS,
  COLL_SYM,
  CHAR_CLASS
} bracket_elem_type;

/* One parsed bracket-expression element.  TYPE selects the OPR member.
   NOTE(review): presumably CH goes with SB_CHAR, WCH with MB_CHAR and NAME
   with the three named kinds -- confirm against the parser.  */
typedef struct
{
  bracket_elem_type type;
  union
  {
    unsigned char ch;
    unsigned char *name;
    wchar_t wch;
  } opr;
} bracket_elem_t;
763
764
765 /* Inline functions for bitset operation. */
766 static inline void
bitset_not(bitset_t set)767 bitset_not (bitset_t set)
768 {
769 int bitset_i;
770 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
771 set[bitset_i] = ~set[bitset_i];
772 }
773
774 static inline void
bitset_merge(bitset_t dest,const bitset_t src)775 bitset_merge (bitset_t dest, const bitset_t src)
776 {
777 int bitset_i;
778 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
779 dest[bitset_i] |= src[bitset_i];
780 }
781
782 static inline void
bitset_mask(bitset_t dest,const bitset_t src)783 bitset_mask (bitset_t dest, const bitset_t src)
784 {
785 int bitset_i;
786 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
787 dest[bitset_i] &= src[bitset_i];
788 }
789
790 #ifdef RE_ENABLE_I18N
791 /* Inline functions for re_string. */
792 static inline int
internal_function(pure)793 internal_function __attribute ((pure))
794 re_string_char_size_at (const re_string_t *pstr, int idx)
795 {
796 int byte_idx;
797 if (pstr->mb_cur_max == 1)
798 return 1;
799 for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
800 if (pstr->wcs[idx + byte_idx] != WEOF)
801 break;
802 return byte_idx;
803 }
804
805 static inline wint_t
internal_function(pure)806 internal_function __attribute ((pure))
807 re_string_wchar_at (const re_string_t *pstr, int idx)
808 {
809 if (pstr->mb_cur_max == 1)
810 return (wint_t) pstr->mbs[idx];
811 return (wint_t) pstr->wcs[idx];
812 }
813
/* Return the byte length of the collating element starting at index IDX of
   PSTR.  Outside glibc, or when the collation locale defines no rules,
   this is always 1.  */
static int
internal_function __attribute ((pure))
re_string_elem_size_at (const re_string_t *pstr, int idx)
{
# ifdef _LIBC
  const unsigned char *p, *extra;
  const int32_t *table, *indirect;
  int32_t tmp;
  /* weight.h is included inside the function body; presumably its findidx
     refers to the locals TABLE, EXTRA and INDIRECT declared above --
     confirm.  */
#  include <locale/weight.h>
  uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);

  if (nrules != 0)
    {
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
      extra = (const unsigned char *)
        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
                                                _NL_COLLATE_INDIRECTMB);
      p = pstr->mbs + idx;
      /* The result of findidx is not used; only the distance P has moved
         from its starting point is returned.  */
      tmp = findidx (&p);
      return p - pstr->mbs - idx;
    }
  else
# endif /* _LIBC */
    /* This statement is the `else' branch inside glibc and the whole
       function body elsewhere.  */
    return 1;
}
840 #endif /* RE_ENABLE_I18N */
841
842 #endif /* _REGEX_INTERNAL_H */
843
844 /******************************************************************************/
845 /******************************************************************************/
846 /******************************************************************************/
847 /* GKINCLUDE #include "regex_internal.c" */
848 /******************************************************************************/
849 /******************************************************************************/
850 /******************************************************************************/
851 /* Extended regular expression matching and search library.
852 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
853 This file is part of the GNU C Library.
854 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
855
856 The GNU C Library is free software; you can redistribute it and/or
857 modify it under the terms of the GNU Lesser General Public
858 License as published by the Free Software Foundation; either
859 version 2.1 of the License, or (at your option) any later version.
860
861 The GNU C Library is distributed in the hope that it will be useful,
862 but WITHOUT ANY WARRANTY; without even the implied warranty of
863 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
864 Lesser General Public License for more details.
865
866 You should have received a copy of the GNU Lesser General Public
867 License along with the GNU C Library; if not, write to the Free
868 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
869 02111-1307 USA. */
870
871 static void re_string_construct_common (const char *str, int len,
872 re_string_t *pstr,
873 RE_TRANSLATE_TYPE trans, int icase,
874 const re_dfa_t *dfa) internal_function;
875 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
876 const re_node_set *nodes,
877 unsigned int hash) internal_function;
878 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
879 const re_node_set *nodes,
880 unsigned int context,
881 unsigned int hash) internal_function;
882
883 /* Functions for string operation. */
884
885 /* This function allocate the buffers. It is necessary to call
886 re_string_reconstruct before using the object. */
887
888 static reg_errcode_t
889 internal_function
re_string_allocate(re_string_t * pstr,const char * str,int len,int init_len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)890 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
891 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
892 {
893 reg_errcode_t ret;
894 int init_buf_len;
895
896 /* Ensure at least one character fits into the buffers. */
897 if (init_len < dfa->mb_cur_max)
898 init_len = dfa->mb_cur_max;
899 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
900 re_string_construct_common (str, len, pstr, trans, icase, dfa);
901
902 ret = re_string_realloc_buffers (pstr, init_buf_len);
903 if (BE (ret != REG_NOERROR, 0))
904 return ret;
905
906 pstr->word_char = dfa->word_char;
907 pstr->word_ops_used = dfa->word_ops_used;
908 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
909 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
910 pstr->valid_raw_len = pstr->valid_len;
911 return REG_NOERROR;
912 }
913
/* This function allocates the buffers and initializes them.

   PSTR is zeroed first, then set up for the string STR of length LEN with
   translation table TRANS and case folding flag ICASE, using the locale
   properties recorded in DFA.  Returns REG_NOERROR on success, or the
   error reported by re_string_realloc_buffers or
   build_wcs_upper_buffer.  */

static reg_errcode_t
internal_function
re_string_construct (re_string_t *pstr, const char *str, int len,
                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
{
  reg_errcode_t ret;
  memset (pstr, '\0', sizeof (re_string_t));
  re_string_construct_common (str, len, pstr, trans, icase, dfa);

  /* Buffers are allocated only for a non-empty string, with room for
     LEN + 1 elements.  */
  if (len > 0)
    {
      ret = re_string_realloc_buffers (pstr, len + 1);
      if (BE (ret != REG_NOERROR, 0))
        return ret;
    }
  /* Without a private buffer, MBS aliases the caller's string.  */
  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;

  if (icase)
    {
#ifdef RE_ENABLE_I18N
      if (dfa->mb_cur_max > 1)
        {
          /* Convert, doubling the buffers as needed.  Stop once the whole
             input has been consumed, or once more than mb_cur_max slots
             remain free after the valid data.  */
          while (1)
            {
              ret = build_wcs_upper_buffer (pstr);
              if (BE (ret != REG_NOERROR, 0))
                return ret;
              if (pstr->valid_raw_len >= len)
                break;
              if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
                break;
              ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
              if (BE (ret != REG_NOERROR, 0))
                return ret;
            }
        }
      else
#endif /* RE_ENABLE_I18N */
        build_upper_buffer (pstr);
    }
  else
    {
#ifdef RE_ENABLE_I18N
      if (dfa->mb_cur_max > 1)
        build_wcs_buffer (pstr);
      else
#endif /* RE_ENABLE_I18N */
        {
          if (trans != NULL)
            re_string_translate_buffer (pstr);
          else
            {
              /* Nothing to convert: the whole buffer counts as valid.
                 bufs_len is LEN + 1 here when LEN > 0 and 0 otherwise.  */
              pstr->valid_len = pstr->bufs_len;
              pstr->valid_raw_len = pstr->bufs_len;
            }
        }
    }

  return REG_NOERROR;
}
976
977 /* Helper functions for re_string_allocate, and re_string_construct. */
978
979 static reg_errcode_t
980 internal_function
re_string_realloc_buffers(re_string_t * pstr,int new_buf_len)981 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
982 {
983 #ifdef RE_ENABLE_I18N
984 if (pstr->mb_cur_max > 1)
985 {
986 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
987 if (BE (new_wcs == NULL, 0))
988 return REG_ESPACE;
989 pstr->wcs = new_wcs;
990 if (pstr->offsets != NULL)
991 {
992 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
993 if (BE (new_offsets == NULL, 0))
994 return REG_ESPACE;
995 pstr->offsets = new_offsets;
996 }
997 }
998 #endif /* RE_ENABLE_I18N */
999 if (pstr->mbs_allocated)
1000 {
1001 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
1002 new_buf_len);
1003 if (BE (new_mbs == NULL, 0))
1004 return REG_ESPACE;
1005 pstr->mbs = new_mbs;
1006 }
1007 pstr->bufs_len = new_buf_len;
1008 return REG_NOERROR;
1009 }
1010
1011
1012 static void
1013 internal_function
re_string_construct_common(const char * str,int len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)1014 re_string_construct_common (const char *str, int len, re_string_t *pstr,
1015 RE_TRANSLATE_TYPE trans, int icase,
1016 const re_dfa_t *dfa)
1017 {
1018 pstr->raw_mbs = (const unsigned char *) str;
1019 pstr->len = len;
1020 pstr->raw_len = len;
1021 pstr->trans = trans;
1022 pstr->icase = icase ? 1 : 0;
1023 pstr->mbs_allocated = (trans != NULL || icase);
1024 pstr->mb_cur_max = dfa->mb_cur_max;
1025 pstr->is_utf8 = dfa->is_utf8;
1026 pstr->map_notascii = dfa->map_notascii;
1027 pstr->stop = pstr->len;
1028 pstr->raw_stop = pstr->stop;
1029 }
1030
1031 #ifdef RE_ENABLE_I18N
1032
1033 /* Build wide character buffer PSTR->WCS.
1034 If the byte sequence of the string are:
1035 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
1036 Then wide character buffer will be:
1037 <wc1> , WEOF , <wc2> , WEOF , <wc3>
1038 We use WEOF for padding, they indicate that the position isn't
1039 a first byte of a multibyte character.
1040
1041 Note that this function assumes PSTR->VALID_LEN elements are already
1042 built and starts from PSTR->VALID_LEN. */
1043
1044 static void
1045 internal_function
build_wcs_buffer(re_string_t * pstr)1046 build_wcs_buffer (re_string_t *pstr)
1047 {
1048 #ifdef _LIBC
1049 unsigned char buf[MB_LEN_MAX];
1050 assert (MB_LEN_MAX >= pstr->mb_cur_max);
1051 #else
1052 unsigned char buf[64];
1053 #endif
1054 mbstate_t prev_st;
1055 int byte_idx, end_idx, remain_len;
1056 size_t mbclen;
1057
1058 /* Build the buffers from pstr->valid_len to either pstr->len or
1059 pstr->bufs_len. */
1060 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1061 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
1062 {
1063 wchar_t wc;
1064 const char *p;
1065
1066 remain_len = end_idx - byte_idx;
1067 prev_st = pstr->cur_state;
1068 /* Apply the translation if we need. */
1069 if (BE (pstr->trans != NULL, 0))
1070 {
1071 int i, ch;
1072
1073 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1074 {
1075 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
1076 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
1077 }
1078 p = (const char *) buf;
1079 }
1080 else
1081 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
1082 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
1083 if (BE (mbclen == (size_t) -2, 0))
1084 {
1085 /* The buffer doesn't have enough space, finish to build. */
1086 pstr->cur_state = prev_st;
1087 break;
1088 }
1089 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
1090 {
1091 /* We treat these cases as a singlebyte character. */
1092 mbclen = 1;
1093 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1094 if (BE (pstr->trans != NULL, 0))
1095 wc = pstr->trans[wc];
1096 pstr->cur_state = prev_st;
1097 }
1098
1099 /* Write wide character and padding. */
1100 pstr->wcs[byte_idx++] = wc;
1101 /* Write paddings. */
1102 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1103 pstr->wcs[byte_idx++] = WEOF;
1104 }
1105 pstr->valid_len = byte_idx;
1106 pstr->valid_raw_len = byte_idx;
1107 }
1108
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
   but for REG_ICASE: every lower-case character is replaced by its
   upper-case counterpart in both PSTR->WCS and PSTR->MBS.

   The function has two paths.  The first one is taken while no
   translation table is set, ASCII maps to wide characters by a plain
   cast and no offsets table is in use; it keeps buffer and raw indices
   identical.  The second (general) path additionally applies PSTR->TRANS
   and copes with upper-case forms whose multibyte length differs from
   the original: it then allocates/fills PSTR->OFFSETS (buffer index ->
   raw index), sets PSTR->OFFSETS_NEEDED and adjusts PSTR->LEN and
   PSTR->STOP.  The first path jumps into the second one the first time
   such a length change is met.

   Returns REG_NOERROR, or REG_ESPACE if the offsets table cannot be
   allocated.  On return PSTR->VALID_LEN / PSTR->VALID_RAW_LEN tell how
   far the buffers / the raw string have been processed.

   NOTE(review): an incomplete multibyte sequence always ends the build
   here, even when the buffers already span the whole string -- confirm
   whether callers can then ever see VALID_LEN reach LEN.  */

static reg_errcode_t
internal_function
build_wcs_upper_buffer (re_string_t *pstr)
{
  mbstate_t prev_st;
  int src_idx, byte_idx, end_idx, remain_len;
  size_t mbclen;
#ifdef _LIBC
  char buf[MB_LEN_MAX];
  assert (MB_LEN_MAX >= pstr->mb_cur_max);
#else
  char buf[64];
#endif

  /* Resume where the previous build stopped; never go past the end of
     the string or of the buffers.  */
  byte_idx = pstr->valid_len;
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;

  /* The following optimization assumes that ASCII characters can be
     mapped to wide characters with a simple cast.  */
  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
    {
      while (byte_idx < end_idx)
	{
	  wchar_t wc;

	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
	      && mbsinit (&pstr->cur_state))
	    {
	      /* In case of a singlebyte character.  */
	      pstr->mbs[byte_idx]
		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
	      /* The next step uses the assumption that wchar_t is encoded
		 ASCII-safe: all ASCII values can be converted like this.  */
	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
	      ++byte_idx;
	      continue;
	    }

	  remain_len = end_idx - byte_idx;
	  /* Saved so that the shift state can be restored if the
	     conversion below does not yield a character.  */
	  prev_st = pstr->cur_state;
	  mbclen = mbrtowc (&wc,
			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
			     + byte_idx), remain_len, &pstr->cur_state);
	  /* MBCLEN + 2 > 2 holds exactly when MBCLEN is none of 0,
	     (size_t) -1 and (size_t) -2, i.e. a character was decoded.  */
	  if (BE (mbclen + 2 > 2, 1))
	    {
	      wchar_t wcu = wc;
	      if (iswlower (wc))
		{
		  size_t mbcdlen;

		  wcu = towupper (wc);
		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
		  if (BE (mbclen == mbcdlen, 1))
		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
		  else
		    {
		      /* The upper-case form has a different byte length
			 (or could not be encoded): hand this character
			 over to the general path below.  */
		      src_idx = byte_idx;
		      goto offsets_needed;
		    }
		}
	      else
		memcpy (pstr->mbs + byte_idx,
			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
	      pstr->wcs[byte_idx++] = wcu;
	      /* Write paddings.  */
	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
		pstr->wcs[byte_idx++] = WEOF;
	    }
	  else if (mbclen == (size_t) -1 || mbclen == 0)
	    {
	      /* It is an invalid character or '\0'.  Just use the byte.  */
	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
	      pstr->mbs[byte_idx] = ch;
	      /* And also cast it to wide char.  */
	      pstr->wcs[byte_idx++] = (wchar_t) ch;
	      if (BE (mbclen == (size_t) -1, 0))
		pstr->cur_state = prev_st;
	    }
	  else
	    {
	      /* The buffer doesn't have enough space, finish to build.  */
	      pstr->cur_state = prev_st;
	      break;
	    }
	}
      /* On this path buffer and raw positions never diverge.  */
      pstr->valid_len = byte_idx;
      pstr->valid_raw_len = byte_idx;
      return REG_NOERROR;
    }
  else
    /* General path: SRC_IDX walks the raw string, BYTE_IDX the
       buffers.  */
    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
      {
	wchar_t wc;
	const char *p;
	/* Entry point for the first path once a length-changing
	   conversion has been found.  */
      offsets_needed:
	remain_len = end_idx - byte_idx;
	prev_st = pstr->cur_state;
	if (BE (pstr->trans != NULL, 0))
	  {
	    int i, ch;

	    /* Decode from a translated copy of the next bytes.  */
	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
	      {
		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
		buf[i] = pstr->trans[ch];
	      }
	    p = (const char *) buf;
	  }
	else
	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
	/* True when a character was decoded (see the first path).  */
	if (BE (mbclen + 2 > 2, 1))
	  {
	    wchar_t wcu = wc;
	    if (iswlower (wc))
	      {
		size_t mbcdlen;

		wcu = towupper (wc);
		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
		if (BE (mbclen == mbcdlen, 1))
		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
		else if (mbcdlen != (size_t) -1)
		  {
		    size_t i;

		    /* The upper-case form occupies MBCDLEN bytes instead
		       of MBCLEN.  Stop if it does not fit; the caller is
		       expected to enlarge the buffers and call again.  */
		    if (byte_idx + mbcdlen > pstr->bufs_len)
		      {
			pstr->cur_state = prev_st;
			break;
		      }

		    if (pstr->offsets == NULL)
		      {
			pstr->offsets = re_malloc (int, pstr->bufs_len);

			if (pstr->offsets == NULL)
			  return REG_ESPACE;
		      }
		    if (!pstr->offsets_needed)
		      {
			/* Everything built so far maps one to one.  */
			for (i = 0; i < (size_t) byte_idx; ++i)
			  pstr->offsets[i] = i;
			pstr->offsets_needed = 1;
		      }

		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
		    pstr->wcs[byte_idx] = wcu;
		    pstr->offsets[byte_idx] = src_idx;
		    /* Map the remaining bytes of the new form onto the
		       bytes of the original one, clamping at its last
		       byte.  */
		    for (i = 1; i < mbcdlen; ++i)
		      {
			pstr->offsets[byte_idx + i]
			  = src_idx + (i < mbclen ? i : mbclen - 1);
			pstr->wcs[byte_idx + i] = WEOF;
		      }
		    /* The buffer contents changed size relative to the
		       raw string; keep LEN, STOP and END_IDX in step.  */
		    pstr->len += mbcdlen - mbclen;
		    if (pstr->raw_stop > src_idx)
		      pstr->stop += mbcdlen - mbclen;
		    end_idx = (pstr->bufs_len > pstr->len)
			      ? pstr->len : pstr->bufs_len;
		    byte_idx += mbcdlen;
		    src_idx += mbclen;
		    continue;
		  }
		else
		  /* The upper-case form cannot be encoded; keep the
		     original bytes.  */
		  memcpy (pstr->mbs + byte_idx, p, mbclen);
	      }
	    else
	      memcpy (pstr->mbs + byte_idx, p, mbclen);

	    if (BE (pstr->offsets_needed != 0, 0))
	      {
		size_t i;
		for (i = 0; i < mbclen; ++i)
		  pstr->offsets[byte_idx + i] = src_idx + i;
	      }
	    src_idx += mbclen;

	    pstr->wcs[byte_idx++] = wcu;
	    /* Write paddings.  */
	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
	      pstr->wcs[byte_idx++] = WEOF;
	  }
	else if (mbclen == (size_t) -1 || mbclen == 0)
	  {
	    /* It is an invalid character or '\0'.  Just use the byte.  */
	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];

	    if (BE (pstr->trans != NULL, 0))
	      ch = pstr->trans [ch];
	    pstr->mbs[byte_idx] = ch;

	    if (BE (pstr->offsets_needed != 0, 0))
	      pstr->offsets[byte_idx] = src_idx;
	    ++src_idx;

	    /* And also cast it to wide char.  */
	    pstr->wcs[byte_idx++] = (wchar_t) ch;
	    if (BE (mbclen == (size_t) -1, 0))
	      pstr->cur_state = prev_st;
	  }
	else
	  {
	    /* The buffer doesn't have enough space, finish to build.  */
	    pstr->cur_state = prev_st;
	    break;
	  }
      }
  pstr->valid_len = byte_idx;
  pstr->valid_raw_len = src_idx;
  return REG_NOERROR;
}
1324
1325 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
1326 Return the index. */
1327
1328 static int
1329 internal_function
re_string_skip_chars(re_string_t * pstr,int new_raw_idx,wint_t * last_wc)1330 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
1331 {
1332 mbstate_t prev_st;
1333 int rawbuf_idx;
1334 size_t mbclen;
1335 wchar_t wc = WEOF;
1336
1337 /* Skip the characters which are not necessary to check. */
1338 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
1339 rawbuf_idx < new_raw_idx;)
1340 {
1341 int remain_len;
1342 remain_len = pstr->len - rawbuf_idx;
1343 prev_st = pstr->cur_state;
1344 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
1345 remain_len, &pstr->cur_state);
1346 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1347 {
1348 /* We treat these cases as a single byte character. */
1349 if (mbclen == 0 || remain_len == 0)
1350 wc = L'\0';
1351 else
1352 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1353 mbclen = 1;
1354 pstr->cur_state = prev_st;
1355 }
1356 /* Then proceed the next character. */
1357 rawbuf_idx += mbclen;
1358 }
1359 *last_wc = (wint_t) wc;
1360 return rawbuf_idx;
1361 }
1362 #endif /* RE_ENABLE_I18N */
1363
1364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
1365 This function is used in case of REG_ICASE. */
1366
1367 static void
1368 internal_function
build_upper_buffer(re_string_t * pstr)1369 build_upper_buffer (re_string_t *pstr)
1370 {
1371 int char_idx, end_idx;
1372 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1373
1374 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
1375 {
1376 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
1377 if (BE (pstr->trans != NULL, 0))
1378 ch = pstr->trans[ch];
1379 if (islower (ch))
1380 pstr->mbs[char_idx] = toupper (ch);
1381 else
1382 pstr->mbs[char_idx] = ch;
1383 }
1384 pstr->valid_len = char_idx;
1385 pstr->valid_raw_len = char_idx;
1386 }
1387
1388 /* Apply TRANS to the buffer in PSTR. */
1389
1390 static void
1391 internal_function
re_string_translate_buffer(re_string_t * pstr)1392 re_string_translate_buffer (re_string_t *pstr)
1393 {
1394 int buf_idx, end_idx;
1395 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1396
1397 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
1398 {
1399 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
1400 pstr->mbs[buf_idx] = pstr->trans[ch];
1401 }
1402
1403 pstr->valid_len = buf_idx;
1404 pstr->valid_raw_len = buf_idx;
1405 }
1406
1407 /* This function re-construct the buffers.
1408 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
1409 convert to upper case in case of REG_ICASE, apply translation. */
1410
1411 static reg_errcode_t
1412 internal_function
re_string_reconstruct(re_string_t * pstr,int idx,int eflags)1413 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
1414 {
1415 int offset = idx - pstr->raw_mbs_idx;
1416 if (BE (offset < 0, 0))
1417 {
1418 /* Reset buffer. */
1419 #ifdef RE_ENABLE_I18N
1420 if (pstr->mb_cur_max > 1)
1421 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1422 #endif /* RE_ENABLE_I18N */
1423 pstr->len = pstr->raw_len;
1424 pstr->stop = pstr->raw_stop;
1425 pstr->valid_len = 0;
1426 pstr->raw_mbs_idx = 0;
1427 pstr->valid_raw_len = 0;
1428 pstr->offsets_needed = 0;
1429 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1430 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
1431 if (!pstr->mbs_allocated)
1432 pstr->mbs = (unsigned char *) pstr->raw_mbs;
1433 offset = idx;
1434 }
1435
1436 if (BE (offset != 0, 1))
1437 {
1438 /* Should the already checked characters be kept? */
1439 if (BE (offset < pstr->valid_raw_len, 1))
1440 {
1441 /* Yes, move them to the front of the buffer. */
1442 #ifdef RE_ENABLE_I18N
1443 if (BE (pstr->offsets_needed, 0))
1444 {
1445 int low = 0, high = pstr->valid_len, mid;
1446 do
1447 {
1448 mid = (high + low) / 2;
1449 if (pstr->offsets[mid] > offset)
1450 high = mid;
1451 else if (pstr->offsets[mid] < offset)
1452 low = mid + 1;
1453 else
1454 break;
1455 }
1456 while (low < high);
1457 if (pstr->offsets[mid] < offset)
1458 ++mid;
1459 pstr->tip_context = re_string_context_at (pstr, mid - 1,
1460 eflags);
1461 /* This can be quite complicated, so handle specially
1462 only the common and easy case where the character with
1463 different length representation of lower and upper
1464 case is present at or after offset. */
1465 if (pstr->valid_len > offset
1466 && mid == offset && pstr->offsets[mid] == offset)
1467 {
1468 memmove (pstr->wcs, pstr->wcs + offset,
1469 (pstr->valid_len - offset) * sizeof (wint_t));
1470 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
1471 pstr->valid_len -= offset;
1472 pstr->valid_raw_len -= offset;
1473 for (low = 0; low < pstr->valid_len; low++)
1474 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
1475 }
1476 else
1477 {
1478 /* Otherwise, just find out how long the partial multibyte
1479 character at offset is and fill it with WEOF/255. */
1480 pstr->len = pstr->raw_len - idx + offset;
1481 pstr->stop = pstr->raw_stop - idx + offset;
1482 pstr->offsets_needed = 0;
1483 while (mid > 0 && pstr->offsets[mid - 1] == offset)
1484 --mid;
1485 while (mid < pstr->valid_len)
1486 if (pstr->wcs[mid] != WEOF)
1487 break;
1488 else
1489 ++mid;
1490 if (mid == pstr->valid_len)
1491 pstr->valid_len = 0;
1492 else
1493 {
1494 pstr->valid_len = pstr->offsets[mid] - offset;
1495 if (pstr->valid_len)
1496 {
1497 for (low = 0; low < pstr->valid_len; ++low)
1498 pstr->wcs[low] = WEOF;
1499 memset (pstr->mbs, 255, pstr->valid_len);
1500 }
1501 }
1502 pstr->valid_raw_len = pstr->valid_len;
1503 }
1504 }
1505 else
1506 #endif
1507 {
1508 pstr->tip_context = re_string_context_at (pstr, offset - 1,
1509 eflags);
1510 #ifdef RE_ENABLE_I18N
1511 if (pstr->mb_cur_max > 1)
1512 memmove (pstr->wcs, pstr->wcs + offset,
1513 (pstr->valid_len - offset) * sizeof (wint_t));
1514 #endif /* RE_ENABLE_I18N */
1515 if (BE (pstr->mbs_allocated, 0))
1516 memmove (pstr->mbs, pstr->mbs + offset,
1517 pstr->valid_len - offset);
1518 pstr->valid_len -= offset;
1519 pstr->valid_raw_len -= offset;
1520 #if DEBUG
1521 assert (pstr->valid_len > 0);
1522 #endif
1523 }
1524 }
1525 else
1526 {
1527 /* No, skip all characters until IDX. */
1528 int prev_valid_len = pstr->valid_len;
1529
1530 #ifdef RE_ENABLE_I18N
1531 if (BE (pstr->offsets_needed, 0))
1532 {
1533 pstr->len = pstr->raw_len - idx + offset;
1534 pstr->stop = pstr->raw_stop - idx + offset;
1535 pstr->offsets_needed = 0;
1536 }
1537 #endif
1538 pstr->valid_len = 0;
1539 #ifdef RE_ENABLE_I18N
1540 if (pstr->mb_cur_max > 1)
1541 {
1542 int wcs_idx;
1543 wint_t wc = WEOF;
1544
1545 if (pstr->is_utf8)
1546 {
1547 const unsigned char *raw, *p, *q, *end;
1548
1549 /* Special case UTF-8. Multi-byte chars start with any
1550 byte other than 0x80 - 0xbf. */
1551 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
1552 end = raw + (offset - pstr->mb_cur_max);
1553 if (end < pstr->raw_mbs)
1554 end = pstr->raw_mbs;
1555 p = raw + offset - 1;
1556 #ifdef _LIBC
1557 /* We know the wchar_t encoding is UCS4, so for the simple
1558 case, ASCII characters, skip the conversion step. */
1559 if (isascii (*p) && BE (pstr->trans == NULL, 1))
1560 {
1561 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1562 /* pstr->valid_len = 0; */
1563 wc = (wchar_t) *p;
1564 }
1565 else
1566 #endif
1567 for (; p >= end; --p)
1568 if ((*p & 0xc0) != 0x80)
1569 {
1570 mbstate_t cur_state;
1571 wchar_t wc2;
1572 int mlen = raw + pstr->len - p;
1573 unsigned char buf[6];
1574 size_t mbclen;
1575
1576 q = p;
1577 if (BE (pstr->trans != NULL, 0))
1578 {
1579 int i = mlen < 6 ? mlen : 6;
1580 while (--i >= 0)
1581 buf[i] = pstr->trans[p[i]];
1582 q = buf;
1583 }
1584 /* XXX Don't use mbrtowc, we know which conversion
1585 to use (UTF-8 -> UCS4). */
1586 memset (&cur_state, 0, sizeof (cur_state));
1587 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
1588 &cur_state);
1589 if (raw + offset - p <= mbclen
1590 && mbclen < (size_t) -2)
1591 {
1592 memset (&pstr->cur_state, '\0',
1593 sizeof (mbstate_t));
1594 pstr->valid_len = mbclen - (raw + offset - p);
1595 wc = wc2;
1596 }
1597 break;
1598 }
1599 }
1600
1601 if (wc == WEOF)
1602 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
1603 if (wc == WEOF)
1604 pstr->tip_context
1605 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
1606 else
1607 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
1608 && IS_WIDE_WORD_CHAR (wc))
1609 ? CONTEXT_WORD
1610 : ((IS_WIDE_NEWLINE (wc)
1611 && pstr->newline_anchor)
1612 ? CONTEXT_NEWLINE : 0));
1613 if (BE (pstr->valid_len, 0))
1614 {
1615 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
1616 pstr->wcs[wcs_idx] = WEOF;
1617 if (pstr->mbs_allocated)
1618 memset (pstr->mbs, 255, pstr->valid_len);
1619 }
1620 pstr->valid_raw_len = pstr->valid_len;
1621 }
1622 else
1623 #endif /* RE_ENABLE_I18N */
1624 {
1625 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
1626 pstr->valid_raw_len = 0;
1627 if (pstr->trans)
1628 c = pstr->trans[c];
1629 pstr->tip_context = (bitset_contain (pstr->word_char, c)
1630 ? CONTEXT_WORD
1631 : ((IS_NEWLINE (c) && pstr->newline_anchor)
1632 ? CONTEXT_NEWLINE : 0));
1633 }
1634 }
1635 if (!BE (pstr->mbs_allocated, 0))
1636 pstr->mbs += offset;
1637 }
1638 pstr->raw_mbs_idx = idx;
1639 pstr->len -= offset;
1640 pstr->stop -= offset;
1641
1642 /* Then build the buffers. */
1643 #ifdef RE_ENABLE_I18N
1644 if (pstr->mb_cur_max > 1)
1645 {
1646 if (pstr->icase)
1647 {
1648 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
1649 if (BE (ret != REG_NOERROR, 0))
1650 return ret;
1651 }
1652 else
1653 build_wcs_buffer (pstr);
1654 }
1655 else
1656 #endif /* RE_ENABLE_I18N */
1657 if (BE (pstr->mbs_allocated, 0))
1658 {
1659 if (pstr->icase)
1660 build_upper_buffer (pstr);
1661 else if (pstr->trans != NULL)
1662 re_string_translate_buffer (pstr);
1663 }
1664 else
1665 pstr->valid_len = pstr->len;
1666
1667 pstr->cur_idx = 0;
1668 return REG_NOERROR;
1669 }
1670
1671 static unsigned char
internal_function(pure)1672 internal_function __attribute ((pure))
1673 re_string_peek_byte_case (const re_string_t *pstr, int idx)
1674 {
1675 int ch, off;
1676
1677 /* Handle the common (easiest) cases first. */
1678 if (BE (!pstr->mbs_allocated, 1))
1679 return re_string_peek_byte (pstr, idx);
1680
1681 #ifdef RE_ENABLE_I18N
1682 if (pstr->mb_cur_max > 1
1683 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
1684 return re_string_peek_byte (pstr, idx);
1685 #endif
1686
1687 off = pstr->cur_idx + idx;
1688 #ifdef RE_ENABLE_I18N
1689 if (pstr->offsets_needed)
1690 off = pstr->offsets[off];
1691 #endif
1692
1693 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1694
1695 #ifdef RE_ENABLE_I18N
1696 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
1697 this function returns CAPITAL LETTER I instead of first byte of
1698 DOTLESS SMALL LETTER I. The latter would confuse the parser,
1699 since peek_byte_case doesn't advance cur_idx in any way. */
1700 if (pstr->offsets_needed && !isascii (ch))
1701 return re_string_peek_byte (pstr, idx);
1702 #endif
1703
1704 return ch;
1705 }
1706
1707 static unsigned char
internal_function(pure)1708 internal_function __attribute ((pure))
1709 re_string_fetch_byte_case (re_string_t *pstr)
1710 {
1711 if (BE (!pstr->mbs_allocated, 1))
1712 return re_string_fetch_byte (pstr);
1713
1714 #ifdef RE_ENABLE_I18N
1715 if (pstr->offsets_needed)
1716 {
1717 int off, ch;
1718
1719 /* For tr_TR.UTF-8 [[:islower:]] there is
1720 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
1721 in that case the whole multi-byte character and return
1722 the original letter. On the other side, with
1723 [[: DOTLESS SMALL LETTER I return [[:I, as doing
1724 anything else would complicate things too much. */
1725
1726 if (!re_string_first_byte (pstr, pstr->cur_idx))
1727 return re_string_fetch_byte (pstr);
1728
1729 off = pstr->offsets[pstr->cur_idx];
1730 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1731
1732 if (! isascii (ch))
1733 return re_string_fetch_byte (pstr);
1734
1735 re_string_skip_bytes (pstr,
1736 re_string_char_size_at (pstr, pstr->cur_idx));
1737 return ch;
1738 }
1739 #endif
1740
1741 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
1742 }
1743
1744 static void
1745 internal_function
re_string_destruct(re_string_t * pstr)1746 re_string_destruct (re_string_t *pstr)
1747 {
1748 #ifdef RE_ENABLE_I18N
1749 re_free (pstr->wcs);
1750 re_free (pstr->offsets);
1751 #endif /* RE_ENABLE_I18N */
1752 if (pstr->mbs_allocated)
1753 re_free (pstr->mbs);
1754 }
1755
1756 /* Return the context at IDX in INPUT. */
1757
1758 static unsigned int
1759 internal_function
re_string_context_at(const re_string_t * input,int idx,int eflags)1760 re_string_context_at (const re_string_t *input, int idx, int eflags)
1761 {
1762 int c;
1763 if (BE (idx < 0, 0))
1764 /* In this case, we use the value stored in input->tip_context,
1765 since we can't know the character in input->mbs[-1] here. */
1766 return input->tip_context;
1767 if (BE (idx == input->len, 0))
1768 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
1769 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
1770 #ifdef RE_ENABLE_I18N
1771 if (input->mb_cur_max > 1)
1772 {
1773 wint_t wc;
1774 int wc_idx = idx;
1775 while(input->wcs[wc_idx] == WEOF)
1776 {
1777 #ifdef DEBUG
1778 /* It must not happen. */
1779 assert (wc_idx >= 0);
1780 #endif
1781 --wc_idx;
1782 if (wc_idx < 0)
1783 return input->tip_context;
1784 }
1785 wc = input->wcs[wc_idx];
1786 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1787 return CONTEXT_WORD;
1788 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
1789 ? CONTEXT_NEWLINE : 0);
1790 }
1791 else
1792 #endif
1793 {
1794 c = re_string_byte_at (input, idx);
1795 if (bitset_contain (input->word_char, c))
1796 return CONTEXT_WORD;
1797 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
1798 }
1799 }
1800
/* Functions for set operations.  */
1802
1803 static reg_errcode_t
1804 internal_function
re_node_set_alloc(re_node_set * set,int size)1805 re_node_set_alloc (re_node_set *set, int size)
1806 {
1807 set->alloc = size;
1808 set->nelem = 0;
1809 set->elems = re_malloc (int, size);
1810 if (BE (set->elems == NULL, 0))
1811 return REG_ESPACE;
1812 return REG_NOERROR;
1813 }
1814
1815 static reg_errcode_t
1816 internal_function
re_node_set_init_1(re_node_set * set,int elem)1817 re_node_set_init_1 (re_node_set *set, int elem)
1818 {
1819 set->alloc = 1;
1820 set->nelem = 1;
1821 set->elems = re_malloc (int, 1);
1822 if (BE (set->elems == NULL, 0))
1823 {
1824 set->alloc = set->nelem = 0;
1825 return REG_ESPACE;
1826 }
1827 set->elems[0] = elem;
1828 return REG_NOERROR;
1829 }
1830
1831 static reg_errcode_t
1832 internal_function
re_node_set_init_2(re_node_set * set,int elem1,int elem2)1833 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1834 {
1835 set->alloc = 2;
1836 set->elems = re_malloc (int, 2);
1837 if (BE (set->elems == NULL, 0))
1838 return REG_ESPACE;
1839 if (elem1 == elem2)
1840 {
1841 set->nelem = 1;
1842 set->elems[0] = elem1;
1843 }
1844 else
1845 {
1846 set->nelem = 2;
1847 if (elem1 < elem2)
1848 {
1849 set->elems[0] = elem1;
1850 set->elems[1] = elem2;
1851 }
1852 else
1853 {
1854 set->elems[0] = elem2;
1855 set->elems[1] = elem1;
1856 }
1857 }
1858 return REG_NOERROR;
1859 }
1860
1861 static reg_errcode_t
1862 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1863 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1864 {
1865 dest->nelem = src->nelem;
1866 if (src->nelem > 0)
1867 {
1868 dest->alloc = dest->nelem;
1869 dest->elems = re_malloc (int, dest->alloc);
1870 if (BE (dest->elems == NULL, 0))
1871 {
1872 dest->alloc = dest->nelem = 0;
1873 return REG_ESPACE;
1874 }
1875 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1876 }
1877 else
1878 re_node_set_init_empty (dest);
1879 return REG_NOERROR;
1880 }
1881
/* Calculate the intersection of the sets SRC1 and SRC2, and merge it
   into DEST.  The return value is REG_NOERROR on success, or REG_ESPACE
   if DEST cannot be enlarged.
   Note: We assume dest->elems is NULL, when dest->alloc is 0.

   The walk below runs from the highest index of each set downwards and
   presumably relies on all three sets being sorted in ascending order.
   The elements to add are first collected at the top of DEST's storage
   and then merged into place.  */

static reg_errcode_t
internal_function
re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
			   const re_node_set *src2)
{
  int i1, i2, is, id, delta, sbase;
  /* An empty operand means an empty intersection: nothing to add.  */
  if (src1->nelem == 0 || src2->nelem == 0)
    return REG_NOERROR;

  /* We need dest->nelem + 2 * elems_in_intersection; this is a
     conservative estimate.  */
  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
    {
      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
      int *new_elems = re_realloc (dest->elems, int, new_alloc);
      if (BE (new_elems == NULL, 0))
	return REG_ESPACE;
      dest->elems = new_elems;
      dest->alloc = new_alloc;
    }

  /* Find the items in the intersection of SRC1 and SRC2, and copy
     into the top of DEST those that are not already in DEST itself.  */
  /* SBASE is one past the scratch area and moves down with each
     element stored.  */
  sbase = dest->nelem + src1->nelem + src2->nelem;
  i1 = src1->nelem - 1;
  i2 = src2->nelem - 1;
  id = dest->nelem - 1;
  for (;;)
    {
      if (src1->elems[i1] == src2->elems[i2])
	{
	  /* Try to find the item in DEST.  Maybe we could binary search?  */
	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
	    --id;

	  if (id < 0 || dest->elems[id] != src1->elems[i1])
	    dest->elems[--sbase] = src1->elems[i1];

	  if (--i1 < 0 || --i2 < 0)
	    break;
	}

      /* Lower the highest of the two items.  */
      else if (src1->elems[i1] < src2->elems[i2])
	{
	  if (--i2 < 0)
	    break;
	}
      else
	{
	  if (--i1 < 0)
	    break;
	}
    }

  /* ID: last original element of DEST; IS: top of the scratch area;
     DELTA: number of new elements collected in it.  */
  id = dest->nelem - 1;
  is = dest->nelem + src1->nelem + src2->nelem - 1;
  delta = is - sbase + 1;

  /* Now copy.  When DELTA becomes zero, the remaining
     DEST elements are already in place; this is more or
     less the same loop that is in re_node_set_merge.  */
  dest->nelem += delta;
  if (delta > 0 && id >= 0)
    for (;;)
      {
	if (dest->elems[is] > dest->elems[id])
	  {
	    /* Copy from the top.  */
	    dest->elems[id + delta--] = dest->elems[is--];
	    if (delta == 0)
	      break;
	  }
	else
	  {
	    /* Slide from the bottom.  */
	    dest->elems[id + delta] = dest->elems[id];
	    if (--id < 0)
	      break;
	  }
      }

  /* Copy remaining SRC elements.  */
  /* DELTA is zero here when everything was already merged above, in
     which case this copies nothing.  */
  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));

  return REG_NOERROR;
}
1973
1974 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1975 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1976
1977 static reg_errcode_t
1978 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1979 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1980 const re_node_set *src2)
1981 {
1982 int i1, i2, id;
1983 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1984 {
1985 dest->alloc = src1->nelem + src2->nelem;
1986 dest->elems = re_malloc (int, dest->alloc);
1987 if (BE (dest->elems == NULL, 0))
1988 return REG_ESPACE;
1989 }
1990 else
1991 {
1992 if (src1 != NULL && src1->nelem > 0)
1993 return re_node_set_init_copy (dest, src1);
1994 else if (src2 != NULL && src2->nelem > 0)
1995 return re_node_set_init_copy (dest, src2);
1996 else
1997 re_node_set_init_empty (dest);
1998 return REG_NOERROR;
1999 }
2000 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
2001 {
2002 if (src1->elems[i1] > src2->elems[i2])
2003 {
2004 dest->elems[id++] = src2->elems[i2++];
2005 continue;
2006 }
2007 if (src1->elems[i1] == src2->elems[i2])
2008 ++i2;
2009 dest->elems[id++] = src1->elems[i1++];
2010 }
2011 if (i1 < src1->nelem)
2012 {
2013 memcpy (dest->elems + id, src1->elems + i1,
2014 (src1->nelem - i1) * sizeof (int));
2015 id += src1->nelem - i1;
2016 }
2017 else if (i2 < src2->nelem)
2018 {
2019 memcpy (dest->elems + id, src2->elems + i2,
2020 (src2->nelem - i2) * sizeof (int));
2021 id += src2->nelem - i2;
2022 }
2023 dest->nelem = id;
2024 return REG_NOERROR;
2025 }
2026
/* Calculate the union of the sets DEST and SRC, and store it in DEST.
   The return value is REG_NOERROR on success, or REG_ESPACE if DEST
   cannot be enlarged.  A NULL or empty SRC leaves DEST untouched.

   The walk below runs from the highest index of each set downwards and
   presumably relies on both sets being sorted in ascending order.  The
   elements of SRC that are missing from DEST are first collected at the
   top of DEST's storage and then merged into place.  */

static reg_errcode_t
internal_function
re_node_set_merge (re_node_set *dest, const re_node_set *src)
{
  int is, id, sbase, delta;
  if (src == NULL || src->nelem == 0)
    return REG_NOERROR;
  /* Room is needed for the current elements plus a scratch area of
     2 * src->nelem slots above them.  */
  if (dest->alloc < 2 * src->nelem + dest->nelem)
    {
      int new_alloc = 2 * (src->nelem + dest->alloc);
      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
      if (BE (new_buffer == NULL, 0))
	return REG_ESPACE;
      dest->elems = new_buffer;
      dest->alloc = new_alloc;
    }

  /* Merging into an empty set is a plain copy.  */
  if (BE (dest->nelem == 0, 0))
    {
      dest->nelem = src->nelem;
      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
      return REG_NOERROR;
    }

  /* Copy into the top of DEST the items of SRC that are not
     found in DEST.  Maybe we could binary search in DEST?  */
  for (sbase = dest->nelem + 2 * src->nelem,
       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
    {
      if (dest->elems[id] == src->elems[is])
	is--, id--;
      else if (dest->elems[id] < src->elems[is])
	dest->elems[--sbase] = src->elems[is--];
      else /* if (dest->elems[id] > src->elems[is]) */
	--id;
    }

  if (is >= 0)
    {
      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
      sbase -= is + 1;
      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
    }

  /* ID: last original element of DEST; IS: top of the scratch area;
     DELTA: number of new elements collected in it.  */
  id = dest->nelem - 1;
  is = dest->nelem + 2 * src->nelem - 1;
  delta = is - sbase + 1;
  /* Every element of SRC was already present.  */
  if (delta == 0)
    return REG_NOERROR;

  /* Now copy.  When DELTA becomes zero, the remaining
     DEST elements are already in place.  */
  dest->nelem += delta;
  for (;;)
    {
      if (dest->elems[is] > dest->elems[id])
	{
	  /* Copy from the top.  */
	  dest->elems[id + delta--] = dest->elems[is--];
	  if (delta == 0)
	    break;
	}
      else
	{
	  /* Slide from the bottom.  */
	  dest->elems[id + delta] = dest->elems[id];
	  if (--id < 0)
	    {
	      /* Copy remaining SRC elements.  */
	      memcpy (dest->elems, dest->elems + sbase,
		      delta * sizeof (int));
	      break;
	    }
	}
    }

  return REG_NOERROR;
}
2108
2109 /* Insert the new element ELEM to the re_node_set* SET.
2110 SET should not already have ELEM.
2111 return -1 if an error is occured, return 1 otherwise. */
2112
2113 static int
2114 internal_function
re_node_set_insert(re_node_set * set,int elem)2115 re_node_set_insert (re_node_set *set, int elem)
2116 {
2117 int idx;
2118 /* In case the set is empty. */
2119 if (set->alloc == 0)
2120 {
2121 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
2122 return 1;
2123 else
2124 return -1;
2125 }
2126
2127 if (BE (set->nelem, 0) == 0)
2128 {
2129 /* We already guaranteed above that set->alloc != 0. */
2130 set->elems[0] = elem;
2131 ++set->nelem;
2132 return 1;
2133 }
2134
2135 /* Realloc if we need. */
2136 if (set->alloc == set->nelem)
2137 {
2138 int *new_elems;
2139 set->alloc = set->alloc * 2;
2140 new_elems = re_realloc (set->elems, int, set->alloc);
2141 if (BE (new_elems == NULL, 0))
2142 return -1;
2143 set->elems = new_elems;
2144 }
2145
2146 /* Move the elements which follows the new element. Test the
2147 first element separately to skip a check in the inner loop. */
2148 if (elem < set->elems[0])
2149 {
2150 idx = 0;
2151 for (idx = set->nelem; idx > 0; idx--)
2152 set->elems[idx] = set->elems[idx - 1];
2153 }
2154 else
2155 {
2156 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2157 set->elems[idx] = set->elems[idx - 1];
2158 }
2159
2160 /* Insert the new element. */
2161 set->elems[idx] = elem;
2162 ++set->nelem;
2163 return 1;
2164 }
2165
2166 /* Insert the new element ELEM to the re_node_set* SET.
2167 SET should not already have any element greater than or equal to ELEM.
2168 Return -1 if an error is occured, return 1 otherwise. */
2169
2170 static int
2171 internal_function
re_node_set_insert_last(re_node_set * set,int elem)2172 re_node_set_insert_last (re_node_set *set, int elem)
2173 {
2174 /* Realloc if we need. */
2175 if (set->alloc == set->nelem)
2176 {
2177 int *new_elems;
2178 set->alloc = (set->alloc + 1) * 2;
2179 new_elems = re_realloc (set->elems, int, set->alloc);
2180 if (BE (new_elems == NULL, 0))
2181 return -1;
2182 set->elems = new_elems;
2183 }
2184
2185 /* Insert the new element. */
2186 set->elems[set->nelem++] = elem;
2187 return 1;
2188 }
2189
2190 /* Compare two node sets SET1 and SET2.
2191 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
2192
2193 static int
internal_function(pure)2194 internal_function __attribute ((pure))
2195 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
2196 {
2197 int i;
2198 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
2199 return 0;
2200 for (i = set1->nelem ; --i >= 0 ; )
2201 if (set1->elems[i] != set2->elems[i])
2202 return 0;
2203 return 1;
2204 }
2205
2206 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
2207
2208 static int
internal_function(pure)2209 internal_function __attribute ((pure))
2210 re_node_set_contains (const re_node_set *set, int elem)
2211 {
2212 unsigned int idx, right, mid;
2213 if (set->nelem <= 0)
2214 return 0;
2215
2216 /* Binary search the element. */
2217 idx = 0;
2218 right = set->nelem - 1;
2219 while (idx < right)
2220 {
2221 mid = (idx + right) / 2;
2222 if (set->elems[mid] < elem)
2223 idx = mid + 1;
2224 else
2225 right = mid;
2226 }
2227 return set->elems[idx] == elem ? idx + 1 : 0;
2228 }
2229
2230 static void
2231 internal_function
re_node_set_remove_at(re_node_set * set,int idx)2232 re_node_set_remove_at (re_node_set *set, int idx)
2233 {
2234 if (idx < 0 || idx >= set->nelem)
2235 return;
2236 --set->nelem;
2237 for (; idx < set->nelem; idx++)
2238 set->elems[idx] = set->elems[idx + 1];
2239 }
2240
2241
2242 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
2243 Or return -1, if an error will be occured. */
2244
2245 static int
2246 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)2247 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
2248 {
2249 int type = token.type;
2250 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
2251 {
2252 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
2253 int *new_nexts, *new_indices;
2254 re_node_set *new_edests, *new_eclosures;
2255 re_token_t *new_nodes;
2256
2257 /* Avoid overflows. */
2258 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
2259 return -1;
2260
2261 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
2262 if (BE (new_nodes == NULL, 0))
2263 return -1;
2264 dfa->nodes = new_nodes;
2265 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
2266 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
2267 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
2268 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
2269 if (BE (new_nexts == NULL || new_indices == NULL
2270 || new_edests == NULL || new_eclosures == NULL, 0))
2271 return -1;
2272 dfa->nexts = new_nexts;
2273 dfa->org_indices = new_indices;
2274 dfa->edests = new_edests;
2275 dfa->eclosures = new_eclosures;
2276 dfa->nodes_alloc = new_nodes_alloc;
2277 }
2278 dfa->nodes[dfa->nodes_len] = token;
2279 dfa->nodes[dfa->nodes_len].constraint = 0;
2280 #ifdef RE_ENABLE_I18N
2281 dfa->nodes[dfa->nodes_len].accept_mb =
2282 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
2283 #endif
2284 dfa->nexts[dfa->nodes_len] = -1;
2285 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
2286 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
2287 return dfa->nodes_len++;
2288 }
2289
2290 static inline unsigned int
2291 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)2292 calc_state_hash (const re_node_set *nodes, unsigned int context)
2293 {
2294 unsigned int hash = nodes->nelem + context;
2295 int i;
2296 for (i = 0 ; i < nodes->nelem ; i++)
2297 hash += nodes->elems[i];
2298 return hash;
2299 }
2300
2301 /* Search for the state whose node_set is equivalent to NODES.
2302 Return the pointer to the state, if we found it in the DFA.
2303 Otherwise create the new one and return it. In case of an error
2304 return NULL and set the error code in ERR.
2305 Note: - We assume NULL as the invalid state, then it is possible that
2306 return value is NULL and ERR is REG_NOERROR.
2307 - We never return non-NULL value in case of any errors, it is for
2308 optimization. */
2309
2310 static re_dfastate_t *
2311 internal_function
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)2312 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
2313 const re_node_set *nodes)
2314 {
2315 unsigned int hash;
2316 re_dfastate_t *new_state;
2317 struct re_state_table_entry *spot;
2318 int i;
2319 if (BE (nodes->nelem == 0, 0))
2320 {
2321 *err = REG_NOERROR;
2322 return NULL;
2323 }
2324 hash = calc_state_hash (nodes, 0);
2325 spot = dfa->state_table + (hash & dfa->state_hash_mask);
2326
2327 for (i = 0 ; i < spot->num ; i++)
2328 {
2329 re_dfastate_t *state = spot->array[i];
2330 if (hash != state->hash)
2331 continue;
2332 if (re_node_set_compare (&state->nodes, nodes))
2333 return state;
2334 }
2335
2336 /* There are no appropriate state in the dfa, create the new one. */
2337 new_state = create_ci_newstate (dfa, nodes, hash);
2338 if (BE (new_state == NULL, 0))
2339 *err = REG_ESPACE;
2340
2341 return new_state;
2342 }
2343
2344 /* Search for the state whose node_set is equivalent to NODES and
2345 whose context is equivalent to CONTEXT.
2346 Return the pointer to the state, if we found it in the DFA.
2347 Otherwise create the new one and return it. In case of an error
2348 return NULL and set the error code in ERR.
2349 Note: - We assume NULL as the invalid state, then it is possible that
2350 return value is NULL and ERR is REG_NOERROR.
2351 - We never return non-NULL value in case of any errors, it is for
2352 optimization. */
2353
2354 static re_dfastate_t *
2355 internal_function
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)2356 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
2357 const re_node_set *nodes, unsigned int context)
2358 {
2359 unsigned int hash;
2360 re_dfastate_t *new_state;
2361 struct re_state_table_entry *spot;
2362 int i;
2363 if (nodes->nelem == 0)
2364 {
2365 *err = REG_NOERROR;
2366 return NULL;
2367 }
2368 hash = calc_state_hash (nodes, context);
2369 spot = dfa->state_table + (hash & dfa->state_hash_mask);
2370
2371 for (i = 0 ; i < spot->num ; i++)
2372 {
2373 re_dfastate_t *state = spot->array[i];
2374 if (state->hash == hash
2375 && state->context == context
2376 && re_node_set_compare (state->entrance_nodes, nodes))
2377 return state;
2378 }
2379 /* There are no appropriate state in `dfa', create the new one. */
2380 new_state = create_cd_newstate (dfa, nodes, context, hash);
2381 if (BE (new_state == NULL, 0))
2382 *err = REG_ESPACE;
2383
2384 return new_state;
2385 }
2386
2387 /* Finish initialization of the new state NEWSTATE, and using its hash value
2388 HASH put in the appropriate bucket of DFA's state table. Return value
2389 indicates the error code if failed. */
2390
2391 static reg_errcode_t
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,unsigned int hash)2392 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
2393 unsigned int hash)
2394 {
2395 struct re_state_table_entry *spot;
2396 reg_errcode_t err;
2397 int i;
2398
2399 newstate->hash = hash;
2400 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
2401 if (BE (err != REG_NOERROR, 0))
2402 return REG_ESPACE;
2403 for (i = 0; i < newstate->nodes.nelem; i++)
2404 {
2405 int elem = newstate->nodes.elems[i];
2406 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
2407 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
2408 }
2409
2410 spot = dfa->state_table + (hash & dfa->state_hash_mask);
2411 if (BE (spot->alloc <= spot->num, 0))
2412 {
2413 int new_alloc = 2 * spot->num + 2;
2414 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
2415 new_alloc);
2416 if (BE (new_array == NULL, 0))
2417 return REG_ESPACE;
2418 spot->array = new_array;
2419 spot->alloc = new_alloc;
2420 }
2421 spot->array[spot->num++] = newstate;
2422 return REG_NOERROR;
2423 }
2424
2425 static void
free_state(re_dfastate_t * state)2426 free_state (re_dfastate_t *state)
2427 {
2428 re_node_set_free (&state->non_eps_nodes);
2429 re_node_set_free (&state->inveclosure);
2430 if (state->entrance_nodes != &state->nodes)
2431 {
2432 re_node_set_free (state->entrance_nodes);
2433 re_free (state->entrance_nodes);
2434 }
2435 re_node_set_free (&state->nodes);
2436 re_free (state->word_trtable);
2437 re_free (state->trtable);
2438 re_free (state);
2439 }
2440
2441 /* Create the new state which is independ of contexts.
2442 Return the new state if succeeded, otherwise return NULL. */
2443
2444 static re_dfastate_t *
2445 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int hash)2446 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2447 unsigned int hash)
2448 {
2449 int i;
2450 reg_errcode_t err;
2451 re_dfastate_t *newstate;
2452
2453 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2454 if (BE (newstate == NULL, 0))
2455 return NULL;
2456 err = re_node_set_init_copy (&newstate->nodes, nodes);
2457 if (BE (err != REG_NOERROR, 0))
2458 {
2459 re_free (newstate);
2460 return NULL;
2461 }
2462
2463 newstate->entrance_nodes = &newstate->nodes;
2464 for (i = 0 ; i < nodes->nelem ; i++)
2465 {
2466 re_token_t *node = dfa->nodes + nodes->elems[i];
2467 re_token_type_t type = node->type;
2468 if (type == CHARACTER && !node->constraint)
2469 continue;
2470 #ifdef RE_ENABLE_I18N
2471 newstate->accept_mb |= node->accept_mb;
2472 #endif /* RE_ENABLE_I18N */
2473
2474 /* If the state has the halt node, the state is a halt state. */
2475 if (type == END_OF_RE)
2476 newstate->halt = 1;
2477 else if (type == OP_BACK_REF)
2478 newstate->has_backref = 1;
2479 else if (type == ANCHOR || node->constraint)
2480 newstate->has_constraint = 1;
2481 }
2482 err = register_state (dfa, newstate, hash);
2483 if (BE (err != REG_NOERROR, 0))
2484 {
2485 free_state (newstate);
2486 newstate = NULL;
2487 }
2488 return newstate;
2489 }
2490
2491 /* Create the new state which is depend on the context CONTEXT.
2492 Return the new state if succeeded, otherwise return NULL. */
2493
2494 static re_dfastate_t *
2495 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,unsigned int hash)2496 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2497 unsigned int context, unsigned int hash)
2498 {
2499 int i, nctx_nodes = 0;
2500 reg_errcode_t err;
2501 re_dfastate_t *newstate;
2502
2503 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2504 if (BE (newstate == NULL, 0))
2505 return NULL;
2506 err = re_node_set_init_copy (&newstate->nodes, nodes);
2507 if (BE (err != REG_NOERROR, 0))
2508 {
2509 re_free (newstate);
2510 return NULL;
2511 }
2512
2513 newstate->context = context;
2514 newstate->entrance_nodes = &newstate->nodes;
2515
2516 for (i = 0 ; i < nodes->nelem ; i++)
2517 {
2518 unsigned int constraint = 0;
2519 re_token_t *node = dfa->nodes + nodes->elems[i];
2520 re_token_type_t type = node->type;
2521 if (node->constraint)
2522 constraint = node->constraint;
2523
2524 if (type == CHARACTER && !constraint)
2525 continue;
2526 #ifdef RE_ENABLE_I18N
2527 newstate->accept_mb |= node->accept_mb;
2528 #endif /* RE_ENABLE_I18N */
2529
2530 /* If the state has the halt node, the state is a halt state. */
2531 if (type == END_OF_RE)
2532 newstate->halt = 1;
2533 else if (type == OP_BACK_REF)
2534 newstate->has_backref = 1;
2535 else if (type == ANCHOR)
2536 constraint = node->opr.ctx_type;
2537
2538 if (constraint)
2539 {
2540 if (newstate->entrance_nodes == &newstate->nodes)
2541 {
2542 newstate->entrance_nodes = re_malloc (re_node_set, 1);
2543 if (BE (newstate->entrance_nodes == NULL, 0))
2544 {
2545 free_state (newstate);
2546 return NULL;
2547 }
2548 re_node_set_init_copy (newstate->entrance_nodes, nodes);
2549 nctx_nodes = 0;
2550 newstate->has_constraint = 1;
2551 }
2552
2553 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
2554 {
2555 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
2556 ++nctx_nodes;
2557 }
2558 }
2559 }
2560 err = register_state (dfa, newstate, hash);
2561 if (BE (err != REG_NOERROR, 0))
2562 {
2563 free_state (newstate);
2564 newstate = NULL;
2565 }
2566 return newstate;
2567 }
2568
2569 /******************************************************************************/
2570 /******************************************************************************/
2571 /******************************************************************************/
2572 /* GKINCLUDE #include "regcomp.c" */
2573 /******************************************************************************/
2574 /******************************************************************************/
2575 /******************************************************************************/
2576 /* Extended regular expression matching and search library.
2577 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
2578 This file is part of the GNU C Library.
2579 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
2580
2581 The GNU C Library is free software; you can redistribute it and/or
2582 modify it under the terms of the GNU Lesser General Public
2583 License as published by the Free Software Foundation; either
2584 version 2.1 of the License, or (at your option) any later version.
2585
2586 The GNU C Library is distributed in the hope that it will be useful,
2587 but WITHOUT ANY WARRANTY; without even the implied warranty of
2588 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2589 Lesser General Public License for more details.
2590
2591 You should have received a copy of the GNU Lesser General Public
2592 License along with the GNU C Library; if not, write to the Free
2593 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
2594 02111-1307 USA. */
2595
2596 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
2597 size_t length, reg_syntax_t syntax);
2598 static void re_compile_fastmap_iter (regex_t *bufp,
2599 const re_dfastate_t *init_state,
2600 char *fastmap);
2601 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
2602 #ifdef RE_ENABLE_I18N
2603 static void free_charset (re_charset_t *cset);
2604 #endif /* RE_ENABLE_I18N */
2605 static void free_workarea_compile (regex_t *preg);
2606 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
2607 #ifdef RE_ENABLE_I18N
2608 static void optimize_utf8 (re_dfa_t *dfa);
2609 #endif
2610 static reg_errcode_t analyze (regex_t *preg);
2611 static reg_errcode_t preorder (bin_tree_t *root,
2612 reg_errcode_t (fn (void *, bin_tree_t *)),
2613 void *extra);
2614 static reg_errcode_t postorder (bin_tree_t *root,
2615 reg_errcode_t (fn (void *, bin_tree_t *)),
2616 void *extra);
2617 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
2618 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
2619 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
2620 bin_tree_t *node);
2621 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
2622 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
2623 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
2624 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
2625 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
2626 unsigned int constraint);
2627 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
2628 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
2629 int node, int root);
2630 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
2631 static int fetch_number (re_string_t *input, re_token_t *token,
2632 reg_syntax_t syntax);
2633 static int peek_token (re_token_t *token, re_string_t *input,
2634 reg_syntax_t syntax) internal_function;
2635 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
2636 reg_syntax_t syntax, reg_errcode_t *err);
2637 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
2638 re_token_t *token, reg_syntax_t syntax,
2639 int nest, reg_errcode_t *err);
2640 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
2641 re_token_t *token, reg_syntax_t syntax,
2642 int nest, reg_errcode_t *err);
2643 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
2644 re_token_t *token, reg_syntax_t syntax,
2645 int nest, reg_errcode_t *err);
2646 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
2647 re_token_t *token, reg_syntax_t syntax,
2648 int nest, reg_errcode_t *err);
2649 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
2650 re_dfa_t *dfa, re_token_t *token,
2651 reg_syntax_t syntax, reg_errcode_t *err);
2652 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
2653 re_token_t *token, reg_syntax_t syntax,
2654 reg_errcode_t *err);
2655 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
2656 re_string_t *regexp,
2657 re_token_t *token, int token_len,
2658 re_dfa_t *dfa,
2659 reg_syntax_t syntax,
2660 int accept_hyphen);
2661 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
2662 re_string_t *regexp,
2663 re_token_t *token);
2664 #ifdef RE_ENABLE_I18N
2665 static reg_errcode_t build_equiv_class (bitset_t sbcset,
2666 re_charset_t *mbcset,
2667 int *equiv_class_alloc,
2668 const unsigned char *name);
2669 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2670 bitset_t sbcset,
2671 re_charset_t *mbcset,
2672 int *char_class_alloc,
2673 const unsigned char *class_name,
2674 reg_syntax_t syntax);
2675 #else /* not RE_ENABLE_I18N */
2676 static reg_errcode_t build_equiv_class (bitset_t sbcset,
2677 const unsigned char *name);
2678 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2679 bitset_t sbcset,
2680 const unsigned char *class_name,
2681 reg_syntax_t syntax);
2682 #endif /* not RE_ENABLE_I18N */
2683 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
2684 RE_TRANSLATE_TYPE trans,
2685 const unsigned char *class_name,
2686 const unsigned char *extra,
2687 int non_match, reg_errcode_t *err);
2688 static bin_tree_t *create_tree (re_dfa_t *dfa,
2689 bin_tree_t *left, bin_tree_t *right,
2690 re_token_type_t type);
2691 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
2692 bin_tree_t *left, bin_tree_t *right,
2693 const re_token_t *token);
2694 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
2695 static void free_token (re_token_t *node);
2696 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
2697 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
2698
2699 /* This table gives an error message for each of the error codes listed
2700 in regex.h. Obviously the order here has to be same as there.
2701 POSIX doesn't require that we do anything for REG_NOERROR,
2702 but why not be nice? */
2703
2704 const char __re_error_msgid[] attribute_hidden =
2705 {
2706 #define REG_NOERROR_IDX 0
2707 gettext_noop ("Success") /* REG_NOERROR */
2708 "\0"
2709 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
2710 gettext_noop ("No match") /* REG_NOMATCH */
2711 "\0"
2712 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
2713 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
2714 "\0"
2715 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
2716 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
2717 "\0"
2718 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
2719 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
2720 "\0"
2721 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
2722 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
2723 "\0"
2724 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
2725 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
2726 "\0"
2727 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
2728 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
2729 "\0"
2730 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
2731 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
2732 "\0"
2733 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
2734 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
2735 "\0"
2736 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
2737 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
2738 "\0"
2739 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
2740 gettext_noop ("Invalid range end") /* REG_ERANGE */
2741 "\0"
2742 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
2743 gettext_noop ("Memory exhausted") /* REG_ESPACE */
2744 "\0"
2745 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
2746 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
2747 "\0"
2748 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
2749 gettext_noop ("Premature end of regular expression") /* REG_EEND */
2750 "\0"
2751 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
2752 gettext_noop ("Regular expression too big") /* REG_ESIZE */
2753 "\0"
2754 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
2755 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
2756 };
2757
2758 const size_t __re_error_msgid_idx[] attribute_hidden =
2759 {
2760 REG_NOERROR_IDX,
2761 REG_NOMATCH_IDX,
2762 REG_BADPAT_IDX,
2763 REG_ECOLLATE_IDX,
2764 REG_ECTYPE_IDX,
2765 REG_EESCAPE_IDX,
2766 REG_ESUBREG_IDX,
2767 REG_EBRACK_IDX,
2768 REG_EPAREN_IDX,
2769 REG_EBRACE_IDX,
2770 REG_BADBR_IDX,
2771 REG_ERANGE_IDX,
2772 REG_ESPACE_IDX,
2773 REG_BADRPT_IDX,
2774 REG_EEND_IDX,
2775 REG_ESIZE_IDX,
2776 REG_ERPAREN_IDX
2777 };
2778
2779 /* Entry points for GNU code. */
2780
2781 /* re_compile_pattern is the GNU regular expression compiler: it
2782 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
2783 Returns 0 if the pattern was valid, otherwise an error string.
2784
2785 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
2786 are set in BUFP on entry. */
2787
2788 const char *
re_compile_pattern(pattern,length,bufp)2789 re_compile_pattern (pattern, length, bufp)
2790 const char *pattern;
2791 size_t length;
2792 struct re_pattern_buffer *bufp;
2793 {
2794 reg_errcode_t ret;
2795
2796 /* And GNU code determines whether or not to get register information
2797 by passing null for the REGS argument to re_match, etc., not by
2798 setting no_sub, unless RE_NO_SUB is set. */
2799 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
2800
2801 /* Match anchors at newline. */
2802 bufp->newline_anchor = 1;
2803
2804 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
2805
2806 if (!ret)
2807 return NULL;
2808 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
2809 }
2810 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)2811 weak_alias (__re_compile_pattern, re_compile_pattern)
2812 #endif
2813
2814 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
2815 also be assigned to arbitrarily: each pattern buffer stores its own
2816 syntax, so it can be changed between regex compilations. */
2817 /* This has no initializer because initialized variables in Emacs
2818 become read-only after dumping. */
2819 reg_syntax_t re_syntax_options;
2820
2821
2822 /* Specify the precise syntax of regexps for compilation. This provides
2823 for compatibility for various utilities which historically have
2824 different, incompatible syntaxes.
2825
2826 The argument SYNTAX is a bit mask comprised of the various bits
2827 defined in regex.h. We return the old syntax. */
2828
2829 reg_syntax_t
2830 re_set_syntax (syntax)
2831 reg_syntax_t syntax;
2832 {
2833 reg_syntax_t ret = re_syntax_options;
2834
2835 re_syntax_options = syntax;
2836 return ret;
2837 }
2838 #ifdef _LIBC
2839 weak_alias (__re_set_syntax, re_set_syntax)
2840 #endif
2841
2842 int
2843 re_compile_fastmap (bufp)
2844 struct re_pattern_buffer *bufp;
2845 {
2846 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2847 char *fastmap = bufp->fastmap;
2848
2849 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
2850 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
2851 if (dfa->init_state != dfa->init_state_word)
2852 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
2853 if (dfa->init_state != dfa->init_state_nl)
2854 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
2855 if (dfa->init_state != dfa->init_state_begbuf)
2856 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
2857 bufp->fastmap_accurate = 1;
2858 return 0;
2859 }
2860 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)2861 weak_alias (__re_compile_fastmap, re_compile_fastmap)
2862 #endif
2863
/* Mark the byte CH in FASTMAP.  When ICASE is nonzero, the entry for
   tolower (CH) is marked as well.  */

static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
  char *slot = fastmap + ch;

  *slot = 1;
  if (!icase)
    return;
  fastmap[tolower (ch)] = 1;
}
2872
2873 /* Helper function for re_compile_fastmap.
2874 Compile fastmap for the initial_state INIT_STATE. */
2875
2876 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)2877 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
2878 char *fastmap)
2879 {
2880 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2881 int node_cnt;
2882 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
2883 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
2884 {
2885 int node = init_state->nodes.elems[node_cnt];
2886 re_token_type_t type = dfa->nodes[node].type;
2887
2888 if (type == CHARACTER)
2889 {
2890 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
2891 #ifdef RE_ENABLE_I18N
2892 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2893 {
2894 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
2895 wchar_t wc;
2896 mbstate_t state;
2897
2898 p = buf;
2899 *p++ = dfa->nodes[node].opr.c;
2900 while (++node < dfa->nodes_len
2901 && dfa->nodes[node].type == CHARACTER
2902 && dfa->nodes[node].mb_partial)
2903 *p++ = dfa->nodes[node].opr.c;
2904 memset (&state, '\0', sizeof (state));
2905 if (mbrtowc (&wc, (const char *) buf, p - buf,
2906 &state) == p - buf
2907 && (__wcrtomb ((char *) buf, towlower (wc), &state)
2908 != (size_t) -1))
2909 re_set_fastmap (fastmap, 0, buf[0]);
2910 }
2911 #endif
2912 }
2913 else if (type == SIMPLE_BRACKET)
2914 {
2915 int i, ch;
2916 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
2917 {
2918 int j;
2919 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
2920 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
2921 if (w & ((bitset_word_t) 1 << j))
2922 re_set_fastmap (fastmap, icase, ch);
2923 }
2924 }
2925 #ifdef RE_ENABLE_I18N
2926 else if (type == COMPLEX_BRACKET)
2927 {
2928 int i;
2929 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
2930 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
2931 || cset->nranges || cset->nchar_classes)
2932 {
2933 # ifdef _LIBC
2934 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
2935 {
2936 /* In this case we want to catch the bytes which are
2937 the first byte of any collation elements.
2938 e.g. In da_DK, we want to catch 'a' since "aa"
2939 is a valid collation element, and don't catch
2940 'b' since 'b' is the only collation element
2941 which starts from 'b'. */
2942 const int32_t *table = (const int32_t *)
2943 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2944 for (i = 0; i < SBC_MAX; ++i)
2945 if (table[i] < 0)
2946 re_set_fastmap (fastmap, icase, i);
2947 }
2948 # else
2949 if (dfa->mb_cur_max > 1)
2950 for (i = 0; i < SBC_MAX; ++i)
2951 if (__btowc (i) == WEOF)
2952 re_set_fastmap (fastmap, icase, i);
2953 # endif /* not _LIBC */
2954 }
2955 for (i = 0; i < cset->nmbchars; ++i)
2956 {
2957 char buf[256];
2958 mbstate_t state;
2959 memset (&state, '\0', sizeof (state));
2960 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
2961 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
2962 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2963 {
2964 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
2965 != (size_t) -1)
2966 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
2967 }
2968 }
2969 }
2970 #endif /* RE_ENABLE_I18N */
2971 else if (type == OP_PERIOD
2972 #ifdef RE_ENABLE_I18N
2973 || type == OP_UTF8_PERIOD
2974 #endif /* RE_ENABLE_I18N */
2975 || type == END_OF_RE)
2976 {
2977 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
2978 if (type == END_OF_RE)
2979 bufp->can_be_null = 1;
2980 return;
2981 }
2982 }
2983 }
2984
2985 /* Entry point for POSIX code. */
2986 /* regcomp takes a regular expression as a string and compiles it.
2987
2988 PREG is a regex_t *. We do not expect any fields to be initialized,
2989 since POSIX says we shouldn't. Thus, we set
2990
2991 `buffer' to the compiled pattern;
2992 `used' to the length of the compiled pattern;
2993 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
2994 REG_EXTENDED bit in CFLAGS is set; otherwise, to
2995 RE_SYNTAX_POSIX_BASIC;
2996 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
2997 `fastmap' to an allocated space for the fastmap;
2998 `fastmap_accurate' to zero;
2999 `re_nsub' to the number of subexpressions in PATTERN.
3000
3001 PATTERN is the address of the pattern string.
3002
3003 CFLAGS is a series of bits which affect compilation.
3004
3005 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
3006 use POSIX basic syntax.
3007
3008 If REG_NEWLINE is set, then . and [^...] don't match newline.
3009 Also, regexec will try a match beginning after every newline.
3010
3011 If REG_ICASE is set, then we considers upper- and lowercase
3012 versions of letters to be equivalent when matching.
3013
3014 If REG_NOSUB is set, then when PREG is passed to regexec, that
3015 routine will report only success or failure, and nothing about the
3016 registers.
3017
3018 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
3019 the return codes and their meanings.) */
3020
3021 int
regcomp(preg,pattern,cflags)3022 regcomp (preg, pattern, cflags)
3023 regex_t *__restrict preg;
3024 const char *__restrict pattern;
3025 int cflags;
3026 {
3027 reg_errcode_t ret;
3028 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
3029 : RE_SYNTAX_POSIX_BASIC);
3030
3031 preg->buffer = NULL;
3032 preg->allocated = 0;
3033 preg->used = 0;
3034
3035 /* Try to allocate space for the fastmap. */
3036 preg->fastmap = re_malloc (char, SBC_MAX);
3037 if (BE (preg->fastmap == NULL, 0))
3038 return REG_ESPACE;
3039
3040 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
3041
3042 /* If REG_NEWLINE is set, newlines are treated differently. */
3043 if (cflags & REG_NEWLINE)
3044 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
3045 syntax &= ~RE_DOT_NEWLINE;
3046 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
3047 /* It also changes the matching behavior. */
3048 preg->newline_anchor = 1;
3049 }
3050 else
3051 preg->newline_anchor = 0;
3052 preg->no_sub = !!(cflags & REG_NOSUB);
3053 preg->translate = NULL;
3054
3055 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
3056
3057 /* POSIX doesn't distinguish between an unmatched open-group and an
3058 unmatched close-group: both are REG_EPAREN. */
3059 if (ret == REG_ERPAREN)
3060 ret = REG_EPAREN;
3061
3062 /* We have already checked preg->fastmap != NULL. */
3063 if (BE (ret == REG_NOERROR, 1))
3064 /* Compute the fastmap now, since regexec cannot modify the pattern
3065 buffer. This function never fails in this implementation. */
3066 (void) re_compile_fastmap (preg);
3067 else
3068 {
3069 /* Some error occurred while compiling the expression. */
3070 re_free (preg->fastmap);
3071 preg->fastmap = NULL;
3072 }
3073
3074 return (int) ret;
3075 }
3076 #ifdef _LIBC
weak_alias(__regcomp,regcomp)3077 weak_alias (__regcomp, regcomp)
3078 #endif
3079
3080 /* Returns a message corresponding to an error code, ERRCODE, returned
3081 from either regcomp or regexec. We don't use PREG here. */
3082
3083 /* regerror ( int errcode, preg, errbuf, errbuf_size) */
3084 size_t
3085 regerror (
3086 int errcode,
3087 const regex_t *__restrict preg,
3088 char *__restrict errbuf,
3089 size_t errbuf_size)
3090 {
3091 const char *msg;
3092 size_t msg_size;
3093
3094 if (BE (errcode < 0
3095 || errcode >= (int) (sizeof (__re_error_msgid_idx)
3096 / sizeof (__re_error_msgid_idx[0])), 0))
3097 /* Only error codes returned by the rest of the code should be passed
3098 to this routine. If we are given anything else, or if other regex
3099 code generates an invalid error code, then the program has a bug.
3100 Dump core so we can fix it. */
3101 abort ();
3102
3103 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3104
3105 msg_size = strlen (msg) + 1; /* Includes the null. */
3106
3107 if (BE (errbuf_size != 0, 1))
3108 {
3109 if (BE (msg_size > errbuf_size, 0))
3110 {
3111 #if defined HAVE_MEMPCPY || defined _LIBC
3112 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
3113 #else
3114 memcpy (errbuf, msg, errbuf_size - 1);
3115 errbuf[errbuf_size - 1] = 0;
3116 #endif
3117 }
3118 else
3119 memcpy (errbuf, msg, msg_size);
3120 }
3121
3122 return msg_size;
3123 }
3124 #ifdef _LIBC
3125 weak_alias (__regerror, regerror)
3126 #endif
3127
3128
3129 #ifdef RE_ENABLE_I18N
/* This static array is used for the map to single-byte characters when
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
   it the same all the time.  UTF-8 is the preferred encoding so this is
   a worthwhile optimization.

   init_dfa points dfa->sb_char at this array when the DFA is flagged as
   UTF-8, and free_dfa_content compares dfa->sb_char against it so that
   this static object is never handed to re_free.  */
static const bitset_t utf8_sb_map =
{
  /* Set the first 128 bits.  The `[first ... last]' range designator is a
     GNU C extension; it stores BITSET_WORD_MAX (presumably an all-ones
     word -- confirm against its definition) into every word that covers
     the byte values 0x00-0x7f.  The remaining words are zero.  */
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
};
3139 #endif
3140
3141
3142 static void
free_dfa_content(re_dfa_t * dfa)3143 free_dfa_content (re_dfa_t *dfa)
3144 {
3145 int i, j;
3146
3147 if (dfa->nodes)
3148 for (i = 0; i < dfa->nodes_len; ++i)
3149 free_token (dfa->nodes + i);
3150 re_free (dfa->nexts);
3151 for (i = 0; i < dfa->nodes_len; ++i)
3152 {
3153 if (dfa->eclosures != NULL)
3154 re_node_set_free (dfa->eclosures + i);
3155 if (dfa->inveclosures != NULL)
3156 re_node_set_free (dfa->inveclosures + i);
3157 if (dfa->edests != NULL)
3158 re_node_set_free (dfa->edests + i);
3159 }
3160 re_free (dfa->edests);
3161 re_free (dfa->eclosures);
3162 re_free (dfa->inveclosures);
3163 re_free (dfa->nodes);
3164
3165 if (dfa->state_table)
3166 for (i = 0; i <= dfa->state_hash_mask; ++i)
3167 {
3168 struct re_state_table_entry *entry = dfa->state_table + i;
3169 for (j = 0; j < entry->num; ++j)
3170 {
3171 re_dfastate_t *state = entry->array[j];
3172 free_state (state);
3173 }
3174 re_free (entry->array);
3175 }
3176 re_free (dfa->state_table);
3177 #ifdef RE_ENABLE_I18N
3178 if (dfa->sb_char != utf8_sb_map)
3179 re_free (dfa->sb_char);
3180 #endif
3181 re_free (dfa->subexp_map);
3182 #ifdef DEBUG
3183 re_free (dfa->re_str);
3184 #endif
3185
3186 re_free (dfa);
3187 }
3188
3189
3190 /* Free dynamically allocated space used by PREG. */
3191
3192 void
regfree(preg)3193 regfree (preg)
3194 regex_t *preg;
3195 {
3196 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3197 if (BE (dfa != NULL, 1))
3198 free_dfa_content (dfa);
3199 preg->buffer = NULL;
3200 preg->allocated = 0;
3201
3202 re_free (preg->fastmap);
3203 preg->fastmap = NULL;
3204
3205 re_free (preg->translate);
3206 preg->translate = NULL;
3207 }
3208 #ifdef _LIBC
3209 weak_alias (__regfree, regfree)
3210 #endif
3211
3212 /* Entry points compatible with 4.2 BSD regex library. We don't define
3213 them unless specifically requested. */
3214
3215 #if defined _REGEX_RE_COMP || defined _LIBC
3216
3217 /* BSD has one and only one pattern buffer. */
3218 static struct re_pattern_buffer re_comp_buf;
3219
3220 char *
3221 # ifdef _LIBC
3222 /* Make these definitions weak in libc, so POSIX programs can redefine
3223 these names if they don't use our functions, and still use
3224 regcomp/regexec above without link errors. */
3225 weak_function
3226 # endif
re_comp(s)3227 re_comp (s)
3228 const char *s;
3229 {
3230 reg_errcode_t ret;
3231 char *fastmap;
3232
3233 if (!s)
3234 {
3235 if (!re_comp_buf.buffer)
3236 return gettext ("No previous regular expression");
3237 return 0;
3238 }
3239
3240 if (re_comp_buf.buffer)
3241 {
3242 fastmap = re_comp_buf.fastmap;
3243 re_comp_buf.fastmap = NULL;
3244 __regfree (&re_comp_buf);
3245 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
3246 re_comp_buf.fastmap = fastmap;
3247 }
3248
3249 if (re_comp_buf.fastmap == NULL)
3250 {
3251 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
3252 if (re_comp_buf.fastmap == NULL)
3253 return (char *) gettext (__re_error_msgid
3254 + __re_error_msgid_idx[(int) REG_ESPACE]);
3255 }
3256
3257 /* Since `re_exec' always passes NULL for the `regs' argument, we
3258 don't need to initialize the pattern buffer fields which affect it. */
3259
3260 /* Match anchors at newlines. */
3261 re_comp_buf.newline_anchor = 1;
3262
3263 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
3264
3265 if (!ret)
3266 return NULL;
3267
3268 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
3269 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3270 }
3271
3272 #ifdef _LIBC
/* Release whatever the single BSD pattern buffer re_comp_buf still holds.
   NOTE(review): libc_freeres_fn is a libc-internal macro that is not
   defined in this file; presumably it registers this function to run
   when libc releases its resources -- confirm against its definition.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
3277 #endif
3278
3279 #endif /* _REGEX_RE_COMP */
3280
3281 /* Internal entry point.
3282 Compile the regular expression PATTERN, whose length is LENGTH.
3283 SYNTAX indicate regular expression's syntax. */
3284
3285 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)3286 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
3287 reg_syntax_t syntax)
3288 {
3289 reg_errcode_t err = REG_NOERROR;
3290 re_dfa_t *dfa;
3291 re_string_t regexp;
3292
3293 /* Initialize the pattern buffer. */
3294 preg->fastmap_accurate = 0;
3295 preg->syntax = syntax;
3296 preg->not_bol = preg->not_eol = 0;
3297 preg->used = 0;
3298 preg->re_nsub = 0;
3299 preg->can_be_null = 0;
3300 preg->regs_allocated = REGS_UNALLOCATED;
3301
3302 /* Initialize the dfa. */
3303 dfa = (re_dfa_t *) preg->buffer;
3304 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3305 {
3306 /* If zero allocated, but buffer is non-null, try to realloc
3307 enough space. This loses if buffer's address is bogus, but
3308 that is the user's responsibility. If ->buffer is NULL this
3309 is a simple allocation. */
3310 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
3311 if (dfa == NULL)
3312 return REG_ESPACE;
3313 preg->allocated = sizeof (re_dfa_t);
3314 preg->buffer = (unsigned char *) dfa;
3315 }
3316 preg->used = sizeof (re_dfa_t);
3317
3318 err = init_dfa (dfa, length);
3319 if (BE (err != REG_NOERROR, 0))
3320 {
3321 free_dfa_content (dfa);
3322 preg->buffer = NULL;
3323 preg->allocated = 0;
3324 return err;
3325 }
3326 #ifdef DEBUG
3327 /* Note: length+1 will not overflow since it is checked in init_dfa. */
3328 dfa->re_str = re_malloc (char, length + 1);
3329 strncpy (dfa->re_str, pattern, length + 1);
3330 #endif
3331
3332 __libc_lock_init (dfa->lock);
3333
3334 err = re_string_construct (®exp, pattern, length, preg->translate,
3335 syntax & RE_ICASE, dfa);
3336 if (BE (err != REG_NOERROR, 0))
3337 {
3338 re_compile_internal_free_return:
3339 free_workarea_compile (preg);
3340 re_string_destruct (®exp);
3341 free_dfa_content (dfa);
3342 preg->buffer = NULL;
3343 preg->allocated = 0;
3344 return err;
3345 }
3346
3347 /* Parse the regular expression, and build a structure tree. */
3348 preg->re_nsub = 0;
3349 dfa->str_tree = parse (®exp, preg, syntax, &err);
3350 if (BE (dfa->str_tree == NULL, 0))
3351 goto re_compile_internal_free_return;
3352
3353 /* Analyze the tree and create the nfa. */
3354 err = analyze (preg);
3355 if (BE (err != REG_NOERROR, 0))
3356 goto re_compile_internal_free_return;
3357
3358 #ifdef RE_ENABLE_I18N
3359 /* If possible, do searching in single byte encoding to speed things up. */
3360 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
3361 optimize_utf8 (dfa);
3362 #endif
3363
3364 /* Then create the initial state of the dfa. */
3365 err = create_initial_state (dfa);
3366
3367 /* Release work areas. */
3368 free_workarea_compile (preg);
3369 re_string_destruct (®exp);
3370
3371 if (BE (err != REG_NOERROR, 0))
3372 {
3373 free_dfa_content (dfa);
3374 preg->buffer = NULL;
3375 preg->allocated = 0;
3376 }
3377
3378 return err;
3379 }
3380
3381 /* Initialize DFA. We use the length of the regular expression PAT_LEN
3382 as the initial length of some arrays. */
3383
3384 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)3385 init_dfa (re_dfa_t *dfa, size_t pat_len)
3386 {
3387 unsigned int table_size;
3388 #ifndef _LIBC
3389 char *codeset_name;
3390 #endif
3391
3392 memset (dfa, '\0', sizeof (re_dfa_t));
3393
3394 /* Force allocation of str_tree_storage the first time. */
3395 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3396
3397 /* Avoid overflows. */
3398 if (pat_len == SIZE_MAX)
3399 return REG_ESPACE;
3400
3401 dfa->nodes_alloc = pat_len + 1;
3402 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
3403
3404 /* table_size = 2 ^ ceil(log pat_len) */
3405 for (table_size = 1; ; table_size <<= 1)
3406 if (table_size > pat_len)
3407 break;
3408
3409 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
3410 dfa->state_hash_mask = table_size - 1;
3411
3412 dfa->mb_cur_max = MB_CUR_MAX;
3413 #ifdef _LIBC
3414 if (dfa->mb_cur_max == 6
3415 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
3416 dfa->is_utf8 = 1;
3417 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
3418 != 0);
3419 #else
3420 # ifdef HAVE_LANGINFO_CODESET
3421 codeset_name = nl_langinfo (CODESET);
3422 # else
3423 codeset_name = getenv ("LC_ALL");
3424 if (codeset_name == NULL || codeset_name[0] == '\0')
3425 codeset_name = getenv ("LC_CTYPE");
3426 if (codeset_name == NULL || codeset_name[0] == '\0')
3427 codeset_name = getenv ("LANG");
3428 if (codeset_name == NULL)
3429 codeset_name = "";
3430 else if (strchr (codeset_name, '.') != NULL)
3431 codeset_name = strchr (codeset_name, '.') + 1;
3432 # endif
3433
3434 if (strcasecmp (codeset_name, "UTF-8") == 0
3435 || strcasecmp (codeset_name, "UTF8") == 0)
3436 dfa->is_utf8 = 1;
3437
3438 /* We check exhaustively in the loop below if this charset is a
3439 superset of ASCII. */
3440 dfa->map_notascii = 0;
3441 #endif
3442
3443 #ifdef RE_ENABLE_I18N
3444 if (dfa->mb_cur_max > 1)
3445 {
3446 if (dfa->is_utf8)
3447 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
3448 else
3449 {
3450 int i, j, ch;
3451
3452 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3453 if (BE (dfa->sb_char == NULL, 0))
3454 return REG_ESPACE;
3455
3456 /* Set the bits corresponding to single byte chars. */
3457 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3458 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3459 {
3460 wint_t wch = __btowc (ch);
3461 if (wch != WEOF)
3462 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
3463 # ifndef _LIBC
3464 if (isascii (ch) && wch != ch)
3465 dfa->map_notascii = 1;
3466 # endif
3467 }
3468 }
3469 }
3470 #endif
3471
3472 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
3473 return REG_ESPACE;
3474 return REG_NOERROR;
3475 }
3476
3477 /* Initialize WORD_CHAR table, which indicate which character is
3478 "word". In this case "word" means that it is the word construction
3479 character used by some operators like "\<", "\>", etc. */
3480
3481 static void
3482 internal_function
init_word_char(re_dfa_t * dfa)3483 init_word_char (re_dfa_t *dfa)
3484 {
3485 int i, j, ch;
3486 dfa->word_ops_used = 1;
3487 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3488 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3489 if (isalnum (ch) || ch == '_')
3490 dfa->word_char[i] |= (bitset_word_t) 1 << j;
3491 }
3492
3493 /* Free the work area which are only used while compiling. */
3494
3495 static void
free_workarea_compile(regex_t * preg)3496 free_workarea_compile (regex_t *preg)
3497 {
3498 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3499 bin_tree_storage_t *storage, *next;
3500 for (storage = dfa->str_tree_storage; storage; storage = next)
3501 {
3502 next = storage->next;
3503 re_free (storage);
3504 }
3505 dfa->str_tree_storage = NULL;
3506 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3507 dfa->str_tree = NULL;
3508 re_free (dfa->org_indices);
3509 dfa->org_indices = NULL;
3510 }
3511
3512 /* Create initial states for all contexts. */
3513
3514 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)3515 create_initial_state (re_dfa_t *dfa)
3516 {
3517 int first, i;
3518 reg_errcode_t err;
3519 re_node_set init_nodes;
3520
3521 /* Initial states have the epsilon closure of the node which is
3522 the first node of the regular expression. */
3523 first = dfa->str_tree->first->node_idx;
3524 dfa->init_node = first;
3525 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
3526 if (BE (err != REG_NOERROR, 0))
3527 return err;
3528
3529 /* The back-references which are in initial states can epsilon transit,
3530 since in this case all of the subexpressions can be null.
3531 Then we add epsilon closures of the nodes which are the next nodes of
3532 the back-references. */
3533 if (dfa->nbackref > 0)
3534 for (i = 0; i < init_nodes.nelem; ++i)
3535 {
3536 int node_idx = init_nodes.elems[i];
3537 re_token_type_t type = dfa->nodes[node_idx].type;
3538
3539 int clexp_idx;
3540 if (type != OP_BACK_REF)
3541 continue;
3542 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
3543 {
3544 re_token_t *clexp_node;
3545 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
3546 if (clexp_node->type == OP_CLOSE_SUBEXP
3547 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
3548 break;
3549 }
3550 if (clexp_idx == init_nodes.nelem)
3551 continue;
3552
3553 if (type == OP_BACK_REF)
3554 {
3555 int dest_idx = dfa->edests[node_idx].elems[0];
3556 if (!re_node_set_contains (&init_nodes, dest_idx))
3557 {
3558 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
3559 i = 0;
3560 }
3561 }
3562 }
3563
3564 /* It must be the first time to invoke acquire_state. */
3565 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
3566 /* We don't check ERR here, since the initial state must not be NULL. */
3567 if (BE (dfa->init_state == NULL, 0))
3568 return err;
3569 if (dfa->init_state->has_constraint)
3570 {
3571 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
3572 CONTEXT_WORD);
3573 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
3574 CONTEXT_NEWLINE);
3575 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
3576 &init_nodes,
3577 CONTEXT_NEWLINE
3578 | CONTEXT_BEGBUF);
3579 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
3580 || dfa->init_state_begbuf == NULL, 0))
3581 return err;
3582 }
3583 else
3584 dfa->init_state_word = dfa->init_state_nl
3585 = dfa->init_state_begbuf = dfa->init_state;
3586
3587 re_node_set_free (&init_nodes);
3588 return REG_NOERROR;
3589 }
3590
3591 #ifdef RE_ENABLE_I18N
3592 /* If it is possible to do searching in single byte encoding instead of UTF-8
3593 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
3594 DFA nodes where needed. */
3595
3596 static void
optimize_utf8(re_dfa_t * dfa)3597 optimize_utf8 (re_dfa_t *dfa)
3598 {
3599 int node, i, mb_chars = 0, has_period = 0;
3600
3601 for (node = 0; node < dfa->nodes_len; ++node)
3602 switch (dfa->nodes[node].type)
3603 {
3604 case CHARACTER:
3605 if (dfa->nodes[node].opr.c >= 0x80)
3606 mb_chars = 1;
3607 break;
3608 case ANCHOR:
3609 switch (dfa->nodes[node].opr.idx)
3610 {
3611 case LINE_FIRST:
3612 case LINE_LAST:
3613 case BUF_FIRST:
3614 case BUF_LAST:
3615 break;
3616 default:
3617 /* Word anchors etc. cannot be handled. */
3618 return;
3619 }
3620 break;
3621 case OP_PERIOD:
3622 has_period = 1;
3623 break;
3624 case OP_BACK_REF:
3625 case OP_ALT:
3626 case END_OF_RE:
3627 case OP_DUP_ASTERISK:
3628 case OP_OPEN_SUBEXP:
3629 case OP_CLOSE_SUBEXP:
3630 break;
3631 case COMPLEX_BRACKET:
3632 return;
3633 case SIMPLE_BRACKET:
3634 /* Just double check. The non-ASCII range starts at 0x80. */
3635 assert (0x80 % BITSET_WORD_BITS == 0);
3636 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
3637 if (dfa->nodes[node].opr.sbcset[i])
3638 return;
3639 break;
3640 default:
3641 abort ();
3642 }
3643
3644 if (mb_chars || has_period)
3645 for (node = 0; node < dfa->nodes_len; ++node)
3646 {
3647 if (dfa->nodes[node].type == CHARACTER
3648 && dfa->nodes[node].opr.c >= 0x80)
3649 dfa->nodes[node].mb_partial = 0;
3650 else if (dfa->nodes[node].type == OP_PERIOD)
3651 dfa->nodes[node].type = OP_UTF8_PERIOD;
3652 }
3653
3654 /* The search can be in single byte locale. */
3655 dfa->mb_cur_max = 1;
3656 dfa->is_utf8 = 0;
3657 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
3658 }
3659 #endif
3660
3661 /* Analyze the structure tree, and calculate "first", "next", "edest",
3662 "eclosure", and "inveclosure". */
3663
3664 static reg_errcode_t
analyze(regex_t * preg)3665 analyze (regex_t *preg)
3666 {
3667 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3668 reg_errcode_t ret;
3669
3670 /* Allocate arrays. */
3671 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
3672 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3673 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
3674 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
3675 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
3676 || dfa->eclosures == NULL, 0))
3677 return REG_ESPACE;
3678
3679 dfa->subexp_map = re_malloc (int, preg->re_nsub);
3680 if (dfa->subexp_map != NULL)
3681 {
3682 int i;
3683 for (i = 0; i < preg->re_nsub; i++)
3684 dfa->subexp_map[i] = i;
3685 preorder (dfa->str_tree, optimize_subexps, dfa);
3686 for (i = 0; i < preg->re_nsub; i++)
3687 if (dfa->subexp_map[i] != i)
3688 break;
3689 if (i == preg->re_nsub)
3690 {
3691 free (dfa->subexp_map);
3692 dfa->subexp_map = NULL;
3693 }
3694 }
3695
3696 ret = postorder (dfa->str_tree, lower_subexps, preg);
3697 if (BE (ret != REG_NOERROR, 0))
3698 return ret;
3699 ret = postorder (dfa->str_tree, calc_first, dfa);
3700 if (BE (ret != REG_NOERROR, 0))
3701 return ret;
3702 preorder (dfa->str_tree, calc_next, dfa);
3703 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
3704 if (BE (ret != REG_NOERROR, 0))
3705 return ret;
3706 ret = calc_eclosure (dfa);
3707 if (BE (ret != REG_NOERROR, 0))
3708 return ret;
3709
3710 /* We only need this during the prune_impossible_nodes pass in regexec.c;
3711 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
3712 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
3713 || dfa->nbackref)
3714 {
3715 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
3716 if (BE (dfa->inveclosures == NULL, 0))
3717 return REG_ESPACE;
3718 ret = calc_inveclosure (dfa);
3719 }
3720
3721 return ret;
3722 }
3723
3724 /* Our parse trees are very unbalanced, so we cannot use a stack to
3725 implement parse tree visits. Instead, we use parent pointers and
3726 some hairy code in these two functions. */
3727 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)3728 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3729 void *extra)
3730 {
3731 bin_tree_t *node, *prev;
3732
3733 for (node = root; ; )
3734 {
3735 /* Descend down the tree, preferably to the left (or to the right
3736 if that's the only child). */
3737 while (node->left || node->right)
3738 if (node->left)
3739 node = node->left;
3740 else
3741 node = node->right;
3742
3743 do
3744 {
3745 reg_errcode_t err = fn (extra, node);
3746 if (BE (err != REG_NOERROR, 0))
3747 return err;
3748 if (node->parent == NULL)
3749 return REG_NOERROR;
3750 prev = node;
3751 node = node->parent;
3752 }
3753 /* Go up while we have a node that is reached from the right. */
3754 while (node->right == prev || node->right == NULL);
3755 node = node->right;
3756 }
3757 }
3758
3759 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)3760 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3761 void *extra)
3762 {
3763 bin_tree_t *node;
3764
3765 for (node = root; ; )
3766 {
3767 reg_errcode_t err = fn (extra, node);
3768 if (BE (err != REG_NOERROR, 0))
3769 return err;
3770
3771 /* Go to the left node, or up and to the right. */
3772 if (node->left)
3773 node = node->left;
3774 else
3775 {
3776 bin_tree_t *prev = NULL;
3777 while (node->right == prev || node->right == NULL)
3778 {
3779 prev = node;
3780 node = node->parent;
3781 if (!node)
3782 return REG_NOERROR;
3783 }
3784 node = node->right;
3785 }
3786 }
3787 }
3788
3789 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
3790 re_search_internal to map the inner one's opr.idx to this one's. Adjust
3791 backreferences as well. Requires a preorder visit. */
3792 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)3793 optimize_subexps (void *extra, bin_tree_t *node)
3794 {
3795 re_dfa_t *dfa = (re_dfa_t *) extra;
3796
3797 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3798 {
3799 int idx = node->token.opr.idx;
3800 node->token.opr.idx = dfa->subexp_map[idx];
3801 dfa->used_bkref_map |= 1 << node->token.opr.idx;
3802 }
3803
3804 else if (node->token.type == SUBEXP
3805 && node->left && node->left->token.type == SUBEXP)
3806 {
3807 int other_idx = node->left->token.opr.idx;
3808
3809 node->left = node->left->left;
3810 if (node->left)
3811 node->left->parent = node;
3812
3813 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
3814 if (other_idx < BITSET_WORD_BITS)
3815 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3816 }
3817
3818 return REG_NOERROR;
3819 }
3820
3821 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
3822 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
3823 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)3824 lower_subexps (void *extra, bin_tree_t *node)
3825 {
3826 regex_t *preg = (regex_t *) extra;
3827 reg_errcode_t err = REG_NOERROR;
3828
3829 if (node->left && node->left->token.type == SUBEXP)
3830 {
3831 node->left = lower_subexp (&err, preg, node->left);
3832 if (node->left)
3833 node->left->parent = node;
3834 }
3835 if (node->right && node->right->token.type == SUBEXP)
3836 {
3837 node->right = lower_subexp (&err, preg, node->right);
3838 if (node->right)
3839 node->right->parent = node;
3840 }
3841
3842 return err;
3843 }
3844
3845 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)3846 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
3847 {
3848 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3849 bin_tree_t *body = node->left;
3850 bin_tree_t *op, *cls, *tree1, *tree;
3851
3852 if (preg->no_sub
3853 /* We do not optimize empty subexpressions, because otherwise we may
3854 have bad CONCAT nodes with NULL children. This is obviously not
3855 very common, so we do not lose much. An example that triggers
3856 this case is the sed "script" /\(\)/x. */
3857 && node->left != NULL
3858 && (node->token.opr.idx >= BITSET_WORD_BITS
3859 || !(dfa->used_bkref_map
3860 & ((bitset_word_t) 1 << node->token.opr.idx))))
3861 return node->left;
3862
3863 /* Convert the SUBEXP node to the concatenation of an
3864 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
3865 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
3866 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
3867 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
3868 tree = create_tree (dfa, op, tree1, CONCAT);
3869 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
3870 {
3871 *err = REG_ESPACE;
3872 return NULL;
3873 }
3874
3875 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
3876 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
3877 return tree;
3878 }
3879
3880 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
3881 nodes. Requires a postorder visit. */
3882 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)3883 calc_first (void *extra, bin_tree_t *node)
3884 {
3885 re_dfa_t *dfa = (re_dfa_t *) extra;
3886 if (node->token.type == CONCAT)
3887 {
3888 node->first = node->left->first;
3889 node->node_idx = node->left->node_idx;
3890 }
3891 else
3892 {
3893 node->first = node;
3894 node->node_idx = re_dfa_add_node (dfa, node->token);
3895 if (BE (node->node_idx == -1, 0))
3896 return REG_ESPACE;
3897 }
3898 return REG_NOERROR;
3899 }
3900
3901 /* Pass 2: compute NEXT on the tree. Preorder visit. */
3902 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)3903 calc_next (void *extra, bin_tree_t *node)
3904 {
3905 switch (node->token.type)
3906 {
3907 case OP_DUP_ASTERISK:
3908 node->left->next = node;
3909 break;
3910 case CONCAT:
3911 node->left->next = node->right->first;
3912 node->right->next = node->next;
3913 break;
3914 default:
3915 if (node->left)
3916 node->left->next = node->next;
3917 if (node->right)
3918 node->right->next = node->next;
3919 break;
3920 }
3921 return REG_NOERROR;
3922 }
3923
3924 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
3925 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)3926 link_nfa_nodes (void *extra, bin_tree_t *node)
3927 {
3928 re_dfa_t *dfa = (re_dfa_t *) extra;
3929 int idx = node->node_idx;
3930 reg_errcode_t err = REG_NOERROR;
3931
3932 switch (node->token.type)
3933 {
3934 case CONCAT:
3935 break;
3936
3937 case END_OF_RE:
3938 assert (node->next == NULL);
3939 break;
3940
3941 case OP_DUP_ASTERISK:
3942 case OP_ALT:
3943 {
3944 int left, right;
3945 dfa->has_plural_match = 1;
3946 if (node->left != NULL)
3947 left = node->left->first->node_idx;
3948 else
3949 left = node->next->node_idx;
3950 if (node->right != NULL)
3951 right = node->right->first->node_idx;
3952 else
3953 right = node->next->node_idx;
3954 assert (left > -1);
3955 assert (right > -1);
3956 err = re_node_set_init_2 (dfa->edests + idx, left, right);
3957 }
3958 break;
3959
3960 case ANCHOR:
3961 case OP_OPEN_SUBEXP:
3962 case OP_CLOSE_SUBEXP:
3963 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
3964 break;
3965
3966 case OP_BACK_REF:
3967 dfa->nexts[idx] = node->next->node_idx;
3968 if (node->token.type == OP_BACK_REF)
3969 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
3970 break;
3971
3972 default:
3973 assert (!IS_EPSILON_NODE (node->token.type));
3974 dfa->nexts[idx] = node->next->node_idx;
3975 break;
3976 }
3977
3978 return err;
3979 }
3980
3981 /* Duplicate the epsilon closure of the node ROOT_NODE.
3982 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
3983 to their own constraint. */
3984
3985 static reg_errcode_t
3986 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)3987 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
3988 int root_node, unsigned int init_constraint)
3989 {
3990 int org_node, clone_node, ret;
3991 unsigned int constraint = init_constraint;
3992 for (org_node = top_org_node, clone_node = top_clone_node;;)
3993 {
3994 int org_dest, clone_dest;
3995 if (dfa->nodes[org_node].type == OP_BACK_REF)
3996 {
3997 /* If the back reference epsilon-transit, its destination must
3998 also have the constraint. Then duplicate the epsilon closure
3999 of the destination of the back reference, and store it in
4000 edests of the back reference. */
4001 org_dest = dfa->nexts[org_node];
4002 re_node_set_empty (dfa->edests + clone_node);
4003 clone_dest = duplicate_node (dfa, org_dest, constraint);
4004 if (BE (clone_dest == -1, 0))
4005 return REG_ESPACE;
4006 dfa->nexts[clone_node] = dfa->nexts[org_node];
4007 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4008 if (BE (ret < 0, 0))
4009 return REG_ESPACE;
4010 }
4011 else if (dfa->edests[org_node].nelem == 0)
4012 {
4013 /* In case of the node can't epsilon-transit, don't duplicate the
4014 destination and store the original destination as the
4015 destination of the node. */
4016 dfa->nexts[clone_node] = dfa->nexts[org_node];
4017 break;
4018 }
4019 else if (dfa->edests[org_node].nelem == 1)
4020 {
4021 /* In case of the node can epsilon-transit, and it has only one
4022 destination. */
4023 org_dest = dfa->edests[org_node].elems[0];
4024 re_node_set_empty (dfa->edests + clone_node);
4025 if (dfa->nodes[org_node].type == ANCHOR)
4026 {
4027 /* In case of the node has another constraint, append it. */
4028 if (org_node == root_node && clone_node != org_node)
4029 {
4030 /* ...but if the node is root_node itself, it means the
4031 epsilon closure have a loop, then tie it to the
4032 destination of the root_node. */
4033 ret = re_node_set_insert (dfa->edests + clone_node,
4034 org_dest);
4035 if (BE (ret < 0, 0))
4036 return REG_ESPACE;
4037 break;
4038 }
4039 constraint |= dfa->nodes[org_node].opr.ctx_type;
4040 }
4041 clone_dest = duplicate_node (dfa, org_dest, constraint);
4042 if (BE (clone_dest == -1, 0))
4043 return REG_ESPACE;
4044 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4045 if (BE (ret < 0, 0))
4046 return REG_ESPACE;
4047 }
4048 else /* dfa->edests[org_node].nelem == 2 */
4049 {
4050 /* In case of the node can epsilon-transit, and it has two
4051 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
4052 org_dest = dfa->edests[org_node].elems[0];
4053 re_node_set_empty (dfa->edests + clone_node);
4054 /* Search for a duplicated node which satisfies the constraint. */
4055 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
4056 if (clone_dest == -1)
4057 {
4058 /* There are no such a duplicated node, create a new one. */
4059 reg_errcode_t err;
4060 clone_dest = duplicate_node (dfa, org_dest, constraint);
4061 if (BE (clone_dest == -1, 0))
4062 return REG_ESPACE;
4063 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4064 if (BE (ret < 0, 0))
4065 return REG_ESPACE;
4066 err = duplicate_node_closure (dfa, org_dest, clone_dest,
4067 root_node, constraint);
4068 if (BE (err != REG_NOERROR, 0))
4069 return err;
4070 }
4071 else
4072 {
4073 /* There are a duplicated node which satisfy the constraint,
4074 use it to avoid infinite loop. */
4075 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4076 if (BE (ret < 0, 0))
4077 return REG_ESPACE;
4078 }
4079
4080 org_dest = dfa->edests[org_node].elems[1];
4081 clone_dest = duplicate_node (dfa, org_dest, constraint);
4082 if (BE (clone_dest == -1, 0))
4083 return REG_ESPACE;
4084 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4085 if (BE (ret < 0, 0))
4086 return REG_ESPACE;
4087 }
4088 org_node = org_dest;
4089 clone_node = clone_dest;
4090 }
4091 return REG_NOERROR;
4092 }
4093
4094 /* Search for a node which is duplicated from the node ORG_NODE, and
4095 satisfies the constraint CONSTRAINT. */
4096
4097 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)4098 search_duplicated_node (const re_dfa_t *dfa, int org_node,
4099 unsigned int constraint)
4100 {
4101 int idx;
4102 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
4103 {
4104 if (org_node == dfa->org_indices[idx]
4105 && constraint == dfa->nodes[idx].constraint)
4106 return idx; /* Found. */
4107 }
4108 return -1; /* Not found. */
4109 }
4110
4111 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
4112 Return the index of the new node, or -1 if insufficient storage is
4113 available. */
4114
4115 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)4116 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
4117 {
4118 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
4119 if (BE (dup_idx != -1, 1))
4120 {
4121 dfa->nodes[dup_idx].constraint = constraint;
4122 if (dfa->nodes[org_idx].type == ANCHOR)
4123 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
4124 dfa->nodes[dup_idx].duplicated = 1;
4125
4126 /* Store the index of the original node. */
4127 dfa->org_indices[dup_idx] = org_idx;
4128 }
4129 return dup_idx;
4130 }
4131
4132 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)4133 calc_inveclosure (re_dfa_t *dfa)
4134 {
4135 int src, idx, ret;
4136 for (idx = 0; idx < dfa->nodes_len; ++idx)
4137 re_node_set_init_empty (dfa->inveclosures + idx);
4138
4139 for (src = 0; src < dfa->nodes_len; ++src)
4140 {
4141 int *elems = dfa->eclosures[src].elems;
4142 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
4143 {
4144 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
4145 if (BE (ret == -1, 0))
4146 return REG_ESPACE;
4147 }
4148 }
4149
4150 return REG_NOERROR;
4151 }
4152
4153 /* Calculate "eclosure" for all the node in DFA. */
4154
4155 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)4156 calc_eclosure (re_dfa_t *dfa)
4157 {
4158 int node_idx, incomplete;
4159 #ifdef DEBUG
4160 assert (dfa->nodes_len > 0);
4161 #endif
4162 incomplete = 0;
4163 /* For each nodes, calculate epsilon closure. */
4164 for (node_idx = 0; ; ++node_idx)
4165 {
4166 reg_errcode_t err;
4167 re_node_set eclosure_elem;
4168 if (node_idx == dfa->nodes_len)
4169 {
4170 if (!incomplete)
4171 break;
4172 incomplete = 0;
4173 node_idx = 0;
4174 }
4175
4176 #ifdef DEBUG
4177 assert (dfa->eclosures[node_idx].nelem != -1);
4178 #endif
4179
4180 /* If we have already calculated, skip it. */
4181 if (dfa->eclosures[node_idx].nelem != 0)
4182 continue;
4183 /* Calculate epsilon closure of `node_idx'. */
4184 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
4185 if (BE (err != REG_NOERROR, 0))
4186 return err;
4187
4188 if (dfa->eclosures[node_idx].nelem == 0)
4189 {
4190 incomplete = 1;
4191 re_node_set_free (&eclosure_elem);
4192 }
4193 }
4194 return REG_NOERROR;
4195 }
4196
4197 /* Calculate epsilon closure of NODE. */
4198
4199 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)4200 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
4201 {
4202 reg_errcode_t err;
4203 unsigned int constraint;
4204 int i, incomplete;
4205 re_node_set eclosure;
4206 incomplete = 0;
4207 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
4208 if (BE (err != REG_NOERROR, 0))
4209 return err;
4210
4211 /* This indicates that we are calculating this node now.
4212 We reference this value to avoid infinite loop. */
4213 dfa->eclosures[node].nelem = -1;
4214
4215 constraint = ((dfa->nodes[node].type == ANCHOR)
4216 ? dfa->nodes[node].opr.ctx_type : 0);
4217 /* If the current node has constraints, duplicate all nodes.
4218 Since they must inherit the constraints. */
4219 if (constraint
4220 && dfa->edests[node].nelem
4221 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
4222 {
4223 err = duplicate_node_closure (dfa, node, node, node, constraint);
4224 if (BE (err != REG_NOERROR, 0))
4225 return err;
4226 }
4227
4228 /* Expand each epsilon destination nodes. */
4229 if (IS_EPSILON_NODE(dfa->nodes[node].type))
4230 for (i = 0; i < dfa->edests[node].nelem; ++i)
4231 {
4232 re_node_set eclosure_elem;
4233 int edest = dfa->edests[node].elems[i];
4234 /* If calculating the epsilon closure of `edest' is in progress,
4235 return intermediate result. */
4236 if (dfa->eclosures[edest].nelem == -1)
4237 {
4238 incomplete = 1;
4239 continue;
4240 }
4241 /* If we haven't calculated the epsilon closure of `edest' yet,
4242 calculate now. Otherwise use calculated epsilon closure. */
4243 if (dfa->eclosures[edest].nelem == 0)
4244 {
4245 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
4246 if (BE (err != REG_NOERROR, 0))
4247 return err;
4248 }
4249 else
4250 eclosure_elem = dfa->eclosures[edest];
4251 /* Merge the epsilon closure of `edest'. */
4252 re_node_set_merge (&eclosure, &eclosure_elem);
4253 /* If the epsilon closure of `edest' is incomplete,
4254 the epsilon closure of this node is also incomplete. */
4255 if (dfa->eclosures[edest].nelem == 0)
4256 {
4257 incomplete = 1;
4258 re_node_set_free (&eclosure_elem);
4259 }
4260 }
4261
4262 /* Epsilon closures include itself. */
4263 re_node_set_insert (&eclosure, node);
4264 if (incomplete && !root)
4265 dfa->eclosures[node].nelem = 0;
4266 else
4267 dfa->eclosures[node] = eclosure;
4268 *new_set = eclosure;
4269 return REG_NOERROR;
4270 }
4271
4272 /* Functions for token which are used in the parser. */
4273
4274 /* Fetch a token from INPUT.
4275 We must not use this function inside bracket expressions. */
4276
4277 static void
4278 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)4279 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
4280 {
4281 re_string_skip_bytes (input, peek_token (result, input, syntax));
4282 }
4283
4284 /* Peek a token from INPUT, and return the length of the token.
4285 We must not use this function inside bracket expressions. */
4286
4287 static int
4288 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)4289 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4290 {
4291 unsigned char c;
4292
4293 if (re_string_eoi (input))
4294 {
4295 token->type = END_OF_RE;
4296 return 0;
4297 }
4298
4299 c = re_string_peek_byte (input, 0);
4300 token->opr.c = c;
4301
4302 token->word_char = 0;
4303 #ifdef RE_ENABLE_I18N
4304 token->mb_partial = 0;
4305 if (input->mb_cur_max > 1 &&
4306 !re_string_first_byte (input, re_string_cur_idx (input)))
4307 {
4308 token->type = CHARACTER;
4309 token->mb_partial = 1;
4310 return 1;
4311 }
4312 #endif
4313 if (c == '\\')
4314 {
4315 unsigned char c2;
4316 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
4317 {
4318 token->type = BACK_SLASH;
4319 return 1;
4320 }
4321
4322 c2 = re_string_peek_byte_case (input, 1);
4323 token->opr.c = c2;
4324 token->type = CHARACTER;
4325 #ifdef RE_ENABLE_I18N
4326 if (input->mb_cur_max > 1)
4327 {
4328 wint_t wc = re_string_wchar_at (input,
4329 re_string_cur_idx (input) + 1);
4330 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4331 }
4332 else
4333 #endif
4334 token->word_char = IS_WORD_CHAR (c2) != 0;
4335
4336 switch (c2)
4337 {
4338 case '|':
4339 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
4340 token->type = OP_ALT;
4341 break;
4342 case '1': case '2': case '3': case '4': case '5':
4343 case '6': case '7': case '8': case '9':
4344 if (!(syntax & RE_NO_BK_REFS))
4345 {
4346 token->type = OP_BACK_REF;
4347 token->opr.idx = c2 - '1';
4348 }
4349 break;
4350 case '<':
4351 if (!(syntax & RE_NO_GNU_OPS))
4352 {
4353 token->type = ANCHOR;
4354 token->opr.ctx_type = WORD_FIRST;
4355 }
4356 break;
4357 case '>':
4358 if (!(syntax & RE_NO_GNU_OPS))
4359 {
4360 token->type = ANCHOR;
4361 token->opr.ctx_type = WORD_LAST;
4362 }
4363 break;
4364 case 'b':
4365 if (!(syntax & RE_NO_GNU_OPS))
4366 {
4367 token->type = ANCHOR;
4368 token->opr.ctx_type = WORD_DELIM;
4369 }
4370 break;
4371 case 'B':
4372 if (!(syntax & RE_NO_GNU_OPS))
4373 {
4374 token->type = ANCHOR;
4375 token->opr.ctx_type = NOT_WORD_DELIM;
4376 }
4377 break;
4378 case 'w':
4379 if (!(syntax & RE_NO_GNU_OPS))
4380 token->type = OP_WORD;
4381 break;
4382 case 'W':
4383 if (!(syntax & RE_NO_GNU_OPS))
4384 token->type = OP_NOTWORD;
4385 break;
4386 case 's':
4387 if (!(syntax & RE_NO_GNU_OPS))
4388 token->type = OP_SPACE;
4389 break;
4390 case 'S':
4391 if (!(syntax & RE_NO_GNU_OPS))
4392 token->type = OP_NOTSPACE;
4393 break;
4394 case '`':
4395 if (!(syntax & RE_NO_GNU_OPS))
4396 {
4397 token->type = ANCHOR;
4398 token->opr.ctx_type = BUF_FIRST;
4399 }
4400 break;
4401 case '\'':
4402 if (!(syntax & RE_NO_GNU_OPS))
4403 {
4404 token->type = ANCHOR;
4405 token->opr.ctx_type = BUF_LAST;
4406 }
4407 break;
4408 case '(':
4409 if (!(syntax & RE_NO_BK_PARENS))
4410 token->type = OP_OPEN_SUBEXP;
4411 break;
4412 case ')':
4413 if (!(syntax & RE_NO_BK_PARENS))
4414 token->type = OP_CLOSE_SUBEXP;
4415 break;
4416 case '+':
4417 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4418 token->type = OP_DUP_PLUS;
4419 break;
4420 case '?':
4421 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4422 token->type = OP_DUP_QUESTION;
4423 break;
4424 case '{':
4425 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4426 token->type = OP_OPEN_DUP_NUM;
4427 break;
4428 case '}':
4429 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4430 token->type = OP_CLOSE_DUP_NUM;
4431 break;
4432 default:
4433 break;
4434 }
4435 return 2;
4436 }
4437
4438 token->type = CHARACTER;
4439 #ifdef RE_ENABLE_I18N
4440 if (input->mb_cur_max > 1)
4441 {
4442 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
4443 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4444 }
4445 else
4446 #endif
4447 token->word_char = IS_WORD_CHAR (token->opr.c);
4448
4449 switch (c)
4450 {
4451 case '\n':
4452 if (syntax & RE_NEWLINE_ALT)
4453 token->type = OP_ALT;
4454 break;
4455 case '|':
4456 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
4457 token->type = OP_ALT;
4458 break;
4459 case '*':
4460 token->type = OP_DUP_ASTERISK;
4461 break;
4462 case '+':
4463 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4464 token->type = OP_DUP_PLUS;
4465 break;
4466 case '?':
4467 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4468 token->type = OP_DUP_QUESTION;
4469 break;
4470 case '{':
4471 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4472 token->type = OP_OPEN_DUP_NUM;
4473 break;
4474 case '}':
4475 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4476 token->type = OP_CLOSE_DUP_NUM;
4477 break;
4478 case '(':
4479 if (syntax & RE_NO_BK_PARENS)
4480 token->type = OP_OPEN_SUBEXP;
4481 break;
4482 case ')':
4483 if (syntax & RE_NO_BK_PARENS)
4484 token->type = OP_CLOSE_SUBEXP;
4485 break;
4486 case '[':
4487 token->type = OP_OPEN_BRACKET;
4488 break;
4489 case '.':
4490 token->type = OP_PERIOD;
4491 break;
4492 case '^':
4493 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
4494 re_string_cur_idx (input) != 0)
4495 {
4496 char prev = re_string_peek_byte (input, -1);
4497 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
4498 break;
4499 }
4500 token->type = ANCHOR;
4501 token->opr.ctx_type = LINE_FIRST;
4502 break;
4503 case '$':
4504 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
4505 re_string_cur_idx (input) + 1 != re_string_length (input))
4506 {
4507 re_token_t next;
4508 re_string_skip_bytes (input, 1);
4509 peek_token (&next, input, syntax);
4510 re_string_skip_bytes (input, -1);
4511 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
4512 break;
4513 }
4514 token->type = ANCHOR;
4515 token->opr.ctx_type = LINE_LAST;
4516 break;
4517 default:
4518 break;
4519 }
4520 return 1;
4521 }
4522
4523 /* Peek a token from INPUT, and return the length of the token.
4524 We must not use this function out of bracket expressions. */
4525
4526 static int
4527 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)4528 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4529 {
4530 unsigned char c;
4531 if (re_string_eoi (input))
4532 {
4533 token->type = END_OF_RE;
4534 return 0;
4535 }
4536 c = re_string_peek_byte (input, 0);
4537 token->opr.c = c;
4538
4539 #ifdef RE_ENABLE_I18N
4540 if (input->mb_cur_max > 1 &&
4541 !re_string_first_byte (input, re_string_cur_idx (input)))
4542 {
4543 token->type = CHARACTER;
4544 return 1;
4545 }
4546 #endif /* RE_ENABLE_I18N */
4547
4548 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
4549 && re_string_cur_idx (input) + 1 < re_string_length (input))
4550 {
4551 /* In this case, '\' escape a character. */
4552 unsigned char c2;
4553 re_string_skip_bytes (input, 1);
4554 c2 = re_string_peek_byte (input, 0);
4555 token->opr.c = c2;
4556 token->type = CHARACTER;
4557 return 1;
4558 }
4559 if (c == '[') /* '[' is a special char in a bracket exps. */
4560 {
4561 unsigned char c2;
4562 int token_len;
4563 if (re_string_cur_idx (input) + 1 < re_string_length (input))
4564 c2 = re_string_peek_byte (input, 1);
4565 else
4566 c2 = 0;
4567 token->opr.c = c2;
4568 token_len = 2;
4569 switch (c2)
4570 {
4571 case '.':
4572 token->type = OP_OPEN_COLL_ELEM;
4573 break;
4574 case '=':
4575 token->type = OP_OPEN_EQUIV_CLASS;
4576 break;
4577 case ':':
4578 if (syntax & RE_CHAR_CLASSES)
4579 {
4580 token->type = OP_OPEN_CHAR_CLASS;
4581 break;
4582 }
4583 /* else fall through. */
4584 default:
4585 token->type = CHARACTER;
4586 token->opr.c = c;
4587 token_len = 1;
4588 break;
4589 }
4590 return token_len;
4591 }
4592 switch (c)
4593 {
4594 case '-':
4595 token->type = OP_CHARSET_RANGE;
4596 break;
4597 case ']':
4598 token->type = OP_CLOSE_BRACKET;
4599 break;
4600 case '^':
4601 token->type = OP_NON_MATCH_LIST;
4602 break;
4603 default:
4604 token->type = CHARACTER;
4605 }
4606 return 1;
4607 }
4608
4609 /* Functions for parser. */
4610
4611 /* Entry point of the parser.
4612 Parse the regular expression REGEXP and return the structure tree.
4613 If an error is occured, ERR is set by error code, and return NULL.
4614 This function build the following tree, from regular expression <reg_exp>:
4615 CAT
4616 / \
4617 / \
4618 <reg_exp> EOR
4619
4620 CAT means concatenation.
4621 EOR means end of regular expression. */
4622
4623 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)4624 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
4625 reg_errcode_t *err)
4626 {
4627 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4628 bin_tree_t *tree, *eor, *root;
4629 re_token_t current_token;
4630 dfa->syntax = syntax;
4631 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4632 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
4633 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4634 return NULL;
4635 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
4636 if (tree != NULL)
4637 root = create_tree (dfa, tree, eor, CONCAT);
4638 else
4639 root = eor;
4640 if (BE (eor == NULL || root == NULL, 0))
4641 {
4642 *err = REG_ESPACE;
4643 return NULL;
4644 }
4645 return root;
4646 }
4647
4648 /* This function build the following tree, from regular expression
4649 <branch1>|<branch2>:
4650 ALT
4651 / \
4652 / \
4653 <branch1> <branch2>
4654
4655 ALT means alternative, which represents the operator `|'. */
4656
4657 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4658 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4659 reg_syntax_t syntax, int nest, reg_errcode_t *err)
4660 {
4661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4662 bin_tree_t *tree, *branch = NULL;
4663 tree = parse_branch (regexp, preg, token, syntax, nest, err);
4664 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4665 return NULL;
4666
4667 while (token->type == OP_ALT)
4668 {
4669 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4670 if (token->type != OP_ALT && token->type != END_OF_RE
4671 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4672 {
4673 branch = parse_branch (regexp, preg, token, syntax, nest, err);
4674 if (BE (*err != REG_NOERROR && branch == NULL, 0))
4675 return NULL;
4676 }
4677 else
4678 branch = NULL;
4679 tree = create_tree (dfa, tree, branch, OP_ALT);
4680 if (BE (tree == NULL, 0))
4681 {
4682 *err = REG_ESPACE;
4683 return NULL;
4684 }
4685 }
4686 return tree;
4687 }
4688
4689 /* This function build the following tree, from regular expression
4690 <exp1><exp2>:
4691 CAT
4692 / \
4693 / \
4694 <exp1> <exp2>
4695
4696 CAT means concatenation. */
4697
4698 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4699 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
4700 reg_syntax_t syntax, int nest, reg_errcode_t *err)
4701 {
4702 bin_tree_t *tree, *exp;
4703 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4704 tree = parse_expression (regexp, preg, token, syntax, nest, err);
4705 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4706 return NULL;
4707
4708 while (token->type != OP_ALT && token->type != END_OF_RE
4709 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4710 {
4711 exp = parse_expression (regexp, preg, token, syntax, nest, err);
4712 if (BE (*err != REG_NOERROR && exp == NULL, 0))
4713 {
4714 return NULL;
4715 }
4716 if (tree != NULL && exp != NULL)
4717 {
4718 tree = create_tree (dfa, tree, exp, CONCAT);
4719 if (tree == NULL)
4720 {
4721 *err = REG_ESPACE;
4722 return NULL;
4723 }
4724 }
4725 else if (tree == NULL)
4726 tree = exp;
4727 /* Otherwise exp == NULL, we don't need to create new tree. */
4728 }
4729 return tree;
4730 }
4731
4732 /* This function build the following tree, from regular expression a*:
4733 *
4734 |
4735 a
4736 */
4737
4738 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4739 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
4740 reg_syntax_t syntax, int nest, reg_errcode_t *err)
4741 {
4742 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4743 bin_tree_t *tree;
4744 switch (token->type)
4745 {
4746 case CHARACTER:
4747 tree = create_token_tree (dfa, NULL, NULL, token);
4748 if (BE (tree == NULL, 0))
4749 {
4750 *err = REG_ESPACE;
4751 return NULL;
4752 }
4753 #ifdef RE_ENABLE_I18N
4754 if (dfa->mb_cur_max > 1)
4755 {
4756 while (!re_string_eoi (regexp)
4757 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
4758 {
4759 bin_tree_t *mbc_remain;
4760 fetch_token (token, regexp, syntax);
4761 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
4762 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
4763 if (BE (mbc_remain == NULL || tree == NULL, 0))
4764 {
4765 *err = REG_ESPACE;
4766 return NULL;
4767 }
4768 }
4769 }
4770 #endif
4771 break;
4772 case OP_OPEN_SUBEXP:
4773 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
4774 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4775 return NULL;
4776 break;
4777 case OP_OPEN_BRACKET:
4778 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
4779 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4780 return NULL;
4781 break;
4782 case OP_BACK_REF:
4783 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
4784 {
4785 *err = REG_ESUBREG;
4786 return NULL;
4787 }
4788 dfa->used_bkref_map |= 1 << token->opr.idx;
4789 tree = create_token_tree (dfa, NULL, NULL, token);
4790 if (BE (tree == NULL, 0))
4791 {
4792 *err = REG_ESPACE;
4793 return NULL;
4794 }
4795 ++dfa->nbackref;
4796 dfa->has_mb_node = 1;
4797 break;
4798 case OP_OPEN_DUP_NUM:
4799 if (syntax & RE_CONTEXT_INVALID_DUP)
4800 {
4801 *err = REG_BADRPT;
4802 return NULL;
4803 }
4804 /* FALLTHROUGH */
4805 case OP_DUP_ASTERISK:
4806 case OP_DUP_PLUS:
4807 case OP_DUP_QUESTION:
4808 if (syntax & RE_CONTEXT_INVALID_OPS)
4809 {
4810 *err = REG_BADRPT;
4811 return NULL;
4812 }
4813 else if (syntax & RE_CONTEXT_INDEP_OPS)
4814 {
4815 fetch_token (token, regexp, syntax);
4816 return parse_expression (regexp, preg, token, syntax, nest, err);
4817 }
4818 /* else fall through */
4819 case OP_CLOSE_SUBEXP:
4820 if ((token->type == OP_CLOSE_SUBEXP) &&
4821 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
4822 {
4823 *err = REG_ERPAREN;
4824 return NULL;
4825 }
4826 /* else fall through */
4827 case OP_CLOSE_DUP_NUM:
4828 /* We treat it as a normal character. */
4829
4830 /* Then we can these characters as normal characters. */
4831 token->type = CHARACTER;
4832 /* mb_partial and word_char bits should be initialized already
4833 by peek_token. */
4834 tree = create_token_tree (dfa, NULL, NULL, token);
4835 if (BE (tree == NULL, 0))
4836 {
4837 *err = REG_ESPACE;
4838 return NULL;
4839 }
4840 break;
4841 case ANCHOR:
4842 if ((token->opr.ctx_type
4843 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
4844 && dfa->word_ops_used == 0)
4845 init_word_char (dfa);
4846 if (token->opr.ctx_type == WORD_DELIM
4847 || token->opr.ctx_type == NOT_WORD_DELIM)
4848 {
4849 bin_tree_t *tree_first, *tree_last;
4850 if (token->opr.ctx_type == WORD_DELIM)
4851 {
4852 token->opr.ctx_type = WORD_FIRST;
4853 tree_first = create_token_tree (dfa, NULL, NULL, token);
4854 token->opr.ctx_type = WORD_LAST;
4855 }
4856 else
4857 {
4858 token->opr.ctx_type = INSIDE_WORD;
4859 tree_first = create_token_tree (dfa, NULL, NULL, token);
4860 token->opr.ctx_type = INSIDE_NOTWORD;
4861 }
4862 tree_last = create_token_tree (dfa, NULL, NULL, token);
4863 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
4864 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
4865 {
4866 *err = REG_ESPACE;
4867 return NULL;
4868 }
4869 }
4870 else
4871 {
4872 tree = create_token_tree (dfa, NULL, NULL, token);
4873 if (BE (tree == NULL, 0))
4874 {
4875 *err = REG_ESPACE;
4876 return NULL;
4877 }
4878 }
4879 /* We must return here, since ANCHORs can't be followed
4880 by repetition operators.
4881 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
4882 it must not be "<ANCHOR(^)><REPEAT(*)>". */
4883 fetch_token (token, regexp, syntax);
4884 return tree;
4885 case OP_PERIOD:
4886 tree = create_token_tree (dfa, NULL, NULL, token);
4887 if (BE (tree == NULL, 0))
4888 {
4889 *err = REG_ESPACE;
4890 return NULL;
4891 }
4892 if (dfa->mb_cur_max > 1)
4893 dfa->has_mb_node = 1;
4894 break;
4895 case OP_WORD:
4896 case OP_NOTWORD:
4897 tree = build_charclass_op (dfa, regexp->trans,
4898 (const unsigned char *) "alnum",
4899 (const unsigned char *) "_",
4900 token->type == OP_NOTWORD, err);
4901 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4902 return NULL;
4903 break;
4904 case OP_SPACE:
4905 case OP_NOTSPACE:
4906 tree = build_charclass_op (dfa, regexp->trans,
4907 (const unsigned char *) "space",
4908 (const unsigned char *) "",
4909 token->type == OP_NOTSPACE, err);
4910 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4911 return NULL;
4912 break;
4913 case OP_ALT:
4914 case END_OF_RE:
4915 return NULL;
4916 case BACK_SLASH:
4917 *err = REG_EESCAPE;
4918 return NULL;
4919 default:
4920 /* Must not happen? */
4921 #ifdef DEBUG
4922 assert (0);
4923 #endif
4924 return NULL;
4925 }
4926 fetch_token (token, regexp, syntax);
4927
4928 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
4929 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
4930 {
4931 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
4932 if (BE (*err != REG_NOERROR && tree == NULL, 0))
4933 return NULL;
4934 /* In BRE consecutive duplications are not allowed. */
4935 if ((syntax & RE_CONTEXT_INVALID_DUP)
4936 && (token->type == OP_DUP_ASTERISK
4937 || token->type == OP_OPEN_DUP_NUM))
4938 {
4939 *err = REG_BADRPT;
4940 return NULL;
4941 }
4942 }
4943
4944 return tree;
4945 }
4946
4947 /* This function build the following tree, from regular expression
4948 (<reg_exp>):
4949 SUBEXP
4950 |
4951 <reg_exp>
4952 */
4953
4954 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4955 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4956 reg_syntax_t syntax, int nest, reg_errcode_t *err)
4957 {
4958 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4959 bin_tree_t *tree;
4960 size_t cur_nsub;
4961 cur_nsub = preg->re_nsub++;
4962
4963 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4964
4965 /* The subexpression may be a null string. */
4966 if (token->type == OP_CLOSE_SUBEXP)
4967 tree = NULL;
4968 else
4969 {
4970 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
4971 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
4972 *err = REG_EPAREN;
4973 if (BE (*err != REG_NOERROR, 0))
4974 return NULL;
4975 }
4976
4977 if (cur_nsub <= '9' - '1')
4978 dfa->completed_bkref_map |= 1 << cur_nsub;
4979
4980 tree = create_tree (dfa, tree, NULL, SUBEXP);
4981 if (BE (tree == NULL, 0))
4982 {
4983 *err = REG_ESPACE;
4984 return NULL;
4985 }
4986 tree->token.opr.idx = cur_nsub;
4987 return tree;
4988 }
4989
4990 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
4991
4992 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)4993 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
4994 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
4995 {
4996 bin_tree_t *tree = NULL, *old_tree = NULL;
4997 int i, start, end, start_idx = re_string_cur_idx (regexp);
4998 re_token_t start_token = *token;
4999
5000 if (token->type == OP_OPEN_DUP_NUM)
5001 {
5002 end = 0;
5003 start = fetch_number (regexp, token, syntax);
5004 if (start == -1)
5005 {
5006 if (token->type == CHARACTER && token->opr.c == ',')
5007 start = 0; /* We treat "{,m}" as "{0,m}". */
5008 else
5009 {
5010 *err = REG_BADBR; /* <re>{} is invalid. */
5011 return NULL;
5012 }
5013 }
5014 if (BE (start != -2, 1))
5015 {
5016 /* We treat "{n}" as "{n,n}". */
5017 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
5018 : ((token->type == CHARACTER && token->opr.c == ',')
5019 ? fetch_number (regexp, token, syntax) : -2));
5020 }
5021 if (BE (start == -2 || end == -2, 0))
5022 {
5023 /* Invalid sequence. */
5024 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
5025 {
5026 if (token->type == END_OF_RE)
5027 *err = REG_EBRACE;
5028 else
5029 *err = REG_BADBR;
5030
5031 return NULL;
5032 }
5033
5034 /* If the syntax bit is set, rollback. */
5035 re_string_set_index (regexp, start_idx);
5036 *token = start_token;
5037 token->type = CHARACTER;
5038 /* mb_partial and word_char bits should be already initialized by
5039 peek_token. */
5040 return elem;
5041 }
5042
5043 if (BE (end != -1 && start > end, 0))
5044 {
5045 /* First number greater than second. */
5046 *err = REG_BADBR;
5047 return NULL;
5048 }
5049 }
5050 else
5051 {
5052 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
5053 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
5054 }
5055
5056 fetch_token (token, regexp, syntax);
5057
5058 if (BE (elem == NULL, 0))
5059 return NULL;
5060 if (BE (start == 0 && end == 0, 0))
5061 {
5062 postorder (elem, free_tree, NULL);
5063 return NULL;
5064 }
5065
5066 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
5067 if (BE (start > 0, 0))
5068 {
5069 tree = elem;
5070 for (i = 2; i <= start; ++i)
5071 {
5072 elem = duplicate_tree (elem, dfa);
5073 tree = create_tree (dfa, tree, elem, CONCAT);
5074 if (BE (elem == NULL || tree == NULL, 0))
5075 goto parse_dup_op_espace;
5076 }
5077
5078 if (start == end)
5079 return tree;
5080
5081 /* Duplicate ELEM before it is marked optional. */
5082 elem = duplicate_tree (elem, dfa);
5083 old_tree = tree;
5084 }
5085 else
5086 old_tree = NULL;
5087
5088 if (elem->token.type == SUBEXP)
5089 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
5090
5091 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
5092 if (BE (tree == NULL, 0))
5093 goto parse_dup_op_espace;
5094
5095 /* This loop is actually executed only when end != -1,
5096 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
5097 already created the start+1-th copy. */
5098 for (i = start + 2; i <= end; ++i)
5099 {
5100 elem = duplicate_tree (elem, dfa);
5101 tree = create_tree (dfa, tree, elem, CONCAT);
5102 if (BE (elem == NULL || tree == NULL, 0))
5103 goto parse_dup_op_espace;
5104
5105 tree = create_tree (dfa, tree, NULL, OP_ALT);
5106 if (BE (tree == NULL, 0))
5107 goto parse_dup_op_espace;
5108 }
5109
5110 if (old_tree)
5111 tree = create_tree (dfa, old_tree, tree, CONCAT);
5112
5113 return tree;
5114
5115 parse_dup_op_espace:
5116 *err = REG_ESPACE;
5117 return NULL;
5118 }
5119
/* Size of the buffer that holds the name of a collating symbol,
   equivalence class, or character class.  The value is a guess that is
   expected to be large enough.  */
5122 #define BRACKET_NAME_BUF_SIZE 32
5123
5124 #ifndef _LIBC
5125 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
5126 Build the range expression which starts from START_ELEM, and ends
5127 at END_ELEM. The result are written to MBCSET and SBCSET.
5128 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5129 mbcset->range_ends, is a pointer argument sinse we may
5130 update it. */
5131
5132 static reg_errcode_t
5133 internal_function
5134 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)5135 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
5136 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
5137 # else /* not RE_ENABLE_I18N */
5138 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
5139 bracket_elem_t *end_elem)
5140 # endif /* not RE_ENABLE_I18N */
5141 {
5142 unsigned int start_ch, end_ch;
5143 /* Equivalence Classes and Character Classes can't be a range start/end. */
5144 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5145 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5146 0))
5147 return REG_ERANGE;
5148
5149 /* We can handle no multi character collating elements without libc
5150 support. */
5151 if (BE ((start_elem->type == COLL_SYM
5152 && strlen ((char *) start_elem->opr.name) > 1)
5153 || (end_elem->type == COLL_SYM
5154 && strlen ((char *) end_elem->opr.name) > 1), 0))
5155 return REG_ECOLLATE;
5156
5157 # ifdef RE_ENABLE_I18N
5158 {
5159 wchar_t wc;
5160 wint_t start_wc;
5161 wint_t end_wc;
5162 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
5163
5164 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
5165 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5166 : 0));
5167 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
5168 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5169 : 0));
5170 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
5171 ? __btowc (start_ch) : start_elem->opr.wch);
5172 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
5173 ? __btowc (end_ch) : end_elem->opr.wch);
5174 if (start_wc == WEOF || end_wc == WEOF)
5175 return REG_ECOLLATE;
5176 cmp_buf[0] = start_wc;
5177 cmp_buf[4] = end_wc;
5178 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
5179 return REG_ERANGE;
5180
5181 /* Got valid collation sequence values, add them as a new entry.
5182 However, for !_LIBC we have no collation elements: if the
5183 character set is single byte, the single byte character set
5184 that we build below suffices. parse_bracket_exp passes
5185 no MBCSET if dfa->mb_cur_max == 1. */
5186 if (mbcset)
5187 {
5188 /* Check the space of the arrays. */
5189 if (BE (*range_alloc == mbcset->nranges, 0))
5190 {
5191 /* There is not enough space, need realloc. */
5192 wchar_t *new_array_start, *new_array_end;
5193 int new_nranges;
5194
5195 /* +1 in case of mbcset->nranges is 0. */
5196 new_nranges = 2 * mbcset->nranges + 1;
5197 /* Use realloc since mbcset->range_starts and mbcset->range_ends
5198 are NULL if *range_alloc == 0. */
5199 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
5200 new_nranges);
5201 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
5202 new_nranges);
5203
5204 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5205 return REG_ESPACE;
5206
5207 mbcset->range_starts = new_array_start;
5208 mbcset->range_ends = new_array_end;
5209 *range_alloc = new_nranges;
5210 }
5211
5212 mbcset->range_starts[mbcset->nranges] = start_wc;
5213 mbcset->range_ends[mbcset->nranges++] = end_wc;
5214 }
5215
5216 /* Build the table for single byte characters. */
5217 for (wc = 0; wc < SBC_MAX; ++wc)
5218 {
5219 cmp_buf[2] = wc;
5220 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
5221 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
5222 bitset_set (sbcset, wc);
5223 }
5224 }
5225 # else /* not RE_ENABLE_I18N */
5226 {
5227 unsigned int ch;
5228 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
5229 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5230 : 0));
5231 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
5232 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5233 : 0));
5234 if (start_ch > end_ch)
5235 return REG_ERANGE;
5236 /* Build the table for single byte characters. */
5237 for (ch = 0; ch < SBC_MAX; ++ch)
5238 if (start_ch <= ch && ch <= end_ch)
5239 bitset_set (sbcset, ch);
5240 }
5241 # endif /* not RE_ENABLE_I18N */
5242 return REG_NOERROR;
5243 }
5244 #endif /* not _LIBC */
5245
5246 #ifndef _LIBC
5247 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
5248 Build the collating element which is represented by NAME.
5249 The result are written to MBCSET and SBCSET.
5250 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5251 pointer argument since we may update it. */
5252
5253 static reg_errcode_t
5254 internal_function
5255 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)5256 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
5257 int *coll_sym_alloc, const unsigned char *name)
5258 # else /* not RE_ENABLE_I18N */
5259 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
5260 # endif /* not RE_ENABLE_I18N */
5261 {
5262 size_t name_len = strlen ((const char *) name);
5263 if (BE (name_len != 1, 0))
5264 return REG_ECOLLATE;
5265 else
5266 {
5267 bitset_set (sbcset, name[0]);
5268 return REG_NOERROR;
5269 }
5270 }
5271 #endif /* not _LIBC */
5272
5273 /* This function parse bracket expression like "[abc]", "[a-c]",
5274 "[[.a-a.]]" etc. */
5275
5276 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)5277 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
5278 reg_syntax_t syntax, reg_errcode_t *err)
5279 {
5280 #ifdef _LIBC
5281 const unsigned char *collseqmb;
5282 const char *collseqwc;
5283 uint32_t nrules;
5284 int32_t table_size;
5285 const int32_t *symb_table;
5286 const unsigned char *extra;
5287
5288 /* Local function for parse_bracket_exp used in _LIBC environement.
5289 Seek the collating symbol entry correspondings to NAME.
5290 Return the index of the symbol in the SYMB_TABLE. */
5291
5292 auto inline int32_t
5293 __attribute ((always_inline))
5294 seek_collating_symbol_entry (name, name_len)
5295 const unsigned char *name;
5296 size_t name_len;
5297 {
5298 int32_t hash = elem_hash ((const char *) name, name_len);
5299 int32_t elem = hash % table_size;
5300 if (symb_table[2 * elem] != 0)
5301 {
5302 int32_t second = hash % (table_size - 2) + 1;
5303
5304 do
5305 {
5306 /* First compare the hashing value. */
5307 if (symb_table[2 * elem] == hash
5308 /* Compare the length of the name. */
5309 && name_len == extra[symb_table[2 * elem + 1]]
5310 /* Compare the name. */
5311 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
5312 name_len) == 0)
5313 {
5314 /* Yep, this is the entry. */
5315 break;
5316 }
5317
5318 /* Next entry. */
5319 elem += second;
5320 }
5321 while (symb_table[2 * elem] != 0);
5322 }
5323 return elem;
5324 }
5325
5326 /* Local function for parse_bracket_exp used in _LIBC environement.
5327 Look up the collation sequence value of BR_ELEM.
5328 Return the value if succeeded, UINT_MAX otherwise. */
5329
5330 auto inline unsigned int
5331 __attribute ((always_inline))
5332 lookup_collation_sequence_value (br_elem)
5333 bracket_elem_t *br_elem;
5334 {
5335 if (br_elem->type == SB_CHAR)
5336 {
5337 /*
5338 if (MB_CUR_MAX == 1)
5339 */
5340 if (nrules == 0)
5341 return collseqmb[br_elem->opr.ch];
5342 else
5343 {
5344 wint_t wc = __btowc (br_elem->opr.ch);
5345 return __collseq_table_lookup (collseqwc, wc);
5346 }
5347 }
5348 else if (br_elem->type == MB_CHAR)
5349 {
5350 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
5351 }
5352 else if (br_elem->type == COLL_SYM)
5353 {
5354 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
5355 if (nrules != 0)
5356 {
5357 int32_t elem, idx;
5358 elem = seek_collating_symbol_entry (br_elem->opr.name,
5359 sym_name_len);
5360 if (symb_table[2 * elem] != 0)
5361 {
5362 /* We found the entry. */
5363 idx = symb_table[2 * elem + 1];
5364 /* Skip the name of collating element name. */
5365 idx += 1 + extra[idx];
5366 /* Skip the byte sequence of the collating element. */
5367 idx += 1 + extra[idx];
5368 /* Adjust for the alignment. */
5369 idx = (idx + 3) & ~3;
5370 /* Skip the multibyte collation sequence value. */
5371 idx += sizeof (unsigned int);
5372 /* Skip the wide char sequence of the collating element. */
5373 idx += sizeof (unsigned int) *
5374 (1 + *(unsigned int *) (extra + idx));
5375 /* Return the collation sequence value. */
5376 return *(unsigned int *) (extra + idx);
5377 }
5378 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
5379 {
5380 /* No valid character. Match it as a single byte
5381 character. */
5382 return collseqmb[br_elem->opr.name[0]];
5383 }
5384 }
5385 else if (sym_name_len == 1)
5386 return collseqmb[br_elem->opr.name[0]];
5387 }
5388 return UINT_MAX;
5389 }
5390
5391 /* Local function for parse_bracket_exp used in _LIBC environement.
5392 Build the range expression which starts from START_ELEM, and ends
5393 at END_ELEM. The result are written to MBCSET and SBCSET.
5394 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5395 mbcset->range_ends, is a pointer argument sinse we may
5396 update it. */
5397
5398 auto inline reg_errcode_t
5399 __attribute ((always_inline))
5400 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
5401 re_charset_t *mbcset;
5402 int *range_alloc;
5403 bitset_t sbcset;
5404 bracket_elem_t *start_elem, *end_elem;
5405 {
5406 unsigned int ch;
5407 uint32_t start_collseq;
5408 uint32_t end_collseq;
5409
5410 /* Equivalence Classes and Character Classes can't be a range
5411 start/end. */
5412 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5413 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5414 0))
5415 return REG_ERANGE;
5416
5417 start_collseq = lookup_collation_sequence_value (start_elem);
5418 end_collseq = lookup_collation_sequence_value (end_elem);
5419 /* Check start/end collation sequence values. */
5420 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
5421 return REG_ECOLLATE;
5422 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
5423 return REG_ERANGE;
5424
5425 /* Got valid collation sequence values, add them as a new entry.
5426 However, if we have no collation elements, and the character set
5427 is single byte, the single byte character set that we
5428 build below suffices. */
5429 if (nrules > 0 || dfa->mb_cur_max > 1)
5430 {
5431 /* Check the space of the arrays. */
5432 if (BE (*range_alloc == mbcset->nranges, 0))
5433 {
5434 /* There is not enough space, need realloc. */
5435 uint32_t *new_array_start;
5436 uint32_t *new_array_end;
5437 int new_nranges;
5438
5439 /* +1 in case of mbcset->nranges is 0. */
5440 new_nranges = 2 * mbcset->nranges + 1;
5441 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
5442 new_nranges);
5443 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
5444 new_nranges);
5445
5446 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5447 return REG_ESPACE;
5448
5449 mbcset->range_starts = new_array_start;
5450 mbcset->range_ends = new_array_end;
5451 *range_alloc = new_nranges;
5452 }
5453
5454 mbcset->range_starts[mbcset->nranges] = start_collseq;
5455 mbcset->range_ends[mbcset->nranges++] = end_collseq;
5456 }
5457
5458 /* Build the table for single byte characters. */
5459 for (ch = 0; ch < SBC_MAX; ch++)
5460 {
5461 uint32_t ch_collseq;
5462 /*
5463 if (MB_CUR_MAX == 1)
5464 */
5465 if (nrules == 0)
5466 ch_collseq = collseqmb[ch];
5467 else
5468 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
5469 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
5470 bitset_set (sbcset, ch);
5471 }
5472 return REG_NOERROR;
5473 }
5474
5475 /* Local function for parse_bracket_exp used in _LIBC environement.
5476 Build the collating element which is represented by NAME.
5477 The result are written to MBCSET and SBCSET.
5478 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5479 pointer argument sinse we may update it. */
5480
5481 auto inline reg_errcode_t
5482 __attribute ((always_inline))
5483 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
5484 re_charset_t *mbcset;
5485 int *coll_sym_alloc;
5486 bitset_t sbcset;
5487 const unsigned char *name;
5488 {
5489 int32_t elem, idx;
5490 size_t name_len = strlen ((const char *) name);
5491 if (nrules != 0)
5492 {
5493 elem = seek_collating_symbol_entry (name, name_len);
5494 if (symb_table[2 * elem] != 0)
5495 {
5496 /* We found the entry. */
5497 idx = symb_table[2 * elem + 1];
5498 /* Skip the name of collating element name. */
5499 idx += 1 + extra[idx];
5500 }
5501 else if (symb_table[2 * elem] == 0 && name_len == 1)
5502 {
5503 /* No valid character, treat it as a normal
5504 character. */
5505 bitset_set (sbcset, name[0]);
5506 return REG_NOERROR;
5507 }
5508 else
5509 return REG_ECOLLATE;
5510
5511 /* Got valid collation sequence, add it as a new entry. */
5512 /* Check the space of the arrays. */
5513 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
5514 {
5515 /* Not enough, realloc it. */
5516 /* +1 in case of mbcset->ncoll_syms is 0. */
5517 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
5518 /* Use realloc since mbcset->coll_syms is NULL
5519 if *alloc == 0. */
5520 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
5521 new_coll_sym_alloc);
5522 if (BE (new_coll_syms == NULL, 0))
5523 return REG_ESPACE;
5524 mbcset->coll_syms = new_coll_syms;
5525 *coll_sym_alloc = new_coll_sym_alloc;
5526 }
5527 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
5528 return REG_NOERROR;
5529 }
5530 else
5531 {
5532 if (BE (name_len != 1, 0))
5533 return REG_ECOLLATE;
5534 else
5535 {
5536 bitset_set (sbcset, name[0]);
5537 return REG_NOERROR;
5538 }
5539 }
5540 }
5541 #endif
5542
5543 re_token_t br_token;
5544 re_bitset_ptr_t sbcset;
5545 #ifdef RE_ENABLE_I18N
5546 re_charset_t *mbcset;
5547 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
5548 int equiv_class_alloc = 0, char_class_alloc = 0;
5549 #endif /* not RE_ENABLE_I18N */
5550 int non_match = 0;
5551 bin_tree_t *work_tree;
5552 int token_len;
5553 int first_round = 1;
5554 #ifdef _LIBC
5555 collseqmb = (const unsigned char *)
5556 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
5557 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5558 if (nrules)
5559 {
5560 /*
5561 if (MB_CUR_MAX > 1)
5562 */
5563 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
5564 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
5565 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5566 _NL_COLLATE_SYMB_TABLEMB);
5567 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5568 _NL_COLLATE_SYMB_EXTRAMB);
5569 }
5570 #endif
5571 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
5572 #ifdef RE_ENABLE_I18N
5573 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
5574 #endif /* RE_ENABLE_I18N */
5575 #ifdef RE_ENABLE_I18N
5576 if (BE (sbcset == NULL || mbcset == NULL, 0))
5577 #else
5578 if (BE (sbcset == NULL, 0))
5579 #endif /* RE_ENABLE_I18N */
5580 {
5581 *err = REG_ESPACE;
5582 return NULL;
5583 }
5584
5585 token_len = peek_token_bracket (token, regexp, syntax);
5586 if (BE (token->type == END_OF_RE, 0))
5587 {
5588 *err = REG_BADPAT;
5589 goto parse_bracket_exp_free_return;
5590 }
5591 if (token->type == OP_NON_MATCH_LIST)
5592 {
5593 #ifdef RE_ENABLE_I18N
5594 mbcset->non_match = 1;
5595 #endif /* not RE_ENABLE_I18N */
5596 non_match = 1;
5597 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5598 bitset_set (sbcset, '\0');
5599 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5600 token_len = peek_token_bracket (token, regexp, syntax);
5601 if (BE (token->type == END_OF_RE, 0))
5602 {
5603 *err = REG_BADPAT;
5604 goto parse_bracket_exp_free_return;
5605 }
5606 }
5607
5608 /* We treat the first ']' as a normal character. */
5609 if (token->type == OP_CLOSE_BRACKET)
5610 token->type = CHARACTER;
5611
5612 while (1)
5613 {
5614 bracket_elem_t start_elem, end_elem;
5615 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
5616 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
5617 reg_errcode_t ret;
5618 int token_len2 = 0, is_range_exp = 0;
5619 re_token_t token2;
5620
5621 start_elem.opr.name = start_name_buf;
5622 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
5623 syntax, first_round);
5624 if (BE (ret != REG_NOERROR, 0))
5625 {
5626 *err = ret;
5627 goto parse_bracket_exp_free_return;
5628 }
5629 first_round = 0;
5630
5631 /* Get information about the next token. We need it in any case. */
5632 token_len = peek_token_bracket (token, regexp, syntax);
5633
5634 /* Do not check for ranges if we know they are not allowed. */
5635 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
5636 {
5637 if (BE (token->type == END_OF_RE, 0))
5638 {
5639 *err = REG_EBRACK;
5640 goto parse_bracket_exp_free_return;
5641 }
5642 if (token->type == OP_CHARSET_RANGE)
5643 {
5644 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
5645 token_len2 = peek_token_bracket (&token2, regexp, syntax);
5646 if (BE (token2.type == END_OF_RE, 0))
5647 {
5648 *err = REG_EBRACK;
5649 goto parse_bracket_exp_free_return;
5650 }
5651 if (token2.type == OP_CLOSE_BRACKET)
5652 {
5653 /* We treat the last '-' as a normal character. */
5654 re_string_skip_bytes (regexp, -token_len);
5655 token->type = CHARACTER;
5656 }
5657 else
5658 is_range_exp = 1;
5659 }
5660 }
5661
5662 if (is_range_exp == 1)
5663 {
5664 end_elem.opr.name = end_name_buf;
5665 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
5666 dfa, syntax, 1);
5667 if (BE (ret != REG_NOERROR, 0))
5668 {
5669 *err = ret;
5670 goto parse_bracket_exp_free_return;
5671 }
5672
5673 token_len = peek_token_bracket (token, regexp, syntax);
5674
5675 #ifdef _LIBC
5676 *err = build_range_exp (sbcset, mbcset, &range_alloc,
5677 &start_elem, &end_elem);
5678 #else
5679 # ifdef RE_ENABLE_I18N
5680 *err = build_range_exp (sbcset,
5681 dfa->mb_cur_max > 1 ? mbcset : NULL,
5682 &range_alloc, &start_elem, &end_elem);
5683 # else
5684 *err = build_range_exp (sbcset, &start_elem, &end_elem);
5685 # endif
5686 #endif /* RE_ENABLE_I18N */
5687 if (BE (*err != REG_NOERROR, 0))
5688 goto parse_bracket_exp_free_return;
5689 }
5690 else
5691 {
5692 switch (start_elem.type)
5693 {
5694 case SB_CHAR:
5695 bitset_set (sbcset, start_elem.opr.ch);
5696 break;
5697 #ifdef RE_ENABLE_I18N
5698 case MB_CHAR:
5699 /* Check whether the array has enough space. */
5700 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
5701 {
5702 wchar_t *new_mbchars;
5703 /* Not enough, realloc it. */
5704 /* +1 in case of mbcset->nmbchars is 0. */
5705 mbchar_alloc = 2 * mbcset->nmbchars + 1;
5706 /* Use realloc since array is NULL if *alloc == 0. */
5707 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
5708 mbchar_alloc);
5709 if (BE (new_mbchars == NULL, 0))
5710 goto parse_bracket_exp_espace;
5711 mbcset->mbchars = new_mbchars;
5712 }
5713 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
5714 break;
5715 #endif /* RE_ENABLE_I18N */
5716 case EQUIV_CLASS:
5717 *err = build_equiv_class (sbcset,
5718 #ifdef RE_ENABLE_I18N
5719 mbcset, &equiv_class_alloc,
5720 #endif /* RE_ENABLE_I18N */
5721 start_elem.opr.name);
5722 if (BE (*err != REG_NOERROR, 0))
5723 goto parse_bracket_exp_free_return;
5724 break;
5725 case COLL_SYM:
5726 *err = build_collating_symbol (sbcset,
5727 #ifdef RE_ENABLE_I18N
5728 mbcset, &coll_sym_alloc,
5729 #endif /* RE_ENABLE_I18N */
5730 start_elem.opr.name);
5731 if (BE (*err != REG_NOERROR, 0))
5732 goto parse_bracket_exp_free_return;
5733 break;
5734 case CHAR_CLASS:
5735 *err = build_charclass (regexp->trans, sbcset,
5736 #ifdef RE_ENABLE_I18N
5737 mbcset, &char_class_alloc,
5738 #endif /* RE_ENABLE_I18N */
5739 start_elem.opr.name, syntax);
5740 if (BE (*err != REG_NOERROR, 0))
5741 goto parse_bracket_exp_free_return;
5742 break;
5743 default:
5744 assert (0);
5745 break;
5746 }
5747 }
5748 if (BE (token->type == END_OF_RE, 0))
5749 {
5750 *err = REG_EBRACK;
5751 goto parse_bracket_exp_free_return;
5752 }
5753 if (token->type == OP_CLOSE_BRACKET)
5754 break;
5755 }
5756
5757 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5758
5759 /* If it is non-matching list. */
5760 if (non_match)
5761 bitset_not (sbcset);
5762
5763 #ifdef RE_ENABLE_I18N
5764 /* Ensure only single byte characters are set. */
5765 if (dfa->mb_cur_max > 1)
5766 bitset_mask (sbcset, dfa->sb_char);
5767
5768 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
5769 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
5770 || mbcset->non_match)))
5771 {
5772 bin_tree_t *mbc_tree;
5773 int sbc_idx;
5774 /* Build a tree for complex bracket. */
5775 dfa->has_mb_node = 1;
5776 br_token.type = COMPLEX_BRACKET;
5777 br_token.opr.mbcset = mbcset;
5778 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5779 if (BE (mbc_tree == NULL, 0))
5780 goto parse_bracket_exp_espace;
5781 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
5782 if (sbcset[sbc_idx])
5783 break;
5784 /* If there are no bits set in sbcset, there is no point
5785 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
5786 if (sbc_idx < BITSET_WORDS)
5787 {
5788 /* Build a tree for simple bracket. */
5789 br_token.type = SIMPLE_BRACKET;
5790 br_token.opr.sbcset = sbcset;
5791 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5792 if (BE (work_tree == NULL, 0))
5793 goto parse_bracket_exp_espace;
5794
5795 /* Then join them by ALT node. */
5796 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
5797 if (BE (work_tree == NULL, 0))
5798 goto parse_bracket_exp_espace;
5799 }
5800 else
5801 {
5802 re_free (sbcset);
5803 work_tree = mbc_tree;
5804 }
5805 }
5806 else
5807 #endif /* not RE_ENABLE_I18N */
5808 {
5809 #ifdef RE_ENABLE_I18N
5810 free_charset (mbcset);
5811 #endif
5812 /* Build a tree for simple bracket. */
5813 br_token.type = SIMPLE_BRACKET;
5814 br_token.opr.sbcset = sbcset;
5815 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5816 if (BE (work_tree == NULL, 0))
5817 goto parse_bracket_exp_espace;
5818 }
5819 return work_tree;
5820
5821 parse_bracket_exp_espace:
5822 *err = REG_ESPACE;
5823 parse_bracket_exp_free_return:
5824 re_free (sbcset);
5825 #ifdef RE_ENABLE_I18N
5826 free_charset (mbcset);
5827 #endif /* RE_ENABLE_I18N */
5828 return NULL;
5829 }
5830
5831 /* Parse an element in the bracket expression. */
5832
5833 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)5834 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
5835 re_token_t *token, int token_len, re_dfa_t *dfa,
5836 reg_syntax_t syntax, int accept_hyphen)
5837 {
5838 #ifdef RE_ENABLE_I18N
5839 int cur_char_size;
5840 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
5841 if (cur_char_size > 1)
5842 {
5843 elem->type = MB_CHAR;
5844 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
5845 re_string_skip_bytes (regexp, cur_char_size);
5846 return REG_NOERROR;
5847 }
5848 #endif /* RE_ENABLE_I18N */
5849 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5850 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
5851 || token->type == OP_OPEN_EQUIV_CLASS)
5852 return parse_bracket_symbol (elem, regexp, token);
5853 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
5854 {
5855 /* A '-' must only appear as anything but a range indicator before
5856 the closing bracket. Everything else is an error. */
5857 re_token_t token2;
5858 (void) peek_token_bracket (&token2, regexp, syntax);
5859 if (token2.type != OP_CLOSE_BRACKET)
5860 /* The actual error value is not standardized since this whole
5861 case is undefined. But ERANGE makes good sense. */
5862 return REG_ERANGE;
5863 }
5864 elem->type = SB_CHAR;
5865 elem->opr.ch = token->opr.c;
5866 return REG_NOERROR;
5867 }
5868
5869 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
5870 such as [:<character_class>:], [.<collating_element>.], and
5871 [=<equivalent_class>=]. */
5872
5873 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)5874 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
5875 re_token_t *token)
5876 {
5877 unsigned char ch, delim = token->opr.c;
5878 int i = 0;
5879 if (re_string_eoi(regexp))
5880 return REG_EBRACK;
5881 for (;; ++i)
5882 {
5883 if (i >= BRACKET_NAME_BUF_SIZE)
5884 return REG_EBRACK;
5885 if (token->type == OP_OPEN_CHAR_CLASS)
5886 ch = re_string_fetch_byte_case (regexp);
5887 else
5888 ch = re_string_fetch_byte (regexp);
5889 if (re_string_eoi(regexp))
5890 return REG_EBRACK;
5891 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
5892 break;
5893 elem->opr.name[i] = ch;
5894 }
5895 re_string_skip_bytes (regexp, 1);
5896 elem->opr.name[i] = '\0';
5897 switch (token->type)
5898 {
5899 case OP_OPEN_COLL_ELEM:
5900 elem->type = COLL_SYM;
5901 break;
5902 case OP_OPEN_EQUIV_CLASS:
5903 elem->type = EQUIV_CLASS;
5904 break;
5905 case OP_OPEN_CHAR_CLASS:
5906 elem->type = CHAR_CLASS;
5907 break;
5908 default:
5909 break;
5910 }
5911 return REG_NOERROR;
5912 }
5913
5914 /* Helper function for parse_bracket_exp.
5915 Build the equivalence class which is represented by NAME.
5916 The result are written to MBCSET and SBCSET.
5917 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
5918 is a pointer argument sinse we may update it. */
5919
5920 static reg_errcode_t
5921 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)5922 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
5923 int *equiv_class_alloc, const unsigned char *name)
5924 #else /* not RE_ENABLE_I18N */
5925 build_equiv_class (bitset_t sbcset, const unsigned char *name)
5926 #endif /* not RE_ENABLE_I18N */
5927 {
5928 #ifdef _LIBC
5929 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5930 if (nrules != 0)
5931 {
5932 const int32_t *table, *indirect;
5933 const unsigned char *weights, *extra, *cp;
5934 unsigned char char_buf[2];
5935 int32_t idx1, idx2;
5936 unsigned int ch;
5937 size_t len;
5938 /* This #include defines a local function! */
5939 # include <locale/weight.h>
5940 /* Calculate the index for equivalence class. */
5941 cp = name;
5942 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
5943 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5944 _NL_COLLATE_WEIGHTMB);
5945 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5946 _NL_COLLATE_EXTRAMB);
5947 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5948 _NL_COLLATE_INDIRECTMB);
5949 idx1 = findidx (&cp);
5950 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
5951 /* This isn't a valid character. */
5952 return REG_ECOLLATE;
5953
5954 /* Build single byte matcing table for this equivalence class. */
5955 char_buf[1] = (unsigned char) '\0';
5956 len = weights[idx1];
5957 for (ch = 0; ch < SBC_MAX; ++ch)
5958 {
5959 char_buf[0] = ch;
5960 cp = char_buf;
5961 idx2 = findidx (&cp);
5962 /*
5963 idx2 = table[ch];
5964 */
5965 if (idx2 == 0)
5966 /* This isn't a valid character. */
5967 continue;
5968 if (len == weights[idx2])
5969 {
5970 int cnt = 0;
5971 while (cnt <= len &&
5972 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
5973 ++cnt;
5974
5975 if (cnt > len)
5976 bitset_set (sbcset, ch);
5977 }
5978 }
5979 /* Check whether the array has enough space. */
5980 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
5981 {
5982 /* Not enough, realloc it. */
5983 /* +1 in case of mbcset->nequiv_classes is 0. */
5984 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
5985 /* Use realloc since the array is NULL if *alloc == 0. */
5986 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
5987 int32_t,
5988 new_equiv_class_alloc);
5989 if (BE (new_equiv_classes == NULL, 0))
5990 return REG_ESPACE;
5991 mbcset->equiv_classes = new_equiv_classes;
5992 *equiv_class_alloc = new_equiv_class_alloc;
5993 }
5994 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
5995 }
5996 else
5997 #endif /* _LIBC */
5998 {
5999 if (BE (strlen ((const char *) name) != 1, 0))
6000 return REG_ECOLLATE;
6001 bitset_set (sbcset, *name);
6002 }
6003 return REG_NOERROR;
6004 }
6005
6006 /* Helper function for parse_bracket_exp.
6007 Build the character class which is represented by NAME.
6008 The result are written to MBCSET and SBCSET.
6009 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
6010 is a pointer argument sinse we may update it. */
6011
6012 static reg_errcode_t
6013 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)6014 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6015 re_charset_t *mbcset, int *char_class_alloc,
6016 const unsigned char *class_name, reg_syntax_t syntax)
6017 #else /* not RE_ENABLE_I18N */
6018 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6019 const unsigned char *class_name, reg_syntax_t syntax)
6020 #endif /* not RE_ENABLE_I18N */
6021 {
6022 int i;
6023 const char *name = (const char *) class_name;
6024
6025 /* In case of REG_ICASE "upper" and "lower" match the both of
6026 upper and lower cases. */
6027 if ((syntax & RE_ICASE)
6028 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
6029 name = "alpha";
6030
6031 #ifdef RE_ENABLE_I18N
6032 /* Check the space of the arrays. */
6033 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
6034 {
6035 /* Not enough, realloc it. */
6036 /* +1 in case of mbcset->nchar_classes is 0. */
6037 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
6038 /* Use realloc since array is NULL if *alloc == 0. */
6039 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
6040 new_char_class_alloc);
6041 if (BE (new_char_classes == NULL, 0))
6042 return REG_ESPACE;
6043 mbcset->char_classes = new_char_classes;
6044 *char_class_alloc = new_char_class_alloc;
6045 }
6046 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
6047 #endif /* RE_ENABLE_I18N */
6048
6049 #define BUILD_CHARCLASS_LOOP(ctype_func) \
6050 do { \
6051 if (BE (trans != NULL, 0)) \
6052 { \
6053 for (i = 0; i < SBC_MAX; ++i) \
6054 if (ctype_func (i)) \
6055 bitset_set (sbcset, trans[i]); \
6056 } \
6057 else \
6058 { \
6059 for (i = 0; i < SBC_MAX; ++i) \
6060 if (ctype_func (i)) \
6061 bitset_set (sbcset, i); \
6062 } \
6063 } while (0)
6064
6065 if (strcmp (name, "alnum") == 0)
6066 BUILD_CHARCLASS_LOOP (isalnum);
6067 else if (strcmp (name, "cntrl") == 0)
6068 BUILD_CHARCLASS_LOOP (iscntrl);
6069 else if (strcmp (name, "lower") == 0)
6070 BUILD_CHARCLASS_LOOP (islower);
6071 else if (strcmp (name, "space") == 0)
6072 BUILD_CHARCLASS_LOOP (isspace);
6073 else if (strcmp (name, "alpha") == 0)
6074 BUILD_CHARCLASS_LOOP (isalpha);
6075 else if (strcmp (name, "digit") == 0)
6076 BUILD_CHARCLASS_LOOP (isdigit);
6077 else if (strcmp (name, "print") == 0)
6078 BUILD_CHARCLASS_LOOP (isprint);
6079 else if (strcmp (name, "upper") == 0)
6080 BUILD_CHARCLASS_LOOP (isupper);
6081 else if (strcmp (name, "blank") == 0)
6082 BUILD_CHARCLASS_LOOP (isblank);
6083 else if (strcmp (name, "graph") == 0)
6084 BUILD_CHARCLASS_LOOP (isgraph);
6085 else if (strcmp (name, "punct") == 0)
6086 BUILD_CHARCLASS_LOOP (ispunct);
6087 else if (strcmp (name, "xdigit") == 0)
6088 BUILD_CHARCLASS_LOOP (isxdigit);
6089 else
6090 return REG_ECTYPE;
6091
6092 return REG_NOERROR;
6093 }
6094
6095 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)6096 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
6097 const unsigned char *class_name,
6098 const unsigned char *extra, int non_match,
6099 reg_errcode_t *err)
6100 {
6101 re_bitset_ptr_t sbcset;
6102 #ifdef RE_ENABLE_I18N
6103 re_charset_t *mbcset;
6104 int alloc = 0;
6105 #endif /* not RE_ENABLE_I18N */
6106 reg_errcode_t ret;
6107 re_token_t br_token;
6108 bin_tree_t *tree;
6109
6110 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
6111 #ifdef RE_ENABLE_I18N
6112 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
6113 #endif /* RE_ENABLE_I18N */
6114
6115 #ifdef RE_ENABLE_I18N
6116 if (BE (sbcset == NULL || mbcset == NULL, 0))
6117 #else /* not RE_ENABLE_I18N */
6118 if (BE (sbcset == NULL, 0))
6119 #endif /* not RE_ENABLE_I18N */
6120 {
6121 *err = REG_ESPACE;
6122 return NULL;
6123 }
6124
6125 if (non_match)
6126 {
6127 #ifdef RE_ENABLE_I18N
6128 /*
6129 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
6130 bitset_set(cset->sbcset, '\0');
6131 */
6132 mbcset->non_match = 1;
6133 #endif /* not RE_ENABLE_I18N */
6134 }
6135
6136 /* We don't care the syntax in this case. */
6137 ret = build_charclass (trans, sbcset,
6138 #ifdef RE_ENABLE_I18N
6139 mbcset, &alloc,
6140 #endif /* RE_ENABLE_I18N */
6141 class_name, 0);
6142
6143 if (BE (ret != REG_NOERROR, 0))
6144 {
6145 re_free (sbcset);
6146 #ifdef RE_ENABLE_I18N
6147 free_charset (mbcset);
6148 #endif /* RE_ENABLE_I18N */
6149 *err = ret;
6150 return NULL;
6151 }
6152 /* \w match '_' also. */
6153 for (; *extra; extra++)
6154 bitset_set (sbcset, *extra);
6155
6156 /* If it is non-matching list. */
6157 if (non_match)
6158 bitset_not (sbcset);
6159
6160 #ifdef RE_ENABLE_I18N
6161 /* Ensure only single byte characters are set. */
6162 if (dfa->mb_cur_max > 1)
6163 bitset_mask (sbcset, dfa->sb_char);
6164 #endif
6165
6166 /* Build a tree for simple bracket. */
6167 br_token.type = SIMPLE_BRACKET;
6168 br_token.opr.sbcset = sbcset;
6169 tree = create_token_tree (dfa, NULL, NULL, &br_token);
6170 if (BE (tree == NULL, 0))
6171 goto build_word_op_espace;
6172
6173 #ifdef RE_ENABLE_I18N
6174 if (dfa->mb_cur_max > 1)
6175 {
6176 bin_tree_t *mbc_tree;
6177 /* Build a tree for complex bracket. */
6178 br_token.type = COMPLEX_BRACKET;
6179 br_token.opr.mbcset = mbcset;
6180 dfa->has_mb_node = 1;
6181 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
6182 if (BE (mbc_tree == NULL, 0))
6183 goto build_word_op_espace;
6184 /* Then join them by ALT node. */
6185 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
6186 if (BE (mbc_tree != NULL, 1))
6187 return tree;
6188 }
6189 else
6190 {
6191 free_charset (mbcset);
6192 return tree;
6193 }
6194 #else /* not RE_ENABLE_I18N */
6195 return tree;
6196 #endif /* not RE_ENABLE_I18N */
6197
6198 build_word_op_espace:
6199 re_free (sbcset);
6200 #ifdef RE_ENABLE_I18N
6201 free_charset (mbcset);
6202 #endif /* RE_ENABLE_I18N */
6203 *err = REG_ESPACE;
6204 return NULL;
6205 }
6206
6207 /* This is intended for the expressions like "a{1,3}".
6208 Fetch a number from `input', and return the number.
6209 Return -1, if the number field is empty like "{,1}".
6210 Return -2, If an error is occured. */
6211
6212 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)6213 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
6214 {
6215 int num = -1;
6216 unsigned char c;
6217 while (1)
6218 {
6219 fetch_token (token, input, syntax);
6220 c = token->opr.c;
6221 if (BE (token->type == END_OF_RE, 0))
6222 return -2;
6223 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
6224 break;
6225 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
6226 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
6227 num = (num > RE_DUP_MAX) ? -2 : num;
6228 }
6229 return num;
6230 }
6231
#ifdef RE_ENABLE_I18N
/* Release every array owned by CSET and then CSET itself.  */
static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
6247
6248 /* Functions for binary tree operation. */
6249
6250 /* Create a tree node. */
6251
6252 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)6253 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6254 re_token_type_t type)
6255 {
6256 re_token_t t;
6257 t.type = type;
6258 return create_token_tree (dfa, left, right, &t);
6259 }
6260
6261 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)6262 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6263 const re_token_t *token)
6264 {
6265 bin_tree_t *tree;
6266 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
6267 {
6268 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
6269
6270 if (storage == NULL)
6271 return NULL;
6272 storage->next = dfa->str_tree_storage;
6273 dfa->str_tree_storage = storage;
6274 dfa->str_tree_storage_idx = 0;
6275 }
6276 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
6277
6278 tree->parent = NULL;
6279 tree->left = left;
6280 tree->right = right;
6281 tree->token = *token;
6282 tree->token.duplicated = 0;
6283 tree->token.opt_subexp = 0;
6284 tree->first = NULL;
6285 tree->next = NULL;
6286 tree->node_idx = -1;
6287
6288 if (left != NULL)
6289 left->parent = tree;
6290 if (right != NULL)
6291 right->parent = tree;
6292 return tree;
6293 }
6294
6295 /* Mark the tree SRC as an optional subexpression.
6296 To be called from preorder or postorder. */
6297
6298 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)6299 mark_opt_subexp (void *extra, bin_tree_t *node)
6300 {
6301 int idx = (int) (long) extra;
6302 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
6303 node->token.opt_subexp = 1;
6304
6305 return REG_NOERROR;
6306 }
6307
6308 /* Free the allocated memory inside NODE. */
6309
6310 static void
free_token(re_token_t * node)6311 free_token (re_token_t *node)
6312 {
6313 #ifdef RE_ENABLE_I18N
6314 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
6315 free_charset (node->opr.mbcset);
6316 else
6317 #endif /* RE_ENABLE_I18N */
6318 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
6319 re_free (node->opr.sbcset);
6320 }
6321
6322 /* Worker function for tree walking. Free the allocated memory inside NODE
6323 and its children. */
6324
6325 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)6326 free_tree (void *extra, bin_tree_t *node)
6327 {
6328 free_token (&node->token);
6329 return REG_NOERROR;
6330 }
6331
6332
6333 /* Duplicate the node SRC, and return new node. This is a preorder
6334 visit similar to the one implemented by the generic visitor, but
6335 we need more infrastructure to maintain two parallel trees --- so,
6336 it's easier to duplicate. */
6337
6338 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)6339 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
6340 {
6341 const bin_tree_t *node;
6342 bin_tree_t *dup_root;
6343 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
6344
6345 for (node = root; ; )
6346 {
6347 /* Create a new tree and link it back to the current parent. */
6348 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
6349 if (*p_new == NULL)
6350 return NULL;
6351 (*p_new)->parent = dup_node;
6352 (*p_new)->token.duplicated = 1;
6353 dup_node = *p_new;
6354
6355 /* Go to the left node, or up and to the right. */
6356 if (node->left)
6357 {
6358 node = node->left;
6359 p_new = &dup_node->left;
6360 }
6361 else
6362 {
6363 const bin_tree_t *prev = NULL;
6364 while (node->right == prev || node->right == NULL)
6365 {
6366 prev = node;
6367 node = node->parent;
6368 dup_node = dup_node->parent;
6369 if (!node)
6370 return dup_root;
6371 }
6372 node = node->right;
6373 p_new = &dup_node->right;
6374 }
6375 }
6376 }
6377
6378 /******************************************************************************/
6379 /******************************************************************************/
6380 /******************************************************************************/
6381 /* GKINCLUDE #include "regexec.c" */
6382 /******************************************************************************/
6383 /******************************************************************************/
6384 /******************************************************************************/
6385 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
6386 int n) internal_function;
6387 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
6388 static void match_ctx_free (re_match_context_t *cache) internal_function;
6389 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
6390 int str_idx, int from, int to)
6391 internal_function;
6392 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
6393 internal_function;
6394 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
6395 int str_idx) internal_function;
6396 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
6397 int node, int str_idx)
6398 internal_function;
6399 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
6400 re_dfastate_t **limited_sts, int last_node,
6401 int last_str_idx)
6402 internal_function;
6403 static reg_errcode_t re_search_internal (const regex_t *preg,
6404 const char *string, int length,
6405 int start, int range, int stop,
6406 size_t nmatch, regmatch_t pmatch[],
6407 int eflags) internal_function;
6408 static int re_search_2_stub (struct re_pattern_buffer *bufp,
6409 const char *string1, int length1,
6410 const char *string2, int length2,
6411 int start, int range, struct re_registers *regs,
6412 int stop, int ret_len) internal_function;
6413 static int re_search_stub (struct re_pattern_buffer *bufp,
6414 const char *string, int length, int start,
6415 int range, int stop, struct re_registers *regs,
6416 int ret_len) internal_function;
6417 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
6418 int nregs, int regs_allocated) internal_function;
6419 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
6420 internal_function;
6421 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
6422 int *p_match_first) internal_function;
6423 static int check_halt_state_context (const re_match_context_t *mctx,
6424 const re_dfastate_t *state, int idx)
6425 internal_function;
6426 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
6427 regmatch_t *prev_idx_match, int cur_node,
6428 int cur_idx, int nmatch) internal_function;
6429 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
6430 int str_idx, int dest_node, int nregs,
6431 regmatch_t *regs,
6432 re_node_set *eps_via_nodes)
6433 internal_function;
6434 static reg_errcode_t set_regs (const regex_t *preg,
6435 const re_match_context_t *mctx,
6436 size_t nmatch, regmatch_t *pmatch,
6437 int fl_backtrack) internal_function;
6438 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
6439 internal_function;
6440
6441 #ifdef RE_ENABLE_I18N
6442 static int sift_states_iter_mb (const re_match_context_t *mctx,
6443 re_sift_context_t *sctx,
6444 int node_idx, int str_idx, int max_str_idx)
6445 internal_function;
6446 #endif /* RE_ENABLE_I18N */
6447 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
6448 re_sift_context_t *sctx)
6449 internal_function;
6450 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
6451 re_sift_context_t *sctx, int str_idx,
6452 re_node_set *cur_dest)
6453 internal_function;
6454 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
6455 re_sift_context_t *sctx,
6456 int str_idx,
6457 re_node_set *dest_nodes)
6458 internal_function;
6459 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
6460 re_node_set *dest_nodes,
6461 const re_node_set *candidates)
6462 internal_function;
6463 static int check_dst_limits (const re_match_context_t *mctx,
6464 re_node_set *limits,
6465 int dst_node, int dst_idx, int src_node,
6466 int src_idx) internal_function;
6467 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
6468 int boundaries, int subexp_idx,
6469 int from_node, int bkref_idx)
6470 internal_function;
6471 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
6472 int limit, int subexp_idx,
6473 int node, int str_idx,
6474 int bkref_idx) internal_function;
6475 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
6476 re_node_set *dest_nodes,
6477 const re_node_set *candidates,
6478 re_node_set *limits,
6479 struct re_backref_cache_entry *bkref_ents,
6480 int str_idx) internal_function;
6481 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
6482 re_sift_context_t *sctx,
6483 int str_idx, const re_node_set *candidates)
6484 internal_function;
6485 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
6486 re_dfastate_t **dst,
6487 re_dfastate_t **src, int num)
6488 internal_function;
6489 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
6490 re_match_context_t *mctx) internal_function;
6491 static re_dfastate_t *transit_state (reg_errcode_t *err,
6492 re_match_context_t *mctx,
6493 re_dfastate_t *state) internal_function;
6494 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
6495 re_match_context_t *mctx,
6496 re_dfastate_t *next_state)
6497 internal_function;
6498 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6499 re_node_set *cur_nodes,
6500 int str_idx) internal_function;
6501 #if 0
6502 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
6503 re_match_context_t *mctx,
6504 re_dfastate_t *pstate)
6505 internal_function;
6506 #endif
6507 #ifdef RE_ENABLE_I18N
6508 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
6509 re_dfastate_t *pstate)
6510 internal_function;
6511 #endif /* RE_ENABLE_I18N */
6512 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
6513 const re_node_set *nodes)
6514 internal_function;
6515 static reg_errcode_t get_subexp (re_match_context_t *mctx,
6516 int bkref_node, int bkref_str_idx)
6517 internal_function;
6518 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
6519 const re_sub_match_top_t *sub_top,
6520 re_sub_match_last_t *sub_last,
6521 int bkref_node, int bkref_str)
6522 internal_function;
6523 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
6524 int subexp_idx, int type) internal_function;
6525 static reg_errcode_t check_arrival (re_match_context_t *mctx,
6526 state_array_t *path, int top_node,
6527 int top_str, int last_node, int last_str,
6528 int type) internal_function;
6529 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6530 int str_idx,
6531 re_node_set *cur_nodes,
6532 re_node_set *next_nodes)
6533 internal_function;
6534 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
6535 re_node_set *cur_nodes,
6536 int ex_subexp, int type)
6537 internal_function;
6538 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
6539 re_node_set *dst_nodes,
6540 int target, int ex_subexp,
6541 int type) internal_function;
6542 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6543 re_node_set *cur_nodes, int cur_str,
6544 int subexp_num, int type)
6545 internal_function;
6546 static int build_trtable (const re_dfa_t *dfa,
6547 re_dfastate_t *state) internal_function;
6548 #ifdef RE_ENABLE_I18N
6549 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
6550 const re_string_t *input, int idx)
6551 internal_function;
6552 # ifdef _LIBC
6553 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
6554 size_t name_len)
6555 internal_function;
6556 # endif /* _LIBC */
6557 #endif /* RE_ENABLE_I18N */
6558 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
6559 const re_dfastate_t *state,
6560 re_node_set *states_node,
6561 bitset_t *states_ch) internal_function;
6562 static int check_node_accept (const re_match_context_t *mctx,
6563 const re_token_t *node, int idx)
6564 internal_function;
6565 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
6566 internal_function;
6567
6568 /* Entry point for POSIX code. */
6569
6570 /* regexec searches for a given pattern, specified by PREG, in the
6571 string STRING.
6572
6573 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6574 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6575 least NMATCH elements, and we set them to the offsets of the
6576 corresponding matched substrings.
6577
6578 EFLAGS specifies `execution flags' which affect matching: if
6579 REG_NOTBOL is set, then ^ does not match at the beginning of the
6580 string; if REG_NOTEOL is set, then $ does not match at the end.
6581
6582 We return 0 if we find a match and REG_NOMATCH if not. */
6583
6584 int
regexec(preg,string,nmatch,pmatch,eflags)6585 regexec (preg, string, nmatch, pmatch, eflags)
6586 const regex_t *__restrict preg;
6587 const char *__restrict string;
6588 size_t nmatch;
6589 regmatch_t pmatch[];
6590 int eflags;
6591 {
6592 reg_errcode_t err;
6593 int start, length;
6594 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
6595
6596 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
6597 return REG_BADPAT;
6598
6599 if (eflags & REG_STARTEND)
6600 {
6601 start = pmatch[0].rm_so;
6602 length = pmatch[0].rm_eo;
6603 }
6604 else
6605 {
6606 start = 0;
6607 length = strlen (string);
6608 }
6609
6610 __libc_lock_lock (dfa->lock);
6611 if (preg->no_sub)
6612 err = re_search_internal (preg, string, length, start, length - start,
6613 length, 0, NULL, eflags);
6614 else
6615 err = re_search_internal (preg, string, length, start, length - start,
6616 length, nmatch, pmatch, eflags);
6617 __libc_lock_unlock (dfa->lock);
6618 return err != REG_NOERROR;
6619 }
6620
#ifdef _LIBC
# include <shlib-compat.h>
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);

# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
__typeof__ (__regexec) __compat_regexec;

/* Variant of regexec bound to the GLIBC_2_0 symbol version: every
   execution flag other than REG_NOTBOL and REG_NOTEOL is dropped before
   the call is forwarded to regexec.  */
int
attribute_compat_text_section
__compat_regexec (const regex_t *__restrict preg,
                  const char *__restrict string, size_t nmatch,
                  regmatch_t pmatch[], int eflags)
{
  const int kept_flags = eflags & (REG_NOTBOL | REG_NOTEOL);

  return regexec (preg, string, nmatch, pmatch, kept_flags);
}
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
# endif
#endif
6640
6641 /* Entry points for GNU code. */
6642
/* re_match, re_search, re_match_2, re_search_2

   The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and STRING2
   with lengths LENGTH1 and LENGTH2, respectively.

   re_match() matches the compiled pattern in BUFP against the string,
   starting at index START.

   re_search() first tries matching at index START, then it tries to match
   starting from index START + 1, and so on.  The last start position tried
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
   way as re_match().)

   The parameter STOP of re_{match,search}_2 specifies that no match
   extending beyond the first STOP characters of the concatenation of the
   strings should be considered.

   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and of all groups are stored in REGS.  (For the "_2" variants, the offsets
   are computed relative to the concatenation, not relative to the individual
   strings.)

   On success, re_match* functions return the length of the match, re_search*
   return the position of the start of the match.  Return value -1 means no
   match was found and -2 indicates an internal error.  */
6669
/* Match the compiled pattern BUFP against STRING (LENGTH bytes) at index
   START only.  Forwards to re_search_stub with a range of 0, a stop of
   LENGTH and ret_len set, so a successful call yields the match length.
   REGS is passed through unchanged.  */
int
re_match (struct re_pattern_buffer *bufp, const char *string, int length,
          int start, struct re_registers *regs)
{
  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
}
#ifdef _LIBC
weak_alias (__re_match, re_match)
#endif
6682
/* Search STRING (LENGTH bytes) for the compiled pattern BUFP, trying
   start positions from START through START + RANGE.  Forwards to
   re_search_stub with a stop of LENGTH and ret_len clear, so a successful
   call yields the position where the match starts.  REGS is passed
   through unchanged.  */
int
re_search (struct re_pattern_buffer *bufp, const char *string, int length,
           int start, int range, struct re_registers *regs)
{
  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
6695
/* Like re_match, but the subject is the concatenation of STRING1
   (LENGTH1 bytes) and STRING2 (LENGTH2 bytes), and STOP is forwarded as
   the stop position.  Forwards to re_search_2_stub with a range of 0 and
   ret_len set.  */
int
re_match_2 (struct re_pattern_buffer *bufp,
            const char *string1, int length1,
            const char *string2, int length2,
            int start, struct re_registers *regs, int stop)
{
  return re_search_2_stub (bufp, string1, length1, string2, length2,
                           start, 0, regs, stop, 1);
}
#ifdef _LIBC
weak_alias (__re_match_2, re_match_2)
#endif
6709
/* Like re_search, but the subject is the concatenation of STRING1
   (LENGTH1 bytes) and STRING2 (LENGTH2 bytes), and STOP is forwarded as
   the stop position.  Forwards to re_search_2_stub with ret_len clear.  */
int
re_search_2 (struct re_pattern_buffer *bufp,
             const char *string1, int length1,
             const char *string2, int length2,
             int start, int range, struct re_registers *regs, int stop)
{
  return re_search_2_stub (bufp, string1, length1, string2, length2,
                           start, range, regs, stop, 0);
}
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
6723
6724 static int
6725 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
6726 stop, ret_len)
6727 struct re_pattern_buffer *bufp;
6728 const char *string1, *string2;
6729 int length1, length2, start, range, stop, ret_len;
6730 struct re_registers *regs;
6731 {
6732 const char *str;
6733 int rval;
6734 int len = length1 + length2;
6735 int free_str = 0;
6736
6737 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
6738 return -2;
6739
6740 /* Concatenate the strings. */
6741 if (length2 > 0)
6742 if (length1 > 0)
6743 {
6744 char *s = re_malloc (char, len);
6745
6746 if (BE (s == NULL, 0))
6747 return -2;
6748 #ifdef _LIBC
6749 memcpy (__mempcpy (s, string1, length1), string2, length2);
6750 #else
6751 memcpy (s, string1, length1);
6752 memcpy (s + length1, string2, length2);
6753 #endif
6754 str = s;
6755 free_str = 1;
6756 }
6757 else
6758 str = string2;
6759 else
6760 str = string1;
6761
6762 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
6763 ret_len);
6764 if (free_str)
6765 re_free ((char *) str);
6766 return rval;
6767 }
6768
6769 /* The parameters have the same meaning as those of re_search.
6770 Additional parameters:
6771 If RET_LEN is nonzero the length of the match is returned (re_match style);
6772 otherwise the position of the match is returned. */
6773
6774 static int
re_search_stub(bufp,string,length,start,range,stop,regs,ret_len)6775 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
6776 struct re_pattern_buffer *bufp;
6777 const char *string;
6778 int length, start, range, stop, ret_len;
6779 struct re_registers *regs;
6780 {
6781 reg_errcode_t result;
6782 regmatch_t *pmatch;
6783 int nregs, rval;
6784 int eflags = 0;
6785 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
6786
6787 /* Check for out-of-range. */
6788 if (BE (start < 0 || start > length, 0))
6789 return -1;
6790 if (BE (start + range > length, 0))
6791 range = length - start;
6792 else if (BE (start + range < 0, 0))
6793 range = -start;
6794
6795 __libc_lock_lock (dfa->lock);
6796
6797 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
6798 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
6799
6800 /* Compile fastmap if we haven't yet. */
6801 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
6802 re_compile_fastmap (bufp);
6803
6804 if (BE (bufp->no_sub, 0))
6805 regs = NULL;
6806
6807 /* We need at least 1 register. */
6808 if (regs == NULL)
6809 nregs = 1;
6810 else if (BE (bufp->regs_allocated == REGS_FIXED &&
6811 regs->num_regs < bufp->re_nsub + 1, 0))
6812 {
6813 nregs = regs->num_regs;
6814 if (BE (nregs < 1, 0))
6815 {
6816 /* Nothing can be copied to regs. */
6817 regs = NULL;
6818 nregs = 1;
6819 }
6820 }
6821 else
6822 nregs = bufp->re_nsub + 1;
6823 pmatch = re_malloc (regmatch_t, nregs);
6824 if (BE (pmatch == NULL, 0))
6825 {
6826 rval = -2;
6827 goto out;
6828 }
6829
6830 result = re_search_internal (bufp, string, length, start, range, stop,
6831 nregs, pmatch, eflags);
6832
6833 rval = 0;
6834
6835 /* I hope we needn't fill ther regs with -1's when no match was found. */
6836 if (result != REG_NOERROR)
6837 rval = -1;
6838 else if (regs != NULL)
6839 {
6840 /* If caller wants register contents data back, copy them. */
6841 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
6842 bufp->regs_allocated);
6843 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
6844 rval = -2;
6845 }
6846
6847 if (BE (rval == 0, 1))
6848 {
6849 if (ret_len)
6850 {
6851 assert (pmatch[0].rm_so == start);
6852 rval = pmatch[0].rm_eo - start;
6853 }
6854 else
6855 rval = pmatch[0].rm_so;
6856 }
6857 re_free (pmatch);
6858 out:
6859 __libc_lock_unlock (dfa->lock);
6860 return rval;
6861 }
6862
6863 static unsigned
re_copy_regs(regs,pmatch,nregs,regs_allocated)6864 re_copy_regs (regs, pmatch, nregs, regs_allocated)
6865 struct re_registers *regs;
6866 regmatch_t *pmatch;
6867 int nregs, regs_allocated;
6868 {
6869 int rval = REGS_REALLOCATE;
6870 int i;
6871 int need_regs = nregs + 1;
6872 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
6873 uses. */
6874
6875 /* Have the register data arrays been allocated? */
6876 if (regs_allocated == REGS_UNALLOCATED)
6877 { /* No. So allocate them with malloc. */
6878 regs->start = re_malloc (regoff_t, need_regs);
6879 regs->end = re_malloc (regoff_t, need_regs);
6880 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
6881 return REGS_UNALLOCATED;
6882 regs->num_regs = need_regs;
6883 }
6884 else if (regs_allocated == REGS_REALLOCATE)
6885 { /* Yes. If we need more elements than were already
6886 allocated, reallocate them. If we need fewer, just
6887 leave it alone. */
6888 if (BE (need_regs > regs->num_regs, 0))
6889 {
6890 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
6891 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
6892 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
6893 return REGS_UNALLOCATED;
6894 regs->start = new_start;
6895 regs->end = new_end;
6896 regs->num_regs = need_regs;
6897 }
6898 }
6899 else
6900 {
6901 assert (regs_allocated == REGS_FIXED);
6902 /* This function may not be called with REGS_FIXED and nregs too big. */
6903 assert (regs->num_regs >= nregs);
6904 rval = REGS_FIXED;
6905 }
6906
6907 /* Copy the regs. */
6908 for (i = 0; i < nregs; ++i)
6909 {
6910 regs->start[i] = pmatch[i].rm_so;
6911 regs->end[i] = pmatch[i].rm_eo;
6912 }
6913 for ( ; i < regs->num_regs; ++i)
6914 regs->start[i] = regs->end[i] = -1;
6915
6916 return rval;
6917 }
6918
6919 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6920 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6921 this memory for recording register information. STARTS and ENDS
6922 must be allocated using the malloc library routine, and must each
6923 be at least NUM_REGS * sizeof (regoff_t) bytes long.
6924
6925 If NUM_REGS == 0, then subsequent matches should allocate their own
6926 register data.
6927
6928 Unless this function is called, the first search or match using
6929 PATTERN_BUFFER will allocate its own register data, without
6930 freeing the old data. */
6931
6932 void
re_set_registers(bufp,regs,num_regs,starts,ends)6933 re_set_registers (bufp, regs, num_regs, starts, ends)
6934 struct re_pattern_buffer *bufp;
6935 struct re_registers *regs;
6936 unsigned num_regs;
6937 regoff_t *starts, *ends;
6938 {
6939 if (num_regs)
6940 {
6941 bufp->regs_allocated = REGS_REALLOCATE;
6942 regs->num_regs = num_regs;
6943 regs->start = starts;
6944 regs->end = ends;
6945 }
6946 else
6947 {
6948 bufp->regs_allocated = REGS_UNALLOCATED;
6949 regs->num_regs = 0;
6950 regs->start = regs->end = (regoff_t *) 0;
6951 }
6952 }
6953 #ifdef _LIBC
6954 weak_alias (__re_set_registers, re_set_registers)
6955 #endif
6956
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC
/* Match S against the pattern held in re_comp_buf.  Returns 1 when
   regexec reports a match (returns 0) and 0 otherwise.  */
int
# ifdef _LIBC
weak_function
# endif
re_exec (const char *s)
{
  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
}
#endif /* _REGEX_RE_COMP || _LIBC */
6971
6972 /* Internal entry point. */
6973
/* Searches for a compiled pattern PREG in the string STRING, whose
   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
   meanings as in regexec.  START and RANGE have the same meanings
   as in re_search.
   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
   otherwise return the error code.
   Note: We assume front end functions already check ranges.
   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
6982
static reg_errcode_t
re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
                    eflags)
    const regex_t *preg;
    const char *string;
    int length, start, range, stop, eflags;
    size_t nmatch;
    regmatch_t pmatch[];
{
  /* Overview of the body:
       1. Set up the match context (input buffer, backreference cache,
          optional state log).
       2. Walk candidate start positions from START towards START + RANGE,
          using the fastmap (when usable) to skip positions that cannot
          begin a match, and run check_matching at each candidate.
       3. On success fill PMATCH (whole match, subexpressions, unused
          slots) and translate the offsets back to STRING coordinates.
     Every exit after the context has been created goes through the
     `free_return' label.  */
  reg_errcode_t err;
  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
  int left_lim, right_lim, incr;
  int fl_longest_match, match_first, match_kind, match_last = -1;
  int extra_nmatch;
  int sb, ch;
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
  re_match_context_t mctx = { .dfa = dfa };
#else
  re_match_context_t mctx;
#endif
  /* The fastmap is only used when it exists, is marked accurate, the
     search actually moves (RANGE != 0) and the pattern cannot match the
     empty string.  */
  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
                   && range && !preg->can_be_null) ? preg->fastmap : NULL;
  RE_TRANSLATE_TYPE t = preg->translate;

#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
  /* No designated initializers available: zero the context by hand.  */
  memset (&mctx, '\0', sizeof (re_match_context_t));
  mctx.dfa = dfa;
#endif

  /* PMATCH slots beyond the whole match plus RE_NSUB subexpressions are
     not computed; they are counted here and set to -1 at the end.  */
  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
  nmatch -= extra_nmatch;

  /* Check whether the DFA has not been compiled.  */
  if (BE (preg->used == 0 || dfa->init_state == NULL
          || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
          || dfa->init_state_begbuf == NULL, 0))
    return REG_NOMATCH;

#ifdef DEBUG
  /* We assume front-end functions already check them.  */
  assert (start + range >= 0 && start + range <= length);
#endif

  /* If initial states with non-begbuf contexts have no elements,
     the regex must be anchored.  If preg->newline_anchor is set,
     we'll never use init_state_nl, so do not check it.  */
  if (dfa->init_state->nodes.nelem == 0
      && dfa->init_state_word->nodes.nelem == 0
      && (dfa->init_state_nl->nodes.nelem == 0
          || !preg->newline_anchor))
    {
      /* An anchored pattern can only be tried at offset 0; give up if
         neither end of the search range is offset 0.  */
      if (start != 0 && start + range != 0)
        return REG_NOMATCH;
      start = range = 0;
    }

  /* We must check the longest matching, if nmatch > 0.  */
  fl_longest_match = (nmatch != 0 || dfa->nbackref);

  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
                            preg->translate, preg->syntax & RE_ICASE, dfa);
  if (BE (err != REG_NOERROR, 0))
    goto free_return;
  mctx.input.stop = stop;
  mctx.input.raw_stop = stop;
  mctx.input.newline_anchor = preg->newline_anchor;

  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
  if (BE (err != REG_NOERROR, 0))
    goto free_return;

  /* We will log all the DFA states through which the dfa pass,
     if nmatch > 1, or this dfa has "multibyte node", which is a
     back-reference or a node which can accept multibyte character or
     multi character collating element.  */
  if (nmatch > 1 || dfa->has_mb_node)
    {
      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
      if (BE (mctx.state_log == NULL, 0))
        {
          err = REG_ESPACE;
          goto free_return;
        }
    }
  else
    mctx.state_log = NULL;

  match_first = start;
  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF;

  /* Check incrementally whether or not the input string matches.  */
  incr = (range < 0) ? -1 : 1;
  left_lim = (range < 0) ? start + range : start;
  right_lim = (range < 0) ? start : start + range;
  sb = dfa->mb_cur_max == 1;
  /* MATCH_KIND selects the fastmap scanning strategy in the switch below:
       8      no fastmap at all;
       bit 4  single-byte locale, or neither RE_ICASE nor a translate
              table is in effect;
       bit 2  searching forward (RANGE >= 0);
       bit 1  a translate table is present.  */
  match_kind =
    (fastmap
     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
        | (range >= 0 ? 2 : 0)
        | (t != NULL ? 1 : 0))
     : 8);

  for (;; match_first += incr)
    {
      /* Leaving the search range without a match yields REG_NOMATCH.  */
      err = REG_NOMATCH;
      if (match_first < left_lim || right_lim < match_first)
        goto free_return;

      /* Advance as rapidly as possible through the string, until we
         find a plausible place to start matching.  This may be done
         with varying efficiency, so there are various possibilities:
         only the most common of them are specialized, in order to
         save on code size.  We use a switch statement for speed.  */
      switch (match_kind)
        {
        case 8:
          /* No fastmap.  */
          break;

        case 7:
          /* Fastmap with single-byte translation, match forward.  */
          while (BE (match_first < right_lim, 1)
                 && !fastmap[t[(unsigned char) string[match_first]]])
            ++match_first;
          goto forward_match_found_start_or_reached_end;

        case 6:
          /* Fastmap without translation, match forward.  */
          while (BE (match_first < right_lim, 1)
                 && !fastmap[(unsigned char) string[match_first]])
            ++match_first;

        forward_match_found_start_or_reached_end:
          /* The scan loops above stop before testing RIGHT_LIM itself;
             test that last position here, treating a position at or
             past LENGTH as the byte 0.  */
          if (BE (match_first == right_lim, 0))
            {
              ch = match_first >= length
                   ? 0 : (unsigned char) string[match_first];
              if (!fastmap[t ? t[ch] : ch])
                goto free_return;
            }
          break;

        case 4:
        case 5:
          /* Fastmap without multi-byte translation, match backwards.  */
          while (match_first >= left_lim)
            {
              ch = match_first >= length
                   ? 0 : (unsigned char) string[match_first];
              if (fastmap[t ? t[ch] : ch])
                break;
              --match_first;
            }
          if (match_first < left_lim)
            goto free_return;
          break;

        default:
          /* In this case, we can't determine easily the current byte,
             since it might be a component byte of a multibyte
             character.  Then we use the constructed buffer instead.  */
          for (;;)
            {
              /* If MATCH_FIRST is out of the valid range, reconstruct the
                 buffers.  */
              unsigned int offset = match_first - mctx.input.raw_mbs_idx;
              if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
                {
                  err = re_string_reconstruct (&mctx.input, match_first,
                                               eflags);
                  if (BE (err != REG_NOERROR, 0))
                    goto free_return;

                  offset = match_first - mctx.input.raw_mbs_idx;
                }
              /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
                 Note that MATCH_FIRST must not be smaller than 0.  */
              ch = (match_first >= length
                    ? 0 : re_string_byte_at (&mctx.input, offset));
              if (fastmap[ch])
                break;
              match_first += incr;
              if (match_first < left_lim || match_first > right_lim)
                {
                  err = REG_NOMATCH;
                  goto free_return;
                }
            }
          break;
        }

      /* Reconstruct the buffers so that the matcher can assume that
         the matching starts from the beginning of the buffer.  */
      err = re_string_reconstruct (&mctx.input, match_first, eflags);
      if (BE (err != REG_NOERROR, 0))
        goto free_return;

#ifdef RE_ENABLE_I18N
     /* Don't consider this char as a possible match start if it is part,
        yet isn't the head, of a multibyte character.  */
      if (!sb && !re_string_first_byte (&mctx.input, 0))
        continue;
#endif

      /* It seems to be appropriate one, then use the matcher.  */
      /* We assume that the matching starts from 0.  */
      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
      /* check_matching yields the end index of the match, -1 for no
         match, or -2 on error.  MATCH_FIRST is handed over only for
         forward searches, so that a failed attempt can move it ahead.  */
      match_last = check_matching (&mctx, fl_longest_match,
                                   range >= 0 ? &match_first : NULL);
      if (match_last != -1)
        {
          if (BE (match_last == -2, 0))
            {
              err = REG_ESPACE;
              goto free_return;
            }
          else
            {
              mctx.match_last = match_last;
              if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
                {
                  /* Record which halt node of the final state accepted.  */
                  re_dfastate_t *pstate = mctx.state_log[match_last];
                  mctx.last_node = check_halt_state_context (&mctx, pstate,
                                                             match_last);
                }
              if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
                  || dfa->nbackref)
                {
                  /* Sift the state log.  REG_NOMATCH from the pruning
                     step rejects this candidate and the search goes on;
                     any other failure aborts.  */
                  err = prune_impossible_nodes (&mctx);
                  if (err == REG_NOERROR)
                    break;
                  if (BE (err != REG_NOMATCH, 0))
                    goto free_return;
                  match_last = -1;
                }
              else
                break; /* We found a match.  */
            }
        }

      match_ctx_clean (&mctx);
    }

#ifdef DEBUG
  assert (match_last != -1);
  assert (err == REG_NOERROR);
#endif

  /* Set pmatch[] if we need.  */
  if (nmatch > 0)
    {
      int reg_idx;

      /* Initialize registers.  */
      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
        pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;

      /* Set the points where matching start/end.  These are relative to
         the buffer, whose origin is MATCH_FIRST.  */
      pmatch[0].rm_so = 0;
      pmatch[0].rm_eo = mctx.match_last;

      if (!preg->no_sub && nmatch > 1)
        {
          /* Backtracking in set_regs is requested only when the pattern
             has both plural matches and back references.  */
          err = set_regs (preg, &mctx, nmatch, pmatch,
                          dfa->has_plural_match && dfa->nbackref > 0);
          if (BE (err != REG_NOERROR, 0))
            goto free_return;
        }

      /* At last, add the offset to the each registers, since we slided
         the buffers so that we could assume that the matching starts
         from 0.  */
      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
        if (pmatch[reg_idx].rm_so != -1)
          {
#ifdef RE_ENABLE_I18N
            /* When the buffer keeps an offsets table, map buffer indices
               back to raw indices through it; an index equal to
               valid_len maps to valid_raw_len.  */
            if (BE (mctx.input.offsets_needed != 0, 0))
              {
                pmatch[reg_idx].rm_so =
                  (pmatch[reg_idx].rm_so == mctx.input.valid_len
                   ? mctx.input.valid_raw_len
                   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
                pmatch[reg_idx].rm_eo =
                  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
                   ? mctx.input.valid_raw_len
                   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
              }
#else
            assert (mctx.input.offsets_needed == 0);
#endif
            pmatch[reg_idx].rm_so += match_first;
            pmatch[reg_idx].rm_eo += match_first;
          }
      /* Mark the slots the pattern has no subexpression for as unused.  */
      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
        {
          pmatch[nmatch + reg_idx].rm_so = -1;
          pmatch[nmatch + reg_idx].rm_eo = -1;
        }

      /* Where subexp_map redirects a subexpression to another one, copy
         that other subexpression's registers.  */
      if (dfa->subexp_map)
        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
          if (dfa->subexp_map[reg_idx] != reg_idx)
            {
              pmatch[reg_idx + 1].rm_so
                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
              pmatch[reg_idx + 1].rm_eo
                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
            }
    }

  /* Common exit: release the state log, the match context (only freed
     here when the pattern has back references) and the input buffer.  */
 free_return:
  re_free (mctx.state_log);
  if (dfa->nbackref)
    match_ctx_free (&mctx);
  re_string_destruct (&mctx.input);
  return err;
}
7301
7302 static reg_errcode_t
prune_impossible_nodes(mctx)7303 prune_impossible_nodes (mctx)
7304 re_match_context_t *mctx;
7305 {
7306 const re_dfa_t *const dfa = mctx->dfa;
7307 int halt_node, match_last;
7308 reg_errcode_t ret;
7309 re_dfastate_t **sifted_states;
7310 re_dfastate_t **lim_states = NULL;
7311 re_sift_context_t sctx;
7312 #ifdef DEBUG
7313 assert (mctx->state_log != NULL);
7314 #endif
7315 match_last = mctx->match_last;
7316 halt_node = mctx->last_node;
7317 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
7318 if (BE (sifted_states == NULL, 0))
7319 {
7320 ret = REG_ESPACE;
7321 goto free_return;
7322 }
7323 if (dfa->nbackref)
7324 {
7325 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
7326 if (BE (lim_states == NULL, 0))
7327 {
7328 ret = REG_ESPACE;
7329 goto free_return;
7330 }
7331 while (1)
7332 {
7333 memset (lim_states, '\0',
7334 sizeof (re_dfastate_t *) * (match_last + 1));
7335 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
7336 match_last);
7337 ret = sift_states_backward (mctx, &sctx);
7338 re_node_set_free (&sctx.limits);
7339 if (BE (ret != REG_NOERROR, 0))
7340 goto free_return;
7341 if (sifted_states[0] != NULL || lim_states[0] != NULL)
7342 break;
7343 do
7344 {
7345 --match_last;
7346 if (match_last < 0)
7347 {
7348 ret = REG_NOMATCH;
7349 goto free_return;
7350 }
7351 } while (mctx->state_log[match_last] == NULL
7352 || !mctx->state_log[match_last]->halt);
7353 halt_node = check_halt_state_context (mctx,
7354 mctx->state_log[match_last],
7355 match_last);
7356 }
7357 ret = merge_state_array (dfa, sifted_states, lim_states,
7358 match_last + 1);
7359 re_free (lim_states);
7360 lim_states = NULL;
7361 if (BE (ret != REG_NOERROR, 0))
7362 goto free_return;
7363 }
7364 else
7365 {
7366 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
7367 ret = sift_states_backward (mctx, &sctx);
7368 re_node_set_free (&sctx.limits);
7369 if (BE (ret != REG_NOERROR, 0))
7370 goto free_return;
7371 }
7372 re_free (mctx->state_log);
7373 mctx->state_log = sifted_states;
7374 sifted_states = NULL;
7375 mctx->last_node = halt_node;
7376 mctx->match_last = match_last;
7377 ret = REG_NOERROR;
7378 free_return:
7379 re_free (sifted_states);
7380 re_free (lim_states);
7381 return ret;
7382 }
7383
7384 /* Acquire an initial state and return it.
7385 We must select appropriate initial state depending on the context,
7386 since initial states may have constraints like "\<", "^", etc.. */
7387
7388 static inline re_dfastate_t *
7389 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,int idx)7390 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
7391 int idx)
7392 {
7393 const re_dfa_t *const dfa = mctx->dfa;
7394 if (dfa->init_state->has_constraint)
7395 {
7396 unsigned int context;
7397 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
7398 if (IS_WORD_CONTEXT (context))
7399 return dfa->init_state_word;
7400 else if (IS_ORDINARY_CONTEXT (context))
7401 return dfa->init_state;
7402 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
7403 return dfa->init_state_begbuf;
7404 else if (IS_NEWLINE_CONTEXT (context))
7405 return dfa->init_state_nl;
7406 else if (IS_BEGBUF_CONTEXT (context))
7407 {
7408 /* It is relatively rare case, then calculate on demand. */
7409 return re_acquire_state_context (err, dfa,
7410 dfa->init_state->entrance_nodes,
7411 context);
7412 }
7413 else
7414 /* Must not happen? */
7415 return dfa->init_state;
7416 }
7417 else
7418 return dfa->init_state;
7419 }
7420
7421 /* Check whether the regular expression match input string INPUT or not,
7422 and return the index where the matching end, return -1 if not match,
7423 or return -2 in case of an error.
7424 FL_LONGEST_MATCH means we want the POSIX longest matching.
7425 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
7426 next place where we may want to try matching.
7427 Note that the matcher assume that the maching starts from the current
7428 index of the buffer. */
7429
7430 static int
7431 internal_function
check_matching(re_match_context_t * mctx,int fl_longest_match,int * p_match_first)7432 check_matching (re_match_context_t *mctx, int fl_longest_match,
7433 int *p_match_first)
7434 {
7435 const re_dfa_t *const dfa = mctx->dfa;
7436 reg_errcode_t err;
7437 int match = 0;
7438 int match_last = -1;
7439 int cur_str_idx = re_string_cur_idx (&mctx->input);
7440 re_dfastate_t *cur_state;
7441 int at_init_state = p_match_first != NULL;
7442 int next_start_idx = cur_str_idx;
7443
7444 err = REG_NOERROR;
7445 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
7446 /* An initial state must not be NULL (invalid). */
7447 if (BE (cur_state == NULL, 0))
7448 {
7449 assert (err == REG_ESPACE);
7450 return -2;
7451 }
7452
7453 if (mctx->state_log != NULL)
7454 {
7455 mctx->state_log[cur_str_idx] = cur_state;
7456
7457 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
7458 later. E.g. Processing back references. */
7459 if (BE (dfa->nbackref, 0))
7460 {
7461 at_init_state = 0;
7462 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
7463 if (BE (err != REG_NOERROR, 0))
7464 return err;
7465
7466 if (cur_state->has_backref)
7467 {
7468 err = transit_state_bkref (mctx, &cur_state->nodes);
7469 if (BE (err != REG_NOERROR, 0))
7470 return err;
7471 }
7472 }
7473 }
7474
7475 /* If the RE accepts NULL string. */
7476 if (BE (cur_state->halt, 0))
7477 {
7478 if (!cur_state->has_constraint
7479 || check_halt_state_context (mctx, cur_state, cur_str_idx))
7480 {
7481 if (!fl_longest_match)
7482 return cur_str_idx;
7483 else
7484 {
7485 match_last = cur_str_idx;
7486 match = 1;
7487 }
7488 }
7489 }
7490
7491 while (!re_string_eoi (&mctx->input))
7492 {
7493 re_dfastate_t *old_state = cur_state;
7494 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
7495
7496 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
7497 || (BE (next_char_idx >= mctx->input.valid_len, 0)
7498 && mctx->input.valid_len < mctx->input.len))
7499 {
7500 err = extend_buffers (mctx);
7501 if (BE (err != REG_NOERROR, 0))
7502 {
7503 assert (err == REG_ESPACE);
7504 return -2;
7505 }
7506 }
7507
7508 cur_state = transit_state (&err, mctx, cur_state);
7509 if (mctx->state_log != NULL)
7510 cur_state = merge_state_with_log (&err, mctx, cur_state);
7511
7512 if (cur_state == NULL)
7513 {
7514 /* Reached the invalid state or an error. Try to recover a valid
7515 state using the state log, if available and if we have not
7516 already found a valid (even if not the longest) match. */
7517 if (BE (err != REG_NOERROR, 0))
7518 return -2;
7519
7520 if (mctx->state_log == NULL
7521 || (match && !fl_longest_match)
7522 || (cur_state = find_recover_state (&err, mctx)) == NULL)
7523 break;
7524 }
7525
7526 if (BE (at_init_state, 0))
7527 {
7528 if (old_state == cur_state)
7529 next_start_idx = next_char_idx;
7530 else
7531 at_init_state = 0;
7532 }
7533
7534 if (cur_state->halt)
7535 {
7536 /* Reached a halt state.
7537 Check the halt state can satisfy the current context. */
7538 if (!cur_state->has_constraint
7539 || check_halt_state_context (mctx, cur_state,
7540 re_string_cur_idx (&mctx->input)))
7541 {
7542 /* We found an appropriate halt state. */
7543 match_last = re_string_cur_idx (&mctx->input);
7544 match = 1;
7545
7546 /* We found a match, do not modify match_first below. */
7547 p_match_first = NULL;
7548 if (!fl_longest_match)
7549 break;
7550 }
7551 }
7552 }
7553
7554 if (p_match_first)
7555 *p_match_first += next_start_idx;
7556
7557 return match_last;
7558 }
7559
7560 /* Check NODE match the current context. */
7561
7562 static int
7563 internal_function
check_halt_node_context(const re_dfa_t * dfa,int node,unsigned int context)7564 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
7565 {
7566 re_token_type_t type = dfa->nodes[node].type;
7567 unsigned int constraint = dfa->nodes[node].constraint;
7568 if (type != END_OF_RE)
7569 return 0;
7570 if (!constraint)
7571 return 1;
7572 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
7573 return 0;
7574 return 1;
7575 }
7576
7577 /* Check the halt state STATE match the current context.
7578 Return 0 if not match, if the node, STATE has, is a halt node and
7579 match the context, return the node. */
7580
7581 static int
7582 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,int idx)7583 check_halt_state_context (const re_match_context_t *mctx,
7584 const re_dfastate_t *state, int idx)
7585 {
7586 int i;
7587 unsigned int context;
7588 #ifdef DEBUG
7589 assert (state->halt);
7590 #endif
7591 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
7592 for (i = 0; i < state->nodes.nelem; ++i)
7593 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
7594 return state->nodes.elems[i];
7595 return 0;
7596 }
7597
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
   corresponding to the DFA).
   *PIDX is the current string index; it is advanced when NODE consumes
   input.  REGS (NREGS entries) holds the registers set so far and is used
   to resolve back references.  FS, if not NULL, is the fail stack on
   which alternative epsilon transitions are saved for backtracking.
   Return the destination node, and update EPS_VIA_NODES.  Return -1 if
   no transition is possible from NODE, and -2 if an allocation fails.  */

static int
internal_function
proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
                   int *pidx, int node, re_node_set *eps_via_nodes,
                   struct re_fail_stack_t *fs)
{
  const re_dfa_t *const dfa = mctx->dfa;
  int i, err;
  if (IS_EPSILON_NODE (dfa->nodes[node].type))
    {
      /* Epsilon node: no input is consumed.  Candidates are the epsilon
         destinations of NODE that are present in the logged state at the
         current index.  */
      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
      re_node_set *edests = &dfa->edests[node];
      int dest_node;
      /* Remember that NODE was passed through without consuming input.  */
      err = re_node_set_insert (eps_via_nodes, node);
      if (BE (err < 0, 0))
        return -2;
      /* Pick up a valid destination, or return -1 if none is found.  */
      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
        {
          int candidate = edests->elems[i];
          if (!re_node_set_contains (cur_nodes, candidate))
            continue;
          if (dest_node == -1)
            dest_node = candidate;

          else
            {
              /* In order to avoid infinite loop like "(a*)*", return the second
                 epsilon-transition if the first was already considered.  */
              if (re_node_set_contains (eps_via_nodes, dest_node))
                return candidate;

              /* Otherwise, push the second epsilon-transition on the fail stack.  */
              else if (fs != NULL
                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
                                           eps_via_nodes))
                return -2;

              /* We know we are going to exit.  */
              break;
            }
        }
      return dest_node;
    }
  else
    {
      /* Non-epsilon node.  NACCEPTED is the number of input positions the
         node consumes; 0 means "not determined yet" and falls back to
         check_node_accept below.  */
      int naccepted = 0;
      re_token_type_t type = dfa->nodes[node].type;

#ifdef RE_ENABLE_I18N
      if (dfa->nodes[node].accept_mb)
        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
      else
#endif /* RE_ENABLE_I18N */
      if (type == OP_BACK_REF)
        {
          /* A back reference consumes as many positions as the referenced
             subexpression matched.  */
          int subexp_idx = dfa->nodes[node].opr.idx + 1;
          naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
          if (fs != NULL)
            {
              /* When backtracking is enabled, verify the reference: the
                 subexpression must be complete and the text at *PIDX must
                 equal the text it matched.  */
              if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
                return -1;
              else if (naccepted)
                {
                  char *buf = (char *) re_string_get_buffer (&mctx->input);
                  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
                              naccepted) != 0)
                    return -1;
                }
            }

          if (naccepted == 0)
            {
              /* An empty back reference behaves like an epsilon
                 transition to its first epsilon destination.  */
              int dest_node;
              err = re_node_set_insert (eps_via_nodes, node);
              if (BE (err < 0, 0))
                return -2;
              dest_node = dfa->edests[node].elems[0];
              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
                                        dest_node))
                return dest_node;
            }
        }

      if (naccepted != 0
          || check_node_accept (mctx, dfa->nodes + node, *pidx))
        {
          int dest_node = dfa->nexts[node];
          /* Advance by one position for an ordinary node, or by NACCEPTED
             for a multi-position node.  */
          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
          /* With a fail stack, reject the step if it runs past the match
             end or lands on a position whose logged state lacks the
             destination node.  */
          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
                                               dest_node)))
            return -1;
          /* Input was consumed, so the epsilon trail starts afresh.  */
          re_node_set_empty (eps_via_nodes);
          return dest_node;
        }
    }
  return -1;
}
7702
7703 static reg_errcode_t
7704 internal_function
push_fail_stack(struct re_fail_stack_t * fs,int str_idx,int dest_node,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)7705 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
7706 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
7707 {
7708 reg_errcode_t err;
7709 int num = fs->num++;
7710 if (fs->num == fs->alloc)
7711 {
7712 struct re_fail_stack_ent_t *new_array;
7713 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
7714 * fs->alloc * 2));
7715 if (new_array == NULL)
7716 return REG_ESPACE;
7717 fs->alloc *= 2;
7718 fs->stack = new_array;
7719 }
7720 fs->stack[num].idx = str_idx;
7721 fs->stack[num].node = dest_node;
7722 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
7723 if (fs->stack[num].regs == NULL)
7724 return REG_ESPACE;
7725 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
7726 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
7727 return err;
7728 }
7729
7730 static int
7731 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,int * pidx,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)7732 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
7733 regmatch_t *regs, re_node_set *eps_via_nodes)
7734 {
7735 int num = --fs->num;
7736 assert (num >= 0);
7737 *pidx = fs->stack[num].idx;
7738 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
7739 re_node_set_free (eps_via_nodes);
7740 re_free (fs->stack[num].regs);
7741 *eps_via_nodes = fs->stack[num].eps_via_nodes;
7742 return fs->stack[num].node;
7743 }
7744
/* Set the positions where the subexpressions are starts/ends to registers
   PMATCH.
   The function walks the NFA from the initial node along the logged
   states, from pmatch[0].rm_so to pmatch[0].rm_eo, updating the registers
   at each subexpression boundary.  If FL_BACKTRACK is nonzero a fail
   stack is used so that alternative epsilon transitions can be retried.
   Return REG_NOERROR on success, REG_NOMATCH if the walk gets stuck
   without a fail stack, or REG_ESPACE on allocation failure.
   Note: We assume that pmatch[0] is already set, and
   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */

static reg_errcode_t
internal_function
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
          regmatch_t *pmatch, int fl_backtrack)
{
  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
  int idx, cur_node;
  re_node_set eps_via_nodes;
  struct re_fail_stack_t *fs;
  /* Empty fail stack with room for two entries; storage is allocated
     below only when backtracking is requested.  */
  struct re_fail_stack_t fs_body = { 0, 2, NULL };
  regmatch_t *prev_idx_match;
  /* Nonzero when PREV_IDX_MATCH came from the heap and must be freed.  */
  int prev_idx_match_malloced = 0;

#ifdef DEBUG
  assert (nmatch > 1);
  assert (mctx->state_log != NULL);
#endif
  if (fl_backtrack)
    {
      fs = &fs_body;
      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
      if (fs->stack == NULL)
        return REG_ESPACE;
    }
  else
    fs = NULL;

  cur_node = dfa->init_node;
  re_node_set_init_empty (&eps_via_nodes);

  /* Scratch copy of the registers, kept on the stack when
     __libc_use_alloca allows it and on the heap otherwise.  */
  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
  else
    {
      prev_idx_match = re_malloc (regmatch_t, nmatch);
      if (prev_idx_match == NULL)
        {
          free_fail_stack_return (fs);
          return REG_ESPACE;
        }
      prev_idx_match_malloced = 1;
    }
  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);

  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
    {
      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);

      /* Reached the halt node at the end of the match.  */
      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
        {
          int reg_idx;
          if (fs)
            {
              /* With backtracking, accept only if no subexpression was
                 opened without being closed; otherwise retry from the
                 last saved alternative.  */
              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
                  break;
              if (reg_idx == nmatch)
                {
                  re_node_set_free (&eps_via_nodes);
                  if (prev_idx_match_malloced)
                    re_free (prev_idx_match);
                  return free_fail_stack_return (fs);
                }
              cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
                                         &eps_via_nodes);
            }
          else
            {
              re_node_set_free (&eps_via_nodes);
              if (prev_idx_match_malloced)
                re_free (prev_idx_match);
              return REG_NOERROR;
            }
        }

      /* Proceed to next node.  */
      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
                                    &eps_via_nodes, fs);

      if (BE (cur_node < 0, 0))
        {
          /* -2 signals an allocation failure, -1 a dead end.  */
          if (BE (cur_node == -2, 0))
            {
              re_node_set_free (&eps_via_nodes);
              if (prev_idx_match_malloced)
                re_free (prev_idx_match);
              free_fail_stack_return (fs);
              return REG_ESPACE;
            }
          if (fs)
            cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
                                       &eps_via_nodes);
          else
            {
              re_node_set_free (&eps_via_nodes);
              if (prev_idx_match_malloced)
                re_free (prev_idx_match);
              return REG_NOMATCH;
            }
        }
    }
  /* The walk moved past the end of the match: release everything and
     return the result of the fail stack cleanup.  */
  re_node_set_free (&eps_via_nodes);
  if (prev_idx_match_malloced)
    re_free (prev_idx_match);
  return free_fail_stack_return (fs);
}
7856
7857 static reg_errcode_t
7858 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)7859 free_fail_stack_return (struct re_fail_stack_t *fs)
7860 {
7861 if (fs)
7862 {
7863 int fs_idx;
7864 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
7865 {
7866 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
7867 re_free (fs->stack[fs_idx].regs);
7868 }
7869 re_free (fs->stack);
7870 }
7871 return REG_NOERROR;
7872 }
7873
7874 static void
7875 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,int cur_node,int cur_idx,int nmatch)7876 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
7877 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
7878 {
7879 int type = dfa->nodes[cur_node].type;
7880 if (type == OP_OPEN_SUBEXP)
7881 {
7882 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7883
7884 /* We are at the first node of this sub expression. */
7885 if (reg_num < nmatch)
7886 {
7887 pmatch[reg_num].rm_so = cur_idx;
7888 pmatch[reg_num].rm_eo = -1;
7889 }
7890 }
7891 else if (type == OP_CLOSE_SUBEXP)
7892 {
7893 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7894 if (reg_num < nmatch)
7895 {
7896 /* We are at the last node of this sub expression. */
7897 if (pmatch[reg_num].rm_so < cur_idx)
7898 {
7899 pmatch[reg_num].rm_eo = cur_idx;
7900 /* This is a non-empty match or we are not inside an optional
7901 subexpression. Accept this right away. */
7902 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7903 }
7904 else
7905 {
7906 if (dfa->nodes[cur_node].opt_subexp
7907 && prev_idx_match[reg_num].rm_so != -1)
7908 /* We transited through an empty match for an optional
7909 subexpression, like (a?)*, and this is not the subexp's
7910 first match. Copy back the old content of the registers
7911 so that matches of an inner subexpression are undone as
7912 well, like in ((a?))*. */
7913 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
7914 else
7915 /* We completed a subexpression, but it may be part of
7916 an optional one, so do not update PREV_IDX_MATCH. */
7917 pmatch[reg_num].rm_eo = cur_idx;
7918 }
7919 }
7920 }
7921 }
7922
/* This function checks the STATE_LOG from SCTX->last_str_idx down to 0
   and sifts the nodes in each state according to the following rules.
   The updated state_log will be written to STATE_LOG.
7926
7927 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
7928 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
7929 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
7930 the LAST_NODE, we throw away the node `a'.
7931 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
7932 string `s' and transit to `b':
7933 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
7934 away the node `a'.
7935 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
7936 thrown away, we throw away the node `a'.
7937 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
7938 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
7939 node `a'.
7940 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
7941 we throw away the node `a'. */
7942
/* Nonzero if STATE is not NULL and its node set contains NODE.
   Note that STATE is evaluated twice.  */
#define STATE_NODE_CONTAINS(state,node) \
  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
7945
7946 static reg_errcode_t
7947 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)7948 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
7949 {
7950 reg_errcode_t err;
7951 int null_cnt = 0;
7952 int str_idx = sctx->last_str_idx;
7953 re_node_set cur_dest;
7954
7955 #ifdef DEBUG
7956 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
7957 #endif
7958
7959 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
7960 transit to the last_node and the last_node itself. */
7961 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
7962 if (BE (err != REG_NOERROR, 0))
7963 return err;
7964 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7965 if (BE (err != REG_NOERROR, 0))
7966 goto free_return;
7967
7968 /* Then check each states in the state_log. */
7969 while (str_idx > 0)
7970 {
7971 /* Update counters. */
7972 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
7973 if (null_cnt > mctx->max_mb_elem_len)
7974 {
7975 memset (sctx->sifted_states, '\0',
7976 sizeof (re_dfastate_t *) * str_idx);
7977 re_node_set_free (&cur_dest);
7978 return REG_NOERROR;
7979 }
7980 re_node_set_empty (&cur_dest);
7981 --str_idx;
7982
7983 if (mctx->state_log[str_idx])
7984 {
7985 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
7986 if (BE (err != REG_NOERROR, 0))
7987 goto free_return;
7988 }
7989
7990 /* Add all the nodes which satisfy the following conditions:
7991 - It can epsilon transit to a node in CUR_DEST.
7992 - It is in CUR_SRC.
7993 And update state_log. */
7994 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7995 if (BE (err != REG_NOERROR, 0))
7996 goto free_return;
7997 }
7998 err = REG_NOERROR;
7999 free_return:
8000 re_node_set_free (&cur_dest);
8001 return err;
8002 }
8003
8004 static reg_errcode_t
8005 internal_function
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * cur_dest)8006 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
8007 int str_idx, re_node_set *cur_dest)
8008 {
8009 const re_dfa_t *const dfa = mctx->dfa;
8010 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
8011 int i;
8012
8013 /* Then build the next sifted state.
8014 We build the next sifted state on `cur_dest', and update
8015 `sifted_states[str_idx]' with `cur_dest'.
8016 Note:
8017 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
8018 `cur_src' points the node_set of the old `state_log[str_idx]'
8019 (with the epsilon nodes pre-filtered out). */
8020 for (i = 0; i < cur_src->nelem; i++)
8021 {
8022 int prev_node = cur_src->elems[i];
8023 int naccepted = 0;
8024 int ret;
8025
8026 #ifdef DEBUG
8027 re_token_type_t type = dfa->nodes[prev_node].type;
8028 assert (!IS_EPSILON_NODE (type));
8029 #endif
8030 #ifdef RE_ENABLE_I18N
8031 /* If the node may accept `multi byte'. */
8032 if (dfa->nodes[prev_node].accept_mb)
8033 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
8034 str_idx, sctx->last_str_idx);
8035 #endif /* RE_ENABLE_I18N */
8036
8037 /* We don't check backreferences here.
8038 See update_cur_sifted_state(). */
8039 if (!naccepted
8040 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
8041 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
8042 dfa->nexts[prev_node]))
8043 naccepted = 1;
8044
8045 if (naccepted == 0)
8046 continue;
8047
8048 if (sctx->limits.nelem)
8049 {
8050 int to_idx = str_idx + naccepted;
8051 if (check_dst_limits (mctx, &sctx->limits,
8052 dfa->nexts[prev_node], to_idx,
8053 prev_node, str_idx))
8054 continue;
8055 }
8056 ret = re_node_set_insert (cur_dest, prev_node);
8057 if (BE (ret == -1, 0))
8058 return REG_ESPACE;
8059 }
8060
8061 return REG_NOERROR;
8062 }
8063
8064 /* Helper functions. */
8065
8066 static reg_errcode_t
8067 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,int next_state_log_idx)8068 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
8069 {
8070 int top = mctx->state_log_top;
8071
8072 if (next_state_log_idx >= mctx->input.bufs_len
8073 || (next_state_log_idx >= mctx->input.valid_len
8074 && mctx->input.valid_len < mctx->input.len))
8075 {
8076 reg_errcode_t err;
8077 err = extend_buffers (mctx);
8078 if (BE (err != REG_NOERROR, 0))
8079 return err;
8080 }
8081
8082 if (top < next_state_log_idx)
8083 {
8084 memset (mctx->state_log + top + 1, '\0',
8085 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
8086 mctx->state_log_top = next_state_log_idx;
8087 }
8088 return REG_NOERROR;
8089 }
8090
8091 static reg_errcode_t
8092 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,int num)8093 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
8094 re_dfastate_t **src, int num)
8095 {
8096 int st_idx;
8097 reg_errcode_t err;
8098 for (st_idx = 0; st_idx < num; ++st_idx)
8099 {
8100 if (dst[st_idx] == NULL)
8101 dst[st_idx] = src[st_idx];
8102 else if (src[st_idx] != NULL)
8103 {
8104 re_node_set merged_set;
8105 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
8106 &src[st_idx]->nodes);
8107 if (BE (err != REG_NOERROR, 0))
8108 return err;
8109 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
8110 re_node_set_free (&merged_set);
8111 if (BE (err != REG_NOERROR, 0))
8112 return err;
8113 }
8114 }
8115 return REG_NOERROR;
8116 }
8117
8118 static reg_errcode_t
8119 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * dest_nodes)8120 update_cur_sifted_state (const re_match_context_t *mctx,
8121 re_sift_context_t *sctx, int str_idx,
8122 re_node_set *dest_nodes)
8123 {
8124 const re_dfa_t *const dfa = mctx->dfa;
8125 reg_errcode_t err = REG_NOERROR;
8126 const re_node_set *candidates;
8127 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
8128 : &mctx->state_log[str_idx]->nodes);
8129
8130 if (dest_nodes->nelem == 0)
8131 sctx->sifted_states[str_idx] = NULL;
8132 else
8133 {
8134 if (candidates)
8135 {
8136 /* At first, add the nodes which can epsilon transit to a node in
8137 DEST_NODE. */
8138 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
8139 if (BE (err != REG_NOERROR, 0))
8140 return err;
8141
8142 /* Then, check the limitations in the current sift_context. */
8143 if (sctx->limits.nelem)
8144 {
8145 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
8146 mctx->bkref_ents, str_idx);
8147 if (BE (err != REG_NOERROR, 0))
8148 return err;
8149 }
8150 }
8151
8152 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
8153 if (BE (err != REG_NOERROR, 0))
8154 return err;
8155 }
8156
8157 if (candidates && mctx->state_log[str_idx]->has_backref)
8158 {
8159 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
8160 if (BE (err != REG_NOERROR, 0))
8161 return err;
8162 }
8163 return REG_NOERROR;
8164 }
8165
8166 static reg_errcode_t
8167 internal_function
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)8168 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
8169 const re_node_set *candidates)
8170 {
8171 reg_errcode_t err = REG_NOERROR;
8172 int i;
8173
8174 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
8175 if (BE (err != REG_NOERROR, 0))
8176 return err;
8177
8178 if (!state->inveclosure.alloc)
8179 {
8180 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
8181 if (BE (err != REG_NOERROR, 0))
8182 return REG_ESPACE;
8183 for (i = 0; i < dest_nodes->nelem; i++)
8184 re_node_set_merge (&state->inveclosure,
8185 dfa->inveclosures + dest_nodes->elems[i]);
8186 }
8187 return re_node_set_add_intersect (dest_nodes, candidates,
8188 &state->inveclosure);
8189 }
8190
8191 static reg_errcode_t
8192 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,int node,re_node_set * dest_nodes,const re_node_set * candidates)8193 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
8194 const re_node_set *candidates)
8195 {
8196 int ecl_idx;
8197 reg_errcode_t err;
8198 re_node_set *inv_eclosure = dfa->inveclosures + node;
8199 re_node_set except_nodes;
8200 re_node_set_init_empty (&except_nodes);
8201 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8202 {
8203 int cur_node = inv_eclosure->elems[ecl_idx];
8204 if (cur_node == node)
8205 continue;
8206 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
8207 {
8208 int edst1 = dfa->edests[cur_node].elems[0];
8209 int edst2 = ((dfa->edests[cur_node].nelem > 1)
8210 ? dfa->edests[cur_node].elems[1] : -1);
8211 if ((!re_node_set_contains (inv_eclosure, edst1)
8212 && re_node_set_contains (dest_nodes, edst1))
8213 || (edst2 > 0
8214 && !re_node_set_contains (inv_eclosure, edst2)
8215 && re_node_set_contains (dest_nodes, edst2)))
8216 {
8217 err = re_node_set_add_intersect (&except_nodes, candidates,
8218 dfa->inveclosures + cur_node);
8219 if (BE (err != REG_NOERROR, 0))
8220 {
8221 re_node_set_free (&except_nodes);
8222 return err;
8223 }
8224 }
8225 }
8226 }
8227 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8228 {
8229 int cur_node = inv_eclosure->elems[ecl_idx];
8230 if (!re_node_set_contains (&except_nodes, cur_node))
8231 {
8232 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
8233 re_node_set_remove_at (dest_nodes, idx);
8234 }
8235 }
8236 re_node_set_free (&except_nodes);
8237 return REG_NOERROR;
8238 }
8239
8240 static int
8241 internal_function
check_dst_limits(const re_match_context_t * mctx,re_node_set * limits,int dst_node,int dst_idx,int src_node,int src_idx)8242 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
8243 int dst_node, int dst_idx, int src_node, int src_idx)
8244 {
8245 const re_dfa_t *const dfa = mctx->dfa;
8246 int lim_idx, src_pos, dst_pos;
8247
8248 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
8249 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
8250 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8251 {
8252 int subexp_idx;
8253 struct re_backref_cache_entry *ent;
8254 ent = mctx->bkref_ents + limits->elems[lim_idx];
8255 subexp_idx = dfa->nodes[ent->node].opr.idx;
8256
8257 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8258 subexp_idx, dst_node, dst_idx,
8259 dst_bkref_idx);
8260 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8261 subexp_idx, src_node, src_idx,
8262 src_bkref_idx);
8263
8264 /* In case of:
8265 <src> <dst> ( <subexp> )
8266 ( <subexp> ) <src> <dst>
8267 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
8268 if (src_pos == dst_pos)
8269 continue; /* This is unrelated limitation. */
8270 else
8271 return 1;
8272 }
8273 return 0;
8274 }
8275
/* Classify FROM_NODE relative to sub-expression SUBEXP_IDX for a string
   position that sits on a boundary of that sub-expression's match.

   BOUNDARIES is a bit mask: bit 0 (value 1) enables the OP_OPEN_SUBEXP
   test and bit 1 (value 2) enables the OP_CLOSE_SUBEXP test.  BKREF_IDX
   is the index in mctx->bkref_ents of the first cache entry to scan for
   back references, or -1 to ignore back references.

   Every node in the epsilon closure of FROM_NODE is inspected:
     - an OP_OPEN_SUBEXP for SUBEXP_IDX yields -1 when bit 0 is set;
     - an OP_CLOSE_SUBEXP for SUBEXP_IDX yields 0 when bit 1 is set;
     - an OP_BACK_REF is followed through its first epsilon destination
       by a recursive call.
   When nothing decides the result, 1 is returned if bit 1 is set and 0
   otherwise.  */
static int
internal_function
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
                             int subexp_idx, int from_node, int bkref_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const re_node_set *eclosures = dfa->eclosures + from_node;
  int node_idx;

  /* Else, we are on the boundary: examine the nodes on the epsilon
     closure.  */
  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
    {
      int node = eclosures->elems[node_idx];
      switch (dfa->nodes[node].type)
        {
        case OP_BACK_REF:
          if (bkref_idx != -1)
            {
              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
              /* Walk the run of cache entries starting at BKREF_IDX; the run
                 ends at the first entry whose `more' flag is clear.  Each
                 `continue' below jumps to the loop condition, which also
                 advances ENT.  */
              do
                {
                  int dst, cpos;

                  /* Entry belongs to a different back-reference node.  */
                  if (ent->node != node)
                    continue;

                  /* Skip entries whose map no longer has the bit for
                     SUBEXP_IDX set (it is cleared at the bottom of this
                     loop).  Indexes that do not fit in the map are always
                     examined.  */
                  if (subexp_idx < BITSET_WORD_BITS
                      && !(ent->eps_reachable_subexps_map
                           & ((bitset_word_t) 1 << subexp_idx)))
                    continue;

                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
                     OP_CLOSE_SUBEXP cases below.  But, if the
                     destination node is the same node as the source
                     node, don't recurse because it would cause an
                     infinite loop: a regex that exhibits this behavior
                     is ()\1*\1*  */
                  dst = dfa->edests[node].elems[0];
                  if (dst == from_node)
                    {
                      if (boundaries & 1)
                        return -1;
                      else /* if (boundaries & 2) */
                        return 0;
                    }

                  cpos =
                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
                                                 dst, bkref_idx);
                  if (cpos == -1 /* && (boundaries & 1) */)
                    return -1;
                  if (cpos == 0 && (boundaries & 2))
                    return 0;

                  /* The recursion decided nothing through this entry: clear
                     the bit for SUBEXP_IDX so the test above skips the entry
                     on later calls.  */
                  if (subexp_idx < BITSET_WORD_BITS)
                    ent->eps_reachable_subexps_map
                      &= ~((bitset_word_t) 1 << subexp_idx);
                }
              while (ent++->more);
            }
          break;

        case OP_OPEN_SUBEXP:
          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
            return -1;
          break;

        case OP_CLOSE_SUBEXP:
          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
            return 0;
          break;

        default:
            break;
        }
    }

  /* No decisive node found in the closure.  */
  return (boundaries & 2) ? 1 : 0;
}
8356
8357 static int
8358 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,int limit,int subexp_idx,int from_node,int str_idx,int bkref_idx)8359 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
8360 int subexp_idx, int from_node, int str_idx,
8361 int bkref_idx)
8362 {
8363 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
8364 int boundaries;
8365
8366 /* If we are outside the range of the subexpression, return -1 or 1. */
8367 if (str_idx < lim->subexp_from)
8368 return -1;
8369
8370 if (lim->subexp_to < str_idx)
8371 return 1;
8372
8373 /* If we are within the subexpression, return 0. */
8374 boundaries = (str_idx == lim->subexp_from);
8375 boundaries |= (str_idx == lim->subexp_to) << 1;
8376 if (boundaries == 0)
8377 return 0;
8378
8379 /* Else, examine epsilon closure. */
8380 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8381 from_node, bkref_idx);
8382 }
8383
/* Check the limitations of sub expressions LIMITS, and remove the nodes
   which are against limitations from DEST_NODES.

   LIMITS holds indexes into BKREF_ENTS.  An entry is ignored when STR_IDX
   is at or before its `subexp_from' or after its `str_idx'.  CANDIDATES is
   passed through to sub_epsilon_src_nodes(), which performs the removals.

   Returns REG_NOERROR, or the first error from sub_epsilon_src_nodes().  */

static reg_errcode_t
internal_function
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
                     const re_node_set *candidates, re_node_set *limits,
                     struct re_backref_cache_entry *bkref_ents, int str_idx)
{
  reg_errcode_t err;
  int node_idx, lim_idx;

  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
    {
      int subexp_idx;
      struct re_backref_cache_entry *ent;
      ent = bkref_ents + limits->elems[lim_idx];

      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
        continue; /* This is unrelated limitation.  */

      subexp_idx = dfa->nodes[ent->node].opr.idx;
      if (ent->subexp_to == str_idx)
        {
          /* Find the open/close nodes of this sub-expression that are
             currently in DEST_NODES; -1 means "not present".  If several
             match, the last one seen wins.  */
          int ops_node = -1;
          int cls_node = -1;
          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
            {
              int node = dest_nodes->elems[node_idx];
              re_token_type_t type = dfa->nodes[node].type;
              if (type == OP_OPEN_SUBEXP
                  && subexp_idx == dfa->nodes[node].opr.idx)
                ops_node = node;
              else if (type == OP_CLOSE_SUBEXP
                       && subexp_idx == dfa->nodes[node].opr.idx)
                cls_node = node;
            }

          /* Check the limitation of the open subexpression.  */
          /* Here ent->subexp_to == str_idx, and str_idx != ent->subexp_from
             because of the range test at the top of the loop.  */
          if (ops_node >= 0)
            {
              err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
                                           candidates);
              if (BE (err != REG_NOERROR, 0))
                return err;
            }

          /* Check the limitation of the close subexpression.  */
          if (cls_node >= 0)
            for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
              {
                int node = dest_nodes->elems[node_idx];
                /* A node is kept only if the close node appears in its
                   epsilon closure or in its inverse epsilon closure.  */
                if (!re_node_set_contains (dfa->inveclosures + node,
                                           cls_node)
                    && !re_node_set_contains (dfa->eclosures + node,
                                              cls_node))
                  {
                    /* It is against this limitation.
                       Remove it from the current sifted state.  */
                    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
                                                 candidates);
                    if (BE (err != REG_NOERROR, 0))
                      return err;
                    /* DEST_NODES may have shrunk under us; step back so the
                       element now at this index is examined next.  */
                    --node_idx;
                  }
              }
        }
      else /* (ent->subexp_to != str_idx)  */
        {
          /* Strictly inside the span: no open/close node of this
             sub-expression may remain in DEST_NODES.  */
          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
            {
              int node = dest_nodes->elems[node_idx];
              re_token_type_t type = dfa->nodes[node].type;
              if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
                {
                  if (subexp_idx != dfa->nodes[node].opr.idx)
                    continue;
                  /* It is against this limitation.
                     Remove it from the current sifted state.  */
                  /* NOTE(review): unlike the loop above, node_idx is not
                     stepped back after this removal -- confirm that no
                     element of DEST_NODES can be skipped as a result.  */
                  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
                                               candidates);
                  if (BE (err != REG_NOERROR, 0))
                    return err;
                }
            }
        }
    }
  return REG_NOERROR;
}
8474
/* Sift the back references found among CANDIDATES at string position
   STR_IDX.

   For every OP_BACK_REF node in CANDIDATES, each cache entry recorded for
   that node at STR_IDX is tried: if the entry's destination node is part
   of the sifted state at the position the back reference would jump to,
   and the jump does not violate the current limits, the entry is added as
   a limit to a local copy of SCTX and sift_states_backward() is run on
   that copy.  When sctx->limited_states is not NULL, the result is merged
   into it.

   Returns REG_NOERROR, REG_ESPACE when a limit cannot be inserted, or the
   first error reported by a helper.  */
static reg_errcode_t
internal_function
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
                   int str_idx, const re_node_set *candidates)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  int node_idx, node;
  re_sift_context_t local_sctx;
  int first_idx = search_cur_bkref_entry (mctx, str_idx);

  /* No cache entry for this position: nothing to sift.  */
  if (first_idx == -1)
    return REG_NOERROR;

  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */

  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
    {
      int enabled_idx;
      re_token_type_t type;
      struct re_backref_cache_entry *entry;
      node = candidates->elems[node_idx];
      type = dfa->nodes[node].type;
      /* Avoid infinite loop for the REs like "()\1+".  */
      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
        continue;
      if (type != OP_BACK_REF)
        continue;

      /* ENTRY and ENABLED_IDX advance together over the run of cache
         entries for STR_IDX; both are stepped by the loop condition, which
         every `continue' below also reaches.  */
      entry = mctx->bkref_ents + first_idx;
      enabled_idx = first_idx;
      do
        {
          int subexp_len;
          int to_idx;
          int dst_node;
          int ret;
          re_dfastate_t *cur_state;

          if (entry->node != node)
            continue;
          subexp_len = entry->subexp_to - entry->subexp_from;
          to_idx = str_idx + subexp_len;
          /* An empty match leaves the position unchanged, so the first
             epsilon destination is used instead of the `nexts' node.  */
          dst_node = (subexp_len ? dfa->nexts[node]
                      : dfa->edests[node].elems[0]);

          if (to_idx > sctx->last_str_idx
              || sctx->sifted_states[to_idx] == NULL
              || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
              || check_dst_limits (mctx, &sctx->limits, node,
                                   str_idx, dst_node, to_idx))
            continue;

          if (local_sctx.sifted_states == NULL)
            {
              /* First use: the struct copy shares the sifted_states pointer
                 with SCTX, while the limits set gets its own copy so it can
                 be modified and freed independently.  */
              local_sctx = *sctx;
              err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
              if (BE (err != REG_NOERROR, 0))
                goto free_return;
            }
          local_sctx.last_node = node;
          local_sctx.last_str_idx = str_idx;
          ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
          if (BE (ret < 0, 0))
            {
              err = REG_ESPACE;
              goto free_return;
            }
          /* Remember the state at STR_IDX; it is put back after the
             nested sift below.  */
          cur_state = local_sctx.sifted_states[str_idx];
          err = sift_states_backward (mctx, &local_sctx);
          if (BE (err != REG_NOERROR, 0))
            goto free_return;
          if (sctx->limited_states != NULL)
            {
              err = merge_state_array (dfa, sctx->limited_states,
                                       local_sctx.sifted_states,
                                       str_idx + 1);
              if (BE (err != REG_NOERROR, 0))
                goto free_return;
            }
          local_sctx.sifted_states[str_idx] = cur_state;
          /* This entry is a limit only for the nested sift just done.  */
          re_node_set_remove (&local_sctx.limits, enabled_idx);

          /* mctx->bkref_ents may have changed, reload the pointer.  */
          entry = mctx->bkref_ents + enabled_idx;
        }
      while (enabled_idx++, entry++->more);
    }
  err = REG_NOERROR;
 free_return:
  /* The limits copy exists only if local_sctx was initialized.  */
  if (local_sctx.sifted_states != NULL)
    {
      re_node_set_free (&local_sctx.limits);
    }

  return err;
}
8572
8573
8574 #ifdef RE_ENABLE_I18N
8575 static int
8576 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,int node_idx,int str_idx,int max_str_idx)8577 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
8578 int node_idx, int str_idx, int max_str_idx)
8579 {
8580 const re_dfa_t *const dfa = mctx->dfa;
8581 int naccepted;
8582 /* Check the node can accept `multi byte'. */
8583 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
8584 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
8585 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
8586 dfa->nexts[node_idx]))
8587 /* The node can't accept the `multi byte', or the
8588 destination was already thrown away, then the node
8589 could't accept the current input `multi byte'. */
8590 naccepted = 0;
8591 /* Otherwise, it is sure that the node could accept
8592 `naccepted' bytes input. */
8593 return naccepted;
8594 }
8595 #endif /* RE_ENABLE_I18N */
8596
8597
8598 /* Functions for state transition. */
8599
8600 /* Return the next state to which the current state STATE will transit by
8601 accepting the current input byte, and update STATE_LOG if necessary.
8602 If STATE can accept a multibyte char/collating element/back reference
8603 update the destination of STATE_LOG. */
8604
8605 static re_dfastate_t *
8606 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)8607 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
8608 re_dfastate_t *state)
8609 {
8610 re_dfastate_t **trtable;
8611 unsigned char ch;
8612
8613 #ifdef RE_ENABLE_I18N
8614 /* If the current state can accept multibyte. */
8615 if (BE (state->accept_mb, 0))
8616 {
8617 *err = transit_state_mb (mctx, state);
8618 if (BE (*err != REG_NOERROR, 0))
8619 return NULL;
8620 }
8621 #endif /* RE_ENABLE_I18N */
8622
8623 /* Then decide the next state with the single byte. */
8624 #if 0
8625 if (0)
8626 /* don't use transition table */
8627 return transit_state_sb (err, mctx, state);
8628 #endif
8629
8630 /* Use transition table */
8631 ch = re_string_fetch_byte (&mctx->input);
8632 for (;;)
8633 {
8634 trtable = state->trtable;
8635 if (BE (trtable != NULL, 1))
8636 return trtable[ch];
8637
8638 trtable = state->word_trtable;
8639 if (BE (trtable != NULL, 1))
8640 {
8641 unsigned int context;
8642 context
8643 = re_string_context_at (&mctx->input,
8644 re_string_cur_idx (&mctx->input) - 1,
8645 mctx->eflags);
8646 if (IS_WORD_CONTEXT (context))
8647 return trtable[ch + SBC_MAX];
8648 else
8649 return trtable[ch];
8650 }
8651
8652 if (!build_trtable (mctx->dfa, state))
8653 {
8654 *err = REG_ESPACE;
8655 return NULL;
8656 }
8657
8658 /* Retry, we now have a transition table. */
8659 }
8660 }
8661
8662 /* Update the state_log if we need */
8663 re_dfastate_t *
8664 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)8665 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
8666 re_dfastate_t *next_state)
8667 {
8668 const re_dfa_t *const dfa = mctx->dfa;
8669 int cur_idx = re_string_cur_idx (&mctx->input);
8670
8671 if (cur_idx > mctx->state_log_top)
8672 {
8673 mctx->state_log[cur_idx] = next_state;
8674 mctx->state_log_top = cur_idx;
8675 }
8676 else if (mctx->state_log[cur_idx] == 0)
8677 {
8678 mctx->state_log[cur_idx] = next_state;
8679 }
8680 else
8681 {
8682 re_dfastate_t *pstate;
8683 unsigned int context;
8684 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
8685 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
8686 the destination of a multibyte char/collating element/
8687 back reference. Then the next state is the union set of
8688 these destinations and the results of the transition table. */
8689 pstate = mctx->state_log[cur_idx];
8690 log_nodes = pstate->entrance_nodes;
8691 if (next_state != NULL)
8692 {
8693 table_nodes = next_state->entrance_nodes;
8694 *err = re_node_set_init_union (&next_nodes, table_nodes,
8695 log_nodes);
8696 if (BE (*err != REG_NOERROR, 0))
8697 return NULL;
8698 }
8699 else
8700 next_nodes = *log_nodes;
8701 /* Note: We already add the nodes of the initial state,
8702 then we don't need to add them here. */
8703
8704 context = re_string_context_at (&mctx->input,
8705 re_string_cur_idx (&mctx->input) - 1,
8706 mctx->eflags);
8707 next_state = mctx->state_log[cur_idx]
8708 = re_acquire_state_context (err, dfa, &next_nodes, context);
8709 /* We don't need to check errors here, since the return value of
8710 this function is next_state and ERR is already set. */
8711
8712 if (table_nodes != NULL)
8713 re_node_set_free (&next_nodes);
8714 }
8715
8716 if (BE (dfa->nbackref, 0) && next_state != NULL)
8717 {
8718 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
8719 later. We must check them here, since the back references in the
8720 next state might use them. */
8721 *err = check_subexp_matching_top (mctx, &next_state->nodes,
8722 cur_idx);
8723 if (BE (*err != REG_NOERROR, 0))
8724 return NULL;
8725
8726 /* If the next state has back references. */
8727 if (next_state->has_backref)
8728 {
8729 *err = transit_state_bkref (mctx, &next_state->nodes);
8730 if (BE (*err != REG_NOERROR, 0))
8731 return NULL;
8732 next_state = mctx->state_log[cur_idx];
8733 }
8734 }
8735
8736 return next_state;
8737 }
8738
8739 /* Skip bytes in the input that correspond to part of a
8740 multi-byte match, then look in the log for a state
8741 from which to restart matching. */
8742 re_dfastate_t *
8743 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)8744 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
8745 {
8746 re_dfastate_t *cur_state;
8747 do
8748 {
8749 int max = mctx->state_log_top;
8750 int cur_str_idx = re_string_cur_idx (&mctx->input);
8751
8752 do
8753 {
8754 if (++cur_str_idx > max)
8755 return NULL;
8756 re_string_skip_bytes (&mctx->input, 1);
8757 }
8758 while (mctx->state_log[cur_str_idx] == NULL);
8759
8760 cur_state = merge_state_with_log (err, mctx, NULL);
8761 }
8762 while (*err == REG_NOERROR && cur_state == NULL);
8763 return cur_state;
8764 }
8765
8766 /* Helper functions for transit_state. */
8767
8768 /* From the node set CUR_NODES, pick up the nodes whose types are
8769 OP_OPEN_SUBEXP and which have corresponding back references in the regular
8770 expression. And register them to use them later for evaluating the
8771 correspoding back references. */
8772
8773 static reg_errcode_t
8774 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,int str_idx)8775 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
8776 int str_idx)
8777 {
8778 const re_dfa_t *const dfa = mctx->dfa;
8779 int node_idx;
8780 reg_errcode_t err;
8781
8782 /* TODO: This isn't efficient.
8783 Because there might be more than one nodes whose types are
8784 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
8785 nodes.
8786 E.g. RE: (a){2} */
8787 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
8788 {
8789 int node = cur_nodes->elems[node_idx];
8790 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
8791 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
8792 && (dfa->used_bkref_map
8793 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
8794 {
8795 err = match_ctx_add_subtop (mctx, node, str_idx);
8796 if (BE (err != REG_NOERROR, 0))
8797 return err;
8798 }
8799 }
8800 return REG_NOERROR;
8801 }
8802
8803 #if 0
/* Return the next state to which the current state STATE will transit by
   accepting the current input byte.

   This variant does not use a transition table: the next node set is
   computed directly from the nodes of STATE.  It sits inside `#if 0' and
   is therefore not compiled.  On failure NULL is returned with *ERR
   set.  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
                  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  re_node_set next_nodes;
  re_dfastate_t *next_state;
  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
  unsigned int context;

  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;
  /* Collect the epsilon closure of the successor of every node that
     accepts the input at the current index.  */
  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
    {
      int cur_node = state->nodes.elems[node_cnt];
      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
        {
          *err = re_node_set_merge (&next_nodes,
                                    dfa->eclosures + dfa->nexts[cur_node]);
          if (BE (*err != REG_NOERROR, 0))
            {
              re_node_set_free (&next_nodes);
              return NULL;
            }
        }
    }
  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
  /* We don't need to check errors here, since the return value of
     this function is next_state and ERR is already set.  */

  re_node_set_free (&next_nodes);
  /* Consume the byte that was just matched.  */
  re_string_skip_bytes (&mctx->input, 1);
  return next_state;
}
8843 #endif
8844
8845 #ifdef RE_ENABLE_I18N
8846 static reg_errcode_t
8847 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)8848 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
8849 {
8850 const re_dfa_t *const dfa = mctx->dfa;
8851 reg_errcode_t err;
8852 int i;
8853
8854 for (i = 0; i < pstate->nodes.nelem; ++i)
8855 {
8856 re_node_set dest_nodes, *new_nodes;
8857 int cur_node_idx = pstate->nodes.elems[i];
8858 int naccepted, dest_idx;
8859 unsigned int context;
8860 re_dfastate_t *dest_state;
8861
8862 if (!dfa->nodes[cur_node_idx].accept_mb)
8863 continue;
8864
8865 if (dfa->nodes[cur_node_idx].constraint)
8866 {
8867 context = re_string_context_at (&mctx->input,
8868 re_string_cur_idx (&mctx->input),
8869 mctx->eflags);
8870 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
8871 context))
8872 continue;
8873 }
8874
8875 /* How many bytes the node can accept? */
8876 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
8877 re_string_cur_idx (&mctx->input));
8878 if (naccepted == 0)
8879 continue;
8880
8881 /* The node can accepts `naccepted' bytes. */
8882 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
8883 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
8884 : mctx->max_mb_elem_len);
8885 err = clean_state_log_if_needed (mctx, dest_idx);
8886 if (BE (err != REG_NOERROR, 0))
8887 return err;
8888 #ifdef DEBUG
8889 assert (dfa->nexts[cur_node_idx] != -1);
8890 #endif
8891 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
8892
8893 dest_state = mctx->state_log[dest_idx];
8894 if (dest_state == NULL)
8895 dest_nodes = *new_nodes;
8896 else
8897 {
8898 err = re_node_set_init_union (&dest_nodes,
8899 dest_state->entrance_nodes, new_nodes);
8900 if (BE (err != REG_NOERROR, 0))
8901 return err;
8902 }
8903 context = re_string_context_at (&mctx->input, dest_idx - 1,
8904 mctx->eflags);
8905 mctx->state_log[dest_idx]
8906 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
8907 if (dest_state != NULL)
8908 re_node_set_free (&dest_nodes);
8909 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
8910 return err;
8911 }
8912 return REG_NOERROR;
8913 }
8914 #endif /* RE_ENABLE_I18N */
8915
/* Expand the back references contained in NODES at the current input
   index.

   For each OP_BACK_REF node whose constraint (if any) is satisfied,
   get_subexp() is asked to register the candidate matches in the
   back-reference cache.  For every cache entry added by that call that
   belongs to this node and this index, the relevant epsilon closure is
   merged into the state log at the index just past the referenced
   substring.  A back reference that matched the empty string and enlarged
   the state at the current index is followed recursively.

   Returns REG_NOERROR or the first error from a helper.  */
static reg_errcode_t
internal_function
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  int i;
  int cur_str_idx = re_string_cur_idx (&mctx->input);

  for (i = 0; i < nodes->nelem; ++i)
    {
      int dest_str_idx, prev_nelem, bkc_idx;
      int node_idx = nodes->elems[i];
      unsigned int context;
      const re_token_t *node = dfa->nodes + node_idx;
      re_node_set *new_dest_nodes;

      /* Check whether `node' is a backreference or not.  */
      if (node->type != OP_BACK_REF)
        continue;

      if (node->constraint)
        {
          context = re_string_context_at (&mctx->input, cur_str_idx,
                                          mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
            continue;
        }

      /* `node' is a backreference.
         Check the substring which the sub-expression matched.  */
      /* Remember the cache size so that only the entries added by
         get_subexp() are visited below.  */
      bkc_idx = mctx->nbkref_ents;
      err = get_subexp (mctx, node_idx, cur_str_idx);
      if (BE (err != REG_NOERROR, 0))
        goto free_return;

      /* And add the epsilon closures (which is `new_dest_nodes') of
         the backreference to appropriate state_log.  */
#ifdef DEBUG
      assert (dfa->nexts[node_idx] != -1);
#endif
      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
        {
          int subexp_len;
          re_dfastate_t *dest_state;
          struct re_backref_cache_entry *bkref_ent;
          bkref_ent = mctx->bkref_ents + bkc_idx;
          if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
            continue;
          subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
          /* An empty match acts as an epsilon transition, so the closure of
             the first epsilon destination is used instead of the closure of
             the `nexts' node.  */
          new_dest_nodes = (subexp_len == 0
                            ? dfa->eclosures + dfa->edests[node_idx].elems[0]
                            : dfa->eclosures + dfa->nexts[node_idx]);
          dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
                          - bkref_ent->subexp_from);
          context = re_string_context_at (&mctx->input, dest_str_idx - 1,
                                          mctx->eflags);
          dest_state = mctx->state_log[dest_str_idx];
          /* Node count at the current index before the update; used below
             to detect whether the state there has grown.  */
          prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
                        : mctx->state_log[cur_str_idx]->nodes.nelem);
          /* Add `new_dest_node' to state_log.  */
          if (dest_state == NULL)
            {
              mctx->state_log[dest_str_idx]
                = re_acquire_state_context (&err, dfa, new_dest_nodes,
                                            context);
              if (BE (mctx->state_log[dest_str_idx] == NULL
                      && err != REG_NOERROR, 0))
                goto free_return;
            }
          else
            {
              re_node_set dest_nodes;
              err = re_node_set_init_union (&dest_nodes,
                                            dest_state->entrance_nodes,
                                            new_dest_nodes);
              if (BE (err != REG_NOERROR, 0))
                {
                  re_node_set_free (&dest_nodes);
                  goto free_return;
                }
              mctx->state_log[dest_str_idx]
                = re_acquire_state_context (&err, dfa, &dest_nodes, context);
              re_node_set_free (&dest_nodes);
              if (BE (mctx->state_log[dest_str_idx] == NULL
                      && err != REG_NOERROR, 0))
                goto free_return;
            }
          /* We need to check recursively if the backreference can epsilon
             transit.  */
          if (subexp_len == 0
              && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
            {
              err = check_subexp_matching_top (mctx, new_dest_nodes,
                                               cur_str_idx);
              if (BE (err != REG_NOERROR, 0))
                goto free_return;
              err = transit_state_bkref (mctx, new_dest_nodes);
              if (BE (err != REG_NOERROR, 0))
                goto free_return;
            }
        }
    }
  err = REG_NOERROR;
 free_return:
  return err;
}
9023
/* Enumerate all the candidates which the backreference BKREF_NODE can match
   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
   Note that we might collect inappropriate candidates here.
   However, checking them strictly at this point would be too costly, so
   that checking is deferred to prune_impossible_nodes().  */
9029
9030 static reg_errcode_t
9031 internal_function
get_subexp(re_match_context_t * mctx,int bkref_node,int bkref_str_idx)9032 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
9033 {
9034 const re_dfa_t *const dfa = mctx->dfa;
9035 int subexp_num, sub_top_idx;
9036 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
9037 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
9038 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
9039 if (cache_idx != -1)
9040 {
9041 const struct re_backref_cache_entry *entry
9042 = mctx->bkref_ents + cache_idx;
9043 do
9044 if (entry->node == bkref_node)
9045 return REG_NOERROR; /* We already checked it. */
9046 while (entry++->more);
9047 }
9048
9049 subexp_num = dfa->nodes[bkref_node].opr.idx;
9050
9051 /* For each sub expression */
9052 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
9053 {
9054 reg_errcode_t err;
9055 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
9056 re_sub_match_last_t *sub_last;
9057 int sub_last_idx, sl_str, bkref_str_off;
9058
9059 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
9060 continue; /* It isn't related. */
9061
9062 sl_str = sub_top->str_idx;
9063 bkref_str_off = bkref_str_idx;
9064 /* At first, check the last node of sub expressions we already
9065 evaluated. */
9066 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
9067 {
9068 int sl_str_diff;
9069 sub_last = sub_top->lasts[sub_last_idx];
9070 sl_str_diff = sub_last->str_idx - sl_str;
9071 /* The matched string by the sub expression match with the substring
9072 at the back reference? */
9073 if (sl_str_diff > 0)
9074 {
9075 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
9076 {
9077 /* Not enough chars for a successful match. */
9078 if (bkref_str_off + sl_str_diff > mctx->input.len)
9079 break;
9080
9081 err = clean_state_log_if_needed (mctx,
9082 bkref_str_off
9083 + sl_str_diff);
9084 if (BE (err != REG_NOERROR, 0))
9085 return err;
9086 buf = (const char *) re_string_get_buffer (&mctx->input);
9087 }
9088 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
9089 /* We don't need to search this sub expression any more. */
9090 break;
9091 }
9092 bkref_str_off += sl_str_diff;
9093 sl_str += sl_str_diff;
9094 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9095 bkref_str_idx);
9096
9097 /* Reload buf, since the preceding call might have reallocated
9098 the buffer. */
9099 buf = (const char *) re_string_get_buffer (&mctx->input);
9100
9101 if (err == REG_NOMATCH)
9102 continue;
9103 if (BE (err != REG_NOERROR, 0))
9104 return err;
9105 }
9106
9107 if (sub_last_idx < sub_top->nlasts)
9108 continue;
9109 if (sub_last_idx > 0)
9110 ++sl_str;
9111 /* Then, search for the other last nodes of the sub expression. */
9112 for (; sl_str <= bkref_str_idx; ++sl_str)
9113 {
9114 int cls_node, sl_str_off;
9115 const re_node_set *nodes;
9116 sl_str_off = sl_str - sub_top->str_idx;
9117 /* The matched string by the sub expression match with the substring
9118 at the back reference? */
9119 if (sl_str_off > 0)
9120 {
9121 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
9122 {
9123 /* If we are at the end of the input, we cannot match. */
9124 if (bkref_str_off >= mctx->input.len)
9125 break;
9126
9127 err = extend_buffers (mctx);
9128 if (BE (err != REG_NOERROR, 0))
9129 return err;
9130
9131 buf = (const char *) re_string_get_buffer (&mctx->input);
9132 }
9133 if (buf [bkref_str_off++] != buf[sl_str - 1])
9134 break; /* We don't need to search this sub expression
9135 any more. */
9136 }
9137 if (mctx->state_log[sl_str] == NULL)
9138 continue;
9139 /* Does this state have a ')' of the sub expression? */
9140 nodes = &mctx->state_log[sl_str]->nodes;
9141 cls_node = find_subexp_node (dfa, nodes, subexp_num,
9142 OP_CLOSE_SUBEXP);
9143 if (cls_node == -1)
9144 continue; /* No. */
9145 if (sub_top->path == NULL)
9146 {
9147 sub_top->path = calloc (sizeof (state_array_t),
9148 sl_str - sub_top->str_idx + 1);
9149 if (sub_top->path == NULL)
9150 return REG_ESPACE;
9151 }
9152 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
9153 in the current context? */
9154 err = check_arrival (mctx, sub_top->path, sub_top->node,
9155 sub_top->str_idx, cls_node, sl_str,
9156 OP_CLOSE_SUBEXP);
9157 if (err == REG_NOMATCH)
9158 continue;
9159 if (BE (err != REG_NOERROR, 0))
9160 return err;
9161 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
9162 if (BE (sub_last == NULL, 0))
9163 return REG_ESPACE;
9164 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9165 bkref_str_idx);
9166 if (err == REG_NOMATCH)
9167 continue;
9168 }
9169 }
9170 return REG_NOERROR;
9171 }
9172
9173 /* Helper functions for get_subexp(). */
9174
9175 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
9176 If it can arrive, register the sub expression expressed with SUB_TOP
9177 and SUB_LAST. */
9178
9179 static reg_errcode_t
9180 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,int bkref_node,int bkref_str)9181 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
9182 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
9183 {
9184 reg_errcode_t err;
9185 int to_idx;
9186 /* Can the subexpression arrive the back reference? */
9187 err = check_arrival (mctx, &sub_last->path, sub_last->node,
9188 sub_last->str_idx, bkref_node, bkref_str,
9189 OP_OPEN_SUBEXP);
9190 if (err != REG_NOERROR)
9191 return err;
9192 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
9193 sub_last->str_idx);
9194 if (BE (err != REG_NOERROR, 0))
9195 return err;
9196 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
9197 return clean_state_log_if_needed (mctx, to_idx);
9198 }
9199
9200 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
9201 Search '(' if FL_OPEN, or search ')' otherwise.
9202 TODO: This function isn't efficient...
9203 Because there might be more than one nodes whose types are
9204 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
9205 nodes.
9206 E.g. RE: (a){2} */
9207
9208 static int
9209 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,int subexp_idx,int type)9210 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
9211 int subexp_idx, int type)
9212 {
9213 int cls_idx;
9214 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
9215 {
9216 int cls_node = nodes->elems[cls_idx];
9217 const re_token_t *node = dfa->nodes + cls_node;
9218 if (node->type == type
9219 && node->opr.idx == subexp_idx)
9220 return cls_node;
9221 }
9222 return -1;
9223 }
9224
9225 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
9226 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
9227 heavily reused.
9228 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
9229
9230 static reg_errcode_t
9231 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,int top_node,int top_str,int last_node,int last_str,int type)9232 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
9233 int top_str, int last_node, int last_str, int type)
9234 {
9235 const re_dfa_t *const dfa = mctx->dfa;
9236 reg_errcode_t err = REG_NOERROR;
9237 int subexp_num, backup_cur_idx, str_idx, null_cnt;
9238 re_dfastate_t *cur_state = NULL;
9239 re_node_set *cur_nodes, next_nodes;
9240 re_dfastate_t **backup_state_log;
9241 unsigned int context;
9242
9243 subexp_num = dfa->nodes[top_node].opr.idx;
9244 /* Extend the buffer if we need. */
9245 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
9246 {
9247 re_dfastate_t **new_array;
9248 int old_alloc = path->alloc;
9249 path->alloc += last_str + mctx->max_mb_elem_len + 1;
9250 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
9251 if (BE (new_array == NULL, 0))
9252 {
9253 path->alloc = old_alloc;
9254 return REG_ESPACE;
9255 }
9256 path->array = new_array;
9257 memset (new_array + old_alloc, '\0',
9258 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
9259 }
9260
9261 str_idx = path->next_idx ? path->next_idx : top_str;
9262
9263 /* Temporary modify MCTX. */
9264 backup_state_log = mctx->state_log;
9265 backup_cur_idx = mctx->input.cur_idx;
9266 mctx->state_log = path->array;
9267 mctx->input.cur_idx = str_idx;
9268
9269 /* Setup initial node set. */
9270 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9271 if (str_idx == top_str)
9272 {
9273 err = re_node_set_init_1 (&next_nodes, top_node);
9274 if (BE (err != REG_NOERROR, 0))
9275 return err;
9276 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9277 if (BE (err != REG_NOERROR, 0))
9278 {
9279 re_node_set_free (&next_nodes);
9280 return err;
9281 }
9282 }
9283 else
9284 {
9285 cur_state = mctx->state_log[str_idx];
9286 if (cur_state && cur_state->has_backref)
9287 {
9288 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
9289 if (BE (err != REG_NOERROR, 0))
9290 return err;
9291 }
9292 else
9293 re_node_set_init_empty (&next_nodes);
9294 }
9295 if (str_idx == top_str || (cur_state && cur_state->has_backref))
9296 {
9297 if (next_nodes.nelem)
9298 {
9299 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9300 subexp_num, type);
9301 if (BE (err != REG_NOERROR, 0))
9302 {
9303 re_node_set_free (&next_nodes);
9304 return err;
9305 }
9306 }
9307 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9308 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9309 {
9310 re_node_set_free (&next_nodes);
9311 return err;
9312 }
9313 mctx->state_log[str_idx] = cur_state;
9314 }
9315
9316 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
9317 {
9318 re_node_set_empty (&next_nodes);
9319 if (mctx->state_log[str_idx + 1])
9320 {
9321 err = re_node_set_merge (&next_nodes,
9322 &mctx->state_log[str_idx + 1]->nodes);
9323 if (BE (err != REG_NOERROR, 0))
9324 {
9325 re_node_set_free (&next_nodes);
9326 return err;
9327 }
9328 }
9329 if (cur_state)
9330 {
9331 err = check_arrival_add_next_nodes (mctx, str_idx,
9332 &cur_state->non_eps_nodes,
9333 &next_nodes);
9334 if (BE (err != REG_NOERROR, 0))
9335 {
9336 re_node_set_free (&next_nodes);
9337 return err;
9338 }
9339 }
9340 ++str_idx;
9341 if (next_nodes.nelem)
9342 {
9343 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9344 if (BE (err != REG_NOERROR, 0))
9345 {
9346 re_node_set_free (&next_nodes);
9347 return err;
9348 }
9349 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9350 subexp_num, type);
9351 if (BE (err != REG_NOERROR, 0))
9352 {
9353 re_node_set_free (&next_nodes);
9354 return err;
9355 }
9356 }
9357 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9358 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9359 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9360 {
9361 re_node_set_free (&next_nodes);
9362 return err;
9363 }
9364 mctx->state_log[str_idx] = cur_state;
9365 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
9366 }
9367 re_node_set_free (&next_nodes);
9368 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
9369 : &mctx->state_log[last_str]->nodes);
9370 path->next_idx = str_idx;
9371
9372 /* Fix MCTX. */
9373 mctx->state_log = backup_state_log;
9374 mctx->input.cur_idx = backup_cur_idx;
9375
9376 /* Then check the current node set has the node LAST_NODE. */
9377 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
9378 return REG_NOERROR;
9379
9380 return REG_NOMATCH;
9381 }
9382
9383 /* Helper functions for check_arrival. */
9384
9385 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
9386 to NEXT_NODES.
9387 TODO: This function is similar to the functions transit_state*(),
9388 however this function has many additional works.
9389 Can't we unify them? */
9390
9391 static reg_errcode_t
9392 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,int str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)9393 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
9394 re_node_set *cur_nodes, re_node_set *next_nodes)
9395 {
9396 const re_dfa_t *const dfa = mctx->dfa;
9397 int result;
9398 int cur_idx;
9399 reg_errcode_t err = REG_NOERROR;
9400 re_node_set union_set;
9401 re_node_set_init_empty (&union_set);
9402 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
9403 {
9404 int naccepted = 0;
9405 int cur_node = cur_nodes->elems[cur_idx];
9406 #ifdef DEBUG
9407 re_token_type_t type = dfa->nodes[cur_node].type;
9408 assert (!IS_EPSILON_NODE (type));
9409 #endif
9410 #ifdef RE_ENABLE_I18N
9411 /* If the node may accept `multi byte'. */
9412 if (dfa->nodes[cur_node].accept_mb)
9413 {
9414 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
9415 str_idx);
9416 if (naccepted > 1)
9417 {
9418 re_dfastate_t *dest_state;
9419 int next_node = dfa->nexts[cur_node];
9420 int next_idx = str_idx + naccepted;
9421 dest_state = mctx->state_log[next_idx];
9422 re_node_set_empty (&union_set);
9423 if (dest_state)
9424 {
9425 err = re_node_set_merge (&union_set, &dest_state->nodes);
9426 if (BE (err != REG_NOERROR, 0))
9427 {
9428 re_node_set_free (&union_set);
9429 return err;
9430 }
9431 }
9432 result = re_node_set_insert (&union_set, next_node);
9433 if (BE (result < 0, 0))
9434 {
9435 re_node_set_free (&union_set);
9436 return REG_ESPACE;
9437 }
9438 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
9439 &union_set);
9440 if (BE (mctx->state_log[next_idx] == NULL
9441 && err != REG_NOERROR, 0))
9442 {
9443 re_node_set_free (&union_set);
9444 return err;
9445 }
9446 }
9447 }
9448 #endif /* RE_ENABLE_I18N */
9449 if (naccepted
9450 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
9451 {
9452 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
9453 if (BE (result < 0, 0))
9454 {
9455 re_node_set_free (&union_set);
9456 return REG_ESPACE;
9457 }
9458 }
9459 }
9460 re_node_set_free (&union_set);
9461 return REG_NOERROR;
9462 }
9463
9464 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
9465 CUR_NODES, however exclude the nodes which are:
9466 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
9467 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
9468 */
9469
9470 static reg_errcode_t
9471 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,int ex_subexp,int type)9472 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
9473 int ex_subexp, int type)
9474 {
9475 reg_errcode_t err;
9476 int idx, outside_node;
9477 re_node_set new_nodes;
9478 #ifdef DEBUG
9479 assert (cur_nodes->nelem);
9480 #endif
9481 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
9482 if (BE (err != REG_NOERROR, 0))
9483 return err;
9484 /* Create a new node set NEW_NODES with the nodes which are epsilon
9485 closures of the node in CUR_NODES. */
9486
9487 for (idx = 0; idx < cur_nodes->nelem; ++idx)
9488 {
9489 int cur_node = cur_nodes->elems[idx];
9490 const re_node_set *eclosure = dfa->eclosures + cur_node;
9491 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
9492 if (outside_node == -1)
9493 {
9494 /* There are no problematic nodes, just merge them. */
9495 err = re_node_set_merge (&new_nodes, eclosure);
9496 if (BE (err != REG_NOERROR, 0))
9497 {
9498 re_node_set_free (&new_nodes);
9499 return err;
9500 }
9501 }
9502 else
9503 {
9504 /* There are problematic nodes, re-calculate incrementally. */
9505 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
9506 ex_subexp, type);
9507 if (BE (err != REG_NOERROR, 0))
9508 {
9509 re_node_set_free (&new_nodes);
9510 return err;
9511 }
9512 }
9513 }
9514 re_node_set_free (cur_nodes);
9515 *cur_nodes = new_nodes;
9516 return REG_NOERROR;
9517 }
9518
9519 /* Helper function for check_arrival_expand_ecl.
9520 Check incrementally the epsilon closure of TARGET, and if it isn't
9521 problematic append it to DST_NODES. */
9522
9523 static reg_errcode_t
9524 internal_function
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,int target,int ex_subexp,int type)9525 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
9526 int target, int ex_subexp, int type)
9527 {
9528 int cur_node;
9529 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
9530 {
9531 int err;
9532
9533 if (dfa->nodes[cur_node].type == type
9534 && dfa->nodes[cur_node].opr.idx == ex_subexp)
9535 {
9536 if (type == OP_CLOSE_SUBEXP)
9537 {
9538 err = re_node_set_insert (dst_nodes, cur_node);
9539 if (BE (err == -1, 0))
9540 return REG_ESPACE;
9541 }
9542 break;
9543 }
9544 err = re_node_set_insert (dst_nodes, cur_node);
9545 if (BE (err == -1, 0))
9546 return REG_ESPACE;
9547 if (dfa->edests[cur_node].nelem == 0)
9548 break;
9549 if (dfa->edests[cur_node].nelem == 2)
9550 {
9551 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
9552 dfa->edests[cur_node].elems[1],
9553 ex_subexp, type);
9554 if (BE (err != REG_NOERROR, 0))
9555 return err;
9556 }
9557 cur_node = dfa->edests[cur_node].elems[0];
9558 }
9559 return REG_NOERROR;
9560 }
9561
9562
9563 /* For all the back references in the current state, calculate the
9564 destination of the back references by the appropriate entry
9565 in MCTX->BKREF_ENTS. */
9566
9567 static reg_errcode_t
9568 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,int cur_str,int subexp_num,int type)9569 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
9570 int cur_str, int subexp_num, int type)
9571 {
9572 const re_dfa_t *const dfa = mctx->dfa;
9573 reg_errcode_t err;
9574 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
9575 struct re_backref_cache_entry *ent;
9576
9577 if (cache_idx_start == -1)
9578 return REG_NOERROR;
9579
9580 restart:
9581 ent = mctx->bkref_ents + cache_idx_start;
9582 do
9583 {
9584 int to_idx, next_node;
9585
9586 /* Is this entry ENT is appropriate? */
9587 if (!re_node_set_contains (cur_nodes, ent->node))
9588 continue; /* No. */
9589
9590 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
9591 /* Calculate the destination of the back reference, and append it
9592 to MCTX->STATE_LOG. */
9593 if (to_idx == cur_str)
9594 {
9595 /* The backreference did epsilon transit, we must re-check all the
9596 node in the current state. */
9597 re_node_set new_dests;
9598 reg_errcode_t err2, err3;
9599 next_node = dfa->edests[ent->node].elems[0];
9600 if (re_node_set_contains (cur_nodes, next_node))
9601 continue;
9602 err = re_node_set_init_1 (&new_dests, next_node);
9603 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
9604 err3 = re_node_set_merge (cur_nodes, &new_dests);
9605 re_node_set_free (&new_dests);
9606 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
9607 || err3 != REG_NOERROR, 0))
9608 {
9609 err = (err != REG_NOERROR ? err
9610 : (err2 != REG_NOERROR ? err2 : err3));
9611 return err;
9612 }
9613 /* TODO: It is still inefficient... */
9614 goto restart;
9615 }
9616 else
9617 {
9618 re_node_set union_set;
9619 next_node = dfa->nexts[ent->node];
9620 if (mctx->state_log[to_idx])
9621 {
9622 int ret;
9623 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
9624 next_node))
9625 continue;
9626 err = re_node_set_init_copy (&union_set,
9627 &mctx->state_log[to_idx]->nodes);
9628 ret = re_node_set_insert (&union_set, next_node);
9629 if (BE (err != REG_NOERROR || ret < 0, 0))
9630 {
9631 re_node_set_free (&union_set);
9632 err = err != REG_NOERROR ? err : REG_ESPACE;
9633 return err;
9634 }
9635 }
9636 else
9637 {
9638 err = re_node_set_init_1 (&union_set, next_node);
9639 if (BE (err != REG_NOERROR, 0))
9640 return err;
9641 }
9642 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
9643 re_node_set_free (&union_set);
9644 if (BE (mctx->state_log[to_idx] == NULL
9645 && err != REG_NOERROR, 0))
9646 return err;
9647 }
9648 }
9649 while (ent++->more);
9650 return REG_NOERROR;
9651 }
9652
9653 /* Build transition table for the state.
9654 Return 1 if succeeded, otherwise return NULL. */
9655
9656 static int
9657 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)9658 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
9659 {
9660 reg_errcode_t err;
9661 int i, j, ch, need_word_trtable = 0;
9662 bitset_word_t elem, mask;
9663 bool dests_node_malloced = false;
9664 bool dest_states_malloced = false;
9665 int ndests; /* Number of the destination states from `state'. */
9666 re_dfastate_t **trtable;
9667 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
9668 re_node_set follows, *dests_node;
9669 bitset_t *dests_ch;
9670 bitset_t acceptable;
9671
9672 struct dests_alloc
9673 {
9674 re_node_set dests_node[SBC_MAX];
9675 bitset_t dests_ch[SBC_MAX];
9676 } *dests_alloc;
9677
9678 /* We build DFA states which corresponds to the destination nodes
9679 from `state'. `dests_node[i]' represents the nodes which i-th
9680 destination state contains, and `dests_ch[i]' represents the
9681 characters which i-th destination state accepts. */
9682 if (__libc_use_alloca (sizeof (struct dests_alloc)))
9683 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
9684 else
9685 {
9686 dests_alloc = re_malloc (struct dests_alloc, 1);
9687 if (BE (dests_alloc == NULL, 0))
9688 return 0;
9689 dests_node_malloced = true;
9690 }
9691 dests_node = dests_alloc->dests_node;
9692 dests_ch = dests_alloc->dests_ch;
9693
9694 /* Initialize transiton table. */
9695 state->word_trtable = state->trtable = NULL;
9696
9697 /* At first, group all nodes belonging to `state' into several
9698 destinations. */
9699 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
9700 if (BE (ndests <= 0, 0))
9701 {
9702 if (dests_node_malloced)
9703 free (dests_alloc);
9704 /* Return 0 in case of an error, 1 otherwise. */
9705 if (ndests == 0)
9706 {
9707 state->trtable = (re_dfastate_t **)
9708 calloc (sizeof (re_dfastate_t *), SBC_MAX);
9709 return 1;
9710 }
9711 return 0;
9712 }
9713
9714 err = re_node_set_alloc (&follows, ndests + 1);
9715 if (BE (err != REG_NOERROR, 0))
9716 goto out_free;
9717
9718 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
9719 + ndests * 3 * sizeof (re_dfastate_t *)))
9720 dest_states = (re_dfastate_t **)
9721 alloca (ndests * 3 * sizeof (re_dfastate_t *));
9722 else
9723 {
9724 dest_states = (re_dfastate_t **)
9725 malloc (ndests * 3 * sizeof (re_dfastate_t *));
9726 if (BE (dest_states == NULL, 0))
9727 {
9728 out_free:
9729 if (dest_states_malloced)
9730 free (dest_states);
9731 re_node_set_free (&follows);
9732 for (i = 0; i < ndests; ++i)
9733 re_node_set_free (dests_node + i);
9734 if (dests_node_malloced)
9735 free (dests_alloc);
9736 return 0;
9737 }
9738 dest_states_malloced = true;
9739 }
9740 dest_states_word = dest_states + ndests;
9741 dest_states_nl = dest_states_word + ndests;
9742 bitset_empty (acceptable);
9743
9744 /* Then build the states for all destinations. */
9745 for (i = 0; i < ndests; ++i)
9746 {
9747 int next_node;
9748 re_node_set_empty (&follows);
9749 /* Merge the follows of this destination states. */
9750 for (j = 0; j < dests_node[i].nelem; ++j)
9751 {
9752 next_node = dfa->nexts[dests_node[i].elems[j]];
9753 if (next_node != -1)
9754 {
9755 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
9756 if (BE (err != REG_NOERROR, 0))
9757 goto out_free;
9758 }
9759 }
9760 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
9761 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
9762 goto out_free;
9763 /* If the new state has context constraint,
9764 build appropriate states for these contexts. */
9765 if (dest_states[i]->has_constraint)
9766 {
9767 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
9768 CONTEXT_WORD);
9769 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
9770 goto out_free;
9771
9772 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
9773 need_word_trtable = 1;
9774
9775 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
9776 CONTEXT_NEWLINE);
9777 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
9778 goto out_free;
9779 }
9780 else
9781 {
9782 dest_states_word[i] = dest_states[i];
9783 dest_states_nl[i] = dest_states[i];
9784 }
9785 bitset_merge (acceptable, dests_ch[i]);
9786 }
9787
9788 if (!BE (need_word_trtable, 0))
9789 {
9790 /* We don't care about whether the following character is a word
9791 character, or we are in a single-byte character set so we can
9792 discern by looking at the character code: allocate a
9793 256-entry transition table. */
9794 trtable = state->trtable =
9795 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
9796 if (BE (trtable == NULL, 0))
9797 goto out_free;
9798
9799 /* For all characters ch...: */
9800 for (i = 0; i < BITSET_WORDS; ++i)
9801 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9802 elem;
9803 mask <<= 1, elem >>= 1, ++ch)
9804 if (BE (elem & 1, 0))
9805 {
9806 /* There must be exactly one destination which accepts
9807 character ch. See group_nodes_into_DFAstates. */
9808 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9809 ;
9810
9811 /* j-th destination accepts the word character ch. */
9812 if (dfa->word_char[i] & mask)
9813 trtable[ch] = dest_states_word[j];
9814 else
9815 trtable[ch] = dest_states[j];
9816 }
9817 }
9818 else
9819 {
9820 /* We care about whether the following character is a word
9821 character, and we are in a multi-byte character set: discern
9822 by looking at the character code: build two 256-entry
9823 transition tables, one starting at trtable[0] and one
9824 starting at trtable[SBC_MAX]. */
9825 trtable = state->word_trtable =
9826 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
9827 if (BE (trtable == NULL, 0))
9828 goto out_free;
9829
9830 /* For all characters ch...: */
9831 for (i = 0; i < BITSET_WORDS; ++i)
9832 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9833 elem;
9834 mask <<= 1, elem >>= 1, ++ch)
9835 if (BE (elem & 1, 0))
9836 {
9837 /* There must be exactly one destination which accepts
9838 character ch. See group_nodes_into_DFAstates. */
9839 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9840 ;
9841
9842 /* j-th destination accepts the word character ch. */
9843 trtable[ch] = dest_states[j];
9844 trtable[ch + SBC_MAX] = dest_states_word[j];
9845 }
9846 }
9847
9848 /* new line */
9849 if (bitset_contain (acceptable, NEWLINE_CHAR))
9850 {
9851 /* The current state accepts newline character. */
9852 for (j = 0; j < ndests; ++j)
9853 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
9854 {
9855 /* k-th destination accepts newline character. */
9856 trtable[NEWLINE_CHAR] = dest_states_nl[j];
9857 if (need_word_trtable)
9858 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
9859 /* There must be only one destination which accepts
9860 newline. See group_nodes_into_DFAstates. */
9861 break;
9862 }
9863 }
9864
9865 if (dest_states_malloced)
9866 free (dest_states);
9867
9868 re_node_set_free (&follows);
9869 for (i = 0; i < ndests; ++i)
9870 re_node_set_free (dests_node + i);
9871
9872 if (dests_node_malloced)
9873 free (dests_alloc);
9874
9875 return 1;
9876 }
9877
9878 /* Group all nodes belonging to STATE into several destinations.
9879 Then for all destinations, set the nodes belonging to the destination
9880 to DESTS_NODE[i] and set the characters accepted by the destination
9881 to DEST_CH[i]. This function return the number of destinations. */
9882
9883 static int
9884 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)9885 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
9886 re_node_set *dests_node, bitset_t *dests_ch)
9887 {
9888 reg_errcode_t err;
9889 int result;
9890 int i, j, k;
9891 int ndests; /* Number of the destinations from `state'. */
9892 bitset_t accepts; /* Characters a node can accept. */
9893 const re_node_set *cur_nodes = &state->nodes;
9894 bitset_empty (accepts);
9895 ndests = 0;
9896
9897 /* For all the nodes belonging to `state', */
9898 for (i = 0; i < cur_nodes->nelem; ++i)
9899 {
9900 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
9901 re_token_type_t type = node->type;
9902 unsigned int constraint = node->constraint;
9903
9904 /* Enumerate all single byte character this node can accept. */
9905 if (type == CHARACTER)
9906 bitset_set (accepts, node->opr.c);
9907 else if (type == SIMPLE_BRACKET)
9908 {
9909 bitset_merge (accepts, node->opr.sbcset);
9910 }
9911 else if (type == OP_PERIOD)
9912 {
9913 #ifdef RE_ENABLE_I18N
9914 if (dfa->mb_cur_max > 1)
9915 bitset_merge (accepts, dfa->sb_char);
9916 else
9917 #endif
9918 bitset_set_all (accepts);
9919 if (!(dfa->syntax & RE_DOT_NEWLINE))
9920 bitset_clear (accepts, '\n');
9921 if (dfa->syntax & RE_DOT_NOT_NULL)
9922 bitset_clear (accepts, '\0');
9923 }
9924 #ifdef RE_ENABLE_I18N
9925 else if (type == OP_UTF8_PERIOD)
9926 {
9927 memset (accepts, '\xff', sizeof (bitset_t) / 2);
9928 if (!(dfa->syntax & RE_DOT_NEWLINE))
9929 bitset_clear (accepts, '\n');
9930 if (dfa->syntax & RE_DOT_NOT_NULL)
9931 bitset_clear (accepts, '\0');
9932 }
9933 #endif
9934 else
9935 continue;
9936
9937 /* Check the `accepts' and sift the characters which are not
9938 match it the context. */
9939 if (constraint)
9940 {
9941 if (constraint & NEXT_NEWLINE_CONSTRAINT)
9942 {
9943 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
9944 bitset_empty (accepts);
9945 if (accepts_newline)
9946 bitset_set (accepts, NEWLINE_CHAR);
9947 else
9948 continue;
9949 }
9950 if (constraint & NEXT_ENDBUF_CONSTRAINT)
9951 {
9952 bitset_empty (accepts);
9953 continue;
9954 }
9955
9956 if (constraint & NEXT_WORD_CONSTRAINT)
9957 {
9958 bitset_word_t any_set = 0;
9959 if (type == CHARACTER && !node->word_char)
9960 {
9961 bitset_empty (accepts);
9962 continue;
9963 }
9964 #ifdef RE_ENABLE_I18N
9965 if (dfa->mb_cur_max > 1)
9966 for (j = 0; j < BITSET_WORDS; ++j)
9967 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
9968 else
9969 #endif
9970 for (j = 0; j < BITSET_WORDS; ++j)
9971 any_set |= (accepts[j] &= dfa->word_char[j]);
9972 if (!any_set)
9973 continue;
9974 }
9975 if (constraint & NEXT_NOTWORD_CONSTRAINT)
9976 {
9977 bitset_word_t any_set = 0;
9978 if (type == CHARACTER && node->word_char)
9979 {
9980 bitset_empty (accepts);
9981 continue;
9982 }
9983 #ifdef RE_ENABLE_I18N
9984 if (dfa->mb_cur_max > 1)
9985 for (j = 0; j < BITSET_WORDS; ++j)
9986 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
9987 else
9988 #endif
9989 for (j = 0; j < BITSET_WORDS; ++j)
9990 any_set |= (accepts[j] &= ~dfa->word_char[j]);
9991 if (!any_set)
9992 continue;
9993 }
9994 }
9995
9996 /* Then divide `accepts' into DFA states, or create a new
9997 state. Above, we make sure that accepts is not empty. */
9998 for (j = 0; j < ndests; ++j)
9999 {
10000 bitset_t intersec; /* Intersection sets, see below. */
10001 bitset_t remains;
10002 /* Flags, see below. */
10003 bitset_word_t has_intersec, not_subset, not_consumed;
10004
10005 /* Optimization, skip if this state doesn't accept the character. */
10006 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
10007 continue;
10008
10009 /* Enumerate the intersection set of this state and `accepts'. */
10010 has_intersec = 0;
10011 for (k = 0; k < BITSET_WORDS; ++k)
10012 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
10013 /* And skip if the intersection set is empty. */
10014 if (!has_intersec)
10015 continue;
10016
10017 /* Then check if this state is a subset of `accepts'. */
10018 not_subset = not_consumed = 0;
10019 for (k = 0; k < BITSET_WORDS; ++k)
10020 {
10021 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
10022 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
10023 }
10024
10025 /* If this state isn't a subset of `accepts', create a
10026 new group state, which has the `remains'. */
10027 if (not_subset)
10028 {
10029 bitset_copy (dests_ch[ndests], remains);
10030 bitset_copy (dests_ch[j], intersec);
10031 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
10032 if (BE (err != REG_NOERROR, 0))
10033 goto error_return;
10034 ++ndests;
10035 }
10036
10037 /* Put the position in the current group. */
10038 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
10039 if (BE (result < 0, 0))
10040 goto error_return;
10041
10042 /* If all characters are consumed, go to next node. */
10043 if (!not_consumed)
10044 break;
10045 }
10046 /* Some characters remain, create a new group. */
10047 if (j == ndests)
10048 {
10049 bitset_copy (dests_ch[ndests], accepts);
10050 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
10051 if (BE (err != REG_NOERROR, 0))
10052 goto error_return;
10053 ++ndests;
10054 bitset_empty (accepts);
10055 }
10056 }
10057 return ndests;
10058 error_return:
10059 for (j = 0; j < ndests; ++j)
10060 re_node_set_free (dests_node + j);
10061 return -1;
10062 }
10063
10064 #ifdef RE_ENABLE_I18N
10065 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
10066 Return the number of the bytes the node accepts.
10067 STR_IDX is the current index of the input string.
10068
10069 This function handles the nodes which can accept one character, or
10070 one collating element like '.', '[a-z]', opposite to the other nodes
10071 can only accept one byte. */
10072
10073 static int
10074 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,int node_idx,const re_string_t * input,int str_idx)10075 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
10076 const re_string_t *input, int str_idx)
10077 {
10078 const re_token_t *node = dfa->nodes + node_idx;
10079 int char_len, elem_len;
10080 int i;
10081
10082 if (BE (node->type == OP_UTF8_PERIOD, 0))
10083 {
10084 unsigned char c = re_string_byte_at (input, str_idx), d;
10085 if (BE (c < 0xc2, 1))
10086 return 0;
10087
10088 if (str_idx + 2 > input->len)
10089 return 0;
10090
10091 d = re_string_byte_at (input, str_idx + 1);
10092 if (c < 0xe0)
10093 return (d < 0x80 || d > 0xbf) ? 0 : 2;
10094 else if (c < 0xf0)
10095 {
10096 char_len = 3;
10097 if (c == 0xe0 && d < 0xa0)
10098 return 0;
10099 }
10100 else if (c < 0xf8)
10101 {
10102 char_len = 4;
10103 if (c == 0xf0 && d < 0x90)
10104 return 0;
10105 }
10106 else if (c < 0xfc)
10107 {
10108 char_len = 5;
10109 if (c == 0xf8 && d < 0x88)
10110 return 0;
10111 }
10112 else if (c < 0xfe)
10113 {
10114 char_len = 6;
10115 if (c == 0xfc && d < 0x84)
10116 return 0;
10117 }
10118 else
10119 return 0;
10120
10121 if (str_idx + char_len > input->len)
10122 return 0;
10123
10124 for (i = 1; i < char_len; ++i)
10125 {
10126 d = re_string_byte_at (input, str_idx + i);
10127 if (d < 0x80 || d > 0xbf)
10128 return 0;
10129 }
10130 return char_len;
10131 }
10132
10133 char_len = re_string_char_size_at (input, str_idx);
10134 if (node->type == OP_PERIOD)
10135 {
10136 if (char_len <= 1)
10137 return 0;
10138 /* FIXME: I don't think this if is needed, as both '\n'
10139 and '\0' are char_len == 1. */
10140 /* '.' accepts any one character except the following two cases. */
10141 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
10142 re_string_byte_at (input, str_idx) == '\n') ||
10143 ((dfa->syntax & RE_DOT_NOT_NULL) &&
10144 re_string_byte_at (input, str_idx) == '\0'))
10145 return 0;
10146 return char_len;
10147 }
10148
10149 elem_len = re_string_elem_size_at (input, str_idx);
10150 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
10151 return 0;
10152
10153 if (node->type == COMPLEX_BRACKET)
10154 {
10155 const re_charset_t *cset = node->opr.mbcset;
10156 # ifdef _LIBC
10157 const unsigned char *pin
10158 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
10159 int j;
10160 uint32_t nrules;
10161 # endif /* _LIBC */
10162 int match_len = 0;
10163 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
10164 ? re_string_wchar_at (input, str_idx) : 0);
10165
10166 /* match with multibyte character? */
10167 for (i = 0; i < cset->nmbchars; ++i)
10168 if (wc == cset->mbchars[i])
10169 {
10170 match_len = char_len;
10171 goto check_node_accept_bytes_match;
10172 }
10173 /* match with character_class? */
10174 for (i = 0; i < cset->nchar_classes; ++i)
10175 {
10176 wctype_t wt = cset->char_classes[i];
10177 if (__iswctype (wc, wt))
10178 {
10179 match_len = char_len;
10180 goto check_node_accept_bytes_match;
10181 }
10182 }
10183
10184 # ifdef _LIBC
10185 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10186 if (nrules != 0)
10187 {
10188 unsigned int in_collseq = 0;
10189 const int32_t *table, *indirect;
10190 const unsigned char *weights, *extra;
10191 const char *collseqwc;
10192 int32_t idx;
10193 /* This #include defines a local function! */
10194 # include <locale/weight.h>
10195
10196 /* match with collating_symbol? */
10197 if (cset->ncoll_syms)
10198 extra = (const unsigned char *)
10199 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10200 for (i = 0; i < cset->ncoll_syms; ++i)
10201 {
10202 const unsigned char *coll_sym = extra + cset->coll_syms[i];
10203 /* Compare the length of input collating element and
10204 the length of current collating element. */
10205 if (*coll_sym != elem_len)
10206 continue;
10207 /* Compare each bytes. */
10208 for (j = 0; j < *coll_sym; j++)
10209 if (pin[j] != coll_sym[1 + j])
10210 break;
10211 if (j == *coll_sym)
10212 {
10213 /* Match if every bytes is equal. */
10214 match_len = j;
10215 goto check_node_accept_bytes_match;
10216 }
10217 }
10218
10219 if (cset->nranges)
10220 {
10221 if (elem_len <= char_len)
10222 {
10223 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
10224 in_collseq = __collseq_table_lookup (collseqwc, wc);
10225 }
10226 else
10227 in_collseq = find_collation_sequence_value (pin, elem_len);
10228 }
10229 /* match with range expression? */
10230 for (i = 0; i < cset->nranges; ++i)
10231 if (cset->range_starts[i] <= in_collseq
10232 && in_collseq <= cset->range_ends[i])
10233 {
10234 match_len = elem_len;
10235 goto check_node_accept_bytes_match;
10236 }
10237
10238 /* match with equivalence_class? */
10239 if (cset->nequiv_classes)
10240 {
10241 const unsigned char *cp = pin;
10242 table = (const int32_t *)
10243 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
10244 weights = (const unsigned char *)
10245 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
10246 extra = (const unsigned char *)
10247 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
10248 indirect = (const int32_t *)
10249 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
10250 idx = findidx (&cp);
10251 if (idx > 0)
10252 for (i = 0; i < cset->nequiv_classes; ++i)
10253 {
10254 int32_t equiv_class_idx = cset->equiv_classes[i];
10255 size_t weight_len = weights[idx];
10256 if (weight_len == weights[equiv_class_idx])
10257 {
10258 int cnt = 0;
10259 while (cnt <= weight_len
10260 && (weights[equiv_class_idx + 1 + cnt]
10261 == weights[idx + 1 + cnt]))
10262 ++cnt;
10263 if (cnt > weight_len)
10264 {
10265 match_len = elem_len;
10266 goto check_node_accept_bytes_match;
10267 }
10268 }
10269 }
10270 }
10271 }
10272 else
10273 # endif /* _LIBC */
10274 {
10275 /* match with range expression? */
10276 #if __GNUC__ >= 2
10277 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
10278 #else
10279 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
10280 cmp_buf[2] = wc;
10281 #endif
10282 for (i = 0; i < cset->nranges; ++i)
10283 {
10284 cmp_buf[0] = cset->range_starts[i];
10285 cmp_buf[4] = cset->range_ends[i];
10286 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
10287 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
10288 {
10289 match_len = char_len;
10290 goto check_node_accept_bytes_match;
10291 }
10292 }
10293 }
10294 check_node_accept_bytes_match:
10295 if (!cset->non_match)
10296 return match_len;
10297 else
10298 {
10299 if (match_len > 0)
10300 return 0;
10301 else
10302 return (elem_len > char_len) ? elem_len : char_len;
10303 }
10304 }
10305 return 0;
10306 }
10307
10308 # ifdef _LIBC
10309 static unsigned int
10310 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)10311 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
10312 {
10313 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10314 if (nrules == 0)
10315 {
10316 if (mbs_len == 1)
10317 {
10318 /* No valid character. Match it as a single byte character. */
10319 const unsigned char *collseq = (const unsigned char *)
10320 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
10321 return collseq[mbs[0]];
10322 }
10323 return UINT_MAX;
10324 }
10325 else
10326 {
10327 int32_t idx;
10328 const unsigned char *extra = (const unsigned char *)
10329 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10330 int32_t extrasize = (const unsigned char *)
10331 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
10332
10333 for (idx = 0; idx < extrasize;)
10334 {
10335 int mbs_cnt, found = 0;
10336 int32_t elem_mbs_len;
10337 /* Skip the name of collating element name. */
10338 idx = idx + extra[idx] + 1;
10339 elem_mbs_len = extra[idx++];
10340 if (mbs_len == elem_mbs_len)
10341 {
10342 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
10343 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
10344 break;
10345 if (mbs_cnt == elem_mbs_len)
10346 /* Found the entry. */
10347 found = 1;
10348 }
10349 /* Skip the byte sequence of the collating element. */
10350 idx += elem_mbs_len;
10351 /* Adjust for the alignment. */
10352 idx = (idx + 3) & ~3;
10353 /* Skip the collation sequence value. */
10354 idx += sizeof (uint32_t);
10355 /* Skip the wide char sequence of the collating element. */
10356 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
10357 /* If we found the entry, return the sequence value. */
10358 if (found)
10359 return *(uint32_t *) (extra + idx);
10360 /* Skip the collation sequence value. */
10361 idx += sizeof (uint32_t);
10362 }
10363 return UINT_MAX;
10364 }
10365 }
10366 # endif /* _LIBC */
10367 #endif /* RE_ENABLE_I18N */
10368
10369 /* Check whether the node accepts the byte which is IDX-th
10370 byte of the INPUT. */
10371
10372 static int
10373 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,int idx)10374 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
10375 int idx)
10376 {
10377 unsigned char ch;
10378 ch = re_string_byte_at (&mctx->input, idx);
10379 switch (node->type)
10380 {
10381 case CHARACTER:
10382 if (node->opr.c != ch)
10383 return 0;
10384 break;
10385
10386 case SIMPLE_BRACKET:
10387 if (!bitset_contain (node->opr.sbcset, ch))
10388 return 0;
10389 break;
10390
10391 #ifdef RE_ENABLE_I18N
10392 case OP_UTF8_PERIOD:
10393 if (ch >= 0x80)
10394 return 0;
10395 /* FALLTHROUGH */
10396 #endif
10397 case OP_PERIOD:
10398 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
10399 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
10400 return 0;
10401 break;
10402
10403 default:
10404 return 0;
10405 }
10406
10407 if (node->constraint)
10408 {
10409 /* The node has constraints. Check whether the current context
10410 satisfies the constraints. */
10411 unsigned int context = re_string_context_at (&mctx->input, idx,
10412 mctx->eflags);
10413 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
10414 return 0;
10415 }
10416
10417 return 1;
10418 }
10419
10420 /* Extend the buffers, if the buffers have run out. */
10421
10422 static reg_errcode_t
10423 internal_function
extend_buffers(re_match_context_t * mctx)10424 extend_buffers (re_match_context_t *mctx)
10425 {
10426 reg_errcode_t ret;
10427 re_string_t *pstr = &mctx->input;
10428
10429 /* Double the lengthes of the buffers. */
10430 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
10431 if (BE (ret != REG_NOERROR, 0))
10432 return ret;
10433
10434 if (mctx->state_log != NULL)
10435 {
10436 /* And double the length of state_log. */
10437 /* XXX We have no indication of the size of this buffer. If this
10438 allocation fail we have no indication that the state_log array
10439 does not have the right size. */
10440 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
10441 pstr->bufs_len + 1);
10442 if (BE (new_array == NULL, 0))
10443 return REG_ESPACE;
10444 mctx->state_log = new_array;
10445 }
10446
10447 /* Then reconstruct the buffers. */
10448 if (pstr->icase)
10449 {
10450 #ifdef RE_ENABLE_I18N
10451 if (pstr->mb_cur_max > 1)
10452 {
10453 ret = build_wcs_upper_buffer (pstr);
10454 if (BE (ret != REG_NOERROR, 0))
10455 return ret;
10456 }
10457 else
10458 #endif /* RE_ENABLE_I18N */
10459 build_upper_buffer (pstr);
10460 }
10461 else
10462 {
10463 #ifdef RE_ENABLE_I18N
10464 if (pstr->mb_cur_max > 1)
10465 build_wcs_buffer (pstr);
10466 else
10467 #endif /* RE_ENABLE_I18N */
10468 {
10469 if (pstr->trans != NULL)
10470 re_string_translate_buffer (pstr);
10471 }
10472 }
10473 return REG_NOERROR;
10474 }
10475
10476
10477 /* Functions for matching context. */
10478
10479 /* Initialize MCTX. */
10480
10481 static reg_errcode_t
10482 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,int n)10483 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
10484 {
10485 mctx->eflags = eflags;
10486 mctx->match_last = -1;
10487 if (n > 0)
10488 {
10489 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
10490 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
10491 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
10492 return REG_ESPACE;
10493 }
10494 /* Already zero-ed by the caller.
10495 else
10496 mctx->bkref_ents = NULL;
10497 mctx->nbkref_ents = 0;
10498 mctx->nsub_tops = 0; */
10499 mctx->abkref_ents = n;
10500 mctx->max_mb_elem_len = 1;
10501 mctx->asub_tops = n;
10502 return REG_NOERROR;
10503 }
10504
10505 /* Clean the entries which depend on the current input in MCTX.
10506 This function must be invoked when the matcher changes the start index
10507 of the input, or changes the input string. */
10508
10509 static void
10510 internal_function
match_ctx_clean(re_match_context_t * mctx)10511 match_ctx_clean (re_match_context_t *mctx)
10512 {
10513 int st_idx;
10514 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
10515 {
10516 int sl_idx;
10517 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
10518 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
10519 {
10520 re_sub_match_last_t *last = top->lasts[sl_idx];
10521 re_free (last->path.array);
10522 re_free (last);
10523 }
10524 re_free (top->lasts);
10525 if (top->path)
10526 {
10527 re_free (top->path->array);
10528 re_free (top->path);
10529 }
10530 free (top);
10531 }
10532
10533 mctx->nsub_tops = 0;
10534 mctx->nbkref_ents = 0;
10535 }
10536
10537 /* Free all the memory associated with MCTX. */
10538
10539 static void
10540 internal_function
match_ctx_free(re_match_context_t * mctx)10541 match_ctx_free (re_match_context_t *mctx)
10542 {
10543 /* First, free all the memory associated with MCTX->SUB_TOPS. */
10544 match_ctx_clean (mctx);
10545 re_free (mctx->sub_tops);
10546 re_free (mctx->bkref_ents);
10547 }
10548
10549 /* Add a new backreference entry to MCTX.
10550 Note that we assume that caller never call this function with duplicate
10551 entry, and call with STR_IDX which isn't smaller than any existing entry.
10552 */
10553
10554 static reg_errcode_t
10555 internal_function
match_ctx_add_entry(re_match_context_t * mctx,int node,int str_idx,int from,int to)10556 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
10557 int to)
10558 {
10559 if (mctx->nbkref_ents >= mctx->abkref_ents)
10560 {
10561 struct re_backref_cache_entry* new_entry;
10562 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
10563 mctx->abkref_ents * 2);
10564 if (BE (new_entry == NULL, 0))
10565 {
10566 re_free (mctx->bkref_ents);
10567 return REG_ESPACE;
10568 }
10569 mctx->bkref_ents = new_entry;
10570 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
10571 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
10572 mctx->abkref_ents *= 2;
10573 }
10574 if (mctx->nbkref_ents > 0
10575 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
10576 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
10577
10578 mctx->bkref_ents[mctx->nbkref_ents].node = node;
10579 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
10580 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
10581 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
10582
10583 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
10584 If bit N is clear, means that this entry won't epsilon-transition to
10585 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
10586 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
10587 such node.
10588
10589 A backreference does not epsilon-transition unless it is empty, so set
10590 to all zeros if FROM != TO. */
10591 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
10592 = (from == to ? ~0 : 0);
10593
10594 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
10595 if (mctx->max_mb_elem_len < to - from)
10596 mctx->max_mb_elem_len = to - from;
10597 return REG_NOERROR;
10598 }
10599
10600 /* Search for the first entry which has the same str_idx, or -1 if none is
10601 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
10602
10603 static int
10604 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,int str_idx)10605 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
10606 {
10607 int left, right, mid, last;
10608 last = right = mctx->nbkref_ents;
10609 for (left = 0; left < right;)
10610 {
10611 mid = (left + right) / 2;
10612 if (mctx->bkref_ents[mid].str_idx < str_idx)
10613 left = mid + 1;
10614 else
10615 right = mid;
10616 }
10617 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
10618 return left;
10619 else
10620 return -1;
10621 }
10622
10623 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
10624 at STR_IDX. */
10625
10626 static reg_errcode_t
10627 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,int node,int str_idx)10628 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
10629 {
10630 #ifdef DEBUG
10631 assert (mctx->sub_tops != NULL);
10632 assert (mctx->asub_tops > 0);
10633 #endif
10634 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
10635 {
10636 int new_asub_tops = mctx->asub_tops * 2;
10637 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
10638 re_sub_match_top_t *,
10639 new_asub_tops);
10640 if (BE (new_array == NULL, 0))
10641 return REG_ESPACE;
10642 mctx->sub_tops = new_array;
10643 mctx->asub_tops = new_asub_tops;
10644 }
10645 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
10646 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
10647 return REG_ESPACE;
10648 mctx->sub_tops[mctx->nsub_tops]->node = node;
10649 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
10650 return REG_NOERROR;
10651 }
10652
10653 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
10654 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
10655
10656 static re_sub_match_last_t *
10657 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,int node,int str_idx)10658 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
10659 {
10660 re_sub_match_last_t *new_entry;
10661 if (BE (subtop->nlasts == subtop->alasts, 0))
10662 {
10663 int new_alasts = 2 * subtop->alasts + 1;
10664 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
10665 re_sub_match_last_t *,
10666 new_alasts);
10667 if (BE (new_array == NULL, 0))
10668 return NULL;
10669 subtop->lasts = new_array;
10670 subtop->alasts = new_alasts;
10671 }
10672 new_entry = calloc (1, sizeof (re_sub_match_last_t));
10673 if (BE (new_entry != NULL, 1))
10674 {
10675 subtop->lasts[subtop->nlasts] = new_entry;
10676 new_entry->node = node;
10677 new_entry->str_idx = str_idx;
10678 ++subtop->nlasts;
10679 }
10680 return new_entry;
10681 }
10682
10683 static void
10684 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,int last_node,int last_str_idx)10685 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
10686 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
10687 {
10688 sctx->sifted_states = sifted_sts;
10689 sctx->limited_states = limited_sts;
10690 sctx->last_node = last_node;
10691 sctx->last_str_idx = last_str_idx;
10692 re_node_set_init_empty (&sctx->limits);
10693 }
10694
10695
10696 /* Binary backward compatibility. */
10697 #if _LIBC
10698 # include <shlib-compat.h>
10699 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
10700 link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
10701 int re_max_failures = 2000;
10702 # endif
10703 #endif
10704 #endif
10705