1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
/* Dummy function defined outside the USE_GKREGEX guard below, so this file
   still defines something when the regex code is compiled out.  It exists
   only to silence a compiler warning (presumably the "empty translation
   unit" diagnostic -- confirm against the build).  Takes no arguments and
   does nothing.  */
void gkfooo(void) { return; }
23 
24 #ifdef USE_GKREGEX
25 
26 #ifdef HAVE_CONFIG_H
27 #include "config.h"
28 #endif
29 
30 #ifdef _LIBC
31 /* We have to keep the namespace clean.  */
32 # define regfree(preg) __regfree (preg)
33 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
34 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
35 # define regerror(errcode, preg, errbuf, errbuf_size) \
36 	__regerror(errcode, preg, errbuf, errbuf_size)
37 # define re_set_registers(bu, re, nu, st, en) \
38 	__re_set_registers (bu, re, nu, st, en)
39 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
40 	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
41 # define re_match(bufp, string, size, pos, regs) \
42 	__re_match (bufp, string, size, pos, regs)
43 # define re_search(bufp, string, size, startpos, range, regs) \
44 	__re_search (bufp, string, size, startpos, range, regs)
45 # define re_compile_pattern(pattern, length, bufp) \
46 	__re_compile_pattern (pattern, length, bufp)
47 # define re_set_syntax(syntax) __re_set_syntax (syntax)
48 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
49 	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
50 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
51 
52 # include "../locale/localeinfo.h"
53 #endif
54 
55 #include "GKlib.h"
56 
57 
58 /******************************************************************************/
59 /******************************************************************************/
60 /******************************************************************************/
61 /* GKINCLUDE #include "regex_internal.h" */
62 /******************************************************************************/
63 /******************************************************************************/
64 /******************************************************************************/
65 /* Extended regular expression matching and search library.
66    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
67    This file is part of the GNU C Library.
68    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
69 
70    The GNU C Library is free software; you can redistribute it and/or
71    modify it under the terms of the GNU Lesser General Public
72    License as published by the Free Software Foundation; either
73    version 2.1 of the License, or (at your option) any later version.
74 
75    The GNU C Library is distributed in the hope that it will be useful,
76    but WITHOUT ANY WARRANTY; without even the implied warranty of
77    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
78    Lesser General Public License for more details.
79 
80    You should have received a copy of the GNU Lesser General Public
81    License along with the GNU C Library; if not, write to the Free
82    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
83    02111-1307 USA.  */
84 
85 #ifndef _REGEX_INTERNAL_H
86 #define _REGEX_INTERNAL_H 1
87 
88 #include <assert.h>
89 #include <ctype.h>
90 #include <stdio.h>
91 #include <stdlib.h>
92 #include <string.h>
93 
94 #if defined(__MINGW32_VERSION) || defined(_MSC_VER)
95 #define strcasecmp stricmp
96 #endif
97 
98 #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
99 # include <langinfo.h>
100 #endif
101 #if defined HAVE_LOCALE_H || defined _LIBC
102 # include <locale.h>
103 #endif
104 #if defined HAVE_WCHAR_H || defined _LIBC
105 # include <wchar.h>
106 #endif /* HAVE_WCHAR_H || _LIBC */
107 #if defined HAVE_WCTYPE_H || defined _LIBC
108 # include <wctype.h>
109 #endif /* HAVE_WCTYPE_H || _LIBC */
110 #if defined HAVE_STDBOOL_H || defined _LIBC
111 # include <stdbool.h>
112 #else
113 typedef enum { false, true } bool;
114 #endif /* HAVE_STDBOOL_H || _LIBC */
115 #if defined HAVE_STDINT_H || defined _LIBC
116 # include <stdint.h>
117 #endif /* HAVE_STDINT_H || _LIBC */
118 #if defined _LIBC
119 # include <bits/libc-lock.h>
120 #else
121 # define __libc_lock_define(CLASS,NAME)
122 # define __libc_lock_init(NAME) do { } while (0)
123 # define __libc_lock_lock(NAME) do { } while (0)
124 # define __libc_lock_unlock(NAME) do { } while (0)
125 #endif
126 
127 /* Fallback for systems that do not provide isblank().  */
128 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
129 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
130 #endif
131 
132 #ifdef _LIBC
133 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
134 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
135 #   include <locale/localeinfo.h>
136 #   include <locale/elem-hash.h>
137 #   include <locale/coll-lookup.h>
138 # endif
139 #endif
140 
141 /* This is for other GNU distributions with internationalized messages.  */
142 #if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
143 # include <libintl.h>
144 # ifdef _LIBC
145 #  undef gettext
146 #  define gettext(msgid) \
147   INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
148 # endif
149 #else
150 # define gettext(msgid) (msgid)
151 #endif
152 
153 #ifndef gettext_noop
154 /* This define is so xgettext can find the internationalizable
155    strings.  */
156 # define gettext_noop(String) String
157 #endif
158 
159 /* For systems whose headers do not define SIZE_MAX.  */
160 #ifndef SIZE_MAX
161 # define SIZE_MAX ((size_t) -1)
162 #endif
163 
164 #if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
165 # define RE_ENABLE_I18N
166 #endif
167 
168 #if __GNUC__ >= 3
169 # define BE(expr, val) __builtin_expect (expr, val)
170 #else
171 # define BE(expr, val) (expr)
172 # define inline
173 #endif
174 
175 /* Number of distinct single-byte characters.  */
176 #define SBC_MAX 256
177 
178 #define COLL_ELEM_LEN_MAX 8
179 
180 /* The character which represents newline.  */
181 #define NEWLINE_CHAR '\n'
182 #define WIDE_NEWLINE_CHAR L'\n'
183 
184 /* Rename to standard API for using out of glibc.  */
185 #ifndef _LIBC
186 # define __wctype wctype
187 # define __iswctype iswctype
188 # define __btowc btowc
189 # define __mempcpy mempcpy
190 # define __wcrtomb wcrtomb
191 # define __regfree regfree
192 # define attribute_hidden
193 #endif /* not _LIBC */
194 
195 #ifdef __GNUC__
196 # define __attribute(arg) __attribute__ (arg)
197 #else
198 # define __attribute(arg)
199 #endif
200 
201 extern const char __re_error_msgid[] attribute_hidden;
202 extern const size_t __re_error_msgid_idx[] attribute_hidden;
203 
204 /* An integer used to represent a set of bits.  It must be unsigned,
205    and must be at least as wide as unsigned int.  */
206 typedef unsigned long int bitset_word_t;
207 /* All bits set in a bitset_word_t.  */
208 #define BITSET_WORD_MAX ULONG_MAX
209 /* Number of bits in a bitset_word_t.  */
210 #define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)
211 /* Number of bitset_word_t in a bit_set.  */
212 #define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)
213 typedef bitset_word_t bitset_t[BITSET_WORDS];
214 typedef bitset_word_t *re_bitset_ptr_t;
215 typedef const bitset_word_t *re_const_bitset_ptr_t;
216 
/* Single-bit accessors for a bitset.  SET is the array of bitset_word_t
   and I is the bit index.  Both arguments are fully parenthesized so that
   expression arguments (e.g. `ch + 1') expand with the intended
   precedence.  I is evaluated twice, so it must not have side effects.  */
#define bitset_set(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   |= ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_clear(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   &= ~((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
/* Nonzero iff bit I is set in SET.  */
#define bitset_contain(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   & ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
/* Whole-set operations; each one touches exactly sizeof (bitset_t) bytes.  */
#define bitset_empty(set) memset (set, '\0', sizeof (bitset_t))
#define bitset_set_all(set) memset (set, '\xff', sizeof (bitset_t))
#define bitset_copy(dest,src) memcpy (dest, src, sizeof (bitset_t))
226 
227 #define PREV_WORD_CONSTRAINT 0x0001
228 #define PREV_NOTWORD_CONSTRAINT 0x0002
229 #define NEXT_WORD_CONSTRAINT 0x0004
230 #define NEXT_NOTWORD_CONSTRAINT 0x0008
231 #define PREV_NEWLINE_CONSTRAINT 0x0010
232 #define NEXT_NEWLINE_CONSTRAINT 0x0020
233 #define PREV_BEGBUF_CONSTRAINT 0x0040
234 #define NEXT_ENDBUF_CONSTRAINT 0x0080
235 #define WORD_DELIM_CONSTRAINT 0x0100
236 #define NOT_WORD_DELIM_CONSTRAINT 0x0200
237 
238 typedef enum
239 {
240   INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
241   WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
242   WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
243   INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
244   LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
245   LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
246   BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
247   BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
248   WORD_DELIM = WORD_DELIM_CONSTRAINT,
249   NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
250 } re_context_type;
251 
/* A counted array of ints.  NOTE(review): by its name and its use in
   re_dfa_t (edests, eclosures, ...) the ints are presumably node indices
   -- confirm against the set routines, which are outside this view.  */
252 typedef struct
253 {
254   int alloc;	/* presumably the allocated capacity of ELEMS -- TODO confirm */
255   int nelem;	/* element count; re_node_set_empty resets it to 0 */
256   int *elems;	/* storage released by re_node_set_free via re_free */
257 } re_node_set;
258 
259 typedef enum
260 {
261   NON_TYPE = 0,
262 
263   /* Node type, These are used by token, node, tree.  */
264   CHARACTER = 1,
265   END_OF_RE = 2,
266   SIMPLE_BRACKET = 3,
267   OP_BACK_REF = 4,
268   OP_PERIOD = 5,
269 #ifdef RE_ENABLE_I18N
270   COMPLEX_BRACKET = 6,
271   OP_UTF8_PERIOD = 7,
272 #endif /* RE_ENABLE_I18N */
273 
274   /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
275      when the debugger shows values of this enum type.  */
276 #define EPSILON_BIT 8
277   OP_OPEN_SUBEXP = EPSILON_BIT | 0,
278   OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
279   OP_ALT = EPSILON_BIT | 2,
280   OP_DUP_ASTERISK = EPSILON_BIT | 3,
281   ANCHOR = EPSILON_BIT | 4,
282 
283   /* Tree type, these are used only by tree. */
284   CONCAT = 16,
285   SUBEXP = 17,
286 
287   /* Token type, these are used only by token.  */
288   OP_DUP_PLUS = 18,
289   OP_DUP_QUESTION,
290   OP_OPEN_BRACKET,
291   OP_CLOSE_BRACKET,
292   OP_CHARSET_RANGE,
293   OP_OPEN_DUP_NUM,
294   OP_CLOSE_DUP_NUM,
295   OP_NON_MATCH_LIST,
296   OP_OPEN_COLL_ELEM,
297   OP_CLOSE_COLL_ELEM,
298   OP_OPEN_EQUIV_CLASS,
299   OP_CLOSE_EQUIV_CLASS,
300   OP_OPEN_CHAR_CLASS,
301   OP_CLOSE_CHAR_CLASS,
302   OP_WORD,
303   OP_NOTWORD,
304   OP_SPACE,
305   OP_NOTSPACE,
306   BACK_SLASH
307 
308 } re_token_type_t;
309 
310 #ifdef RE_ENABLE_I18N
311 typedef struct
312 {
313   /* Multibyte characters.  */
314   wchar_t *mbchars;
315 
316   /* Collating symbols.  */
317 # ifdef _LIBC
318   int32_t *coll_syms;
319 # endif
320 
321   /* Equivalence classes. */
322 # ifdef _LIBC
323   int32_t *equiv_classes;
324 # endif
325 
326   /* Range expressions. */
327 # ifdef _LIBC
328   uint32_t *range_starts;
329   uint32_t *range_ends;
330 # else /* not _LIBC */
331   wchar_t *range_starts;
332   wchar_t *range_ends;
333 # endif /* not _LIBC */
334 
335   /* Character classes. */
336   wctype_t *char_classes;
337 
338   /* If this character set is the non-matching list.  */
339   unsigned int non_match : 1;
340 
341   /* # of multibyte characters.  */
342   int nmbchars;
343 
344   /* # of collating symbols.  */
345   int ncoll_syms;
346 
347   /* # of equivalence classes. */
348   int nequiv_classes;
349 
350   /* # of range expressions. */
351   int nranges;
352 
353   /* # of character classes. */
354   int nchar_classes;
355 } re_charset_t;
356 #endif /* RE_ENABLE_I18N */
357 
358 typedef struct
359 {
360   union
361   {
362     unsigned char c;		/* for CHARACTER */
363     re_bitset_ptr_t sbcset;	/* for SIMPLE_BRACKET */
364 #ifdef RE_ENABLE_I18N
365     re_charset_t *mbcset;	/* for COMPLEX_BRACKET */
366 #endif /* RE_ENABLE_I18N */
367     int idx;			/* for BACK_REF */
368     re_context_type ctx_type;	/* for ANCHOR */
369   } opr;
370 #if __GNUC__ >= 2
371   re_token_type_t type : 8;
372 #else
373   re_token_type_t type;
374 #endif
375   unsigned int constraint : 10;	/* context constraint */
376   unsigned int duplicated : 1;
377   unsigned int opt_subexp : 1;
378 #ifdef RE_ENABLE_I18N
379   unsigned int accept_mb : 1;
380   /* These 2 bits can be moved into the union if needed (e.g. if running out
381      of bits; move opr.c to opr.c.c and move the flags to opr.c.flags).  */
382   unsigned int mb_partial : 1;
383 #endif
384   unsigned int word_char : 1;
385 } re_token_t;
386 
387 #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
388 
389 struct re_string_t
390 {
391   /* Indicate the raw buffer which is the original string passed as an
392      argument of regexec(), re_search(), etc..  */
393   const unsigned char *raw_mbs;
394   /* Store the multibyte string.  In case of "case insensitive mode" like
395      REG_ICASE, upper cases of the string are stored, otherwise MBS points
396      the same address that RAW_MBS points.  */
397   unsigned char *mbs;
398 #ifdef RE_ENABLE_I18N
399   /* Store the wide character string which is corresponding to MBS.  */
400   wint_t *wcs;
401   int *offsets;
402   mbstate_t cur_state;
403 #endif
404   /* Index in RAW_MBS.  Each character mbs[i] corresponds to
405      raw_mbs[raw_mbs_idx + i].  */
406   int raw_mbs_idx;
407   /* The length of the valid characters in the buffers.  */
408   int valid_len;
409   /* The corresponding number of bytes in raw_mbs array.  */
410   int valid_raw_len;
411   /* The length of the buffers MBS and WCS.  */
412   int bufs_len;
413   /* The index in MBS, which is updated by re_string_fetch_byte.  */
414   int cur_idx;
415   /* length of RAW_MBS array.  */
416   int raw_len;
417   /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN.  */
418   int len;
419   /* End of the buffer may be shorter than its length in the cases such
420      as re_match_2, re_search_2.  Then, we use STOP for end of the buffer
421      instead of LEN.  */
422   int raw_stop;
423   /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS.  */
424   int stop;
425 
426   /* The context of mbs[0].  We store the context independently, since
427      the context of mbs[0] may be different from raw_mbs[0], which is
428      the beginning of the input string.  */
429   unsigned int tip_context;
430   /* The translation passed as a part of an argument of re_compile_pattern.  */
431   RE_TRANSLATE_TYPE trans;
432   /* Copy of re_dfa_t's word_char.  */
433   re_const_bitset_ptr_t word_char;
434   /* 1 if REG_ICASE.  */
435   unsigned char icase;
436   unsigned char is_utf8;
437   unsigned char map_notascii;
438   unsigned char mbs_allocated;
439   unsigned char offsets_needed;
440   unsigned char newline_anchor;
441   unsigned char word_ops_used;
442   int mb_cur_max;
443 };
444 typedef struct re_string_t re_string_t;
445 
446 
447 struct re_dfa_t;
448 typedef struct re_dfa_t re_dfa_t;
449 
450 #ifndef _LIBC
451 # ifdef __i386__
452 #  define internal_function   __attribute ((regparm (3), stdcall))
453 # else
454 #  define internal_function
455 # endif
456 #endif
457 
458 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
459 						int new_buf_len)
460      internal_function;
461 #ifdef RE_ENABLE_I18N
462 static void build_wcs_buffer (re_string_t *pstr) internal_function;
463 static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;
464 #endif /* RE_ENABLE_I18N */
465 static void build_upper_buffer (re_string_t *pstr) internal_function;
466 static void re_string_translate_buffer (re_string_t *pstr) internal_function;
467 static unsigned int re_string_context_at (const re_string_t *input, int idx,
468 					  int eflags)
469      internal_function __attribute ((pure));
/* Accessor macros for re_string_t.  PSTR is a pointer to the string
   object; every argument is parenthesized where operator precedence could
   otherwise change the meaning of an expression argument.  */

/* Byte OFFSET positions past the current index, without advancing.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
/* Byte at the current index; advances the current index by one.  */
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
/* True if IDX is the end of the valid data, or if wcs[IDX] holds a
   wide character rather than WEOF.  */
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)
/* True if wcs[IDX] is not WEOF and the following slot is either past the
   valid data or also not WEOF.  */
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \
				|| (pstr)->wcs[(idx) + 1] != WEOF))
/* True once the current index has reached STOP.  */
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
/* Advance (skip) or reposition (set) the current index.  */
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
486 
487 #ifdef __GNUC__
488 # define alloca(size)   __builtin_alloca (size)
489 # define HAVE_ALLOCA 1
490 #elif defined(_MSC_VER)
491 # include <malloc.h>
492 # define alloca _alloca
493 # define HAVE_ALLOCA 1
494 #else
495 # error No alloca()
496 #endif
497 
498 #ifndef _LIBC
499 # if HAVE_ALLOCA
500 /* The OS usually guarantees only one guard page at the bottom of the stack,
501    and a page size can be as small as 4096 bytes.  So we cannot safely
502    allocate anything larger than 4096 bytes.  Also care for the possibility
503    of a few compiler-allocated temporary stack slots.  */
504 #  define __libc_use_alloca(n) ((n) < 4032)
505 # else
506 /* alloca is implemented with malloc, so just use malloc.  */
507 #  define __libc_use_alloca(n) 0
508 # endif
509 #endif
510 
511 #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
512 #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
513 #define re_free(p) free (p)
514 
515 struct bin_tree_t
516 {
517   struct bin_tree_t *parent;
518   struct bin_tree_t *left;
519   struct bin_tree_t *right;
520   struct bin_tree_t *first;
521   struct bin_tree_t *next;
522 
523   re_token_t token;
524 
525   /* `node_idx' is the index in dfa->nodes, if `type' == 0.
526      Otherwise `type' indicate the type of this node.  */
527   int node_idx;
528 };
529 typedef struct bin_tree_t bin_tree_t;
530 
/* Number of bin_tree_t slots in one storage chunk: as many as fit in 1024
   bytes after reserving the size of one pointer (the NEXT link in the
   struct below).  */
531 #define BIN_TREE_STORAGE_SIZE \
532   ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
533 
/* A fixed-size chunk of bin_tree_t slots.  NEXT points to another chunk of
   the same type.  NOTE(review): re_dfa_t keeps str_tree_storage and
   str_tree_storage_idx, presumably the current chunk and the next free
   slot in DATA -- confirm against the allocator, which is outside this
   view.  */
534 struct bin_tree_storage_t
535 {
536   struct bin_tree_storage_t *next;
537   bin_tree_t data[BIN_TREE_STORAGE_SIZE];
538 };
539 typedef struct bin_tree_storage_t bin_tree_storage_t;
540 
541 #define CONTEXT_WORD 1
542 #define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
543 #define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
544 #define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
545 
546 #define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
547 #define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
548 #define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
549 #define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
550 #define IS_ORDINARY_CONTEXT(c) ((c) == 0)
551 
552 #define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
553 #define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
554 #define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
555 #define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
556 
/* Nonzero if CONTEXT (a mask of CONTEXT_* bits describing the preceding
   position) violates any PREV_* bit present in CONSTRAINT.  CONSTRAINT is
   parenthesized in every clause so that an expression argument such as
   `a | b' is tested as a whole, matching NOT_SATISFY_NEXT_CONSTRAINT.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* Nonzero if CONTEXT (a mask of CONTEXT_* bits describing the following
   position) violates any NEXT_* bit present in CONSTRAINT.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
568 
569 struct re_dfastate_t
570 {
571   unsigned int hash;
572   re_node_set nodes;
573   re_node_set non_eps_nodes;
574   re_node_set inveclosure;
575   re_node_set *entrance_nodes;
576   struct re_dfastate_t **trtable, **word_trtable;
577   unsigned int context : 4;
578   unsigned int halt : 1;
579   /* If this state can accept `multi byte'.
580      Note that we refer to multibyte characters, and multi character
581      collating elements as `multi byte'.  */
582   unsigned int accept_mb : 1;
583   /* If this state has backreference node(s).  */
584   unsigned int has_backref : 1;
585   unsigned int has_constraint : 1;
586 };
587 typedef struct re_dfastate_t re_dfastate_t;
588 
589 struct re_state_table_entry
590 {
591   int num;
592   int alloc;
593   re_dfastate_t **array;
594 };
595 
596 /* Array type used in re_sub_match_last_t and re_sub_match_top_t.  */
597 
598 typedef struct
599 {
600   int next_idx;
601   int alloc;
602   re_dfastate_t **array;
603 } state_array_t;
604 
605 /* Store information about the node NODE whose type is OP_CLOSE_SUBEXP.  */
606 
607 typedef struct
608 {
609   int node;
610   int str_idx; /* The position NODE match at.  */
611   state_array_t path;
612 } re_sub_match_last_t;
613 
614 /* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
615    And information about the node, whose type is OP_CLOSE_SUBEXP,
616    corresponding to NODE is stored in LASTS.  */
617 
618 typedef struct
619 {
620   int str_idx;
621   int node;
622   state_array_t *path;
623   int alasts; /* Allocation size of LASTS.  */
624   int nlasts; /* The number of LASTS.  */
625   re_sub_match_last_t **lasts;
626 } re_sub_match_top_t;
627 
628 struct re_backref_cache_entry
629 {
630   int node;
631   int str_idx;
632   int subexp_from;
633   int subexp_to;
634   char more;
635   char unused;
636   unsigned short int eps_reachable_subexps_map;
637 };
638 
639 typedef struct
640 {
641   /* The string object corresponding to the input string.  */
642   re_string_t input;
643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644   const re_dfa_t *const dfa;
645 #else
646   const re_dfa_t *dfa;
647 #endif
648   /* EFLAGS of the argument of regexec.  */
649   int eflags;
650   /* Where the matching ends.  */
651   int match_last;
652   int last_node;
653   /* The state log used by the matcher.  */
654   re_dfastate_t **state_log;
655   int state_log_top;
656   /* Back reference cache.  */
657   int nbkref_ents;
658   int abkref_ents;
659   struct re_backref_cache_entry *bkref_ents;
660   int max_mb_elem_len;
661   int nsub_tops;
662   int asub_tops;
663   re_sub_match_top_t **sub_tops;
664 } re_match_context_t;
665 
666 typedef struct
667 {
668   re_dfastate_t **sifted_states;
669   re_dfastate_t **limited_states;
670   int last_node;
671   int last_str_idx;
672   re_node_set limits;
673 } re_sift_context_t;
674 
675 struct re_fail_stack_ent_t
676 {
677   int idx;
678   int node;
679   regmatch_t *regs;
680   re_node_set eps_via_nodes;
681 };
682 
683 struct re_fail_stack_t
684 {
685   int num;
686   int alloc;
687   struct re_fail_stack_ent_t *stack;
688 };
689 
690 struct re_dfa_t
691 {
692   re_token_t *nodes;
693   size_t nodes_alloc;
694   size_t nodes_len;
695   int *nexts;
696   int *org_indices;
697   re_node_set *edests;
698   re_node_set *eclosures;
699   re_node_set *inveclosures;
700   struct re_state_table_entry *state_table;
701   re_dfastate_t *init_state;
702   re_dfastate_t *init_state_word;
703   re_dfastate_t *init_state_nl;
704   re_dfastate_t *init_state_begbuf;
705   bin_tree_t *str_tree;
706   bin_tree_storage_t *str_tree_storage;
707   re_bitset_ptr_t sb_char;
708   int str_tree_storage_idx;
709 
710   /* number of subexpressions `re_nsub' is in regex_t.  */
711   unsigned int state_hash_mask;
712   int init_node;
713   int nbackref; /* The number of backreference in this dfa.  */
714 
715   /* Bitmap expressing which backreference is used.  */
716   bitset_word_t used_bkref_map;
717   bitset_word_t completed_bkref_map;
718 
719   unsigned int has_plural_match : 1;
720   /* If this dfa has "multibyte node", which is a backreference or
721      a node which can accept multibyte character or multi character
722      collating element.  */
723   unsigned int has_mb_node : 1;
724   unsigned int is_utf8 : 1;
725   unsigned int map_notascii : 1;
726   unsigned int word_ops_used : 1;
727   int mb_cur_max;
728   bitset_t word_char;
729   reg_syntax_t syntax;
730   int *subexp_map;
731 #ifdef DEBUG
732   char* re_str;
733 #endif
734   __libc_lock_define (, lock)
735 };
736 
737 #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
738 #define re_node_set_remove(set,id) \
739   (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
740 #define re_node_set_empty(p) ((p)->nelem = 0)
741 #define re_node_set_free(set) re_free ((set)->elems)
742 
743 
744 typedef enum
745 {
746   SB_CHAR,
747   MB_CHAR,
748   EQUIV_CLASS,
749   COLL_SYM,
750   CHAR_CLASS
751 } bracket_elem_type;
752 
753 typedef struct
754 {
755   bracket_elem_type type;
756   union
757   {
758     unsigned char ch;
759     unsigned char *name;
760     wchar_t wch;
761   } opr;
762 } bracket_elem_t;
763 
764 
765 /* Inline functions for whole-bitset operations.  */
766 static inline void
bitset_not(bitset_t set)767 bitset_not (bitset_t set)
768 {
769   int bitset_i;
770   for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
771     set[bitset_i] = ~set[bitset_i];
772 }
773 
774 static inline void
bitset_merge(bitset_t dest,const bitset_t src)775 bitset_merge (bitset_t dest, const bitset_t src)
776 {
777   int bitset_i;
778   for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
779     dest[bitset_i] |= src[bitset_i];
780 }
781 
782 static inline void
bitset_mask(bitset_t dest,const bitset_t src)783 bitset_mask (bitset_t dest, const bitset_t src)
784 {
785   int bitset_i;
786   for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
787     dest[bitset_i] &= src[bitset_i];
788 }
789 
790 #ifdef RE_ENABLE_I18N
791 /* Inline functions for re_string.  */
792 static inline int
internal_function(pure)793 internal_function __attribute ((pure))
794 re_string_char_size_at (const re_string_t *pstr, int idx)
795 {
796   int byte_idx;
797   if (pstr->mb_cur_max == 1)
798     return 1;
799   for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
800     if (pstr->wcs[idx + byte_idx] != WEOF)
801       break;
802   return byte_idx;
803 }
804 
805 static inline wint_t
internal_function(pure)806 internal_function __attribute ((pure))
807 re_string_wchar_at (const re_string_t *pstr, int idx)
808 {
809   if (pstr->mb_cur_max == 1)
810     return (wint_t) pstr->mbs[idx];
811   return (wint_t) pstr->wcs[idx];
812 }
813 
/* Return the length in bytes of the element that starts at byte IDX of
   PSTR->mbs.  Outside glibc (_LIBC undefined) this is always 1.  Inside
   glibc, when the current LC_COLLATE locale reports a nonzero rule count,
   the length is the distance findidx advances a pointer from mbs + IDX;
   with a zero rule count it is 1.  */
814 static int
internal_function(pure)815 internal_function __attribute ((pure))
816 re_string_elem_size_at (const re_string_t *pstr, int idx)
817 {
818 # ifdef _LIBC
819   const unsigned char *p, *extra;
820   const int32_t *table, *indirect;
821   int32_t tmp;
  /* NOTE(review): weight.h is included inside the function body; it
     presumably defines findidx in terms of the locals TABLE, INDIRECT and
     EXTRA declared above, which are otherwise only assigned here --
     confirm against the glibc header before renaming or reordering.  */
822 #  include <locale/weight.h>
823   uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
824 
825   if (nrules != 0)
826     {
827       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
828       extra = (const unsigned char *)
829 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
830       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
831 						_NL_COLLATE_INDIRECTMB);
832       p = pstr->mbs + idx;
      /* Only the side effect on P is used; the returned index is ignored.  */
833       tmp = findidx (&p);
834       return p - pstr->mbs - idx;
835     }
836   else
      /* The `else' above binds to the `return 1' below, which is also the
	 entire function body when _LIBC is not defined.  */
837 # endif /* _LIBC */
838     return 1;
839 }
840 #endif /* RE_ENABLE_I18N */
841 
842 #endif /*  _REGEX_INTERNAL_H */
843 
844 /******************************************************************************/
845 /******************************************************************************/
846 /******************************************************************************/
847 /* GKINCLUDE #include "regex_internal.c" */
848 /******************************************************************************/
849 /******************************************************************************/
850 /******************************************************************************/
851 /* Extended regular expression matching and search library.
852    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
853    This file is part of the GNU C Library.
854    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
855 
856    The GNU C Library is free software; you can redistribute it and/or
857    modify it under the terms of the GNU Lesser General Public
858    License as published by the Free Software Foundation; either
859    version 2.1 of the License, or (at your option) any later version.
860 
861    The GNU C Library is distributed in the hope that it will be useful,
862    but WITHOUT ANY WARRANTY; without even the implied warranty of
863    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
864    Lesser General Public License for more details.
865 
866    You should have received a copy of the GNU Lesser General Public
867    License along with the GNU C Library; if not, write to the Free
868    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
869    02111-1307 USA.  */
870 
871 static void re_string_construct_common (const char *str, int len,
872 					re_string_t *pstr,
873 					RE_TRANSLATE_TYPE trans, int icase,
874 					const re_dfa_t *dfa) internal_function;
875 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
876 					  const re_node_set *nodes,
877 					  unsigned int hash) internal_function;
878 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
879 					  const re_node_set *nodes,
880 					  unsigned int context,
881 					  unsigned int hash) internal_function;
882 
883 /* Functions for string operation.  */
884 
885 /* This function allocate the buffers.  It is necessary to call
886    re_string_reconstruct before using the object.  */
887 
888 static reg_errcode_t
889 internal_function
re_string_allocate(re_string_t * pstr,const char * str,int len,int init_len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)890 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
891 		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
892 {
893   reg_errcode_t ret;
894   int init_buf_len;
895 
896   /* Ensure at least one character fits into the buffers.  */
897   if (init_len < dfa->mb_cur_max)
898     init_len = dfa->mb_cur_max;
899   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
900   re_string_construct_common (str, len, pstr, trans, icase, dfa);
901 
902   ret = re_string_realloc_buffers (pstr, init_buf_len);
903   if (BE (ret != REG_NOERROR, 0))
904     return ret;
905 
906   pstr->word_char = dfa->word_char;
907   pstr->word_ops_used = dfa->word_ops_used;
908   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
909   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
910   pstr->valid_raw_len = pstr->valid_len;
911   return REG_NOERROR;
912 }
913 
914 /* This function allocate the buffers, and initialize them.  */
915 
916 static reg_errcode_t
917 internal_function
re_string_construct(re_string_t * pstr,const char * str,int len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)918 re_string_construct (re_string_t *pstr, const char *str, int len,
919 		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
920 {
921   reg_errcode_t ret;
922   memset (pstr, '\0', sizeof (re_string_t));
923   re_string_construct_common (str, len, pstr, trans, icase, dfa);
924 
925   if (len > 0)
926     {
927       ret = re_string_realloc_buffers (pstr, len + 1);
928       if (BE (ret != REG_NOERROR, 0))
929 	return ret;
930     }
931   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
932 
933   if (icase)
934     {
935 #ifdef RE_ENABLE_I18N
936       if (dfa->mb_cur_max > 1)
937 	{
938 	  while (1)
939 	    {
940 	      ret = build_wcs_upper_buffer (pstr);
941 	      if (BE (ret != REG_NOERROR, 0))
942 		return ret;
943 	      if (pstr->valid_raw_len >= len)
944 		break;
945 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
946 		break;
947 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
948 	      if (BE (ret != REG_NOERROR, 0))
949 		return ret;
950 	    }
951 	}
952       else
953 #endif /* RE_ENABLE_I18N  */
954 	build_upper_buffer (pstr);
955     }
956   else
957     {
958 #ifdef RE_ENABLE_I18N
959       if (dfa->mb_cur_max > 1)
960 	build_wcs_buffer (pstr);
961       else
962 #endif /* RE_ENABLE_I18N  */
963 	{
964 	  if (trans != NULL)
965 	    re_string_translate_buffer (pstr);
966 	  else
967 	    {
968 	      pstr->valid_len = pstr->bufs_len;
969 	      pstr->valid_raw_len = pstr->bufs_len;
970 	    }
971 	}
972     }
973 
974   return REG_NOERROR;
975 }
976 
/* Helper functions for re_string_allocate and re_string_construct.  */
978 
979 static reg_errcode_t
980 internal_function
re_string_realloc_buffers(re_string_t * pstr,int new_buf_len)981 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
982 {
983 #ifdef RE_ENABLE_I18N
984   if (pstr->mb_cur_max > 1)
985     {
986       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
987       if (BE (new_wcs == NULL, 0))
988 	return REG_ESPACE;
989       pstr->wcs = new_wcs;
990       if (pstr->offsets != NULL)
991 	{
992 	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
993 	  if (BE (new_offsets == NULL, 0))
994 	    return REG_ESPACE;
995 	  pstr->offsets = new_offsets;
996 	}
997     }
998 #endif /* RE_ENABLE_I18N  */
999   if (pstr->mbs_allocated)
1000     {
1001       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
1002 					   new_buf_len);
1003       if (BE (new_mbs == NULL, 0))
1004 	return REG_ESPACE;
1005       pstr->mbs = new_mbs;
1006     }
1007   pstr->bufs_len = new_buf_len;
1008   return REG_NOERROR;
1009 }
1010 
1011 
1012 static void
1013 internal_function
re_string_construct_common(const char * str,int len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)1014 re_string_construct_common (const char *str, int len, re_string_t *pstr,
1015 			    RE_TRANSLATE_TYPE trans, int icase,
1016 			    const re_dfa_t *dfa)
1017 {
1018   pstr->raw_mbs = (const unsigned char *) str;
1019   pstr->len = len;
1020   pstr->raw_len = len;
1021   pstr->trans = trans;
1022   pstr->icase = icase ? 1 : 0;
1023   pstr->mbs_allocated = (trans != NULL || icase);
1024   pstr->mb_cur_max = dfa->mb_cur_max;
1025   pstr->is_utf8 = dfa->is_utf8;
1026   pstr->map_notascii = dfa->map_notascii;
1027   pstr->stop = pstr->len;
1028   pstr->raw_stop = pstr->stop;
1029 }
1030 
1031 #ifdef RE_ENABLE_I18N
1032 
1033 /* Build wide character buffer PSTR->WCS.
1034    If the byte sequence of the string are:
1035      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
1036    Then wide character buffer will be:
1037      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
1038    We use WEOF for padding, they indicate that the position isn't
1039    a first byte of a multibyte character.
1040 
1041    Note that this function assumes PSTR->VALID_LEN elements are already
1042    built and starts from PSTR->VALID_LEN.  */
1043 
1044 static void
1045 internal_function
build_wcs_buffer(re_string_t * pstr)1046 build_wcs_buffer (re_string_t *pstr)
1047 {
1048 #ifdef _LIBC
1049   unsigned char buf[MB_LEN_MAX];
1050   assert (MB_LEN_MAX >= pstr->mb_cur_max);
1051 #else
1052   unsigned char buf[64];
1053 #endif
1054   mbstate_t prev_st;
1055   int byte_idx, end_idx, remain_len;
1056   size_t mbclen;
1057 
1058   /* Build the buffers from pstr->valid_len to either pstr->len or
1059      pstr->bufs_len.  */
1060   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1061   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
1062     {
1063       wchar_t wc;
1064       const char *p;
1065 
1066       remain_len = end_idx - byte_idx;
1067       prev_st = pstr->cur_state;
1068       /* Apply the translation if we need.  */
1069       if (BE (pstr->trans != NULL, 0))
1070 	{
1071 	  int i, ch;
1072 
1073 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1074 	    {
1075 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
1076 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
1077 	    }
1078 	  p = (const char *) buf;
1079 	}
1080       else
1081 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
1082       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
1083       if (BE (mbclen == (size_t) -2, 0))
1084 	{
1085 	  /* The buffer doesn't have enough space, finish to build.  */
1086 	  pstr->cur_state = prev_st;
1087 	  break;
1088 	}
1089       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
1090 	{
1091 	  /* We treat these cases as a singlebyte character.  */
1092 	  mbclen = 1;
1093 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1094 	  if (BE (pstr->trans != NULL, 0))
1095 	    wc = pstr->trans[wc];
1096 	  pstr->cur_state = prev_st;
1097 	}
1098 
1099       /* Write wide character and padding.  */
1100       pstr->wcs[byte_idx++] = wc;
1101       /* Write paddings.  */
1102       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1103 	pstr->wcs[byte_idx++] = WEOF;
1104     }
1105   pstr->valid_len = byte_idx;
1106   pstr->valid_raw_len = byte_idx;
1107 }
1108 
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
   but for REG_ICASE: lower-case characters are converted to upper case
   before they are stored in PSTR->MBS and PSTR->WCS.

   Building resumes at PSTR->VALID_LEN and runs up to the smaller of
   PSTR->LEN and PSTR->BUFS_LEN.  Two code paths exist:

   - a fast path, taken when PSTR->MAP_NOTASCII is clear, there is no
     translation table and no offsets table is in use.  It assumes that
     buffer index and raw index coincide;
   - a general path that keeps separate indices into the raw input
     (SRC_IDX) and into the buffers (BYTE_IDX) and maintains
     PSTR->OFFSETS once a character has been seen whose upper-case form
     has a different byte length.  The fast path jumps into it when it
     meets such a character.

   On return PSTR->VALID_LEN and PSTR->VALID_RAW_LEN describe how far the
   buffers and the raw input have been processed.  Returns REG_NOERROR,
   or REG_ESPACE if the offsets table cannot be allocated.  */

static reg_errcode_t
internal_function
build_wcs_upper_buffer (re_string_t *pstr)
{
  mbstate_t prev_st;
  int src_idx, byte_idx, end_idx, remain_len;
  size_t mbclen;
#ifdef _LIBC
  char buf[MB_LEN_MAX];
  assert (MB_LEN_MAX >= pstr->mb_cur_max);
#else
  char buf[64];
#endif

  byte_idx = pstr->valid_len;
  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;

  /* The following optimization assumes that ASCII characters can be
     mapped to wide characters with a simple cast.  */
  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
    {
      while (byte_idx < end_idx)
	{
	  wchar_t wc;

	  /* An ASCII byte met in the initial conversion state needs no
	     mbrtowc call at all.  */
	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
	      && mbsinit (&pstr->cur_state))
	    {
	      /* In case of a singlebyte character.  */
	      pstr->mbs[byte_idx]
		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
	      /* The next step uses the assumption that wchar_t is encoded
		 ASCII-safe: all ASCII values can be converted like this.  */
	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
	      ++byte_idx;
	      continue;
	    }

	  remain_len = end_idx - byte_idx;
	  /* Remember the state so it can be restored, and so the
	     upper-case form can be encoded from the same state.  */
	  prev_st = pstr->cur_state;
	  mbclen = mbrtowc (&wc,
			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
			     + byte_idx), remain_len, &pstr->cur_state);
	  /* MBCLEN is unsigned, so this holds exactly when it is none of
	     0, (size_t) -1 and (size_t) -2, i.e. a character of MBCLEN
	     bytes was converted.  */
	  if (BE (mbclen + 2 > 2, 1))
	    {
	      wchar_t wcu = wc;
	      if (iswlower (wc))
		{
		  size_t mbcdlen;

		  wcu = towupper (wc);
		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
		  if (BE (mbclen == mbcdlen, 1))
		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
		  else
		    {
		      /* The upper-case form does not occupy the same
			 number of bytes (or could not be encoded).  The
			 fast path cannot represent that; redo this
			 character in the general loop below.  */
		      src_idx = byte_idx;
		      goto offsets_needed;
		    }
		}
	      else
		memcpy (pstr->mbs + byte_idx,
			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
	      pstr->wcs[byte_idx++] = wcu;
	      /* Write paddings.  */
	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
		pstr->wcs[byte_idx++] = WEOF;
	    }
	  else if (mbclen == (size_t) -1 || mbclen == 0)
	    {
	      /* It is an invalid character or '\0'.  Just use the byte.  */
	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
	      pstr->mbs[byte_idx] = ch;
	      /* And also cast it to wide char.  */
	      pstr->wcs[byte_idx++] = (wchar_t) ch;
	      if (BE (mbclen == (size_t) -1, 0))
		pstr->cur_state = prev_st;
	    }
	  else
	    {
	      /* The buffer doesn't have enough space, finish to build.  */
	      pstr->cur_state = prev_st;
	      break;
	    }
	}
      /* On the fast path raw and buffer positions are the same.  */
      pstr->valid_len = byte_idx;
      pstr->valid_raw_len = byte_idx;
      return REG_NOERROR;
    }
  else
    /* General path: SRC_IDX walks the raw input, BYTE_IDX the buffers.  */
    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
      {
	wchar_t wc;
	const char *p;
	/* Entry point for the fast path above; it arrives here with
	   SRC_IDX equal to BYTE_IDX and the conversion state already
	   advanced by its own mbrtowc call.
	   NOTE(review): PREV_ST is re-saved from that advanced state
	   below -- confirm this is intended for stateful encodings.  */
      offsets_needed:
	remain_len = end_idx - byte_idx;
	prev_st = pstr->cur_state;
	if (BE (pstr->trans != NULL, 0))
	  {
	    int i, ch;

	    /* Translate up to one character's worth of bytes into BUF
	       and convert from there.  */
	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
	      {
		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
		buf[i] = pstr->trans[ch];
	      }
	    p = (const char *) buf;
	  }
	else
	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
	/* True exactly when MBCLEN is a real character length (not 0,
	   (size_t) -1 or (size_t) -2).  */
	if (BE (mbclen + 2 > 2, 1))
	  {
	    wchar_t wcu = wc;
	    if (iswlower (wc))
	      {
		size_t mbcdlen;

		wcu = towupper (wc);
		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
		if (BE (mbclen == mbcdlen, 1))
		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
		else if (mbcdlen != (size_t) -1)
		  {
		    /* The upper-case form has a different byte length:
		       from here on buffer and raw positions diverge and
		       PSTR->OFFSETS maps the former to the latter.  */
		    size_t i;

		    /* Give up for now if the converted character does not
		       fit; the character is redone on a later call.  */
		    if (byte_idx + mbcdlen > pstr->bufs_len)
		      {
			pstr->cur_state = prev_st;
			break;
		      }

		    /* The offsets table is created on first need.  */
		    if (pstr->offsets == NULL)
		      {
			pstr->offsets = re_malloc (int, pstr->bufs_len);

			if (pstr->offsets == NULL)
			  return REG_ESPACE;
		      }
		    /* Everything built so far maps one to one.  */
		    if (!pstr->offsets_needed)
		      {
			for (i = 0; i < (size_t) byte_idx; ++i)
			  pstr->offsets[i] = i;
			pstr->offsets_needed = 1;
		      }

		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
		    pstr->wcs[byte_idx] = wcu;
		    pstr->offsets[byte_idx] = src_idx;
		    /* Trailing bytes map to the matching source byte, or
		       to the last source byte when the converted
		       character is the longer one.  */
		    for (i = 1; i < mbcdlen; ++i)
		      {
			pstr->offsets[byte_idx + i]
			  = src_idx + (i < mbclen ? i : mbclen - 1);
			pstr->wcs[byte_idx + i] = WEOF;
		      }
		    /* The buffer view changes length by the difference;
		       STOP follows only if it lies behind this
		       character.  */
		    pstr->len += mbcdlen - mbclen;
		    if (pstr->raw_stop > src_idx)
		      pstr->stop += mbcdlen - mbclen;
		    end_idx = (pstr->bufs_len > pstr->len)
			      ? pstr->len : pstr->bufs_len;
		    byte_idx += mbcdlen;
		    src_idx += mbclen;
		    continue;
		  }
                else
		  /* wcrtomb failed: keep the bytes that were converted.
		     NOTE(review): with a translation table P points to
		     BUF, which was also handed to wcrtomb above --
		     confirm its contents are still the translated
		     input after a failed call.  */
                  memcpy (pstr->mbs + byte_idx, p, mbclen);
	      }
	    else
	      memcpy (pstr->mbs + byte_idx, p, mbclen);

	    if (BE (pstr->offsets_needed != 0, 0))
	      {
		size_t i;
		for (i = 0; i < mbclen; ++i)
		  pstr->offsets[byte_idx + i] = src_idx + i;
	      }
	    src_idx += mbclen;

	    pstr->wcs[byte_idx++] = wcu;
	    /* Write paddings.  */
	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
	      pstr->wcs[byte_idx++] = WEOF;
	  }
	else if (mbclen == (size_t) -1 || mbclen == 0)
	  {
	    /* It is an invalid character or '\0'.  Just use the byte.  */
	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];

	    if (BE (pstr->trans != NULL, 0))
	      ch = pstr->trans [ch];
	    pstr->mbs[byte_idx] = ch;

	    if (BE (pstr->offsets_needed != 0, 0))
	      pstr->offsets[byte_idx] = src_idx;
	    ++src_idx;

	    /* And also cast it to wide char.  */
	    pstr->wcs[byte_idx++] = (wchar_t) ch;
	    if (BE (mbclen == (size_t) -1, 0))
	      pstr->cur_state = prev_st;
	  }
	else
	  {
	    /* The buffer doesn't have enough space, finish to build.  */
	    pstr->cur_state = prev_st;
	    break;
	  }
      }
  /* Buffer and raw positions may differ on the general path.  */
  pstr->valid_len = byte_idx;
  pstr->valid_raw_len = src_idx;
  return REG_NOERROR;
}
1324 
1325 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
1326    Return the index.  */
1327 
1328 static int
1329 internal_function
re_string_skip_chars(re_string_t * pstr,int new_raw_idx,wint_t * last_wc)1330 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
1331 {
1332   mbstate_t prev_st;
1333   int rawbuf_idx;
1334   size_t mbclen;
1335   wchar_t wc = WEOF;
1336 
1337   /* Skip the characters which are not necessary to check.  */
1338   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
1339        rawbuf_idx < new_raw_idx;)
1340     {
1341       int remain_len;
1342       remain_len = pstr->len - rawbuf_idx;
1343       prev_st = pstr->cur_state;
1344       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
1345 			remain_len, &pstr->cur_state);
1346       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1347 	{
1348 	  /* We treat these cases as a single byte character.  */
1349 	  if (mbclen == 0 || remain_len == 0)
1350 	    wc = L'\0';
1351 	  else
1352 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1353 	  mbclen = 1;
1354 	  pstr->cur_state = prev_st;
1355 	}
1356       /* Then proceed the next character.  */
1357       rawbuf_idx += mbclen;
1358     }
1359   *last_wc = (wint_t) wc;
1360   return rawbuf_idx;
1361 }
1362 #endif /* RE_ENABLE_I18N  */
1363 
1364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
1365    This function is used in case of REG_ICASE.  */
1366 
1367 static void
1368 internal_function
build_upper_buffer(re_string_t * pstr)1369 build_upper_buffer (re_string_t *pstr)
1370 {
1371   int char_idx, end_idx;
1372   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1373 
1374   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
1375     {
1376       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
1377       if (BE (pstr->trans != NULL, 0))
1378 	ch = pstr->trans[ch];
1379       if (islower (ch))
1380 	pstr->mbs[char_idx] = toupper (ch);
1381       else
1382 	pstr->mbs[char_idx] = ch;
1383     }
1384   pstr->valid_len = char_idx;
1385   pstr->valid_raw_len = char_idx;
1386 }
1387 
1388 /* Apply TRANS to the buffer in PSTR.  */
1389 
1390 static void
1391 internal_function
re_string_translate_buffer(re_string_t * pstr)1392 re_string_translate_buffer (re_string_t *pstr)
1393 {
1394   int buf_idx, end_idx;
1395   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1396 
1397   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
1398     {
1399       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
1400       pstr->mbs[buf_idx] = pstr->trans[ch];
1401     }
1402 
1403   pstr->valid_len = buf_idx;
1404   pstr->valid_raw_len = buf_idx;
1405 }
1406 
1407 /* This function re-construct the buffers.
1408    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
1409    convert to upper case in case of REG_ICASE, apply translation.  */
1410 
1411 static reg_errcode_t
1412 internal_function
re_string_reconstruct(re_string_t * pstr,int idx,int eflags)1413 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
1414 {
1415   int offset = idx - pstr->raw_mbs_idx;
1416   if (BE (offset < 0, 0))
1417     {
1418       /* Reset buffer.  */
1419 #ifdef RE_ENABLE_I18N
1420       if (pstr->mb_cur_max > 1)
1421 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1422 #endif /* RE_ENABLE_I18N */
1423       pstr->len = pstr->raw_len;
1424       pstr->stop = pstr->raw_stop;
1425       pstr->valid_len = 0;
1426       pstr->raw_mbs_idx = 0;
1427       pstr->valid_raw_len = 0;
1428       pstr->offsets_needed = 0;
1429       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1430 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
1431       if (!pstr->mbs_allocated)
1432 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
1433       offset = idx;
1434     }
1435 
1436   if (BE (offset != 0, 1))
1437     {
1438       /* Should the already checked characters be kept?  */
1439       if (BE (offset < pstr->valid_raw_len, 1))
1440 	{
1441 	  /* Yes, move them to the front of the buffer.  */
1442 #ifdef RE_ENABLE_I18N
1443 	  if (BE (pstr->offsets_needed, 0))
1444 	    {
1445 	      int low = 0, high = pstr->valid_len, mid;
1446 	      do
1447 		{
1448 		  mid = (high + low) / 2;
1449 		  if (pstr->offsets[mid] > offset)
1450 		    high = mid;
1451 		  else if (pstr->offsets[mid] < offset)
1452 		    low = mid + 1;
1453 		  else
1454 		    break;
1455 		}
1456 	      while (low < high);
1457 	      if (pstr->offsets[mid] < offset)
1458 		++mid;
1459 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
1460 							eflags);
1461 	      /* This can be quite complicated, so handle specially
1462 		 only the common and easy case where the character with
1463 		 different length representation of lower and upper
1464 		 case is present at or after offset.  */
1465 	      if (pstr->valid_len > offset
1466 		  && mid == offset && pstr->offsets[mid] == offset)
1467 		{
1468 		  memmove (pstr->wcs, pstr->wcs + offset,
1469 			   (pstr->valid_len - offset) * sizeof (wint_t));
1470 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
1471 		  pstr->valid_len -= offset;
1472 		  pstr->valid_raw_len -= offset;
1473 		  for (low = 0; low < pstr->valid_len; low++)
1474 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
1475 		}
1476 	      else
1477 		{
1478 		  /* Otherwise, just find out how long the partial multibyte
1479 		     character at offset is and fill it with WEOF/255.  */
1480 		  pstr->len = pstr->raw_len - idx + offset;
1481 		  pstr->stop = pstr->raw_stop - idx + offset;
1482 		  pstr->offsets_needed = 0;
1483 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
1484 		    --mid;
1485 		  while (mid < pstr->valid_len)
1486 		    if (pstr->wcs[mid] != WEOF)
1487 		      break;
1488 		    else
1489 		      ++mid;
1490 		  if (mid == pstr->valid_len)
1491 		    pstr->valid_len = 0;
1492 		  else
1493 		    {
1494 		      pstr->valid_len = pstr->offsets[mid] - offset;
1495 		      if (pstr->valid_len)
1496 			{
1497 			  for (low = 0; low < pstr->valid_len; ++low)
1498 			    pstr->wcs[low] = WEOF;
1499 			  memset (pstr->mbs, 255, pstr->valid_len);
1500 			}
1501 		    }
1502 		  pstr->valid_raw_len = pstr->valid_len;
1503 		}
1504 	    }
1505 	  else
1506 #endif
1507 	    {
1508 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
1509 							eflags);
1510 #ifdef RE_ENABLE_I18N
1511 	      if (pstr->mb_cur_max > 1)
1512 		memmove (pstr->wcs, pstr->wcs + offset,
1513 			 (pstr->valid_len - offset) * sizeof (wint_t));
1514 #endif /* RE_ENABLE_I18N */
1515 	      if (BE (pstr->mbs_allocated, 0))
1516 		memmove (pstr->mbs, pstr->mbs + offset,
1517 			 pstr->valid_len - offset);
1518 	      pstr->valid_len -= offset;
1519 	      pstr->valid_raw_len -= offset;
1520 #if DEBUG
1521 	      assert (pstr->valid_len > 0);
1522 #endif
1523 	    }
1524 	}
1525       else
1526 	{
1527 	  /* No, skip all characters until IDX.  */
1528 	  int prev_valid_len = pstr->valid_len;
1529 
1530 #ifdef RE_ENABLE_I18N
1531 	  if (BE (pstr->offsets_needed, 0))
1532 	    {
1533 	      pstr->len = pstr->raw_len - idx + offset;
1534 	      pstr->stop = pstr->raw_stop - idx + offset;
1535 	      pstr->offsets_needed = 0;
1536 	    }
1537 #endif
1538 	  pstr->valid_len = 0;
1539 #ifdef RE_ENABLE_I18N
1540 	  if (pstr->mb_cur_max > 1)
1541 	    {
1542 	      int wcs_idx;
1543 	      wint_t wc = WEOF;
1544 
1545 	      if (pstr->is_utf8)
1546 		{
1547 		  const unsigned char *raw, *p, *q, *end;
1548 
1549 		  /* Special case UTF-8.  Multi-byte chars start with any
1550 		     byte other than 0x80 - 0xbf.  */
1551 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
1552 		  end = raw + (offset - pstr->mb_cur_max);
1553 		  if (end < pstr->raw_mbs)
1554 		    end = pstr->raw_mbs;
1555 		  p = raw + offset - 1;
1556 #ifdef _LIBC
1557 		  /* We know the wchar_t encoding is UCS4, so for the simple
1558 		     case, ASCII characters, skip the conversion step.  */
1559 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
1560 		    {
1561 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1562 		      /* pstr->valid_len = 0; */
1563 		      wc = (wchar_t) *p;
1564 		    }
1565 		  else
1566 #endif
1567 		    for (; p >= end; --p)
1568 		      if ((*p & 0xc0) != 0x80)
1569 			{
1570 			  mbstate_t cur_state;
1571 			  wchar_t wc2;
1572 			  int mlen = raw + pstr->len - p;
1573 			  unsigned char buf[6];
1574 			  size_t mbclen;
1575 
1576 			  q = p;
1577 			  if (BE (pstr->trans != NULL, 0))
1578 			    {
1579 			      int i = mlen < 6 ? mlen : 6;
1580 			      while (--i >= 0)
1581 				buf[i] = pstr->trans[p[i]];
1582 			      q = buf;
1583 			    }
1584 			  /* XXX Don't use mbrtowc, we know which conversion
1585 			     to use (UTF-8 -> UCS4).  */
1586 			  memset (&cur_state, 0, sizeof (cur_state));
1587 			  mbclen = mbrtowc (&wc2, (const char *) p, mlen,
1588 					    &cur_state);
1589 			  if (raw + offset - p <= mbclen
1590 			      && mbclen < (size_t) -2)
1591 			    {
1592 			      memset (&pstr->cur_state, '\0',
1593 				      sizeof (mbstate_t));
1594 			      pstr->valid_len = mbclen - (raw + offset - p);
1595 			      wc = wc2;
1596 			    }
1597 			  break;
1598 			}
1599 		}
1600 
1601 	      if (wc == WEOF)
1602 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
1603 	      if (wc == WEOF)
1604 		pstr->tip_context
1605 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
1606 	      else
1607 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
1608 				      && IS_WIDE_WORD_CHAR (wc))
1609 				     ? CONTEXT_WORD
1610 				     : ((IS_WIDE_NEWLINE (wc)
1611 					 && pstr->newline_anchor)
1612 					? CONTEXT_NEWLINE : 0));
1613 	      if (BE (pstr->valid_len, 0))
1614 		{
1615 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
1616 		    pstr->wcs[wcs_idx] = WEOF;
1617 		  if (pstr->mbs_allocated)
1618 		    memset (pstr->mbs, 255, pstr->valid_len);
1619 		}
1620 	      pstr->valid_raw_len = pstr->valid_len;
1621 	    }
1622 	  else
1623 #endif /* RE_ENABLE_I18N */
1624 	    {
1625 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
1626 	      pstr->valid_raw_len = 0;
1627 	      if (pstr->trans)
1628 		c = pstr->trans[c];
1629 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
1630 				   ? CONTEXT_WORD
1631 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
1632 				      ? CONTEXT_NEWLINE : 0));
1633 	    }
1634 	}
1635       if (!BE (pstr->mbs_allocated, 0))
1636 	pstr->mbs += offset;
1637     }
1638   pstr->raw_mbs_idx = idx;
1639   pstr->len -= offset;
1640   pstr->stop -= offset;
1641 
1642   /* Then build the buffers.  */
1643 #ifdef RE_ENABLE_I18N
1644   if (pstr->mb_cur_max > 1)
1645     {
1646       if (pstr->icase)
1647 	{
1648 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
1649 	  if (BE (ret != REG_NOERROR, 0))
1650 	    return ret;
1651 	}
1652       else
1653 	build_wcs_buffer (pstr);
1654     }
1655   else
1656 #endif /* RE_ENABLE_I18N */
1657     if (BE (pstr->mbs_allocated, 0))
1658       {
1659 	if (pstr->icase)
1660 	  build_upper_buffer (pstr);
1661 	else if (pstr->trans != NULL)
1662 	  re_string_translate_buffer (pstr);
1663       }
1664     else
1665       pstr->valid_len = pstr->len;
1666 
1667   pstr->cur_idx = 0;
1668   return REG_NOERROR;
1669 }
1670 
1671 static unsigned char
internal_function(pure)1672 internal_function __attribute ((pure))
1673 re_string_peek_byte_case (const re_string_t *pstr, int idx)
1674 {
1675   int ch, off;
1676 
1677   /* Handle the common (easiest) cases first.  */
1678   if (BE (!pstr->mbs_allocated, 1))
1679     return re_string_peek_byte (pstr, idx);
1680 
1681 #ifdef RE_ENABLE_I18N
1682   if (pstr->mb_cur_max > 1
1683       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
1684     return re_string_peek_byte (pstr, idx);
1685 #endif
1686 
1687   off = pstr->cur_idx + idx;
1688 #ifdef RE_ENABLE_I18N
1689   if (pstr->offsets_needed)
1690     off = pstr->offsets[off];
1691 #endif
1692 
1693   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1694 
1695 #ifdef RE_ENABLE_I18N
1696   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
1697      this function returns CAPITAL LETTER I instead of first byte of
1698      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
1699      since peek_byte_case doesn't advance cur_idx in any way.  */
1700   if (pstr->offsets_needed && !isascii (ch))
1701     return re_string_peek_byte (pstr, idx);
1702 #endif
1703 
1704   return ch;
1705 }
1706 
1707 static unsigned char
internal_function(pure)1708 internal_function __attribute ((pure))
1709 re_string_fetch_byte_case (re_string_t *pstr)
1710 {
1711   if (BE (!pstr->mbs_allocated, 1))
1712     return re_string_fetch_byte (pstr);
1713 
1714 #ifdef RE_ENABLE_I18N
1715   if (pstr->offsets_needed)
1716     {
1717       int off, ch;
1718 
1719       /* For tr_TR.UTF-8 [[:islower:]] there is
1720 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
1721 	 in that case the whole multi-byte character and return
1722 	 the original letter.  On the other side, with
1723 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
1724 	 anything else would complicate things too much.  */
1725 
1726       if (!re_string_first_byte (pstr, pstr->cur_idx))
1727 	return re_string_fetch_byte (pstr);
1728 
1729       off = pstr->offsets[pstr->cur_idx];
1730       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1731 
1732       if (! isascii (ch))
1733 	return re_string_fetch_byte (pstr);
1734 
1735       re_string_skip_bytes (pstr,
1736 			    re_string_char_size_at (pstr, pstr->cur_idx));
1737       return ch;
1738     }
1739 #endif
1740 
1741   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
1742 }
1743 
1744 static void
1745 internal_function
re_string_destruct(re_string_t * pstr)1746 re_string_destruct (re_string_t *pstr)
1747 {
1748 #ifdef RE_ENABLE_I18N
1749   re_free (pstr->wcs);
1750   re_free (pstr->offsets);
1751 #endif /* RE_ENABLE_I18N  */
1752   if (pstr->mbs_allocated)
1753     re_free (pstr->mbs);
1754 }
1755 
1756 /* Return the context at IDX in INPUT.  */
1757 
1758 static unsigned int
1759 internal_function
re_string_context_at(const re_string_t * input,int idx,int eflags)1760 re_string_context_at (const re_string_t *input, int idx, int eflags)
1761 {
1762   int c;
1763   if (BE (idx < 0, 0))
1764     /* In this case, we use the value stored in input->tip_context,
1765        since we can't know the character in input->mbs[-1] here.  */
1766     return input->tip_context;
1767   if (BE (idx == input->len, 0))
1768     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
1769 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
1770 #ifdef RE_ENABLE_I18N
1771   if (input->mb_cur_max > 1)
1772     {
1773       wint_t wc;
1774       int wc_idx = idx;
1775       while(input->wcs[wc_idx] == WEOF)
1776 	{
1777 #ifdef DEBUG
1778 	  /* It must not happen.  */
1779 	  assert (wc_idx >= 0);
1780 #endif
1781 	  --wc_idx;
1782 	  if (wc_idx < 0)
1783 	    return input->tip_context;
1784 	}
1785       wc = input->wcs[wc_idx];
1786       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1787 	return CONTEXT_WORD;
1788       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
1789 	      ? CONTEXT_NEWLINE : 0);
1790     }
1791   else
1792 #endif
1793     {
1794       c = re_string_byte_at (input, idx);
1795       if (bitset_contain (input->word_char, c))
1796 	return CONTEXT_WORD;
1797       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
1798     }
1799 }
1800 
/* Functions for set operations.  */
1802 
1803 static reg_errcode_t
1804 internal_function
re_node_set_alloc(re_node_set * set,int size)1805 re_node_set_alloc (re_node_set *set, int size)
1806 {
1807   set->alloc = size;
1808   set->nelem = 0;
1809   set->elems = re_malloc (int, size);
1810   if (BE (set->elems == NULL, 0))
1811     return REG_ESPACE;
1812   return REG_NOERROR;
1813 }
1814 
1815 static reg_errcode_t
1816 internal_function
re_node_set_init_1(re_node_set * set,int elem)1817 re_node_set_init_1 (re_node_set *set, int elem)
1818 {
1819   set->alloc = 1;
1820   set->nelem = 1;
1821   set->elems = re_malloc (int, 1);
1822   if (BE (set->elems == NULL, 0))
1823     {
1824       set->alloc = set->nelem = 0;
1825       return REG_ESPACE;
1826     }
1827   set->elems[0] = elem;
1828   return REG_NOERROR;
1829 }
1830 
1831 static reg_errcode_t
1832 internal_function
re_node_set_init_2(re_node_set * set,int elem1,int elem2)1833 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1834 {
1835   set->alloc = 2;
1836   set->elems = re_malloc (int, 2);
1837   if (BE (set->elems == NULL, 0))
1838     return REG_ESPACE;
1839   if (elem1 == elem2)
1840     {
1841       set->nelem = 1;
1842       set->elems[0] = elem1;
1843     }
1844   else
1845     {
1846       set->nelem = 2;
1847       if (elem1 < elem2)
1848 	{
1849 	  set->elems[0] = elem1;
1850 	  set->elems[1] = elem2;
1851 	}
1852       else
1853 	{
1854 	  set->elems[0] = elem2;
1855 	  set->elems[1] = elem1;
1856 	}
1857     }
1858   return REG_NOERROR;
1859 }
1860 
1861 static reg_errcode_t
1862 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1863 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1864 {
1865   dest->nelem = src->nelem;
1866   if (src->nelem > 0)
1867     {
1868       dest->alloc = dest->nelem;
1869       dest->elems = re_malloc (int, dest->alloc);
1870       if (BE (dest->elems == NULL, 0))
1871 	{
1872 	  dest->alloc = dest->nelem = 0;
1873 	  return REG_ESPACE;
1874 	}
1875       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1876     }
1877   else
1878     re_node_set_init_empty (dest);
1879   return REG_NOERROR;
1880 }
1881 
/* Calculate the intersection of the sets SRC1 and SRC2, and merge it
   into DEST.  Items already present in DEST are not added twice.
   Returns REG_ESPACE if DEST's buffer cannot be enlarged, REG_NOERROR
   otherwise (including when either source set is empty, in which case
   DEST is left untouched).
   The comparisons below walk all three sets from their last element
   downwards and treat larger indices as holding larger values, i.e.
   they rely on the element arrays being sorted in ascending order.
   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */

static reg_errcode_t
internal_function
re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
			   const re_node_set *src2)
{
  int i1, i2, is, id, delta, sbase;
  /* An empty source set means an empty intersection: nothing to add.  */
  if (src1->nelem == 0 || src2->nelem == 0)
    return REG_NOERROR;

  /* We need dest->nelem + 2 * elems_in_intersection; this is a
     conservative estimate.  The extra room at the top of the buffer is
     used as a staging area for the new items.  */
  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
    {
      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
      int *new_elems = re_realloc (dest->elems, int, new_alloc);
      if (BE (new_elems == NULL, 0))
        return REG_ESPACE;
      dest->elems = new_elems;
      dest->alloc = new_alloc;
    }

  /* Find the items in the intersection of SRC1 and SRC2, and copy
     into the top of DEST those that are not already in DEST itself.
     SBASE starts one past the staging area and moves down as items
     are staged, so the staged items end up at [SBASE, initial SBASE).  */
  sbase = dest->nelem + src1->nelem + src2->nelem;
  i1 = src1->nelem - 1;
  i2 = src2->nelem - 1;
  id = dest->nelem - 1;
  for (;;)
    {
      if (src1->elems[i1] == src2->elems[i2])
	{
	  /* Try to find the item in DEST.  Maybe we could binary search?  */
	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
	    --id;

	  /* Stage the common item only if DEST does not hold it yet.  */
          if (id < 0 || dest->elems[id] != src1->elems[i1])
            dest->elems[--sbase] = src1->elems[i1];

	  /* Advance both sources; stop once either is exhausted.  */
	  if (--i1 < 0 || --i2 < 0)
	    break;
	}

      /* Lower the highest of the two items.  */
      else if (src1->elems[i1] < src2->elems[i2])
	{
	  if (--i2 < 0)
	    break;
	}
      else
	{
	  if (--i1 < 0)
	    break;
	}
    }

  /* ID points at the last original DEST item, IS at the last staged
     item, and DELTA is the number of staged items still to be placed.  */
  id = dest->nelem - 1;
  is = dest->nelem + src1->nelem + src2->nelem - 1;
  delta = is - sbase + 1;

  /* Now copy.  When DELTA becomes zero, the remaining
     DEST elements are already in place; this is more or
     less the same loop that is in re_node_set_merge.  */
  dest->nelem += delta;
  if (delta > 0 && id >= 0)
    for (;;)
      {
        if (dest->elems[is] > dest->elems[id])
          {
            /* Copy from the top.  */
            dest->elems[id + delta--] = dest->elems[is--];
            if (delta == 0)
              break;
          }
        else
          {
            /* Slide from the bottom.  */
            dest->elems[id + delta] = dest->elems[id];
            if (--id < 0)
              break;
          }
      }

  /* Copy remaining SRC elements.  DELTA is the count of staged items
     not yet placed; when it is zero this copies nothing.  */
  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));

  return REG_NOERROR;
}
1973 
1974 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1975    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1976 
1977 static reg_errcode_t
1978 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1979 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1980 			const re_node_set *src2)
1981 {
1982   int i1, i2, id;
1983   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1984     {
1985       dest->alloc = src1->nelem + src2->nelem;
1986       dest->elems = re_malloc (int, dest->alloc);
1987       if (BE (dest->elems == NULL, 0))
1988 	return REG_ESPACE;
1989     }
1990   else
1991     {
1992       if (src1 != NULL && src1->nelem > 0)
1993 	return re_node_set_init_copy (dest, src1);
1994       else if (src2 != NULL && src2->nelem > 0)
1995 	return re_node_set_init_copy (dest, src2);
1996       else
1997 	re_node_set_init_empty (dest);
1998       return REG_NOERROR;
1999     }
2000   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
2001     {
2002       if (src1->elems[i1] > src2->elems[i2])
2003 	{
2004 	  dest->elems[id++] = src2->elems[i2++];
2005 	  continue;
2006 	}
2007       if (src1->elems[i1] == src2->elems[i2])
2008 	++i2;
2009       dest->elems[id++] = src1->elems[i1++];
2010     }
2011   if (i1 < src1->nelem)
2012     {
2013       memcpy (dest->elems + id, src1->elems + i1,
2014 	     (src1->nelem - i1) * sizeof (int));
2015       id += src1->nelem - i1;
2016     }
2017   else if (i2 < src2->nelem)
2018     {
2019       memcpy (dest->elems + id, src2->elems + i2,
2020 	     (src2->nelem - i2) * sizeof (int));
2021       id += src2->nelem - i2;
2022     }
2023   dest->nelem = id;
2024   return REG_NOERROR;
2025 }
2026 
/* Calculate the union of the sets DEST and SRC, and store it in DEST.
   A NULL or empty SRC leaves DEST unchanged.
   Returns REG_ESPACE if DEST's buffer cannot be enlarged, REG_NOERROR
   otherwise.
   The comparisons below walk both sets from their last element
   downwards and treat larger indices as holding larger values, i.e.
   they rely on the element arrays being sorted in ascending order.  */

static reg_errcode_t
internal_function
re_node_set_merge (re_node_set *dest, const re_node_set *src)
{
  int is, id, sbase, delta;
  if (src == NULL || src->nelem == 0)
    return REG_NOERROR;
  /* The algorithm needs room for DEST's items plus twice SRC's: the
     upper part of the buffer is used as a staging area for the SRC
     items that are missing from DEST.  */
  if (dest->alloc < 2 * src->nelem + dest->nelem)
    {
      int new_alloc = 2 * (src->nelem + dest->alloc);
      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
      if (BE (new_buffer == NULL, 0))
	return REG_ESPACE;
      dest->elems = new_buffer;
      dest->alloc = new_alloc;
    }

  /* With an empty DEST the union is simply a copy of SRC.  */
  if (BE (dest->nelem == 0, 0))
    {
      dest->nelem = src->nelem;
      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
      return REG_NOERROR;
    }

  /* Copy into the top of DEST the items of SRC that are not
     found in DEST.  Maybe we could binary search in DEST?
     SBASE starts one past the staging area and moves down as items
     are staged.  */
  for (sbase = dest->nelem + 2 * src->nelem,
       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
    {
      if (dest->elems[id] == src->elems[is])
        is--, id--;
      else if (dest->elems[id] < src->elems[is])
        dest->elems[--sbase] = src->elems[is--];
      else /* if (dest->elems[id] > src->elems[is]) */
        --id;
    }

  if (is >= 0)
    {
      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
      sbase -= is + 1;
      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
    }

  /* ID points at the last original DEST item, IS at the last staged
     item, and DELTA is the number of staged items still to be placed.  */
  id = dest->nelem - 1;
  is = dest->nelem + 2 * src->nelem - 1;
  delta = is - sbase + 1;
  /* Every SRC item was already in DEST: nothing to add.  */
  if (delta == 0)
    return REG_NOERROR;

  /* Now copy.  When DELTA becomes zero, the remaining
     DEST elements are already in place.  */
  dest->nelem += delta;
  for (;;)
    {
      if (dest->elems[is] > dest->elems[id])
        {
	  /* Copy from the top.  */
          dest->elems[id + delta--] = dest->elems[is--];
	  if (delta == 0)
	    break;
	}
      else
        {
          /* Slide from the bottom.  */
          dest->elems[id + delta] = dest->elems[id];
	  if (--id < 0)
	    {
	      /* Copy remaining SRC elements.  */
	      memcpy (dest->elems, dest->elems + sbase,
	              delta * sizeof (int));
	      break;
	    }
	}
    }

  return REG_NOERROR;
}
2108 
2109 /* Insert the new element ELEM to the re_node_set* SET.
2110    SET should not already have ELEM.
2111    return -1 if an error is occured, return 1 otherwise.  */
2112 
2113 static int
2114 internal_function
re_node_set_insert(re_node_set * set,int elem)2115 re_node_set_insert (re_node_set *set, int elem)
2116 {
2117   int idx;
2118   /* In case the set is empty.  */
2119   if (set->alloc == 0)
2120     {
2121       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
2122 	return 1;
2123       else
2124 	return -1;
2125     }
2126 
2127   if (BE (set->nelem, 0) == 0)
2128     {
2129       /* We already guaranteed above that set->alloc != 0.  */
2130       set->elems[0] = elem;
2131       ++set->nelem;
2132       return 1;
2133     }
2134 
2135   /* Realloc if we need.  */
2136   if (set->alloc == set->nelem)
2137     {
2138       int *new_elems;
2139       set->alloc = set->alloc * 2;
2140       new_elems = re_realloc (set->elems, int, set->alloc);
2141       if (BE (new_elems == NULL, 0))
2142 	return -1;
2143       set->elems = new_elems;
2144     }
2145 
2146   /* Move the elements which follows the new element.  Test the
2147      first element separately to skip a check in the inner loop.  */
2148   if (elem < set->elems[0])
2149     {
2150       idx = 0;
2151       for (idx = set->nelem; idx > 0; idx--)
2152         set->elems[idx] = set->elems[idx - 1];
2153     }
2154   else
2155     {
2156       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2157         set->elems[idx] = set->elems[idx - 1];
2158     }
2159 
2160   /* Insert the new element.  */
2161   set->elems[idx] = elem;
2162   ++set->nelem;
2163   return 1;
2164 }
2165 
2166 /* Insert the new element ELEM to the re_node_set* SET.
2167    SET should not already have any element greater than or equal to ELEM.
2168    Return -1 if an error is occured, return 1 otherwise.  */
2169 
2170 static int
2171 internal_function
re_node_set_insert_last(re_node_set * set,int elem)2172 re_node_set_insert_last (re_node_set *set, int elem)
2173 {
2174   /* Realloc if we need.  */
2175   if (set->alloc == set->nelem)
2176     {
2177       int *new_elems;
2178       set->alloc = (set->alloc + 1) * 2;
2179       new_elems = re_realloc (set->elems, int, set->alloc);
2180       if (BE (new_elems == NULL, 0))
2181 	return -1;
2182       set->elems = new_elems;
2183     }
2184 
2185   /* Insert the new element.  */
2186   set->elems[set->nelem++] = elem;
2187   return 1;
2188 }
2189 
2190 /* Compare two node sets SET1 and SET2.
2191    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
2192 
2193 static int
internal_function(pure)2194 internal_function __attribute ((pure))
2195 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
2196 {
2197   int i;
2198   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
2199     return 0;
2200   for (i = set1->nelem ; --i >= 0 ; )
2201     if (set1->elems[i] != set2->elems[i])
2202       return 0;
2203   return 1;
2204 }
2205 
2206 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
2207 
2208 static int
internal_function(pure)2209 internal_function __attribute ((pure))
2210 re_node_set_contains (const re_node_set *set, int elem)
2211 {
2212   unsigned int idx, right, mid;
2213   if (set->nelem <= 0)
2214     return 0;
2215 
2216   /* Binary search the element.  */
2217   idx = 0;
2218   right = set->nelem - 1;
2219   while (idx < right)
2220     {
2221       mid = (idx + right) / 2;
2222       if (set->elems[mid] < elem)
2223 	idx = mid + 1;
2224       else
2225 	right = mid;
2226     }
2227   return set->elems[idx] == elem ? idx + 1 : 0;
2228 }
2229 
2230 static void
2231 internal_function
re_node_set_remove_at(re_node_set * set,int idx)2232 re_node_set_remove_at (re_node_set *set, int idx)
2233 {
2234   if (idx < 0 || idx >= set->nelem)
2235     return;
2236   --set->nelem;
2237   for (; idx < set->nelem; idx++)
2238     set->elems[idx] = set->elems[idx + 1];
2239 }
2240 
2241 
2242 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
2243    Or return -1, if an error will be occured.  */
2244 
2245 static int
2246 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)2247 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
2248 {
2249   int type = token.type;
2250   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
2251     {
2252       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
2253       int *new_nexts, *new_indices;
2254       re_node_set *new_edests, *new_eclosures;
2255       re_token_t *new_nodes;
2256 
2257       /* Avoid overflows.  */
2258       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
2259 	return -1;
2260 
2261       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
2262       if (BE (new_nodes == NULL, 0))
2263 	return -1;
2264       dfa->nodes = new_nodes;
2265       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
2266       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
2267       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
2268       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
2269       if (BE (new_nexts == NULL || new_indices == NULL
2270 	      || new_edests == NULL || new_eclosures == NULL, 0))
2271 	return -1;
2272       dfa->nexts = new_nexts;
2273       dfa->org_indices = new_indices;
2274       dfa->edests = new_edests;
2275       dfa->eclosures = new_eclosures;
2276       dfa->nodes_alloc = new_nodes_alloc;
2277     }
2278   dfa->nodes[dfa->nodes_len] = token;
2279   dfa->nodes[dfa->nodes_len].constraint = 0;
2280 #ifdef RE_ENABLE_I18N
2281   dfa->nodes[dfa->nodes_len].accept_mb =
2282     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
2283 #endif
2284   dfa->nexts[dfa->nodes_len] = -1;
2285   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
2286   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
2287   return dfa->nodes_len++;
2288 }
2289 
2290 static inline unsigned int
2291 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)2292 calc_state_hash (const re_node_set *nodes, unsigned int context)
2293 {
2294   unsigned int hash = nodes->nelem + context;
2295   int i;
2296   for (i = 0 ; i < nodes->nelem ; i++)
2297     hash += nodes->elems[i];
2298   return hash;
2299 }
2300 
2301 /* Search for the state whose node_set is equivalent to NODES.
2302    Return the pointer to the state, if we found it in the DFA.
2303    Otherwise create the new one and return it.  In case of an error
2304    return NULL and set the error code in ERR.
2305    Note: - We assume NULL as the invalid state, then it is possible that
2306 	   return value is NULL and ERR is REG_NOERROR.
2307 	 - We never return non-NULL value in case of any errors, it is for
2308 	   optimization.  */
2309 
2310 static re_dfastate_t *
2311 internal_function
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)2312 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
2313 		  const re_node_set *nodes)
2314 {
2315   unsigned int hash;
2316   re_dfastate_t *new_state;
2317   struct re_state_table_entry *spot;
2318   int i;
2319   if (BE (nodes->nelem == 0, 0))
2320     {
2321       *err = REG_NOERROR;
2322       return NULL;
2323     }
2324   hash = calc_state_hash (nodes, 0);
2325   spot = dfa->state_table + (hash & dfa->state_hash_mask);
2326 
2327   for (i = 0 ; i < spot->num ; i++)
2328     {
2329       re_dfastate_t *state = spot->array[i];
2330       if (hash != state->hash)
2331 	continue;
2332       if (re_node_set_compare (&state->nodes, nodes))
2333 	return state;
2334     }
2335 
2336   /* There are no appropriate state in the dfa, create the new one.  */
2337   new_state = create_ci_newstate (dfa, nodes, hash);
2338   if (BE (new_state == NULL, 0))
2339     *err = REG_ESPACE;
2340 
2341   return new_state;
2342 }
2343 
2344 /* Search for the state whose node_set is equivalent to NODES and
2345    whose context is equivalent to CONTEXT.
2346    Return the pointer to the state, if we found it in the DFA.
2347    Otherwise create the new one and return it.  In case of an error
2348    return NULL and set the error code in ERR.
2349    Note: - We assume NULL as the invalid state, then it is possible that
2350 	   return value is NULL and ERR is REG_NOERROR.
2351 	 - We never return non-NULL value in case of any errors, it is for
2352 	   optimization.  */
2353 
2354 static re_dfastate_t *
2355 internal_function
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)2356 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
2357 			  const re_node_set *nodes, unsigned int context)
2358 {
2359   unsigned int hash;
2360   re_dfastate_t *new_state;
2361   struct re_state_table_entry *spot;
2362   int i;
2363   if (nodes->nelem == 0)
2364     {
2365       *err = REG_NOERROR;
2366       return NULL;
2367     }
2368   hash = calc_state_hash (nodes, context);
2369   spot = dfa->state_table + (hash & dfa->state_hash_mask);
2370 
2371   for (i = 0 ; i < spot->num ; i++)
2372     {
2373       re_dfastate_t *state = spot->array[i];
2374       if (state->hash == hash
2375 	  && state->context == context
2376 	  && re_node_set_compare (state->entrance_nodes, nodes))
2377 	return state;
2378     }
2379   /* There are no appropriate state in `dfa', create the new one.  */
2380   new_state = create_cd_newstate (dfa, nodes, context, hash);
2381   if (BE (new_state == NULL, 0))
2382     *err = REG_ESPACE;
2383 
2384   return new_state;
2385 }
2386 
2387 /* Finish initialization of the new state NEWSTATE, and using its hash value
2388    HASH put in the appropriate bucket of DFA's state table.  Return value
2389    indicates the error code if failed.  */
2390 
2391 static reg_errcode_t
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,unsigned int hash)2392 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
2393 		unsigned int hash)
2394 {
2395   struct re_state_table_entry *spot;
2396   reg_errcode_t err;
2397   int i;
2398 
2399   newstate->hash = hash;
2400   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
2401   if (BE (err != REG_NOERROR, 0))
2402     return REG_ESPACE;
2403   for (i = 0; i < newstate->nodes.nelem; i++)
2404     {
2405       int elem = newstate->nodes.elems[i];
2406       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
2407         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
2408     }
2409 
2410   spot = dfa->state_table + (hash & dfa->state_hash_mask);
2411   if (BE (spot->alloc <= spot->num, 0))
2412     {
2413       int new_alloc = 2 * spot->num + 2;
2414       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
2415 					      new_alloc);
2416       if (BE (new_array == NULL, 0))
2417 	return REG_ESPACE;
2418       spot->array = new_array;
2419       spot->alloc = new_alloc;
2420     }
2421   spot->array[spot->num++] = newstate;
2422   return REG_NOERROR;
2423 }
2424 
2425 static void
free_state(re_dfastate_t * state)2426 free_state (re_dfastate_t *state)
2427 {
2428   re_node_set_free (&state->non_eps_nodes);
2429   re_node_set_free (&state->inveclosure);
2430   if (state->entrance_nodes != &state->nodes)
2431     {
2432       re_node_set_free (state->entrance_nodes);
2433       re_free (state->entrance_nodes);
2434     }
2435   re_node_set_free (&state->nodes);
2436   re_free (state->word_trtable);
2437   re_free (state->trtable);
2438   re_free (state);
2439 }
2440 
2441 /* Create the new state which is independ of contexts.
2442    Return the new state if succeeded, otherwise return NULL.  */
2443 
2444 static re_dfastate_t *
2445 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int hash)2446 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2447 		    unsigned int hash)
2448 {
2449   int i;
2450   reg_errcode_t err;
2451   re_dfastate_t *newstate;
2452 
2453   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2454   if (BE (newstate == NULL, 0))
2455     return NULL;
2456   err = re_node_set_init_copy (&newstate->nodes, nodes);
2457   if (BE (err != REG_NOERROR, 0))
2458     {
2459       re_free (newstate);
2460       return NULL;
2461     }
2462 
2463   newstate->entrance_nodes = &newstate->nodes;
2464   for (i = 0 ; i < nodes->nelem ; i++)
2465     {
2466       re_token_t *node = dfa->nodes + nodes->elems[i];
2467       re_token_type_t type = node->type;
2468       if (type == CHARACTER && !node->constraint)
2469 	continue;
2470 #ifdef RE_ENABLE_I18N
2471       newstate->accept_mb |= node->accept_mb;
2472 #endif /* RE_ENABLE_I18N */
2473 
2474       /* If the state has the halt node, the state is a halt state.  */
2475       if (type == END_OF_RE)
2476 	newstate->halt = 1;
2477       else if (type == OP_BACK_REF)
2478 	newstate->has_backref = 1;
2479       else if (type == ANCHOR || node->constraint)
2480 	newstate->has_constraint = 1;
2481     }
2482   err = register_state (dfa, newstate, hash);
2483   if (BE (err != REG_NOERROR, 0))
2484     {
2485       free_state (newstate);
2486       newstate = NULL;
2487     }
2488   return newstate;
2489 }
2490 
2491 /* Create the new state which is depend on the context CONTEXT.
2492    Return the new state if succeeded, otherwise return NULL.  */
2493 
2494 static re_dfastate_t *
2495 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,unsigned int hash)2496 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2497 		    unsigned int context, unsigned int hash)
2498 {
2499   int i, nctx_nodes = 0;
2500   reg_errcode_t err;
2501   re_dfastate_t *newstate;
2502 
2503   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2504   if (BE (newstate == NULL, 0))
2505     return NULL;
2506   err = re_node_set_init_copy (&newstate->nodes, nodes);
2507   if (BE (err != REG_NOERROR, 0))
2508     {
2509       re_free (newstate);
2510       return NULL;
2511     }
2512 
2513   newstate->context = context;
2514   newstate->entrance_nodes = &newstate->nodes;
2515 
2516   for (i = 0 ; i < nodes->nelem ; i++)
2517     {
2518       unsigned int constraint = 0;
2519       re_token_t *node = dfa->nodes + nodes->elems[i];
2520       re_token_type_t type = node->type;
2521       if (node->constraint)
2522 	constraint = node->constraint;
2523 
2524       if (type == CHARACTER && !constraint)
2525 	continue;
2526 #ifdef RE_ENABLE_I18N
2527       newstate->accept_mb |= node->accept_mb;
2528 #endif /* RE_ENABLE_I18N */
2529 
2530       /* If the state has the halt node, the state is a halt state.  */
2531       if (type == END_OF_RE)
2532 	newstate->halt = 1;
2533       else if (type == OP_BACK_REF)
2534 	newstate->has_backref = 1;
2535       else if (type == ANCHOR)
2536 	constraint = node->opr.ctx_type;
2537 
2538       if (constraint)
2539 	{
2540 	  if (newstate->entrance_nodes == &newstate->nodes)
2541 	    {
2542 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
2543 	      if (BE (newstate->entrance_nodes == NULL, 0))
2544 		{
2545 		  free_state (newstate);
2546 		  return NULL;
2547 		}
2548 	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
2549 	      nctx_nodes = 0;
2550 	      newstate->has_constraint = 1;
2551 	    }
2552 
2553 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
2554 	    {
2555 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
2556 	      ++nctx_nodes;
2557 	    }
2558 	}
2559     }
2560   err = register_state (dfa, newstate, hash);
2561   if (BE (err != REG_NOERROR, 0))
2562     {
2563       free_state (newstate);
2564       newstate = NULL;
2565     }
2566   return  newstate;
2567 }
2568 
2569 /******************************************************************************/
2570 /******************************************************************************/
2571 /******************************************************************************/
2572 /* GKINCLUDE #include "regcomp.c" */
2573 /******************************************************************************/
2574 /******************************************************************************/
2575 /******************************************************************************/
2576 /* Extended regular expression matching and search library.
2577    Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
2578    This file is part of the GNU C Library.
2579    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
2580 
2581    The GNU C Library is free software; you can redistribute it and/or
2582    modify it under the terms of the GNU Lesser General Public
2583    License as published by the Free Software Foundation; either
2584    version 2.1 of the License, or (at your option) any later version.
2585 
2586    The GNU C Library is distributed in the hope that it will be useful,
2587    but WITHOUT ANY WARRANTY; without even the implied warranty of
2588    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2589    Lesser General Public License for more details.
2590 
2591    You should have received a copy of the GNU Lesser General Public
2592    License along with the GNU C Library; if not, write to the Free
2593    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
2594    02111-1307 USA.  */
2595 
/* Forward declarations of the file-local (static) helpers used by the
   regular expression compiler.  Their definitions are not part of this
   section; presumably they follow later in the file.  The declarations
   guarded by RE_ENABLE_I18N take extra multibyte-charset arguments.  */

static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
					  size_t length, reg_syntax_t syntax);
static void re_compile_fastmap_iter (regex_t *bufp,
				     const re_dfastate_t *init_state,
				     char *fastmap);
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
#ifdef RE_ENABLE_I18N
static void free_charset (re_charset_t *cset);
#endif /* RE_ENABLE_I18N */
static void free_workarea_compile (regex_t *preg);
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static void optimize_utf8 (re_dfa_t *dfa);
#endif
static reg_errcode_t analyze (regex_t *preg);
static reg_errcode_t preorder (bin_tree_t *root,
			       reg_errcode_t (fn (void *, bin_tree_t *)),
			       void *extra);
static reg_errcode_t postorder (bin_tree_t *root,
				reg_errcode_t (fn (void *, bin_tree_t *)),
				void *extra);
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
				 bin_tree_t *node);
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
				   unsigned int constraint);
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
					 int node, int root);
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
static int fetch_number (re_string_t *input, re_token_t *token,
			 reg_syntax_t syntax);
static int peek_token (re_token_t *token, re_string_t *input,
			reg_syntax_t syntax) internal_function;
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
			  reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
				  re_token_t *token, reg_syntax_t syntax,
				  int nest, reg_errcode_t *err);
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
				 re_token_t *token, reg_syntax_t syntax,
				 int nest, reg_errcode_t *err);
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
				     re_token_t *token, reg_syntax_t syntax,
				     int nest, reg_errcode_t *err);
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
				  re_token_t *token, reg_syntax_t syntax,
				  int nest, reg_errcode_t *err);
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
				 re_dfa_t *dfa, re_token_t *token,
				 reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
				      re_token_t *token, reg_syntax_t syntax,
				      reg_errcode_t *err);
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
					    re_string_t *regexp,
					    re_token_t *token, int token_len,
					    re_dfa_t *dfa,
					    reg_syntax_t syntax,
					    int accept_hyphen);
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
					  re_string_t *regexp,
					  re_token_t *token);
#ifdef RE_ENABLE_I18N
static reg_errcode_t build_equiv_class (bitset_t sbcset,
					re_charset_t *mbcset,
					int *equiv_class_alloc,
					const unsigned char *name);
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				      bitset_t sbcset,
				      re_charset_t *mbcset,
				      int *char_class_alloc,
				      const unsigned char *class_name,
				      reg_syntax_t syntax);
#else  /* not RE_ENABLE_I18N */
static reg_errcode_t build_equiv_class (bitset_t sbcset,
					const unsigned char *name);
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				      bitset_t sbcset,
				      const unsigned char *class_name,
				      reg_syntax_t syntax);
#endif /* not RE_ENABLE_I18N */
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
				       RE_TRANSLATE_TYPE trans,
				       const unsigned char *class_name,
				       const unsigned char *extra,
				       int non_match, reg_errcode_t *err);
static bin_tree_t *create_tree (re_dfa_t *dfa,
				bin_tree_t *left, bin_tree_t *right,
				re_token_type_t type);
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
				      bin_tree_t *left, bin_tree_t *right,
				      const re_token_t *token);
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
static void free_token (re_token_t *node);
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
2698 
/* This table gives an error message for each of the error codes listed
   in regex.h.  Obviously the order here has to be same as there.
   POSIX doesn't require that we do anything for REG_NOERROR,
   but why not be nice?

   All messages are stored back to back in one char array: adjacent
   string literals are concatenated, and the explicit "\0" literals
   terminate each message but the last (which gets the array's own
   terminator).  Each *_IDX macro is the byte offset of its message
   inside the array, computed as the previous offset plus the sizeof
   the previous message (which counts its terminating NUL).  The text
   inside every sizeof must therefore stay identical to the message
   it measures.  */

const char __re_error_msgid[] attribute_hidden =
  {
#define REG_NOERROR_IDX	0
    gettext_noop ("Success")	/* REG_NOERROR */
    "\0"
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
    gettext_noop ("No match")	/* REG_NOMATCH */
    "\0"
#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
    "\0"
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
    "\0"
#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
    "\0"
#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
    "\0"
#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
    "\0"
#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
    "\0"
#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
    "\0"
#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
    "\0"
#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
    "\0"
#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
    gettext_noop ("Invalid range end")	/* REG_ERANGE */
    "\0"
#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
    "\0"
#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
    "\0"
#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
    "\0"
#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
    "\0"
#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  };
2757 
2758 const size_t __re_error_msgid_idx[] attribute_hidden =
2759   {
2760     REG_NOERROR_IDX,
2761     REG_NOMATCH_IDX,
2762     REG_BADPAT_IDX,
2763     REG_ECOLLATE_IDX,
2764     REG_ECTYPE_IDX,
2765     REG_EESCAPE_IDX,
2766     REG_ESUBREG_IDX,
2767     REG_EBRACK_IDX,
2768     REG_EPAREN_IDX,
2769     REG_EBRACE_IDX,
2770     REG_BADBR_IDX,
2771     REG_ERANGE_IDX,
2772     REG_ESPACE_IDX,
2773     REG_BADRPT_IDX,
2774     REG_EEND_IDX,
2775     REG_ESIZE_IDX,
2776     REG_ERPAREN_IDX
2777   };
2778 
2779 /* Entry points for GNU code.  */
2780 
2781 /* re_compile_pattern is the GNU regular expression compiler: it
2782    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
2783    Returns 0 if the pattern was valid, otherwise an error string.
2784 
2785    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
2786    are set in BUFP on entry.  */
2787 
2788 const char *
re_compile_pattern(pattern,length,bufp)2789 re_compile_pattern (pattern, length, bufp)
2790     const char *pattern;
2791     size_t length;
2792     struct re_pattern_buffer *bufp;
2793 {
2794   reg_errcode_t ret;
2795 
2796   /* And GNU code determines whether or not to get register information
2797      by passing null for the REGS argument to re_match, etc., not by
2798      setting no_sub, unless RE_NO_SUB is set.  */
2799   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
2800 
2801   /* Match anchors at newline.  */
2802   bufp->newline_anchor = 1;
2803 
2804   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
2805 
2806   if (!ret)
2807     return NULL;
2808   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
2809 }
2810 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)2811 weak_alias (__re_compile_pattern, re_compile_pattern)
2812 #endif
2813 
2814 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
2815    also be assigned to arbitrarily: each pattern buffer stores its own
2816    syntax, so it can be changed between regex compilations.  */
2817 /* This has no initializer because initialized variables in Emacs
2818    become read-only after dumping.  */
2819 reg_syntax_t re_syntax_options;
2820 
2821 
2822 /* Specify the precise syntax of regexps for compilation.  This provides
2823    for compatibility for various utilities which historically have
2824    different, incompatible syntaxes.
2825 
2826    The argument SYNTAX is a bit mask comprised of the various bits
2827    defined in regex.h.  We return the old syntax.  */
2828 
2829 reg_syntax_t
2830 re_set_syntax (syntax)
2831     reg_syntax_t syntax;
2832 {
2833   reg_syntax_t ret = re_syntax_options;
2834 
2835   re_syntax_options = syntax;
2836   return ret;
2837 }
2838 #ifdef _LIBC
2839 weak_alias (__re_set_syntax, re_set_syntax)
2840 #endif
2841 
2842 int
2843 re_compile_fastmap (bufp)
2844     struct re_pattern_buffer *bufp;
2845 {
2846   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2847   char *fastmap = bufp->fastmap;
2848 
2849   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
2850   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
2851   if (dfa->init_state != dfa->init_state_word)
2852     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
2853   if (dfa->init_state != dfa->init_state_nl)
2854     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
2855   if (dfa->init_state != dfa->init_state_begbuf)
2856     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
2857   bufp->fastmap_accurate = 1;
2858   return 0;
2859 }
2860 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)2861 weak_alias (__re_compile_fastmap, re_compile_fastmap)
2862 #endif
2863 
/* Record in FASTMAP that byte CH may start a match.  When ICASE is
   nonzero the entry for tolower (CH) is set as well.  */
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
  fastmap[ch] = 1;
  if (icase != 0)
    {
      const int folded = tolower (ch);
      fastmap[folded] = 1;
    }
}
2872 
2873 /* Helper function for re_compile_fastmap.
2874    Compile fastmap for the initial_state INIT_STATE.  */
2875 
2876 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)2877 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
2878 			 char *fastmap)
2879 {
2880   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2881   int node_cnt;
2882   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
2883   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
2884     {
2885       int node = init_state->nodes.elems[node_cnt];
2886       re_token_type_t type = dfa->nodes[node].type;
2887 
2888       if (type == CHARACTER)
2889 	{
2890 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
2891 #ifdef RE_ENABLE_I18N
2892 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2893 	    {
2894 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
2895 	      wchar_t wc;
2896 	      mbstate_t state;
2897 
2898 	      p = buf;
2899 	      *p++ = dfa->nodes[node].opr.c;
2900 	      while (++node < dfa->nodes_len
2901 		     &&	dfa->nodes[node].type == CHARACTER
2902 		     && dfa->nodes[node].mb_partial)
2903 		*p++ = dfa->nodes[node].opr.c;
2904 	      memset (&state, '\0', sizeof (state));
2905 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
2906 			   &state) == p - buf
2907 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
2908 		      != (size_t) -1))
2909 		re_set_fastmap (fastmap, 0, buf[0]);
2910 	    }
2911 #endif
2912 	}
2913       else if (type == SIMPLE_BRACKET)
2914 	{
2915 	  int i, ch;
2916 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
2917 	    {
2918 	      int j;
2919 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
2920 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
2921 		if (w & ((bitset_word_t) 1 << j))
2922 		  re_set_fastmap (fastmap, icase, ch);
2923 	    }
2924 	}
2925 #ifdef RE_ENABLE_I18N
2926       else if (type == COMPLEX_BRACKET)
2927 	{
2928 	  int i;
2929 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
2930 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
2931 	      || cset->nranges || cset->nchar_classes)
2932 	    {
2933 # ifdef _LIBC
2934 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
2935 		{
2936 		  /* In this case we want to catch the bytes which are
2937 		     the first byte of any collation elements.
2938 		     e.g. In da_DK, we want to catch 'a' since "aa"
2939 			  is a valid collation element, and don't catch
2940 			  'b' since 'b' is the only collation element
2941 			  which starts from 'b'.  */
2942 		  const int32_t *table = (const int32_t *)
2943 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2944 		  for (i = 0; i < SBC_MAX; ++i)
2945 		    if (table[i] < 0)
2946 		      re_set_fastmap (fastmap, icase, i);
2947 		}
2948 # else
2949 	      if (dfa->mb_cur_max > 1)
2950 		for (i = 0; i < SBC_MAX; ++i)
2951 		  if (__btowc (i) == WEOF)
2952 		    re_set_fastmap (fastmap, icase, i);
2953 # endif /* not _LIBC */
2954 	    }
2955 	  for (i = 0; i < cset->nmbchars; ++i)
2956 	    {
2957 	      char buf[256];
2958 	      mbstate_t state;
2959 	      memset (&state, '\0', sizeof (state));
2960 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
2961 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
2962 	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2963 		{
2964 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
2965 		      != (size_t) -1)
2966 		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
2967 		}
2968 	    }
2969 	}
2970 #endif /* RE_ENABLE_I18N */
2971       else if (type == OP_PERIOD
2972 #ifdef RE_ENABLE_I18N
2973 	       || type == OP_UTF8_PERIOD
2974 #endif /* RE_ENABLE_I18N */
2975 	       || type == END_OF_RE)
2976 	{
2977 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
2978 	  if (type == END_OF_RE)
2979 	    bufp->can_be_null = 1;
2980 	  return;
2981 	}
2982     }
2983 }
2984 
2985 /* Entry point for POSIX code.  */
2986 /* regcomp takes a regular expression as a string and compiles it.
2987 
2988    PREG is a regex_t *.  We do not expect any fields to be initialized,
2989    since POSIX says we shouldn't.  Thus, we set
2990 
2991      `buffer' to the compiled pattern;
2992      `used' to the length of the compiled pattern;
2993      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
2994        REG_EXTENDED bit in CFLAGS is set; otherwise, to
2995        RE_SYNTAX_POSIX_BASIC;
2996      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
2997      `fastmap' to an allocated space for the fastmap;
2998      `fastmap_accurate' to zero;
2999      `re_nsub' to the number of subexpressions in PATTERN.
3000 
3001    PATTERN is the address of the pattern string.
3002 
3003    CFLAGS is a series of bits which affect compilation.
3004 
3005      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
3006      use POSIX basic syntax.
3007 
3008      If REG_NEWLINE is set, then . and [^...] don't match newline.
3009      Also, regexec will try a match beginning after every newline.
3010 
3011      If REG_ICASE is set, then we considers upper- and lowercase
3012      versions of letters to be equivalent when matching.
3013 
3014      If REG_NOSUB is set, then when PREG is passed to regexec, that
3015      routine will report only success or failure, and nothing about the
3016      registers.
3017 
3018    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
3019    the return codes and their meanings.)  */
3020 
3021 int
regcomp(preg,pattern,cflags)3022 regcomp (preg, pattern, cflags)
3023     regex_t *__restrict preg;
3024     const char *__restrict pattern;
3025     int cflags;
3026 {
3027   reg_errcode_t ret;
3028   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
3029 			 : RE_SYNTAX_POSIX_BASIC);
3030 
3031   preg->buffer = NULL;
3032   preg->allocated = 0;
3033   preg->used = 0;
3034 
3035   /* Try to allocate space for the fastmap.  */
3036   preg->fastmap = re_malloc (char, SBC_MAX);
3037   if (BE (preg->fastmap == NULL, 0))
3038     return REG_ESPACE;
3039 
3040   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
3041 
3042   /* If REG_NEWLINE is set, newlines are treated differently.  */
3043   if (cflags & REG_NEWLINE)
3044     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
3045       syntax &= ~RE_DOT_NEWLINE;
3046       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
3047       /* It also changes the matching behavior.  */
3048       preg->newline_anchor = 1;
3049     }
3050   else
3051     preg->newline_anchor = 0;
3052   preg->no_sub = !!(cflags & REG_NOSUB);
3053   preg->translate = NULL;
3054 
3055   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
3056 
3057   /* POSIX doesn't distinguish between an unmatched open-group and an
3058      unmatched close-group: both are REG_EPAREN.  */
3059   if (ret == REG_ERPAREN)
3060     ret = REG_EPAREN;
3061 
3062   /* We have already checked preg->fastmap != NULL.  */
3063   if (BE (ret == REG_NOERROR, 1))
3064     /* Compute the fastmap now, since regexec cannot modify the pattern
3065        buffer.  This function never fails in this implementation.  */
3066     (void) re_compile_fastmap (preg);
3067   else
3068     {
3069       /* Some error occurred while compiling the expression.  */
3070       re_free (preg->fastmap);
3071       preg->fastmap = NULL;
3072     }
3073 
3074   return (int) ret;
3075 }
3076 #ifdef _LIBC
weak_alias(__regcomp,regcomp)3077 weak_alias (__regcomp, regcomp)
3078 #endif
3079 
3080 /* Returns a message corresponding to an error code, ERRCODE, returned
3081    from either regcomp or regexec.   We don't use PREG here.  */
3082 
3083 /* regerror ( int errcode, preg, errbuf, errbuf_size) */
3084 size_t
3085 regerror (
3086     int errcode,
3087     const regex_t *__restrict preg,
3088     char *__restrict errbuf,
3089     size_t errbuf_size)
3090 {
3091   const char *msg;
3092   size_t msg_size;
3093 
3094   if (BE (errcode < 0
3095 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
3096 			       / sizeof (__re_error_msgid_idx[0])), 0))
3097     /* Only error codes returned by the rest of the code should be passed
3098        to this routine.  If we are given anything else, or if other regex
3099        code generates an invalid error code, then the program has a bug.
3100        Dump core so we can fix it.  */
3101     abort ();
3102 
3103   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3104 
3105   msg_size = strlen (msg) + 1; /* Includes the null.  */
3106 
3107   if (BE (errbuf_size != 0, 1))
3108     {
3109       if (BE (msg_size > errbuf_size, 0))
3110 	{
3111 #if defined HAVE_MEMPCPY || defined _LIBC
3112 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
3113 #else
3114 	  memcpy (errbuf, msg, errbuf_size - 1);
3115 	  errbuf[errbuf_size - 1] = 0;
3116 #endif
3117 	}
3118       else
3119 	memcpy (errbuf, msg, msg_size);
3120     }
3121 
3122   return msg_size;
3123 }
3124 #ifdef _LIBC
3125 weak_alias (__regerror, regerror)
3126 #endif
3127 
3128 
#ifdef RE_ENABLE_I18N
/* Map of the single-byte characters for UTF-8, shared by every DFA
   compiled in such a locale (init_dfa points dfa->sb_char at it and
   free_dfa_content takes care not to free it).  Keeping it static
   avoids allocating and filling an identical bitset for each pattern;
   UTF-8 is the preferred encoding so this is a worthwhile
   optimization.  */
static const bitset_t utf8_sb_map =
{
  /* Set the first 128 bits, i.e. every word below bit 0x80.  */
  [0 ... (0x80 / BITSET_WORD_BITS) - 1] = BITSET_WORD_MAX
};
#endif
3140 
3141 
3142 static void
free_dfa_content(re_dfa_t * dfa)3143 free_dfa_content (re_dfa_t *dfa)
3144 {
3145   int i, j;
3146 
3147   if (dfa->nodes)
3148     for (i = 0; i < dfa->nodes_len; ++i)
3149       free_token (dfa->nodes + i);
3150   re_free (dfa->nexts);
3151   for (i = 0; i < dfa->nodes_len; ++i)
3152     {
3153       if (dfa->eclosures != NULL)
3154 	re_node_set_free (dfa->eclosures + i);
3155       if (dfa->inveclosures != NULL)
3156 	re_node_set_free (dfa->inveclosures + i);
3157       if (dfa->edests != NULL)
3158 	re_node_set_free (dfa->edests + i);
3159     }
3160   re_free (dfa->edests);
3161   re_free (dfa->eclosures);
3162   re_free (dfa->inveclosures);
3163   re_free (dfa->nodes);
3164 
3165   if (dfa->state_table)
3166     for (i = 0; i <= dfa->state_hash_mask; ++i)
3167       {
3168 	struct re_state_table_entry *entry = dfa->state_table + i;
3169 	for (j = 0; j < entry->num; ++j)
3170 	  {
3171 	    re_dfastate_t *state = entry->array[j];
3172 	    free_state (state);
3173 	  }
3174         re_free (entry->array);
3175       }
3176   re_free (dfa->state_table);
3177 #ifdef RE_ENABLE_I18N
3178   if (dfa->sb_char != utf8_sb_map)
3179     re_free (dfa->sb_char);
3180 #endif
3181   re_free (dfa->subexp_map);
3182 #ifdef DEBUG
3183   re_free (dfa->re_str);
3184 #endif
3185 
3186   re_free (dfa);
3187 }
3188 
3189 
3190 /* Free dynamically allocated space used by PREG.  */
3191 
3192 void
regfree(preg)3193 regfree (preg)
3194     regex_t *preg;
3195 {
3196   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3197   if (BE (dfa != NULL, 1))
3198     free_dfa_content (dfa);
3199   preg->buffer = NULL;
3200   preg->allocated = 0;
3201 
3202   re_free (preg->fastmap);
3203   preg->fastmap = NULL;
3204 
3205   re_free (preg->translate);
3206   preg->translate = NULL;
3207 }
3208 #ifdef _LIBC
3209 weak_alias (__regfree, regfree)
3210 #endif
3211 
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC

/* BSD has one and only one pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the single BSD pattern buffer, using the global
   `re_syntax_options'.

   If S is NULL nothing is compiled: the result is NULL when a pattern
   has been compiled before, and an error message otherwise.  If S is
   non-NULL, any previously compiled pattern is released first (its
   fastmap buffer is kept for reuse).

   Returns NULL on success, otherwise an error message string.  */
char *
# ifdef _LIBC
/* Make these definitions weak in libc, so POSIX programs can redefine
   these names if they don't use our functions, and still use
   regcomp/regexec above without link errors.  */
weak_function
# endif
re_comp (const char *s)
{
  reg_errcode_t ret;
  char *fastmap;

  if (!s)
    {
      if (!re_comp_buf.buffer)
	return gettext ("No previous regular expression");
      return NULL;
    }

  if (re_comp_buf.buffer)
    {
      /* Detach the fastmap so that __regfree does not release it, then
	 reset the buffer and put the fastmap back.  */
      fastmap = re_comp_buf.fastmap;
      re_comp_buf.fastmap = NULL;
      __regfree (&re_comp_buf);
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
      re_comp_buf.fastmap = fastmap;
    }

  if (re_comp_buf.fastmap == NULL)
    {
      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
      if (re_comp_buf.fastmap == NULL)
	return (char *) gettext (__re_error_msgid
				 + __re_error_msgid_idx[(int) REG_ESPACE]);
    }

  /* Since `re_exec' always passes NULL for the `regs' argument, we
     don't need to initialize the pattern buffer fields which affect it.  */

  /* Match anchors at newlines.  */
  re_comp_buf.newline_anchor = 1;

  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);

  if (!ret)
    return NULL;

  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}

#ifdef _LIBC
/* Release the BSD pattern buffer when libc frees its resources.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif

#endif /* _REGEX_RE_COMP */
3280 
3281 /* Internal entry point.
3282    Compile the regular expression PATTERN, whose length is LENGTH.
3283    SYNTAX indicate regular expression's syntax.  */
3284 
3285 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)3286 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
3287 		     reg_syntax_t syntax)
3288 {
3289   reg_errcode_t err = REG_NOERROR;
3290   re_dfa_t *dfa;
3291   re_string_t regexp;
3292 
3293   /* Initialize the pattern buffer.  */
3294   preg->fastmap_accurate = 0;
3295   preg->syntax = syntax;
3296   preg->not_bol = preg->not_eol = 0;
3297   preg->used = 0;
3298   preg->re_nsub = 0;
3299   preg->can_be_null = 0;
3300   preg->regs_allocated = REGS_UNALLOCATED;
3301 
3302   /* Initialize the dfa.  */
3303   dfa = (re_dfa_t *) preg->buffer;
3304   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3305     {
3306       /* If zero allocated, but buffer is non-null, try to realloc
3307 	 enough space.  This loses if buffer's address is bogus, but
3308 	 that is the user's responsibility.  If ->buffer is NULL this
3309 	 is a simple allocation.  */
3310       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
3311       if (dfa == NULL)
3312 	return REG_ESPACE;
3313       preg->allocated = sizeof (re_dfa_t);
3314       preg->buffer = (unsigned char *) dfa;
3315     }
3316   preg->used = sizeof (re_dfa_t);
3317 
3318   err = init_dfa (dfa, length);
3319   if (BE (err != REG_NOERROR, 0))
3320     {
3321       free_dfa_content (dfa);
3322       preg->buffer = NULL;
3323       preg->allocated = 0;
3324       return err;
3325     }
3326 #ifdef DEBUG
3327   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
3328   dfa->re_str = re_malloc (char, length + 1);
3329   strncpy (dfa->re_str, pattern, length + 1);
3330 #endif
3331 
3332   __libc_lock_init (dfa->lock);
3333 
3334   err = re_string_construct (&regexp, pattern, length, preg->translate,
3335 			     syntax & RE_ICASE, dfa);
3336   if (BE (err != REG_NOERROR, 0))
3337     {
3338     re_compile_internal_free_return:
3339       free_workarea_compile (preg);
3340       re_string_destruct (&regexp);
3341       free_dfa_content (dfa);
3342       preg->buffer = NULL;
3343       preg->allocated = 0;
3344       return err;
3345     }
3346 
3347   /* Parse the regular expression, and build a structure tree.  */
3348   preg->re_nsub = 0;
3349   dfa->str_tree = parse (&regexp, preg, syntax, &err);
3350   if (BE (dfa->str_tree == NULL, 0))
3351     goto re_compile_internal_free_return;
3352 
3353   /* Analyze the tree and create the nfa.  */
3354   err = analyze (preg);
3355   if (BE (err != REG_NOERROR, 0))
3356     goto re_compile_internal_free_return;
3357 
3358 #ifdef RE_ENABLE_I18N
3359   /* If possible, do searching in single byte encoding to speed things up.  */
3360   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
3361     optimize_utf8 (dfa);
3362 #endif
3363 
3364   /* Then create the initial state of the dfa.  */
3365   err = create_initial_state (dfa);
3366 
3367   /* Release work areas.  */
3368   free_workarea_compile (preg);
3369   re_string_destruct (&regexp);
3370 
3371   if (BE (err != REG_NOERROR, 0))
3372     {
3373       free_dfa_content (dfa);
3374       preg->buffer = NULL;
3375       preg->allocated = 0;
3376     }
3377 
3378   return err;
3379 }
3380 
3381 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
3382    as the initial length of some arrays.  */
3383 
3384 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)3385 init_dfa (re_dfa_t *dfa, size_t pat_len)
3386 {
3387   unsigned int table_size;
3388 #ifndef _LIBC
3389   char *codeset_name;
3390 #endif
3391 
3392   memset (dfa, '\0', sizeof (re_dfa_t));
3393 
3394   /* Force allocation of str_tree_storage the first time.  */
3395   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3396 
3397   /* Avoid overflows.  */
3398   if (pat_len == SIZE_MAX)
3399     return REG_ESPACE;
3400 
3401   dfa->nodes_alloc = pat_len + 1;
3402   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
3403 
3404   /*  table_size = 2 ^ ceil(log pat_len) */
3405   for (table_size = 1; ; table_size <<= 1)
3406     if (table_size > pat_len)
3407       break;
3408 
3409   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
3410   dfa->state_hash_mask = table_size - 1;
3411 
3412   dfa->mb_cur_max = MB_CUR_MAX;
3413 #ifdef _LIBC
3414   if (dfa->mb_cur_max == 6
3415       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
3416     dfa->is_utf8 = 1;
3417   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
3418 		       != 0);
3419 #else
3420 # ifdef HAVE_LANGINFO_CODESET
3421   codeset_name = nl_langinfo (CODESET);
3422 # else
3423   codeset_name = getenv ("LC_ALL");
3424   if (codeset_name == NULL || codeset_name[0] == '\0')
3425     codeset_name = getenv ("LC_CTYPE");
3426   if (codeset_name == NULL || codeset_name[0] == '\0')
3427     codeset_name = getenv ("LANG");
3428   if (codeset_name == NULL)
3429     codeset_name = "";
3430   else if (strchr (codeset_name, '.') !=  NULL)
3431     codeset_name = strchr (codeset_name, '.') + 1;
3432 # endif
3433 
3434   if (strcasecmp (codeset_name, "UTF-8") == 0
3435       || strcasecmp (codeset_name, "UTF8") == 0)
3436     dfa->is_utf8 = 1;
3437 
3438   /* We check exhaustively in the loop below if this charset is a
3439      superset of ASCII.  */
3440   dfa->map_notascii = 0;
3441 #endif
3442 
3443 #ifdef RE_ENABLE_I18N
3444   if (dfa->mb_cur_max > 1)
3445     {
3446       if (dfa->is_utf8)
3447 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
3448       else
3449 	{
3450 	  int i, j, ch;
3451 
3452 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3453 	  if (BE (dfa->sb_char == NULL, 0))
3454 	    return REG_ESPACE;
3455 
3456 	  /* Set the bits corresponding to single byte chars.  */
3457 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3458 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3459 	      {
3460 		wint_t wch = __btowc (ch);
3461 		if (wch != WEOF)
3462 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
3463 # ifndef _LIBC
3464 		if (isascii (ch) && wch != ch)
3465 		  dfa->map_notascii = 1;
3466 # endif
3467 	      }
3468 	}
3469     }
3470 #endif
3471 
3472   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
3473     return REG_ESPACE;
3474   return REG_NOERROR;
3475 }
3476 
3477 /* Initialize WORD_CHAR table, which indicate which character is
3478    "word".  In this case "word" means that it is the word construction
3479    character used by some operators like "\<", "\>", etc.  */
3480 
3481 static void
3482 internal_function
init_word_char(re_dfa_t * dfa)3483 init_word_char (re_dfa_t *dfa)
3484 {
3485   int i, j, ch;
3486   dfa->word_ops_used = 1;
3487   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3488     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3489       if (isalnum (ch) || ch == '_')
3490 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
3491 }
3492 
3493 /* Free the work area which are only used while compiling.  */
3494 
3495 static void
free_workarea_compile(regex_t * preg)3496 free_workarea_compile (regex_t *preg)
3497 {
3498   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3499   bin_tree_storage_t *storage, *next;
3500   for (storage = dfa->str_tree_storage; storage; storage = next)
3501     {
3502       next = storage->next;
3503       re_free (storage);
3504     }
3505   dfa->str_tree_storage = NULL;
3506   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3507   dfa->str_tree = NULL;
3508   re_free (dfa->org_indices);
3509   dfa->org_indices = NULL;
3510 }
3511 
3512 /* Create initial states for all contexts.  */
3513 
3514 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)3515 create_initial_state (re_dfa_t *dfa)
3516 {
3517   int first, i;
3518   reg_errcode_t err;
3519   re_node_set init_nodes;
3520 
3521   /* Initial states have the epsilon closure of the node which is
3522      the first node of the regular expression.  */
3523   first = dfa->str_tree->first->node_idx;
3524   dfa->init_node = first;
3525   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
3526   if (BE (err != REG_NOERROR, 0))
3527     return err;
3528 
3529   /* The back-references which are in initial states can epsilon transit,
3530      since in this case all of the subexpressions can be null.
3531      Then we add epsilon closures of the nodes which are the next nodes of
3532      the back-references.  */
3533   if (dfa->nbackref > 0)
3534     for (i = 0; i < init_nodes.nelem; ++i)
3535       {
3536 	int node_idx = init_nodes.elems[i];
3537 	re_token_type_t type = dfa->nodes[node_idx].type;
3538 
3539 	int clexp_idx;
3540 	if (type != OP_BACK_REF)
3541 	  continue;
3542 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
3543 	  {
3544 	    re_token_t *clexp_node;
3545 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
3546 	    if (clexp_node->type == OP_CLOSE_SUBEXP
3547 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
3548 	      break;
3549 	  }
3550 	if (clexp_idx == init_nodes.nelem)
3551 	  continue;
3552 
3553 	if (type == OP_BACK_REF)
3554 	  {
3555 	    int dest_idx = dfa->edests[node_idx].elems[0];
3556 	    if (!re_node_set_contains (&init_nodes, dest_idx))
3557 	      {
3558 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
3559 		i = 0;
3560 	      }
3561 	  }
3562       }
3563 
3564   /* It must be the first time to invoke acquire_state.  */
3565   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
3566   /* We don't check ERR here, since the initial state must not be NULL.  */
3567   if (BE (dfa->init_state == NULL, 0))
3568     return err;
3569   if (dfa->init_state->has_constraint)
3570     {
3571       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
3572 						       CONTEXT_WORD);
3573       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
3574 						     CONTEXT_NEWLINE);
3575       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
3576 							 &init_nodes,
3577 							 CONTEXT_NEWLINE
3578 							 | CONTEXT_BEGBUF);
3579       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
3580 	      || dfa->init_state_begbuf == NULL, 0))
3581 	return err;
3582     }
3583   else
3584     dfa->init_state_word = dfa->init_state_nl
3585       = dfa->init_state_begbuf = dfa->init_state;
3586 
3587   re_node_set_free (&init_nodes);
3588   return REG_NOERROR;
3589 }
3590 
#ifdef RE_ENABLE_I18N
/* Switch DFA from UTF-8 to single-byte searching when the pattern
   allows it, to speed things up.  If any node rules this out (a word
   anchor or similar, a COMPLEX_BRACKET, or a SIMPLE_BRACKET with a bit
   set at 0x80 or above) DFA is left untouched.  Otherwise
   dfa->mb_cur_max is set to 1, is_utf8 is cleared and the DFA nodes
   are adjusted where needed.  */

static void
optimize_utf8 (re_dfa_t *dfa)
{
  int idx, word;
  int saw_high_byte = 0, saw_period = 0;

  /* First pass: decide whether the optimization is applicable.  */
  for (idx = 0; idx < dfa->nodes_len; ++idx)
    {
      re_token_t *tok = dfa->nodes + idx;

      switch (tok->type)
	{
	case CHARACTER:
	  if (tok->opr.c >= 0x80)
	    saw_high_byte = 1;
	  break;

	case ANCHOR:
	  /* Word anchors etc. cannot be handled.  */
	  if (tok->opr.idx != LINE_FIRST && tok->opr.idx != LINE_LAST
	      && tok->opr.idx != BUF_FIRST && tok->opr.idx != BUF_LAST)
	    return;
	  break;

	case OP_PERIOD:
	  saw_period = 1;
	  break;

	case OP_BACK_REF:
	case OP_ALT:
	case END_OF_RE:
	case OP_DUP_ASTERISK:
	case OP_OPEN_SUBEXP:
	case OP_CLOSE_SUBEXP:
	  break;

	case COMPLEX_BRACKET:
	  return;

	case SIMPLE_BRACKET:
	  /* Just double check.  The non-ASCII range starts at 0x80.  */
	  assert (0x80 % BITSET_WORD_BITS == 0);
	  for (word = 0x80 / BITSET_WORD_BITS; word < BITSET_WORDS; ++word)
	    if (tok->opr.sbcset[word])
	      return;
	  break;

	default:
	  abort ();
	}
    }

  /* Second pass: rewrite the nodes that behave differently in a
     single-byte search.  */
  if (saw_high_byte || saw_period)
    for (idx = 0; idx < dfa->nodes_len; ++idx)
      {
	re_token_t *tok = dfa->nodes + idx;

	if (tok->type == CHARACTER && tok->opr.c >= 0x80)
	  tok->mb_partial = 0;
	else if (tok->type == OP_PERIOD)
	  tok->type = OP_UTF8_PERIOD;
      }

  /* The search can be in single byte locale.  */
  dfa->mb_cur_max = 1;
  dfa->is_utf8 = 0;
  dfa->has_mb_node = dfa->nbackref > 0 || saw_period;
}
#endif
3660 
3661 /* Analyze the structure tree, and calculate "first", "next", "edest",
3662    "eclosure", and "inveclosure".  */
3663 
3664 static reg_errcode_t
analyze(regex_t * preg)3665 analyze (regex_t *preg)
3666 {
3667   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3668   reg_errcode_t ret;
3669 
3670   /* Allocate arrays.  */
3671   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
3672   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3673   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
3674   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
3675   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
3676 	  || dfa->eclosures == NULL, 0))
3677     return REG_ESPACE;
3678 
3679   dfa->subexp_map = re_malloc (int, preg->re_nsub);
3680   if (dfa->subexp_map != NULL)
3681     {
3682       int i;
3683       for (i = 0; i < preg->re_nsub; i++)
3684 	dfa->subexp_map[i] = i;
3685       preorder (dfa->str_tree, optimize_subexps, dfa);
3686       for (i = 0; i < preg->re_nsub; i++)
3687 	if (dfa->subexp_map[i] != i)
3688 	  break;
3689       if (i == preg->re_nsub)
3690 	{
3691 	  free (dfa->subexp_map);
3692 	  dfa->subexp_map = NULL;
3693 	}
3694     }
3695 
3696   ret = postorder (dfa->str_tree, lower_subexps, preg);
3697   if (BE (ret != REG_NOERROR, 0))
3698     return ret;
3699   ret = postorder (dfa->str_tree, calc_first, dfa);
3700   if (BE (ret != REG_NOERROR, 0))
3701     return ret;
3702   preorder (dfa->str_tree, calc_next, dfa);
3703   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
3704   if (BE (ret != REG_NOERROR, 0))
3705     return ret;
3706   ret = calc_eclosure (dfa);
3707   if (BE (ret != REG_NOERROR, 0))
3708     return ret;
3709 
3710   /* We only need this during the prune_impossible_nodes pass in regexec.c;
3711      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
3712   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
3713       || dfa->nbackref)
3714     {
3715       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
3716       if (BE (dfa->inveclosures == NULL, 0))
3717         return REG_ESPACE;
3718       ret = calc_inveclosure (dfa);
3719     }
3720 
3721   return ret;
3722 }
3723 
3724 /* Our parse trees are very unbalanced, so we cannot use a stack to
3725    implement parse tree visits.  Instead, we use parent pointers and
3726    some hairy code in these two functions.  */
3727 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)3728 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3729 	   void *extra)
3730 {
3731   bin_tree_t *node, *prev;
3732 
3733   for (node = root; ; )
3734     {
3735       /* Descend down the tree, preferably to the left (or to the right
3736 	 if that's the only child).  */
3737       while (node->left || node->right)
3738 	if (node->left)
3739           node = node->left;
3740         else
3741           node = node->right;
3742 
3743       do
3744 	{
3745 	  reg_errcode_t err = fn (extra, node);
3746 	  if (BE (err != REG_NOERROR, 0))
3747 	    return err;
3748           if (node->parent == NULL)
3749 	    return REG_NOERROR;
3750 	  prev = node;
3751 	  node = node->parent;
3752 	}
3753       /* Go up while we have a node that is reached from the right.  */
3754       while (node->right == prev || node->right == NULL);
3755       node = node->right;
3756     }
3757 }
3758 
3759 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)3760 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3761 	  void *extra)
3762 {
3763   bin_tree_t *node;
3764 
3765   for (node = root; ; )
3766     {
3767       reg_errcode_t err = fn (extra, node);
3768       if (BE (err != REG_NOERROR, 0))
3769 	return err;
3770 
3771       /* Go to the left node, or up and to the right.  */
3772       if (node->left)
3773 	node = node->left;
3774       else
3775 	{
3776 	  bin_tree_t *prev = NULL;
3777 	  while (node->right == prev || node->right == NULL)
3778 	    {
3779 	      prev = node;
3780 	      node = node->parent;
3781 	      if (!node)
3782 	        return REG_NOERROR;
3783 	    }
3784 	  node = node->right;
3785 	}
3786     }
3787 }
3788 
3789 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
3790    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
3791    backreferences as well.  Requires a preorder visit.  */
3792 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)3793 optimize_subexps (void *extra, bin_tree_t *node)
3794 {
3795   re_dfa_t *dfa = (re_dfa_t *) extra;
3796 
3797   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3798     {
3799       int idx = node->token.opr.idx;
3800       node->token.opr.idx = dfa->subexp_map[idx];
3801       dfa->used_bkref_map |= 1 << node->token.opr.idx;
3802     }
3803 
3804   else if (node->token.type == SUBEXP
3805            && node->left && node->left->token.type == SUBEXP)
3806     {
3807       int other_idx = node->left->token.opr.idx;
3808 
3809       node->left = node->left->left;
3810       if (node->left)
3811         node->left->parent = node;
3812 
3813       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
3814       if (other_idx < BITSET_WORD_BITS)
3815 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3816     }
3817 
3818   return REG_NOERROR;
3819 }
3820 
3821 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
3822    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
3823 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)3824 lower_subexps (void *extra, bin_tree_t *node)
3825 {
3826   regex_t *preg = (regex_t *) extra;
3827   reg_errcode_t err = REG_NOERROR;
3828 
3829   if (node->left && node->left->token.type == SUBEXP)
3830     {
3831       node->left = lower_subexp (&err, preg, node->left);
3832       if (node->left)
3833 	node->left->parent = node;
3834     }
3835   if (node->right && node->right->token.type == SUBEXP)
3836     {
3837       node->right = lower_subexp (&err, preg, node->right);
3838       if (node->right)
3839 	node->right->parent = node;
3840     }
3841 
3842   return err;
3843 }
3844 
3845 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)3846 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
3847 {
3848   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3849   bin_tree_t *body = node->left;
3850   bin_tree_t *op, *cls, *tree1, *tree;
3851 
3852   if (preg->no_sub
3853       /* We do not optimize empty subexpressions, because otherwise we may
3854 	 have bad CONCAT nodes with NULL children.  This is obviously not
3855 	 very common, so we do not lose much.  An example that triggers
3856 	 this case is the sed "script" /\(\)/x.  */
3857       && node->left != NULL
3858       && (node->token.opr.idx >= BITSET_WORD_BITS
3859 	  || !(dfa->used_bkref_map
3860 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
3861     return node->left;
3862 
3863   /* Convert the SUBEXP node to the concatenation of an
3864      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
3865   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
3866   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
3867   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
3868   tree = create_tree (dfa, op, tree1, CONCAT);
3869   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
3870     {
3871       *err = REG_ESPACE;
3872       return NULL;
3873     }
3874 
3875   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
3876   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
3877   return tree;
3878 }
3879 
3880 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
3881    nodes.  Requires a postorder visit.  */
3882 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)3883 calc_first (void *extra, bin_tree_t *node)
3884 {
3885   re_dfa_t *dfa = (re_dfa_t *) extra;
3886   if (node->token.type == CONCAT)
3887     {
3888       node->first = node->left->first;
3889       node->node_idx = node->left->node_idx;
3890     }
3891   else
3892     {
3893       node->first = node;
3894       node->node_idx = re_dfa_add_node (dfa, node->token);
3895       if (BE (node->node_idx == -1, 0))
3896         return REG_ESPACE;
3897     }
3898   return REG_NOERROR;
3899 }
3900 
3901 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
3902 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)3903 calc_next (void *extra, bin_tree_t *node)
3904 {
3905   switch (node->token.type)
3906     {
3907     case OP_DUP_ASTERISK:
3908       node->left->next = node;
3909       break;
3910     case CONCAT:
3911       node->left->next = node->right->first;
3912       node->right->next = node->next;
3913       break;
3914     default:
3915       if (node->left)
3916 	node->left->next = node->next;
3917       if (node->right)
3918         node->right->next = node->next;
3919       break;
3920     }
3921   return REG_NOERROR;
3922 }
3923 
3924 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
3925 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)3926 link_nfa_nodes (void *extra, bin_tree_t *node)
3927 {
3928   re_dfa_t *dfa = (re_dfa_t *) extra;
3929   int idx = node->node_idx;
3930   reg_errcode_t err = REG_NOERROR;
3931 
3932   switch (node->token.type)
3933     {
3934     case CONCAT:
3935       break;
3936 
3937     case END_OF_RE:
3938       assert (node->next == NULL);
3939       break;
3940 
3941     case OP_DUP_ASTERISK:
3942     case OP_ALT:
3943       {
3944 	int left, right;
3945 	dfa->has_plural_match = 1;
3946 	if (node->left != NULL)
3947 	  left = node->left->first->node_idx;
3948 	else
3949 	  left = node->next->node_idx;
3950 	if (node->right != NULL)
3951 	  right = node->right->first->node_idx;
3952 	else
3953 	  right = node->next->node_idx;
3954 	assert (left > -1);
3955 	assert (right > -1);
3956 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
3957       }
3958       break;
3959 
3960     case ANCHOR:
3961     case OP_OPEN_SUBEXP:
3962     case OP_CLOSE_SUBEXP:
3963       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
3964       break;
3965 
3966     case OP_BACK_REF:
3967       dfa->nexts[idx] = node->next->node_idx;
3968       if (node->token.type == OP_BACK_REF)
3969 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
3970       break;
3971 
3972     default:
3973       assert (!IS_EPSILON_NODE (node->token.type));
3974       dfa->nexts[idx] = node->next->node_idx;
3975       break;
3976     }
3977 
3978   return err;
3979 }
3980 
3981 /* Duplicate the epsilon closure of the node ROOT_NODE.
3982    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
3983    to their own constraint.  */
3984 
3985 static reg_errcode_t
3986 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)3987 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
3988 			int root_node, unsigned int init_constraint)
3989 {
3990   int org_node, clone_node, ret;
3991   unsigned int constraint = init_constraint;
3992   for (org_node = top_org_node, clone_node = top_clone_node;;)
3993     {
3994       int org_dest, clone_dest;
3995       if (dfa->nodes[org_node].type == OP_BACK_REF)
3996 	{
3997 	  /* If the back reference epsilon-transit, its destination must
3998 	     also have the constraint.  Then duplicate the epsilon closure
3999 	     of the destination of the back reference, and store it in
4000 	     edests of the back reference.  */
4001 	  org_dest = dfa->nexts[org_node];
4002 	  re_node_set_empty (dfa->edests + clone_node);
4003 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
4004 	  if (BE (clone_dest == -1, 0))
4005 	    return REG_ESPACE;
4006 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
4007 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4008 	  if (BE (ret < 0, 0))
4009 	    return REG_ESPACE;
4010 	}
4011       else if (dfa->edests[org_node].nelem == 0)
4012 	{
4013 	  /* In case of the node can't epsilon-transit, don't duplicate the
4014 	     destination and store the original destination as the
4015 	     destination of the node.  */
4016 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
4017 	  break;
4018 	}
4019       else if (dfa->edests[org_node].nelem == 1)
4020 	{
4021 	  /* In case of the node can epsilon-transit, and it has only one
4022 	     destination.  */
4023 	  org_dest = dfa->edests[org_node].elems[0];
4024 	  re_node_set_empty (dfa->edests + clone_node);
4025 	  if (dfa->nodes[org_node].type == ANCHOR)
4026 	    {
4027 	      /* In case of the node has another constraint, append it.  */
4028 	      if (org_node == root_node && clone_node != org_node)
4029 		{
4030 		  /* ...but if the node is root_node itself, it means the
4031 		     epsilon closure have a loop, then tie it to the
4032 		     destination of the root_node.  */
4033 		  ret = re_node_set_insert (dfa->edests + clone_node,
4034 					    org_dest);
4035 		  if (BE (ret < 0, 0))
4036 		    return REG_ESPACE;
4037 		  break;
4038 		}
4039 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
4040 	    }
4041 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
4042 	  if (BE (clone_dest == -1, 0))
4043 	    return REG_ESPACE;
4044 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4045 	  if (BE (ret < 0, 0))
4046 	    return REG_ESPACE;
4047 	}
4048       else /* dfa->edests[org_node].nelem == 2 */
4049 	{
4050 	  /* In case of the node can epsilon-transit, and it has two
4051 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
4052 	  org_dest = dfa->edests[org_node].elems[0];
4053 	  re_node_set_empty (dfa->edests + clone_node);
4054 	  /* Search for a duplicated node which satisfies the constraint.  */
4055 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
4056 	  if (clone_dest == -1)
4057 	    {
4058 	      /* There are no such a duplicated node, create a new one.  */
4059 	      reg_errcode_t err;
4060 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
4061 	      if (BE (clone_dest == -1, 0))
4062 		return REG_ESPACE;
4063 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4064 	      if (BE (ret < 0, 0))
4065 		return REG_ESPACE;
4066 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
4067 					    root_node, constraint);
4068 	      if (BE (err != REG_NOERROR, 0))
4069 		return err;
4070 	    }
4071 	  else
4072 	    {
4073 	      /* There are a duplicated node which satisfy the constraint,
4074 		 use it to avoid infinite loop.  */
4075 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4076 	      if (BE (ret < 0, 0))
4077 		return REG_ESPACE;
4078 	    }
4079 
4080 	  org_dest = dfa->edests[org_node].elems[1];
4081 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
4082 	  if (BE (clone_dest == -1, 0))
4083 	    return REG_ESPACE;
4084 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4085 	  if (BE (ret < 0, 0))
4086 	    return REG_ESPACE;
4087 	}
4088       org_node = org_dest;
4089       clone_node = clone_dest;
4090     }
4091   return REG_NOERROR;
4092 }
4093 
4094 /* Search for a node which is duplicated from the node ORG_NODE, and
4095    satisfies the constraint CONSTRAINT.  */
4096 
4097 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)4098 search_duplicated_node (const re_dfa_t *dfa, int org_node,
4099 			unsigned int constraint)
4100 {
4101   int idx;
4102   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
4103     {
4104       if (org_node == dfa->org_indices[idx]
4105 	  && constraint == dfa->nodes[idx].constraint)
4106 	return idx; /* Found.  */
4107     }
4108   return -1; /* Not found.  */
4109 }
4110 
4111 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
4112    Return the index of the new node, or -1 if insufficient storage is
4113    available.  */
4114 
4115 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)4116 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
4117 {
4118   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
4119   if (BE (dup_idx != -1, 1))
4120     {
4121       dfa->nodes[dup_idx].constraint = constraint;
4122       if (dfa->nodes[org_idx].type == ANCHOR)
4123 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
4124       dfa->nodes[dup_idx].duplicated = 1;
4125 
4126       /* Store the index of the original node.  */
4127       dfa->org_indices[dup_idx] = org_idx;
4128     }
4129   return dup_idx;
4130 }
4131 
4132 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)4133 calc_inveclosure (re_dfa_t *dfa)
4134 {
4135   int src, idx, ret;
4136   for (idx = 0; idx < dfa->nodes_len; ++idx)
4137     re_node_set_init_empty (dfa->inveclosures + idx);
4138 
4139   for (src = 0; src < dfa->nodes_len; ++src)
4140     {
4141       int *elems = dfa->eclosures[src].elems;
4142       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
4143 	{
4144 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
4145 	  if (BE (ret == -1, 0))
4146 	    return REG_ESPACE;
4147 	}
4148     }
4149 
4150   return REG_NOERROR;
4151 }
4152 
4153 /* Calculate "eclosure" for all the node in DFA.  */
4154 
4155 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)4156 calc_eclosure (re_dfa_t *dfa)
4157 {
4158   int node_idx, incomplete;
4159 #ifdef DEBUG
4160   assert (dfa->nodes_len > 0);
4161 #endif
4162   incomplete = 0;
4163   /* For each nodes, calculate epsilon closure.  */
4164   for (node_idx = 0; ; ++node_idx)
4165     {
4166       reg_errcode_t err;
4167       re_node_set eclosure_elem;
4168       if (node_idx == dfa->nodes_len)
4169 	{
4170 	  if (!incomplete)
4171 	    break;
4172 	  incomplete = 0;
4173 	  node_idx = 0;
4174 	}
4175 
4176 #ifdef DEBUG
4177       assert (dfa->eclosures[node_idx].nelem != -1);
4178 #endif
4179 
4180       /* If we have already calculated, skip it.  */
4181       if (dfa->eclosures[node_idx].nelem != 0)
4182 	continue;
4183       /* Calculate epsilon closure of `node_idx'.  */
4184       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
4185       if (BE (err != REG_NOERROR, 0))
4186 	return err;
4187 
4188       if (dfa->eclosures[node_idx].nelem == 0)
4189 	{
4190 	  incomplete = 1;
4191 	  re_node_set_free (&eclosure_elem);
4192 	}
4193     }
4194   return REG_NOERROR;
4195 }
4196 
4197 /* Calculate epsilon closure of NODE.  */
4198 
4199 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)4200 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
4201 {
4202   reg_errcode_t err;
4203   unsigned int constraint;
4204   int i, incomplete;
4205   re_node_set eclosure;
4206   incomplete = 0;
4207   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
4208   if (BE (err != REG_NOERROR, 0))
4209     return err;
4210 
4211   /* This indicates that we are calculating this node now.
4212      We reference this value to avoid infinite loop.  */
4213   dfa->eclosures[node].nelem = -1;
4214 
4215   constraint = ((dfa->nodes[node].type == ANCHOR)
4216 		? dfa->nodes[node].opr.ctx_type : 0);
4217   /* If the current node has constraints, duplicate all nodes.
4218      Since they must inherit the constraints.  */
4219   if (constraint
4220       && dfa->edests[node].nelem
4221       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
4222     {
4223       err = duplicate_node_closure (dfa, node, node, node, constraint);
4224       if (BE (err != REG_NOERROR, 0))
4225 	return err;
4226     }
4227 
4228   /* Expand each epsilon destination nodes.  */
4229   if (IS_EPSILON_NODE(dfa->nodes[node].type))
4230     for (i = 0; i < dfa->edests[node].nelem; ++i)
4231       {
4232 	re_node_set eclosure_elem;
4233 	int edest = dfa->edests[node].elems[i];
4234 	/* If calculating the epsilon closure of `edest' is in progress,
4235 	   return intermediate result.  */
4236 	if (dfa->eclosures[edest].nelem == -1)
4237 	  {
4238 	    incomplete = 1;
4239 	    continue;
4240 	  }
4241 	/* If we haven't calculated the epsilon closure of `edest' yet,
4242 	   calculate now. Otherwise use calculated epsilon closure.  */
4243 	if (dfa->eclosures[edest].nelem == 0)
4244 	  {
4245 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
4246 	    if (BE (err != REG_NOERROR, 0))
4247 	      return err;
4248 	  }
4249 	else
4250 	  eclosure_elem = dfa->eclosures[edest];
4251 	/* Merge the epsilon closure of `edest'.  */
4252 	re_node_set_merge (&eclosure, &eclosure_elem);
4253 	/* If the epsilon closure of `edest' is incomplete,
4254 	   the epsilon closure of this node is also incomplete.  */
4255 	if (dfa->eclosures[edest].nelem == 0)
4256 	  {
4257 	    incomplete = 1;
4258 	    re_node_set_free (&eclosure_elem);
4259 	  }
4260       }
4261 
4262   /* Epsilon closures include itself.  */
4263   re_node_set_insert (&eclosure, node);
4264   if (incomplete && !root)
4265     dfa->eclosures[node].nelem = 0;
4266   else
4267     dfa->eclosures[node] = eclosure;
4268   *new_set = eclosure;
4269   return REG_NOERROR;
4270 }
4271 
4272 /* Functions for token which are used in the parser.  */
4273 
4274 /* Fetch a token from INPUT.
4275    We must not use this function inside bracket expressions.  */
4276 
4277 static void
4278 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)4279 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
4280 {
4281   re_string_skip_bytes (input, peek_token (result, input, syntax));
4282 }
4283 
4284 /* Peek a token from INPUT, and return the length of the token.
4285    We must not use this function inside bracket expressions.  */
4286 
4287 static int
4288 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)4289 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4290 {
4291   unsigned char c;
4292 
4293   if (re_string_eoi (input))
4294     {
4295       token->type = END_OF_RE;
4296       return 0;
4297     }
4298 
4299   c = re_string_peek_byte (input, 0);
4300   token->opr.c = c;
4301 
4302   token->word_char = 0;
4303 #ifdef RE_ENABLE_I18N
4304   token->mb_partial = 0;
4305   if (input->mb_cur_max > 1 &&
4306       !re_string_first_byte (input, re_string_cur_idx (input)))
4307     {
4308       token->type = CHARACTER;
4309       token->mb_partial = 1;
4310       return 1;
4311     }
4312 #endif
4313   if (c == '\\')
4314     {
4315       unsigned char c2;
4316       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
4317 	{
4318 	  token->type = BACK_SLASH;
4319 	  return 1;
4320 	}
4321 
4322       c2 = re_string_peek_byte_case (input, 1);
4323       token->opr.c = c2;
4324       token->type = CHARACTER;
4325 #ifdef RE_ENABLE_I18N
4326       if (input->mb_cur_max > 1)
4327 	{
4328 	  wint_t wc = re_string_wchar_at (input,
4329 					  re_string_cur_idx (input) + 1);
4330 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4331 	}
4332       else
4333 #endif
4334 	token->word_char = IS_WORD_CHAR (c2) != 0;
4335 
4336       switch (c2)
4337 	{
4338 	case '|':
4339 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
4340 	    token->type = OP_ALT;
4341 	  break;
4342 	case '1': case '2': case '3': case '4': case '5':
4343 	case '6': case '7': case '8': case '9':
4344 	  if (!(syntax & RE_NO_BK_REFS))
4345 	    {
4346 	      token->type = OP_BACK_REF;
4347 	      token->opr.idx = c2 - '1';
4348 	    }
4349 	  break;
4350 	case '<':
4351 	  if (!(syntax & RE_NO_GNU_OPS))
4352 	    {
4353 	      token->type = ANCHOR;
4354 	      token->opr.ctx_type = WORD_FIRST;
4355 	    }
4356 	  break;
4357 	case '>':
4358 	  if (!(syntax & RE_NO_GNU_OPS))
4359 	    {
4360 	      token->type = ANCHOR;
4361 	      token->opr.ctx_type = WORD_LAST;
4362 	    }
4363 	  break;
4364 	case 'b':
4365 	  if (!(syntax & RE_NO_GNU_OPS))
4366 	    {
4367 	      token->type = ANCHOR;
4368 	      token->opr.ctx_type = WORD_DELIM;
4369 	    }
4370 	  break;
4371 	case 'B':
4372 	  if (!(syntax & RE_NO_GNU_OPS))
4373 	    {
4374 	      token->type = ANCHOR;
4375 	      token->opr.ctx_type = NOT_WORD_DELIM;
4376 	    }
4377 	  break;
4378 	case 'w':
4379 	  if (!(syntax & RE_NO_GNU_OPS))
4380 	    token->type = OP_WORD;
4381 	  break;
4382 	case 'W':
4383 	  if (!(syntax & RE_NO_GNU_OPS))
4384 	    token->type = OP_NOTWORD;
4385 	  break;
4386 	case 's':
4387 	  if (!(syntax & RE_NO_GNU_OPS))
4388 	    token->type = OP_SPACE;
4389 	  break;
4390 	case 'S':
4391 	  if (!(syntax & RE_NO_GNU_OPS))
4392 	    token->type = OP_NOTSPACE;
4393 	  break;
4394 	case '`':
4395 	  if (!(syntax & RE_NO_GNU_OPS))
4396 	    {
4397 	      token->type = ANCHOR;
4398 	      token->opr.ctx_type = BUF_FIRST;
4399 	    }
4400 	  break;
4401 	case '\'':
4402 	  if (!(syntax & RE_NO_GNU_OPS))
4403 	    {
4404 	      token->type = ANCHOR;
4405 	      token->opr.ctx_type = BUF_LAST;
4406 	    }
4407 	  break;
4408 	case '(':
4409 	  if (!(syntax & RE_NO_BK_PARENS))
4410 	    token->type = OP_OPEN_SUBEXP;
4411 	  break;
4412 	case ')':
4413 	  if (!(syntax & RE_NO_BK_PARENS))
4414 	    token->type = OP_CLOSE_SUBEXP;
4415 	  break;
4416 	case '+':
4417 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4418 	    token->type = OP_DUP_PLUS;
4419 	  break;
4420 	case '?':
4421 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4422 	    token->type = OP_DUP_QUESTION;
4423 	  break;
4424 	case '{':
4425 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4426 	    token->type = OP_OPEN_DUP_NUM;
4427 	  break;
4428 	case '}':
4429 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4430 	    token->type = OP_CLOSE_DUP_NUM;
4431 	  break;
4432 	default:
4433 	  break;
4434 	}
4435       return 2;
4436     }
4437 
4438   token->type = CHARACTER;
4439 #ifdef RE_ENABLE_I18N
4440   if (input->mb_cur_max > 1)
4441     {
4442       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
4443       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4444     }
4445   else
4446 #endif
4447     token->word_char = IS_WORD_CHAR (token->opr.c);
4448 
4449   switch (c)
4450     {
4451     case '\n':
4452       if (syntax & RE_NEWLINE_ALT)
4453 	token->type = OP_ALT;
4454       break;
4455     case '|':
4456       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
4457 	token->type = OP_ALT;
4458       break;
4459     case '*':
4460       token->type = OP_DUP_ASTERISK;
4461       break;
4462     case '+':
4463       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4464 	token->type = OP_DUP_PLUS;
4465       break;
4466     case '?':
4467       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4468 	token->type = OP_DUP_QUESTION;
4469       break;
4470     case '{':
4471       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4472 	token->type = OP_OPEN_DUP_NUM;
4473       break;
4474     case '}':
4475       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4476 	token->type = OP_CLOSE_DUP_NUM;
4477       break;
4478     case '(':
4479       if (syntax & RE_NO_BK_PARENS)
4480 	token->type = OP_OPEN_SUBEXP;
4481       break;
4482     case ')':
4483       if (syntax & RE_NO_BK_PARENS)
4484 	token->type = OP_CLOSE_SUBEXP;
4485       break;
4486     case '[':
4487       token->type = OP_OPEN_BRACKET;
4488       break;
4489     case '.':
4490       token->type = OP_PERIOD;
4491       break;
4492     case '^':
4493       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
4494 	  re_string_cur_idx (input) != 0)
4495 	{
4496 	  char prev = re_string_peek_byte (input, -1);
4497 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
4498 	    break;
4499 	}
4500       token->type = ANCHOR;
4501       token->opr.ctx_type = LINE_FIRST;
4502       break;
4503     case '$':
4504       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
4505 	  re_string_cur_idx (input) + 1 != re_string_length (input))
4506 	{
4507 	  re_token_t next;
4508 	  re_string_skip_bytes (input, 1);
4509 	  peek_token (&next, input, syntax);
4510 	  re_string_skip_bytes (input, -1);
4511 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
4512 	    break;
4513 	}
4514       token->type = ANCHOR;
4515       token->opr.ctx_type = LINE_LAST;
4516       break;
4517     default:
4518       break;
4519     }
4520   return 1;
4521 }
4522 
4523 /* Peek a token from INPUT, and return the length of the token.
4524    We must not use this function out of bracket expressions.  */
4525 
4526 static int
4527 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)4528 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4529 {
4530   unsigned char c;
4531   if (re_string_eoi (input))
4532     {
4533       token->type = END_OF_RE;
4534       return 0;
4535     }
4536   c = re_string_peek_byte (input, 0);
4537   token->opr.c = c;
4538 
4539 #ifdef RE_ENABLE_I18N
4540   if (input->mb_cur_max > 1 &&
4541       !re_string_first_byte (input, re_string_cur_idx (input)))
4542     {
4543       token->type = CHARACTER;
4544       return 1;
4545     }
4546 #endif /* RE_ENABLE_I18N */
4547 
4548   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
4549       && re_string_cur_idx (input) + 1 < re_string_length (input))
4550     {
4551       /* In this case, '\' escape a character.  */
4552       unsigned char c2;
4553       re_string_skip_bytes (input, 1);
4554       c2 = re_string_peek_byte (input, 0);
4555       token->opr.c = c2;
4556       token->type = CHARACTER;
4557       return 1;
4558     }
4559   if (c == '[') /* '[' is a special char in a bracket exps.  */
4560     {
4561       unsigned char c2;
4562       int token_len;
4563       if (re_string_cur_idx (input) + 1 < re_string_length (input))
4564 	c2 = re_string_peek_byte (input, 1);
4565       else
4566 	c2 = 0;
4567       token->opr.c = c2;
4568       token_len = 2;
4569       switch (c2)
4570 	{
4571 	case '.':
4572 	  token->type = OP_OPEN_COLL_ELEM;
4573 	  break;
4574 	case '=':
4575 	  token->type = OP_OPEN_EQUIV_CLASS;
4576 	  break;
4577 	case ':':
4578 	  if (syntax & RE_CHAR_CLASSES)
4579 	    {
4580 	      token->type = OP_OPEN_CHAR_CLASS;
4581 	      break;
4582 	    }
4583 	  /* else fall through.  */
4584 	default:
4585 	  token->type = CHARACTER;
4586 	  token->opr.c = c;
4587 	  token_len = 1;
4588 	  break;
4589 	}
4590       return token_len;
4591     }
4592   switch (c)
4593     {
4594     case '-':
4595       token->type = OP_CHARSET_RANGE;
4596       break;
4597     case ']':
4598       token->type = OP_CLOSE_BRACKET;
4599       break;
4600     case '^':
4601       token->type = OP_NON_MATCH_LIST;
4602       break;
4603     default:
4604       token->type = CHARACTER;
4605     }
4606   return 1;
4607 }
4608 
4609 /* Functions for parser.  */
4610 
4611 /* Entry point of the parser.
4612    Parse the regular expression REGEXP and return the structure tree.
4613    If an error is occured, ERR is set by error code, and return NULL.
4614    This function build the following tree, from regular expression <reg_exp>:
4615 	   CAT
4616 	   / \
4617 	  /   \
4618    <reg_exp>  EOR
4619 
4620    CAT means concatenation.
4621    EOR means end of regular expression.  */
4622 
4623 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)4624 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
4625        reg_errcode_t *err)
4626 {
4627   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4628   bin_tree_t *tree, *eor, *root;
4629   re_token_t current_token;
4630   dfa->syntax = syntax;
4631   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4632   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
4633   if (BE (*err != REG_NOERROR && tree == NULL, 0))
4634     return NULL;
4635   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
4636   if (tree != NULL)
4637     root = create_tree (dfa, tree, eor, CONCAT);
4638   else
4639     root = eor;
4640   if (BE (eor == NULL || root == NULL, 0))
4641     {
4642       *err = REG_ESPACE;
4643       return NULL;
4644     }
4645   return root;
4646 }
4647 
4648 /* This function build the following tree, from regular expression
4649    <branch1>|<branch2>:
4650 	   ALT
4651 	   / \
4652 	  /   \
4653    <branch1> <branch2>
4654 
4655    ALT means alternative, which represents the operator `|'.  */
4656 
4657 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4658 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4659 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
4660 {
4661   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4662   bin_tree_t *tree, *branch = NULL;
4663   tree = parse_branch (regexp, preg, token, syntax, nest, err);
4664   if (BE (*err != REG_NOERROR && tree == NULL, 0))
4665     return NULL;
4666 
4667   while (token->type == OP_ALT)
4668     {
4669       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4670       if (token->type != OP_ALT && token->type != END_OF_RE
4671 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4672 	{
4673 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
4674 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
4675 	    return NULL;
4676 	}
4677       else
4678 	branch = NULL;
4679       tree = create_tree (dfa, tree, branch, OP_ALT);
4680       if (BE (tree == NULL, 0))
4681 	{
4682 	  *err = REG_ESPACE;
4683 	  return NULL;
4684 	}
4685     }
4686   return tree;
4687 }
4688 
4689 /* This function build the following tree, from regular expression
4690    <exp1><exp2>:
4691 	CAT
4692 	/ \
4693        /   \
4694    <exp1> <exp2>
4695 
4696    CAT means concatenation.  */
4697 
4698 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4699 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
4700 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
4701 {
4702   bin_tree_t *tree, *exp;
4703   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4704   tree = parse_expression (regexp, preg, token, syntax, nest, err);
4705   if (BE (*err != REG_NOERROR && tree == NULL, 0))
4706     return NULL;
4707 
4708   while (token->type != OP_ALT && token->type != END_OF_RE
4709 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4710     {
4711       exp = parse_expression (regexp, preg, token, syntax, nest, err);
4712       if (BE (*err != REG_NOERROR && exp == NULL, 0))
4713 	{
4714 	  return NULL;
4715 	}
4716       if (tree != NULL && exp != NULL)
4717 	{
4718 	  tree = create_tree (dfa, tree, exp, CONCAT);
4719 	  if (tree == NULL)
4720 	    {
4721 	      *err = REG_ESPACE;
4722 	      return NULL;
4723 	    }
4724 	}
4725       else if (tree == NULL)
4726 	tree = exp;
4727       /* Otherwise exp == NULL, we don't need to create new tree.  */
4728     }
4729   return tree;
4730 }
4731 
/* Parse a single expression: one operand selected by the type of *TOKEN,
   followed by any repetition operators that are applied to it.
   From the regular expression a* this function builds the tree:
	 *
	 |
	 a

   REGEXP is the input being scanned, PREG the pattern buffer whose DFA
   receives the tree nodes, SYNTAX the syntax bits in effect and NEST the
   current parenthesis nesting depth.  On return *TOKEN holds the first
   token that does not belong to the expression.
   Return the tree of the expression.  NULL is returned with *ERR set on
   error; NULL is returned without touching *ERR when the current token is
   OP_ALT or END_OF_RE, or when an unexpected token type is seen.  */

static bin_tree_t *
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
{
  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
  bin_tree_t *tree;
  switch (token->type)
    {
    case CHARACTER:
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (BE (tree == NULL, 0))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
#ifdef RE_ENABLE_I18N
      if (dfa->mb_cur_max > 1)
	{
	  /* Append the remaining bytes of a multibyte character: keep
	     concatenating tokens until the input is exhausted or the
	     current index is the first byte of a character.  */
	  while (!re_string_eoi (regexp)
		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
	    {
	      bin_tree_t *mbc_remain;
	      fetch_token (token, regexp, syntax);
	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
	      if (BE (mbc_remain == NULL || tree == NULL, 0))
		{
		  *err = REG_ESPACE;
		  return NULL;
		}
	    }
	}
#endif
      break;
    case OP_OPEN_SUBEXP:
      /* A parenthesized group is parsed one nesting level deeper.  */
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
	return NULL;
      break;
    case OP_OPEN_BRACKET:
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
	return NULL;
      break;
    case OP_BACK_REF:
      /* A back reference is only valid if the bit of its group is set in
	 completed_bkref_map.  */
      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
	{
	  *err = REG_ESUBREG;
	  return NULL;
	}
      dfa->used_bkref_map |= 1 << token->opr.idx;
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (BE (tree == NULL, 0))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      ++dfa->nbackref;
      dfa->has_mb_node = 1;
      break;
    case OP_OPEN_DUP_NUM:
      if (syntax & RE_CONTEXT_INVALID_DUP)
	{
	  *err = REG_BADRPT;
	  return NULL;
	}
      /* FALLTHROUGH */
    case OP_DUP_ASTERISK:
    case OP_DUP_PLUS:
    case OP_DUP_QUESTION:
      /* A repetition operator with no operand in front of it: reject it,
	 skip it, or (below) take it literally, depending on SYNTAX.  */
      if (syntax & RE_CONTEXT_INVALID_OPS)
	{
	  *err = REG_BADRPT;
	  return NULL;
	}
      else if (syntax & RE_CONTEXT_INDEP_OPS)
	{
	  fetch_token (token, regexp, syntax);
	  return parse_expression (regexp, preg, token, syntax, nest, err);
	}
      /* else fall through  */
    case OP_CLOSE_SUBEXP:
      /* The type is re-tested because the cases above fall through to
	 here as well.  */
      if ((token->type == OP_CLOSE_SUBEXP) &&
	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
	{
	  *err = REG_ERPAREN;
	  return NULL;
	}
      /* else fall through  */
    case OP_CLOSE_DUP_NUM:
      /* We treat it as a normal character.  */

      /* Then we can treat these characters as normal characters.  */
      token->type = CHARACTER;
      /* mb_partial and word_char bits should be initialized already
	 by peek_token.  */
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (BE (tree == NULL, 0))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      break;
    case ANCHOR:
      /* The first word-related anchor triggers init_word_char.  */
      if ((token->opr.ctx_type
	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
	  && dfa->word_ops_used == 0)
	init_word_char (dfa);
      if (token->opr.ctx_type == WORD_DELIM
          || token->opr.ctx_type == NOT_WORD_DELIM)
	{
	  /* WORD_DELIM is expanded to WORD_FIRST | WORD_LAST and
	     NOT_WORD_DELIM to INSIDE_WORD | INSIDE_NOTWORD, each as an
	     OP_ALT node over two anchor nodes.  */
	  bin_tree_t *tree_first, *tree_last;
	  if (token->opr.ctx_type == WORD_DELIM)
	    {
	      token->opr.ctx_type = WORD_FIRST;
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
	      token->opr.ctx_type = WORD_LAST;
            }
          else
            {
	      token->opr.ctx_type = INSIDE_WORD;
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
	      token->opr.ctx_type = INSIDE_NOTWORD;
            }
	  tree_last = create_token_tree (dfa, NULL, NULL, token);
	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
	    {
	      *err = REG_ESPACE;
	      return NULL;
	    }
	}
      else
	{
	  tree = create_token_tree (dfa, NULL, NULL, token);
	  if (BE (tree == NULL, 0))
	    {
	      *err = REG_ESPACE;
	      return NULL;
	    }
	}
      /* We must return here, since ANCHORs can't be followed
	 by repetition operators.
	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
      fetch_token (token, regexp, syntax);
      return tree;
    case OP_PERIOD:
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (BE (tree == NULL, 0))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      if (dfa->mb_cur_max > 1)
	dfa->has_mb_node = 1;
      break;
    case OP_WORD:
    case OP_NOTWORD:
      /* Built from the class "alnum" plus the extra character "_";
	 the class is negated for OP_NOTWORD.  */
      tree = build_charclass_op (dfa, regexp->trans,
				 (const unsigned char *) "alnum",
				 (const unsigned char *) "_",
				 token->type == OP_NOTWORD, err);
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
	return NULL;
      break;
    case OP_SPACE:
    case OP_NOTSPACE:
      /* Built from the class "space" with no extra characters;
	 the class is negated for OP_NOTSPACE.  */
      tree = build_charclass_op (dfa, regexp->trans,
				 (const unsigned char *) "space",
				 (const unsigned char *) "",
				 token->type == OP_NOTSPACE, err);
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
	return NULL;
      break;
    case OP_ALT:
    case END_OF_RE:
      /* Empty expression: no tree, no error, token left in place.  */
      return NULL;
    case BACK_SLASH:
      *err = REG_EESCAPE;
      return NULL;
    default:
      /* Must not happen?  Note that *ERR is left untouched here.  */
#ifdef DEBUG
      assert (0);
#endif
      return NULL;
    }
  /* Step past the operand and look for repetition operators.  */
  fetch_token (token, regexp, syntax);

  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
    {
      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
	return NULL;
      /* In BRE consecutive duplications are not allowed.  */
      if ((syntax & RE_CONTEXT_INVALID_DUP)
	  && (token->type == OP_DUP_ASTERISK
	      || token->type == OP_OPEN_DUP_NUM))
	{
	  *err = REG_BADRPT;
	  return NULL;
	}
    }

  return tree;
}
4946 
4947 /* This function build the following tree, from regular expression
4948    (<reg_exp>):
4949 	 SUBEXP
4950 	    |
4951 	<reg_exp>
4952 */
4953 
4954 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)4955 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4956 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
4957 {
4958   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4959   bin_tree_t *tree;
4960   size_t cur_nsub;
4961   cur_nsub = preg->re_nsub++;
4962 
4963   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4964 
4965   /* The subexpression may be a null string.  */
4966   if (token->type == OP_CLOSE_SUBEXP)
4967     tree = NULL;
4968   else
4969     {
4970       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
4971       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
4972         *err = REG_EPAREN;
4973       if (BE (*err != REG_NOERROR, 0))
4974 	return NULL;
4975     }
4976 
4977   if (cur_nsub <= '9' - '1')
4978     dfa->completed_bkref_map |= 1 << cur_nsub;
4979 
4980   tree = create_tree (dfa, tree, NULL, SUBEXP);
4981   if (BE (tree == NULL, 0))
4982     {
4983       *err = REG_ESPACE;
4984       return NULL;
4985     }
4986   tree->token.opr.idx = cur_nsub;
4987   return tree;
4988 }
4989 
4990 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
4991 
4992 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)4993 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
4994 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
4995 {
4996   bin_tree_t *tree = NULL, *old_tree = NULL;
4997   int i, start, end, start_idx = re_string_cur_idx (regexp);
4998   re_token_t start_token = *token;
4999 
5000   if (token->type == OP_OPEN_DUP_NUM)
5001     {
5002       end = 0;
5003       start = fetch_number (regexp, token, syntax);
5004       if (start == -1)
5005 	{
5006 	  if (token->type == CHARACTER && token->opr.c == ',')
5007 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
5008 	  else
5009 	    {
5010 	      *err = REG_BADBR; /* <re>{} is invalid.  */
5011 	      return NULL;
5012 	    }
5013 	}
5014       if (BE (start != -2, 1))
5015 	{
5016 	  /* We treat "{n}" as "{n,n}".  */
5017 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
5018 		 : ((token->type == CHARACTER && token->opr.c == ',')
5019 		    ? fetch_number (regexp, token, syntax) : -2));
5020 	}
5021       if (BE (start == -2 || end == -2, 0))
5022 	{
5023 	  /* Invalid sequence.  */
5024 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
5025 	    {
5026 	      if (token->type == END_OF_RE)
5027 		*err = REG_EBRACE;
5028 	      else
5029 		*err = REG_BADBR;
5030 
5031 	      return NULL;
5032 	    }
5033 
5034 	  /* If the syntax bit is set, rollback.  */
5035 	  re_string_set_index (regexp, start_idx);
5036 	  *token = start_token;
5037 	  token->type = CHARACTER;
5038 	  /* mb_partial and word_char bits should be already initialized by
5039 	     peek_token.  */
5040 	  return elem;
5041 	}
5042 
5043       if (BE (end != -1 && start > end, 0))
5044 	{
5045 	  /* First number greater than second.  */
5046 	  *err = REG_BADBR;
5047 	  return NULL;
5048 	}
5049     }
5050   else
5051     {
5052       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
5053       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
5054     }
5055 
5056   fetch_token (token, regexp, syntax);
5057 
5058   if (BE (elem == NULL, 0))
5059     return NULL;
5060   if (BE (start == 0 && end == 0, 0))
5061     {
5062       postorder (elem, free_tree, NULL);
5063       return NULL;
5064     }
5065 
5066   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
5067   if (BE (start > 0, 0))
5068     {
5069       tree = elem;
5070       for (i = 2; i <= start; ++i)
5071 	{
5072 	  elem = duplicate_tree (elem, dfa);
5073 	  tree = create_tree (dfa, tree, elem, CONCAT);
5074 	  if (BE (elem == NULL || tree == NULL, 0))
5075 	    goto parse_dup_op_espace;
5076 	}
5077 
5078       if (start == end)
5079 	return tree;
5080 
5081       /* Duplicate ELEM before it is marked optional.  */
5082       elem = duplicate_tree (elem, dfa);
5083       old_tree = tree;
5084     }
5085   else
5086     old_tree = NULL;
5087 
5088   if (elem->token.type == SUBEXP)
5089     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
5090 
5091   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
5092   if (BE (tree == NULL, 0))
5093     goto parse_dup_op_espace;
5094 
5095   /* This loop is actually executed only when end != -1,
5096      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
5097      already created the start+1-th copy.  */
5098   for (i = start + 2; i <= end; ++i)
5099     {
5100       elem = duplicate_tree (elem, dfa);
5101       tree = create_tree (dfa, tree, elem, CONCAT);
5102       if (BE (elem == NULL || tree == NULL, 0))
5103         goto parse_dup_op_espace;
5104 
5105       tree = create_tree (dfa, tree, NULL, OP_ALT);
5106       if (BE (tree == NULL, 0))
5107         goto parse_dup_op_espace;
5108     }
5109 
5110   if (old_tree)
5111     tree = create_tree (dfa, old_tree, tree, CONCAT);
5112 
5113   return tree;
5114 
5115  parse_dup_op_espace:
5116   *err = REG_ESPACE;
5117   return NULL;
5118 }
5119 
/* Size, in bytes, of the buffers that hold the name of a collating symbol,
   equivalence class or character class while a bracket expression is
   parsed.  The value was chosen without a proven upper bound: "I'm not
   sure, but maybe enough."  */
#define BRACKET_NAME_BUF_SIZE 32
5123 
5124 #ifndef _LIBC
5125   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
5126      Build the range expression which starts from START_ELEM, and ends
5127      at END_ELEM.  The result are written to MBCSET and SBCSET.
5128      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5129      mbcset->range_ends, is a pointer argument sinse we may
5130      update it.  */
5131 
5132 static reg_errcode_t
5133 internal_function
5134 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)5135 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
5136 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
5137 # else /* not RE_ENABLE_I18N */
5138 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
5139 		 bracket_elem_t *end_elem)
5140 # endif /* not RE_ENABLE_I18N */
5141 {
5142   unsigned int start_ch, end_ch;
5143   /* Equivalence Classes and Character Classes can't be a range start/end.  */
5144   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5145 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5146 	  0))
5147     return REG_ERANGE;
5148 
5149   /* We can handle no multi character collating elements without libc
5150      support.  */
5151   if (BE ((start_elem->type == COLL_SYM
5152 	   && strlen ((char *) start_elem->opr.name) > 1)
5153 	  || (end_elem->type == COLL_SYM
5154 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
5155     return REG_ECOLLATE;
5156 
5157 # ifdef RE_ENABLE_I18N
5158   {
5159     wchar_t wc;
5160     wint_t start_wc;
5161     wint_t end_wc;
5162     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
5163 
5164     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
5165 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5166 		   : 0));
5167     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
5168 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5169 		 : 0));
5170     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
5171 		? __btowc (start_ch) : start_elem->opr.wch);
5172     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
5173 	      ? __btowc (end_ch) : end_elem->opr.wch);
5174     if (start_wc == WEOF || end_wc == WEOF)
5175       return REG_ECOLLATE;
5176     cmp_buf[0] = start_wc;
5177     cmp_buf[4] = end_wc;
5178     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
5179       return REG_ERANGE;
5180 
5181     /* Got valid collation sequence values, add them as a new entry.
5182        However, for !_LIBC we have no collation elements: if the
5183        character set is single byte, the single byte character set
5184        that we build below suffices.  parse_bracket_exp passes
5185        no MBCSET if dfa->mb_cur_max == 1.  */
5186     if (mbcset)
5187       {
5188         /* Check the space of the arrays.  */
5189         if (BE (*range_alloc == mbcset->nranges, 0))
5190           {
5191 	    /* There is not enough space, need realloc.  */
5192 	    wchar_t *new_array_start, *new_array_end;
5193 	    int new_nranges;
5194 
5195 	    /* +1 in case of mbcset->nranges is 0.  */
5196 	    new_nranges = 2 * mbcset->nranges + 1;
5197 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
5198 	       are NULL if *range_alloc == 0.  */
5199 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
5200 				          new_nranges);
5201 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
5202 				        new_nranges);
5203 
5204 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5205 	      return REG_ESPACE;
5206 
5207 	    mbcset->range_starts = new_array_start;
5208 	    mbcset->range_ends = new_array_end;
5209 	    *range_alloc = new_nranges;
5210           }
5211 
5212         mbcset->range_starts[mbcset->nranges] = start_wc;
5213         mbcset->range_ends[mbcset->nranges++] = end_wc;
5214       }
5215 
5216     /* Build the table for single byte characters.  */
5217     for (wc = 0; wc < SBC_MAX; ++wc)
5218       {
5219 	cmp_buf[2] = wc;
5220 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
5221 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
5222 	  bitset_set (sbcset, wc);
5223       }
5224   }
5225 # else /* not RE_ENABLE_I18N */
5226   {
5227     unsigned int ch;
5228     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
5229 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5230 		   : 0));
5231     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
5232 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5233 		 : 0));
5234     if (start_ch > end_ch)
5235       return REG_ERANGE;
5236     /* Build the table for single byte characters.  */
5237     for (ch = 0; ch < SBC_MAX; ++ch)
5238       if (start_ch <= ch  && ch <= end_ch)
5239 	bitset_set (sbcset, ch);
5240   }
5241 # endif /* not RE_ENABLE_I18N */
5242   return REG_NOERROR;
5243 }
5244 #endif /* not _LIBC */
5245 
5246 #ifndef _LIBC
5247 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
5248    Build the collating element which is represented by NAME.
5249    The result are written to MBCSET and SBCSET.
5250    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5251    pointer argument since we may update it.  */
5252 
5253 static reg_errcode_t
5254 internal_function
5255 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)5256 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
5257 			int *coll_sym_alloc, const unsigned char *name)
5258 # else /* not RE_ENABLE_I18N */
5259 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
5260 # endif /* not RE_ENABLE_I18N */
5261 {
5262   size_t name_len = strlen ((const char *) name);
5263   if (BE (name_len != 1, 0))
5264     return REG_ECOLLATE;
5265   else
5266     {
5267       bitset_set (sbcset, name[0]);
5268       return REG_NOERROR;
5269     }
5270 }
5271 #endif /* not _LIBC */
5272 
5273 /* This function parse bracket expression like "[abc]", "[a-c]",
5274    "[[.a-a.]]" etc.  */
5275 
5276 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)5277 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
5278 		   reg_syntax_t syntax, reg_errcode_t *err)
5279 {
5280 #ifdef _LIBC
5281   const unsigned char *collseqmb;
5282   const char *collseqwc;
5283   uint32_t nrules;
5284   int32_t table_size;
5285   const int32_t *symb_table;
5286   const unsigned char *extra;
5287 
5288   /* Local function for parse_bracket_exp used in _LIBC environement.
5289      Seek the collating symbol entry correspondings to NAME.
5290      Return the index of the symbol in the SYMB_TABLE.  */
5291 
5292   auto inline int32_t
5293   __attribute ((always_inline))
5294   seek_collating_symbol_entry (name, name_len)
5295 	 const unsigned char *name;
5296 	 size_t name_len;
5297     {
5298       int32_t hash = elem_hash ((const char *) name, name_len);
5299       int32_t elem = hash % table_size;
5300       if (symb_table[2 * elem] != 0)
5301 	{
5302 	  int32_t second = hash % (table_size - 2) + 1;
5303 
5304 	  do
5305 	    {
5306 	      /* First compare the hashing value.  */
5307 	      if (symb_table[2 * elem] == hash
5308 		  /* Compare the length of the name.  */
5309 		  && name_len == extra[symb_table[2 * elem + 1]]
5310 		  /* Compare the name.  */
5311 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
5312 			     name_len) == 0)
5313 		{
5314 		  /* Yep, this is the entry.  */
5315 		  break;
5316 		}
5317 
5318 	      /* Next entry.  */
5319 	      elem += second;
5320 	    }
5321 	  while (symb_table[2 * elem] != 0);
5322 	}
5323       return elem;
5324     }
5325 
5326   /* Local function for parse_bracket_exp used in _LIBC environement.
5327      Look up the collation sequence value of BR_ELEM.
5328      Return the value if succeeded, UINT_MAX otherwise.  */
5329 
5330   auto inline unsigned int
5331   __attribute ((always_inline))
5332   lookup_collation_sequence_value (br_elem)
5333 	 bracket_elem_t *br_elem;
5334     {
5335       if (br_elem->type == SB_CHAR)
5336 	{
5337 	  /*
5338 	  if (MB_CUR_MAX == 1)
5339 	  */
5340 	  if (nrules == 0)
5341 	    return collseqmb[br_elem->opr.ch];
5342 	  else
5343 	    {
5344 	      wint_t wc = __btowc (br_elem->opr.ch);
5345 	      return __collseq_table_lookup (collseqwc, wc);
5346 	    }
5347 	}
5348       else if (br_elem->type == MB_CHAR)
5349 	{
5350 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
5351 	}
5352       else if (br_elem->type == COLL_SYM)
5353 	{
5354 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
5355 	  if (nrules != 0)
5356 	    {
5357 	      int32_t elem, idx;
5358 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
5359 						  sym_name_len);
5360 	      if (symb_table[2 * elem] != 0)
5361 		{
5362 		  /* We found the entry.  */
5363 		  idx = symb_table[2 * elem + 1];
5364 		  /* Skip the name of collating element name.  */
5365 		  idx += 1 + extra[idx];
5366 		  /* Skip the byte sequence of the collating element.  */
5367 		  idx += 1 + extra[idx];
5368 		  /* Adjust for the alignment.  */
5369 		  idx = (idx + 3) & ~3;
5370 		  /* Skip the multibyte collation sequence value.  */
5371 		  idx += sizeof (unsigned int);
5372 		  /* Skip the wide char sequence of the collating element.  */
5373 		  idx += sizeof (unsigned int) *
5374 		    (1 + *(unsigned int *) (extra + idx));
5375 		  /* Return the collation sequence value.  */
5376 		  return *(unsigned int *) (extra + idx);
5377 		}
5378 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
5379 		{
5380 		  /* No valid character.  Match it as a single byte
5381 		     character.  */
5382 		  return collseqmb[br_elem->opr.name[0]];
5383 		}
5384 	    }
5385 	  else if (sym_name_len == 1)
5386 	    return collseqmb[br_elem->opr.name[0]];
5387 	}
5388       return UINT_MAX;
5389     }
5390 
5391   /* Local function for parse_bracket_exp used in _LIBC environement.
5392      Build the range expression which starts from START_ELEM, and ends
5393      at END_ELEM.  The result are written to MBCSET and SBCSET.
5394      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5395      mbcset->range_ends, is a pointer argument sinse we may
5396      update it.  */
5397 
5398   auto inline reg_errcode_t
5399   __attribute ((always_inline))
5400   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
5401 	 re_charset_t *mbcset;
5402 	 int *range_alloc;
5403 	 bitset_t sbcset;
5404 	 bracket_elem_t *start_elem, *end_elem;
5405     {
5406       unsigned int ch;
5407       uint32_t start_collseq;
5408       uint32_t end_collseq;
5409 
5410       /* Equivalence Classes and Character Classes can't be a range
5411 	 start/end.  */
5412       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5413 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5414 	      0))
5415 	return REG_ERANGE;
5416 
5417       start_collseq = lookup_collation_sequence_value (start_elem);
5418       end_collseq = lookup_collation_sequence_value (end_elem);
5419       /* Check start/end collation sequence values.  */
5420       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
5421 	return REG_ECOLLATE;
5422       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
5423 	return REG_ERANGE;
5424 
5425       /* Got valid collation sequence values, add them as a new entry.
5426 	 However, if we have no collation elements, and the character set
5427 	 is single byte, the single byte character set that we
5428 	 build below suffices. */
5429       if (nrules > 0 || dfa->mb_cur_max > 1)
5430 	{
5431           /* Check the space of the arrays.  */
5432           if (BE (*range_alloc == mbcset->nranges, 0))
5433 	    {
5434 	      /* There is not enough space, need realloc.  */
5435 	      uint32_t *new_array_start;
5436 	      uint32_t *new_array_end;
5437 	      int new_nranges;
5438 
5439 	      /* +1 in case of mbcset->nranges is 0.  */
5440 	      new_nranges = 2 * mbcset->nranges + 1;
5441 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
5442 					    new_nranges);
5443 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
5444 				          new_nranges);
5445 
5446 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5447 	        return REG_ESPACE;
5448 
5449 	      mbcset->range_starts = new_array_start;
5450 	      mbcset->range_ends = new_array_end;
5451 	      *range_alloc = new_nranges;
5452 	    }
5453 
5454           mbcset->range_starts[mbcset->nranges] = start_collseq;
5455           mbcset->range_ends[mbcset->nranges++] = end_collseq;
5456 	}
5457 
5458       /* Build the table for single byte characters.  */
5459       for (ch = 0; ch < SBC_MAX; ch++)
5460 	{
5461 	  uint32_t ch_collseq;
5462 	  /*
5463 	  if (MB_CUR_MAX == 1)
5464 	  */
5465 	  if (nrules == 0)
5466 	    ch_collseq = collseqmb[ch];
5467 	  else
5468 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
5469 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
5470 	    bitset_set (sbcset, ch);
5471 	}
5472       return REG_NOERROR;
5473     }
5474 
5475   /* Local function for parse_bracket_exp used in _LIBC environement.
5476      Build the collating element which is represented by NAME.
5477      The result are written to MBCSET and SBCSET.
5478      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5479      pointer argument sinse we may update it.  */
5480 
5481   auto inline reg_errcode_t
5482   __attribute ((always_inline))
5483   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
5484 	 re_charset_t *mbcset;
5485 	 int *coll_sym_alloc;
5486 	 bitset_t sbcset;
5487 	 const unsigned char *name;
5488     {
5489       int32_t elem, idx;
5490       size_t name_len = strlen ((const char *) name);
5491       if (nrules != 0)
5492 	{
5493 	  elem = seek_collating_symbol_entry (name, name_len);
5494 	  if (symb_table[2 * elem] != 0)
5495 	    {
5496 	      /* We found the entry.  */
5497 	      idx = symb_table[2 * elem + 1];
5498 	      /* Skip the name of collating element name.  */
5499 	      idx += 1 + extra[idx];
5500 	    }
5501 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
5502 	    {
5503 	      /* No valid character, treat it as a normal
5504 		 character.  */
5505 	      bitset_set (sbcset, name[0]);
5506 	      return REG_NOERROR;
5507 	    }
5508 	  else
5509 	    return REG_ECOLLATE;
5510 
5511 	  /* Got valid collation sequence, add it as a new entry.  */
5512 	  /* Check the space of the arrays.  */
5513 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
5514 	    {
5515 	      /* Not enough, realloc it.  */
5516 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
5517 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
5518 	      /* Use realloc since mbcset->coll_syms is NULL
5519 		 if *alloc == 0.  */
5520 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
5521 						   new_coll_sym_alloc);
5522 	      if (BE (new_coll_syms == NULL, 0))
5523 		return REG_ESPACE;
5524 	      mbcset->coll_syms = new_coll_syms;
5525 	      *coll_sym_alloc = new_coll_sym_alloc;
5526 	    }
5527 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
5528 	  return REG_NOERROR;
5529 	}
5530       else
5531 	{
5532 	  if (BE (name_len != 1, 0))
5533 	    return REG_ECOLLATE;
5534 	  else
5535 	    {
5536 	      bitset_set (sbcset, name[0]);
5537 	      return REG_NOERROR;
5538 	    }
5539 	}
5540     }
5541 #endif
5542 
5543   re_token_t br_token;
5544   re_bitset_ptr_t sbcset;
5545 #ifdef RE_ENABLE_I18N
5546   re_charset_t *mbcset;
5547   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
5548   int equiv_class_alloc = 0, char_class_alloc = 0;
5549 #endif /* not RE_ENABLE_I18N */
5550   int non_match = 0;
5551   bin_tree_t *work_tree;
5552   int token_len;
5553   int first_round = 1;
5554 #ifdef _LIBC
5555   collseqmb = (const unsigned char *)
5556     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
5557   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5558   if (nrules)
5559     {
5560       /*
5561       if (MB_CUR_MAX > 1)
5562       */
5563       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
5564       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
5565       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5566 						  _NL_COLLATE_SYMB_TABLEMB);
5567       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5568 						   _NL_COLLATE_SYMB_EXTRAMB);
5569     }
5570 #endif
5571   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
5572 #ifdef RE_ENABLE_I18N
5573   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
5574 #endif /* RE_ENABLE_I18N */
5575 #ifdef RE_ENABLE_I18N
5576   if (BE (sbcset == NULL || mbcset == NULL, 0))
5577 #else
5578   if (BE (sbcset == NULL, 0))
5579 #endif /* RE_ENABLE_I18N */
5580     {
5581       *err = REG_ESPACE;
5582       return NULL;
5583     }
5584 
5585   token_len = peek_token_bracket (token, regexp, syntax);
5586   if (BE (token->type == END_OF_RE, 0))
5587     {
5588       *err = REG_BADPAT;
5589       goto parse_bracket_exp_free_return;
5590     }
5591   if (token->type == OP_NON_MATCH_LIST)
5592     {
5593 #ifdef RE_ENABLE_I18N
5594       mbcset->non_match = 1;
5595 #endif /* not RE_ENABLE_I18N */
5596       non_match = 1;
5597       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5598 	bitset_set (sbcset, '\0');
5599       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
5600       token_len = peek_token_bracket (token, regexp, syntax);
5601       if (BE (token->type == END_OF_RE, 0))
5602 	{
5603 	  *err = REG_BADPAT;
5604 	  goto parse_bracket_exp_free_return;
5605 	}
5606     }
5607 
5608   /* We treat the first ']' as a normal character.  */
5609   if (token->type == OP_CLOSE_BRACKET)
5610     token->type = CHARACTER;
5611 
5612   while (1)
5613     {
5614       bracket_elem_t start_elem, end_elem;
5615       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
5616       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
5617       reg_errcode_t ret;
5618       int token_len2 = 0, is_range_exp = 0;
5619       re_token_t token2;
5620 
5621       start_elem.opr.name = start_name_buf;
5622       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
5623 				   syntax, first_round);
5624       if (BE (ret != REG_NOERROR, 0))
5625 	{
5626 	  *err = ret;
5627 	  goto parse_bracket_exp_free_return;
5628 	}
5629       first_round = 0;
5630 
5631       /* Get information about the next token.  We need it in any case.  */
5632       token_len = peek_token_bracket (token, regexp, syntax);
5633 
5634       /* Do not check for ranges if we know they are not allowed.  */
5635       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
5636 	{
5637 	  if (BE (token->type == END_OF_RE, 0))
5638 	    {
5639 	      *err = REG_EBRACK;
5640 	      goto parse_bracket_exp_free_return;
5641 	    }
5642 	  if (token->type == OP_CHARSET_RANGE)
5643 	    {
5644 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
5645 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
5646 	      if (BE (token2.type == END_OF_RE, 0))
5647 		{
5648 		  *err = REG_EBRACK;
5649 		  goto parse_bracket_exp_free_return;
5650 		}
5651 	      if (token2.type == OP_CLOSE_BRACKET)
5652 		{
5653 		  /* We treat the last '-' as a normal character.  */
5654 		  re_string_skip_bytes (regexp, -token_len);
5655 		  token->type = CHARACTER;
5656 		}
5657 	      else
5658 		is_range_exp = 1;
5659 	    }
5660 	}
5661 
5662       if (is_range_exp == 1)
5663 	{
5664 	  end_elem.opr.name = end_name_buf;
5665 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
5666 				       dfa, syntax, 1);
5667 	  if (BE (ret != REG_NOERROR, 0))
5668 	    {
5669 	      *err = ret;
5670 	      goto parse_bracket_exp_free_return;
5671 	    }
5672 
5673 	  token_len = peek_token_bracket (token, regexp, syntax);
5674 
5675 #ifdef _LIBC
5676 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
5677 				  &start_elem, &end_elem);
5678 #else
5679 # ifdef RE_ENABLE_I18N
5680 	  *err = build_range_exp (sbcset,
5681 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
5682 				  &range_alloc, &start_elem, &end_elem);
5683 # else
5684 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
5685 # endif
5686 #endif /* RE_ENABLE_I18N */
5687 	  if (BE (*err != REG_NOERROR, 0))
5688 	    goto parse_bracket_exp_free_return;
5689 	}
5690       else
5691 	{
5692 	  switch (start_elem.type)
5693 	    {
5694 	    case SB_CHAR:
5695 	      bitset_set (sbcset, start_elem.opr.ch);
5696 	      break;
5697 #ifdef RE_ENABLE_I18N
5698 	    case MB_CHAR:
5699 	      /* Check whether the array has enough space.  */
5700 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
5701 		{
5702 		  wchar_t *new_mbchars;
5703 		  /* Not enough, realloc it.  */
5704 		  /* +1 in case of mbcset->nmbchars is 0.  */
5705 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
5706 		  /* Use realloc since array is NULL if *alloc == 0.  */
5707 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
5708 					    mbchar_alloc);
5709 		  if (BE (new_mbchars == NULL, 0))
5710 		    goto parse_bracket_exp_espace;
5711 		  mbcset->mbchars = new_mbchars;
5712 		}
5713 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
5714 	      break;
5715 #endif /* RE_ENABLE_I18N */
5716 	    case EQUIV_CLASS:
5717 	      *err = build_equiv_class (sbcset,
5718 #ifdef RE_ENABLE_I18N
5719 					mbcset, &equiv_class_alloc,
5720 #endif /* RE_ENABLE_I18N */
5721 					start_elem.opr.name);
5722 	      if (BE (*err != REG_NOERROR, 0))
5723 		goto parse_bracket_exp_free_return;
5724 	      break;
5725 	    case COLL_SYM:
5726 	      *err = build_collating_symbol (sbcset,
5727 #ifdef RE_ENABLE_I18N
5728 					     mbcset, &coll_sym_alloc,
5729 #endif /* RE_ENABLE_I18N */
5730 					     start_elem.opr.name);
5731 	      if (BE (*err != REG_NOERROR, 0))
5732 		goto parse_bracket_exp_free_return;
5733 	      break;
5734 	    case CHAR_CLASS:
5735 	      *err = build_charclass (regexp->trans, sbcset,
5736 #ifdef RE_ENABLE_I18N
5737 				      mbcset, &char_class_alloc,
5738 #endif /* RE_ENABLE_I18N */
5739 				      start_elem.opr.name, syntax);
5740 	      if (BE (*err != REG_NOERROR, 0))
5741 	       goto parse_bracket_exp_free_return;
5742 	      break;
5743 	    default:
5744 	      assert (0);
5745 	      break;
5746 	    }
5747 	}
5748       if (BE (token->type == END_OF_RE, 0))
5749 	{
5750 	  *err = REG_EBRACK;
5751 	  goto parse_bracket_exp_free_return;
5752 	}
5753       if (token->type == OP_CLOSE_BRACKET)
5754 	break;
5755     }
5756 
5757   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
5758 
5759   /* If it is non-matching list.  */
5760   if (non_match)
5761     bitset_not (sbcset);
5762 
5763 #ifdef RE_ENABLE_I18N
5764   /* Ensure only single byte characters are set.  */
5765   if (dfa->mb_cur_max > 1)
5766     bitset_mask (sbcset, dfa->sb_char);
5767 
5768   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
5769       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
5770 						     || mbcset->non_match)))
5771     {
5772       bin_tree_t *mbc_tree;
5773       int sbc_idx;
5774       /* Build a tree for complex bracket.  */
5775       dfa->has_mb_node = 1;
5776       br_token.type = COMPLEX_BRACKET;
5777       br_token.opr.mbcset = mbcset;
5778       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5779       if (BE (mbc_tree == NULL, 0))
5780 	goto parse_bracket_exp_espace;
5781       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
5782 	if (sbcset[sbc_idx])
5783 	  break;
5784       /* If there are no bits set in sbcset, there is no point
5785 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
5786       if (sbc_idx < BITSET_WORDS)
5787 	{
5788           /* Build a tree for simple bracket.  */
5789           br_token.type = SIMPLE_BRACKET;
5790           br_token.opr.sbcset = sbcset;
5791           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5792           if (BE (work_tree == NULL, 0))
5793             goto parse_bracket_exp_espace;
5794 
5795           /* Then join them by ALT node.  */
5796           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
5797           if (BE (work_tree == NULL, 0))
5798             goto parse_bracket_exp_espace;
5799 	}
5800       else
5801 	{
5802 	  re_free (sbcset);
5803 	  work_tree = mbc_tree;
5804 	}
5805     }
5806   else
5807 #endif /* not RE_ENABLE_I18N */
5808     {
5809 #ifdef RE_ENABLE_I18N
5810       free_charset (mbcset);
5811 #endif
5812       /* Build a tree for simple bracket.  */
5813       br_token.type = SIMPLE_BRACKET;
5814       br_token.opr.sbcset = sbcset;
5815       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5816       if (BE (work_tree == NULL, 0))
5817         goto parse_bracket_exp_espace;
5818     }
5819   return work_tree;
5820 
5821  parse_bracket_exp_espace:
5822   *err = REG_ESPACE;
5823  parse_bracket_exp_free_return:
5824   re_free (sbcset);
5825 #ifdef RE_ENABLE_I18N
5826   free_charset (mbcset);
5827 #endif /* RE_ENABLE_I18N */
5828   return NULL;
5829 }
5830 
5831 /* Parse an element in the bracket expression.  */
5832 
5833 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)5834 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
5835 		       re_token_t *token, int token_len, re_dfa_t *dfa,
5836 		       reg_syntax_t syntax, int accept_hyphen)
5837 {
5838 #ifdef RE_ENABLE_I18N
5839   int cur_char_size;
5840   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
5841   if (cur_char_size > 1)
5842     {
5843       elem->type = MB_CHAR;
5844       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
5845       re_string_skip_bytes (regexp, cur_char_size);
5846       return REG_NOERROR;
5847     }
5848 #endif /* RE_ENABLE_I18N */
5849   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
5850   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
5851       || token->type == OP_OPEN_EQUIV_CLASS)
5852     return parse_bracket_symbol (elem, regexp, token);
5853   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
5854     {
5855       /* A '-' must only appear as anything but a range indicator before
5856 	 the closing bracket.  Everything else is an error.  */
5857       re_token_t token2;
5858       (void) peek_token_bracket (&token2, regexp, syntax);
5859       if (token2.type != OP_CLOSE_BRACKET)
5860 	/* The actual error value is not standardized since this whole
5861 	   case is undefined.  But ERANGE makes good sense.  */
5862 	return REG_ERANGE;
5863     }
5864   elem->type = SB_CHAR;
5865   elem->opr.ch = token->opr.c;
5866   return REG_NOERROR;
5867 }
5868 
5869 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
5870    such as [:<character_class>:], [.<collating_element>.], and
5871    [=<equivalent_class>=].  */
5872 
5873 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)5874 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
5875 		      re_token_t *token)
5876 {
5877   unsigned char ch, delim = token->opr.c;
5878   int i = 0;
5879   if (re_string_eoi(regexp))
5880     return REG_EBRACK;
5881   for (;; ++i)
5882     {
5883       if (i >= BRACKET_NAME_BUF_SIZE)
5884 	return REG_EBRACK;
5885       if (token->type == OP_OPEN_CHAR_CLASS)
5886 	ch = re_string_fetch_byte_case (regexp);
5887       else
5888 	ch = re_string_fetch_byte (regexp);
5889       if (re_string_eoi(regexp))
5890 	return REG_EBRACK;
5891       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
5892 	break;
5893       elem->opr.name[i] = ch;
5894     }
5895   re_string_skip_bytes (regexp, 1);
5896   elem->opr.name[i] = '\0';
5897   switch (token->type)
5898     {
5899     case OP_OPEN_COLL_ELEM:
5900       elem->type = COLL_SYM;
5901       break;
5902     case OP_OPEN_EQUIV_CLASS:
5903       elem->type = EQUIV_CLASS;
5904       break;
5905     case OP_OPEN_CHAR_CLASS:
5906       elem->type = CHAR_CLASS;
5907       break;
5908     default:
5909       break;
5910     }
5911   return REG_NOERROR;
5912 }
5913 
  /* Helper function for parse_bracket_exp.
     Build the equivalence class which is represented by NAME.
     The results are written to MBCSET and SBCSET.
     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes;
     it is a pointer argument since we may update it.

     Under _LIBC with a locale that has collation rules (nrules != 0),
     every single byte whose weight sequence equals that of NAME is set
     in SBCSET and NAME's table index is appended to
     mbcset->equiv_classes.  Otherwise NAME must be exactly one byte
     long and only that byte is set in SBCSET.

     Returns REG_NOERROR on success, REG_ECOLLATE if NAME is not a
     valid element, REG_ESPACE if growing the array fails.  */

static reg_errcode_t
#ifdef RE_ENABLE_I18N
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
		   int *equiv_class_alloc, const unsigned char *name)
#else /* not RE_ENABLE_I18N */
build_equiv_class (bitset_t sbcset, const unsigned char *name)
#endif /* not RE_ENABLE_I18N */
{
#ifdef _LIBC
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
  if (nrules != 0)
    {
      const int32_t *table, *indirect;
      const unsigned char *weights, *extra, *cp;
      unsigned char char_buf[2];
      int32_t idx1, idx2;
      unsigned int ch;
      size_t len;
      /* This #include defines a local function (findidx, used below)!  */
# include <locale/weight.h>
      /* Calculate the index for equivalence class.  */
      cp = name;
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
					       _NL_COLLATE_WEIGHTMB);
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
						   _NL_COLLATE_EXTRAMB);
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
						_NL_COLLATE_INDIRECTMB);
      idx1 = findidx (&cp);
      /* Reject NAME if no index was found or if findidx left part of
	 NAME unconsumed.  */
      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
	/* This isn't a valid character.  */
	return REG_ECOLLATE;

      /* Build single byte matching table for this equivalence class.  */
      char_buf[1] = (unsigned char) '\0';
      len = weights[idx1];
      for (ch = 0; ch < SBC_MAX; ++ch)
	{
	  /* Look up the one-byte string { ch, '\0' }.  */
	  char_buf[0] = ch;
	  cp = char_buf;
	  idx2 = findidx (&cp);
/*
	  idx2 = table[ch];
*/
	  if (idx2 == 0)
	    /* This isn't a valid character.  */
	    continue;
	  /* The byte at the index is used as a length; only entries of
	     equal length are compared byte by byte.  */
	  if (len == weights[idx2])
	    {
	      int cnt = 0;
	      while (cnt <= len &&
		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
		++cnt;

	      /* cnt only exceeds len when every compared byte matched.  */
	      if (cnt > len)
		bitset_set (sbcset, ch);
	    }
	}
      /* Check whether the array has enough space.  */
      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
	{
	  /* Not enough, realloc it.  */
	  /* +1 in case of mbcset->nequiv_classes is 0.  */
	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
	  /* Use realloc since the array is NULL if *alloc == 0.  */
	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
						   int32_t,
						   new_equiv_class_alloc);
	  /* On failure the old array is left in place for the caller
	     to release.  */
	  if (BE (new_equiv_classes == NULL, 0))
	    return REG_ESPACE;
	  mbcset->equiv_classes = new_equiv_classes;
	  *equiv_class_alloc = new_equiv_class_alloc;
	}
      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
    }
  else
#endif /* _LIBC */
    {
      /* No collation rules: an equivalence class is just the single
	 byte itself.  */
      if (BE (strlen ((const char *) name) != 1, 0))
	return REG_ECOLLATE;
      bitset_set (sbcset, *name);
    }
  return REG_NOERROR;
}
6005 
6006   /* Helper function for parse_bracket_exp.
6007      Build the character class which is represented by NAME.
6008      The result are written to MBCSET and SBCSET.
6009      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
6010      is a pointer argument sinse we may update it.  */
6011 
6012 static reg_errcode_t
6013 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)6014 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6015 		 re_charset_t *mbcset, int *char_class_alloc,
6016 		 const unsigned char *class_name, reg_syntax_t syntax)
6017 #else /* not RE_ENABLE_I18N */
6018 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6019 		 const unsigned char *class_name, reg_syntax_t syntax)
6020 #endif /* not RE_ENABLE_I18N */
6021 {
6022   int i;
6023   const char *name = (const char *) class_name;
6024 
6025   /* In case of REG_ICASE "upper" and "lower" match the both of
6026      upper and lower cases.  */
6027   if ((syntax & RE_ICASE)
6028       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
6029     name = "alpha";
6030 
6031 #ifdef RE_ENABLE_I18N
6032   /* Check the space of the arrays.  */
6033   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
6034     {
6035       /* Not enough, realloc it.  */
6036       /* +1 in case of mbcset->nchar_classes is 0.  */
6037       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
6038       /* Use realloc since array is NULL if *alloc == 0.  */
6039       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
6040 					       new_char_class_alloc);
6041       if (BE (new_char_classes == NULL, 0))
6042 	return REG_ESPACE;
6043       mbcset->char_classes = new_char_classes;
6044       *char_class_alloc = new_char_class_alloc;
6045     }
6046   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
6047 #endif /* RE_ENABLE_I18N */
6048 
6049 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
6050   do {						\
6051     if (BE (trans != NULL, 0))			\
6052       {						\
6053 	for (i = 0; i < SBC_MAX; ++i)		\
6054   	  if (ctype_func (i))			\
6055 	    bitset_set (sbcset, trans[i]);	\
6056       }						\
6057     else					\
6058       {						\
6059 	for (i = 0; i < SBC_MAX; ++i)		\
6060   	  if (ctype_func (i))			\
6061 	    bitset_set (sbcset, i);		\
6062       }						\
6063   } while (0)
6064 
6065   if (strcmp (name, "alnum") == 0)
6066     BUILD_CHARCLASS_LOOP (isalnum);
6067   else if (strcmp (name, "cntrl") == 0)
6068     BUILD_CHARCLASS_LOOP (iscntrl);
6069   else if (strcmp (name, "lower") == 0)
6070     BUILD_CHARCLASS_LOOP (islower);
6071   else if (strcmp (name, "space") == 0)
6072     BUILD_CHARCLASS_LOOP (isspace);
6073   else if (strcmp (name, "alpha") == 0)
6074     BUILD_CHARCLASS_LOOP (isalpha);
6075   else if (strcmp (name, "digit") == 0)
6076     BUILD_CHARCLASS_LOOP (isdigit);
6077   else if (strcmp (name, "print") == 0)
6078     BUILD_CHARCLASS_LOOP (isprint);
6079   else if (strcmp (name, "upper") == 0)
6080     BUILD_CHARCLASS_LOOP (isupper);
6081   else if (strcmp (name, "blank") == 0)
6082     BUILD_CHARCLASS_LOOP (isblank);
6083   else if (strcmp (name, "graph") == 0)
6084     BUILD_CHARCLASS_LOOP (isgraph);
6085   else if (strcmp (name, "punct") == 0)
6086     BUILD_CHARCLASS_LOOP (ispunct);
6087   else if (strcmp (name, "xdigit") == 0)
6088     BUILD_CHARCLASS_LOOP (isxdigit);
6089   else
6090     return REG_ECTYPE;
6091 
6092   return REG_NOERROR;
6093 }
6094 
6095 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)6096 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
6097 		    const unsigned char *class_name,
6098 		    const unsigned char *extra, int non_match,
6099 		    reg_errcode_t *err)
6100 {
6101   re_bitset_ptr_t sbcset;
6102 #ifdef RE_ENABLE_I18N
6103   re_charset_t *mbcset;
6104   int alloc = 0;
6105 #endif /* not RE_ENABLE_I18N */
6106   reg_errcode_t ret;
6107   re_token_t br_token;
6108   bin_tree_t *tree;
6109 
6110   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
6111 #ifdef RE_ENABLE_I18N
6112   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
6113 #endif /* RE_ENABLE_I18N */
6114 
6115 #ifdef RE_ENABLE_I18N
6116   if (BE (sbcset == NULL || mbcset == NULL, 0))
6117 #else /* not RE_ENABLE_I18N */
6118   if (BE (sbcset == NULL, 0))
6119 #endif /* not RE_ENABLE_I18N */
6120     {
6121       *err = REG_ESPACE;
6122       return NULL;
6123     }
6124 
6125   if (non_match)
6126     {
6127 #ifdef RE_ENABLE_I18N
6128       /*
6129       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
6130 	bitset_set(cset->sbcset, '\0');
6131       */
6132       mbcset->non_match = 1;
6133 #endif /* not RE_ENABLE_I18N */
6134     }
6135 
6136   /* We don't care the syntax in this case.  */
6137   ret = build_charclass (trans, sbcset,
6138 #ifdef RE_ENABLE_I18N
6139 			 mbcset, &alloc,
6140 #endif /* RE_ENABLE_I18N */
6141 			 class_name, 0);
6142 
6143   if (BE (ret != REG_NOERROR, 0))
6144     {
6145       re_free (sbcset);
6146 #ifdef RE_ENABLE_I18N
6147       free_charset (mbcset);
6148 #endif /* RE_ENABLE_I18N */
6149       *err = ret;
6150       return NULL;
6151     }
6152   /* \w match '_' also.  */
6153   for (; *extra; extra++)
6154     bitset_set (sbcset, *extra);
6155 
6156   /* If it is non-matching list.  */
6157   if (non_match)
6158     bitset_not (sbcset);
6159 
6160 #ifdef RE_ENABLE_I18N
6161   /* Ensure only single byte characters are set.  */
6162   if (dfa->mb_cur_max > 1)
6163     bitset_mask (sbcset, dfa->sb_char);
6164 #endif
6165 
6166   /* Build a tree for simple bracket.  */
6167   br_token.type = SIMPLE_BRACKET;
6168   br_token.opr.sbcset = sbcset;
6169   tree = create_token_tree (dfa, NULL, NULL, &br_token);
6170   if (BE (tree == NULL, 0))
6171     goto build_word_op_espace;
6172 
6173 #ifdef RE_ENABLE_I18N
6174   if (dfa->mb_cur_max > 1)
6175     {
6176       bin_tree_t *mbc_tree;
6177       /* Build a tree for complex bracket.  */
6178       br_token.type = COMPLEX_BRACKET;
6179       br_token.opr.mbcset = mbcset;
6180       dfa->has_mb_node = 1;
6181       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
6182       if (BE (mbc_tree == NULL, 0))
6183 	goto build_word_op_espace;
6184       /* Then join them by ALT node.  */
6185       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
6186       if (BE (mbc_tree != NULL, 1))
6187 	return tree;
6188     }
6189   else
6190     {
6191       free_charset (mbcset);
6192       return tree;
6193     }
6194 #else /* not RE_ENABLE_I18N */
6195   return tree;
6196 #endif /* not RE_ENABLE_I18N */
6197 
6198  build_word_op_espace:
6199   re_free (sbcset);
6200 #ifdef RE_ENABLE_I18N
6201   free_charset (mbcset);
6202 #endif /* RE_ENABLE_I18N */
6203   *err = REG_ESPACE;
6204   return NULL;
6205 }
6206 
6207 /* This is intended for the expressions like "a{1,3}".
6208    Fetch a number from `input', and return the number.
6209    Return -1, if the number field is empty like "{,1}".
6210    Return -2, If an error is occured.  */
6211 
6212 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)6213 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
6214 {
6215   int num = -1;
6216   unsigned char c;
6217   while (1)
6218     {
6219       fetch_token (token, input, syntax);
6220       c = token->opr.c;
6221       if (BE (token->type == END_OF_RE, 0))
6222 	return -2;
6223       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
6224 	break;
6225       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
6226 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
6227       num = (num > RE_DUP_MAX) ? -2 : num;
6228     }
6229   return num;
6230 }
6231 
6232 #ifdef RE_ENABLE_I18N
6233 static void
free_charset(re_charset_t * cset)6234 free_charset (re_charset_t *cset)
6235 {
6236   re_free (cset->mbchars);
6237 # ifdef _LIBC
6238   re_free (cset->coll_syms);
6239   re_free (cset->equiv_classes);
6240   re_free (cset->range_starts);
6241   re_free (cset->range_ends);
6242 # endif
6243   re_free (cset->char_classes);
6244   re_free (cset);
6245 }
6246 #endif /* RE_ENABLE_I18N */
6247 
6248 /* Functions for binary tree operation.  */
6249 
6250 /* Create a tree node.  */
6251 
6252 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)6253 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6254 	     re_token_type_t type)
6255 {
6256   re_token_t t;
6257   t.type = type;
6258   return create_token_tree (dfa, left, right, &t);
6259 }
6260 
6261 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)6262 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6263 		   const re_token_t *token)
6264 {
6265   bin_tree_t *tree;
6266   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
6267     {
6268       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
6269 
6270       if (storage == NULL)
6271 	return NULL;
6272       storage->next = dfa->str_tree_storage;
6273       dfa->str_tree_storage = storage;
6274       dfa->str_tree_storage_idx = 0;
6275     }
6276   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
6277 
6278   tree->parent = NULL;
6279   tree->left = left;
6280   tree->right = right;
6281   tree->token = *token;
6282   tree->token.duplicated = 0;
6283   tree->token.opt_subexp = 0;
6284   tree->first = NULL;
6285   tree->next = NULL;
6286   tree->node_idx = -1;
6287 
6288   if (left != NULL)
6289     left->parent = tree;
6290   if (right != NULL)
6291     right->parent = tree;
6292   return tree;
6293 }
6294 
6295 /* Mark the tree SRC as an optional subexpression.
6296    To be called from preorder or postorder.  */
6297 
6298 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)6299 mark_opt_subexp (void *extra, bin_tree_t *node)
6300 {
6301   int idx = (int) (long) extra;
6302   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
6303     node->token.opt_subexp = 1;
6304 
6305   return REG_NOERROR;
6306 }
6307 
6308 /* Free the allocated memory inside NODE. */
6309 
6310 static void
free_token(re_token_t * node)6311 free_token (re_token_t *node)
6312 {
6313 #ifdef RE_ENABLE_I18N
6314   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
6315     free_charset (node->opr.mbcset);
6316   else
6317 #endif /* RE_ENABLE_I18N */
6318     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
6319       re_free (node->opr.sbcset);
6320 }
6321 
6322 /* Worker function for tree walking.  Free the allocated memory inside NODE
6323    and its children. */
6324 
6325 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)6326 free_tree (void *extra, bin_tree_t *node)
6327 {
6328   free_token (&node->token);
6329   return REG_NOERROR;
6330 }
6331 
6332 
/* Duplicate the tree rooted at ROOT, allocating the new nodes from DFA,
   and return the root of the copy, or NULL if a node cannot be
   created.  This is a preorder visit similar to the one implemented by
   the generic visitor, but we need more infrastructure to maintain two
   parallel trees --- so, it's easier to duplicate.

   Every copied token has its `duplicated' flag set; free_token skips
   such tokens, so the bracket sets stay owned by the original nodes.
   The copy's root gets ROOT's parent as its parent pointer.

   NOTE(review): the upward walk below only stops when NODE becomes
   NULL, so it climbs through ROOT's own ancestors as well.  This looks
   safe only if no ancestor on that path has an unvisited right child
   -- confirm against the callers.  */

static bin_tree_t *
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
{
  const bin_tree_t *node;
  bin_tree_t *dup_root;
  /* P_NEW is the link in the copy that the next new node is stored
     through; DUP_NODE is the copy-side counterpart of NODE's parent.  */
  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;

  for (node = root; ; )
    {
      /* Create a new tree and link it back to the current parent.  */
      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
      if (*p_new == NULL)
	return NULL;
      (*p_new)->parent = dup_node;
      (*p_new)->token.duplicated = 1;
      dup_node = *p_new;

      /* Go to the left node, or up and to the right.  */
      if (node->left)
	{
	  node = node->left;
	  p_new = &dup_node->left;
	}
      else
	{
	  /* Climb, in both trees in lockstep, while there is no right
	     child or the right child is where we just came from.  */
	  const bin_tree_t *prev = NULL;
	  while (node->right == prev || node->right == NULL)
	    {
	      prev = node;
	      node = node->parent;
	      dup_node = dup_node->parent;
	      if (!node)
	        return dup_root;
	    }
	  node = node->right;
	  p_new = &dup_node->right;
	}
    }
}
6377 
6378 /******************************************************************************/
6379 /******************************************************************************/
6380 /******************************************************************************/
6381 /* GKINCLUDE #include "regexec.c" */
6382 /******************************************************************************/
6383 /******************************************************************************/
6384 /******************************************************************************/
6385 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
6386 				     int n) internal_function;
6387 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
6388 static void match_ctx_free (re_match_context_t *cache) internal_function;
6389 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
6390 					  int str_idx, int from, int to)
6391      internal_function;
6392 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
6393      internal_function;
6394 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
6395 					   int str_idx) internal_function;
6396 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
6397 						   int node, int str_idx)
6398      internal_function;
6399 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
6400 			   re_dfastate_t **limited_sts, int last_node,
6401 			   int last_str_idx)
6402      internal_function;
6403 static reg_errcode_t re_search_internal (const regex_t *preg,
6404 					 const char *string, int length,
6405 					 int start, int range, int stop,
6406 					 size_t nmatch, regmatch_t pmatch[],
6407 					 int eflags) internal_function;
6408 static int re_search_2_stub (struct re_pattern_buffer *bufp,
6409 			     const char *string1, int length1,
6410 			     const char *string2, int length2,
6411 			     int start, int range, struct re_registers *regs,
6412 			     int stop, int ret_len) internal_function;
6413 static int re_search_stub (struct re_pattern_buffer *bufp,
6414 			   const char *string, int length, int start,
6415 			   int range, int stop, struct re_registers *regs,
6416 			   int ret_len) internal_function;
6417 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
6418 			      int nregs, int regs_allocated) internal_function;
6419 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
6420      internal_function;
6421 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
6422 			   int *p_match_first) internal_function;
6423 static int check_halt_state_context (const re_match_context_t *mctx,
6424 				     const re_dfastate_t *state, int idx)
6425      internal_function;
6426 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
6427 			 regmatch_t *prev_idx_match, int cur_node,
6428 			 int cur_idx, int nmatch) internal_function;
6429 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
6430 				      int str_idx, int dest_node, int nregs,
6431 				      regmatch_t *regs,
6432 				      re_node_set *eps_via_nodes)
6433      internal_function;
6434 static reg_errcode_t set_regs (const regex_t *preg,
6435 			       const re_match_context_t *mctx,
6436 			       size_t nmatch, regmatch_t *pmatch,
6437 			       int fl_backtrack) internal_function;
6438 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
6439      internal_function;
6440 
6441 #ifdef RE_ENABLE_I18N
6442 static int sift_states_iter_mb (const re_match_context_t *mctx,
6443 				re_sift_context_t *sctx,
6444 				int node_idx, int str_idx, int max_str_idx)
6445      internal_function;
6446 #endif /* RE_ENABLE_I18N */
6447 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
6448 					   re_sift_context_t *sctx)
6449      internal_function;
6450 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
6451 					  re_sift_context_t *sctx, int str_idx,
6452 					  re_node_set *cur_dest)
6453      internal_function;
6454 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
6455 					      re_sift_context_t *sctx,
6456 					      int str_idx,
6457 					      re_node_set *dest_nodes)
6458      internal_function;
6459 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
6460 					    re_node_set *dest_nodes,
6461 					    const re_node_set *candidates)
6462      internal_function;
6463 static int check_dst_limits (const re_match_context_t *mctx,
6464 			     re_node_set *limits,
6465 			     int dst_node, int dst_idx, int src_node,
6466 			     int src_idx) internal_function;
6467 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
6468 					int boundaries, int subexp_idx,
6469 					int from_node, int bkref_idx)
6470      internal_function;
6471 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
6472 				      int limit, int subexp_idx,
6473 				      int node, int str_idx,
6474 				      int bkref_idx) internal_function;
6475 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
6476 					  re_node_set *dest_nodes,
6477 					  const re_node_set *candidates,
6478 					  re_node_set *limits,
6479 					  struct re_backref_cache_entry *bkref_ents,
6480 					  int str_idx) internal_function;
6481 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
6482 					re_sift_context_t *sctx,
6483 					int str_idx, const re_node_set *candidates)
6484      internal_function;
6485 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
6486 					re_dfastate_t **dst,
6487 					re_dfastate_t **src, int num)
6488      internal_function;
6489 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
6490 					 re_match_context_t *mctx) internal_function;
6491 static re_dfastate_t *transit_state (reg_errcode_t *err,
6492 				     re_match_context_t *mctx,
6493 				     re_dfastate_t *state) internal_function;
6494 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
6495 					    re_match_context_t *mctx,
6496 					    re_dfastate_t *next_state)
6497      internal_function;
6498 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6499 						re_node_set *cur_nodes,
6500 						int str_idx) internal_function;
6501 #if 0
6502 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
6503 					re_match_context_t *mctx,
6504 					re_dfastate_t *pstate)
6505      internal_function;
6506 #endif
6507 #ifdef RE_ENABLE_I18N
6508 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
6509 				       re_dfastate_t *pstate)
6510      internal_function;
6511 #endif /* RE_ENABLE_I18N */
6512 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
6513 					  const re_node_set *nodes)
6514      internal_function;
6515 static reg_errcode_t get_subexp (re_match_context_t *mctx,
6516 				 int bkref_node, int bkref_str_idx)
6517      internal_function;
6518 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
6519 				     const re_sub_match_top_t *sub_top,
6520 				     re_sub_match_last_t *sub_last,
6521 				     int bkref_node, int bkref_str)
6522      internal_function;
6523 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
6524 			     int subexp_idx, int type) internal_function;
6525 static reg_errcode_t check_arrival (re_match_context_t *mctx,
6526 				    state_array_t *path, int top_node,
6527 				    int top_str, int last_node, int last_str,
6528 				    int type) internal_function;
6529 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6530 						   int str_idx,
6531 						   re_node_set *cur_nodes,
6532 						   re_node_set *next_nodes)
6533      internal_function;
6534 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
6535 					       re_node_set *cur_nodes,
6536 					       int ex_subexp, int type)
6537      internal_function;
6538 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
6539 						   re_node_set *dst_nodes,
6540 						   int target, int ex_subexp,
6541 						   int type) internal_function;
6542 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6543 					 re_node_set *cur_nodes, int cur_str,
6544 					 int subexp_num, int type)
6545      internal_function;
6546 static int build_trtable (const re_dfa_t *dfa,
6547 			  re_dfastate_t *state) internal_function;
6548 #ifdef RE_ENABLE_I18N
6549 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
6550 				    const re_string_t *input, int idx)
6551      internal_function;
6552 # ifdef _LIBC
6553 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
6554 						   size_t name_len)
6555      internal_function;
6556 # endif /* _LIBC */
6557 #endif /* RE_ENABLE_I18N */
6558 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
6559 				       const re_dfastate_t *state,
6560 				       re_node_set *states_node,
6561 				       bitset_t *states_ch) internal_function;
6562 static int check_node_accept (const re_match_context_t *mctx,
6563 			      const re_token_t *node, int idx)
6564      internal_function;
6565 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
6566      internal_function;
6567 
6568 /* Entry point for POSIX code.  */
6569 
6570 /* regexec searches for a given pattern, specified by PREG, in the
6571    string STRING.
6572 
6573    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6574    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
6575    least NMATCH elements, and we set them to the offsets of the
6576    corresponding matched substrings.
6577 
6578    EFLAGS specifies `execution flags' which affect matching: if
6579    REG_NOTBOL is set, then ^ does not match at the beginning of the
6580    string; if REG_NOTEOL is set, then $ does not match at the end.
6581 
6582    We return 0 if we find a match and REG_NOMATCH if not.  */
6583 
6584 int
regexec(preg,string,nmatch,pmatch,eflags)6585 regexec (preg, string, nmatch, pmatch, eflags)
6586     const regex_t *__restrict preg;
6587     const char *__restrict string;
6588     size_t nmatch;
6589     regmatch_t pmatch[];
6590     int eflags;
6591 {
6592   reg_errcode_t err;
6593   int start, length;
6594   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
6595 
6596   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
6597     return REG_BADPAT;
6598 
6599   if (eflags & REG_STARTEND)
6600     {
6601       start = pmatch[0].rm_so;
6602       length = pmatch[0].rm_eo;
6603     }
6604   else
6605     {
6606       start = 0;
6607       length = strlen (string);
6608     }
6609 
6610   __libc_lock_lock (dfa->lock);
6611   if (preg->no_sub)
6612     err = re_search_internal (preg, string, length, start, length - start,
6613 			      length, 0, NULL, eflags);
6614   else
6615     err = re_search_internal (preg, string, length, start, length - start,
6616 			      length, nmatch, pmatch, eflags);
6617   __libc_lock_unlock (dfa->lock);
6618   return err != REG_NOERROR;
6619 }
6620 
#ifdef _LIBC
# include <shlib-compat.h>
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);

# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
__typeof__ (__regexec) __compat_regexec;

/* Entry point bound to the GLIBC_2_0 version of the regexec symbol.
   It forwards to regexec after masking EFLAGS down to REG_NOTBOL and
   REG_NOTEOL, so every other bit the caller passed is dropped.  */
int
attribute_compat_text_section
__compat_regexec (const regex_t *__restrict preg,
		  const char *__restrict string, size_t nmatch,
		  regmatch_t pmatch[], int eflags)
{
  const int legacy_flags = eflags & (REG_NOTBOL | REG_NOTEOL);

  return regexec (preg, string, nmatch, pmatch, legacy_flags);
}
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
# endif
#endif
6640 
6641 /* Entry points for GNU code.  */
6642 
6643 /* re_match, re_search, re_match_2, re_search_2
6644 
   The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and STRING2
   with lengths LENGTH1 and LENGTH2, respectively.
6648 
6649    re_match() matches the compiled pattern in BUFP against the string,
6650    starting at index START.
6651 
6652    re_search() first tries matching at index START, then it tries to match
6653    starting from index START + 1, and so on.  The last start position tried
6654    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
6655    way as re_match().)
6656 
   The parameter STOP of re_{match,search}_2 specifies that no match extending
   beyond the first STOP characters of the concatenation of the strings is
   considered.
6660 
   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and all groups are stored in REGS.  (For the "_2" variants, the offsets are
   computed relative to the concatenation, not relative to the individual
   strings.)
6665 
6666    On success, re_match* functions return the length of the match, re_search*
6667    return the position of the start of the match.  Return value -1 means no
6668    match was found and -2 indicates an internal error.  */
6669 
/* Try the pattern in BUFP against STRING (LENGTH bytes) at index START
   only.  Delegates to re_search_stub with a RANGE of 0, STOP equal to
   LENGTH, and RET_LEN nonzero so that the match length is returned.  */
int
re_match (struct re_pattern_buffer *bufp, const char *string, int length,
	  int start, struct re_registers *regs)
{
  const int no_range = 0;
  const int want_length = 1;

  return re_search_stub (bufp, string, length, start, no_range, length, regs,
			 want_length);
}
#ifdef _LIBC
weak_alias (__re_match, re_match)
#endif
6682 
/* Search STRING (LENGTH bytes) for the pattern in BUFP, beginning at
   START and covering RANGE further start positions.  Delegates to
   re_search_stub with STOP equal to LENGTH and RET_LEN zero so that the
   match position is returned.  */
int
re_search (struct re_pattern_buffer *bufp, const char *string, int length,
	   int start, int range, struct re_registers *regs)
{
  const int want_length = 0;

  return re_search_stub (bufp, string, length, start, range, length, regs,
			 want_length);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
6695 
/* Two-string variant of re_match: hands STRING1/STRING2 to
   re_search_2_stub with a RANGE of 0 and RET_LEN nonzero, so the match
   length is returned.  */
int
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
	    const char *string2, int length2, int start,
	    struct re_registers *regs, int stop)
{
  const int no_range = 0;
  const int want_length = 1;

  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, no_range, regs, stop, want_length);
}
#ifdef _LIBC
weak_alias (__re_match_2, re_match_2)
#endif
6709 
/* Two-string variant of re_search: hands STRING1/STRING2 to
   re_search_2_stub with the caller's RANGE and RET_LEN zero, so the
   match position is returned.  */
int
re_search_2 (struct re_pattern_buffer *bufp, const char *string1,
	     int length1, const char *string2, int length2, int start,
	     int range, struct re_registers *regs, int stop)
{
  const int want_length = 0;

  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, range, regs, stop, want_length);
}
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
6723 
6724 static int
6725 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
6726 		  stop, ret_len)
6727     struct re_pattern_buffer *bufp;
6728     const char *string1, *string2;
6729     int length1, length2, start, range, stop, ret_len;
6730     struct re_registers *regs;
6731 {
6732   const char *str;
6733   int rval;
6734   int len = length1 + length2;
6735   int free_str = 0;
6736 
6737   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
6738     return -2;
6739 
6740   /* Concatenate the strings.  */
6741   if (length2 > 0)
6742     if (length1 > 0)
6743       {
6744 	char *s = re_malloc (char, len);
6745 
6746 	if (BE (s == NULL, 0))
6747 	  return -2;
6748 #ifdef _LIBC
6749 	memcpy (__mempcpy (s, string1, length1), string2, length2);
6750 #else
6751 	memcpy (s, string1, length1);
6752 	memcpy (s + length1, string2, length2);
6753 #endif
6754 	str = s;
6755 	free_str = 1;
6756       }
6757     else
6758       str = string2;
6759   else
6760     str = string1;
6761 
6762   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
6763 			 ret_len);
6764   if (free_str)
6765     re_free ((char *) str);
6766   return rval;
6767 }
6768 
6769 /* The parameters have the same meaning as those of re_search.
6770    Additional parameters:
6771    If RET_LEN is nonzero the length of the match is returned (re_match style);
6772    otherwise the position of the match is returned.  */
6773 
6774 static int
re_search_stub(bufp,string,length,start,range,stop,regs,ret_len)6775 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
6776     struct re_pattern_buffer *bufp;
6777     const char *string;
6778     int length, start, range, stop, ret_len;
6779     struct re_registers *regs;
6780 {
6781   reg_errcode_t result;
6782   regmatch_t *pmatch;
6783   int nregs, rval;
6784   int eflags = 0;
6785   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
6786 
6787   /* Check for out-of-range.  */
6788   if (BE (start < 0 || start > length, 0))
6789     return -1;
6790   if (BE (start + range > length, 0))
6791     range = length - start;
6792   else if (BE (start + range < 0, 0))
6793     range = -start;
6794 
6795   __libc_lock_lock (dfa->lock);
6796 
6797   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
6798   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
6799 
6800   /* Compile fastmap if we haven't yet.  */
6801   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
6802     re_compile_fastmap (bufp);
6803 
6804   if (BE (bufp->no_sub, 0))
6805     regs = NULL;
6806 
6807   /* We need at least 1 register.  */
6808   if (regs == NULL)
6809     nregs = 1;
6810   else if (BE (bufp->regs_allocated == REGS_FIXED &&
6811 	       regs->num_regs < bufp->re_nsub + 1, 0))
6812     {
6813       nregs = regs->num_regs;
6814       if (BE (nregs < 1, 0))
6815 	{
6816 	  /* Nothing can be copied to regs.  */
6817 	  regs = NULL;
6818 	  nregs = 1;
6819 	}
6820     }
6821   else
6822     nregs = bufp->re_nsub + 1;
6823   pmatch = re_malloc (regmatch_t, nregs);
6824   if (BE (pmatch == NULL, 0))
6825     {
6826       rval = -2;
6827       goto out;
6828     }
6829 
6830   result = re_search_internal (bufp, string, length, start, range, stop,
6831 			       nregs, pmatch, eflags);
6832 
6833   rval = 0;
6834 
6835   /* I hope we needn't fill ther regs with -1's when no match was found.  */
6836   if (result != REG_NOERROR)
6837     rval = -1;
6838   else if (regs != NULL)
6839     {
6840       /* If caller wants register contents data back, copy them.  */
6841       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
6842 					   bufp->regs_allocated);
6843       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
6844 	rval = -2;
6845     }
6846 
6847   if (BE (rval == 0, 1))
6848     {
6849       if (ret_len)
6850 	{
6851 	  assert (pmatch[0].rm_so == start);
6852 	  rval = pmatch[0].rm_eo - start;
6853 	}
6854       else
6855 	rval = pmatch[0].rm_so;
6856     }
6857   re_free (pmatch);
6858  out:
6859   __libc_lock_unlock (dfa->lock);
6860   return rval;
6861 }
6862 
6863 static unsigned
re_copy_regs(regs,pmatch,nregs,regs_allocated)6864 re_copy_regs (regs, pmatch, nregs, regs_allocated)
6865     struct re_registers *regs;
6866     regmatch_t *pmatch;
6867     int nregs, regs_allocated;
6868 {
6869   int rval = REGS_REALLOCATE;
6870   int i;
6871   int need_regs = nregs + 1;
6872   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
6873      uses.  */
6874 
6875   /* Have the register data arrays been allocated?  */
6876   if (regs_allocated == REGS_UNALLOCATED)
6877     { /* No.  So allocate them with malloc.  */
6878       regs->start = re_malloc (regoff_t, need_regs);
6879       regs->end = re_malloc (regoff_t, need_regs);
6880       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
6881 	return REGS_UNALLOCATED;
6882       regs->num_regs = need_regs;
6883     }
6884   else if (regs_allocated == REGS_REALLOCATE)
6885     { /* Yes.  If we need more elements than were already
6886 	 allocated, reallocate them.  If we need fewer, just
6887 	 leave it alone.  */
6888       if (BE (need_regs > regs->num_regs, 0))
6889 	{
6890 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
6891 	  regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
6892 	  if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
6893 	    return REGS_UNALLOCATED;
6894 	  regs->start = new_start;
6895 	  regs->end = new_end;
6896 	  regs->num_regs = need_regs;
6897 	}
6898     }
6899   else
6900     {
6901       assert (regs_allocated == REGS_FIXED);
6902       /* This function may not be called with REGS_FIXED and nregs too big.  */
6903       assert (regs->num_regs >= nregs);
6904       rval = REGS_FIXED;
6905     }
6906 
6907   /* Copy the regs.  */
6908   for (i = 0; i < nregs; ++i)
6909     {
6910       regs->start[i] = pmatch[i].rm_so;
6911       regs->end[i] = pmatch[i].rm_eo;
6912     }
6913   for ( ; i < regs->num_regs; ++i)
6914     regs->start[i] = regs->end[i] = -1;
6915 
6916   return rval;
6917 }
6918 
6919 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6920    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
6921    this memory for recording register information.  STARTS and ENDS
6922    must be allocated using the malloc library routine, and must each
6923    be at least NUM_REGS * sizeof (regoff_t) bytes long.
6924 
6925    If NUM_REGS == 0, then subsequent matches should allocate their own
6926    register data.
6927 
6928    Unless this function is called, the first search or match using
6929    PATTERN_BUFFER will allocate its own register data, without
6930    freeing the old data.  */
6931 
6932 void
re_set_registers(bufp,regs,num_regs,starts,ends)6933 re_set_registers (bufp, regs, num_regs, starts, ends)
6934     struct re_pattern_buffer *bufp;
6935     struct re_registers *regs;
6936     unsigned num_regs;
6937     regoff_t *starts, *ends;
6938 {
6939   if (num_regs)
6940     {
6941       bufp->regs_allocated = REGS_REALLOCATE;
6942       regs->num_regs = num_regs;
6943       regs->start = starts;
6944       regs->end = ends;
6945     }
6946   else
6947     {
6948       bufp->regs_allocated = REGS_UNALLOCATED;
6949       regs->num_regs = 0;
6950       regs->start = regs->end = (regoff_t *) 0;
6951     }
6952 }
6953 #ifdef _LIBC
6954 weak_alias (__re_set_registers, re_set_registers)
6955 #endif
6956 
6957 /* Entry points compatible with 4.2 BSD regex library.  We don't define
6958    them unless specifically requested.  */
6959 
#if defined _REGEX_RE_COMP || defined _LIBC
/* Run regexec on S with the file-level pattern buffer re_comp_buf,
   requesting no sub-match offsets and no execution flags.  Returns 1
   when regexec reports success (0) and 0 otherwise.  */
int
# ifdef _LIBC
weak_function
# endif
re_exec (const char *s)
{
  const int status = regexec (&re_comp_buf, s, 0, NULL, 0);

  return status == 0;
}
#endif /* _REGEX_RE_COMP */
6971 
6972 /* Internal entry point.  */
6973 
6974 /* Searches for a compiled pattern PREG in the string STRING, whose
6975    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
6976    mingings with regexec.  START, and RANGE have the same meanings
6977    with re_search.
6978    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
6979    otherwise return the error code.
6980    Note: We assume front end functions already check ranges.
6981    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
6982 
6983 static reg_errcode_t
re_search_internal(preg,string,length,start,range,stop,nmatch,pmatch,eflags)6984 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
6985 		    eflags)
6986     const regex_t *preg;
6987     const char *string;
6988     int length, start, range, stop, eflags;
6989     size_t nmatch;
6990     regmatch_t pmatch[];
6991 {
6992   reg_errcode_t err;
6993   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
6994   int left_lim, right_lim, incr;
6995   int fl_longest_match, match_first, match_kind, match_last = -1;
6996   int extra_nmatch;
6997   int sb, ch;
6998 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
6999   re_match_context_t mctx = { .dfa = dfa };
7000 #else
7001   re_match_context_t mctx;
7002 #endif
7003   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
7004 		   && range && !preg->can_be_null) ? preg->fastmap : NULL;
7005   RE_TRANSLATE_TYPE t = preg->translate;
7006 
7007 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
7008   memset (&mctx, '\0', sizeof (re_match_context_t));
7009   mctx.dfa = dfa;
7010 #endif
7011 
7012   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
7013   nmatch -= extra_nmatch;
7014 
7015   /* Check if the DFA haven't been compiled.  */
7016   if (BE (preg->used == 0 || dfa->init_state == NULL
7017 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
7018 	  || dfa->init_state_begbuf == NULL, 0))
7019     return REG_NOMATCH;
7020 
7021 #ifdef DEBUG
7022   /* We assume front-end functions already check them.  */
7023   assert (start + range >= 0 && start + range <= length);
7024 #endif
7025 
7026   /* If initial states with non-begbuf contexts have no elements,
7027      the regex must be anchored.  If preg->newline_anchor is set,
7028      we'll never use init_state_nl, so do not check it.  */
7029   if (dfa->init_state->nodes.nelem == 0
7030       && dfa->init_state_word->nodes.nelem == 0
7031       && (dfa->init_state_nl->nodes.nelem == 0
7032 	  || !preg->newline_anchor))
7033     {
7034       if (start != 0 && start + range != 0)
7035         return REG_NOMATCH;
7036       start = range = 0;
7037     }
7038 
7039   /* We must check the longest matching, if nmatch > 0.  */
7040   fl_longest_match = (nmatch != 0 || dfa->nbackref);
7041 
7042   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
7043 			    preg->translate, preg->syntax & RE_ICASE, dfa);
7044   if (BE (err != REG_NOERROR, 0))
7045     goto free_return;
7046   mctx.input.stop = stop;
7047   mctx.input.raw_stop = stop;
7048   mctx.input.newline_anchor = preg->newline_anchor;
7049 
7050   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
7051   if (BE (err != REG_NOERROR, 0))
7052     goto free_return;
7053 
7054   /* We will log all the DFA states through which the dfa pass,
7055      if nmatch > 1, or this dfa has "multibyte node", which is a
7056      back-reference or a node which can accept multibyte character or
7057      multi character collating element.  */
7058   if (nmatch > 1 || dfa->has_mb_node)
7059     {
7060       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
7061       if (BE (mctx.state_log == NULL, 0))
7062 	{
7063 	  err = REG_ESPACE;
7064 	  goto free_return;
7065 	}
7066     }
7067   else
7068     mctx.state_log = NULL;
7069 
7070   match_first = start;
7071   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
7072 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
7073 
7074   /* Check incrementally whether of not the input string match.  */
7075   incr = (range < 0) ? -1 : 1;
7076   left_lim = (range < 0) ? start + range : start;
7077   right_lim = (range < 0) ? start : start + range;
7078   sb = dfa->mb_cur_max == 1;
7079   match_kind =
7080     (fastmap
7081      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
7082 	| (range >= 0 ? 2 : 0)
7083 	| (t != NULL ? 1 : 0))
7084      : 8);
7085 
7086   for (;; match_first += incr)
7087     {
7088       err = REG_NOMATCH;
7089       if (match_first < left_lim || right_lim < match_first)
7090 	goto free_return;
7091 
7092       /* Advance as rapidly as possible through the string, until we
7093 	 find a plausible place to start matching.  This may be done
7094 	 with varying efficiency, so there are various possibilities:
7095 	 only the most common of them are specialized, in order to
7096 	 save on code size.  We use a switch statement for speed.  */
7097       switch (match_kind)
7098 	{
7099 	case 8:
7100 	  /* No fastmap.  */
7101 	  break;
7102 
7103 	case 7:
7104 	  /* Fastmap with single-byte translation, match forward.  */
7105 	  while (BE (match_first < right_lim, 1)
7106 		 && !fastmap[t[(unsigned char) string[match_first]]])
7107 	    ++match_first;
7108 	  goto forward_match_found_start_or_reached_end;
7109 
7110 	case 6:
7111 	  /* Fastmap without translation, match forward.  */
7112 	  while (BE (match_first < right_lim, 1)
7113 		 && !fastmap[(unsigned char) string[match_first]])
7114 	    ++match_first;
7115 
7116 	forward_match_found_start_or_reached_end:
7117 	  if (BE (match_first == right_lim, 0))
7118 	    {
7119 	      ch = match_first >= length
7120 		       ? 0 : (unsigned char) string[match_first];
7121 	      if (!fastmap[t ? t[ch] : ch])
7122 		goto free_return;
7123 	    }
7124 	  break;
7125 
7126 	case 4:
7127 	case 5:
7128 	  /* Fastmap without multi-byte translation, match backwards.  */
7129 	  while (match_first >= left_lim)
7130 	    {
7131 	      ch = match_first >= length
7132 		       ? 0 : (unsigned char) string[match_first];
7133 	      if (fastmap[t ? t[ch] : ch])
7134 		break;
7135 	      --match_first;
7136 	    }
7137 	  if (match_first < left_lim)
7138 	    goto free_return;
7139 	  break;
7140 
7141 	default:
7142 	  /* In this case, we can't determine easily the current byte,
7143 	     since it might be a component byte of a multibyte
7144 	     character.  Then we use the constructed buffer instead.  */
7145 	  for (;;)
7146 	    {
7147 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
7148 		 buffers.  */
7149 	      unsigned int offset = match_first - mctx.input.raw_mbs_idx;
7150 	      if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
7151 		{
7152 		  err = re_string_reconstruct (&mctx.input, match_first,
7153 					       eflags);
7154 		  if (BE (err != REG_NOERROR, 0))
7155 		    goto free_return;
7156 
7157 		  offset = match_first - mctx.input.raw_mbs_idx;
7158 		}
7159 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
7160 		 Note that MATCH_FIRST must not be smaller than 0.  */
7161 	      ch = (match_first >= length
7162 		    ? 0 : re_string_byte_at (&mctx.input, offset));
7163 	      if (fastmap[ch])
7164 		break;
7165 	      match_first += incr;
7166 	      if (match_first < left_lim || match_first > right_lim)
7167 	        {
7168 	          err = REG_NOMATCH;
7169 	          goto free_return;
7170 	        }
7171 	    }
7172 	  break;
7173 	}
7174 
7175       /* Reconstruct the buffers so that the matcher can assume that
7176 	 the matching starts from the beginning of the buffer.  */
7177       err = re_string_reconstruct (&mctx.input, match_first, eflags);
7178       if (BE (err != REG_NOERROR, 0))
7179 	goto free_return;
7180 
7181 #ifdef RE_ENABLE_I18N
7182      /* Don't consider this char as a possible match start if it part,
7183 	yet isn't the head, of a multibyte character.  */
7184       if (!sb && !re_string_first_byte (&mctx.input, 0))
7185 	continue;
7186 #endif
7187 
7188       /* It seems to be appropriate one, then use the matcher.  */
7189       /* We assume that the matching starts from 0.  */
7190       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
7191       match_last = check_matching (&mctx, fl_longest_match,
7192 				   range >= 0 ? &match_first : NULL);
7193       if (match_last != -1)
7194 	{
7195 	  if (BE (match_last == -2, 0))
7196 	    {
7197 	      err = REG_ESPACE;
7198 	      goto free_return;
7199 	    }
7200 	  else
7201 	    {
7202 	      mctx.match_last = match_last;
7203 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
7204 		{
7205 		  re_dfastate_t *pstate = mctx.state_log[match_last];
7206 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
7207 							     match_last);
7208 		}
7209 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
7210 		  || dfa->nbackref)
7211 		{
7212 		  err = prune_impossible_nodes (&mctx);
7213 		  if (err == REG_NOERROR)
7214 		    break;
7215 		  if (BE (err != REG_NOMATCH, 0))
7216 		    goto free_return;
7217 		  match_last = -1;
7218 		}
7219 	      else
7220 		break; /* We found a match.  */
7221 	    }
7222 	}
7223 
7224       match_ctx_clean (&mctx);
7225     }
7226 
7227 #ifdef DEBUG
7228   assert (match_last != -1);
7229   assert (err == REG_NOERROR);
7230 #endif
7231 
7232   /* Set pmatch[] if we need.  */
7233   if (nmatch > 0)
7234     {
7235       int reg_idx;
7236 
7237       /* Initialize registers.  */
7238       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
7239 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
7240 
7241       /* Set the points where matching start/end.  */
7242       pmatch[0].rm_so = 0;
7243       pmatch[0].rm_eo = mctx.match_last;
7244 
7245       if (!preg->no_sub && nmatch > 1)
7246 	{
7247 	  err = set_regs (preg, &mctx, nmatch, pmatch,
7248 			  dfa->has_plural_match && dfa->nbackref > 0);
7249 	  if (BE (err != REG_NOERROR, 0))
7250 	    goto free_return;
7251 	}
7252 
7253       /* At last, add the offset to the each registers, since we slided
7254 	 the buffers so that we could assume that the matching starts
7255 	 from 0.  */
7256       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7257 	if (pmatch[reg_idx].rm_so != -1)
7258 	  {
7259 #ifdef RE_ENABLE_I18N
7260 	    if (BE (mctx.input.offsets_needed != 0, 0))
7261 	      {
7262 		pmatch[reg_idx].rm_so =
7263 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
7264 		   ? mctx.input.valid_raw_len
7265 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
7266 		pmatch[reg_idx].rm_eo =
7267 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
7268 		   ? mctx.input.valid_raw_len
7269 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
7270 	      }
7271 #else
7272 	    assert (mctx.input.offsets_needed == 0);
7273 #endif
7274 	    pmatch[reg_idx].rm_so += match_first;
7275 	    pmatch[reg_idx].rm_eo += match_first;
7276 	  }
7277       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
7278 	{
7279 	  pmatch[nmatch + reg_idx].rm_so = -1;
7280 	  pmatch[nmatch + reg_idx].rm_eo = -1;
7281 	}
7282 
7283       if (dfa->subexp_map)
7284         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
7285           if (dfa->subexp_map[reg_idx] != reg_idx)
7286             {
7287               pmatch[reg_idx + 1].rm_so
7288                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
7289               pmatch[reg_idx + 1].rm_eo
7290                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
7291             }
7292     }
7293 
7294  free_return:
7295   re_free (mctx.state_log);
7296   if (dfa->nbackref)
7297     match_ctx_free (&mctx);
7298   re_string_destruct (&mctx.input);
7299   return err;
7300 }
7301 
7302 static reg_errcode_t
prune_impossible_nodes(mctx)7303 prune_impossible_nodes (mctx)
7304      re_match_context_t *mctx;
7305 {
7306   const re_dfa_t *const dfa = mctx->dfa;
7307   int halt_node, match_last;
7308   reg_errcode_t ret;
7309   re_dfastate_t **sifted_states;
7310   re_dfastate_t **lim_states = NULL;
7311   re_sift_context_t sctx;
7312 #ifdef DEBUG
7313   assert (mctx->state_log != NULL);
7314 #endif
7315   match_last = mctx->match_last;
7316   halt_node = mctx->last_node;
7317   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
7318   if (BE (sifted_states == NULL, 0))
7319     {
7320       ret = REG_ESPACE;
7321       goto free_return;
7322     }
7323   if (dfa->nbackref)
7324     {
7325       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
7326       if (BE (lim_states == NULL, 0))
7327 	{
7328 	  ret = REG_ESPACE;
7329 	  goto free_return;
7330 	}
7331       while (1)
7332 	{
7333 	  memset (lim_states, '\0',
7334 		  sizeof (re_dfastate_t *) * (match_last + 1));
7335 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
7336 			 match_last);
7337 	  ret = sift_states_backward (mctx, &sctx);
7338 	  re_node_set_free (&sctx.limits);
7339 	  if (BE (ret != REG_NOERROR, 0))
7340 	      goto free_return;
7341 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
7342 	    break;
7343 	  do
7344 	    {
7345 	      --match_last;
7346 	      if (match_last < 0)
7347 		{
7348 		  ret = REG_NOMATCH;
7349 		  goto free_return;
7350 		}
7351 	    } while (mctx->state_log[match_last] == NULL
7352 		     || !mctx->state_log[match_last]->halt);
7353 	  halt_node = check_halt_state_context (mctx,
7354 						mctx->state_log[match_last],
7355 						match_last);
7356 	}
7357       ret = merge_state_array (dfa, sifted_states, lim_states,
7358 			       match_last + 1);
7359       re_free (lim_states);
7360       lim_states = NULL;
7361       if (BE (ret != REG_NOERROR, 0))
7362 	goto free_return;
7363     }
7364   else
7365     {
7366       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
7367       ret = sift_states_backward (mctx, &sctx);
7368       re_node_set_free (&sctx.limits);
7369       if (BE (ret != REG_NOERROR, 0))
7370 	goto free_return;
7371     }
7372   re_free (mctx->state_log);
7373   mctx->state_log = sifted_states;
7374   sifted_states = NULL;
7375   mctx->last_node = halt_node;
7376   mctx->match_last = match_last;
7377   ret = REG_NOERROR;
7378  free_return:
7379   re_free (sifted_states);
7380   re_free (lim_states);
7381   return ret;
7382 }
7383 
7384 /* Acquire an initial state and return it.
7385    We must select appropriate initial state depending on the context,
7386    since initial states may have constraints like "\<", "^", etc..  */
7387 
7388 static inline re_dfastate_t *
7389 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,int idx)7390 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
7391 			    int idx)
7392 {
7393   const re_dfa_t *const dfa = mctx->dfa;
7394   if (dfa->init_state->has_constraint)
7395     {
7396       unsigned int context;
7397       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
7398       if (IS_WORD_CONTEXT (context))
7399 	return dfa->init_state_word;
7400       else if (IS_ORDINARY_CONTEXT (context))
7401 	return dfa->init_state;
7402       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
7403 	return dfa->init_state_begbuf;
7404       else if (IS_NEWLINE_CONTEXT (context))
7405 	return dfa->init_state_nl;
7406       else if (IS_BEGBUF_CONTEXT (context))
7407 	{
7408 	  /* It is relatively rare case, then calculate on demand.  */
7409 	  return re_acquire_state_context (err, dfa,
7410 					   dfa->init_state->entrance_nodes,
7411 					   context);
7412 	}
7413       else
7414 	/* Must not happen?  */
7415 	return dfa->init_state;
7416     }
7417   else
7418     return dfa->init_state;
7419 }
7420 
7421 /* Check whether the regular expression match input string INPUT or not,
7422    and return the index where the matching end, return -1 if not match,
7423    or return -2 in case of an error.
7424    FL_LONGEST_MATCH means we want the POSIX longest matching.
7425    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
7426    next place where we may want to try matching.
7427    Note that the matcher assume that the maching starts from the current
7428    index of the buffer.  */
7429 
7430 static int
7431 internal_function
check_matching(re_match_context_t * mctx,int fl_longest_match,int * p_match_first)7432 check_matching (re_match_context_t *mctx, int fl_longest_match,
7433 		int *p_match_first)
7434 {
7435   const re_dfa_t *const dfa = mctx->dfa;
7436   reg_errcode_t err;
7437   int match = 0;
7438   int match_last = -1;
7439   int cur_str_idx = re_string_cur_idx (&mctx->input);
7440   re_dfastate_t *cur_state;
7441   int at_init_state = p_match_first != NULL;
7442   int next_start_idx = cur_str_idx;
7443 
7444   err = REG_NOERROR;
7445   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
7446   /* An initial state must not be NULL (invalid).  */
7447   if (BE (cur_state == NULL, 0))
7448     {
7449       assert (err == REG_ESPACE);
7450       return -2;
7451     }
7452 
7453   if (mctx->state_log != NULL)
7454     {
7455       mctx->state_log[cur_str_idx] = cur_state;
7456 
7457       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
7458 	 later.  E.g. Processing back references.  */
7459       if (BE (dfa->nbackref, 0))
7460 	{
7461 	  at_init_state = 0;
7462 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
7463 	  if (BE (err != REG_NOERROR, 0))
7464 	    return err;
7465 
7466 	  if (cur_state->has_backref)
7467 	    {
7468 	      err = transit_state_bkref (mctx, &cur_state->nodes);
7469 	      if (BE (err != REG_NOERROR, 0))
7470 	        return err;
7471 	    }
7472 	}
7473     }
7474 
7475   /* If the RE accepts NULL string.  */
7476   if (BE (cur_state->halt, 0))
7477     {
7478       if (!cur_state->has_constraint
7479 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
7480 	{
7481 	  if (!fl_longest_match)
7482 	    return cur_str_idx;
7483 	  else
7484 	    {
7485 	      match_last = cur_str_idx;
7486 	      match = 1;
7487 	    }
7488 	}
7489     }
7490 
7491   while (!re_string_eoi (&mctx->input))
7492     {
7493       re_dfastate_t *old_state = cur_state;
7494       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
7495 
7496       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
7497           || (BE (next_char_idx >= mctx->input.valid_len, 0)
7498               && mctx->input.valid_len < mctx->input.len))
7499         {
7500           err = extend_buffers (mctx);
7501           if (BE (err != REG_NOERROR, 0))
7502 	    {
7503 	      assert (err == REG_ESPACE);
7504 	      return -2;
7505 	    }
7506         }
7507 
7508       cur_state = transit_state (&err, mctx, cur_state);
7509       if (mctx->state_log != NULL)
7510 	cur_state = merge_state_with_log (&err, mctx, cur_state);
7511 
7512       if (cur_state == NULL)
7513 	{
7514 	  /* Reached the invalid state or an error.  Try to recover a valid
7515 	     state using the state log, if available and if we have not
7516 	     already found a valid (even if not the longest) match.  */
7517 	  if (BE (err != REG_NOERROR, 0))
7518 	    return -2;
7519 
7520 	  if (mctx->state_log == NULL
7521 	      || (match && !fl_longest_match)
7522 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
7523 	    break;
7524 	}
7525 
7526       if (BE (at_init_state, 0))
7527 	{
7528 	  if (old_state == cur_state)
7529 	    next_start_idx = next_char_idx;
7530 	  else
7531 	    at_init_state = 0;
7532 	}
7533 
7534       if (cur_state->halt)
7535 	{
7536 	  /* Reached a halt state.
7537 	     Check the halt state can satisfy the current context.  */
7538 	  if (!cur_state->has_constraint
7539 	      || check_halt_state_context (mctx, cur_state,
7540 					   re_string_cur_idx (&mctx->input)))
7541 	    {
7542 	      /* We found an appropriate halt state.  */
7543 	      match_last = re_string_cur_idx (&mctx->input);
7544 	      match = 1;
7545 
7546 	      /* We found a match, do not modify match_first below.  */
7547 	      p_match_first = NULL;
7548 	      if (!fl_longest_match)
7549 		break;
7550 	    }
7551 	}
7552     }
7553 
7554   if (p_match_first)
7555     *p_match_first += next_start_idx;
7556 
7557   return match_last;
7558 }
7559 
7560 /* Check NODE match the current context.  */
7561 
7562 static int
7563 internal_function
check_halt_node_context(const re_dfa_t * dfa,int node,unsigned int context)7564 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
7565 {
7566   re_token_type_t type = dfa->nodes[node].type;
7567   unsigned int constraint = dfa->nodes[node].constraint;
7568   if (type != END_OF_RE)
7569     return 0;
7570   if (!constraint)
7571     return 1;
7572   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
7573     return 0;
7574   return 1;
7575 }
7576 
7577 /* Check the halt state STATE match the current context.
7578    Return 0 if not match, if the node, STATE has, is a halt node and
7579    match the context, return the node.  */
7580 
7581 static int
7582 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,int idx)7583 check_halt_state_context (const re_match_context_t *mctx,
7584 			  const re_dfastate_t *state, int idx)
7585 {
7586   int i;
7587   unsigned int context;
7588 #ifdef DEBUG
7589   assert (state->halt);
7590 #endif
7591   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
7592   for (i = 0; i < state->nodes.nelem; ++i)
7593     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
7594       return state->nodes.elems[i];
7595   return 0;
7596 }
7597 
7598 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
7599    corresponding to the DFA).
7600    Return the destination node, and update EPS_VIA_NODES, return -1 in case
7601    of errors.  */
7602 
7603 static int
7604 internal_function
proceed_next_node(const re_match_context_t * mctx,int nregs,regmatch_t * regs,int * pidx,int node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)7605 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
7606 		   int *pidx, int node, re_node_set *eps_via_nodes,
7607 		   struct re_fail_stack_t *fs)
7608 {
7609   const re_dfa_t *const dfa = mctx->dfa;
7610   int i, err;
7611   if (IS_EPSILON_NODE (dfa->nodes[node].type))
7612     {
7613       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
7614       re_node_set *edests = &dfa->edests[node];
7615       int dest_node;
7616       err = re_node_set_insert (eps_via_nodes, node);
7617       if (BE (err < 0, 0))
7618 	return -2;
7619       /* Pick up a valid destination, or return -1 if none is found.  */
7620       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
7621 	{
7622 	  int candidate = edests->elems[i];
7623 	  if (!re_node_set_contains (cur_nodes, candidate))
7624 	    continue;
7625           if (dest_node == -1)
7626 	    dest_node = candidate;
7627 
7628           else
7629 	    {
7630 	      /* In order to avoid infinite loop like "(a*)*", return the second
7631 	         epsilon-transition if the first was already considered.  */
7632 	      if (re_node_set_contains (eps_via_nodes, dest_node))
7633 	        return candidate;
7634 
7635 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
7636 	      else if (fs != NULL
7637 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
7638 				           eps_via_nodes))
7639 		return -2;
7640 
7641 	      /* We know we are going to exit.  */
7642 	      break;
7643 	    }
7644 	}
7645       return dest_node;
7646     }
7647   else
7648     {
7649       int naccepted = 0;
7650       re_token_type_t type = dfa->nodes[node].type;
7651 
7652 #ifdef RE_ENABLE_I18N
7653       if (dfa->nodes[node].accept_mb)
7654 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
7655       else
7656 #endif /* RE_ENABLE_I18N */
7657       if (type == OP_BACK_REF)
7658 	{
7659 	  int subexp_idx = dfa->nodes[node].opr.idx + 1;
7660 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
7661 	  if (fs != NULL)
7662 	    {
7663 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
7664 		return -1;
7665 	      else if (naccepted)
7666 		{
7667 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
7668 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
7669 			      naccepted) != 0)
7670 		    return -1;
7671 		}
7672 	    }
7673 
7674 	  if (naccepted == 0)
7675 	    {
7676 	      int dest_node;
7677 	      err = re_node_set_insert (eps_via_nodes, node);
7678 	      if (BE (err < 0, 0))
7679 		return -2;
7680 	      dest_node = dfa->edests[node].elems[0];
7681 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7682 					dest_node))
7683 		return dest_node;
7684 	    }
7685 	}
7686 
7687       if (naccepted != 0
7688 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
7689 	{
7690 	  int dest_node = dfa->nexts[node];
7691 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
7692 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
7693 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7694 					       dest_node)))
7695 	    return -1;
7696 	  re_node_set_empty (eps_via_nodes);
7697 	  return dest_node;
7698 	}
7699     }
7700   return -1;
7701 }
7702 
7703 static reg_errcode_t
7704 internal_function
push_fail_stack(struct re_fail_stack_t * fs,int str_idx,int dest_node,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)7705 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
7706 		 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
7707 {
7708   reg_errcode_t err;
7709   int num = fs->num++;
7710   if (fs->num == fs->alloc)
7711     {
7712       struct re_fail_stack_ent_t *new_array;
7713       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
7714 				       * fs->alloc * 2));
7715       if (new_array == NULL)
7716 	return REG_ESPACE;
7717       fs->alloc *= 2;
7718       fs->stack = new_array;
7719     }
7720   fs->stack[num].idx = str_idx;
7721   fs->stack[num].node = dest_node;
7722   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
7723   if (fs->stack[num].regs == NULL)
7724     return REG_ESPACE;
7725   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
7726   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
7727   return err;
7728 }
7729 
7730 static int
7731 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,int * pidx,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)7732 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
7733 		regmatch_t *regs, re_node_set *eps_via_nodes)
7734 {
7735   int num = --fs->num;
7736   assert (num >= 0);
7737   *pidx = fs->stack[num].idx;
7738   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
7739   re_node_set_free (eps_via_nodes);
7740   re_free (fs->stack[num].regs);
7741   *eps_via_nodes = fs->stack[num].eps_via_nodes;
7742   return fs->stack[num].node;
7743 }
7744 
7745 /* Set the positions where the subexpressions are starts/ends to registers
7746    PMATCH.
7747    Note: We assume that pmatch[0] is already set, and
7748    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
7749 
7750 static reg_errcode_t
7751 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,int fl_backtrack)7752 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
7753 	  regmatch_t *pmatch, int fl_backtrack)
7754 {
7755   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
7756   int idx, cur_node;
7757   re_node_set eps_via_nodes;
7758   struct re_fail_stack_t *fs;
7759   struct re_fail_stack_t fs_body = { 0, 2, NULL };
7760   regmatch_t *prev_idx_match;
7761   int prev_idx_match_malloced = 0;
7762 
7763 #ifdef DEBUG
7764   assert (nmatch > 1);
7765   assert (mctx->state_log != NULL);
7766 #endif
7767   if (fl_backtrack)
7768     {
7769       fs = &fs_body;
7770       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
7771       if (fs->stack == NULL)
7772 	return REG_ESPACE;
7773     }
7774   else
7775     fs = NULL;
7776 
7777   cur_node = dfa->init_node;
7778   re_node_set_init_empty (&eps_via_nodes);
7779 
7780   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
7781     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
7782   else
7783     {
7784       prev_idx_match = re_malloc (regmatch_t, nmatch);
7785       if (prev_idx_match == NULL)
7786 	{
7787 	  free_fail_stack_return (fs);
7788 	  return REG_ESPACE;
7789 	}
7790       prev_idx_match_malloced = 1;
7791     }
7792   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7793 
7794   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
7795     {
7796       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
7797 
7798       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
7799 	{
7800 	  int reg_idx;
7801 	  if (fs)
7802 	    {
7803 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7804 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
7805 		  break;
7806 	      if (reg_idx == nmatch)
7807 		{
7808 		  re_node_set_free (&eps_via_nodes);
7809 		  if (prev_idx_match_malloced)
7810 		    re_free (prev_idx_match);
7811 		  return free_fail_stack_return (fs);
7812 		}
7813 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7814 					 &eps_via_nodes);
7815 	    }
7816 	  else
7817 	    {
7818 	      re_node_set_free (&eps_via_nodes);
7819 	      if (prev_idx_match_malloced)
7820 		re_free (prev_idx_match);
7821 	      return REG_NOERROR;
7822 	    }
7823 	}
7824 
7825       /* Proceed to next node.  */
7826       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
7827 				    &eps_via_nodes, fs);
7828 
7829       if (BE (cur_node < 0, 0))
7830 	{
7831 	  if (BE (cur_node == -2, 0))
7832 	    {
7833 	      re_node_set_free (&eps_via_nodes);
7834 	      if (prev_idx_match_malloced)
7835 		re_free (prev_idx_match);
7836 	      free_fail_stack_return (fs);
7837 	      return REG_ESPACE;
7838 	    }
7839 	  if (fs)
7840 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7841 				       &eps_via_nodes);
7842 	  else
7843 	    {
7844 	      re_node_set_free (&eps_via_nodes);
7845 	      if (prev_idx_match_malloced)
7846 		re_free (prev_idx_match);
7847 	      return REG_NOMATCH;
7848 	    }
7849 	}
7850     }
7851   re_node_set_free (&eps_via_nodes);
7852   if (prev_idx_match_malloced)
7853     re_free (prev_idx_match);
7854   return free_fail_stack_return (fs);
7855 }
7856 
7857 static reg_errcode_t
7858 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)7859 free_fail_stack_return (struct re_fail_stack_t *fs)
7860 {
7861   if (fs)
7862     {
7863       int fs_idx;
7864       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
7865 	{
7866 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
7867 	  re_free (fs->stack[fs_idx].regs);
7868 	}
7869       re_free (fs->stack);
7870     }
7871   return REG_NOERROR;
7872 }
7873 
7874 static void
7875 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,int cur_node,int cur_idx,int nmatch)7876 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
7877 	     regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
7878 {
7879   int type = dfa->nodes[cur_node].type;
7880   if (type == OP_OPEN_SUBEXP)
7881     {
7882       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7883 
7884       /* We are at the first node of this sub expression.  */
7885       if (reg_num < nmatch)
7886 	{
7887 	  pmatch[reg_num].rm_so = cur_idx;
7888 	  pmatch[reg_num].rm_eo = -1;
7889 	}
7890     }
7891   else if (type == OP_CLOSE_SUBEXP)
7892     {
7893       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7894       if (reg_num < nmatch)
7895 	{
7896 	  /* We are at the last node of this sub expression.  */
7897 	  if (pmatch[reg_num].rm_so < cur_idx)
7898 	    {
7899 	      pmatch[reg_num].rm_eo = cur_idx;
7900 	      /* This is a non-empty match or we are not inside an optional
7901 		 subexpression.  Accept this right away.  */
7902 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7903 	    }
7904 	  else
7905 	    {
7906 	      if (dfa->nodes[cur_node].opt_subexp
7907 		  && prev_idx_match[reg_num].rm_so != -1)
7908 		/* We transited through an empty match for an optional
7909 		   subexpression, like (a?)*, and this is not the subexp's
7910 		   first match.  Copy back the old content of the registers
7911 		   so that matches of an inner subexpression are undone as
7912 		   well, like in ((a?))*.  */
7913 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
7914 	      else
7915 		/* We completed a subexpression, but it may be part of
7916 		   an optional one, so do not update PREV_IDX_MATCH.  */
7917 		pmatch[reg_num].rm_eo = cur_idx;
7918 	    }
7919 	}
7920     }
7921 }
7922 
7923 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
7924    and sift the nodes in each states according to the following rules.
7925    Updated state_log will be wrote to STATE_LOG.
7926 
7927    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
7928      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
7929 	If `a' isn't the LAST_NODE and `a' can't epsilon transit to
7930 	the LAST_NODE, we throw away the node `a'.
7931      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
7932 	string `s' and transit to `b':
7933 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
7934 	   away the node `a'.
7935 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
7936 	    thrown away, we throw away the node `a'.
7937      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
7938 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
7939 	   node `a'.
7940 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
7941 	    we throw away the node `a'.  */
7942 
7943 #define STATE_NODE_CONTAINS(state,node) \
7944   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
7945 
7946 static reg_errcode_t
7947 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)7948 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
7949 {
7950   reg_errcode_t err;
7951   int null_cnt = 0;
7952   int str_idx = sctx->last_str_idx;
7953   re_node_set cur_dest;
7954 
7955 #ifdef DEBUG
7956   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
7957 #endif
7958 
7959   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
7960      transit to the last_node and the last_node itself.  */
7961   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
7962   if (BE (err != REG_NOERROR, 0))
7963     return err;
7964   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7965   if (BE (err != REG_NOERROR, 0))
7966     goto free_return;
7967 
7968   /* Then check each states in the state_log.  */
7969   while (str_idx > 0)
7970     {
7971       /* Update counters.  */
7972       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
7973       if (null_cnt > mctx->max_mb_elem_len)
7974 	{
7975 	  memset (sctx->sifted_states, '\0',
7976 		  sizeof (re_dfastate_t *) * str_idx);
7977 	  re_node_set_free (&cur_dest);
7978 	  return REG_NOERROR;
7979 	}
7980       re_node_set_empty (&cur_dest);
7981       --str_idx;
7982 
7983       if (mctx->state_log[str_idx])
7984 	{
7985 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
7986           if (BE (err != REG_NOERROR, 0))
7987 	    goto free_return;
7988 	}
7989 
7990       /* Add all the nodes which satisfy the following conditions:
7991 	 - It can epsilon transit to a node in CUR_DEST.
7992 	 - It is in CUR_SRC.
7993 	 And update state_log.  */
7994       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7995       if (BE (err != REG_NOERROR, 0))
7996 	goto free_return;
7997     }
7998   err = REG_NOERROR;
7999  free_return:
8000   re_node_set_free (&cur_dest);
8001   return err;
8002 }
8003 
8004 static reg_errcode_t
8005 internal_function
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * cur_dest)8006 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
8007 		     int str_idx, re_node_set *cur_dest)
8008 {
8009   const re_dfa_t *const dfa = mctx->dfa;
8010   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
8011   int i;
8012 
8013   /* Then build the next sifted state.
8014      We build the next sifted state on `cur_dest', and update
8015      `sifted_states[str_idx]' with `cur_dest'.
8016      Note:
8017      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
8018      `cur_src' points the node_set of the old `state_log[str_idx]'
8019      (with the epsilon nodes pre-filtered out).  */
8020   for (i = 0; i < cur_src->nelem; i++)
8021     {
8022       int prev_node = cur_src->elems[i];
8023       int naccepted = 0;
8024       int ret;
8025 
8026 #ifdef DEBUG
8027       re_token_type_t type = dfa->nodes[prev_node].type;
8028       assert (!IS_EPSILON_NODE (type));
8029 #endif
8030 #ifdef RE_ENABLE_I18N
8031       /* If the node may accept `multi byte'.  */
8032       if (dfa->nodes[prev_node].accept_mb)
8033 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
8034 					 str_idx, sctx->last_str_idx);
8035 #endif /* RE_ENABLE_I18N */
8036 
8037       /* We don't check backreferences here.
8038 	 See update_cur_sifted_state().  */
8039       if (!naccepted
8040 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
8041 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
8042 				  dfa->nexts[prev_node]))
8043 	naccepted = 1;
8044 
8045       if (naccepted == 0)
8046 	continue;
8047 
8048       if (sctx->limits.nelem)
8049 	{
8050 	  int to_idx = str_idx + naccepted;
8051 	  if (check_dst_limits (mctx, &sctx->limits,
8052 				dfa->nexts[prev_node], to_idx,
8053 				prev_node, str_idx))
8054 	    continue;
8055 	}
8056       ret = re_node_set_insert (cur_dest, prev_node);
8057       if (BE (ret == -1, 0))
8058 	return REG_ESPACE;
8059     }
8060 
8061   return REG_NOERROR;
8062 }
8063 
8064 /* Helper functions.  */
8065 
8066 static reg_errcode_t
8067 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,int next_state_log_idx)8068 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
8069 {
8070   int top = mctx->state_log_top;
8071 
8072   if (next_state_log_idx >= mctx->input.bufs_len
8073       || (next_state_log_idx >= mctx->input.valid_len
8074 	  && mctx->input.valid_len < mctx->input.len))
8075     {
8076       reg_errcode_t err;
8077       err = extend_buffers (mctx);
8078       if (BE (err != REG_NOERROR, 0))
8079 	return err;
8080     }
8081 
8082   if (top < next_state_log_idx)
8083     {
8084       memset (mctx->state_log + top + 1, '\0',
8085 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
8086       mctx->state_log_top = next_state_log_idx;
8087     }
8088   return REG_NOERROR;
8089 }
8090 
8091 static reg_errcode_t
8092 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,int num)8093 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
8094 		   re_dfastate_t **src, int num)
8095 {
8096   int st_idx;
8097   reg_errcode_t err;
8098   for (st_idx = 0; st_idx < num; ++st_idx)
8099     {
8100       if (dst[st_idx] == NULL)
8101 	dst[st_idx] = src[st_idx];
8102       else if (src[st_idx] != NULL)
8103 	{
8104 	  re_node_set merged_set;
8105 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
8106 					&src[st_idx]->nodes);
8107 	  if (BE (err != REG_NOERROR, 0))
8108 	    return err;
8109 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
8110 	  re_node_set_free (&merged_set);
8111 	  if (BE (err != REG_NOERROR, 0))
8112 	    return err;
8113 	}
8114     }
8115   return REG_NOERROR;
8116 }
8117 
8118 static reg_errcode_t
8119 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * dest_nodes)8120 update_cur_sifted_state (const re_match_context_t *mctx,
8121 			 re_sift_context_t *sctx, int str_idx,
8122 			 re_node_set *dest_nodes)
8123 {
8124   const re_dfa_t *const dfa = mctx->dfa;
8125   reg_errcode_t err = REG_NOERROR;
8126   const re_node_set *candidates;
8127   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
8128 		: &mctx->state_log[str_idx]->nodes);
8129 
8130   if (dest_nodes->nelem == 0)
8131     sctx->sifted_states[str_idx] = NULL;
8132   else
8133     {
8134       if (candidates)
8135 	{
8136 	  /* At first, add the nodes which can epsilon transit to a node in
8137 	     DEST_NODE.  */
8138 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
8139 	  if (BE (err != REG_NOERROR, 0))
8140 	    return err;
8141 
8142 	  /* Then, check the limitations in the current sift_context.  */
8143 	  if (sctx->limits.nelem)
8144 	    {
8145 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
8146 					 mctx->bkref_ents, str_idx);
8147 	      if (BE (err != REG_NOERROR, 0))
8148 		return err;
8149 	    }
8150 	}
8151 
8152       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
8153       if (BE (err != REG_NOERROR, 0))
8154 	return err;
8155     }
8156 
8157   if (candidates && mctx->state_log[str_idx]->has_backref)
8158     {
8159       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
8160       if (BE (err != REG_NOERROR, 0))
8161 	return err;
8162     }
8163   return REG_NOERROR;
8164 }
8165 
8166 static reg_errcode_t
8167 internal_function
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)8168 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
8169 		       const re_node_set *candidates)
8170 {
8171   reg_errcode_t err = REG_NOERROR;
8172   int i;
8173 
8174   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
8175   if (BE (err != REG_NOERROR, 0))
8176     return err;
8177 
8178   if (!state->inveclosure.alloc)
8179     {
8180       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
8181       if (BE (err != REG_NOERROR, 0))
8182         return REG_ESPACE;
8183       for (i = 0; i < dest_nodes->nelem; i++)
8184         re_node_set_merge (&state->inveclosure,
8185 			   dfa->inveclosures + dest_nodes->elems[i]);
8186     }
8187   return re_node_set_add_intersect (dest_nodes, candidates,
8188 				    &state->inveclosure);
8189 }
8190 
8191 static reg_errcode_t
8192 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,int node,re_node_set * dest_nodes,const re_node_set * candidates)8193 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
8194 		       const re_node_set *candidates)
8195 {
8196     int ecl_idx;
8197     reg_errcode_t err;
8198     re_node_set *inv_eclosure = dfa->inveclosures + node;
8199     re_node_set except_nodes;
8200     re_node_set_init_empty (&except_nodes);
8201     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8202       {
8203 	int cur_node = inv_eclosure->elems[ecl_idx];
8204 	if (cur_node == node)
8205 	  continue;
8206 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
8207 	  {
8208 	    int edst1 = dfa->edests[cur_node].elems[0];
8209 	    int edst2 = ((dfa->edests[cur_node].nelem > 1)
8210 			 ? dfa->edests[cur_node].elems[1] : -1);
8211 	    if ((!re_node_set_contains (inv_eclosure, edst1)
8212 		 && re_node_set_contains (dest_nodes, edst1))
8213 		|| (edst2 > 0
8214 		    && !re_node_set_contains (inv_eclosure, edst2)
8215 		    && re_node_set_contains (dest_nodes, edst2)))
8216 	      {
8217 		err = re_node_set_add_intersect (&except_nodes, candidates,
8218 						 dfa->inveclosures + cur_node);
8219 		if (BE (err != REG_NOERROR, 0))
8220 		  {
8221 		    re_node_set_free (&except_nodes);
8222 		    return err;
8223 		  }
8224 	      }
8225 	  }
8226       }
8227     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8228       {
8229 	int cur_node = inv_eclosure->elems[ecl_idx];
8230 	if (!re_node_set_contains (&except_nodes, cur_node))
8231 	  {
8232 	    int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
8233 	    re_node_set_remove_at (dest_nodes, idx);
8234 	  }
8235       }
8236     re_node_set_free (&except_nodes);
8237     return REG_NOERROR;
8238 }
8239 
8240 static int
8241 internal_function
check_dst_limits(const re_match_context_t * mctx,re_node_set * limits,int dst_node,int dst_idx,int src_node,int src_idx)8242 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
8243 		  int dst_node, int dst_idx, int src_node, int src_idx)
8244 {
8245   const re_dfa_t *const dfa = mctx->dfa;
8246   int lim_idx, src_pos, dst_pos;
8247 
8248   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
8249   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
8250   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8251     {
8252       int subexp_idx;
8253       struct re_backref_cache_entry *ent;
8254       ent = mctx->bkref_ents + limits->elems[lim_idx];
8255       subexp_idx = dfa->nodes[ent->node].opr.idx;
8256 
8257       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8258 					   subexp_idx, dst_node, dst_idx,
8259 					   dst_bkref_idx);
8260       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8261 					   subexp_idx, src_node, src_idx,
8262 					   src_bkref_idx);
8263 
8264       /* In case of:
8265 	 <src> <dst> ( <subexp> )
8266 	 ( <subexp> ) <src> <dst>
8267 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
8268       if (src_pos == dst_pos)
8269 	continue; /* This is unrelated limitation.  */
8270       else
8271 	return 1;
8272     }
8273   return 0;
8274 }
8275 
8276 static int
8277 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,int subexp_idx,int from_node,int bkref_idx)8278 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
8279 			     int subexp_idx, int from_node, int bkref_idx)
8280 {
8281   const re_dfa_t *const dfa = mctx->dfa;
8282   const re_node_set *eclosures = dfa->eclosures + from_node;
8283   int node_idx;
8284 
8285   /* Else, we are on the boundary: examine the nodes on the epsilon
8286      closure.  */
8287   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
8288     {
8289       int node = eclosures->elems[node_idx];
8290       switch (dfa->nodes[node].type)
8291 	{
8292 	case OP_BACK_REF:
8293 	  if (bkref_idx != -1)
8294 	    {
8295 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
8296 	      do
8297 	        {
8298 		  int dst, cpos;
8299 
8300 		  if (ent->node != node)
8301 		    continue;
8302 
8303 		  if (subexp_idx < BITSET_WORD_BITS
8304 		      && !(ent->eps_reachable_subexps_map
8305 			   & ((bitset_word_t) 1 << subexp_idx)))
8306 		    continue;
8307 
8308 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
8309 		     OP_CLOSE_SUBEXP cases below.  But, if the
8310 		     destination node is the same node as the source
8311 		     node, don't recurse because it would cause an
8312 		     infinite loop: a regex that exhibits this behavior
8313 		     is ()\1*\1*  */
8314 		  dst = dfa->edests[node].elems[0];
8315 		  if (dst == from_node)
8316 		    {
8317 		      if (boundaries & 1)
8318 		        return -1;
8319 		      else /* if (boundaries & 2) */
8320 		        return 0;
8321 		    }
8322 
8323 		  cpos =
8324 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8325 						 dst, bkref_idx);
8326 		  if (cpos == -1 /* && (boundaries & 1) */)
8327 		    return -1;
8328 		  if (cpos == 0 && (boundaries & 2))
8329 		    return 0;
8330 
8331 		  if (subexp_idx < BITSET_WORD_BITS)
8332 		    ent->eps_reachable_subexps_map
8333 		      &= ~((bitset_word_t) 1 << subexp_idx);
8334 	        }
8335 	      while (ent++->more);
8336 	    }
8337 	  break;
8338 
8339 	case OP_OPEN_SUBEXP:
8340 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
8341 	    return -1;
8342 	  break;
8343 
8344 	case OP_CLOSE_SUBEXP:
8345 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
8346 	    return 0;
8347 	  break;
8348 
8349 	default:
8350 	    break;
8351 	}
8352     }
8353 
8354   return (boundaries & 2) ? 1 : 0;
8355 }
8356 
8357 static int
8358 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,int limit,int subexp_idx,int from_node,int str_idx,int bkref_idx)8359 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
8360 			   int subexp_idx, int from_node, int str_idx,
8361 			   int bkref_idx)
8362 {
8363   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
8364   int boundaries;
8365 
8366   /* If we are outside the range of the subexpression, return -1 or 1.  */
8367   if (str_idx < lim->subexp_from)
8368     return -1;
8369 
8370   if (lim->subexp_to < str_idx)
8371     return 1;
8372 
8373   /* If we are within the subexpression, return 0.  */
8374   boundaries = (str_idx == lim->subexp_from);
8375   boundaries |= (str_idx == lim->subexp_to) << 1;
8376   if (boundaries == 0)
8377     return 0;
8378 
8379   /* Else, examine epsilon closure.  */
8380   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8381 				      from_node, bkref_idx);
8382 }
8383 
8384 /* Check the limitations of sub expressions LIMITS, and remove the nodes
8385    which are against limitations from DEST_NODES. */
8386 
8387 static reg_errcode_t
8388 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,int str_idx)8389 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
8390 		     const re_node_set *candidates, re_node_set *limits,
8391 		     struct re_backref_cache_entry *bkref_ents, int str_idx)
8392 {
8393   reg_errcode_t err;
8394   int node_idx, lim_idx;
8395 
8396   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8397     {
8398       int subexp_idx;
8399       struct re_backref_cache_entry *ent;
8400       ent = bkref_ents + limits->elems[lim_idx];
8401 
8402       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
8403 	continue; /* This is unrelated limitation.  */
8404 
8405       subexp_idx = dfa->nodes[ent->node].opr.idx;
8406       if (ent->subexp_to == str_idx)
8407 	{
8408 	  int ops_node = -1;
8409 	  int cls_node = -1;
8410 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8411 	    {
8412 	      int node = dest_nodes->elems[node_idx];
8413 	      re_token_type_t type = dfa->nodes[node].type;
8414 	      if (type == OP_OPEN_SUBEXP
8415 		  && subexp_idx == dfa->nodes[node].opr.idx)
8416 		ops_node = node;
8417 	      else if (type == OP_CLOSE_SUBEXP
8418 		       && subexp_idx == dfa->nodes[node].opr.idx)
8419 		cls_node = node;
8420 	    }
8421 
8422 	  /* Check the limitation of the open subexpression.  */
8423 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
8424 	  if (ops_node >= 0)
8425 	    {
8426 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
8427 					   candidates);
8428 	      if (BE (err != REG_NOERROR, 0))
8429 		return err;
8430 	    }
8431 
8432 	  /* Check the limitation of the close subexpression.  */
8433 	  if (cls_node >= 0)
8434 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8435 	      {
8436 		int node = dest_nodes->elems[node_idx];
8437 		if (!re_node_set_contains (dfa->inveclosures + node,
8438 					   cls_node)
8439 		    && !re_node_set_contains (dfa->eclosures + node,
8440 					      cls_node))
8441 		  {
8442 		    /* It is against this limitation.
8443 		       Remove it form the current sifted state.  */
8444 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8445 						 candidates);
8446 		    if (BE (err != REG_NOERROR, 0))
8447 		      return err;
8448 		    --node_idx;
8449 		  }
8450 	      }
8451 	}
8452       else /* (ent->subexp_to != str_idx)  */
8453 	{
8454 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8455 	    {
8456 	      int node = dest_nodes->elems[node_idx];
8457 	      re_token_type_t type = dfa->nodes[node].type;
8458 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
8459 		{
8460 		  if (subexp_idx != dfa->nodes[node].opr.idx)
8461 		    continue;
8462 		  /* It is against this limitation.
8463 		     Remove it form the current sifted state.  */
8464 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8465 					       candidates);
8466 		  if (BE (err != REG_NOERROR, 0))
8467 		    return err;
8468 		}
8469 	    }
8470 	}
8471     }
8472   return REG_NOERROR;
8473 }
8474 
8475 static reg_errcode_t
8476 internal_function
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,const re_node_set * candidates)8477 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
8478 		   int str_idx, const re_node_set *candidates)
8479 {
8480   const re_dfa_t *const dfa = mctx->dfa;
8481   reg_errcode_t err;
8482   int node_idx, node;
8483   re_sift_context_t local_sctx;
8484   int first_idx = search_cur_bkref_entry (mctx, str_idx);
8485 
8486   if (first_idx == -1)
8487     return REG_NOERROR;
8488 
8489   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
8490 
8491   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
8492     {
8493       int enabled_idx;
8494       re_token_type_t type;
8495       struct re_backref_cache_entry *entry;
8496       node = candidates->elems[node_idx];
8497       type = dfa->nodes[node].type;
8498       /* Avoid infinite loop for the REs like "()\1+".  */
8499       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
8500 	continue;
8501       if (type != OP_BACK_REF)
8502 	continue;
8503 
8504       entry = mctx->bkref_ents + first_idx;
8505       enabled_idx = first_idx;
8506       do
8507 	{
8508 	  int subexp_len;
8509 	  int to_idx;
8510 	  int dst_node;
8511 	  int ret;
8512 	  re_dfastate_t *cur_state;
8513 
8514 	  if (entry->node != node)
8515 	    continue;
8516 	  subexp_len = entry->subexp_to - entry->subexp_from;
8517 	  to_idx = str_idx + subexp_len;
8518 	  dst_node = (subexp_len ? dfa->nexts[node]
8519 		      : dfa->edests[node].elems[0]);
8520 
8521 	  if (to_idx > sctx->last_str_idx
8522 	      || sctx->sifted_states[to_idx] == NULL
8523 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
8524 	      || check_dst_limits (mctx, &sctx->limits, node,
8525 				   str_idx, dst_node, to_idx))
8526 	    continue;
8527 
8528 	  if (local_sctx.sifted_states == NULL)
8529 	    {
8530 	      local_sctx = *sctx;
8531 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
8532 	      if (BE (err != REG_NOERROR, 0))
8533 		goto free_return;
8534 	    }
8535 	  local_sctx.last_node = node;
8536 	  local_sctx.last_str_idx = str_idx;
8537 	  ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
8538 	  if (BE (ret < 0, 0))
8539 	    {
8540 	      err = REG_ESPACE;
8541 	      goto free_return;
8542 	    }
8543 	  cur_state = local_sctx.sifted_states[str_idx];
8544 	  err = sift_states_backward (mctx, &local_sctx);
8545 	  if (BE (err != REG_NOERROR, 0))
8546 	    goto free_return;
8547 	  if (sctx->limited_states != NULL)
8548 	    {
8549 	      err = merge_state_array (dfa, sctx->limited_states,
8550 				       local_sctx.sifted_states,
8551 				       str_idx + 1);
8552 	      if (BE (err != REG_NOERROR, 0))
8553 		goto free_return;
8554 	    }
8555 	  local_sctx.sifted_states[str_idx] = cur_state;
8556 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
8557 
8558 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
8559           entry = mctx->bkref_ents + enabled_idx;
8560 	}
8561       while (enabled_idx++, entry++->more);
8562     }
8563   err = REG_NOERROR;
8564  free_return:
8565   if (local_sctx.sifted_states != NULL)
8566     {
8567       re_node_set_free (&local_sctx.limits);
8568     }
8569 
8570   return err;
8571 }
8572 
8573 
#ifdef RE_ENABLE_I18N
/* Return the number of bytes that node NODE_IDX accepts at STR_IDX
   according to check_node_accept_bytes.  The result is forced to 0 when
   that number is positive, the end of the accepted bytes does not exceed
   MAX_STR_IDX, and the sifted state there does not contain the node's
   successor.  */

static int
internal_function
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
                     int node_idx, int str_idx, int max_str_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const int nbytes
    = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);

  /* The node cannot accept a `multi byte' here.  */
  if (nbytes <= 0)
    return nbytes;

  /* The destination lies beyond the range being sifted.  */
  if (str_idx + nbytes > max_str_idx)
    return nbytes;

  /* The destination is still alive: the node accepts NBYTES bytes.  */
  if (STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + nbytes],
                           dfa->nexts[node_idx]))
    return nbytes;

  /* The destination was already thrown away, so the node could not
     accept the current input `multi byte'.  */
  return 0;
}
#endif /* RE_ENABLE_I18N */
8596 
8597 
8598 /* Functions for state transition.  */
8599 
8600 /* Return the next state to which the current state STATE will transit by
8601    accepting the current input byte, and update STATE_LOG if necessary.
8602    If STATE can accept a multibyte char/collating element/back reference
8603    update the destination of STATE_LOG.  */
8604 
8605 static re_dfastate_t *
8606 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)8607 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
8608 	       re_dfastate_t *state)
8609 {
8610   re_dfastate_t **trtable;
8611   unsigned char ch;
8612 
8613 #ifdef RE_ENABLE_I18N
8614   /* If the current state can accept multibyte.  */
8615   if (BE (state->accept_mb, 0))
8616     {
8617       *err = transit_state_mb (mctx, state);
8618       if (BE (*err != REG_NOERROR, 0))
8619 	return NULL;
8620     }
8621 #endif /* RE_ENABLE_I18N */
8622 
8623   /* Then decide the next state with the single byte.  */
8624 #if 0
8625   if (0)
8626     /* don't use transition table  */
8627     return transit_state_sb (err, mctx, state);
8628 #endif
8629 
8630   /* Use transition table  */
8631   ch = re_string_fetch_byte (&mctx->input);
8632   for (;;)
8633     {
8634       trtable = state->trtable;
8635       if (BE (trtable != NULL, 1))
8636 	return trtable[ch];
8637 
8638       trtable = state->word_trtable;
8639       if (BE (trtable != NULL, 1))
8640         {
8641 	  unsigned int context;
8642 	  context
8643 	    = re_string_context_at (&mctx->input,
8644 				    re_string_cur_idx (&mctx->input) - 1,
8645 				    mctx->eflags);
8646 	  if (IS_WORD_CONTEXT (context))
8647 	    return trtable[ch + SBC_MAX];
8648 	  else
8649 	    return trtable[ch];
8650 	}
8651 
8652       if (!build_trtable (mctx->dfa, state))
8653 	{
8654 	  *err = REG_ESPACE;
8655 	  return NULL;
8656 	}
8657 
8658       /* Retry, we now have a transition table.  */
8659     }
8660 }
8661 
8662 /* Update the state_log if we need */
8663 re_dfastate_t *
8664 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)8665 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
8666 		      re_dfastate_t *next_state)
8667 {
8668   const re_dfa_t *const dfa = mctx->dfa;
8669   int cur_idx = re_string_cur_idx (&mctx->input);
8670 
8671   if (cur_idx > mctx->state_log_top)
8672     {
8673       mctx->state_log[cur_idx] = next_state;
8674       mctx->state_log_top = cur_idx;
8675     }
8676   else if (mctx->state_log[cur_idx] == 0)
8677     {
8678       mctx->state_log[cur_idx] = next_state;
8679     }
8680   else
8681     {
8682       re_dfastate_t *pstate;
8683       unsigned int context;
8684       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
8685       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
8686          the destination of a multibyte char/collating element/
8687          back reference.  Then the next state is the union set of
8688          these destinations and the results of the transition table.  */
8689       pstate = mctx->state_log[cur_idx];
8690       log_nodes = pstate->entrance_nodes;
8691       if (next_state != NULL)
8692         {
8693           table_nodes = next_state->entrance_nodes;
8694           *err = re_node_set_init_union (&next_nodes, table_nodes,
8695 					     log_nodes);
8696           if (BE (*err != REG_NOERROR, 0))
8697 	    return NULL;
8698         }
8699       else
8700         next_nodes = *log_nodes;
8701       /* Note: We already add the nodes of the initial state,
8702 	 then we don't need to add them here.  */
8703 
8704       context = re_string_context_at (&mctx->input,
8705 				      re_string_cur_idx (&mctx->input) - 1,
8706 				      mctx->eflags);
8707       next_state = mctx->state_log[cur_idx]
8708         = re_acquire_state_context (err, dfa, &next_nodes, context);
8709       /* We don't need to check errors here, since the return value of
8710          this function is next_state and ERR is already set.  */
8711 
8712       if (table_nodes != NULL)
8713         re_node_set_free (&next_nodes);
8714     }
8715 
8716   if (BE (dfa->nbackref, 0) && next_state != NULL)
8717     {
8718       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
8719 	 later.  We must check them here, since the back references in the
8720 	 next state might use them.  */
8721       *err = check_subexp_matching_top (mctx, &next_state->nodes,
8722 					cur_idx);
8723       if (BE (*err != REG_NOERROR, 0))
8724 	return NULL;
8725 
8726       /* If the next state has back references.  */
8727       if (next_state->has_backref)
8728 	{
8729 	  *err = transit_state_bkref (mctx, &next_state->nodes);
8730 	  if (BE (*err != REG_NOERROR, 0))
8731 	    return NULL;
8732 	  next_state = mctx->state_log[cur_idx];
8733 	}
8734     }
8735 
8736   return next_state;
8737 }
8738 
8739 /* Skip bytes in the input that correspond to part of a
8740    multi-byte match, then look in the log for a state
8741    from which to restart matching.  */
8742 re_dfastate_t *
8743 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)8744 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
8745 {
8746   re_dfastate_t *cur_state;
8747   do
8748     {
8749       int max = mctx->state_log_top;
8750       int cur_str_idx = re_string_cur_idx (&mctx->input);
8751 
8752       do
8753 	{
8754           if (++cur_str_idx > max)
8755             return NULL;
8756           re_string_skip_bytes (&mctx->input, 1);
8757 	}
8758       while (mctx->state_log[cur_str_idx] == NULL);
8759 
8760       cur_state = merge_state_with_log (err, mctx, NULL);
8761     }
8762   while (*err == REG_NOERROR && cur_state == NULL);
8763   return cur_state;
8764 }
8765 
8766 /* Helper functions for transit_state.  */
8767 
8768 /* From the node set CUR_NODES, pick up the nodes whose types are
8769    OP_OPEN_SUBEXP and which have corresponding back references in the regular
8770    expression. And register them to use them later for evaluating the
8771    correspoding back references.  */
8772 
8773 static reg_errcode_t
8774 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,int str_idx)8775 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
8776 			   int str_idx)
8777 {
8778   const re_dfa_t *const dfa = mctx->dfa;
8779   int node_idx;
8780   reg_errcode_t err;
8781 
8782   /* TODO: This isn't efficient.
8783 	   Because there might be more than one nodes whose types are
8784 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
8785 	   nodes.
8786 	   E.g. RE: (a){2}  */
8787   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
8788     {
8789       int node = cur_nodes->elems[node_idx];
8790       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
8791 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
8792 	  && (dfa->used_bkref_map
8793 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
8794 	{
8795 	  err = match_ctx_add_subtop (mctx, node, str_idx);
8796 	  if (BE (err != REG_NOERROR, 0))
8797 	    return err;
8798 	}
8799     }
8800   return REG_NOERROR;
8801 }
8802 
#if 0
/* NOTE: this function is compiled out; it is the variant of the state
   transition that does not use a transition table.

   Return the next state to which the current state STATE will transit by
   accepting the current input byte, and skip that byte.  NULL is returned
   with *ERR set when a helper fails.  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
                  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const int cur_str_idx = re_string_cur_idx (&mctx->input);
  re_node_set next_nodes;
  re_dfastate_t *next_state;
  unsigned int context;
  int i;

  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;

  /* Collect the epsilon closures of the successors of every node that
     accepts the current byte.  */
  for (i = 0; i < state->nodes.nelem; ++i)
    {
      const int cur_node = state->nodes.elems[i];

      if (!check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
        continue;

      *err = re_node_set_merge (&next_nodes,
                                dfa->eclosures + dfa->nexts[cur_node]);
      if (BE (*err != REG_NOERROR, 0))
        {
          re_node_set_free (&next_nodes);
          return NULL;
        }
    }

  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
  /* No error check is needed: the result is what this function returns
     and ERR has been set by the callee.  */
  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);

  re_node_set_free (&next_nodes);
  re_string_skip_bytes (&mctx->input, 1);
  return next_state;
}
#endif
8844 
#ifdef RE_ENABLE_I18N
/* For every node of PSTATE that is flagged accept_mb and accepts bytes at
   the current input position, add the epsilon closure of its successor to
   the state log at the index just past the accepted bytes.
   Return REG_NOERROR, or the error reported by a helper.  */

static reg_errcode_t
internal_function
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
{
  const re_dfa_t *const dfa = mctx->dfa;
  int i;

  for (i = 0; i < pstate->nodes.nelem; ++i)
    {
      const int cur_node_idx = pstate->nodes.elems[i];
      const re_token_t *tok = dfa->nodes + cur_node_idx;
      const int cur_idx = re_string_cur_idx (&mctx->input);
      re_node_set dest_nodes, *new_nodes;
      re_dfastate_t *logged_state;
      reg_errcode_t err;
      unsigned int context;
      int naccepted, dest_idx;

      if (!tok->accept_mb)
        continue;

      if (tok->constraint)
        {
          context = re_string_context_at (&mctx->input, cur_idx,
                                          mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (tok->constraint, context))
            continue;
        }

      /* How many bytes can the node accept?  */
      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
                                           cur_idx);
      if (naccepted == 0)
        continue;

      /* The node accepts `naccepted' bytes.  */
      dest_idx = cur_idx + naccepted;
      if (mctx->max_mb_elem_len < naccepted)
        mctx->max_mb_elem_len = naccepted;
      err = clean_state_log_if_needed (mctx, dest_idx);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[cur_node_idx] != -1);
#endif
      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];

      /* Without a logged state the closure is used as is (borrowed);
         otherwise a freshly allocated union is built.  */
      logged_state = mctx->state_log[dest_idx];
      if (logged_state != NULL)
        {
          err = re_node_set_init_union (&dest_nodes,
                                        logged_state->entrance_nodes,
                                        new_nodes);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        dest_nodes = *new_nodes;

      context = re_string_context_at (&mctx->input, dest_idx - 1,
                                      mctx->eflags);
      mctx->state_log[dest_idx]
        = re_acquire_state_context (&err, dfa, &dest_nodes, context);

      /* Only the union is owned by this function.  */
      if (logged_state != NULL)
        re_node_set_free (&dest_nodes);
      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
        return err;
    }

  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
8915 
8916 static reg_errcode_t
8917 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)8918 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
8919 {
8920   const re_dfa_t *const dfa = mctx->dfa;
8921   reg_errcode_t err;
8922   int i;
8923   int cur_str_idx = re_string_cur_idx (&mctx->input);
8924 
8925   for (i = 0; i < nodes->nelem; ++i)
8926     {
8927       int dest_str_idx, prev_nelem, bkc_idx;
8928       int node_idx = nodes->elems[i];
8929       unsigned int context;
8930       const re_token_t *node = dfa->nodes + node_idx;
8931       re_node_set *new_dest_nodes;
8932 
8933       /* Check whether `node' is a backreference or not.  */
8934       if (node->type != OP_BACK_REF)
8935 	continue;
8936 
8937       if (node->constraint)
8938 	{
8939 	  context = re_string_context_at (&mctx->input, cur_str_idx,
8940 					  mctx->eflags);
8941 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
8942 	    continue;
8943 	}
8944 
8945       /* `node' is a backreference.
8946 	 Check the substring which the substring matched.  */
8947       bkc_idx = mctx->nbkref_ents;
8948       err = get_subexp (mctx, node_idx, cur_str_idx);
8949       if (BE (err != REG_NOERROR, 0))
8950 	goto free_return;
8951 
8952       /* And add the epsilon closures (which is `new_dest_nodes') of
8953 	 the backreference to appropriate state_log.  */
8954 #ifdef DEBUG
8955       assert (dfa->nexts[node_idx] != -1);
8956 #endif
8957       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
8958 	{
8959 	  int subexp_len;
8960 	  re_dfastate_t *dest_state;
8961 	  struct re_backref_cache_entry *bkref_ent;
8962 	  bkref_ent = mctx->bkref_ents + bkc_idx;
8963 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
8964 	    continue;
8965 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
8966 	  new_dest_nodes = (subexp_len == 0
8967 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
8968 			    : dfa->eclosures + dfa->nexts[node_idx]);
8969 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
8970 			  - bkref_ent->subexp_from);
8971 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
8972 					  mctx->eflags);
8973 	  dest_state = mctx->state_log[dest_str_idx];
8974 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
8975 			: mctx->state_log[cur_str_idx]->nodes.nelem);
8976 	  /* Add `new_dest_node' to state_log.  */
8977 	  if (dest_state == NULL)
8978 	    {
8979 	      mctx->state_log[dest_str_idx]
8980 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
8981 					    context);
8982 	      if (BE (mctx->state_log[dest_str_idx] == NULL
8983 		      && err != REG_NOERROR, 0))
8984 		goto free_return;
8985 	    }
8986 	  else
8987 	    {
8988 	      re_node_set dest_nodes;
8989 	      err = re_node_set_init_union (&dest_nodes,
8990 					    dest_state->entrance_nodes,
8991 					    new_dest_nodes);
8992 	      if (BE (err != REG_NOERROR, 0))
8993 		{
8994 		  re_node_set_free (&dest_nodes);
8995 		  goto free_return;
8996 		}
8997 	      mctx->state_log[dest_str_idx]
8998 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
8999 	      re_node_set_free (&dest_nodes);
9000 	      if (BE (mctx->state_log[dest_str_idx] == NULL
9001 		      && err != REG_NOERROR, 0))
9002 		goto free_return;
9003 	    }
9004 	  /* We need to check recursively if the backreference can epsilon
9005 	     transit.  */
9006 	  if (subexp_len == 0
9007 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
9008 	    {
9009 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
9010 					       cur_str_idx);
9011 	      if (BE (err != REG_NOERROR, 0))
9012 		goto free_return;
9013 	      err = transit_state_bkref (mctx, new_dest_nodes);
9014 	      if (BE (err != REG_NOERROR, 0))
9015 		goto free_return;
9016 	    }
9017 	}
9018     }
9019   err = REG_NOERROR;
9020  free_return:
9021   return err;
9022 }
9023 
9024 /* Enumerate all the candidates which the backreference BKREF_NODE can match
9025    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
9026    Note that we might collect inappropriate candidates here.
9027    However, the cost of checking them strictly here is too high, then we
9028    delay these checking for prune_impossible_nodes().  */
9029 
9030 static reg_errcode_t
9031 internal_function
get_subexp(re_match_context_t * mctx,int bkref_node,int bkref_str_idx)9032 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
9033 {
9034   const re_dfa_t *const dfa = mctx->dfa;
9035   int subexp_num, sub_top_idx;
9036   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
9037   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
9038   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
9039   if (cache_idx != -1)
9040     {
9041       const struct re_backref_cache_entry *entry
9042 	= mctx->bkref_ents + cache_idx;
9043       do
9044         if (entry->node == bkref_node)
9045 	  return REG_NOERROR; /* We already checked it.  */
9046       while (entry++->more);
9047     }
9048 
9049   subexp_num = dfa->nodes[bkref_node].opr.idx;
9050 
9051   /* For each sub expression  */
9052   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
9053     {
9054       reg_errcode_t err;
9055       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
9056       re_sub_match_last_t *sub_last;
9057       int sub_last_idx, sl_str, bkref_str_off;
9058 
9059       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
9060 	continue; /* It isn't related.  */
9061 
9062       sl_str = sub_top->str_idx;
9063       bkref_str_off = bkref_str_idx;
9064       /* At first, check the last node of sub expressions we already
9065 	 evaluated.  */
9066       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
9067 	{
9068 	  int sl_str_diff;
9069 	  sub_last = sub_top->lasts[sub_last_idx];
9070 	  sl_str_diff = sub_last->str_idx - sl_str;
9071 	  /* The matched string by the sub expression match with the substring
9072 	     at the back reference?  */
9073 	  if (sl_str_diff > 0)
9074 	    {
9075 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
9076 		{
9077 		  /* Not enough chars for a successful match.  */
9078 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
9079 		    break;
9080 
9081 		  err = clean_state_log_if_needed (mctx,
9082 						   bkref_str_off
9083 						   + sl_str_diff);
9084 		  if (BE (err != REG_NOERROR, 0))
9085 		    return err;
9086 		  buf = (const char *) re_string_get_buffer (&mctx->input);
9087 		}
9088 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
9089 		/* We don't need to search this sub expression any more.  */
9090 		break;
9091 	    }
9092 	  bkref_str_off += sl_str_diff;
9093 	  sl_str += sl_str_diff;
9094 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9095 				bkref_str_idx);
9096 
9097 	  /* Reload buf, since the preceding call might have reallocated
9098 	     the buffer.  */
9099 	  buf = (const char *) re_string_get_buffer (&mctx->input);
9100 
9101 	  if (err == REG_NOMATCH)
9102 	    continue;
9103 	  if (BE (err != REG_NOERROR, 0))
9104 	    return err;
9105 	}
9106 
9107       if (sub_last_idx < sub_top->nlasts)
9108 	continue;
9109       if (sub_last_idx > 0)
9110 	++sl_str;
9111       /* Then, search for the other last nodes of the sub expression.  */
9112       for (; sl_str <= bkref_str_idx; ++sl_str)
9113 	{
9114 	  int cls_node, sl_str_off;
9115 	  const re_node_set *nodes;
9116 	  sl_str_off = sl_str - sub_top->str_idx;
9117 	  /* The matched string by the sub expression match with the substring
9118 	     at the back reference?  */
9119 	  if (sl_str_off > 0)
9120 	    {
9121 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
9122 		{
9123 		  /* If we are at the end of the input, we cannot match.  */
9124 		  if (bkref_str_off >= mctx->input.len)
9125 		    break;
9126 
9127 		  err = extend_buffers (mctx);
9128 		  if (BE (err != REG_NOERROR, 0))
9129 		    return err;
9130 
9131 		  buf = (const char *) re_string_get_buffer (&mctx->input);
9132 		}
9133 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
9134 		break; /* We don't need to search this sub expression
9135 			  any more.  */
9136 	    }
9137 	  if (mctx->state_log[sl_str] == NULL)
9138 	    continue;
9139 	  /* Does this state have a ')' of the sub expression?  */
9140 	  nodes = &mctx->state_log[sl_str]->nodes;
9141 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
9142 				       OP_CLOSE_SUBEXP);
9143 	  if (cls_node == -1)
9144 	    continue; /* No.  */
9145 	  if (sub_top->path == NULL)
9146 	    {
9147 	      sub_top->path = calloc (sizeof (state_array_t),
9148 				      sl_str - sub_top->str_idx + 1);
9149 	      if (sub_top->path == NULL)
9150 		return REG_ESPACE;
9151 	    }
9152 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
9153 	     in the current context?  */
9154 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
9155 			       sub_top->str_idx, cls_node, sl_str,
9156 			       OP_CLOSE_SUBEXP);
9157 	  if (err == REG_NOMATCH)
9158 	      continue;
9159 	  if (BE (err != REG_NOERROR, 0))
9160 	      return err;
9161 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
9162 	  if (BE (sub_last == NULL, 0))
9163 	    return REG_ESPACE;
9164 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9165 				bkref_str_idx);
9166 	  if (err == REG_NOMATCH)
9167 	    continue;
9168 	}
9169     }
9170   return REG_NOERROR;
9171 }
9172 
9173 /* Helper functions for get_subexp().  */
9174 
9175 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
9176    If it can arrive, register the sub expression expressed with SUB_TOP
9177    and SUB_LAST.  */
9178 
9179 static reg_errcode_t
9180 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,int bkref_node,int bkref_str)9181 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
9182 		re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
9183 {
9184   reg_errcode_t err;
9185   int to_idx;
9186   /* Can the subexpression arrive the back reference?  */
9187   err = check_arrival (mctx, &sub_last->path, sub_last->node,
9188 		       sub_last->str_idx, bkref_node, bkref_str,
9189 		       OP_OPEN_SUBEXP);
9190   if (err != REG_NOERROR)
9191     return err;
9192   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
9193 			     sub_last->str_idx);
9194   if (BE (err != REG_NOERROR, 0))
9195     return err;
9196   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
9197   return clean_state_log_if_needed (mctx, to_idx);
9198 }
9199 
9200 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
9201    Search '(' if FL_OPEN, or search ')' otherwise.
9202    TODO: This function isn't efficient...
9203 	 Because there might be more than one nodes whose types are
9204 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
9205 	 nodes.
9206 	 E.g. RE: (a){2}  */
9207 
9208 static int
9209 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,int subexp_idx,int type)9210 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
9211 		  int subexp_idx, int type)
9212 {
9213   int cls_idx;
9214   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
9215     {
9216       int cls_node = nodes->elems[cls_idx];
9217       const re_token_t *node = dfa->nodes + cls_node;
9218       if (node->type == type
9219 	  && node->opr.idx == subexp_idx)
9220 	return cls_node;
9221     }
9222   return -1;
9223 }
9224 
9225 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
9226    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
9227    heavily reused.
9228    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
9229 
9230 static reg_errcode_t
9231 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,int top_node,int top_str,int last_node,int last_str,int type)9232 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
9233 	       int top_str, int last_node, int last_str, int type)
9234 {
9235   const re_dfa_t *const dfa = mctx->dfa;
9236   reg_errcode_t err = REG_NOERROR;
9237   int subexp_num, backup_cur_idx, str_idx, null_cnt;
9238   re_dfastate_t *cur_state = NULL;
9239   re_node_set *cur_nodes, next_nodes;
9240   re_dfastate_t **backup_state_log;
9241   unsigned int context;
9242 
9243   subexp_num = dfa->nodes[top_node].opr.idx;
9244   /* Extend the buffer if we need.  */
9245   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
9246     {
9247       re_dfastate_t **new_array;
9248       int old_alloc = path->alloc;
9249       path->alloc += last_str + mctx->max_mb_elem_len + 1;
9250       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
9251       if (BE (new_array == NULL, 0))
9252 	{
9253 	  path->alloc = old_alloc;
9254 	  return REG_ESPACE;
9255 	}
9256       path->array = new_array;
9257       memset (new_array + old_alloc, '\0',
9258 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
9259     }
9260 
9261   str_idx = path->next_idx ? path->next_idx : top_str;
9262 
9263   /* Temporary modify MCTX.  */
9264   backup_state_log = mctx->state_log;
9265   backup_cur_idx = mctx->input.cur_idx;
9266   mctx->state_log = path->array;
9267   mctx->input.cur_idx = str_idx;
9268 
9269   /* Setup initial node set.  */
9270   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9271   if (str_idx == top_str)
9272     {
9273       err = re_node_set_init_1 (&next_nodes, top_node);
9274       if (BE (err != REG_NOERROR, 0))
9275 	return err;
9276       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9277       if (BE (err != REG_NOERROR, 0))
9278 	{
9279 	  re_node_set_free (&next_nodes);
9280 	  return err;
9281 	}
9282     }
9283   else
9284     {
9285       cur_state = mctx->state_log[str_idx];
9286       if (cur_state && cur_state->has_backref)
9287 	{
9288 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
9289 	  if (BE (err != REG_NOERROR, 0))
9290 	    return err;
9291 	}
9292       else
9293 	re_node_set_init_empty (&next_nodes);
9294     }
9295   if (str_idx == top_str || (cur_state && cur_state->has_backref))
9296     {
9297       if (next_nodes.nelem)
9298 	{
9299 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9300 				    subexp_num, type);
9301 	  if (BE (err != REG_NOERROR, 0))
9302 	    {
9303 	      re_node_set_free (&next_nodes);
9304 	      return err;
9305 	    }
9306 	}
9307       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9308       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9309 	{
9310 	  re_node_set_free (&next_nodes);
9311 	  return err;
9312 	}
9313       mctx->state_log[str_idx] = cur_state;
9314     }
9315 
9316   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
9317     {
9318       re_node_set_empty (&next_nodes);
9319       if (mctx->state_log[str_idx + 1])
9320 	{
9321 	  err = re_node_set_merge (&next_nodes,
9322 				   &mctx->state_log[str_idx + 1]->nodes);
9323 	  if (BE (err != REG_NOERROR, 0))
9324 	    {
9325 	      re_node_set_free (&next_nodes);
9326 	      return err;
9327 	    }
9328 	}
9329       if (cur_state)
9330 	{
9331 	  err = check_arrival_add_next_nodes (mctx, str_idx,
9332 					      &cur_state->non_eps_nodes,
9333 					      &next_nodes);
9334 	  if (BE (err != REG_NOERROR, 0))
9335 	    {
9336 	      re_node_set_free (&next_nodes);
9337 	      return err;
9338 	    }
9339 	}
9340       ++str_idx;
9341       if (next_nodes.nelem)
9342 	{
9343 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9344 	  if (BE (err != REG_NOERROR, 0))
9345 	    {
9346 	      re_node_set_free (&next_nodes);
9347 	      return err;
9348 	    }
9349 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9350 				    subexp_num, type);
9351 	  if (BE (err != REG_NOERROR, 0))
9352 	    {
9353 	      re_node_set_free (&next_nodes);
9354 	      return err;
9355 	    }
9356 	}
9357       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9358       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9359       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9360 	{
9361 	  re_node_set_free (&next_nodes);
9362 	  return err;
9363 	}
9364       mctx->state_log[str_idx] = cur_state;
9365       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
9366     }
9367   re_node_set_free (&next_nodes);
9368   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
9369 	       : &mctx->state_log[last_str]->nodes);
9370   path->next_idx = str_idx;
9371 
9372   /* Fix MCTX.  */
9373   mctx->state_log = backup_state_log;
9374   mctx->input.cur_idx = backup_cur_idx;
9375 
9376   /* Then check the current node set has the node LAST_NODE.  */
9377   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
9378     return REG_NOERROR;
9379 
9380   return REG_NOMATCH;
9381 }
9382 
9383 /* Helper functions for check_arrival.  */
9384 
9385 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
9386    to NEXT_NODES.
9387    TODO: This function is similar to the functions transit_state*(),
9388 	 however this function has many additional works.
9389 	 Can't we unify them?  */
9390 
9391 static reg_errcode_t
9392 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,int str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)9393 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
9394 			      re_node_set *cur_nodes, re_node_set *next_nodes)
9395 {
9396   const re_dfa_t *const dfa = mctx->dfa;
9397   int result;
9398   int cur_idx;
9399   reg_errcode_t err = REG_NOERROR;
9400   re_node_set union_set;
9401   re_node_set_init_empty (&union_set);
9402   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
9403     {
9404       int naccepted = 0;
9405       int cur_node = cur_nodes->elems[cur_idx];
9406 #ifdef DEBUG
9407       re_token_type_t type = dfa->nodes[cur_node].type;
9408       assert (!IS_EPSILON_NODE (type));
9409 #endif
9410 #ifdef RE_ENABLE_I18N
9411       /* If the node may accept `multi byte'.  */
9412       if (dfa->nodes[cur_node].accept_mb)
9413 	{
9414 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
9415 					       str_idx);
9416 	  if (naccepted > 1)
9417 	    {
9418 	      re_dfastate_t *dest_state;
9419 	      int next_node = dfa->nexts[cur_node];
9420 	      int next_idx = str_idx + naccepted;
9421 	      dest_state = mctx->state_log[next_idx];
9422 	      re_node_set_empty (&union_set);
9423 	      if (dest_state)
9424 		{
9425 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
9426 		  if (BE (err != REG_NOERROR, 0))
9427 		    {
9428 		      re_node_set_free (&union_set);
9429 		      return err;
9430 		    }
9431 		}
9432 	      result = re_node_set_insert (&union_set, next_node);
9433 	      if (BE (result < 0, 0))
9434 		{
9435 		  re_node_set_free (&union_set);
9436 		  return REG_ESPACE;
9437 		}
9438 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
9439 							    &union_set);
9440 	      if (BE (mctx->state_log[next_idx] == NULL
9441 		      && err != REG_NOERROR, 0))
9442 		{
9443 		  re_node_set_free (&union_set);
9444 		  return err;
9445 		}
9446 	    }
9447 	}
9448 #endif /* RE_ENABLE_I18N */
9449       if (naccepted
9450 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
9451 	{
9452 	  result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
9453 	  if (BE (result < 0, 0))
9454 	    {
9455 	      re_node_set_free (&union_set);
9456 	      return REG_ESPACE;
9457 	    }
9458 	}
9459     }
9460   re_node_set_free (&union_set);
9461   return REG_NOERROR;
9462 }
9463 
9464 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
9465    CUR_NODES, however exclude the nodes which are:
9466     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
9467     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
9468 */
9469 
9470 static reg_errcode_t
9471 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,int ex_subexp,int type)9472 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
9473 			  int ex_subexp, int type)
9474 {
9475   reg_errcode_t err;
9476   int idx, outside_node;
9477   re_node_set new_nodes;
9478 #ifdef DEBUG
9479   assert (cur_nodes->nelem);
9480 #endif
9481   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
9482   if (BE (err != REG_NOERROR, 0))
9483     return err;
9484   /* Create a new node set NEW_NODES with the nodes which are epsilon
9485      closures of the node in CUR_NODES.  */
9486 
9487   for (idx = 0; idx < cur_nodes->nelem; ++idx)
9488     {
9489       int cur_node = cur_nodes->elems[idx];
9490       const re_node_set *eclosure = dfa->eclosures + cur_node;
9491       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
9492       if (outside_node == -1)
9493 	{
9494 	  /* There are no problematic nodes, just merge them.  */
9495 	  err = re_node_set_merge (&new_nodes, eclosure);
9496 	  if (BE (err != REG_NOERROR, 0))
9497 	    {
9498 	      re_node_set_free (&new_nodes);
9499 	      return err;
9500 	    }
9501 	}
9502       else
9503 	{
9504 	  /* There are problematic nodes, re-calculate incrementally.  */
9505 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
9506 					      ex_subexp, type);
9507 	  if (BE (err != REG_NOERROR, 0))
9508 	    {
9509 	      re_node_set_free (&new_nodes);
9510 	      return err;
9511 	    }
9512 	}
9513     }
9514   re_node_set_free (cur_nodes);
9515   *cur_nodes = new_nodes;
9516   return REG_NOERROR;
9517 }
9518 
9519 /* Helper function for check_arrival_expand_ecl.
9520    Check incrementally the epsilon closure of TARGET, and if it isn't
9521    problematic append it to DST_NODES.  */
9522 
9523 static reg_errcode_t
9524 internal_function
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,int target,int ex_subexp,int type)9525 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
9526 			      int target, int ex_subexp, int type)
9527 {
9528   int cur_node;
9529   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
9530     {
9531       int err;
9532 
9533       if (dfa->nodes[cur_node].type == type
9534 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
9535 	{
9536 	  if (type == OP_CLOSE_SUBEXP)
9537 	    {
9538 	      err = re_node_set_insert (dst_nodes, cur_node);
9539 	      if (BE (err == -1, 0))
9540 		return REG_ESPACE;
9541 	    }
9542 	  break;
9543 	}
9544       err = re_node_set_insert (dst_nodes, cur_node);
9545       if (BE (err == -1, 0))
9546 	return REG_ESPACE;
9547       if (dfa->edests[cur_node].nelem == 0)
9548 	break;
9549       if (dfa->edests[cur_node].nelem == 2)
9550 	{
9551 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
9552 					      dfa->edests[cur_node].elems[1],
9553 					      ex_subexp, type);
9554 	  if (BE (err != REG_NOERROR, 0))
9555 	    return err;
9556 	}
9557       cur_node = dfa->edests[cur_node].elems[0];
9558     }
9559   return REG_NOERROR;
9560 }
9561 
9562 
9563 /* For all the back references in the current state, calculate the
9564    destination of the back references by the appropriate entry
9565    in MCTX->BKREF_ENTS.  */
9566 
9567 static reg_errcode_t
9568 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,int cur_str,int subexp_num,int type)9569 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
9570 		    int cur_str, int subexp_num, int type)
9571 {
9572   const re_dfa_t *const dfa = mctx->dfa;
9573   reg_errcode_t err;
9574   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
9575   struct re_backref_cache_entry *ent;
9576 
9577   if (cache_idx_start == -1)
9578     return REG_NOERROR;
9579 
9580  restart:
9581   ent = mctx->bkref_ents + cache_idx_start;
9582   do
9583     {
9584       int to_idx, next_node;
9585 
9586       /* Is this entry ENT is appropriate?  */
9587       if (!re_node_set_contains (cur_nodes, ent->node))
9588 	continue; /* No.  */
9589 
9590       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
9591       /* Calculate the destination of the back reference, and append it
9592 	 to MCTX->STATE_LOG.  */
9593       if (to_idx == cur_str)
9594 	{
9595 	  /* The backreference did epsilon transit, we must re-check all the
9596 	     node in the current state.  */
9597 	  re_node_set new_dests;
9598 	  reg_errcode_t err2, err3;
9599 	  next_node = dfa->edests[ent->node].elems[0];
9600 	  if (re_node_set_contains (cur_nodes, next_node))
9601 	    continue;
9602 	  err = re_node_set_init_1 (&new_dests, next_node);
9603 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
9604 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
9605 	  re_node_set_free (&new_dests);
9606 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
9607 		  || err3 != REG_NOERROR, 0))
9608 	    {
9609 	      err = (err != REG_NOERROR ? err
9610 		     : (err2 != REG_NOERROR ? err2 : err3));
9611 	      return err;
9612 	    }
9613 	  /* TODO: It is still inefficient...  */
9614 	  goto restart;
9615 	}
9616       else
9617 	{
9618 	  re_node_set union_set;
9619 	  next_node = dfa->nexts[ent->node];
9620 	  if (mctx->state_log[to_idx])
9621 	    {
9622 	      int ret;
9623 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
9624 					next_node))
9625 		continue;
9626 	      err = re_node_set_init_copy (&union_set,
9627 					   &mctx->state_log[to_idx]->nodes);
9628 	      ret = re_node_set_insert (&union_set, next_node);
9629 	      if (BE (err != REG_NOERROR || ret < 0, 0))
9630 		{
9631 		  re_node_set_free (&union_set);
9632 		  err = err != REG_NOERROR ? err : REG_ESPACE;
9633 		  return err;
9634 		}
9635 	    }
9636 	  else
9637 	    {
9638 	      err = re_node_set_init_1 (&union_set, next_node);
9639 	      if (BE (err != REG_NOERROR, 0))
9640 		return err;
9641 	    }
9642 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
9643 	  re_node_set_free (&union_set);
9644 	  if (BE (mctx->state_log[to_idx] == NULL
9645 		  && err != REG_NOERROR, 0))
9646 	    return err;
9647 	}
9648     }
9649   while (ent++->more);
9650   return REG_NOERROR;
9651 }
9652 
9653 /* Build transition table for the state.
9654    Return 1 if succeeded, otherwise return NULL.  */
9655 
9656 static int
9657 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)9658 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
9659 {
9660   reg_errcode_t err;
9661   int i, j, ch, need_word_trtable = 0;
9662   bitset_word_t elem, mask;
9663   bool dests_node_malloced = false;
9664   bool dest_states_malloced = false;
9665   int ndests; /* Number of the destination states from `state'.  */
9666   re_dfastate_t **trtable;
9667   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
9668   re_node_set follows, *dests_node;
9669   bitset_t *dests_ch;
9670   bitset_t acceptable;
9671 
9672   struct dests_alloc
9673   {
9674     re_node_set dests_node[SBC_MAX];
9675     bitset_t dests_ch[SBC_MAX];
9676   } *dests_alloc;
9677 
9678   /* We build DFA states which corresponds to the destination nodes
9679      from `state'.  `dests_node[i]' represents the nodes which i-th
9680      destination state contains, and `dests_ch[i]' represents the
9681      characters which i-th destination state accepts.  */
9682   if (__libc_use_alloca (sizeof (struct dests_alloc)))
9683     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
9684   else
9685     {
9686       dests_alloc = re_malloc (struct dests_alloc, 1);
9687       if (BE (dests_alloc == NULL, 0))
9688 	return 0;
9689       dests_node_malloced = true;
9690     }
9691   dests_node = dests_alloc->dests_node;
9692   dests_ch = dests_alloc->dests_ch;
9693 
9694   /* Initialize transiton table.  */
9695   state->word_trtable = state->trtable = NULL;
9696 
9697   /* At first, group all nodes belonging to `state' into several
9698      destinations.  */
9699   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
9700   if (BE (ndests <= 0, 0))
9701     {
9702       if (dests_node_malloced)
9703 	free (dests_alloc);
9704       /* Return 0 in case of an error, 1 otherwise.  */
9705       if (ndests == 0)
9706 	{
9707 	  state->trtable = (re_dfastate_t **)
9708 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
9709 	  return 1;
9710 	}
9711       return 0;
9712     }
9713 
9714   err = re_node_set_alloc (&follows, ndests + 1);
9715   if (BE (err != REG_NOERROR, 0))
9716     goto out_free;
9717 
9718   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
9719 			 + ndests * 3 * sizeof (re_dfastate_t *)))
9720     dest_states = (re_dfastate_t **)
9721       alloca (ndests * 3 * sizeof (re_dfastate_t *));
9722   else
9723     {
9724       dest_states = (re_dfastate_t **)
9725 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
9726       if (BE (dest_states == NULL, 0))
9727 	{
9728 out_free:
9729 	  if (dest_states_malloced)
9730 	    free (dest_states);
9731 	  re_node_set_free (&follows);
9732 	  for (i = 0; i < ndests; ++i)
9733 	    re_node_set_free (dests_node + i);
9734 	  if (dests_node_malloced)
9735 	    free (dests_alloc);
9736 	  return 0;
9737 	}
9738       dest_states_malloced = true;
9739     }
9740   dest_states_word = dest_states + ndests;
9741   dest_states_nl = dest_states_word + ndests;
9742   bitset_empty (acceptable);
9743 
9744   /* Then build the states for all destinations.  */
9745   for (i = 0; i < ndests; ++i)
9746     {
9747       int next_node;
9748       re_node_set_empty (&follows);
9749       /* Merge the follows of this destination states.  */
9750       for (j = 0; j < dests_node[i].nelem; ++j)
9751 	{
9752 	  next_node = dfa->nexts[dests_node[i].elems[j]];
9753 	  if (next_node != -1)
9754 	    {
9755 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
9756 	      if (BE (err != REG_NOERROR, 0))
9757 		goto out_free;
9758 	    }
9759 	}
9760       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
9761       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
9762 	goto out_free;
9763       /* If the new state has context constraint,
9764 	 build appropriate states for these contexts.  */
9765       if (dest_states[i]->has_constraint)
9766 	{
9767 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
9768 							  CONTEXT_WORD);
9769 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
9770 	    goto out_free;
9771 
9772 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
9773 	    need_word_trtable = 1;
9774 
9775 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
9776 							CONTEXT_NEWLINE);
9777 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
9778 	    goto out_free;
9779  	}
9780       else
9781 	{
9782 	  dest_states_word[i] = dest_states[i];
9783 	  dest_states_nl[i] = dest_states[i];
9784 	}
9785       bitset_merge (acceptable, dests_ch[i]);
9786     }
9787 
9788   if (!BE (need_word_trtable, 0))
9789     {
9790       /* We don't care about whether the following character is a word
9791 	 character, or we are in a single-byte character set so we can
9792 	 discern by looking at the character code: allocate a
9793 	 256-entry transition table.  */
9794       trtable = state->trtable =
9795 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
9796       if (BE (trtable == NULL, 0))
9797 	goto out_free;
9798 
9799       /* For all characters ch...:  */
9800       for (i = 0; i < BITSET_WORDS; ++i)
9801 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9802 	     elem;
9803 	     mask <<= 1, elem >>= 1, ++ch)
9804 	  if (BE (elem & 1, 0))
9805 	    {
9806 	      /* There must be exactly one destination which accepts
9807 		 character ch.  See group_nodes_into_DFAstates.  */
9808 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9809 		;
9810 
9811 	      /* j-th destination accepts the word character ch.  */
9812 	      if (dfa->word_char[i] & mask)
9813 		trtable[ch] = dest_states_word[j];
9814 	      else
9815 		trtable[ch] = dest_states[j];
9816 	    }
9817     }
9818   else
9819     {
9820       /* We care about whether the following character is a word
9821 	 character, and we are in a multi-byte character set: discern
9822 	 by looking at the character code: build two 256-entry
9823 	 transition tables, one starting at trtable[0] and one
9824 	 starting at trtable[SBC_MAX].  */
9825       trtable = state->word_trtable =
9826 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
9827       if (BE (trtable == NULL, 0))
9828 	goto out_free;
9829 
9830       /* For all characters ch...:  */
9831       for (i = 0; i < BITSET_WORDS; ++i)
9832 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9833 	     elem;
9834 	     mask <<= 1, elem >>= 1, ++ch)
9835 	  if (BE (elem & 1, 0))
9836 	    {
9837 	      /* There must be exactly one destination which accepts
9838 		 character ch.  See group_nodes_into_DFAstates.  */
9839 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9840 		;
9841 
9842 	      /* j-th destination accepts the word character ch.  */
9843 	      trtable[ch] = dest_states[j];
9844 	      trtable[ch + SBC_MAX] = dest_states_word[j];
9845 	    }
9846     }
9847 
9848   /* new line */
9849   if (bitset_contain (acceptable, NEWLINE_CHAR))
9850     {
9851       /* The current state accepts newline character.  */
9852       for (j = 0; j < ndests; ++j)
9853 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
9854 	  {
9855 	    /* k-th destination accepts newline character.  */
9856 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
9857 	    if (need_word_trtable)
9858 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
9859 	    /* There must be only one destination which accepts
9860 	       newline.  See group_nodes_into_DFAstates.  */
9861 	    break;
9862 	  }
9863     }
9864 
9865   if (dest_states_malloced)
9866     free (dest_states);
9867 
9868   re_node_set_free (&follows);
9869   for (i = 0; i < ndests; ++i)
9870     re_node_set_free (dests_node + i);
9871 
9872   if (dests_node_malloced)
9873     free (dests_alloc);
9874 
9875   return 1;
9876 }
9877 
9878 /* Group all nodes belonging to STATE into several destinations.
9879    Then for all destinations, set the nodes belonging to the destination
9880    to DESTS_NODE[i] and set the characters accepted by the destination
9881    to DEST_CH[i].  This function return the number of destinations.  */
9882 
9883 static int
9884 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)9885 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
9886 			    re_node_set *dests_node, bitset_t *dests_ch)
9887 {
9888   reg_errcode_t err;
9889   int result;
9890   int i, j, k;
9891   int ndests; /* Number of the destinations from `state'.  */
9892   bitset_t accepts; /* Characters a node can accept.  */
9893   const re_node_set *cur_nodes = &state->nodes;
9894   bitset_empty (accepts);
9895   ndests = 0;
9896 
9897   /* For all the nodes belonging to `state',  */
9898   for (i = 0; i < cur_nodes->nelem; ++i)
9899     {
9900       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
9901       re_token_type_t type = node->type;
9902       unsigned int constraint = node->constraint;
9903 
9904       /* Enumerate all single byte character this node can accept.  */
9905       if (type == CHARACTER)
9906 	bitset_set (accepts, node->opr.c);
9907       else if (type == SIMPLE_BRACKET)
9908 	{
9909 	  bitset_merge (accepts, node->opr.sbcset);
9910 	}
9911       else if (type == OP_PERIOD)
9912 	{
9913 #ifdef RE_ENABLE_I18N
9914 	  if (dfa->mb_cur_max > 1)
9915 	    bitset_merge (accepts, dfa->sb_char);
9916 	  else
9917 #endif
9918 	    bitset_set_all (accepts);
9919 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
9920 	    bitset_clear (accepts, '\n');
9921 	  if (dfa->syntax & RE_DOT_NOT_NULL)
9922 	    bitset_clear (accepts, '\0');
9923 	}
9924 #ifdef RE_ENABLE_I18N
9925       else if (type == OP_UTF8_PERIOD)
9926         {
9927 	  memset (accepts, '\xff', sizeof (bitset_t) / 2);
9928 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
9929 	    bitset_clear (accepts, '\n');
9930 	  if (dfa->syntax & RE_DOT_NOT_NULL)
9931 	    bitset_clear (accepts, '\0');
9932         }
9933 #endif
9934       else
9935 	continue;
9936 
9937       /* Check the `accepts' and sift the characters which are not
9938 	 match it the context.  */
9939       if (constraint)
9940 	{
9941 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
9942 	    {
9943 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
9944 	      bitset_empty (accepts);
9945 	      if (accepts_newline)
9946 		bitset_set (accepts, NEWLINE_CHAR);
9947 	      else
9948 		continue;
9949 	    }
9950 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
9951 	    {
9952 	      bitset_empty (accepts);
9953 	      continue;
9954 	    }
9955 
9956 	  if (constraint & NEXT_WORD_CONSTRAINT)
9957 	    {
9958 	      bitset_word_t any_set = 0;
9959 	      if (type == CHARACTER && !node->word_char)
9960 		{
9961 		  bitset_empty (accepts);
9962 		  continue;
9963 		}
9964 #ifdef RE_ENABLE_I18N
9965 	      if (dfa->mb_cur_max > 1)
9966 		for (j = 0; j < BITSET_WORDS; ++j)
9967 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
9968 	      else
9969 #endif
9970 		for (j = 0; j < BITSET_WORDS; ++j)
9971 		  any_set |= (accepts[j] &= dfa->word_char[j]);
9972 	      if (!any_set)
9973 		continue;
9974 	    }
9975 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
9976 	    {
9977 	      bitset_word_t any_set = 0;
9978 	      if (type == CHARACTER && node->word_char)
9979 		{
9980 		  bitset_empty (accepts);
9981 		  continue;
9982 		}
9983 #ifdef RE_ENABLE_I18N
9984 	      if (dfa->mb_cur_max > 1)
9985 		for (j = 0; j < BITSET_WORDS; ++j)
9986 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
9987 	      else
9988 #endif
9989 		for (j = 0; j < BITSET_WORDS; ++j)
9990 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
9991 	      if (!any_set)
9992 		continue;
9993 	    }
9994 	}
9995 
9996       /* Then divide `accepts' into DFA states, or create a new
9997 	 state.  Above, we make sure that accepts is not empty.  */
9998       for (j = 0; j < ndests; ++j)
9999 	{
10000 	  bitset_t intersec; /* Intersection sets, see below.  */
10001 	  bitset_t remains;
10002 	  /* Flags, see below.  */
10003 	  bitset_word_t has_intersec, not_subset, not_consumed;
10004 
10005 	  /* Optimization, skip if this state doesn't accept the character.  */
10006 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
10007 	    continue;
10008 
10009 	  /* Enumerate the intersection set of this state and `accepts'.  */
10010 	  has_intersec = 0;
10011 	  for (k = 0; k < BITSET_WORDS; ++k)
10012 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
10013 	  /* And skip if the intersection set is empty.  */
10014 	  if (!has_intersec)
10015 	    continue;
10016 
10017 	  /* Then check if this state is a subset of `accepts'.  */
10018 	  not_subset = not_consumed = 0;
10019 	  for (k = 0; k < BITSET_WORDS; ++k)
10020 	    {
10021 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
10022 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
10023 	    }
10024 
10025 	  /* If this state isn't a subset of `accepts', create a
10026 	     new group state, which has the `remains'. */
10027 	  if (not_subset)
10028 	    {
10029 	      bitset_copy (dests_ch[ndests], remains);
10030 	      bitset_copy (dests_ch[j], intersec);
10031 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
10032 	      if (BE (err != REG_NOERROR, 0))
10033 		goto error_return;
10034 	      ++ndests;
10035 	    }
10036 
10037 	  /* Put the position in the current group. */
10038 	  result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
10039 	  if (BE (result < 0, 0))
10040 	    goto error_return;
10041 
10042 	  /* If all characters are consumed, go to next node. */
10043 	  if (!not_consumed)
10044 	    break;
10045 	}
10046       /* Some characters remain, create a new group. */
10047       if (j == ndests)
10048 	{
10049 	  bitset_copy (dests_ch[ndests], accepts);
10050 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
10051 	  if (BE (err != REG_NOERROR, 0))
10052 	    goto error_return;
10053 	  ++ndests;
10054 	  bitset_empty (accepts);
10055 	}
10056     }
10057   return ndests;
10058  error_return:
10059   for (j = 0; j < ndests; ++j)
10060     re_node_set_free (dests_node + j);
10061   return -1;
10062 }
10063 
10064 #ifdef RE_ENABLE_I18N
/* Check how many bytes the node `dfa->nodes[node_idx]' accepts at
   position STR_IDX of INPUT.
   Return the number of bytes the node accepts there, or 0 if the node
   does not accept the input at that position.

   This function handles the node types that can accept one whole
   character or one collating element (OP_UTF8_PERIOD, OP_PERIOD and
   COMPLEX_BRACKET, e.g. '.' or '[a-z]'), unlike the other node types,
   which can only accept a single byte.  Any other node type yields 0.  */

static int
internal_function
check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
			 const re_string_t *input, int str_idx)
{
  const re_token_t *node = dfa->nodes + node_idx;
  int char_len, elem_len;
  int i;

  /* '.' compiled for UTF-8: decode the sequence length directly from the
     bytes instead of consulting the wide-character buffer.  */
  if (BE (node->type == OP_UTF8_PERIOD, 0))
    {
      unsigned char c = re_string_byte_at (input, str_idx), d;
      /* Lead bytes below 0xc2 are never accepted by this function.  */
      if (BE (c < 0xc2, 1))
	return 0;

      /* At least one more byte is needed after the lead byte.  */
      if (str_idx + 2 > input->len)
	return 0;

      d = re_string_byte_at (input, str_idx + 1);
      /* Two-byte sequence: only the second byte has to be checked.  */
      if (c < 0xe0)
	return (d < 0x80 || d > 0xbf) ? 0 : 2;
      /* For longer sequences, pick the length from the lead byte and
	 reject a second byte that is too small for the smallest lead byte
	 of that length (an overlong form in UTF-8).  */
      else if (c < 0xf0)
	{
	  char_len = 3;
	  if (c == 0xe0 && d < 0xa0)
	    return 0;
	}
      else if (c < 0xf8)
	{
	  char_len = 4;
	  if (c == 0xf0 && d < 0x90)
	    return 0;
	}
      else if (c < 0xfc)
	{
	  char_len = 5;
	  if (c == 0xf8 && d < 0x88)
	    return 0;
	}
      else if (c < 0xfe)
	{
	  char_len = 6;
	  if (c == 0xfc && d < 0x84)
	    return 0;
	}
      else
	return 0;

      /* The whole sequence must lie inside the input.  */
      if (str_idx + char_len > input->len)
	return 0;

      /* Every byte after the lead byte must be in 0x80..0xbf.  */
      for (i = 1; i < char_len; ++i)
	{
	  d = re_string_byte_at (input, str_idx + i);
	  if (d < 0x80 || d > 0xbf)
	    return 0;
	}
      return char_len;
    }

  char_len = re_string_char_size_at (input, str_idx);
  if (node->type == OP_PERIOD)
    {
      /* Characters of at most one byte are not accepted here.  */
      if (char_len <= 1)
        return 0;
      /* FIXME: I don't think this if is needed, as both '\n'
	 and '\0' are char_len == 1.  */
      /* '.' accepts any one character except the following two cases.  */
      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
	   re_string_byte_at (input, str_idx) == '\n') ||
	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
	   re_string_byte_at (input, str_idx) == '\0'))
	return 0;
      return char_len;
    }

  /* Nothing to do unless the character or the collating element at
     STR_IDX is longer than one byte (and there is a character at all).  */
  elem_len = re_string_elem_size_at (input, str_idx);
  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
    return 0;

  if (node->type == COMPLEX_BRACKET)
    {
      const re_charset_t *cset = node->opr.mbcset;
# ifdef _LIBC
      const unsigned char *pin
	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
      int j;
      uint32_t nrules;
# endif /* _LIBC */
      /* Length of the match found so far; 0 means no member matched.  */
      int match_len = 0;
      /* The wide character is fetched only if some test below uses it.  */
      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
		    ? re_string_wchar_at (input, str_idx) : 0);

      /* match with multibyte character?  */
      for (i = 0; i < cset->nmbchars; ++i)
	if (wc == cset->mbchars[i])
	  {
	    match_len = char_len;
	    goto check_node_accept_bytes_match;
	  }
      /* match with character_class?  */
      for (i = 0; i < cset->nchar_classes; ++i)
	{
	  wctype_t wt = cset->char_classes[i];
	  if (__iswctype (wc, wt))
	    {
	      match_len = char_len;
	      goto check_node_accept_bytes_match;
	    }
	}

# ifdef _LIBC
      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
      if (nrules != 0)
	{
	  unsigned int in_collseq = 0;
	  const int32_t *table, *indirect;
	  const unsigned char *weights, *extra;
	  const char *collseqwc;
	  int32_t idx;
	  /* This #include defines a local function!  */
#  include <locale/weight.h>

	  /* match with collating_symbol?  */
	  if (cset->ncoll_syms)
	    extra = (const unsigned char *)
	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
	  for (i = 0; i < cset->ncoll_syms; ++i)
	    {
	      /* The entry is a length byte followed by that many bytes.  */
	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
	      /* Compare the length of input collating element and
		 the length of current collating element.  */
	      if (*coll_sym != elem_len)
		continue;
	      /* Compare each bytes.  */
	      for (j = 0; j < *coll_sym; j++)
		if (pin[j] != coll_sym[1 + j])
		  break;
	      if (j == *coll_sym)
		{
		  /* Match if every bytes is equal.  */
		  match_len = j;
		  goto check_node_accept_bytes_match;
		}
	    }

	  /* Compute the collation sequence value of the input once, from
	     the wide character when the element is no longer than the
	     character, otherwise from the collating element's bytes.  */
	  if (cset->nranges)
	    {
	      if (elem_len <= char_len)
		{
		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
		  in_collseq = __collseq_table_lookup (collseqwc, wc);
		}
	      else
		in_collseq = find_collation_sequence_value (pin, elem_len);
	    }
	  /* match with range expression?  */
	  for (i = 0; i < cset->nranges; ++i)
	    if (cset->range_starts[i] <= in_collseq
		&& in_collseq <= cset->range_ends[i])
	      {
		match_len = elem_len;
		goto check_node_accept_bytes_match;
	      }

	  /* match with equivalence_class?  */
	  if (cset->nequiv_classes)
	    {
	      const unsigned char *cp = pin;
	      table = (const int32_t *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	      weights = (const unsigned char *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
	      extra = (const unsigned char *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
	      indirect = (const int32_t *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
	      idx = findidx (&cp);
	      if (idx > 0)
		for (i = 0; i < cset->nequiv_classes; ++i)
		  {
		    int32_t equiv_class_idx = cset->equiv_classes[i];
		    size_t weight_len = weights[idx];
		    /* Equal weight lengths first, then compare the weight
		       bytes that follow the two length bytes.  */
		    if (weight_len == weights[equiv_class_idx])
		      {
			int cnt = 0;
			while (cnt <= weight_len
			       && (weights[equiv_class_idx + 1 + cnt]
				   == weights[idx + 1 + cnt]))
			  ++cnt;
			if (cnt > weight_len)
			  {
			    match_len = elem_len;
			    goto check_node_accept_bytes_match;
			  }
		      }
		  }
	    }
	}
      else
# endif /* _LIBC */
	{
	  /* match with range expression?  */
	  /* cmp_buf holds three one-character wide strings: the range
	     start at [0], the input character at [2] and the range end
	     at [4], each followed by a terminating L'\0'.  */
#if __GNUC__ >= 2
	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
#else
	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
	  cmp_buf[2] = wc;
#endif
	  for (i = 0; i < cset->nranges; ++i)
	    {
	      cmp_buf[0] = cset->range_starts[i];
	      cmp_buf[4] = cset->range_ends[i];
	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
		{
		  match_len = char_len;
		  goto check_node_accept_bytes_match;
		}
	    }
	}
    check_node_accept_bytes_match:
      /* For an inverted set (cset->non_match) a hit becomes a rejection,
	 and a miss accepts the longer of ELEM_LEN and CHAR_LEN.  */
      if (!cset->non_match)
	return match_len;
      else
	{
	  if (match_len > 0)
	    return 0;
	  else
	    return (elem_len > char_len) ? elem_len : char_len;
	}
    }
  return 0;
}
10307 
10308 # ifdef _LIBC
10309 static unsigned int
10310 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)10311 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
10312 {
10313   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10314   if (nrules == 0)
10315     {
10316       if (mbs_len == 1)
10317 	{
10318 	  /* No valid character.  Match it as a single byte character.  */
10319 	  const unsigned char *collseq = (const unsigned char *)
10320 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
10321 	  return collseq[mbs[0]];
10322 	}
10323       return UINT_MAX;
10324     }
10325   else
10326     {
10327       int32_t idx;
10328       const unsigned char *extra = (const unsigned char *)
10329 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10330       int32_t extrasize = (const unsigned char *)
10331 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
10332 
10333       for (idx = 0; idx < extrasize;)
10334 	{
10335 	  int mbs_cnt, found = 0;
10336 	  int32_t elem_mbs_len;
10337 	  /* Skip the name of collating element name.  */
10338 	  idx = idx + extra[idx] + 1;
10339 	  elem_mbs_len = extra[idx++];
10340 	  if (mbs_len == elem_mbs_len)
10341 	    {
10342 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
10343 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
10344 		  break;
10345 	      if (mbs_cnt == elem_mbs_len)
10346 		/* Found the entry.  */
10347 		found = 1;
10348 	    }
10349 	  /* Skip the byte sequence of the collating element.  */
10350 	  idx += elem_mbs_len;
10351 	  /* Adjust for the alignment.  */
10352 	  idx = (idx + 3) & ~3;
10353 	  /* Skip the collation sequence value.  */
10354 	  idx += sizeof (uint32_t);
10355 	  /* Skip the wide char sequence of the collating element.  */
10356 	  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
10357 	  /* If we found the entry, return the sequence value.  */
10358 	  if (found)
10359 	    return *(uint32_t *) (extra + idx);
10360 	  /* Skip the collation sequence value.  */
10361 	  idx += sizeof (uint32_t);
10362 	}
10363       return UINT_MAX;
10364     }
10365 }
10366 # endif /* _LIBC */
10367 #endif /* RE_ENABLE_I18N */
10368 
10369 /* Check whether the node accepts the byte which is IDX-th
10370    byte of the INPUT.  */
10371 
10372 static int
10373 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,int idx)10374 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
10375 		   int idx)
10376 {
10377   unsigned char ch;
10378   ch = re_string_byte_at (&mctx->input, idx);
10379   switch (node->type)
10380     {
10381     case CHARACTER:
10382       if (node->opr.c != ch)
10383         return 0;
10384       break;
10385 
10386     case SIMPLE_BRACKET:
10387       if (!bitset_contain (node->opr.sbcset, ch))
10388         return 0;
10389       break;
10390 
10391 #ifdef RE_ENABLE_I18N
10392     case OP_UTF8_PERIOD:
10393       if (ch >= 0x80)
10394         return 0;
10395       /* FALLTHROUGH */
10396 #endif
10397     case OP_PERIOD:
10398       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
10399 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
10400 	return 0;
10401       break;
10402 
10403     default:
10404       return 0;
10405     }
10406 
10407   if (node->constraint)
10408     {
10409       /* The node has constraints.  Check whether the current context
10410 	 satisfies the constraints.  */
10411       unsigned int context = re_string_context_at (&mctx->input, idx,
10412 						   mctx->eflags);
10413       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
10414 	return 0;
10415     }
10416 
10417   return 1;
10418 }
10419 
10420 /* Extend the buffers, if the buffers have run out.  */
10421 
10422 static reg_errcode_t
10423 internal_function
extend_buffers(re_match_context_t * mctx)10424 extend_buffers (re_match_context_t *mctx)
10425 {
10426   reg_errcode_t ret;
10427   re_string_t *pstr = &mctx->input;
10428 
10429   /* Double the lengthes of the buffers.  */
10430   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
10431   if (BE (ret != REG_NOERROR, 0))
10432     return ret;
10433 
10434   if (mctx->state_log != NULL)
10435     {
10436       /* And double the length of state_log.  */
10437       /* XXX We have no indication of the size of this buffer.  If this
10438 	 allocation fail we have no indication that the state_log array
10439 	 does not have the right size.  */
10440       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
10441 					      pstr->bufs_len + 1);
10442       if (BE (new_array == NULL, 0))
10443 	return REG_ESPACE;
10444       mctx->state_log = new_array;
10445     }
10446 
10447   /* Then reconstruct the buffers.  */
10448   if (pstr->icase)
10449     {
10450 #ifdef RE_ENABLE_I18N
10451       if (pstr->mb_cur_max > 1)
10452 	{
10453 	  ret = build_wcs_upper_buffer (pstr);
10454 	  if (BE (ret != REG_NOERROR, 0))
10455 	    return ret;
10456 	}
10457       else
10458 #endif /* RE_ENABLE_I18N  */
10459 	build_upper_buffer (pstr);
10460     }
10461   else
10462     {
10463 #ifdef RE_ENABLE_I18N
10464       if (pstr->mb_cur_max > 1)
10465 	build_wcs_buffer (pstr);
10466       else
10467 #endif /* RE_ENABLE_I18N  */
10468 	{
10469 	  if (pstr->trans != NULL)
10470 	    re_string_translate_buffer (pstr);
10471 	}
10472     }
10473   return REG_NOERROR;
10474 }
10475 
10476 
10477 /* Functions for matching context.  */
10478 
10479 /* Initialize MCTX.  */
10480 
10481 static reg_errcode_t
10482 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,int n)10483 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
10484 {
10485   mctx->eflags = eflags;
10486   mctx->match_last = -1;
10487   if (n > 0)
10488     {
10489       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
10490       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
10491       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
10492 	return REG_ESPACE;
10493     }
10494   /* Already zero-ed by the caller.
10495      else
10496        mctx->bkref_ents = NULL;
10497      mctx->nbkref_ents = 0;
10498      mctx->nsub_tops = 0;  */
10499   mctx->abkref_ents = n;
10500   mctx->max_mb_elem_len = 1;
10501   mctx->asub_tops = n;
10502   return REG_NOERROR;
10503 }
10504 
10505 /* Clean the entries which depend on the current input in MCTX.
10506    This function must be invoked when the matcher changes the start index
10507    of the input, or changes the input string.  */
10508 
10509 static void
10510 internal_function
match_ctx_clean(re_match_context_t * mctx)10511 match_ctx_clean (re_match_context_t *mctx)
10512 {
10513   int st_idx;
10514   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
10515     {
10516       int sl_idx;
10517       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
10518       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
10519 	{
10520 	  re_sub_match_last_t *last = top->lasts[sl_idx];
10521 	  re_free (last->path.array);
10522 	  re_free (last);
10523 	}
10524       re_free (top->lasts);
10525       if (top->path)
10526 	{
10527 	  re_free (top->path->array);
10528 	  re_free (top->path);
10529 	}
10530       free (top);
10531     }
10532 
10533   mctx->nsub_tops = 0;
10534   mctx->nbkref_ents = 0;
10535 }
10536 
10537 /* Free all the memory associated with MCTX.  */
10538 
10539 static void
10540 internal_function
match_ctx_free(re_match_context_t * mctx)10541 match_ctx_free (re_match_context_t *mctx)
10542 {
10543   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
10544   match_ctx_clean (mctx);
10545   re_free (mctx->sub_tops);
10546   re_free (mctx->bkref_ents);
10547 }
10548 
10549 /* Add a new backreference entry to MCTX.
10550    Note that we assume that caller never call this function with duplicate
10551    entry, and call with STR_IDX which isn't smaller than any existing entry.
10552 */
10553 
10554 static reg_errcode_t
10555 internal_function
match_ctx_add_entry(re_match_context_t * mctx,int node,int str_idx,int from,int to)10556 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
10557 		     int to)
10558 {
10559   if (mctx->nbkref_ents >= mctx->abkref_ents)
10560     {
10561       struct re_backref_cache_entry* new_entry;
10562       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
10563 			      mctx->abkref_ents * 2);
10564       if (BE (new_entry == NULL, 0))
10565 	{
10566 	  re_free (mctx->bkref_ents);
10567 	  return REG_ESPACE;
10568 	}
10569       mctx->bkref_ents = new_entry;
10570       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
10571 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
10572       mctx->abkref_ents *= 2;
10573     }
10574   if (mctx->nbkref_ents > 0
10575       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
10576     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
10577 
10578   mctx->bkref_ents[mctx->nbkref_ents].node = node;
10579   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
10580   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
10581   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
10582 
10583   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
10584      If bit N is clear, means that this entry won't epsilon-transition to
10585      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
10586      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
10587      such node.
10588 
10589      A backreference does not epsilon-transition unless it is empty, so set
10590      to all zeros if FROM != TO.  */
10591   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
10592     = (from == to ? ~0 : 0);
10593 
10594   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
10595   if (mctx->max_mb_elem_len < to - from)
10596     mctx->max_mb_elem_len = to - from;
10597   return REG_NOERROR;
10598 }
10599 
10600 /* Search for the first entry which has the same str_idx, or -1 if none is
10601    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
10602 
10603 static int
10604 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,int str_idx)10605 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
10606 {
10607   int left, right, mid, last;
10608   last = right = mctx->nbkref_ents;
10609   for (left = 0; left < right;)
10610     {
10611       mid = (left + right) / 2;
10612       if (mctx->bkref_ents[mid].str_idx < str_idx)
10613 	left = mid + 1;
10614       else
10615 	right = mid;
10616     }
10617   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
10618     return left;
10619   else
10620     return -1;
10621 }
10622 
10623 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
10624    at STR_IDX.  */
10625 
10626 static reg_errcode_t
10627 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,int node,int str_idx)10628 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
10629 {
10630 #ifdef DEBUG
10631   assert (mctx->sub_tops != NULL);
10632   assert (mctx->asub_tops > 0);
10633 #endif
10634   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
10635     {
10636       int new_asub_tops = mctx->asub_tops * 2;
10637       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
10638 						   re_sub_match_top_t *,
10639 						   new_asub_tops);
10640       if (BE (new_array == NULL, 0))
10641 	return REG_ESPACE;
10642       mctx->sub_tops = new_array;
10643       mctx->asub_tops = new_asub_tops;
10644     }
10645   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
10646   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
10647     return REG_ESPACE;
10648   mctx->sub_tops[mctx->nsub_tops]->node = node;
10649   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
10650   return REG_NOERROR;
10651 }
10652 
10653 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
10654    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
10655 
10656 static re_sub_match_last_t *
10657 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,int node,int str_idx)10658 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
10659 {
10660   re_sub_match_last_t *new_entry;
10661   if (BE (subtop->nlasts == subtop->alasts, 0))
10662     {
10663       int new_alasts = 2 * subtop->alasts + 1;
10664       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
10665 						    re_sub_match_last_t *,
10666 						    new_alasts);
10667       if (BE (new_array == NULL, 0))
10668 	return NULL;
10669       subtop->lasts = new_array;
10670       subtop->alasts = new_alasts;
10671     }
10672   new_entry = calloc (1, sizeof (re_sub_match_last_t));
10673   if (BE (new_entry != NULL, 1))
10674     {
10675       subtop->lasts[subtop->nlasts] = new_entry;
10676       new_entry->node = node;
10677       new_entry->str_idx = str_idx;
10678       ++subtop->nlasts;
10679     }
10680   return new_entry;
10681 }
10682 
10683 static void
10684 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,int last_node,int last_str_idx)10685 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
10686 	       re_dfastate_t **limited_sts, int last_node, int last_str_idx)
10687 {
10688   sctx->sifted_states = sifted_sts;
10689   sctx->limited_states = limited_sts;
10690   sctx->last_node = last_node;
10691   sctx->last_str_idx = last_str_idx;
10692   re_node_set_init_empty (&sctx->limits);
10693 }
10694 
10695 
/* Binary backward compatibility.
   Only when building as part of libc (_LIBC) and the SHLIB_COMPAT test for
   the GLIBC_2_0 .. GLIBC_2_3 range holds, keep defining the variable
   `re_max_failures' (initialized to 2000).  No code in this part of the
   file reads it.  NOTE(review): link_warning presumably attaches the
   given message to any reference to the symbol at link time -- confirm
   against its definition.  */
#if _LIBC
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
int re_max_failures = 2000;
# endif
#endif
10704 #endif
10705