1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 #include "verify.h"
21 #include "intprops.h"
22 static void re_string_construct_common (const char *str, Idx len,
23 					re_string_t *pstr,
24 					RE_TRANSLATE_TYPE trans, bool icase,
25 					const re_dfa_t *dfa) internal_function;
26 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
27 					  const re_node_set *nodes,
28 					  re_hashval_t hash) internal_function;
29 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
30 					  const re_node_set *nodes,
31 					  unsigned int context,
32 					  re_hashval_t hash) internal_function;
33 
34 /* Functions for string operation.  */
35 
36 /* This function allocate the buffers.  It is necessary to call
37    re_string_reconstruct before using the object.  */
38 
39 static reg_errcode_t
40 internal_function __attribute_warn_unused_result__
41 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
42 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
43 {
44   reg_errcode_t ret;
45   Idx init_buf_len;
46 
47   /* Ensure at least one character fits into the buffers.  */
48   if (init_len < dfa->mb_cur_max)
49     init_len = dfa->mb_cur_max;
50   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
51   re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 
53   ret = re_string_realloc_buffers (pstr, init_buf_len);
54   if (BE (ret != REG_NOERROR, 0))
55     return ret;
56 
57   pstr->word_char = dfa->word_char;
58   pstr->word_ops_used = dfa->word_ops_used;
59   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
60   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
61   pstr->valid_raw_len = pstr->valid_len;
62   return REG_NOERROR;
63 }
64 
65 /* This function allocate the buffers, and initialize them.  */
66 
67 static reg_errcode_t
68 internal_function __attribute_warn_unused_result__
69 re_string_construct (re_string_t *pstr, const char *str, Idx len,
70 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71 {
72   reg_errcode_t ret;
73   memset (pstr, '\0', sizeof (re_string_t));
74   re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 
76   if (len > 0)
77     {
78       ret = re_string_realloc_buffers (pstr, len + 1);
79       if (BE (ret != REG_NOERROR, 0))
80 	return ret;
81     }
82   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 
84   if (icase)
85     {
86 #ifdef RE_ENABLE_I18N
87       if (dfa->mb_cur_max > 1)
88 	{
89 	  while (1)
90 	    {
91 	      ret = build_wcs_upper_buffer (pstr);
92 	      if (BE (ret != REG_NOERROR, 0))
93 		return ret;
94 	      if (pstr->valid_raw_len >= len)
95 		break;
96 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97 		break;
98 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
99 	      if (BE (ret != REG_NOERROR, 0))
100 		return ret;
101 	    }
102 	}
103       else
104 #endif /* RE_ENABLE_I18N  */
105 	build_upper_buffer (pstr);
106     }
107   else
108     {
109 #ifdef RE_ENABLE_I18N
110       if (dfa->mb_cur_max > 1)
111 	build_wcs_buffer (pstr);
112       else
113 #endif /* RE_ENABLE_I18N  */
114 	{
115 	  if (trans != NULL)
116 	    re_string_translate_buffer (pstr);
117 	  else
118 	    {
119 	      pstr->valid_len = pstr->bufs_len;
120 	      pstr->valid_raw_len = pstr->bufs_len;
121 	    }
122 	}
123     }
124 
125   return REG_NOERROR;
126 }
127 
128 /* Helper functions for re_string_allocate, and re_string_construct.  */
129 
130 static reg_errcode_t
131 internal_function __attribute_warn_unused_result__
132 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
133 {
134 #ifdef RE_ENABLE_I18N
135   if (pstr->mb_cur_max > 1)
136     {
137       wint_t *new_wcs;
138 
139       /* Avoid overflow in realloc.  */
140       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
141       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
142 	return REG_ESPACE;
143 
144       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
145       if (BE (new_wcs == NULL, 0))
146 	return REG_ESPACE;
147       pstr->wcs = new_wcs;
148       if (pstr->offsets != NULL)
149 	{
150 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
151 	  if (BE (new_offsets == NULL, 0))
152 	    return REG_ESPACE;
153 	  pstr->offsets = new_offsets;
154 	}
155     }
156 #endif /* RE_ENABLE_I18N  */
157   if (pstr->mbs_allocated)
158     {
159       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
160 					   new_buf_len);
161       if (BE (new_mbs == NULL, 0))
162 	return REG_ESPACE;
163       pstr->mbs = new_mbs;
164     }
165   pstr->bufs_len = new_buf_len;
166   return REG_NOERROR;
167 }
168 
169 
170 static void
171 internal_function
172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
173 			    RE_TRANSLATE_TYPE trans, bool icase,
174 			    const re_dfa_t *dfa)
175 {
176   pstr->raw_mbs = (const unsigned char *) str;
177   pstr->len = len;
178   pstr->raw_len = len;
179   pstr->trans = trans;
180   pstr->icase = icase;
181   pstr->mbs_allocated = (trans != NULL || icase);
182   pstr->mb_cur_max = dfa->mb_cur_max;
183   pstr->is_utf8 = dfa->is_utf8;
184   pstr->map_notascii = dfa->map_notascii;
185   pstr->stop = pstr->len;
186   pstr->raw_stop = pstr->stop;
187 }
188 
189 #ifdef RE_ENABLE_I18N
190 
191 /* Build wide character buffer PSTR->WCS.
192    If the byte sequence of the string are:
193      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
194    Then wide character buffer will be:
195      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
196    We use WEOF for padding, they indicate that the position isn't
197    a first byte of a multibyte character.
198 
199    Note that this function assumes PSTR->VALID_LEN elements are already
200    built and starts from PSTR->VALID_LEN.  */
201 
202 static void
203 internal_function
204 build_wcs_buffer (re_string_t *pstr)
205 {
206 #ifdef _LIBC
207   unsigned char buf[MB_LEN_MAX];
208   assert (MB_LEN_MAX >= pstr->mb_cur_max);
209 #else
210   unsigned char buf[64];
211 #endif
212   mbstate_t prev_st;
213   Idx byte_idx, end_idx, remain_len;
214   size_t mbclen;
215 
216   /* Build the buffers from pstr->valid_len to either pstr->len or
217      pstr->bufs_len.  */
218   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
219   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220     {
221       wchar_t wc;
222       const char *p;
223 
224       remain_len = end_idx - byte_idx;
225       prev_st = pstr->cur_state;
226       /* Apply the translation if we need.  */
227       if (BE (pstr->trans != NULL, 0))
228 	{
229 	  int i, ch;
230 
231 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 	    {
233 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
234 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 	    }
236 	  p = (const char *) buf;
237 	}
238       else
239 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
240       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
241       if (BE (mbclen == (size_t) -1 || mbclen == 0
242 	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
243 	{
244 	  /* We treat these cases as a singlebyte character.  */
245 	  mbclen = 1;
246 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
247 	  if (BE (pstr->trans != NULL, 0))
248 	    wc = pstr->trans[wc];
249 	  pstr->cur_state = prev_st;
250 	}
251       else if (BE (mbclen == (size_t) -2, 0))
252 	{
253 	  /* The buffer doesn't have enough space, finish to build.  */
254 	  pstr->cur_state = prev_st;
255 	  break;
256 	}
257 
258       /* Write wide character and padding.  */
259       pstr->wcs[byte_idx++] = wc;
260       /* Write paddings.  */
261       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262 	pstr->wcs[byte_idx++] = WEOF;
263     }
264   pstr->valid_len = byte_idx;
265   pstr->valid_raw_len = byte_idx;
266 }
267 
268 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269    but for REG_ICASE.  */
270 
271 static reg_errcode_t
272 internal_function __attribute_warn_unused_result__
273 build_wcs_upper_buffer (re_string_t *pstr)
274 {
275   mbstate_t prev_st;
276   Idx src_idx, byte_idx, end_idx, remain_len;
277   size_t mbclen;
278 #ifdef _LIBC
279   char buf[MB_LEN_MAX];
280   assert (MB_LEN_MAX >= pstr->mb_cur_max);
281 #else
282   char buf[64];
283 #endif
284 
285   byte_idx = pstr->valid_len;
286   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287 
288   /* The following optimization assumes that ASCII characters can be
289      mapped to wide characters with a simple cast.  */
290   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291     {
292       while (byte_idx < end_idx)
293 	{
294 	  wchar_t wc;
295 
296 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297 	      && mbsinit (&pstr->cur_state))
298 	    {
299 	      /* In case of a singlebyte character.  */
300 	      pstr->mbs[byte_idx]
301 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302 	      /* The next step uses the assumption that wchar_t is encoded
303 		 ASCII-safe: all ASCII values can be converted like this.  */
304 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305 	      ++byte_idx;
306 	      continue;
307 	    }
308 
309 	  remain_len = end_idx - byte_idx;
310 	  prev_st = pstr->cur_state;
311 	  mbclen = __mbrtowc (&wc,
312 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313 			       + byte_idx), remain_len, &pstr->cur_state);
314 	  if (BE (mbclen < (size_t) -2, 1))
315 	    {
316 	      wchar_t wcu = wc;
317 	      if (iswlower (wc))
318 		{
319 		  size_t mbcdlen;
320 
321 		  wcu = towupper (wc);
322 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
323 		  if (BE (mbclen == mbcdlen, 1))
324 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 		  else
326 		    {
327 		      src_idx = byte_idx;
328 		      goto offsets_needed;
329 		    }
330 		}
331 	      else
332 		memcpy (pstr->mbs + byte_idx,
333 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 	      pstr->wcs[byte_idx++] = wcu;
335 	      /* Write paddings.  */
336 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 		pstr->wcs[byte_idx++] = WEOF;
338 	    }
339 	  else if (mbclen == (size_t) -1 || mbclen == 0
340 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341 	    {
342 	      /* It is an invalid character, an incomplete character
343 		 at the end of the string, or '\0'.  Just use the byte.  */
344 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345 	      pstr->mbs[byte_idx] = ch;
346 	      /* And also cast it to wide char.  */
347 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
348 	      if (BE (mbclen == (size_t) -1, 0))
349 		pstr->cur_state = prev_st;
350 	    }
351 	  else
352 	    {
353 	      /* The buffer doesn't have enough space, finish to build.  */
354 	      pstr->cur_state = prev_st;
355 	      break;
356 	    }
357 	}
358       pstr->valid_len = byte_idx;
359       pstr->valid_raw_len = byte_idx;
360       return REG_NOERROR;
361     }
362   else
363     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364       {
365 	wchar_t wc;
366 	const char *p;
367       offsets_needed:
368 	remain_len = end_idx - byte_idx;
369 	prev_st = pstr->cur_state;
370 	if (BE (pstr->trans != NULL, 0))
371 	  {
372 	    int i, ch;
373 
374 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 	      {
376 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 		buf[i] = pstr->trans[ch];
378 	      }
379 	    p = (const char *) buf;
380 	  }
381 	else
382 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 	if (BE (mbclen < (size_t) -2, 1))
385 	  {
386 	    wchar_t wcu = wc;
387 	    if (iswlower (wc))
388 	      {
389 		size_t mbcdlen;
390 
391 		wcu = towupper (wc);
392 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393 		if (BE (mbclen == mbcdlen, 1))
394 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
395 		else if (mbcdlen != (size_t) -1)
396 		  {
397 		    size_t i;
398 
399 		    if (byte_idx + mbcdlen > pstr->bufs_len)
400 		      {
401 			pstr->cur_state = prev_st;
402 			break;
403 		      }
404 
405 		    if (pstr->offsets == NULL)
406 		      {
407 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
408 
409 			if (pstr->offsets == NULL)
410 			  return REG_ESPACE;
411 		      }
412 		    if (!pstr->offsets_needed)
413 		      {
414 			for (i = 0; i < (size_t) byte_idx; ++i)
415 			  pstr->offsets[i] = i;
416 			pstr->offsets_needed = 1;
417 		      }
418 
419 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420 		    pstr->wcs[byte_idx] = wcu;
421 		    pstr->offsets[byte_idx] = src_idx;
422 		    for (i = 1; i < mbcdlen; ++i)
423 		      {
424 			pstr->offsets[byte_idx + i]
425 			  = src_idx + (i < mbclen ? i : mbclen - 1);
426 			pstr->wcs[byte_idx + i] = WEOF;
427 		      }
428 		    pstr->len += mbcdlen - mbclen;
429 		    if (pstr->raw_stop > src_idx)
430 		      pstr->stop += mbcdlen - mbclen;
431 		    end_idx = (pstr->bufs_len > pstr->len)
432 			      ? pstr->len : pstr->bufs_len;
433 		    byte_idx += mbcdlen;
434 		    src_idx += mbclen;
435 		    continue;
436 		  }
437 		else
438 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
439 	      }
440 	    else
441 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
442 
443 	    if (BE (pstr->offsets_needed != 0, 0))
444 	      {
445 		size_t i;
446 		for (i = 0; i < mbclen; ++i)
447 		  pstr->offsets[byte_idx + i] = src_idx + i;
448 	      }
449 	    src_idx += mbclen;
450 
451 	    pstr->wcs[byte_idx++] = wcu;
452 	    /* Write paddings.  */
453 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454 	      pstr->wcs[byte_idx++] = WEOF;
455 	  }
456 	else if (mbclen == (size_t) -1 || mbclen == 0
457 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
458 	  {
459 	    /* It is an invalid character or '\0'.  Just use the byte.  */
460 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
461 
462 	    if (BE (pstr->trans != NULL, 0))
463 	      ch = pstr->trans [ch];
464 	    pstr->mbs[byte_idx] = ch;
465 
466 	    if (BE (pstr->offsets_needed != 0, 0))
467 	      pstr->offsets[byte_idx] = src_idx;
468 	    ++src_idx;
469 
470 	    /* And also cast it to wide char.  */
471 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
472 	    if (BE (mbclen == (size_t) -1, 0))
473 	      pstr->cur_state = prev_st;
474 	  }
475 	else
476 	  {
477 	    /* The buffer doesn't have enough space, finish to build.  */
478 	    pstr->cur_state = prev_st;
479 	    break;
480 	  }
481       }
482   pstr->valid_len = byte_idx;
483   pstr->valid_raw_len = src_idx;
484   return REG_NOERROR;
485 }
486 
487 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
488    Return the index.  */
489 
490 static Idx
491 internal_function
492 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
493 {
494   mbstate_t prev_st;
495   Idx rawbuf_idx;
496   size_t mbclen;
497   wint_t wc = WEOF;
498 
499   /* Skip the characters which are not necessary to check.  */
500   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
501        rawbuf_idx < new_raw_idx;)
502     {
503       wchar_t wc2;
504       Idx remain_len = pstr->raw_len - rawbuf_idx;
505       prev_st = pstr->cur_state;
506       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507 			  remain_len, &pstr->cur_state);
508       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
509 	{
510 	  /* We treat these cases as a single byte character.  */
511 	  if (mbclen == 0 || remain_len == 0)
512 	    wc = L'\0';
513 	  else
514 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
515 	  mbclen = 1;
516 	  pstr->cur_state = prev_st;
517 	}
518       else
519 	wc = wc2;
520       /* Then proceed the next character.  */
521       rawbuf_idx += mbclen;
522     }
523   *last_wc = wc;
524   return rawbuf_idx;
525 }
526 #endif /* RE_ENABLE_I18N  */
527 
528 /* Build the buffer PSTR->MBS, and apply the translation if we need.
529    This function is used in case of REG_ICASE.  */
530 
531 static void
532 internal_function
533 build_upper_buffer (re_string_t *pstr)
534 {
535   Idx char_idx, end_idx;
536   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537 
538   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539     {
540       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541       if (BE (pstr->trans != NULL, 0))
542 	ch = pstr->trans[ch];
543       if (islower (ch))
544 	pstr->mbs[char_idx] = toupper (ch);
545       else
546 	pstr->mbs[char_idx] = ch;
547     }
548   pstr->valid_len = char_idx;
549   pstr->valid_raw_len = char_idx;
550 }
551 
552 /* Apply TRANS to the buffer in PSTR.  */
553 
554 static void
555 internal_function
556 re_string_translate_buffer (re_string_t *pstr)
557 {
558   Idx buf_idx, end_idx;
559   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560 
561   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562     {
563       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564       pstr->mbs[buf_idx] = pstr->trans[ch];
565     }
566 
567   pstr->valid_len = buf_idx;
568   pstr->valid_raw_len = buf_idx;
569 }
570 
571 /* This function re-construct the buffers.
572    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573    convert to upper case in case of REG_ICASE, apply translation.  */
574 
575 static reg_errcode_t
576 internal_function __attribute_warn_unused_result__
577 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
578 {
579   Idx offset;
580 
581   if (BE (pstr->raw_mbs_idx <= idx, 0))
582     offset = idx - pstr->raw_mbs_idx;
583   else
584     {
585       /* Reset buffer.  */
586 #ifdef RE_ENABLE_I18N
587       if (pstr->mb_cur_max > 1)
588 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590       pstr->len = pstr->raw_len;
591       pstr->stop = pstr->raw_stop;
592       pstr->valid_len = 0;
593       pstr->raw_mbs_idx = 0;
594       pstr->valid_raw_len = 0;
595       pstr->offsets_needed = 0;
596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598       if (!pstr->mbs_allocated)
599 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600       offset = idx;
601     }
602 
603   if (BE (offset != 0, 1))
604     {
605       /* Should the already checked characters be kept?  */
606       if (BE (offset < pstr->valid_raw_len, 1))
607 	{
608 	  /* Yes, move them to the front of the buffer.  */
609 #ifdef RE_ENABLE_I18N
610 	  if (BE (pstr->offsets_needed, 0))
611 	    {
612 	      Idx low = 0, high = pstr->valid_len, mid;
613 	      do
614 		{
615 		  mid = (high + low) / 2;
616 		  if (pstr->offsets[mid] > offset)
617 		    high = mid;
618 		  else if (pstr->offsets[mid] < offset)
619 		    low = mid + 1;
620 		  else
621 		    break;
622 		}
623 	      while (low < high);
624 	      if (pstr->offsets[mid] < offset)
625 		++mid;
626 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 							eflags);
628 	      /* This can be quite complicated, so handle specially
629 		 only the common and easy case where the character with
630 		 different length representation of lower and upper
631 		 case is present at or after offset.  */
632 	      if (pstr->valid_len > offset
633 		  && mid == offset && pstr->offsets[mid] == offset)
634 		{
635 		  memmove (pstr->wcs, pstr->wcs + offset,
636 			   (pstr->valid_len - offset) * sizeof (wint_t));
637 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 		  pstr->valid_len -= offset;
639 		  pstr->valid_raw_len -= offset;
640 		  for (low = 0; low < pstr->valid_len; low++)
641 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642 		}
643 	      else
644 		{
645 		  /* Otherwise, just find out how long the partial multibyte
646 		     character at offset is and fill it with WEOF/255.  */
647 		  pstr->len = pstr->raw_len - idx + offset;
648 		  pstr->stop = pstr->raw_stop - idx + offset;
649 		  pstr->offsets_needed = 0;
650 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 		    --mid;
652 		  while (mid < pstr->valid_len)
653 		    if (pstr->wcs[mid] != WEOF)
654 		      break;
655 		    else
656 		      ++mid;
657 		  if (mid == pstr->valid_len)
658 		    pstr->valid_len = 0;
659 		  else
660 		    {
661 		      pstr->valid_len = pstr->offsets[mid] - offset;
662 		      if (pstr->valid_len)
663 			{
664 			  for (low = 0; low < pstr->valid_len; ++low)
665 			    pstr->wcs[low] = WEOF;
666 			  memset (pstr->mbs, 255, pstr->valid_len);
667 			}
668 		    }
669 		  pstr->valid_raw_len = pstr->valid_len;
670 		}
671 	    }
672 	  else
673 #endif
674 	    {
675 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 							eflags);
677 #ifdef RE_ENABLE_I18N
678 	      if (pstr->mb_cur_max > 1)
679 		memmove (pstr->wcs, pstr->wcs + offset,
680 			 (pstr->valid_len - offset) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 	      if (BE (pstr->mbs_allocated, 0))
683 		memmove (pstr->mbs, pstr->mbs + offset,
684 			 pstr->valid_len - offset);
685 	      pstr->valid_len -= offset;
686 	      pstr->valid_raw_len -= offset;
687 #if DEBUG
688 	      assert (pstr->valid_len > 0);
689 #endif
690 	    }
691 	}
692       else
693 	{
694 #ifdef RE_ENABLE_I18N
695 	  /* No, skip all characters until IDX.  */
696 	  Idx prev_valid_len = pstr->valid_len;
697 
698 	  if (BE (pstr->offsets_needed, 0))
699 	    {
700 	      pstr->len = pstr->raw_len - idx + offset;
701 	      pstr->stop = pstr->raw_stop - idx + offset;
702 	      pstr->offsets_needed = 0;
703 	    }
704 #endif
705 	  pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 	  if (pstr->mb_cur_max > 1)
708 	    {
709 	      Idx wcs_idx;
710 	      wint_t wc = WEOF;
711 
712 	      if (pstr->is_utf8)
713 		{
714 		  const unsigned char *raw, *p, *end;
715 
716 		  /* Special case UTF-8.  Multi-byte chars start with any
717 		     byte other than 0x80 - 0xbf.  */
718 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 		  end = raw + (offset - pstr->mb_cur_max);
720 		  if (end < pstr->raw_mbs)
721 		    end = pstr->raw_mbs;
722 		  p = raw + offset - 1;
723 #ifdef _LIBC
724 		  /* We know the wchar_t encoding is UCS4, so for the simple
725 		     case, ASCII characters, skip the conversion step.  */
726 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
727 		    {
728 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 		      /* pstr->valid_len = 0; */
730 		      wc = (wchar_t) *p;
731 		    }
732 		  else
733 #endif
734 		    for (; p >= end; --p)
735 		      if ((*p & 0xc0) != 0x80)
736 			{
737 			  mbstate_t cur_state;
738 			  wchar_t wc2;
739 			  Idx mlen = raw + pstr->len - p;
740 			  unsigned char buf[6];
741 			  size_t mbclen;
742 
743 			  const unsigned char *pp = p;
744 			  if (BE (pstr->trans != NULL, 0))
745 			    {
746 			      int i = mlen < 6 ? mlen : 6;
747 			      while (--i >= 0)
748 				buf[i] = pstr->trans[p[i]];
749 			      pp = buf;
750 			    }
751 			  /* XXX Don't use mbrtowc, we know which conversion
752 			     to use (UTF-8 -> UCS4).  */
753 			  memset (&cur_state, 0, sizeof (cur_state));
754 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755 					      &cur_state);
756 			  if (raw + offset - p <= mbclen
757 			      && mbclen < (size_t) -2)
758 			    {
759 			      memset (&pstr->cur_state, '\0',
760 				      sizeof (mbstate_t));
761 			      pstr->valid_len = mbclen - (raw + offset - p);
762 			      wc = wc2;
763 			    }
764 			  break;
765 			}
766 		}
767 
768 	      if (wc == WEOF)
769 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770 	      if (wc == WEOF)
771 		pstr->tip_context
772 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773 	      else
774 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
775 				      && IS_WIDE_WORD_CHAR (wc))
776 				     ? CONTEXT_WORD
777 				     : ((IS_WIDE_NEWLINE (wc)
778 					 && pstr->newline_anchor)
779 					? CONTEXT_NEWLINE : 0));
780 	      if (BE (pstr->valid_len, 0))
781 		{
782 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 		    pstr->wcs[wcs_idx] = WEOF;
784 		  if (pstr->mbs_allocated)
785 		    memset (pstr->mbs, 255, pstr->valid_len);
786 		}
787 	      pstr->valid_raw_len = pstr->valid_len;
788 	    }
789 	  else
790 #endif /* RE_ENABLE_I18N */
791 	    {
792 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793 	      pstr->valid_raw_len = 0;
794 	      if (pstr->trans)
795 		c = pstr->trans[c];
796 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 				   ? CONTEXT_WORD
798 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
799 				      ? CONTEXT_NEWLINE : 0));
800 	    }
801 	}
802       if (!BE (pstr->mbs_allocated, 0))
803 	pstr->mbs += offset;
804     }
805   pstr->raw_mbs_idx = idx;
806   pstr->len -= offset;
807   pstr->stop -= offset;
808 
809   /* Then build the buffers.  */
810 #ifdef RE_ENABLE_I18N
811   if (pstr->mb_cur_max > 1)
812     {
813       if (pstr->icase)
814 	{
815 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816 	  if (BE (ret != REG_NOERROR, 0))
817 	    return ret;
818 	}
819       else
820 	build_wcs_buffer (pstr);
821     }
822   else
823 #endif /* RE_ENABLE_I18N */
824     if (BE (pstr->mbs_allocated, 0))
825       {
826 	if (pstr->icase)
827 	  build_upper_buffer (pstr);
828 	else if (pstr->trans != NULL)
829 	  re_string_translate_buffer (pstr);
830       }
831     else
832       pstr->valid_len = pstr->len;
833 
834   pstr->cur_idx = 0;
835   return REG_NOERROR;
836 }
837 
838 static unsigned char
839 internal_function __attribute__ ((pure))
840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841 {
842   int ch;
843   Idx off;
844 
845   /* Handle the common (easiest) cases first.  */
846   if (BE (!pstr->mbs_allocated, 1))
847     return re_string_peek_byte (pstr, idx);
848 
849 #ifdef RE_ENABLE_I18N
850   if (pstr->mb_cur_max > 1
851       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852     return re_string_peek_byte (pstr, idx);
853 #endif
854 
855   off = pstr->cur_idx + idx;
856 #ifdef RE_ENABLE_I18N
857   if (pstr->offsets_needed)
858     off = pstr->offsets[off];
859 #endif
860 
861   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862 
863 #ifdef RE_ENABLE_I18N
864   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865      this function returns CAPITAL LETTER I instead of first byte of
866      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
867      since peek_byte_case doesn't advance cur_idx in any way.  */
868   if (pstr->offsets_needed && !isascii (ch))
869     return re_string_peek_byte (pstr, idx);
870 #endif
871 
872   return ch;
873 }
874 
875 static unsigned char
876 internal_function
877 re_string_fetch_byte_case (re_string_t *pstr)
878 {
879   if (BE (!pstr->mbs_allocated, 1))
880     return re_string_fetch_byte (pstr);
881 
882 #ifdef RE_ENABLE_I18N
883   if (pstr->offsets_needed)
884     {
885       Idx off;
886       int ch;
887 
888       /* For tr_TR.UTF-8 [[:islower:]] there is
889 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
890 	 in that case the whole multi-byte character and return
891 	 the original letter.  On the other side, with
892 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 	 anything else would complicate things too much.  */
894 
895       if (!re_string_first_byte (pstr, pstr->cur_idx))
896 	return re_string_fetch_byte (pstr);
897 
898       off = pstr->offsets[pstr->cur_idx];
899       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900 
901       if (! isascii (ch))
902 	return re_string_fetch_byte (pstr);
903 
904       re_string_skip_bytes (pstr,
905 			    re_string_char_size_at (pstr, pstr->cur_idx));
906       return ch;
907     }
908 #endif
909 
910   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911 }
912 
913 static void
914 internal_function
915 re_string_destruct (re_string_t *pstr)
916 {
917 #ifdef RE_ENABLE_I18N
918   re_free (pstr->wcs);
919   re_free (pstr->offsets);
920 #endif /* RE_ENABLE_I18N  */
921   if (pstr->mbs_allocated)
922     re_free (pstr->mbs);
923 }
924 
925 /* Return the context at IDX in INPUT.  */
926 
927 static unsigned int
928 internal_function
929 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930 {
931   int c;
932   if (BE (! REG_VALID_INDEX (idx), 0))
933     /* In this case, we use the value stored in input->tip_context,
934        since we can't know the character in input->mbs[-1] here.  */
935     return input->tip_context;
936   if (BE (idx == input->len, 0))
937     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939 #ifdef RE_ENABLE_I18N
940   if (input->mb_cur_max > 1)
941     {
942       wint_t wc;
943       Idx wc_idx = idx;
944       while(input->wcs[wc_idx] == WEOF)
945 	{
946 #ifdef DEBUG
947 	  /* It must not happen.  */
948 	  assert (REG_VALID_INDEX (wc_idx));
949 #endif
950 	  --wc_idx;
951 	  if (! REG_VALID_INDEX (wc_idx))
952 	    return input->tip_context;
953 	}
954       wc = input->wcs[wc_idx];
955       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
956 	return CONTEXT_WORD;
957       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
958 	      ? CONTEXT_NEWLINE : 0);
959     }
960   else
961 #endif
962     {
963       c = re_string_byte_at (input, idx);
964       if (bitset_contain (input->word_char, c))
965 	return CONTEXT_WORD;
966       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
967     }
968 }
969 
970 /* Functions for set operation.  */
971 
972 static reg_errcode_t
973 internal_function __attribute_warn_unused_result__
974 re_node_set_alloc (re_node_set *set, Idx size)
975 {
976   set->alloc = size;
977   set->nelem = 0;
978   set->elems = re_malloc (Idx, size);
979   if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
980     return REG_ESPACE;
981   return REG_NOERROR;
982 }
983 
984 static reg_errcode_t
985 internal_function __attribute_warn_unused_result__
986 re_node_set_init_1 (re_node_set *set, Idx elem)
987 {
988   set->alloc = 1;
989   set->nelem = 1;
990   set->elems = re_malloc (Idx, 1);
991   if (BE (set->elems == NULL, 0))
992     {
993       set->alloc = set->nelem = 0;
994       return REG_ESPACE;
995     }
996   set->elems[0] = elem;
997   return REG_NOERROR;
998 }
999 
1000 static reg_errcode_t
1001 internal_function __attribute_warn_unused_result__
1002 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1003 {
1004   set->alloc = 2;
1005   set->elems = re_malloc (Idx, 2);
1006   if (BE (set->elems == NULL, 0))
1007     return REG_ESPACE;
1008   if (elem1 == elem2)
1009     {
1010       set->nelem = 1;
1011       set->elems[0] = elem1;
1012     }
1013   else
1014     {
1015       set->nelem = 2;
1016       if (elem1 < elem2)
1017 	{
1018 	  set->elems[0] = elem1;
1019 	  set->elems[1] = elem2;
1020 	}
1021       else
1022 	{
1023 	  set->elems[0] = elem2;
1024 	  set->elems[1] = elem1;
1025 	}
1026     }
1027   return REG_NOERROR;
1028 }
1029 
1030 static reg_errcode_t
1031 internal_function __attribute_warn_unused_result__
1032 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1033 {
1034   dest->nelem = src->nelem;
1035   if (src->nelem > 0)
1036     {
1037       dest->alloc = dest->nelem;
1038       dest->elems = re_malloc (Idx, dest->alloc);
1039       if (BE (dest->elems == NULL, 0))
1040 	{
1041 	  dest->alloc = dest->nelem = 0;
1042 	  return REG_ESPACE;
1043 	}
1044       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1045     }
1046   else
1047     re_node_set_init_empty (dest);
1048   return REG_NOERROR;
1049 }
1050 
1051 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1052    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1053    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1054 
1055 static reg_errcode_t
1056 internal_function __attribute_warn_unused_result__
1057 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1058 			   const re_node_set *src2)
1059 {
1060   Idx i1, i2, is, id, delta, sbase;
1061   if (src1->nelem == 0 || src2->nelem == 0)
1062     return REG_NOERROR;
1063 
1064   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1065      conservative estimate.  */
1066   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1067     {
1068       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1069       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070       if (BE (new_elems == NULL, 0))
1071 	return REG_ESPACE;
1072       dest->elems = new_elems;
1073       dest->alloc = new_alloc;
1074     }
1075 
1076   /* Find the items in the intersection of SRC1 and SRC2, and copy
1077      into the top of DEST those that are not already in DEST itself.  */
1078   sbase = dest->nelem + src1->nelem + src2->nelem;
1079   i1 = src1->nelem - 1;
1080   i2 = src2->nelem - 1;
1081   id = dest->nelem - 1;
1082   for (;;)
1083     {
1084       if (src1->elems[i1] == src2->elems[i2])
1085 	{
1086 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1087 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1088 	    --id;
1089 
1090           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1091             dest->elems[--sbase] = src1->elems[i1];
1092 
1093 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1094 	    break;
1095 	}
1096 
1097       /* Lower the highest of the two items.  */
1098       else if (src1->elems[i1] < src2->elems[i2])
1099 	{
1100 	  if (! REG_VALID_INDEX (--i2))
1101 	    break;
1102 	}
1103       else
1104 	{
1105 	  if (! REG_VALID_INDEX (--i1))
1106 	    break;
1107 	}
1108     }
1109 
1110   id = dest->nelem - 1;
1111   is = dest->nelem + src1->nelem + src2->nelem - 1;
1112   delta = is - sbase + 1;
1113 
1114   /* Now copy.  When DELTA becomes zero, the remaining
1115      DEST elements are already in place; this is more or
1116      less the same loop that is in re_node_set_merge.  */
1117   dest->nelem += delta;
1118   if (delta > 0 && REG_VALID_INDEX (id))
1119     for (;;)
1120       {
1121 	if (dest->elems[is] > dest->elems[id])
1122 	  {
1123 	    /* Copy from the top.  */
1124 	    dest->elems[id + delta--] = dest->elems[is--];
1125 	    if (delta == 0)
1126 	      break;
1127 	  }
1128 	else
1129 	  {
1130 	    /* Slide from the bottom.  */
1131 	    dest->elems[id + delta] = dest->elems[id];
1132 	    if (! REG_VALID_INDEX (--id))
1133 	      break;
1134 	  }
1135       }
1136 
1137   /* Copy remaining SRC elements.  */
1138   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1139 
1140   return REG_NOERROR;
1141 }
1142 
1143 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1145 
1146 static reg_errcode_t
1147 internal_function __attribute_warn_unused_result__
1148 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149 			const re_node_set *src2)
1150 {
1151   Idx i1, i2, id;
1152   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1153     {
1154       dest->alloc = src1->nelem + src2->nelem;
1155       dest->elems = re_malloc (Idx, dest->alloc);
1156       if (BE (dest->elems == NULL, 0))
1157 	return REG_ESPACE;
1158     }
1159   else
1160     {
1161       if (src1 != NULL && src1->nelem > 0)
1162 	return re_node_set_init_copy (dest, src1);
1163       else if (src2 != NULL && src2->nelem > 0)
1164 	return re_node_set_init_copy (dest, src2);
1165       else
1166 	re_node_set_init_empty (dest);
1167       return REG_NOERROR;
1168     }
1169   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1170     {
1171       if (src1->elems[i1] > src2->elems[i2])
1172 	{
1173 	  dest->elems[id++] = src2->elems[i2++];
1174 	  continue;
1175 	}
1176       if (src1->elems[i1] == src2->elems[i2])
1177 	++i2;
1178       dest->elems[id++] = src1->elems[i1++];
1179     }
1180   if (i1 < src1->nelem)
1181     {
1182       memcpy (dest->elems + id, src1->elems + i1,
1183 	     (src1->nelem - i1) * sizeof (Idx));
1184       id += src1->nelem - i1;
1185     }
1186   else if (i2 < src2->nelem)
1187     {
1188       memcpy (dest->elems + id, src2->elems + i2,
1189 	     (src2->nelem - i2) * sizeof (Idx));
1190       id += src2->nelem - i2;
1191     }
1192   dest->nelem = id;
1193   return REG_NOERROR;
1194 }
1195 
1196 /* Calculate the union set of the sets DEST and SRC. And store it to
1197    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1198 
1199 static reg_errcode_t
1200 internal_function __attribute_warn_unused_result__
1201 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1202 {
1203   Idx is, id, sbase, delta;
1204   if (src == NULL || src->nelem == 0)
1205     return REG_NOERROR;
1206   if (dest->alloc < 2 * src->nelem + dest->nelem)
1207     {
1208       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1209       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210       if (BE (new_buffer == NULL, 0))
1211 	return REG_ESPACE;
1212       dest->elems = new_buffer;
1213       dest->alloc = new_alloc;
1214     }
1215 
1216   if (BE (dest->nelem == 0, 0))
1217     {
1218       dest->nelem = src->nelem;
1219       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220       return REG_NOERROR;
1221     }
1222 
1223   /* Copy into the top of DEST the items of SRC that are not
1224      found in DEST.  Maybe we could binary search in DEST?  */
1225   for (sbase = dest->nelem + 2 * src->nelem,
1226        is = src->nelem - 1, id = dest->nelem - 1;
1227        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1228     {
1229       if (dest->elems[id] == src->elems[is])
1230 	is--, id--;
1231       else if (dest->elems[id] < src->elems[is])
1232 	dest->elems[--sbase] = src->elems[is--];
1233       else /* if (dest->elems[id] > src->elems[is]) */
1234 	--id;
1235     }
1236 
1237   if (REG_VALID_INDEX (is))
1238     {
1239       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1240       sbase -= is + 1;
1241       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1242     }
1243 
1244   id = dest->nelem - 1;
1245   is = dest->nelem + 2 * src->nelem - 1;
1246   delta = is - sbase + 1;
1247   if (delta == 0)
1248     return REG_NOERROR;
1249 
1250   /* Now copy.  When DELTA becomes zero, the remaining
1251      DEST elements are already in place.  */
1252   dest->nelem += delta;
1253   for (;;)
1254     {
1255       if (dest->elems[is] > dest->elems[id])
1256 	{
1257 	  /* Copy from the top.  */
1258 	  dest->elems[id + delta--] = dest->elems[is--];
1259 	  if (delta == 0)
1260 	    break;
1261 	}
1262       else
1263 	{
1264 	  /* Slide from the bottom.  */
1265 	  dest->elems[id + delta] = dest->elems[id];
1266 	  if (! REG_VALID_INDEX (--id))
1267 	    {
1268 	      /* Copy remaining SRC elements.  */
1269 	      memcpy (dest->elems, dest->elems + sbase,
1270 		      delta * sizeof (Idx));
1271 	      break;
1272 	    }
1273 	}
1274     }
1275 
1276   return REG_NOERROR;
1277 }
1278 
1279 /* Insert the new element ELEM to the re_node_set* SET.
1280    SET should not already have ELEM.
1281    Return true if successful.  */
1282 
1283 static bool
1284 internal_function __attribute_warn_unused_result__
1285 re_node_set_insert (re_node_set *set, Idx elem)
1286 {
1287   Idx idx;
1288   /* In case the set is empty.  */
1289   if (set->alloc == 0)
1290     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1291 
1292   if (BE (set->nelem, 0) == 0)
1293     {
1294       /* We already guaranteed above that set->alloc != 0.  */
1295       set->elems[0] = elem;
1296       ++set->nelem;
1297       return true;
1298     }
1299 
1300   /* Realloc if we need.  */
1301   if (set->alloc == set->nelem)
1302     {
1303       Idx *new_elems;
1304       set->alloc = set->alloc * 2;
1305       new_elems = re_realloc (set->elems, Idx, set->alloc);
1306       if (BE (new_elems == NULL, 0))
1307 	return false;
1308       set->elems = new_elems;
1309     }
1310 
1311   /* Move the elements which follows the new element.  Test the
1312      first element separately to skip a check in the inner loop.  */
1313   if (elem < set->elems[0])
1314     {
1315       idx = 0;
1316       for (idx = set->nelem; idx > 0; idx--)
1317 	set->elems[idx] = set->elems[idx - 1];
1318     }
1319   else
1320     {
1321       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1322 	set->elems[idx] = set->elems[idx - 1];
1323     }
1324 
1325   /* Insert the new element.  */
1326   set->elems[idx] = elem;
1327   ++set->nelem;
1328   return true;
1329 }
1330 
1331 /* Insert the new element ELEM to the re_node_set* SET.
1332    SET should not already have any element greater than or equal to ELEM.
1333    Return true if successful.  */
1334 
1335 static bool
1336 internal_function __attribute_warn_unused_result__
1337 re_node_set_insert_last (re_node_set *set, Idx elem)
1338 {
1339   /* Realloc if we need.  */
1340   if (set->alloc == set->nelem)
1341     {
1342       Idx *new_elems;
1343       set->alloc = (set->alloc + 1) * 2;
1344       new_elems = re_realloc (set->elems, Idx, set->alloc);
1345       if (BE (new_elems == NULL, 0))
1346 	return false;
1347       set->elems = new_elems;
1348     }
1349 
1350   /* Insert the new element.  */
1351   set->elems[set->nelem++] = elem;
1352   return true;
1353 }
1354 
1355 /* Compare two node sets SET1 and SET2.
1356    Return true if SET1 and SET2 are equivalent.  */
1357 
1358 static bool
1359 internal_function __attribute__ ((pure))
1360 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1361 {
1362   Idx i;
1363   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1364     return false;
1365   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1366     if (set1->elems[i] != set2->elems[i])
1367       return false;
1368   return true;
1369 }
1370 
1371 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1372 
1373 static Idx
1374 internal_function __attribute__ ((pure))
1375 re_node_set_contains (const re_node_set *set, Idx elem)
1376 {
1377   __re_size_t idx, right, mid;
1378   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1379     return 0;
1380 
1381   /* Binary search the element.  */
1382   idx = 0;
1383   right = set->nelem - 1;
1384   while (idx < right)
1385     {
1386       mid = (idx + right) / 2;
1387       if (set->elems[mid] < elem)
1388 	idx = mid + 1;
1389       else
1390 	right = mid;
1391     }
1392   return set->elems[idx] == elem ? idx + 1 : 0;
1393 }
1394 
1395 static void
1396 internal_function
1397 re_node_set_remove_at (re_node_set *set, Idx idx)
1398 {
1399   verify (! TYPE_SIGNED (Idx));
1400   /* if (idx < 0)
1401      return; */
1402   if (idx >= set->nelem)
1403     return;
1404   --set->nelem;
1405   for (; idx < set->nelem; idx++)
1406     set->elems[idx] = set->elems[idx + 1];
1407 }
1408 
1409 
1410 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1411    Or return REG_MISSING if an error occurred.  */
1412 
1413 static Idx
1414 internal_function
1415 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1416 {
1417   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1418     {
1419       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1420       Idx *new_nexts, *new_indices;
1421       re_node_set *new_edests, *new_eclosures;
1422       re_token_t *new_nodes;
1423 
1424       /* Avoid overflows in realloc.  */
1425       const size_t max_object_size = MAX (sizeof (re_token_t),
1426 					  MAX (sizeof (re_node_set),
1427 					       sizeof (Idx)));
1428       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1429 	return REG_MISSING;
1430 
1431       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1432       if (BE (new_nodes == NULL, 0))
1433 	return REG_MISSING;
1434       dfa->nodes = new_nodes;
1435       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1436       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1437       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1438       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1439       if (BE (new_nexts == NULL || new_indices == NULL
1440 	      || new_edests == NULL || new_eclosures == NULL, 0))
1441 	return REG_MISSING;
1442       dfa->nexts = new_nexts;
1443       dfa->org_indices = new_indices;
1444       dfa->edests = new_edests;
1445       dfa->eclosures = new_eclosures;
1446       dfa->nodes_alloc = new_nodes_alloc;
1447     }
1448   dfa->nodes[dfa->nodes_len] = token;
1449   dfa->nodes[dfa->nodes_len].constraint = 0;
1450 #ifdef RE_ENABLE_I18N
1451   dfa->nodes[dfa->nodes_len].accept_mb =
1452     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1453      || token.type == COMPLEX_BRACKET);
1454 #endif
1455   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1456   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1457   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1458   return dfa->nodes_len++;
1459 }
1460 
1461 static re_hashval_t
1462 internal_function
1463 calc_state_hash (const re_node_set *nodes, unsigned int context)
1464 {
1465   re_hashval_t hash = nodes->nelem + context;
1466   Idx i;
1467   for (i = 0 ; i < nodes->nelem ; i++)
1468     hash += nodes->elems[i];
1469   return hash;
1470 }
1471 
1472 /* Search for the state whose node_set is equivalent to NODES.
1473    Return the pointer to the state, if we found it in the DFA.
1474    Otherwise create the new one and return it.  In case of an error
1475    return NULL and set the error code in ERR.
1476    Note: - We assume NULL as the invalid state, then it is possible that
1477 	   return value is NULL and ERR is REG_NOERROR.
1478 	 - We never return non-NULL value in case of any errors, it is for
1479 	   optimization.  */
1480 
1481 static re_dfastate_t *
1482 internal_function __attribute_warn_unused_result__
1483 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1484 		  const re_node_set *nodes)
1485 {
1486   re_hashval_t hash;
1487   re_dfastate_t *new_state;
1488   struct re_state_table_entry *spot;
1489   Idx i;
1490 #ifdef lint
1491   /* Suppress bogus uninitialized-variable warnings.  */
1492   *err = REG_NOERROR;
1493 #endif
1494   if (BE (nodes->nelem == 0, 0))
1495     {
1496       *err = REG_NOERROR;
1497       return NULL;
1498     }
1499   hash = calc_state_hash (nodes, 0);
1500   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1501 
1502   for (i = 0 ; i < spot->num ; i++)
1503     {
1504       re_dfastate_t *state = spot->array[i];
1505       if (hash != state->hash)
1506 	continue;
1507       if (re_node_set_compare (&state->nodes, nodes))
1508 	return state;
1509     }
1510 
1511   /* There are no appropriate state in the dfa, create the new one.  */
1512   new_state = create_ci_newstate (dfa, nodes, hash);
1513   if (BE (new_state == NULL, 0))
1514     *err = REG_ESPACE;
1515 
1516   return new_state;
1517 }
1518 
1519 /* Search for the state whose node_set is equivalent to NODES and
1520    whose context is equivalent to CONTEXT.
1521    Return the pointer to the state, if we found it in the DFA.
1522    Otherwise create the new one and return it.  In case of an error
1523    return NULL and set the error code in ERR.
1524    Note: - We assume NULL as the invalid state, then it is possible that
1525 	   return value is NULL and ERR is REG_NOERROR.
1526 	 - We never return non-NULL value in case of any errors, it is for
1527 	   optimization.  */
1528 
1529 static re_dfastate_t *
1530 internal_function __attribute_warn_unused_result__
1531 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1532 			  const re_node_set *nodes, unsigned int context)
1533 {
1534   re_hashval_t hash;
1535   re_dfastate_t *new_state;
1536   struct re_state_table_entry *spot;
1537   Idx i;
1538 #ifdef lint
1539   /* Suppress bogus uninitialized-variable warnings.  */
1540   *err = REG_NOERROR;
1541 #endif
1542   if (nodes->nelem == 0)
1543     {
1544       *err = REG_NOERROR;
1545       return NULL;
1546     }
1547   hash = calc_state_hash (nodes, context);
1548   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1549 
1550   for (i = 0 ; i < spot->num ; i++)
1551     {
1552       re_dfastate_t *state = spot->array[i];
1553       if (state->hash == hash
1554 	  && state->context == context
1555 	  && re_node_set_compare (state->entrance_nodes, nodes))
1556 	return state;
1557     }
1558   /* There are no appropriate state in 'dfa', create the new one.  */
1559   new_state = create_cd_newstate (dfa, nodes, context, hash);
1560   if (BE (new_state == NULL, 0))
1561     *err = REG_ESPACE;
1562 
1563   return new_state;
1564 }
1565 
1566 /* Finish initialization of the new state NEWSTATE, and using its hash value
1567    HASH put in the appropriate bucket of DFA's state table.  Return value
1568    indicates the error code if failed.  */
1569 
1570 static reg_errcode_t
1571 __attribute_warn_unused_result__
1572 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1573 		re_hashval_t hash)
1574 {
1575   struct re_state_table_entry *spot;
1576   reg_errcode_t err;
1577   Idx i;
1578 
1579   newstate->hash = hash;
1580   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1581   if (BE (err != REG_NOERROR, 0))
1582     return REG_ESPACE;
1583   for (i = 0; i < newstate->nodes.nelem; i++)
1584     {
1585       Idx elem = newstate->nodes.elems[i];
1586       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1587 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1588 	  return REG_ESPACE;
1589     }
1590 
1591   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1592   if (BE (spot->alloc <= spot->num, 0))
1593     {
1594       Idx new_alloc = 2 * spot->num + 2;
1595       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1596 					      new_alloc);
1597       if (BE (new_array == NULL, 0))
1598 	return REG_ESPACE;
1599       spot->array = new_array;
1600       spot->alloc = new_alloc;
1601     }
1602   spot->array[spot->num++] = newstate;
1603   return REG_NOERROR;
1604 }
1605 
1606 static void
1607 free_state (re_dfastate_t *state)
1608 {
1609   re_node_set_free (&state->non_eps_nodes);
1610   re_node_set_free (&state->inveclosure);
1611   if (state->entrance_nodes != &state->nodes)
1612     {
1613       re_node_set_free (state->entrance_nodes);
1614       re_free (state->entrance_nodes);
1615     }
1616   re_node_set_free (&state->nodes);
1617   re_free (state->word_trtable);
1618   re_free (state->trtable);
1619   re_free (state);
1620 }
1621 
1622 /* Create the new state which is independent of contexts.
1623    Return the new state if succeeded, otherwise return NULL.  */
1624 
1625 static re_dfastate_t *
1626 internal_function __attribute_warn_unused_result__
1627 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1628 		    re_hashval_t hash)
1629 {
1630   Idx i;
1631   reg_errcode_t err;
1632   re_dfastate_t *newstate;
1633 
1634   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1635   if (BE (newstate == NULL, 0))
1636     return NULL;
1637   err = re_node_set_init_copy (&newstate->nodes, nodes);
1638   if (BE (err != REG_NOERROR, 0))
1639     {
1640       re_free (newstate);
1641       return NULL;
1642     }
1643 
1644   newstate->entrance_nodes = &newstate->nodes;
1645   for (i = 0 ; i < nodes->nelem ; i++)
1646     {
1647       re_token_t *node = dfa->nodes + nodes->elems[i];
1648       re_token_type_t type = node->type;
1649       if (type == CHARACTER && !node->constraint)
1650 	continue;
1651 #ifdef RE_ENABLE_I18N
1652       newstate->accept_mb |= node->accept_mb;
1653 #endif /* RE_ENABLE_I18N */
1654 
1655       /* If the state has the halt node, the state is a halt state.  */
1656       if (type == END_OF_RE)
1657 	newstate->halt = 1;
1658       else if (type == OP_BACK_REF)
1659 	newstate->has_backref = 1;
1660       else if (type == ANCHOR || node->constraint)
1661 	newstate->has_constraint = 1;
1662     }
1663   err = register_state (dfa, newstate, hash);
1664   if (BE (err != REG_NOERROR, 0))
1665     {
1666       free_state (newstate);
1667       newstate = NULL;
1668     }
1669   return newstate;
1670 }
1671 
1672 /* Create the new state which is depend on the context CONTEXT.
1673    Return the new state if succeeded, otherwise return NULL.  */
1674 
1675 static re_dfastate_t *
1676 internal_function __attribute_warn_unused_result__
1677 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1678 		    unsigned int context, re_hashval_t hash)
1679 {
1680   Idx i, nctx_nodes = 0;
1681   reg_errcode_t err;
1682   re_dfastate_t *newstate;
1683 
1684   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1685   if (BE (newstate == NULL, 0))
1686     return NULL;
1687   err = re_node_set_init_copy (&newstate->nodes, nodes);
1688   if (BE (err != REG_NOERROR, 0))
1689     {
1690       re_free (newstate);
1691       return NULL;
1692     }
1693 
1694   newstate->context = context;
1695   newstate->entrance_nodes = &newstate->nodes;
1696 
1697   for (i = 0 ; i < nodes->nelem ; i++)
1698     {
1699       re_token_t *node = dfa->nodes + nodes->elems[i];
1700       re_token_type_t type = node->type;
1701       unsigned int constraint = node->constraint;
1702 
1703       if (type == CHARACTER && !constraint)
1704 	continue;
1705 #ifdef RE_ENABLE_I18N
1706       newstate->accept_mb |= node->accept_mb;
1707 #endif /* RE_ENABLE_I18N */
1708 
1709       /* If the state has the halt node, the state is a halt state.  */
1710       if (type == END_OF_RE)
1711 	newstate->halt = 1;
1712       else if (type == OP_BACK_REF)
1713 	newstate->has_backref = 1;
1714 
1715       if (constraint)
1716 	{
1717 	  if (newstate->entrance_nodes == &newstate->nodes)
1718 	    {
1719 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720 	      if (BE (newstate->entrance_nodes == NULL, 0))
1721 		{
1722 		  free_state (newstate);
1723 		  return NULL;
1724 		}
1725 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1726 		  != REG_NOERROR)
1727 		return NULL;
1728 	      nctx_nodes = 0;
1729 	      newstate->has_constraint = 1;
1730 	    }
1731 
1732 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1733 	    {
1734 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1735 	      ++nctx_nodes;
1736 	    }
1737 	}
1738     }
1739   err = register_state (dfa, newstate, hash);
1740   if (BE (err != REG_NOERROR, 0))
1741     {
1742       free_state (newstate);
1743       newstate = NULL;
1744     }
1745   return  newstate;
1746 }
1747