xref: /dragonfly/contrib/grep/lib/regex_internal.c (revision 0dace59e)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
18 
19 #include "verify.h"
20 #include "intprops.h"
21 static void re_string_construct_common (const char *str, Idx len,
22 					re_string_t *pstr,
23 					RE_TRANSLATE_TYPE trans, bool icase,
24 					const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 					  const re_node_set *nodes,
27 					  re_hashval_t hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 					  const re_node_set *nodes,
30 					  unsigned int context,
31 					  re_hashval_t hash) internal_function;
32 
33 /* Functions for string operation.  */
34 
35 /* This function allocate the buffers.  It is necessary to call
36    re_string_reconstruct before using the object.  */
37 
38 static reg_errcode_t
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
41 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
42 {
43   reg_errcode_t ret;
44   Idx init_buf_len;
45 
46   /* Ensure at least one character fits into the buffers.  */
47   if (init_len < dfa->mb_cur_max)
48     init_len = dfa->mb_cur_max;
49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 
52   ret = re_string_realloc_buffers (pstr, init_buf_len);
53   if (BE (ret != REG_NOERROR, 0))
54     return ret;
55 
56   pstr->word_char = dfa->word_char;
57   pstr->word_ops_used = dfa->word_ops_used;
58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60   pstr->valid_raw_len = pstr->valid_len;
61   return REG_NOERROR;
62 }
63 
64 /* This function allocate the buffers, and initialize them.  */
65 
66 static reg_errcode_t
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t *pstr, const char *str, Idx len,
69 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
70 {
71   reg_errcode_t ret;
72   memset (pstr, '\0', sizeof (re_string_t));
73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
74 
75   if (len > 0)
76     {
77       ret = re_string_realloc_buffers (pstr, len + 1);
78       if (BE (ret != REG_NOERROR, 0))
79 	return ret;
80     }
81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82 
83   if (icase)
84     {
85 #ifdef RE_ENABLE_I18N
86       if (dfa->mb_cur_max > 1)
87 	{
88 	  while (1)
89 	    {
90 	      ret = build_wcs_upper_buffer (pstr);
91 	      if (BE (ret != REG_NOERROR, 0))
92 		return ret;
93 	      if (pstr->valid_raw_len >= len)
94 		break;
95 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 		break;
97 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 	      if (BE (ret != REG_NOERROR, 0))
99 		return ret;
100 	    }
101 	}
102       else
103 #endif /* RE_ENABLE_I18N  */
104 	build_upper_buffer (pstr);
105     }
106   else
107     {
108 #ifdef RE_ENABLE_I18N
109       if (dfa->mb_cur_max > 1)
110 	build_wcs_buffer (pstr);
111       else
112 #endif /* RE_ENABLE_I18N  */
113 	{
114 	  if (trans != NULL)
115 	    re_string_translate_buffer (pstr);
116 	  else
117 	    {
118 	      pstr->valid_len = pstr->bufs_len;
119 	      pstr->valid_raw_len = pstr->bufs_len;
120 	    }
121 	}
122     }
123 
124   return REG_NOERROR;
125 }
126 
127 /* Helper functions for re_string_allocate, and re_string_construct.  */
128 
129 static reg_errcode_t
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 {
133 #ifdef RE_ENABLE_I18N
134   if (pstr->mb_cur_max > 1)
135     {
136       wint_t *new_wcs;
137 
138       /* Avoid overflow in realloc.  */
139       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
140       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
141 	return REG_ESPACE;
142 
143       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144       if (BE (new_wcs == NULL, 0))
145 	return REG_ESPACE;
146       pstr->wcs = new_wcs;
147       if (pstr->offsets != NULL)
148 	{
149 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
150 	  if (BE (new_offsets == NULL, 0))
151 	    return REG_ESPACE;
152 	  pstr->offsets = new_offsets;
153 	}
154     }
155 #endif /* RE_ENABLE_I18N  */
156   if (pstr->mbs_allocated)
157     {
158       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 					   new_buf_len);
160       if (BE (new_mbs == NULL, 0))
161 	return REG_ESPACE;
162       pstr->mbs = new_mbs;
163     }
164   pstr->bufs_len = new_buf_len;
165   return REG_NOERROR;
166 }
167 
168 
169 static void
170 internal_function
171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
172 			    RE_TRANSLATE_TYPE trans, bool icase,
173 			    const re_dfa_t *dfa)
174 {
175   pstr->raw_mbs = (const unsigned char *) str;
176   pstr->len = len;
177   pstr->raw_len = len;
178   pstr->trans = trans;
179   pstr->icase = icase;
180   pstr->mbs_allocated = (trans != NULL || icase);
181   pstr->mb_cur_max = dfa->mb_cur_max;
182   pstr->is_utf8 = dfa->is_utf8;
183   pstr->map_notascii = dfa->map_notascii;
184   pstr->stop = pstr->len;
185   pstr->raw_stop = pstr->stop;
186 }
187 
188 #ifdef RE_ENABLE_I18N
189 
190 /* Build wide character buffer PSTR->WCS.
191    If the byte sequence of the string are:
192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193    Then wide character buffer will be:
194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
195    We use WEOF for padding, they indicate that the position isn't
196    a first byte of a multibyte character.
197 
198    Note that this function assumes PSTR->VALID_LEN elements are already
199    built and starts from PSTR->VALID_LEN.  */
200 
201 static void
202 internal_function
203 build_wcs_buffer (re_string_t *pstr)
204 {
205 #ifdef _LIBC
206   unsigned char buf[MB_LEN_MAX];
207   assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 #else
209   unsigned char buf[64];
210 #endif
211   mbstate_t prev_st;
212   Idx byte_idx, end_idx, remain_len;
213   size_t mbclen;
214 
215   /* Build the buffers from pstr->valid_len to either pstr->len or
216      pstr->bufs_len.  */
217   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219     {
220       wchar_t wc;
221       const char *p;
222 
223       remain_len = end_idx - byte_idx;
224       prev_st = pstr->cur_state;
225       /* Apply the translation if we need.  */
226       if (BE (pstr->trans != NULL, 0))
227 	{
228 	  int i, ch;
229 
230 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 	    {
232 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 	    }
235 	  p = (const char *) buf;
236 	}
237       else
238 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240       if (BE (mbclen == (size_t) -1 || mbclen == 0
241 	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
242 	{
243 	  /* We treat these cases as a singlebyte character.  */
244 	  mbclen = 1;
245 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246 	  if (BE (pstr->trans != NULL, 0))
247 	    wc = pstr->trans[wc];
248 	  pstr->cur_state = prev_st;
249 	}
250       else if (BE (mbclen == (size_t) -2, 0))
251 	{
252 	  /* The buffer doesn't have enough space, finish to build.  */
253 	  pstr->cur_state = prev_st;
254 	  break;
255 	}
256 
257       /* Write wide character and padding.  */
258       pstr->wcs[byte_idx++] = wc;
259       /* Write paddings.  */
260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
261 	pstr->wcs[byte_idx++] = WEOF;
262     }
263   pstr->valid_len = byte_idx;
264   pstr->valid_raw_len = byte_idx;
265 }
266 
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268    but for REG_ICASE.  */
269 
270 static reg_errcode_t
271 internal_function __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t *pstr)
273 {
274   mbstate_t prev_st;
275   Idx src_idx, byte_idx, end_idx, remain_len;
276   size_t mbclen;
277 #ifdef _LIBC
278   char buf[MB_LEN_MAX];
279   assert (MB_LEN_MAX >= pstr->mb_cur_max);
280 #else
281   char buf[64];
282 #endif
283 
284   byte_idx = pstr->valid_len;
285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286 
287   /* The following optimization assumes that ASCII characters can be
288      mapped to wide characters with a simple cast.  */
289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
290     {
291       while (byte_idx < end_idx)
292 	{
293 	  wchar_t wc;
294 
295 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
296 	      && mbsinit (&pstr->cur_state))
297 	    {
298 	      /* In case of a singlebyte character.  */
299 	      pstr->mbs[byte_idx]
300 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
301 	      /* The next step uses the assumption that wchar_t is encoded
302 		 ASCII-safe: all ASCII values can be converted like this.  */
303 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
304 	      ++byte_idx;
305 	      continue;
306 	    }
307 
308 	  remain_len = end_idx - byte_idx;
309 	  prev_st = pstr->cur_state;
310 	  mbclen = __mbrtowc (&wc,
311 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
312 			       + byte_idx), remain_len, &pstr->cur_state);
313 	  if (BE (mbclen < (size_t) -2, 1))
314 	    {
315 	      wchar_t wcu = wc;
316 	      if (iswlower (wc))
317 		{
318 		  size_t mbcdlen;
319 
320 		  wcu = towupper (wc);
321 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
322 		  if (BE (mbclen == mbcdlen, 1))
323 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
324 		  else
325 		    {
326 		      src_idx = byte_idx;
327 		      goto offsets_needed;
328 		    }
329 		}
330 	      else
331 		memcpy (pstr->mbs + byte_idx,
332 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
333 	      pstr->wcs[byte_idx++] = wcu;
334 	      /* Write paddings.  */
335 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
336 		pstr->wcs[byte_idx++] = WEOF;
337 	    }
338 	  else if (mbclen == (size_t) -1 || mbclen == 0
339 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
340 	    {
341 	      /* It is an invalid character, an incomplete character
342 		 at the end of the string, or '\0'.  Just use the byte.  */
343 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
344 	      pstr->mbs[byte_idx] = ch;
345 	      /* And also cast it to wide char.  */
346 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
347 	      if (BE (mbclen == (size_t) -1, 0))
348 		pstr->cur_state = prev_st;
349 	    }
350 	  else
351 	    {
352 	      /* The buffer doesn't have enough space, finish to build.  */
353 	      pstr->cur_state = prev_st;
354 	      break;
355 	    }
356 	}
357       pstr->valid_len = byte_idx;
358       pstr->valid_raw_len = byte_idx;
359       return REG_NOERROR;
360     }
361   else
362     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363       {
364 	wchar_t wc;
365 	const char *p;
366       offsets_needed:
367 	remain_len = end_idx - byte_idx;
368 	prev_st = pstr->cur_state;
369 	if (BE (pstr->trans != NULL, 0))
370 	  {
371 	    int i, ch;
372 
373 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
374 	      {
375 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
376 		buf[i] = pstr->trans[ch];
377 	      }
378 	    p = (const char *) buf;
379 	  }
380 	else
381 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
382 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
383 	if (BE (mbclen < (size_t) -2, 1))
384 	  {
385 	    wchar_t wcu = wc;
386 	    if (iswlower (wc))
387 	      {
388 		size_t mbcdlen;
389 
390 		wcu = towupper (wc);
391 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
392 		if (BE (mbclen == mbcdlen, 1))
393 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
394 		else if (mbcdlen != (size_t) -1)
395 		  {
396 		    size_t i;
397 
398 		    if (byte_idx + mbcdlen > pstr->bufs_len)
399 		      {
400 			pstr->cur_state = prev_st;
401 			break;
402 		      }
403 
404 		    if (pstr->offsets == NULL)
405 		      {
406 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
407 
408 			if (pstr->offsets == NULL)
409 			  return REG_ESPACE;
410 		      }
411 		    if (!pstr->offsets_needed)
412 		      {
413 			for (i = 0; i < (size_t) byte_idx; ++i)
414 			  pstr->offsets[i] = i;
415 			pstr->offsets_needed = 1;
416 		      }
417 
418 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419 		    pstr->wcs[byte_idx] = wcu;
420 		    pstr->offsets[byte_idx] = src_idx;
421 		    for (i = 1; i < mbcdlen; ++i)
422 		      {
423 			pstr->offsets[byte_idx + i]
424 			  = src_idx + (i < mbclen ? i : mbclen - 1);
425 			pstr->wcs[byte_idx + i] = WEOF;
426 		      }
427 		    pstr->len += mbcdlen - mbclen;
428 		    if (pstr->raw_stop > src_idx)
429 		      pstr->stop += mbcdlen - mbclen;
430 		    end_idx = (pstr->bufs_len > pstr->len)
431 			      ? pstr->len : pstr->bufs_len;
432 		    byte_idx += mbcdlen;
433 		    src_idx += mbclen;
434 		    continue;
435 		  }
436 		else
437 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
438 	      }
439 	    else
440 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
441 
442 	    if (BE (pstr->offsets_needed != 0, 0))
443 	      {
444 		size_t i;
445 		for (i = 0; i < mbclen; ++i)
446 		  pstr->offsets[byte_idx + i] = src_idx + i;
447 	      }
448 	    src_idx += mbclen;
449 
450 	    pstr->wcs[byte_idx++] = wcu;
451 	    /* Write paddings.  */
452 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453 	      pstr->wcs[byte_idx++] = WEOF;
454 	  }
455 	else if (mbclen == (size_t) -1 || mbclen == 0
456 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
457 	  {
458 	    /* It is an invalid character or '\0'.  Just use the byte.  */
459 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460 
461 	    if (BE (pstr->trans != NULL, 0))
462 	      ch = pstr->trans [ch];
463 	    pstr->mbs[byte_idx] = ch;
464 
465 	    if (BE (pstr->offsets_needed != 0, 0))
466 	      pstr->offsets[byte_idx] = src_idx;
467 	    ++src_idx;
468 
469 	    /* And also cast it to wide char.  */
470 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
471 	    if (BE (mbclen == (size_t) -1, 0))
472 	      pstr->cur_state = prev_st;
473 	  }
474 	else
475 	  {
476 	    /* The buffer doesn't have enough space, finish to build.  */
477 	    pstr->cur_state = prev_st;
478 	    break;
479 	  }
480       }
481   pstr->valid_len = byte_idx;
482   pstr->valid_raw_len = src_idx;
483   return REG_NOERROR;
484 }
485 
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487    Return the index.  */
488 
489 static Idx
490 internal_function
491 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
492 {
493   mbstate_t prev_st;
494   Idx rawbuf_idx;
495   size_t mbclen;
496   wint_t wc = WEOF;
497 
498   /* Skip the characters which are not necessary to check.  */
499   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500        rawbuf_idx < new_raw_idx;)
501     {
502       wchar_t wc2;
503       Idx remain_len = pstr->raw_len - rawbuf_idx;
504       prev_st = pstr->cur_state;
505       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
506 			  remain_len, &pstr->cur_state);
507       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
508 	{
509 	  /* We treat these cases as a single byte character.  */
510 	  if (mbclen == 0 || remain_len == 0)
511 	    wc = L'\0';
512 	  else
513 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
514 	  mbclen = 1;
515 	  pstr->cur_state = prev_st;
516 	}
517       else
518 	wc = wc2;
519       /* Then proceed the next character.  */
520       rawbuf_idx += mbclen;
521     }
522   *last_wc = wc;
523   return rawbuf_idx;
524 }
525 #endif /* RE_ENABLE_I18N  */
526 
527 /* Build the buffer PSTR->MBS, and apply the translation if we need.
528    This function is used in case of REG_ICASE.  */
529 
530 static void
531 internal_function
532 build_upper_buffer (re_string_t *pstr)
533 {
534   Idx char_idx, end_idx;
535   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
536 
537   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
538     {
539       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
540       if (BE (pstr->trans != NULL, 0))
541 	ch = pstr->trans[ch];
542       if (islower (ch))
543 	pstr->mbs[char_idx] = toupper (ch);
544       else
545 	pstr->mbs[char_idx] = ch;
546     }
547   pstr->valid_len = char_idx;
548   pstr->valid_raw_len = char_idx;
549 }
550 
551 /* Apply TRANS to the buffer in PSTR.  */
552 
553 static void
554 internal_function
555 re_string_translate_buffer (re_string_t *pstr)
556 {
557   Idx buf_idx, end_idx;
558   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
559 
560   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
561     {
562       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
563       pstr->mbs[buf_idx] = pstr->trans[ch];
564     }
565 
566   pstr->valid_len = buf_idx;
567   pstr->valid_raw_len = buf_idx;
568 }
569 
570 /* This function re-construct the buffers.
571    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
572    convert to upper case in case of REG_ICASE, apply translation.  */
573 
574 static reg_errcode_t
575 internal_function __attribute_warn_unused_result__
576 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
577 {
578   Idx offset;
579 
580   if (BE (pstr->raw_mbs_idx <= idx, 0))
581     offset = idx - pstr->raw_mbs_idx;
582   else
583     {
584       /* Reset buffer.  */
585 #ifdef RE_ENABLE_I18N
586       if (pstr->mb_cur_max > 1)
587 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
588 #endif /* RE_ENABLE_I18N */
589       pstr->len = pstr->raw_len;
590       pstr->stop = pstr->raw_stop;
591       pstr->valid_len = 0;
592       pstr->raw_mbs_idx = 0;
593       pstr->valid_raw_len = 0;
594       pstr->offsets_needed = 0;
595       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
596 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
597       if (!pstr->mbs_allocated)
598 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
599       offset = idx;
600     }
601 
602   if (BE (offset != 0, 1))
603     {
604       /* Should the already checked characters be kept?  */
605       if (BE (offset < pstr->valid_raw_len, 1))
606 	{
607 	  /* Yes, move them to the front of the buffer.  */
608 #ifdef RE_ENABLE_I18N
609 	  if (BE (pstr->offsets_needed, 0))
610 	    {
611 	      Idx low = 0, high = pstr->valid_len, mid;
612 	      do
613 		{
614 		  mid = (high + low) / 2;
615 		  if (pstr->offsets[mid] > offset)
616 		    high = mid;
617 		  else if (pstr->offsets[mid] < offset)
618 		    low = mid + 1;
619 		  else
620 		    break;
621 		}
622 	      while (low < high);
623 	      if (pstr->offsets[mid] < offset)
624 		++mid;
625 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
626 							eflags);
627 	      /* This can be quite complicated, so handle specially
628 		 only the common and easy case where the character with
629 		 different length representation of lower and upper
630 		 case is present at or after offset.  */
631 	      if (pstr->valid_len > offset
632 		  && mid == offset && pstr->offsets[mid] == offset)
633 		{
634 		  memmove (pstr->wcs, pstr->wcs + offset,
635 			   (pstr->valid_len - offset) * sizeof (wint_t));
636 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
637 		  pstr->valid_len -= offset;
638 		  pstr->valid_raw_len -= offset;
639 		  for (low = 0; low < pstr->valid_len; low++)
640 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
641 		}
642 	      else
643 		{
644 		  /* Otherwise, just find out how long the partial multibyte
645 		     character at offset is and fill it with WEOF/255.  */
646 		  pstr->len = pstr->raw_len - idx + offset;
647 		  pstr->stop = pstr->raw_stop - idx + offset;
648 		  pstr->offsets_needed = 0;
649 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
650 		    --mid;
651 		  while (mid < pstr->valid_len)
652 		    if (pstr->wcs[mid] != WEOF)
653 		      break;
654 		    else
655 		      ++mid;
656 		  if (mid == pstr->valid_len)
657 		    pstr->valid_len = 0;
658 		  else
659 		    {
660 		      pstr->valid_len = pstr->offsets[mid] - offset;
661 		      if (pstr->valid_len)
662 			{
663 			  for (low = 0; low < pstr->valid_len; ++low)
664 			    pstr->wcs[low] = WEOF;
665 			  memset (pstr->mbs, 255, pstr->valid_len);
666 			}
667 		    }
668 		  pstr->valid_raw_len = pstr->valid_len;
669 		}
670 	    }
671 	  else
672 #endif
673 	    {
674 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
675 							eflags);
676 #ifdef RE_ENABLE_I18N
677 	      if (pstr->mb_cur_max > 1)
678 		memmove (pstr->wcs, pstr->wcs + offset,
679 			 (pstr->valid_len - offset) * sizeof (wint_t));
680 #endif /* RE_ENABLE_I18N */
681 	      if (BE (pstr->mbs_allocated, 0))
682 		memmove (pstr->mbs, pstr->mbs + offset,
683 			 pstr->valid_len - offset);
684 	      pstr->valid_len -= offset;
685 	      pstr->valid_raw_len -= offset;
686 #if DEBUG
687 	      assert (pstr->valid_len > 0);
688 #endif
689 	    }
690 	}
691       else
692 	{
693 #ifdef RE_ENABLE_I18N
694 	  /* No, skip all characters until IDX.  */
695 	  Idx prev_valid_len = pstr->valid_len;
696 
697 	  if (BE (pstr->offsets_needed, 0))
698 	    {
699 	      pstr->len = pstr->raw_len - idx + offset;
700 	      pstr->stop = pstr->raw_stop - idx + offset;
701 	      pstr->offsets_needed = 0;
702 	    }
703 #endif
704 	  pstr->valid_len = 0;
705 #ifdef RE_ENABLE_I18N
706 	  if (pstr->mb_cur_max > 1)
707 	    {
708 	      Idx wcs_idx;
709 	      wint_t wc = WEOF;
710 
711 	      if (pstr->is_utf8)
712 		{
713 		  const unsigned char *raw, *p, *end;
714 
715 		  /* Special case UTF-8.  Multi-byte chars start with any
716 		     byte other than 0x80 - 0xbf.  */
717 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
718 		  end = raw + (offset - pstr->mb_cur_max);
719 		  if (end < pstr->raw_mbs)
720 		    end = pstr->raw_mbs;
721 		  p = raw + offset - 1;
722 #ifdef _LIBC
723 		  /* We know the wchar_t encoding is UCS4, so for the simple
724 		     case, ASCII characters, skip the conversion step.  */
725 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
726 		    {
727 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
728 		      /* pstr->valid_len = 0; */
729 		      wc = (wchar_t) *p;
730 		    }
731 		  else
732 #endif
733 		    for (; p >= end; --p)
734 		      if ((*p & 0xc0) != 0x80)
735 			{
736 			  mbstate_t cur_state;
737 			  wchar_t wc2;
738 			  Idx mlen = raw + pstr->len - p;
739 			  unsigned char buf[6];
740 			  size_t mbclen;
741 
742 			  const unsigned char *pp = p;
743 			  if (BE (pstr->trans != NULL, 0))
744 			    {
745 			      int i = mlen < 6 ? mlen : 6;
746 			      while (--i >= 0)
747 				buf[i] = pstr->trans[p[i]];
748 			      pp = buf;
749 			    }
750 			  /* XXX Don't use mbrtowc, we know which conversion
751 			     to use (UTF-8 -> UCS4).  */
752 			  memset (&cur_state, 0, sizeof (cur_state));
753 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
754 					      &cur_state);
755 			  if (raw + offset - p <= mbclen
756 			      && mbclen < (size_t) -2)
757 			    {
758 			      memset (&pstr->cur_state, '\0',
759 				      sizeof (mbstate_t));
760 			      pstr->valid_len = mbclen - (raw + offset - p);
761 			      wc = wc2;
762 			    }
763 			  break;
764 			}
765 		}
766 
767 	      if (wc == WEOF)
768 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
769 	      if (wc == WEOF)
770 		pstr->tip_context
771 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
772 	      else
773 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
774 				      && IS_WIDE_WORD_CHAR (wc))
775 				     ? CONTEXT_WORD
776 				     : ((IS_WIDE_NEWLINE (wc)
777 					 && pstr->newline_anchor)
778 					? CONTEXT_NEWLINE : 0));
779 	      if (BE (pstr->valid_len, 0))
780 		{
781 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
782 		    pstr->wcs[wcs_idx] = WEOF;
783 		  if (pstr->mbs_allocated)
784 		    memset (pstr->mbs, 255, pstr->valid_len);
785 		}
786 	      pstr->valid_raw_len = pstr->valid_len;
787 	    }
788 	  else
789 #endif /* RE_ENABLE_I18N */
790 	    {
791 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
792 	      pstr->valid_raw_len = 0;
793 	      if (pstr->trans)
794 		c = pstr->trans[c];
795 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
796 				   ? CONTEXT_WORD
797 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
798 				      ? CONTEXT_NEWLINE : 0));
799 	    }
800 	}
801       if (!BE (pstr->mbs_allocated, 0))
802 	pstr->mbs += offset;
803     }
804   pstr->raw_mbs_idx = idx;
805   pstr->len -= offset;
806   pstr->stop -= offset;
807 
808   /* Then build the buffers.  */
809 #ifdef RE_ENABLE_I18N
810   if (pstr->mb_cur_max > 1)
811     {
812       if (pstr->icase)
813 	{
814 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
815 	  if (BE (ret != REG_NOERROR, 0))
816 	    return ret;
817 	}
818       else
819 	build_wcs_buffer (pstr);
820     }
821   else
822 #endif /* RE_ENABLE_I18N */
823     if (BE (pstr->mbs_allocated, 0))
824       {
825 	if (pstr->icase)
826 	  build_upper_buffer (pstr);
827 	else if (pstr->trans != NULL)
828 	  re_string_translate_buffer (pstr);
829       }
830     else
831       pstr->valid_len = pstr->len;
832 
833   pstr->cur_idx = 0;
834   return REG_NOERROR;
835 }
836 
837 static unsigned char
838 internal_function __attribute ((pure))
839 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
840 {
841   int ch;
842   Idx off;
843 
844   /* Handle the common (easiest) cases first.  */
845   if (BE (!pstr->mbs_allocated, 1))
846     return re_string_peek_byte (pstr, idx);
847 
848 #ifdef RE_ENABLE_I18N
849   if (pstr->mb_cur_max > 1
850       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
851     return re_string_peek_byte (pstr, idx);
852 #endif
853 
854   off = pstr->cur_idx + idx;
855 #ifdef RE_ENABLE_I18N
856   if (pstr->offsets_needed)
857     off = pstr->offsets[off];
858 #endif
859 
860   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
861 
862 #ifdef RE_ENABLE_I18N
863   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
864      this function returns CAPITAL LETTER I instead of first byte of
865      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
866      since peek_byte_case doesn't advance cur_idx in any way.  */
867   if (pstr->offsets_needed && !isascii (ch))
868     return re_string_peek_byte (pstr, idx);
869 #endif
870 
871   return ch;
872 }
873 
874 static unsigned char
875 internal_function
876 re_string_fetch_byte_case (re_string_t *pstr)
877 {
878   if (BE (!pstr->mbs_allocated, 1))
879     return re_string_fetch_byte (pstr);
880 
881 #ifdef RE_ENABLE_I18N
882   if (pstr->offsets_needed)
883     {
884       Idx off;
885       int ch;
886 
887       /* For tr_TR.UTF-8 [[:islower:]] there is
888 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
889 	 in that case the whole multi-byte character and return
890 	 the original letter.  On the other side, with
891 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
892 	 anything else would complicate things too much.  */
893 
894       if (!re_string_first_byte (pstr, pstr->cur_idx))
895 	return re_string_fetch_byte (pstr);
896 
897       off = pstr->offsets[pstr->cur_idx];
898       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
899 
900       if (! isascii (ch))
901 	return re_string_fetch_byte (pstr);
902 
903       re_string_skip_bytes (pstr,
904 			    re_string_char_size_at (pstr, pstr->cur_idx));
905       return ch;
906     }
907 #endif
908 
909   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
910 }
911 
912 static void
913 internal_function
914 re_string_destruct (re_string_t *pstr)
915 {
916 #ifdef RE_ENABLE_I18N
917   re_free (pstr->wcs);
918   re_free (pstr->offsets);
919 #endif /* RE_ENABLE_I18N  */
920   if (pstr->mbs_allocated)
921     re_free (pstr->mbs);
922 }
923 
924 /* Return the context at IDX in INPUT.  */
925 
926 static unsigned int
927 internal_function
928 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
929 {
930   int c;
931   if (BE (! REG_VALID_INDEX (idx), 0))
932     /* In this case, we use the value stored in input->tip_context,
933        since we can't know the character in input->mbs[-1] here.  */
934     return input->tip_context;
935   if (BE (idx == input->len, 0))
936     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
937 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
938 #ifdef RE_ENABLE_I18N
939   if (input->mb_cur_max > 1)
940     {
941       wint_t wc;
942       Idx wc_idx = idx;
943       while(input->wcs[wc_idx] == WEOF)
944 	{
945 #ifdef DEBUG
946 	  /* It must not happen.  */
947 	  assert (REG_VALID_INDEX (wc_idx));
948 #endif
949 	  --wc_idx;
950 	  if (! REG_VALID_INDEX (wc_idx))
951 	    return input->tip_context;
952 	}
953       wc = input->wcs[wc_idx];
954       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
955 	return CONTEXT_WORD;
956       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
957 	      ? CONTEXT_NEWLINE : 0);
958     }
959   else
960 #endif
961     {
962       c = re_string_byte_at (input, idx);
963       if (bitset_contain (input->word_char, c))
964 	return CONTEXT_WORD;
965       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
966     }
967 }
968 
969 /* Functions for set operation.  */
970 
971 static reg_errcode_t
972 internal_function __attribute_warn_unused_result__
973 re_node_set_alloc (re_node_set *set, Idx size)
974 {
975   set->alloc = size;
976   set->nelem = 0;
977   set->elems = re_malloc (Idx, size);
978   if (BE (set->elems == NULL, 0))
979     return REG_ESPACE;
980   return REG_NOERROR;
981 }
982 
983 static reg_errcode_t
984 internal_function __attribute_warn_unused_result__
985 re_node_set_init_1 (re_node_set *set, Idx elem)
986 {
987   set->alloc = 1;
988   set->nelem = 1;
989   set->elems = re_malloc (Idx, 1);
990   if (BE (set->elems == NULL, 0))
991     {
992       set->alloc = set->nelem = 0;
993       return REG_ESPACE;
994     }
995   set->elems[0] = elem;
996   return REG_NOERROR;
997 }
998 
999 static reg_errcode_t
1000 internal_function __attribute_warn_unused_result__
1001 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1002 {
1003   set->alloc = 2;
1004   set->elems = re_malloc (Idx, 2);
1005   if (BE (set->elems == NULL, 0))
1006     return REG_ESPACE;
1007   if (elem1 == elem2)
1008     {
1009       set->nelem = 1;
1010       set->elems[0] = elem1;
1011     }
1012   else
1013     {
1014       set->nelem = 2;
1015       if (elem1 < elem2)
1016 	{
1017 	  set->elems[0] = elem1;
1018 	  set->elems[1] = elem2;
1019 	}
1020       else
1021 	{
1022 	  set->elems[0] = elem2;
1023 	  set->elems[1] = elem1;
1024 	}
1025     }
1026   return REG_NOERROR;
1027 }
1028 
1029 static reg_errcode_t
1030 internal_function __attribute_warn_unused_result__
1031 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1032 {
1033   dest->nelem = src->nelem;
1034   if (src->nelem > 0)
1035     {
1036       dest->alloc = dest->nelem;
1037       dest->elems = re_malloc (Idx, dest->alloc);
1038       if (BE (dest->elems == NULL, 0))
1039 	{
1040 	  dest->alloc = dest->nelem = 0;
1041 	  return REG_ESPACE;
1042 	}
1043       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1044     }
1045   else
1046     re_node_set_init_empty (dest);
1047   return REG_NOERROR;
1048 }
1049 
1050 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1051    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1052    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1053 
1054 static reg_errcode_t
1055 internal_function __attribute_warn_unused_result__
1056 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1057 			   const re_node_set *src2)
1058 {
1059   Idx i1, i2, is, id, delta, sbase;
1060   if (src1->nelem == 0 || src2->nelem == 0)
1061     return REG_NOERROR;
1062 
1063   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1064      conservative estimate.  */
1065   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1066     {
1067       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1068       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1069       if (BE (new_elems == NULL, 0))
1070 	return REG_ESPACE;
1071       dest->elems = new_elems;
1072       dest->alloc = new_alloc;
1073     }
1074 
1075   /* Find the items in the intersection of SRC1 and SRC2, and copy
1076      into the top of DEST those that are not already in DEST itself.  */
1077   sbase = dest->nelem + src1->nelem + src2->nelem;
1078   i1 = src1->nelem - 1;
1079   i2 = src2->nelem - 1;
1080   id = dest->nelem - 1;
1081   for (;;)
1082     {
1083       if (src1->elems[i1] == src2->elems[i2])
1084 	{
1085 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1086 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1087 	    --id;
1088 
1089           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1090             dest->elems[--sbase] = src1->elems[i1];
1091 
1092 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1093 	    break;
1094 	}
1095 
1096       /* Lower the highest of the two items.  */
1097       else if (src1->elems[i1] < src2->elems[i2])
1098 	{
1099 	  if (! REG_VALID_INDEX (--i2))
1100 	    break;
1101 	}
1102       else
1103 	{
1104 	  if (! REG_VALID_INDEX (--i1))
1105 	    break;
1106 	}
1107     }
1108 
1109   id = dest->nelem - 1;
1110   is = dest->nelem + src1->nelem + src2->nelem - 1;
1111   delta = is - sbase + 1;
1112 
1113   /* Now copy.  When DELTA becomes zero, the remaining
1114      DEST elements are already in place; this is more or
1115      less the same loop that is in re_node_set_merge.  */
1116   dest->nelem += delta;
1117   if (delta > 0 && REG_VALID_INDEX (id))
1118     for (;;)
1119       {
1120 	if (dest->elems[is] > dest->elems[id])
1121 	  {
1122 	    /* Copy from the top.  */
1123 	    dest->elems[id + delta--] = dest->elems[is--];
1124 	    if (delta == 0)
1125 	      break;
1126 	  }
1127 	else
1128 	  {
1129 	    /* Slide from the bottom.  */
1130 	    dest->elems[id + delta] = dest->elems[id];
1131 	    if (! REG_VALID_INDEX (--id))
1132 	      break;
1133 	  }
1134       }
1135 
1136   /* Copy remaining SRC elements.  */
1137   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1138 
1139   return REG_NOERROR;
1140 }
1141 
1142 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1143    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1144 
1145 static reg_errcode_t
1146 internal_function __attribute_warn_unused_result__
1147 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1148 			const re_node_set *src2)
1149 {
1150   Idx i1, i2, id;
1151   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1152     {
1153       dest->alloc = src1->nelem + src2->nelem;
1154       dest->elems = re_malloc (Idx, dest->alloc);
1155       if (BE (dest->elems == NULL, 0))
1156 	return REG_ESPACE;
1157     }
1158   else
1159     {
1160       if (src1 != NULL && src1->nelem > 0)
1161 	return re_node_set_init_copy (dest, src1);
1162       else if (src2 != NULL && src2->nelem > 0)
1163 	return re_node_set_init_copy (dest, src2);
1164       else
1165 	re_node_set_init_empty (dest);
1166       return REG_NOERROR;
1167     }
1168   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1169     {
1170       if (src1->elems[i1] > src2->elems[i2])
1171 	{
1172 	  dest->elems[id++] = src2->elems[i2++];
1173 	  continue;
1174 	}
1175       if (src1->elems[i1] == src2->elems[i2])
1176 	++i2;
1177       dest->elems[id++] = src1->elems[i1++];
1178     }
1179   if (i1 < src1->nelem)
1180     {
1181       memcpy (dest->elems + id, src1->elems + i1,
1182 	     (src1->nelem - i1) * sizeof (Idx));
1183       id += src1->nelem - i1;
1184     }
1185   else if (i2 < src2->nelem)
1186     {
1187       memcpy (dest->elems + id, src2->elems + i2,
1188 	     (src2->nelem - i2) * sizeof (Idx));
1189       id += src2->nelem - i2;
1190     }
1191   dest->nelem = id;
1192   return REG_NOERROR;
1193 }
1194 
1195 /* Calculate the union set of the sets DEST and SRC. And store it to
1196    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1197 
1198 static reg_errcode_t
1199 internal_function __attribute_warn_unused_result__
1200 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1201 {
1202   Idx is, id, sbase, delta;
1203   if (src == NULL || src->nelem == 0)
1204     return REG_NOERROR;
1205   if (dest->alloc < 2 * src->nelem + dest->nelem)
1206     {
1207       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1208       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1209       if (BE (new_buffer == NULL, 0))
1210 	return REG_ESPACE;
1211       dest->elems = new_buffer;
1212       dest->alloc = new_alloc;
1213     }
1214 
1215   if (BE (dest->nelem == 0, 0))
1216     {
1217       dest->nelem = src->nelem;
1218       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1219       return REG_NOERROR;
1220     }
1221 
1222   /* Copy into the top of DEST the items of SRC that are not
1223      found in DEST.  Maybe we could binary search in DEST?  */
1224   for (sbase = dest->nelem + 2 * src->nelem,
1225        is = src->nelem - 1, id = dest->nelem - 1;
1226        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1227     {
1228       if (dest->elems[id] == src->elems[is])
1229 	is--, id--;
1230       else if (dest->elems[id] < src->elems[is])
1231 	dest->elems[--sbase] = src->elems[is--];
1232       else /* if (dest->elems[id] > src->elems[is]) */
1233 	--id;
1234     }
1235 
1236   if (REG_VALID_INDEX (is))
1237     {
1238       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1239       sbase -= is + 1;
1240       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1241     }
1242 
1243   id = dest->nelem - 1;
1244   is = dest->nelem + 2 * src->nelem - 1;
1245   delta = is - sbase + 1;
1246   if (delta == 0)
1247     return REG_NOERROR;
1248 
1249   /* Now copy.  When DELTA becomes zero, the remaining
1250      DEST elements are already in place.  */
1251   dest->nelem += delta;
1252   for (;;)
1253     {
1254       if (dest->elems[is] > dest->elems[id])
1255 	{
1256 	  /* Copy from the top.  */
1257 	  dest->elems[id + delta--] = dest->elems[is--];
1258 	  if (delta == 0)
1259 	    break;
1260 	}
1261       else
1262 	{
1263 	  /* Slide from the bottom.  */
1264 	  dest->elems[id + delta] = dest->elems[id];
1265 	  if (! REG_VALID_INDEX (--id))
1266 	    {
1267 	      /* Copy remaining SRC elements.  */
1268 	      memcpy (dest->elems, dest->elems + sbase,
1269 		      delta * sizeof (Idx));
1270 	      break;
1271 	    }
1272 	}
1273     }
1274 
1275   return REG_NOERROR;
1276 }
1277 
1278 /* Insert the new element ELEM to the re_node_set* SET.
1279    SET should not already have ELEM.
1280    Return true if successful.  */
1281 
1282 static bool
1283 internal_function __attribute_warn_unused_result__
1284 re_node_set_insert (re_node_set *set, Idx elem)
1285 {
1286   Idx idx;
1287   /* In case the set is empty.  */
1288   if (set->alloc == 0)
1289     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1290 
1291   if (BE (set->nelem, 0) == 0)
1292     {
1293       /* We already guaranteed above that set->alloc != 0.  */
1294       set->elems[0] = elem;
1295       ++set->nelem;
1296       return true;
1297     }
1298 
1299   /* Realloc if we need.  */
1300   if (set->alloc == set->nelem)
1301     {
1302       Idx *new_elems;
1303       set->alloc = set->alloc * 2;
1304       new_elems = re_realloc (set->elems, Idx, set->alloc);
1305       if (BE (new_elems == NULL, 0))
1306 	return false;
1307       set->elems = new_elems;
1308     }
1309 
1310   /* Move the elements which follows the new element.  Test the
1311      first element separately to skip a check in the inner loop.  */
1312   if (elem < set->elems[0])
1313     {
1314       idx = 0;
1315       for (idx = set->nelem; idx > 0; idx--)
1316 	set->elems[idx] = set->elems[idx - 1];
1317     }
1318   else
1319     {
1320       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1321 	set->elems[idx] = set->elems[idx - 1];
1322     }
1323 
1324   /* Insert the new element.  */
1325   set->elems[idx] = elem;
1326   ++set->nelem;
1327   return true;
1328 }
1329 
1330 /* Insert the new element ELEM to the re_node_set* SET.
1331    SET should not already have any element greater than or equal to ELEM.
1332    Return true if successful.  */
1333 
1334 static bool
1335 internal_function __attribute_warn_unused_result__
1336 re_node_set_insert_last (re_node_set *set, Idx elem)
1337 {
1338   /* Realloc if we need.  */
1339   if (set->alloc == set->nelem)
1340     {
1341       Idx *new_elems;
1342       set->alloc = (set->alloc + 1) * 2;
1343       new_elems = re_realloc (set->elems, Idx, set->alloc);
1344       if (BE (new_elems == NULL, 0))
1345 	return false;
1346       set->elems = new_elems;
1347     }
1348 
1349   /* Insert the new element.  */
1350   set->elems[set->nelem++] = elem;
1351   return true;
1352 }
1353 
1354 /* Compare two node sets SET1 and SET2.
1355    Return true if SET1 and SET2 are equivalent.  */
1356 
1357 static bool
1358 internal_function __attribute ((pure))
1359 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1360 {
1361   Idx i;
1362   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1363     return false;
1364   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1365     if (set1->elems[i] != set2->elems[i])
1366       return false;
1367   return true;
1368 }
1369 
1370 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1371 
1372 static Idx
1373 internal_function __attribute ((pure))
1374 re_node_set_contains (const re_node_set *set, Idx elem)
1375 {
1376   __re_size_t idx, right, mid;
1377   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1378     return 0;
1379 
1380   /* Binary search the element.  */
1381   idx = 0;
1382   right = set->nelem - 1;
1383   while (idx < right)
1384     {
1385       mid = (idx + right) / 2;
1386       if (set->elems[mid] < elem)
1387 	idx = mid + 1;
1388       else
1389 	right = mid;
1390     }
1391   return set->elems[idx] == elem ? idx + 1 : 0;
1392 }
1393 
1394 static void
1395 internal_function
1396 re_node_set_remove_at (re_node_set *set, Idx idx)
1397 {
1398   verify (! TYPE_SIGNED (Idx));
1399   /* if (idx < 0)
1400      return; */
1401   if (idx >= set->nelem)
1402     return;
1403   --set->nelem;
1404   for (; idx < set->nelem; idx++)
1405     set->elems[idx] = set->elems[idx + 1];
1406 }
1407 
1408 
1409 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1410    Or return REG_MISSING if an error occurred.  */
1411 
1412 static Idx
1413 internal_function
1414 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1415 {
1416   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1417     {
1418       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1419       Idx *new_nexts, *new_indices;
1420       re_node_set *new_edests, *new_eclosures;
1421       re_token_t *new_nodes;
1422 
1423       /* Avoid overflows in realloc.  */
1424       const size_t max_object_size = MAX (sizeof (re_token_t),
1425 					  MAX (sizeof (re_node_set),
1426 					       sizeof (Idx)));
1427       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1428 	return REG_MISSING;
1429 
1430       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1431       if (BE (new_nodes == NULL, 0))
1432 	return REG_MISSING;
1433       dfa->nodes = new_nodes;
1434       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1435       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1436       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1437       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1438       if (BE (new_nexts == NULL || new_indices == NULL
1439 	      || new_edests == NULL || new_eclosures == NULL, 0))
1440 	return REG_MISSING;
1441       dfa->nexts = new_nexts;
1442       dfa->org_indices = new_indices;
1443       dfa->edests = new_edests;
1444       dfa->eclosures = new_eclosures;
1445       dfa->nodes_alloc = new_nodes_alloc;
1446     }
1447   dfa->nodes[dfa->nodes_len] = token;
1448   dfa->nodes[dfa->nodes_len].constraint = 0;
1449 #ifdef RE_ENABLE_I18N
1450   {
1451   int type = token.type;
1452   dfa->nodes[dfa->nodes_len].accept_mb =
1453     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1454   }
1455 #endif
1456   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1457   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1458   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1459   return dfa->nodes_len++;
1460 }
1461 
1462 static inline re_hashval_t
1463 internal_function
1464 calc_state_hash (const re_node_set *nodes, unsigned int context)
1465 {
1466   re_hashval_t hash = nodes->nelem + context;
1467   Idx i;
1468   for (i = 0 ; i < nodes->nelem ; i++)
1469     hash += nodes->elems[i];
1470   return hash;
1471 }
1472 
1473 /* Search for the state whose node_set is equivalent to NODES.
1474    Return the pointer to the state, if we found it in the DFA.
1475    Otherwise create the new one and return it.  In case of an error
1476    return NULL and set the error code in ERR.
1477    Note: - We assume NULL as the invalid state, then it is possible that
1478 	   return value is NULL and ERR is REG_NOERROR.
1479 	 - We never return non-NULL value in case of any errors, it is for
1480 	   optimization.  */
1481 
1482 static re_dfastate_t *
1483 internal_function __attribute_warn_unused_result__
1484 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1485 		  const re_node_set *nodes)
1486 {
1487   re_hashval_t hash;
1488   re_dfastate_t *new_state;
1489   struct re_state_table_entry *spot;
1490   Idx i;
1491 #ifdef lint
1492   /* Suppress bogus uninitialized-variable warnings.  */
1493   *err = REG_NOERROR;
1494 #endif
1495   if (BE (nodes->nelem == 0, 0))
1496     {
1497       *err = REG_NOERROR;
1498       return NULL;
1499     }
1500   hash = calc_state_hash (nodes, 0);
1501   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502 
1503   for (i = 0 ; i < spot->num ; i++)
1504     {
1505       re_dfastate_t *state = spot->array[i];
1506       if (hash != state->hash)
1507 	continue;
1508       if (re_node_set_compare (&state->nodes, nodes))
1509 	return state;
1510     }
1511 
1512   /* There are no appropriate state in the dfa, create the new one.  */
1513   new_state = create_ci_newstate (dfa, nodes, hash);
1514   if (BE (new_state == NULL, 0))
1515     *err = REG_ESPACE;
1516 
1517   return new_state;
1518 }
1519 
1520 /* Search for the state whose node_set is equivalent to NODES and
1521    whose context is equivalent to CONTEXT.
1522    Return the pointer to the state, if we found it in the DFA.
1523    Otherwise create the new one and return it.  In case of an error
1524    return NULL and set the error code in ERR.
1525    Note: - We assume NULL as the invalid state, then it is possible that
1526 	   return value is NULL and ERR is REG_NOERROR.
1527 	 - We never return non-NULL value in case of any errors, it is for
1528 	   optimization.  */
1529 
1530 static re_dfastate_t *
1531 internal_function __attribute_warn_unused_result__
1532 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1533 			  const re_node_set *nodes, unsigned int context)
1534 {
1535   re_hashval_t hash;
1536   re_dfastate_t *new_state;
1537   struct re_state_table_entry *spot;
1538   Idx i;
1539 #ifdef lint
1540   /* Suppress bogus uninitialized-variable warnings.  */
1541   *err = REG_NOERROR;
1542 #endif
1543   if (nodes->nelem == 0)
1544     {
1545       *err = REG_NOERROR;
1546       return NULL;
1547     }
1548   hash = calc_state_hash (nodes, context);
1549   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1550 
1551   for (i = 0 ; i < spot->num ; i++)
1552     {
1553       re_dfastate_t *state = spot->array[i];
1554       if (state->hash == hash
1555 	  && state->context == context
1556 	  && re_node_set_compare (state->entrance_nodes, nodes))
1557 	return state;
1558     }
1559   /* There are no appropriate state in 'dfa', create the new one.  */
1560   new_state = create_cd_newstate (dfa, nodes, context, hash);
1561   if (BE (new_state == NULL, 0))
1562     *err = REG_ESPACE;
1563 
1564   return new_state;
1565 }
1566 
1567 /* Finish initialization of the new state NEWSTATE, and using its hash value
1568    HASH put in the appropriate bucket of DFA's state table.  Return value
1569    indicates the error code if failed.  */
1570 
1571 static reg_errcode_t
1572 __attribute_warn_unused_result__
1573 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1574 		re_hashval_t hash)
1575 {
1576   struct re_state_table_entry *spot;
1577   reg_errcode_t err;
1578   Idx i;
1579 
1580   newstate->hash = hash;
1581   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1582   if (BE (err != REG_NOERROR, 0))
1583     return REG_ESPACE;
1584   for (i = 0; i < newstate->nodes.nelem; i++)
1585     {
1586       Idx elem = newstate->nodes.elems[i];
1587       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1588 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1589 	  return REG_ESPACE;
1590     }
1591 
1592   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1593   if (BE (spot->alloc <= spot->num, 0))
1594     {
1595       Idx new_alloc = 2 * spot->num + 2;
1596       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1597 					      new_alloc);
1598       if (BE (new_array == NULL, 0))
1599 	return REG_ESPACE;
1600       spot->array = new_array;
1601       spot->alloc = new_alloc;
1602     }
1603   spot->array[spot->num++] = newstate;
1604   return REG_NOERROR;
1605 }
1606 
1607 static void
1608 free_state (re_dfastate_t *state)
1609 {
1610   re_node_set_free (&state->non_eps_nodes);
1611   re_node_set_free (&state->inveclosure);
1612   if (state->entrance_nodes != &state->nodes)
1613     {
1614       re_node_set_free (state->entrance_nodes);
1615       re_free (state->entrance_nodes);
1616     }
1617   re_node_set_free (&state->nodes);
1618   re_free (state->word_trtable);
1619   re_free (state->trtable);
1620   re_free (state);
1621 }
1622 
1623 /* Create the new state which is independent of contexts.
1624    Return the new state if succeeded, otherwise return NULL.  */
1625 
1626 static re_dfastate_t *
1627 internal_function __attribute_warn_unused_result__
1628 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1629 		    re_hashval_t hash)
1630 {
1631   Idx i;
1632   reg_errcode_t err;
1633   re_dfastate_t *newstate;
1634 
1635   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1636   if (BE (newstate == NULL, 0))
1637     return NULL;
1638   err = re_node_set_init_copy (&newstate->nodes, nodes);
1639   if (BE (err != REG_NOERROR, 0))
1640     {
1641       re_free (newstate);
1642       return NULL;
1643     }
1644 
1645   newstate->entrance_nodes = &newstate->nodes;
1646   for (i = 0 ; i < nodes->nelem ; i++)
1647     {
1648       re_token_t *node = dfa->nodes + nodes->elems[i];
1649       re_token_type_t type = node->type;
1650       if (type == CHARACTER && !node->constraint)
1651 	continue;
1652 #ifdef RE_ENABLE_I18N
1653       newstate->accept_mb |= node->accept_mb;
1654 #endif /* RE_ENABLE_I18N */
1655 
1656       /* If the state has the halt node, the state is a halt state.  */
1657       if (type == END_OF_RE)
1658 	newstate->halt = 1;
1659       else if (type == OP_BACK_REF)
1660 	newstate->has_backref = 1;
1661       else if (type == ANCHOR || node->constraint)
1662 	newstate->has_constraint = 1;
1663     }
1664   err = register_state (dfa, newstate, hash);
1665   if (BE (err != REG_NOERROR, 0))
1666     {
1667       free_state (newstate);
1668       newstate = NULL;
1669     }
1670   return newstate;
1671 }
1672 
1673 /* Create the new state which is depend on the context CONTEXT.
1674    Return the new state if succeeded, otherwise return NULL.  */
1675 
1676 static re_dfastate_t *
1677 internal_function __attribute_warn_unused_result__
1678 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1679 		    unsigned int context, re_hashval_t hash)
1680 {
1681   Idx i, nctx_nodes = 0;
1682   reg_errcode_t err;
1683   re_dfastate_t *newstate;
1684 
1685   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1686   if (BE (newstate == NULL, 0))
1687     return NULL;
1688   err = re_node_set_init_copy (&newstate->nodes, nodes);
1689   if (BE (err != REG_NOERROR, 0))
1690     {
1691       re_free (newstate);
1692       return NULL;
1693     }
1694 
1695   newstate->context = context;
1696   newstate->entrance_nodes = &newstate->nodes;
1697 
1698   for (i = 0 ; i < nodes->nelem ; i++)
1699     {
1700       re_token_t *node = dfa->nodes + nodes->elems[i];
1701       re_token_type_t type = node->type;
1702       unsigned int constraint = node->constraint;
1703 
1704       if (type == CHARACTER && !constraint)
1705 	continue;
1706 #ifdef RE_ENABLE_I18N
1707       newstate->accept_mb |= node->accept_mb;
1708 #endif /* RE_ENABLE_I18N */
1709 
1710       /* If the state has the halt node, the state is a halt state.  */
1711       if (type == END_OF_RE)
1712 	newstate->halt = 1;
1713       else if (type == OP_BACK_REF)
1714 	newstate->has_backref = 1;
1715 
1716       if (constraint)
1717 	{
1718 	  if (newstate->entrance_nodes == &newstate->nodes)
1719 	    {
1720 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1721 	      if (BE (newstate->entrance_nodes == NULL, 0))
1722 		{
1723 		  free_state (newstate);
1724 		  return NULL;
1725 		}
1726 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1727 		  != REG_NOERROR)
1728 		return NULL;
1729 	      nctx_nodes = 0;
1730 	      newstate->has_constraint = 1;
1731 	    }
1732 
1733 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1734 	    {
1735 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1736 	      ++nctx_nodes;
1737 	    }
1738 	}
1739     }
1740   err = register_state (dfa, newstate, hash);
1741   if (BE (err != REG_NOERROR, 0))
1742     {
1743       free_state (newstate);
1744       newstate = NULL;
1745     }
1746   return  newstate;
1747 }
1748