xref: /dragonfly/contrib/grep/lib/regex_internal.c (revision fcf53d9b)
1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5    Software Foundation, Inc.
6    This file is part of the GNU C Library.
7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3, or (at your option)
12    any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License along
20    with this program; if not, write to the Free Software Foundation,
21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 #include "verify.h"
24 #include "intprops.h"
25 static void re_string_construct_common (const char *str, Idx len,
26 					re_string_t *pstr,
27 					RE_TRANSLATE_TYPE trans, bool icase,
28 					const re_dfa_t *dfa) internal_function;
29 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
30 					  const re_node_set *nodes,
31 					  re_hashval_t hash) internal_function;
32 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
33 					  const re_node_set *nodes,
34 					  unsigned int context,
35 					  re_hashval_t hash) internal_function;
36 
37 /* Functions for string operation.  */
38 
39 /* This function allocate the buffers.  It is necessary to call
40    re_string_reconstruct before using the object.  */
41 
42 static reg_errcode_t
43 internal_function __attribute_warn_unused_result__
44 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
45 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
46 {
47   reg_errcode_t ret;
48   Idx init_buf_len;
49 
50   /* Ensure at least one character fits into the buffers.  */
51   if (init_len < dfa->mb_cur_max)
52     init_len = dfa->mb_cur_max;
53   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
54   re_string_construct_common (str, len, pstr, trans, icase, dfa);
55 
56   ret = re_string_realloc_buffers (pstr, init_buf_len);
57   if (BE (ret != REG_NOERROR, 0))
58     return ret;
59 
60   pstr->word_char = dfa->word_char;
61   pstr->word_ops_used = dfa->word_ops_used;
62   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
63   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
64   pstr->valid_raw_len = pstr->valid_len;
65   return REG_NOERROR;
66 }
67 
68 /* This function allocate the buffers, and initialize them.  */
69 
70 static reg_errcode_t
71 internal_function __attribute_warn_unused_result__
72 re_string_construct (re_string_t *pstr, const char *str, Idx len,
73 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
74 {
75   reg_errcode_t ret;
76   memset (pstr, '\0', sizeof (re_string_t));
77   re_string_construct_common (str, len, pstr, trans, icase, dfa);
78 
79   if (len > 0)
80     {
81       ret = re_string_realloc_buffers (pstr, len + 1);
82       if (BE (ret != REG_NOERROR, 0))
83 	return ret;
84     }
85   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
86 
87   if (icase)
88     {
89 #ifdef RE_ENABLE_I18N
90       if (dfa->mb_cur_max > 1)
91 	{
92 	  while (1)
93 	    {
94 	      ret = build_wcs_upper_buffer (pstr);
95 	      if (BE (ret != REG_NOERROR, 0))
96 		return ret;
97 	      if (pstr->valid_raw_len >= len)
98 		break;
99 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
100 		break;
101 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
102 	      if (BE (ret != REG_NOERROR, 0))
103 		return ret;
104 	    }
105 	}
106       else
107 #endif /* RE_ENABLE_I18N  */
108 	build_upper_buffer (pstr);
109     }
110   else
111     {
112 #ifdef RE_ENABLE_I18N
113       if (dfa->mb_cur_max > 1)
114 	build_wcs_buffer (pstr);
115       else
116 #endif /* RE_ENABLE_I18N  */
117 	{
118 	  if (trans != NULL)
119 	    re_string_translate_buffer (pstr);
120 	  else
121 	    {
122 	      pstr->valid_len = pstr->bufs_len;
123 	      pstr->valid_raw_len = pstr->bufs_len;
124 	    }
125 	}
126     }
127 
128   return REG_NOERROR;
129 }
130 
131 /* Helper functions for re_string_allocate, and re_string_construct.  */
132 
133 static reg_errcode_t
134 internal_function __attribute_warn_unused_result__
135 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
136 {
137 #ifdef RE_ENABLE_I18N
138   if (pstr->mb_cur_max > 1)
139     {
140       wint_t *new_wcs;
141 
142       /* Avoid overflow.  */
143       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
144       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
145 	return REG_ESPACE;
146 
147       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
148       if (BE (new_wcs == NULL, 0))
149 	return REG_ESPACE;
150       pstr->wcs = new_wcs;
151       if (pstr->offsets != NULL)
152 	{
153 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
154 	  if (BE (new_offsets == NULL, 0))
155 	    return REG_ESPACE;
156 	  pstr->offsets = new_offsets;
157 	}
158     }
159 #endif /* RE_ENABLE_I18N  */
160   if (pstr->mbs_allocated)
161     {
162       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
163 					   new_buf_len);
164       if (BE (new_mbs == NULL, 0))
165 	return REG_ESPACE;
166       pstr->mbs = new_mbs;
167     }
168   pstr->bufs_len = new_buf_len;
169   return REG_NOERROR;
170 }
171 
172 
173 static void
174 internal_function
175 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
176 			    RE_TRANSLATE_TYPE trans, bool icase,
177 			    const re_dfa_t *dfa)
178 {
179   pstr->raw_mbs = (const unsigned char *) str;
180   pstr->len = len;
181   pstr->raw_len = len;
182   pstr->trans = trans;
183   pstr->icase = icase;
184   pstr->mbs_allocated = (trans != NULL || icase);
185   pstr->mb_cur_max = dfa->mb_cur_max;
186   pstr->is_utf8 = dfa->is_utf8;
187   pstr->map_notascii = dfa->map_notascii;
188   pstr->stop = pstr->len;
189   pstr->raw_stop = pstr->stop;
190 }
191 
192 #ifdef RE_ENABLE_I18N
193 
194 /* Build wide character buffer PSTR->WCS.
195    If the byte sequence of the string are:
196      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
197    Then wide character buffer will be:
198      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
199    We use WEOF for padding, they indicate that the position isn't
200    a first byte of a multibyte character.
201 
202    Note that this function assumes PSTR->VALID_LEN elements are already
203    built and starts from PSTR->VALID_LEN.  */
204 
205 static void
206 internal_function
207 build_wcs_buffer (re_string_t *pstr)
208 {
209 #ifdef _LIBC
210   unsigned char buf[MB_LEN_MAX];
211   assert (MB_LEN_MAX >= pstr->mb_cur_max);
212 #else
213   unsigned char buf[64];
214 #endif
215   mbstate_t prev_st;
216   Idx byte_idx, end_idx, remain_len;
217   size_t mbclen;
218 
219   /* Build the buffers from pstr->valid_len to either pstr->len or
220      pstr->bufs_len.  */
221   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
222   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
223     {
224       wchar_t wc;
225       const char *p;
226 
227       remain_len = end_idx - byte_idx;
228       prev_st = pstr->cur_state;
229       /* Apply the translation if we need.  */
230       if (BE (pstr->trans != NULL, 0))
231 	{
232 	  int i, ch;
233 
234 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
235 	    {
236 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
237 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
238 	    }
239 	  p = (const char *) buf;
240 	}
241       else
242 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
243       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
244       if (BE (mbclen == (size_t) -2, 0))
245 	{
246 	  /* The buffer doesn't have enough space, finish to build.  */
247 	  pstr->cur_state = prev_st;
248 	  break;
249 	}
250       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
251 	{
252 	  /* We treat these cases as a singlebyte character.  */
253 	  mbclen = 1;
254 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 	  if (BE (pstr->trans != NULL, 0))
256 	    wc = pstr->trans[wc];
257 	  pstr->cur_state = prev_st;
258 	}
259 
260       /* Write wide character and padding.  */
261       pstr->wcs[byte_idx++] = wc;
262       /* Write paddings.  */
263       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
264 	pstr->wcs[byte_idx++] = WEOF;
265     }
266   pstr->valid_len = byte_idx;
267   pstr->valid_raw_len = byte_idx;
268 }
269 
270 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
271    but for REG_ICASE.  */
272 
273 static reg_errcode_t
274 internal_function __attribute_warn_unused_result__
275 build_wcs_upper_buffer (re_string_t *pstr)
276 {
277   mbstate_t prev_st;
278   Idx src_idx, byte_idx, end_idx, remain_len;
279   size_t mbclen;
280 #ifdef _LIBC
281   char buf[MB_LEN_MAX];
282   assert (MB_LEN_MAX >= pstr->mb_cur_max);
283 #else
284   char buf[64];
285 #endif
286 
287   byte_idx = pstr->valid_len;
288   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
289 
290   /* The following optimization assumes that ASCII characters can be
291      mapped to wide characters with a simple cast.  */
292   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
293     {
294       while (byte_idx < end_idx)
295 	{
296 	  wchar_t wc;
297 
298 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
299 	      && mbsinit (&pstr->cur_state))
300 	    {
301 	      /* In case of a singlebyte character.  */
302 	      pstr->mbs[byte_idx]
303 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
304 	      /* The next step uses the assumption that wchar_t is encoded
305 		 ASCII-safe: all ASCII values can be converted like this.  */
306 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
307 	      ++byte_idx;
308 	      continue;
309 	    }
310 
311 	  remain_len = end_idx - byte_idx;
312 	  prev_st = pstr->cur_state;
313 	  mbclen = __mbrtowc (&wc,
314 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
315 			       + byte_idx), remain_len, &pstr->cur_state);
316 	  if (BE (mbclen < (size_t) -2, 1))
317 	    {
318 	      wchar_t wcu = wc;
319 	      if (iswlower (wc))
320 		{
321 		  size_t mbcdlen;
322 
323 		  wcu = towupper (wc);
324 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
325 		  if (BE (mbclen == mbcdlen, 1))
326 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
327 		  else
328 		    {
329 		      src_idx = byte_idx;
330 		      goto offsets_needed;
331 		    }
332 		}
333 	      else
334 		memcpy (pstr->mbs + byte_idx,
335 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
336 	      pstr->wcs[byte_idx++] = wcu;
337 	      /* Write paddings.  */
338 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
339 		pstr->wcs[byte_idx++] = WEOF;
340 	    }
341 	  else if (mbclen == (size_t) -1 || mbclen == 0)
342 	    {
343 	      /* It is an invalid character or '\0'.  Just use the byte.  */
344 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345 	      pstr->mbs[byte_idx] = ch;
346 	      /* And also cast it to wide char.  */
347 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
348 	      if (BE (mbclen == (size_t) -1, 0))
349 		pstr->cur_state = prev_st;
350 	    }
351 	  else
352 	    {
353 	      /* The buffer doesn't have enough space, finish to build.  */
354 	      pstr->cur_state = prev_st;
355 	      break;
356 	    }
357 	}
358       pstr->valid_len = byte_idx;
359       pstr->valid_raw_len = byte_idx;
360       return REG_NOERROR;
361     }
362   else
363     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364       {
365 	wchar_t wc;
366 	const char *p;
367       offsets_needed:
368 	remain_len = end_idx - byte_idx;
369 	prev_st = pstr->cur_state;
370 	if (BE (pstr->trans != NULL, 0))
371 	  {
372 	    int i, ch;
373 
374 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 	      {
376 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 		buf[i] = pstr->trans[ch];
378 	      }
379 	    p = (const char *) buf;
380 	  }
381 	else
382 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 	if (BE (mbclen < (size_t) -2, 1))
385 	  {
386 	    wchar_t wcu = wc;
387 	    if (iswlower (wc))
388 	      {
389 		size_t mbcdlen;
390 
391 		wcu = towupper (wc);
392 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393 		if (BE (mbclen == mbcdlen, 1))
394 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
395 		else if (mbcdlen != (size_t) -1)
396 		  {
397 		    size_t i;
398 
399 		    if (byte_idx + mbcdlen > pstr->bufs_len)
400 		      {
401 			pstr->cur_state = prev_st;
402 			break;
403 		      }
404 
405 		    if (pstr->offsets == NULL)
406 		      {
407 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
408 
409 			if (pstr->offsets == NULL)
410 			  return REG_ESPACE;
411 		      }
412 		    if (!pstr->offsets_needed)
413 		      {
414 			for (i = 0; i < (size_t) byte_idx; ++i)
415 			  pstr->offsets[i] = i;
416 			pstr->offsets_needed = 1;
417 		      }
418 
419 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420 		    pstr->wcs[byte_idx] = wcu;
421 		    pstr->offsets[byte_idx] = src_idx;
422 		    for (i = 1; i < mbcdlen; ++i)
423 		      {
424 			pstr->offsets[byte_idx + i]
425 			  = src_idx + (i < mbclen ? i : mbclen - 1);
426 			pstr->wcs[byte_idx + i] = WEOF;
427 		      }
428 		    pstr->len += mbcdlen - mbclen;
429 		    if (pstr->raw_stop > src_idx)
430 		      pstr->stop += mbcdlen - mbclen;
431 		    end_idx = (pstr->bufs_len > pstr->len)
432 			      ? pstr->len : pstr->bufs_len;
433 		    byte_idx += mbcdlen;
434 		    src_idx += mbclen;
435 		    continue;
436 		  }
437 		else
438 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
439 	      }
440 	    else
441 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
442 
443 	    if (BE (pstr->offsets_needed != 0, 0))
444 	      {
445 		size_t i;
446 		for (i = 0; i < mbclen; ++i)
447 		  pstr->offsets[byte_idx + i] = src_idx + i;
448 	      }
449 	    src_idx += mbclen;
450 
451 	    pstr->wcs[byte_idx++] = wcu;
452 	    /* Write paddings.  */
453 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454 	      pstr->wcs[byte_idx++] = WEOF;
455 	  }
456 	else if (mbclen == (size_t) -1 || mbclen == 0)
457 	  {
458 	    /* It is an invalid character or '\0'.  Just use the byte.  */
459 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460 
461 	    if (BE (pstr->trans != NULL, 0))
462 	      ch = pstr->trans [ch];
463 	    pstr->mbs[byte_idx] = ch;
464 
465 	    if (BE (pstr->offsets_needed != 0, 0))
466 	      pstr->offsets[byte_idx] = src_idx;
467 	    ++src_idx;
468 
469 	    /* And also cast it to wide char.  */
470 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
471 	    if (BE (mbclen == (size_t) -1, 0))
472 	      pstr->cur_state = prev_st;
473 	  }
474 	else
475 	  {
476 	    /* The buffer doesn't have enough space, finish to build.  */
477 	    pstr->cur_state = prev_st;
478 	    break;
479 	  }
480       }
481   pstr->valid_len = byte_idx;
482   pstr->valid_raw_len = src_idx;
483   return REG_NOERROR;
484 }
485 
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487    Return the index.  */
488 
489 static Idx
490 internal_function
491 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
492 {
493   mbstate_t prev_st;
494   Idx rawbuf_idx;
495   size_t mbclen;
496   wint_t wc = WEOF;
497 
498   /* Skip the characters which are not necessary to check.  */
499   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500        rawbuf_idx < new_raw_idx;)
501     {
502       wchar_t wc2;
503       Idx remain_len;
504       remain_len = pstr->len - rawbuf_idx;
505       prev_st = pstr->cur_state;
506       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507 			  remain_len, &pstr->cur_state);
508       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
509 	{
510 	  /* We treat these cases as a single byte character.  */
511 	  if (mbclen == 0 || remain_len == 0)
512 	    wc = L'\0';
513 	  else
514 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
515 	  mbclen = 1;
516 	  pstr->cur_state = prev_st;
517 	}
518       else
519 	wc = wc2;
520       /* Then proceed the next character.  */
521       rawbuf_idx += mbclen;
522     }
523   *last_wc = wc;
524   return rawbuf_idx;
525 }
526 #endif /* RE_ENABLE_I18N  */
527 
528 /* Build the buffer PSTR->MBS, and apply the translation if we need.
529    This function is used in case of REG_ICASE.  */
530 
531 static void
532 internal_function
533 build_upper_buffer (re_string_t *pstr)
534 {
535   Idx char_idx, end_idx;
536   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537 
538   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539     {
540       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541       if (BE (pstr->trans != NULL, 0))
542 	ch = pstr->trans[ch];
543       if (islower (ch))
544 	pstr->mbs[char_idx] = toupper (ch);
545       else
546 	pstr->mbs[char_idx] = ch;
547     }
548   pstr->valid_len = char_idx;
549   pstr->valid_raw_len = char_idx;
550 }
551 
552 /* Apply TRANS to the buffer in PSTR.  */
553 
554 static void
555 internal_function
556 re_string_translate_buffer (re_string_t *pstr)
557 {
558   Idx buf_idx, end_idx;
559   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560 
561   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562     {
563       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564       pstr->mbs[buf_idx] = pstr->trans[ch];
565     }
566 
567   pstr->valid_len = buf_idx;
568   pstr->valid_raw_len = buf_idx;
569 }
570 
571 /* This function re-construct the buffers.
572    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573    convert to upper case in case of REG_ICASE, apply translation.  */
574 
575 static reg_errcode_t
576 internal_function __attribute_warn_unused_result__
577 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
578 {
579   Idx offset;
580 
581   if (BE (pstr->raw_mbs_idx <= idx, 0))
582     offset = idx - pstr->raw_mbs_idx;
583   else
584     {
585       /* Reset buffer.  */
586 #ifdef RE_ENABLE_I18N
587       if (pstr->mb_cur_max > 1)
588 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590       pstr->len = pstr->raw_len;
591       pstr->stop = pstr->raw_stop;
592       pstr->valid_len = 0;
593       pstr->raw_mbs_idx = 0;
594       pstr->valid_raw_len = 0;
595       pstr->offsets_needed = 0;
596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598       if (!pstr->mbs_allocated)
599 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600       offset = idx;
601     }
602 
603   if (BE (offset != 0, 1))
604     {
605       /* Should the already checked characters be kept?  */
606       if (BE (offset < pstr->valid_raw_len, 1))
607 	{
608 	  /* Yes, move them to the front of the buffer.  */
609 #ifdef RE_ENABLE_I18N
610 	  if (BE (pstr->offsets_needed, 0))
611 	    {
612 	      Idx low = 0, high = pstr->valid_len, mid;
613 	      do
614 		{
615 		  mid = (high + low) / 2;
616 		  if (pstr->offsets[mid] > offset)
617 		    high = mid;
618 		  else if (pstr->offsets[mid] < offset)
619 		    low = mid + 1;
620 		  else
621 		    break;
622 		}
623 	      while (low < high);
624 	      if (pstr->offsets[mid] < offset)
625 		++mid;
626 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 							eflags);
628 	      /* This can be quite complicated, so handle specially
629 		 only the common and easy case where the character with
630 		 different length representation of lower and upper
631 		 case is present at or after offset.  */
632 	      if (pstr->valid_len > offset
633 		  && mid == offset && pstr->offsets[mid] == offset)
634 		{
635 		  memmove (pstr->wcs, pstr->wcs + offset,
636 			   (pstr->valid_len - offset) * sizeof (wint_t));
637 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 		  pstr->valid_len -= offset;
639 		  pstr->valid_raw_len -= offset;
640 		  for (low = 0; low < pstr->valid_len; low++)
641 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642 		}
643 	      else
644 		{
645 		  /* Otherwise, just find out how long the partial multibyte
646 		     character at offset is and fill it with WEOF/255.  */
647 		  pstr->len = pstr->raw_len - idx + offset;
648 		  pstr->stop = pstr->raw_stop - idx + offset;
649 		  pstr->offsets_needed = 0;
650 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 		    --mid;
652 		  while (mid < pstr->valid_len)
653 		    if (pstr->wcs[mid] != WEOF)
654 		      break;
655 		    else
656 		      ++mid;
657 		  if (mid == pstr->valid_len)
658 		    pstr->valid_len = 0;
659 		  else
660 		    {
661 		      pstr->valid_len = pstr->offsets[mid] - offset;
662 		      if (pstr->valid_len)
663 			{
664 			  for (low = 0; low < pstr->valid_len; ++low)
665 			    pstr->wcs[low] = WEOF;
666 			  memset (pstr->mbs, 255, pstr->valid_len);
667 			}
668 		    }
669 		  pstr->valid_raw_len = pstr->valid_len;
670 		}
671 	    }
672 	  else
673 #endif
674 	    {
675 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 							eflags);
677 #ifdef RE_ENABLE_I18N
678 	      if (pstr->mb_cur_max > 1)
679 		memmove (pstr->wcs, pstr->wcs + offset,
680 			 (pstr->valid_len - offset) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 	      if (BE (pstr->mbs_allocated, 0))
683 		memmove (pstr->mbs, pstr->mbs + offset,
684 			 pstr->valid_len - offset);
685 	      pstr->valid_len -= offset;
686 	      pstr->valid_raw_len -= offset;
687 #if DEBUG
688 	      assert (pstr->valid_len > 0);
689 #endif
690 	    }
691 	}
692       else
693 	{
694 #ifdef RE_ENABLE_I18N
695 	  /* No, skip all characters until IDX.  */
696 	  Idx prev_valid_len = pstr->valid_len;
697 
698 	  if (BE (pstr->offsets_needed, 0))
699 	    {
700 	      pstr->len = pstr->raw_len - idx + offset;
701 	      pstr->stop = pstr->raw_stop - idx + offset;
702 	      pstr->offsets_needed = 0;
703 	    }
704 #endif
705 	  pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 	  if (pstr->mb_cur_max > 1)
708 	    {
709 	      Idx wcs_idx;
710 	      wint_t wc = WEOF;
711 
712 	      if (pstr->is_utf8)
713 		{
714 		  const unsigned char *raw, *p, *end;
715 
716 		  /* Special case UTF-8.  Multi-byte chars start with any
717 		     byte other than 0x80 - 0xbf.  */
718 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 		  end = raw + (offset - pstr->mb_cur_max);
720 		  if (end < pstr->raw_mbs)
721 		    end = pstr->raw_mbs;
722 		  p = raw + offset - 1;
723 #ifdef _LIBC
724 		  /* We know the wchar_t encoding is UCS4, so for the simple
725 		     case, ASCII characters, skip the conversion step.  */
726 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
727 		    {
728 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 		      /* pstr->valid_len = 0; */
730 		      wc = (wchar_t) *p;
731 		    }
732 		  else
733 #endif
734 		    for (; p >= end; --p)
735 		      if ((*p & 0xc0) != 0x80)
736 			{
737 			  mbstate_t cur_state;
738 			  wchar_t wc2;
739 			  Idx mlen = raw + pstr->len - p;
740 			  size_t mbclen;
741 
742 #if 0 /* dead code: buf is set but never used */
743 			  unsigned char buf[6];
744 			  if (BE (pstr->trans != NULL, 0))
745 			    {
746 			      int i = mlen < 6 ? mlen : 6;
747 			      while (--i >= 0)
748 				buf[i] = pstr->trans[p[i]];
749 			    }
750 #endif
751 			  /* XXX Don't use mbrtowc, we know which conversion
752 			     to use (UTF-8 -> UCS4).  */
753 			  memset (&cur_state, 0, sizeof (cur_state));
754 			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
755 					      &cur_state);
756 			  if (raw + offset - p <= mbclen
757 			      && mbclen < (size_t) -2)
758 			    {
759 			      memset (&pstr->cur_state, '\0',
760 				      sizeof (mbstate_t));
761 			      pstr->valid_len = mbclen - (raw + offset - p);
762 			      wc = wc2;
763 			    }
764 			  break;
765 			}
766 		}
767 
768 	      if (wc == WEOF)
769 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770 	      if (wc == WEOF)
771 		pstr->tip_context
772 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773 	      else
774 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
775 				      && IS_WIDE_WORD_CHAR (wc))
776 				     ? CONTEXT_WORD
777 				     : ((IS_WIDE_NEWLINE (wc)
778 					 && pstr->newline_anchor)
779 					? CONTEXT_NEWLINE : 0));
780 	      if (BE (pstr->valid_len, 0))
781 		{
782 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 		    pstr->wcs[wcs_idx] = WEOF;
784 		  if (pstr->mbs_allocated)
785 		    memset (pstr->mbs, 255, pstr->valid_len);
786 		}
787 	      pstr->valid_raw_len = pstr->valid_len;
788 	    }
789 	  else
790 #endif /* RE_ENABLE_I18N */
791 	    {
792 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793 	      pstr->valid_raw_len = 0;
794 	      if (pstr->trans)
795 		c = pstr->trans[c];
796 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 				   ? CONTEXT_WORD
798 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
799 				      ? CONTEXT_NEWLINE : 0));
800 	    }
801 	}
802       if (!BE (pstr->mbs_allocated, 0))
803 	pstr->mbs += offset;
804     }
805   pstr->raw_mbs_idx = idx;
806   pstr->len -= offset;
807   pstr->stop -= offset;
808 
809   /* Then build the buffers.  */
810 #ifdef RE_ENABLE_I18N
811   if (pstr->mb_cur_max > 1)
812     {
813       if (pstr->icase)
814 	{
815 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816 	  if (BE (ret != REG_NOERROR, 0))
817 	    return ret;
818 	}
819       else
820 	build_wcs_buffer (pstr);
821     }
822   else
823 #endif /* RE_ENABLE_I18N */
824     if (BE (pstr->mbs_allocated, 0))
825       {
826 	if (pstr->icase)
827 	  build_upper_buffer (pstr);
828 	else if (pstr->trans != NULL)
829 	  re_string_translate_buffer (pstr);
830       }
831     else
832       pstr->valid_len = pstr->len;
833 
834   pstr->cur_idx = 0;
835   return REG_NOERROR;
836 }
837 
838 static unsigned char
839 internal_function __attribute ((pure))
840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841 {
842   int ch;
843   Idx off;
844 
845   /* Handle the common (easiest) cases first.  */
846   if (BE (!pstr->mbs_allocated, 1))
847     return re_string_peek_byte (pstr, idx);
848 
849 #ifdef RE_ENABLE_I18N
850   if (pstr->mb_cur_max > 1
851       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852     return re_string_peek_byte (pstr, idx);
853 #endif
854 
855   off = pstr->cur_idx + idx;
856 #ifdef RE_ENABLE_I18N
857   if (pstr->offsets_needed)
858     off = pstr->offsets[off];
859 #endif
860 
861   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862 
863 #ifdef RE_ENABLE_I18N
864   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865      this function returns CAPITAL LETTER I instead of first byte of
866      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
867      since peek_byte_case doesn't advance cur_idx in any way.  */
868   if (pstr->offsets_needed && !isascii (ch))
869     return re_string_peek_byte (pstr, idx);
870 #endif
871 
872   return ch;
873 }
874 
875 static unsigned char
876 internal_function __attribute ((pure))
877 re_string_fetch_byte_case (re_string_t *pstr)
878 {
879   if (BE (!pstr->mbs_allocated, 1))
880     return re_string_fetch_byte (pstr);
881 
882 #ifdef RE_ENABLE_I18N
883   if (pstr->offsets_needed)
884     {
885       Idx off;
886       int ch;
887 
888       /* For tr_TR.UTF-8 [[:islower:]] there is
889 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
890 	 in that case the whole multi-byte character and return
891 	 the original letter.  On the other side, with
892 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 	 anything else would complicate things too much.  */
894 
895       if (!re_string_first_byte (pstr, pstr->cur_idx))
896 	return re_string_fetch_byte (pstr);
897 
898       off = pstr->offsets[pstr->cur_idx];
899       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900 
901       if (! isascii (ch))
902 	return re_string_fetch_byte (pstr);
903 
904       re_string_skip_bytes (pstr,
905 			    re_string_char_size_at (pstr, pstr->cur_idx));
906       return ch;
907     }
908 #endif
909 
910   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911 }
912 
913 static void
914 internal_function
915 re_string_destruct (re_string_t *pstr)
916 {
917 #ifdef RE_ENABLE_I18N
918   re_free (pstr->wcs);
919   re_free (pstr->offsets);
920 #endif /* RE_ENABLE_I18N  */
921   if (pstr->mbs_allocated)
922     re_free (pstr->mbs);
923 }
924 
925 /* Return the context at IDX in INPUT.  */
926 
927 static unsigned int
928 internal_function
929 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930 {
931   int c;
932   if (BE (! REG_VALID_INDEX (idx), 0))
933     /* In this case, we use the value stored in input->tip_context,
934        since we can't know the character in input->mbs[-1] here.  */
935     return input->tip_context;
936   if (BE (idx == input->len, 0))
937     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939 #ifdef RE_ENABLE_I18N
940   if (input->mb_cur_max > 1)
941     {
942       wint_t wc;
943       Idx wc_idx = idx;
944       while(input->wcs[wc_idx] == WEOF)
945 	{
946 #ifdef DEBUG
947 	  /* It must not happen.  */
948 	  assert (REG_VALID_INDEX (wc_idx));
949 #endif
950 	  --wc_idx;
951 	  if (! REG_VALID_INDEX (wc_idx))
952 	    return input->tip_context;
953 	}
954       wc = input->wcs[wc_idx];
955       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
956 	return CONTEXT_WORD;
957       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
958 	      ? CONTEXT_NEWLINE : 0);
959     }
960   else
961 #endif
962     {
963       c = re_string_byte_at (input, idx);
964       if (bitset_contain (input->word_char, c))
965 	return CONTEXT_WORD;
966       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
967     }
968 }
969 
970 /* Functions for set operation.  */
971 
972 static reg_errcode_t
973 internal_function __attribute_warn_unused_result__
974 re_node_set_alloc (re_node_set *set, Idx size)
975 {
976   set->alloc = size;
977   set->nelem = 0;
978   set->elems = re_malloc (Idx, size);
979   if (BE (set->elems == NULL, 0))
980     return REG_ESPACE;
981   return REG_NOERROR;
982 }
983 
984 static reg_errcode_t
985 internal_function __attribute_warn_unused_result__
986 re_node_set_init_1 (re_node_set *set, Idx elem)
987 {
988   set->alloc = 1;
989   set->nelem = 1;
990   set->elems = re_malloc (Idx, 1);
991   if (BE (set->elems == NULL, 0))
992     {
993       set->alloc = set->nelem = 0;
994       return REG_ESPACE;
995     }
996   set->elems[0] = elem;
997   return REG_NOERROR;
998 }
999 
1000 static reg_errcode_t
1001 internal_function __attribute_warn_unused_result__
1002 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1003 {
1004   set->alloc = 2;
1005   set->elems = re_malloc (Idx, 2);
1006   if (BE (set->elems == NULL, 0))
1007     return REG_ESPACE;
1008   if (elem1 == elem2)
1009     {
1010       set->nelem = 1;
1011       set->elems[0] = elem1;
1012     }
1013   else
1014     {
1015       set->nelem = 2;
1016       if (elem1 < elem2)
1017 	{
1018 	  set->elems[0] = elem1;
1019 	  set->elems[1] = elem2;
1020 	}
1021       else
1022 	{
1023 	  set->elems[0] = elem2;
1024 	  set->elems[1] = elem1;
1025 	}
1026     }
1027   return REG_NOERROR;
1028 }
1029 
1030 static reg_errcode_t
1031 internal_function __attribute_warn_unused_result__
1032 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1033 {
1034   dest->nelem = src->nelem;
1035   if (src->nelem > 0)
1036     {
1037       dest->alloc = dest->nelem;
1038       dest->elems = re_malloc (Idx, dest->alloc);
1039       if (BE (dest->elems == NULL, 0))
1040 	{
1041 	  dest->alloc = dest->nelem = 0;
1042 	  return REG_ESPACE;
1043 	}
1044       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1045     }
1046   else
1047     re_node_set_init_empty (dest);
1048   return REG_NOERROR;
1049 }
1050 
1051 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1052    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1053    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1054 
1055 static reg_errcode_t
1056 internal_function __attribute_warn_unused_result__
1057 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1058 			   const re_node_set *src2)
1059 {
1060   Idx i1, i2, is, id, delta, sbase;
1061   if (src1->nelem == 0 || src2->nelem == 0)
1062     return REG_NOERROR;
1063 
1064   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1065      conservative estimate.  */
1066   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1067     {
1068       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1069       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070       if (BE (new_elems == NULL, 0))
1071 	return REG_ESPACE;
1072       dest->elems = new_elems;
1073       dest->alloc = new_alloc;
1074     }
1075 
1076   /* Find the items in the intersection of SRC1 and SRC2, and copy
1077      into the top of DEST those that are not already in DEST itself.  */
1078   sbase = dest->nelem + src1->nelem + src2->nelem;
1079   i1 = src1->nelem - 1;
1080   i2 = src2->nelem - 1;
1081   id = dest->nelem - 1;
1082   for (;;)
1083     {
1084       if (src1->elems[i1] == src2->elems[i2])
1085 	{
1086 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1087 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1088 	    --id;
1089 
1090           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1091             dest->elems[--sbase] = src1->elems[i1];
1092 
1093 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1094 	    break;
1095 	}
1096 
1097       /* Lower the highest of the two items.  */
1098       else if (src1->elems[i1] < src2->elems[i2])
1099 	{
1100 	  if (! REG_VALID_INDEX (--i2))
1101 	    break;
1102 	}
1103       else
1104 	{
1105 	  if (! REG_VALID_INDEX (--i1))
1106 	    break;
1107 	}
1108     }
1109 
1110   id = dest->nelem - 1;
1111   is = dest->nelem + src1->nelem + src2->nelem - 1;
1112   delta = is - sbase + 1;
1113 
1114   /* Now copy.  When DELTA becomes zero, the remaining
1115      DEST elements are already in place; this is more or
1116      less the same loop that is in re_node_set_merge.  */
1117   dest->nelem += delta;
1118   if (delta > 0 && REG_VALID_INDEX (id))
1119     for (;;)
1120       {
1121 	if (dest->elems[is] > dest->elems[id])
1122 	  {
1123 	    /* Copy from the top.  */
1124 	    dest->elems[id + delta--] = dest->elems[is--];
1125 	    if (delta == 0)
1126 	      break;
1127 	  }
1128 	else
1129 	  {
1130 	    /* Slide from the bottom.  */
1131 	    dest->elems[id + delta] = dest->elems[id];
1132 	    if (! REG_VALID_INDEX (--id))
1133 	      break;
1134 	  }
1135       }
1136 
1137   /* Copy remaining SRC elements.  */
1138   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1139 
1140   return REG_NOERROR;
1141 }
1142 
1143 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1145 
1146 static reg_errcode_t
1147 internal_function __attribute_warn_unused_result__
1148 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149 			const re_node_set *src2)
1150 {
1151   Idx i1, i2, id;
1152   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1153     {
1154       dest->alloc = src1->nelem + src2->nelem;
1155       dest->elems = re_malloc (Idx, dest->alloc);
1156       if (BE (dest->elems == NULL, 0))
1157 	return REG_ESPACE;
1158     }
1159   else
1160     {
1161       if (src1 != NULL && src1->nelem > 0)
1162 	return re_node_set_init_copy (dest, src1);
1163       else if (src2 != NULL && src2->nelem > 0)
1164 	return re_node_set_init_copy (dest, src2);
1165       else
1166 	re_node_set_init_empty (dest);
1167       return REG_NOERROR;
1168     }
1169   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1170     {
1171       if (src1->elems[i1] > src2->elems[i2])
1172 	{
1173 	  dest->elems[id++] = src2->elems[i2++];
1174 	  continue;
1175 	}
1176       if (src1->elems[i1] == src2->elems[i2])
1177 	++i2;
1178       dest->elems[id++] = src1->elems[i1++];
1179     }
1180   if (i1 < src1->nelem)
1181     {
1182       memcpy (dest->elems + id, src1->elems + i1,
1183 	     (src1->nelem - i1) * sizeof (Idx));
1184       id += src1->nelem - i1;
1185     }
1186   else if (i2 < src2->nelem)
1187     {
1188       memcpy (dest->elems + id, src2->elems + i2,
1189 	     (src2->nelem - i2) * sizeof (Idx));
1190       id += src2->nelem - i2;
1191     }
1192   dest->nelem = id;
1193   return REG_NOERROR;
1194 }
1195 
1196 /* Calculate the union set of the sets DEST and SRC. And store it to
1197    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1198 
1199 static reg_errcode_t
1200 internal_function __attribute_warn_unused_result__
1201 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1202 {
1203   Idx is, id, sbase, delta;
1204   if (src == NULL || src->nelem == 0)
1205     return REG_NOERROR;
1206   if (dest->alloc < 2 * src->nelem + dest->nelem)
1207     {
1208       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1209       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210       if (BE (new_buffer == NULL, 0))
1211 	return REG_ESPACE;
1212       dest->elems = new_buffer;
1213       dest->alloc = new_alloc;
1214     }
1215 
1216   if (BE (dest->nelem == 0, 0))
1217     {
1218       dest->nelem = src->nelem;
1219       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220       return REG_NOERROR;
1221     }
1222 
1223   /* Copy into the top of DEST the items of SRC that are not
1224      found in DEST.  Maybe we could binary search in DEST?  */
1225   for (sbase = dest->nelem + 2 * src->nelem,
1226        is = src->nelem - 1, id = dest->nelem - 1;
1227        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1228     {
1229       if (dest->elems[id] == src->elems[is])
1230 	is--, id--;
1231       else if (dest->elems[id] < src->elems[is])
1232 	dest->elems[--sbase] = src->elems[is--];
1233       else /* if (dest->elems[id] > src->elems[is]) */
1234 	--id;
1235     }
1236 
1237   if (REG_VALID_INDEX (is))
1238     {
1239       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1240       sbase -= is + 1;
1241       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1242     }
1243 
1244   id = dest->nelem - 1;
1245   is = dest->nelem + 2 * src->nelem - 1;
1246   delta = is - sbase + 1;
1247   if (delta == 0)
1248     return REG_NOERROR;
1249 
1250   /* Now copy.  When DELTA becomes zero, the remaining
1251      DEST elements are already in place.  */
1252   dest->nelem += delta;
1253   for (;;)
1254     {
1255       if (dest->elems[is] > dest->elems[id])
1256 	{
1257 	  /* Copy from the top.  */
1258 	  dest->elems[id + delta--] = dest->elems[is--];
1259 	  if (delta == 0)
1260 	    break;
1261 	}
1262       else
1263 	{
1264 	  /* Slide from the bottom.  */
1265 	  dest->elems[id + delta] = dest->elems[id];
1266 	  if (! REG_VALID_INDEX (--id))
1267 	    {
1268 	      /* Copy remaining SRC elements.  */
1269 	      memcpy (dest->elems, dest->elems + sbase,
1270 		      delta * sizeof (Idx));
1271 	      break;
1272 	    }
1273 	}
1274     }
1275 
1276   return REG_NOERROR;
1277 }
1278 
1279 /* Insert the new element ELEM to the re_node_set* SET.
1280    SET should not already have ELEM.
1281    Return true if successful.  */
1282 
1283 static bool
1284 internal_function __attribute_warn_unused_result__
1285 re_node_set_insert (re_node_set *set, Idx elem)
1286 {
1287   Idx idx;
1288   /* In case the set is empty.  */
1289   if (set->alloc == 0)
1290     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1291 
1292   if (BE (set->nelem, 0) == 0)
1293     {
1294       /* We already guaranteed above that set->alloc != 0.  */
1295       set->elems[0] = elem;
1296       ++set->nelem;
1297       return true;
1298     }
1299 
1300   /* Realloc if we need.  */
1301   if (set->alloc == set->nelem)
1302     {
1303       Idx *new_elems;
1304       set->alloc = set->alloc * 2;
1305       new_elems = re_realloc (set->elems, Idx, set->alloc);
1306       if (BE (new_elems == NULL, 0))
1307 	return false;
1308       set->elems = new_elems;
1309     }
1310 
1311   /* Move the elements which follows the new element.  Test the
1312      first element separately to skip a check in the inner loop.  */
1313   if (elem < set->elems[0])
1314     {
1315       idx = 0;
1316       for (idx = set->nelem; idx > 0; idx--)
1317 	set->elems[idx] = set->elems[idx - 1];
1318     }
1319   else
1320     {
1321       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1322 	set->elems[idx] = set->elems[idx - 1];
1323     }
1324 
1325   /* Insert the new element.  */
1326   set->elems[idx] = elem;
1327   ++set->nelem;
1328   return true;
1329 }
1330 
1331 /* Insert the new element ELEM to the re_node_set* SET.
1332    SET should not already have any element greater than or equal to ELEM.
1333    Return true if successful.  */
1334 
1335 static bool
1336 internal_function __attribute_warn_unused_result__
1337 re_node_set_insert_last (re_node_set *set, Idx elem)
1338 {
1339   /* Realloc if we need.  */
1340   if (set->alloc == set->nelem)
1341     {
1342       Idx *new_elems;
1343       set->alloc = (set->alloc + 1) * 2;
1344       new_elems = re_realloc (set->elems, Idx, set->alloc);
1345       if (BE (new_elems == NULL, 0))
1346 	return false;
1347       set->elems = new_elems;
1348     }
1349 
1350   /* Insert the new element.  */
1351   set->elems[set->nelem++] = elem;
1352   return true;
1353 }
1354 
1355 /* Compare two node sets SET1 and SET2.
1356    Return true if SET1 and SET2 are equivalent.  */
1357 
1358 static bool
1359 internal_function __attribute ((pure))
1360 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1361 {
1362   Idx i;
1363   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1364     return false;
1365   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1366     if (set1->elems[i] != set2->elems[i])
1367       return false;
1368   return true;
1369 }
1370 
1371 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1372 
1373 static Idx
1374 internal_function __attribute ((pure))
1375 re_node_set_contains (const re_node_set *set, Idx elem)
1376 {
1377   __re_size_t idx, right, mid;
1378   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1379     return 0;
1380 
1381   /* Binary search the element.  */
1382   idx = 0;
1383   right = set->nelem - 1;
1384   while (idx < right)
1385     {
1386       mid = (idx + right) / 2;
1387       if (set->elems[mid] < elem)
1388 	idx = mid + 1;
1389       else
1390 	right = mid;
1391     }
1392   return set->elems[idx] == elem ? idx + 1 : 0;
1393 }
1394 
1395 static void
1396 internal_function
1397 re_node_set_remove_at (re_node_set *set, Idx idx)
1398 {
1399   verify (! TYPE_SIGNED (Idx));
1400   /* if (idx < 0)
1401      return; */
1402   if (idx >= set->nelem)
1403     return;
1404   --set->nelem;
1405   for (; idx < set->nelem; idx++)
1406     set->elems[idx] = set->elems[idx + 1];
1407 }
1408 
1409 
1410 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1411    Or return REG_MISSING if an error occurred.  */
1412 
1413 static Idx
1414 internal_function
1415 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1416 {
1417   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1418     {
1419       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1420       Idx *new_nexts, *new_indices;
1421       re_node_set *new_edests, *new_eclosures;
1422       re_token_t *new_nodes;
1423       size_t max_object_size =
1424 	MAX (sizeof (re_token_t),
1425 	     MAX (sizeof (re_node_set),
1426 		  sizeof (Idx)));
1427 
1428       /* Avoid overflows.  */
1429       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1430 	return REG_MISSING;
1431 
1432       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1433       if (BE (new_nodes == NULL, 0))
1434 	return REG_MISSING;
1435       dfa->nodes = new_nodes;
1436       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1437       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1438       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1439       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1440       if (BE (new_nexts == NULL || new_indices == NULL
1441 	      || new_edests == NULL || new_eclosures == NULL, 0))
1442 	return REG_MISSING;
1443       dfa->nexts = new_nexts;
1444       dfa->org_indices = new_indices;
1445       dfa->edests = new_edests;
1446       dfa->eclosures = new_eclosures;
1447       dfa->nodes_alloc = new_nodes_alloc;
1448     }
1449   dfa->nodes[dfa->nodes_len] = token;
1450   dfa->nodes[dfa->nodes_len].constraint = 0;
1451 #ifdef RE_ENABLE_I18N
1452   {
1453   int type = token.type;
1454   dfa->nodes[dfa->nodes_len].accept_mb =
1455     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1456   }
1457 #endif
1458   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1459   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1460   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1461   return dfa->nodes_len++;
1462 }
1463 
1464 static inline re_hashval_t
1465 internal_function
1466 calc_state_hash (const re_node_set *nodes, unsigned int context)
1467 {
1468   re_hashval_t hash = nodes->nelem + context;
1469   Idx i;
1470   for (i = 0 ; i < nodes->nelem ; i++)
1471     hash += nodes->elems[i];
1472   return hash;
1473 }
1474 
1475 /* Search for the state whose node_set is equivalent to NODES.
1476    Return the pointer to the state, if we found it in the DFA.
1477    Otherwise create the new one and return it.  In case of an error
1478    return NULL and set the error code in ERR.
1479    Note: - We assume NULL as the invalid state, then it is possible that
1480 	   return value is NULL and ERR is REG_NOERROR.
1481 	 - We never return non-NULL value in case of any errors, it is for
1482 	   optimization.  */
1483 
1484 static re_dfastate_t *
1485 internal_function __attribute_warn_unused_result__
1486 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1487 		  const re_node_set *nodes)
1488 {
1489   re_hashval_t hash;
1490   re_dfastate_t *new_state;
1491   struct re_state_table_entry *spot;
1492   Idx i;
1493 #ifdef lint
1494   /* Suppress bogus uninitialized-variable warnings.  */
1495   *err = REG_NOERROR;
1496 #endif
1497   if (BE (nodes->nelem == 0, 0))
1498     {
1499       *err = REG_NOERROR;
1500       return NULL;
1501     }
1502   hash = calc_state_hash (nodes, 0);
1503   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504 
1505   for (i = 0 ; i < spot->num ; i++)
1506     {
1507       re_dfastate_t *state = spot->array[i];
1508       if (hash != state->hash)
1509 	continue;
1510       if (re_node_set_compare (&state->nodes, nodes))
1511 	return state;
1512     }
1513 
1514   /* There are no appropriate state in the dfa, create the new one.  */
1515   new_state = create_ci_newstate (dfa, nodes, hash);
1516   if (BE (new_state == NULL, 0))
1517     *err = REG_ESPACE;
1518 
1519   return new_state;
1520 }
1521 
1522 /* Search for the state whose node_set is equivalent to NODES and
1523    whose context is equivalent to CONTEXT.
1524    Return the pointer to the state, if we found it in the DFA.
1525    Otherwise create the new one and return it.  In case of an error
1526    return NULL and set the error code in ERR.
1527    Note: - We assume NULL as the invalid state, then it is possible that
1528 	   return value is NULL and ERR is REG_NOERROR.
1529 	 - We never return non-NULL value in case of any errors, it is for
1530 	   optimization.  */
1531 
1532 static re_dfastate_t *
1533 internal_function __attribute_warn_unused_result__
1534 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535 			  const re_node_set *nodes, unsigned int context)
1536 {
1537   re_hashval_t hash;
1538   re_dfastate_t *new_state;
1539   struct re_state_table_entry *spot;
1540   Idx i;
1541 #ifdef lint
1542   /* Suppress bogus uninitialized-variable warnings.  */
1543   *err = REG_NOERROR;
1544 #endif
1545   if (nodes->nelem == 0)
1546     {
1547       *err = REG_NOERROR;
1548       return NULL;
1549     }
1550   hash = calc_state_hash (nodes, context);
1551   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1552 
1553   for (i = 0 ; i < spot->num ; i++)
1554     {
1555       re_dfastate_t *state = spot->array[i];
1556       if (state->hash == hash
1557 	  && state->context == context
1558 	  && re_node_set_compare (state->entrance_nodes, nodes))
1559 	return state;
1560     }
1561   /* There are no appropriate state in `dfa', create the new one.  */
1562   new_state = create_cd_newstate (dfa, nodes, context, hash);
1563   if (BE (new_state == NULL, 0))
1564     *err = REG_ESPACE;
1565 
1566   return new_state;
1567 }
1568 
1569 /* Finish initialization of the new state NEWSTATE, and using its hash value
1570    HASH put in the appropriate bucket of DFA's state table.  Return value
1571    indicates the error code if failed.  */
1572 
1573 static reg_errcode_t
1574 __attribute_warn_unused_result__
1575 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1576 		re_hashval_t hash)
1577 {
1578   struct re_state_table_entry *spot;
1579   reg_errcode_t err;
1580   Idx i;
1581 
1582   newstate->hash = hash;
1583   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1584   if (BE (err != REG_NOERROR, 0))
1585     return REG_ESPACE;
1586   for (i = 0; i < newstate->nodes.nelem; i++)
1587     {
1588       Idx elem = newstate->nodes.elems[i];
1589       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1590 	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1591 	  return REG_ESPACE;
1592     }
1593 
1594   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1595   if (BE (spot->alloc <= spot->num, 0))
1596     {
1597       Idx new_alloc = 2 * spot->num + 2;
1598       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1599 					      new_alloc);
1600       if (BE (new_array == NULL, 0))
1601 	return REG_ESPACE;
1602       spot->array = new_array;
1603       spot->alloc = new_alloc;
1604     }
1605   spot->array[spot->num++] = newstate;
1606   return REG_NOERROR;
1607 }
1608 
1609 static void
1610 free_state (re_dfastate_t *state)
1611 {
1612   re_node_set_free (&state->non_eps_nodes);
1613   re_node_set_free (&state->inveclosure);
1614   if (state->entrance_nodes != &state->nodes)
1615     {
1616       re_node_set_free (state->entrance_nodes);
1617       re_free (state->entrance_nodes);
1618     }
1619   re_node_set_free (&state->nodes);
1620   re_free (state->word_trtable);
1621   re_free (state->trtable);
1622   re_free (state);
1623 }
1624 
1625 /* Create the new state which is independ of contexts.
1626    Return the new state if succeeded, otherwise return NULL.  */
1627 
1628 static re_dfastate_t *
1629 internal_function __attribute_warn_unused_result__
1630 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1631 		    re_hashval_t hash)
1632 {
1633   Idx i;
1634   reg_errcode_t err;
1635   re_dfastate_t *newstate;
1636 
1637   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1638   if (BE (newstate == NULL, 0))
1639     return NULL;
1640   err = re_node_set_init_copy (&newstate->nodes, nodes);
1641   if (BE (err != REG_NOERROR, 0))
1642     {
1643       re_free (newstate);
1644       return NULL;
1645     }
1646 
1647   newstate->entrance_nodes = &newstate->nodes;
1648   for (i = 0 ; i < nodes->nelem ; i++)
1649     {
1650       re_token_t *node = dfa->nodes + nodes->elems[i];
1651       re_token_type_t type = node->type;
1652       if (type == CHARACTER && !node->constraint)
1653 	continue;
1654 #ifdef RE_ENABLE_I18N
1655       newstate->accept_mb |= node->accept_mb;
1656 #endif /* RE_ENABLE_I18N */
1657 
1658       /* If the state has the halt node, the state is a halt state.  */
1659       if (type == END_OF_RE)
1660 	newstate->halt = 1;
1661       else if (type == OP_BACK_REF)
1662 	newstate->has_backref = 1;
1663       else if (type == ANCHOR || node->constraint)
1664 	newstate->has_constraint = 1;
1665     }
1666   err = register_state (dfa, newstate, hash);
1667   if (BE (err != REG_NOERROR, 0))
1668     {
1669       free_state (newstate);
1670       newstate = NULL;
1671     }
1672   return newstate;
1673 }
1674 
1675 /* Create the new state which is depend on the context CONTEXT.
1676    Return the new state if succeeded, otherwise return NULL.  */
1677 
1678 static re_dfastate_t *
1679 internal_function __attribute_warn_unused_result__
1680 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1681 		    unsigned int context, re_hashval_t hash)
1682 {
1683   Idx i, nctx_nodes = 0;
1684   reg_errcode_t err;
1685   re_dfastate_t *newstate;
1686 
1687   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1688   if (BE (newstate == NULL, 0))
1689     return NULL;
1690   err = re_node_set_init_copy (&newstate->nodes, nodes);
1691   if (BE (err != REG_NOERROR, 0))
1692     {
1693       re_free (newstate);
1694       return NULL;
1695     }
1696 
1697   newstate->context = context;
1698   newstate->entrance_nodes = &newstate->nodes;
1699 
1700   for (i = 0 ; i < nodes->nelem ; i++)
1701     {
1702       re_token_t *node = dfa->nodes + nodes->elems[i];
1703       re_token_type_t type = node->type;
1704       unsigned int constraint = node->constraint;
1705 
1706       if (type == CHARACTER && !constraint)
1707 	continue;
1708 #ifdef RE_ENABLE_I18N
1709       newstate->accept_mb |= node->accept_mb;
1710 #endif /* RE_ENABLE_I18N */
1711 
1712       /* If the state has the halt node, the state is a halt state.  */
1713       if (type == END_OF_RE)
1714 	newstate->halt = 1;
1715       else if (type == OP_BACK_REF)
1716 	newstate->has_backref = 1;
1717 
1718       if (constraint)
1719 	{
1720 	  if (newstate->entrance_nodes == &newstate->nodes)
1721 	    {
1722 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1723 	      if (BE (newstate->entrance_nodes == NULL, 0))
1724 		{
1725 		  free_state (newstate);
1726 		  return NULL;
1727 		}
1728 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1729 		  != REG_NOERROR)
1730 		return NULL;
1731 	      nctx_nodes = 0;
1732 	      newstate->has_constraint = 1;
1733 	    }
1734 
1735 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1736 	    {
1737 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1738 	      ++nctx_nodes;
1739 	    }
1740 	}
1741     }
1742   err = register_state (dfa, newstate, hash);
1743   if (BE (err != REG_NOERROR, 0))
1744     {
1745       free_state (newstate);
1746       newstate = NULL;
1747     }
1748   return  newstate;
1749 }
1750