1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5    Software Foundation, Inc.
6    This file is part of the GNU C Library.
7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3, or (at your option)
12    any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License along
20    with this program; if not, write to the Free Software Foundation,
21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 #include "verify.h"
24 #include "intprops.h"
25 static void re_string_construct_common (const char *str, Idx len,
26 					re_string_t *pstr,
27 					RE_TRANSLATE_TYPE trans, bool icase,
28 					const re_dfa_t *dfa) internal_function;
29 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
30 					  const re_node_set *nodes,
31 					  re_hashval_t hash) internal_function;
32 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
33 					  const re_node_set *nodes,
34 					  unsigned int context,
35 					  re_hashval_t hash) internal_function;
36 
37 /* Functions for string operation.  */
38 
39 /* This function allocate the buffers.  It is necessary to call
40    re_string_reconstruct before using the object.  */
41 
42 static reg_errcode_t
43 internal_function __attribute_warn_unused_result__
44 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
45 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
46 {
47   reg_errcode_t ret;
48   Idx init_buf_len;
49 
50   /* Ensure at least one character fits into the buffers.  */
51   if (init_len < dfa->mb_cur_max)
52     init_len = dfa->mb_cur_max;
53   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
54   re_string_construct_common (str, len, pstr, trans, icase, dfa);
55 
56   ret = re_string_realloc_buffers (pstr, init_buf_len);
57   if (BE (ret != REG_NOERROR, 0))
58     return ret;
59 
60   pstr->word_char = dfa->word_char;
61   pstr->word_ops_used = dfa->word_ops_used;
62   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
63   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
64   pstr->valid_raw_len = pstr->valid_len;
65   return REG_NOERROR;
66 }
67 
68 /* This function allocate the buffers, and initialize them.  */
69 
70 static reg_errcode_t
71 internal_function __attribute_warn_unused_result__
72 re_string_construct (re_string_t *pstr, const char *str, Idx len,
73 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
74 {
75   reg_errcode_t ret;
76   memset (pstr, '\0', sizeof (re_string_t));
77   re_string_construct_common (str, len, pstr, trans, icase, dfa);
78 
79   if (len > 0)
80     {
81       ret = re_string_realloc_buffers (pstr, len + 1);
82       if (BE (ret != REG_NOERROR, 0))
83 	return ret;
84     }
85   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
86 
87   if (icase)
88     {
89 #ifdef RE_ENABLE_I18N
90       if (dfa->mb_cur_max > 1)
91 	{
92 	  while (1)
93 	    {
94 	      ret = build_wcs_upper_buffer (pstr);
95 	      if (BE (ret != REG_NOERROR, 0))
96 		return ret;
97 	      if (pstr->valid_raw_len >= len)
98 		break;
99 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
100 		break;
101 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
102 	      if (BE (ret != REG_NOERROR, 0))
103 		return ret;
104 	    }
105 	}
106       else
107 #endif /* RE_ENABLE_I18N  */
108 	build_upper_buffer (pstr);
109     }
110   else
111     {
112 #ifdef RE_ENABLE_I18N
113       if (dfa->mb_cur_max > 1)
114 	build_wcs_buffer (pstr);
115       else
116 #endif /* RE_ENABLE_I18N  */
117 	{
118 	  if (trans != NULL)
119 	    re_string_translate_buffer (pstr);
120 	  else
121 	    {
122 	      pstr->valid_len = pstr->bufs_len;
123 	      pstr->valid_raw_len = pstr->bufs_len;
124 	    }
125 	}
126     }
127 
128   return REG_NOERROR;
129 }
130 
131 /* Helper functions for re_string_allocate, and re_string_construct.  */
132 
133 static reg_errcode_t
134 internal_function __attribute_warn_unused_result__
135 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
136 {
137 #ifdef RE_ENABLE_I18N
138   if (pstr->mb_cur_max > 1)
139     {
140       wint_t *new_wcs;
141 
142       /* Avoid overflow.  */
143       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
144       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
145 	return REG_ESPACE;
146 
147       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
148       if (BE (new_wcs == NULL, 0))
149 	return REG_ESPACE;
150       pstr->wcs = new_wcs;
151       if (pstr->offsets != NULL)
152 	{
153 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
154 	  if (BE (new_offsets == NULL, 0))
155 	    return REG_ESPACE;
156 	  pstr->offsets = new_offsets;
157 	}
158     }
159 #endif /* RE_ENABLE_I18N  */
160   if (pstr->mbs_allocated)
161     {
162       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
163 					   new_buf_len);
164       if (BE (new_mbs == NULL, 0))
165 	return REG_ESPACE;
166       pstr->mbs = new_mbs;
167     }
168   pstr->bufs_len = new_buf_len;
169   return REG_NOERROR;
170 }
171 
172 
173 static void
174 internal_function
175 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
176 			    RE_TRANSLATE_TYPE trans, bool icase,
177 			    const re_dfa_t *dfa)
178 {
179   pstr->raw_mbs = (const unsigned char *) str;
180   pstr->len = len;
181   pstr->raw_len = len;
182   pstr->trans = trans;
183   pstr->icase = icase;
184   pstr->mbs_allocated = (trans != NULL || icase);
185   pstr->mb_cur_max = dfa->mb_cur_max;
186   pstr->is_utf8 = dfa->is_utf8;
187   pstr->map_notascii = dfa->map_notascii;
188   pstr->stop = pstr->len;
189   pstr->raw_stop = pstr->stop;
190 }
191 
192 #ifdef RE_ENABLE_I18N
193 
194 /* Build wide character buffer PSTR->WCS.
195    If the byte sequence of the string are:
196      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
197    Then wide character buffer will be:
198      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
199    We use WEOF for padding, they indicate that the position isn't
200    a first byte of a multibyte character.
201 
202    Note that this function assumes PSTR->VALID_LEN elements are already
203    built and starts from PSTR->VALID_LEN.  */
204 
205 static void
206 internal_function
207 build_wcs_buffer (re_string_t *pstr)
208 {
209 #ifdef _LIBC
210   unsigned char buf[MB_LEN_MAX];
211   assert (MB_LEN_MAX >= pstr->mb_cur_max);
212 #else
213   unsigned char buf[64];
214 #endif
215   mbstate_t prev_st;
216   Idx byte_idx, end_idx, remain_len;
217   size_t mbclen;
218 
219   /* Build the buffers from pstr->valid_len to either pstr->len or
220      pstr->bufs_len.  */
221   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
222   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
223     {
224       wchar_t wc;
225       const char *p;
226 
227       remain_len = end_idx - byte_idx;
228       prev_st = pstr->cur_state;
229       /* Apply the translation if we need.  */
230       if (BE (pstr->trans != NULL, 0))
231 	{
232 	  int i, ch;
233 
234 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
235 	    {
236 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
237 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
238 	    }
239 	  p = (const char *) buf;
240 	}
241       else
242 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
243       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
244       if (BE (mbclen == (size_t) -2, 0))
245 	{
246 	  /* The buffer doesn't have enough space, finish to build.  */
247 	  pstr->cur_state = prev_st;
248 	  break;
249 	}
250       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
251 	{
252 	  /* We treat these cases as a singlebyte character.  */
253 	  mbclen = 1;
254 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 	  if (BE (pstr->trans != NULL, 0))
256 	    wc = pstr->trans[wc];
257 	  pstr->cur_state = prev_st;
258 	}
259 
260       /* Write wide character and padding.  */
261       pstr->wcs[byte_idx++] = wc;
262       /* Write paddings.  */
263       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
264 	pstr->wcs[byte_idx++] = WEOF;
265     }
266   pstr->valid_len = byte_idx;
267   pstr->valid_raw_len = byte_idx;
268 }
269 
270 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
271    but for REG_ICASE.  */
272 
273 static reg_errcode_t
274 internal_function __attribute_warn_unused_result__
275 build_wcs_upper_buffer (re_string_t *pstr)
276 {
277   mbstate_t prev_st;
278   Idx src_idx, byte_idx, end_idx, remain_len;
279   size_t mbclen;
280 #ifdef _LIBC
281   char buf[MB_LEN_MAX];
282   assert (MB_LEN_MAX >= pstr->mb_cur_max);
283 #else
284   char buf[64];
285 #endif
286 
287   byte_idx = pstr->valid_len;
288   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
289 
290   /* The following optimization assumes that ASCII characters can be
291      mapped to wide characters with a simple cast.  */
292   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
293     {
294       while (byte_idx < end_idx)
295 	{
296 	  wchar_t wc;
297 
298 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
299 	      && mbsinit (&pstr->cur_state))
300 	    {
301 	      /* In case of a singlebyte character.  */
302 	      pstr->mbs[byte_idx]
303 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
304 	      /* The next step uses the assumption that wchar_t is encoded
305 		 ASCII-safe: all ASCII values can be converted like this.  */
306 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
307 	      ++byte_idx;
308 	      continue;
309 	    }
310 
311 	  remain_len = end_idx - byte_idx;
312 	  prev_st = pstr->cur_state;
313 	  mbclen = __mbrtowc (&wc,
314 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
315 			       + byte_idx), remain_len, &pstr->cur_state);
316 	  if (BE (mbclen < (size_t) -2, 1))
317 	    {
318 	      wchar_t wcu = wc;
319 	      if (iswlower (wc))
320 		{
321 		  size_t mbcdlen;
322 
323 		  wcu = towupper (wc);
324 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
325 		  if (BE (mbclen == mbcdlen, 1))
326 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
327 		  else
328 		    {
329 		      src_idx = byte_idx;
330 		      goto offsets_needed;
331 		    }
332 		}
333 	      else
334 		memcpy (pstr->mbs + byte_idx,
335 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
336 	      pstr->wcs[byte_idx++] = wcu;
337 	      /* Write paddings.  */
338 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
339 		pstr->wcs[byte_idx++] = WEOF;
340 	    }
341 	  else if (mbclen == (size_t) -1 || mbclen == 0)
342 	    {
343 	      /* It is an invalid character or '\0'.  Just use the byte.  */
344 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345 	      pstr->mbs[byte_idx] = ch;
346 	      /* And also cast it to wide char.  */
347 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
348 	      if (BE (mbclen == (size_t) -1, 0))
349 		pstr->cur_state = prev_st;
350 	    }
351 	  else
352 	    {
353 	      /* The buffer doesn't have enough space, finish to build.  */
354 	      pstr->cur_state = prev_st;
355 	      break;
356 	    }
357 	}
358       pstr->valid_len = byte_idx;
359       pstr->valid_raw_len = byte_idx;
360       return REG_NOERROR;
361     }
362   else
363     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364       {
365 	wchar_t wc;
366 	const char *p;
367       offsets_needed:
368 	remain_len = end_idx - byte_idx;
369 	prev_st = pstr->cur_state;
370 	if (BE (pstr->trans != NULL, 0))
371 	  {
372 	    int i, ch;
373 
374 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 	      {
376 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 		buf[i] = pstr->trans[ch];
378 	      }
379 	    p = (const char *) buf;
380 	  }
381 	else
382 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 	if (BE (mbclen < (size_t) -2, 1))
385 	  {
386 	    wchar_t wcu = wc;
387 	    if (iswlower (wc))
388 	      {
389 		size_t mbcdlen;
390 
391 		wcu = towupper (wc);
392 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393 		if (BE (mbclen == mbcdlen, 1))
394 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
395 		else if (mbcdlen != (size_t) -1)
396 		  {
397 		    size_t i;
398 
399 		    if (byte_idx + mbcdlen > pstr->bufs_len)
400 		      {
401 			pstr->cur_state = prev_st;
402 			break;
403 		      }
404 
405 		    if (pstr->offsets == NULL)
406 		      {
407 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
408 
409 			if (pstr->offsets == NULL)
410 			  return REG_ESPACE;
411 		      }
412 		    if (!pstr->offsets_needed)
413 		      {
414 			for (i = 0; i < (size_t) byte_idx; ++i)
415 			  pstr->offsets[i] = i;
416 			pstr->offsets_needed = 1;
417 		      }
418 
419 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420 		    pstr->wcs[byte_idx] = wcu;
421 		    pstr->offsets[byte_idx] = src_idx;
422 		    for (i = 1; i < mbcdlen; ++i)
423 		      {
424 			pstr->offsets[byte_idx + i]
425 			  = src_idx + (i < mbclen ? i : mbclen - 1);
426 			pstr->wcs[byte_idx + i] = WEOF;
427 		      }
428 		    pstr->len += mbcdlen - mbclen;
429 		    if (pstr->raw_stop > src_idx)
430 		      pstr->stop += mbcdlen - mbclen;
431 		    end_idx = (pstr->bufs_len > pstr->len)
432 			      ? pstr->len : pstr->bufs_len;
433 		    byte_idx += mbcdlen;
434 		    src_idx += mbclen;
435 		    continue;
436 		  }
437 		else
438 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
439 	      }
440 	    else
441 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
442 
443 	    if (BE (pstr->offsets_needed != 0, 0))
444 	      {
445 		size_t i;
446 		for (i = 0; i < mbclen; ++i)
447 		  pstr->offsets[byte_idx + i] = src_idx + i;
448 	      }
449 	    src_idx += mbclen;
450 
451 	    pstr->wcs[byte_idx++] = wcu;
452 	    /* Write paddings.  */
453 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454 	      pstr->wcs[byte_idx++] = WEOF;
455 	  }
456 	else if (mbclen == (size_t) -1 || mbclen == 0)
457 	  {
458 	    /* It is an invalid character or '\0'.  Just use the byte.  */
459 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460 
461 	    if (BE (pstr->trans != NULL, 0))
462 	      ch = pstr->trans [ch];
463 	    pstr->mbs[byte_idx] = ch;
464 
465 	    if (BE (pstr->offsets_needed != 0, 0))
466 	      pstr->offsets[byte_idx] = src_idx;
467 	    ++src_idx;
468 
469 	    /* And also cast it to wide char.  */
470 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
471 	    if (BE (mbclen == (size_t) -1, 0))
472 	      pstr->cur_state = prev_st;
473 	  }
474 	else
475 	  {
476 	    /* The buffer doesn't have enough space, finish to build.  */
477 	    pstr->cur_state = prev_st;
478 	    break;
479 	  }
480       }
481   pstr->valid_len = byte_idx;
482   pstr->valid_raw_len = src_idx;
483   return REG_NOERROR;
484 }
485 
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487    Return the index.  */
488 
489 static Idx
490 internal_function
491 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
492 {
493   mbstate_t prev_st;
494   Idx rawbuf_idx;
495   size_t mbclen;
496   wint_t wc = WEOF;
497 
498   /* Skip the characters which are not necessary to check.  */
499   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500        rawbuf_idx < new_raw_idx;)
501     {
502       wchar_t wc2;
503       Idx remain_len;
504       remain_len = pstr->len - rawbuf_idx;
505       prev_st = pstr->cur_state;
506       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507 			  remain_len, &pstr->cur_state);
508       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
509 	{
510 	  /* We treat these cases as a single byte character.  */
511 	  if (mbclen == 0 || remain_len == 0)
512 	    wc = L'\0';
513 	  else
514 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
515 	  mbclen = 1;
516 	  pstr->cur_state = prev_st;
517 	}
518       else
519 	wc = wc2;
520       /* Then proceed the next character.  */
521       rawbuf_idx += mbclen;
522     }
523   *last_wc = wc;
524   return rawbuf_idx;
525 }
526 #endif /* RE_ENABLE_I18N  */
527 
528 /* Build the buffer PSTR->MBS, and apply the translation if we need.
529    This function is used in case of REG_ICASE.  */
530 
531 static void
532 internal_function
533 build_upper_buffer (re_string_t *pstr)
534 {
535   Idx char_idx, end_idx;
536   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537 
538   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539     {
540       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541       if (BE (pstr->trans != NULL, 0))
542 	ch = pstr->trans[ch];
543       if (islower (ch))
544 	pstr->mbs[char_idx] = toupper (ch);
545       else
546 	pstr->mbs[char_idx] = ch;
547     }
548   pstr->valid_len = char_idx;
549   pstr->valid_raw_len = char_idx;
550 }
551 
552 /* Apply TRANS to the buffer in PSTR.  */
553 
554 static void
555 internal_function
556 re_string_translate_buffer (re_string_t *pstr)
557 {
558   Idx buf_idx, end_idx;
559   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560 
561   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562     {
563       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564       pstr->mbs[buf_idx] = pstr->trans[ch];
565     }
566 
567   pstr->valid_len = buf_idx;
568   pstr->valid_raw_len = buf_idx;
569 }
570 
571 /* This function re-construct the buffers.
572    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573    convert to upper case in case of REG_ICASE, apply translation.  */
574 
575 static reg_errcode_t
576 internal_function __attribute_warn_unused_result__
577 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
578 {
579   Idx offset;
580 
581   if (BE (pstr->raw_mbs_idx <= idx, 0))
582     offset = idx - pstr->raw_mbs_idx;
583   else
584     {
585       /* Reset buffer.  */
586 #ifdef RE_ENABLE_I18N
587       if (pstr->mb_cur_max > 1)
588 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590       pstr->len = pstr->raw_len;
591       pstr->stop = pstr->raw_stop;
592       pstr->valid_len = 0;
593       pstr->raw_mbs_idx = 0;
594       pstr->valid_raw_len = 0;
595       pstr->offsets_needed = 0;
596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598       if (!pstr->mbs_allocated)
599 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600       offset = idx;
601     }
602 
603   if (BE (offset != 0, 1))
604     {
605       /* Should the already checked characters be kept?  */
606       if (BE (offset < pstr->valid_raw_len, 1))
607 	{
608 	  /* Yes, move them to the front of the buffer.  */
609 #ifdef RE_ENABLE_I18N
610 	  if (BE (pstr->offsets_needed, 0))
611 	    {
612 	      Idx low = 0, high = pstr->valid_len, mid;
613 	      do
614 		{
615 		  mid = (high + low) / 2;
616 		  if (pstr->offsets[mid] > offset)
617 		    high = mid;
618 		  else if (pstr->offsets[mid] < offset)
619 		    low = mid + 1;
620 		  else
621 		    break;
622 		}
623 	      while (low < high);
624 	      if (pstr->offsets[mid] < offset)
625 		++mid;
626 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 							eflags);
628 	      /* This can be quite complicated, so handle specially
629 		 only the common and easy case where the character with
630 		 different length representation of lower and upper
631 		 case is present at or after offset.  */
632 	      if (pstr->valid_len > offset
633 		  && mid == offset && pstr->offsets[mid] == offset)
634 		{
635 		  memmove (pstr->wcs, pstr->wcs + offset,
636 			   (pstr->valid_len - offset) * sizeof (wint_t));
637 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 		  pstr->valid_len -= offset;
639 		  pstr->valid_raw_len -= offset;
640 		  for (low = 0; low < pstr->valid_len; low++)
641 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642 		}
643 	      else
644 		{
645 		  /* Otherwise, just find out how long the partial multibyte
646 		     character at offset is and fill it with WEOF/255.  */
647 		  pstr->len = pstr->raw_len - idx + offset;
648 		  pstr->stop = pstr->raw_stop - idx + offset;
649 		  pstr->offsets_needed = 0;
650 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 		    --mid;
652 		  while (mid < pstr->valid_len)
653 		    if (pstr->wcs[mid] != WEOF)
654 		      break;
655 		    else
656 		      ++mid;
657 		  if (mid == pstr->valid_len)
658 		    pstr->valid_len = 0;
659 		  else
660 		    {
661 		      pstr->valid_len = pstr->offsets[mid] - offset;
662 		      if (pstr->valid_len)
663 			{
664 			  for (low = 0; low < pstr->valid_len; ++low)
665 			    pstr->wcs[low] = WEOF;
666 			  memset (pstr->mbs, 255, pstr->valid_len);
667 			}
668 		    }
669 		  pstr->valid_raw_len = pstr->valid_len;
670 		}
671 	    }
672 	  else
673 #endif
674 	    {
675 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 							eflags);
677 #ifdef RE_ENABLE_I18N
678 	      if (pstr->mb_cur_max > 1)
679 		memmove (pstr->wcs, pstr->wcs + offset,
680 			 (pstr->valid_len - offset) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 	      if (BE (pstr->mbs_allocated, 0))
683 		memmove (pstr->mbs, pstr->mbs + offset,
684 			 pstr->valid_len - offset);
685 	      pstr->valid_len -= offset;
686 	      pstr->valid_raw_len -= offset;
687 #if DEBUG
688 	      assert (pstr->valid_len > 0);
689 #endif
690 	    }
691 	}
692       else
693 	{
694 #ifdef RE_ENABLE_I18N
695 	  /* No, skip all characters until IDX.  */
696 	  Idx prev_valid_len = pstr->valid_len;
697 
698 	  if (BE (pstr->offsets_needed, 0))
699 	    {
700 	      pstr->len = pstr->raw_len - idx + offset;
701 	      pstr->stop = pstr->raw_stop - idx + offset;
702 	      pstr->offsets_needed = 0;
703 	    }
704 #endif
705 	  pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 	  if (pstr->mb_cur_max > 1)
708 	    {
709 	      Idx wcs_idx;
710 	      wint_t wc = WEOF;
711 
712 	      if (pstr->is_utf8)
713 		{
714 		  const unsigned char *raw, *p, *end;
715 
716 		  /* Special case UTF-8.  Multi-byte chars start with any
717 		     byte other than 0x80 - 0xbf.  */
718 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 		  end = raw + (offset - pstr->mb_cur_max);
720 		  if (end < pstr->raw_mbs)
721 		    end = pstr->raw_mbs;
722 		  p = raw + offset - 1;
723 #ifdef _LIBC
724 		  /* We know the wchar_t encoding is UCS4, so for the simple
725 		     case, ASCII characters, skip the conversion step.  */
726 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
727 		    {
728 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 		      /* pstr->valid_len = 0; */
730 		      wc = (wchar_t) *p;
731 		    }
732 		  else
733 #endif
734 		    for (; p >= end; --p)
735 		      if ((*p & 0xc0) != 0x80)
736 			{
737 			  mbstate_t cur_state;
738 			  wchar_t wc2;
739 			  Idx mlen = raw + pstr->len - p;
740 			  unsigned char buf[6];
741 			  size_t mbclen;
742 
743 			  if (BE (pstr->trans != NULL, 0))
744 			    {
745 			      int i = mlen < 6 ? mlen : 6;
746 			      while (--i >= 0)
747 				buf[i] = pstr->trans[p[i]];
748 			    }
749 			  /* XXX Don't use mbrtowc, we know which conversion
750 			     to use (UTF-8 -> UCS4).  */
751 			  memset (&cur_state, 0, sizeof (cur_state));
752 			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
753 					      &cur_state);
754 			  if (raw + offset - p <= mbclen
755 			      && mbclen < (size_t) -2)
756 			    {
757 			      memset (&pstr->cur_state, '\0',
758 				      sizeof (mbstate_t));
759 			      pstr->valid_len = mbclen - (raw + offset - p);
760 			      wc = wc2;
761 			    }
762 			  break;
763 			}
764 		}
765 
766 	      if (wc == WEOF)
767 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 	      if (wc == WEOF)
769 		pstr->tip_context
770 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 	      else
772 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 				      && IS_WIDE_WORD_CHAR (wc))
774 				     ? CONTEXT_WORD
775 				     : ((IS_WIDE_NEWLINE (wc)
776 					 && pstr->newline_anchor)
777 					? CONTEXT_NEWLINE : 0));
778 	      if (BE (pstr->valid_len, 0))
779 		{
780 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 		    pstr->wcs[wcs_idx] = WEOF;
782 		  if (pstr->mbs_allocated)
783 		    memset (pstr->mbs, 255, pstr->valid_len);
784 		}
785 	      pstr->valid_raw_len = pstr->valid_len;
786 	    }
787 	  else
788 #endif /* RE_ENABLE_I18N */
789 	    {
790 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 	      pstr->valid_raw_len = 0;
792 	      if (pstr->trans)
793 		c = pstr->trans[c];
794 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 				   ? CONTEXT_WORD
796 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 				      ? CONTEXT_NEWLINE : 0));
798 	    }
799 	}
800       if (!BE (pstr->mbs_allocated, 0))
801 	pstr->mbs += offset;
802     }
803   pstr->raw_mbs_idx = idx;
804   pstr->len -= offset;
805   pstr->stop -= offset;
806 
807   /* Then build the buffers.  */
808 #ifdef RE_ENABLE_I18N
809   if (pstr->mb_cur_max > 1)
810     {
811       if (pstr->icase)
812 	{
813 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 	  if (BE (ret != REG_NOERROR, 0))
815 	    return ret;
816 	}
817       else
818 	build_wcs_buffer (pstr);
819     }
820   else
821 #endif /* RE_ENABLE_I18N */
822     if (BE (pstr->mbs_allocated, 0))
823       {
824 	if (pstr->icase)
825 	  build_upper_buffer (pstr);
826 	else if (pstr->trans != NULL)
827 	  re_string_translate_buffer (pstr);
828       }
829     else
830       pstr->valid_len = pstr->len;
831 
832   pstr->cur_idx = 0;
833   return REG_NOERROR;
834 }
835 
836 static unsigned char
837 internal_function __attribute ((pure))
838 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
839 {
840   int ch;
841   Idx off;
842 
843   /* Handle the common (easiest) cases first.  */
844   if (BE (!pstr->mbs_allocated, 1))
845     return re_string_peek_byte (pstr, idx);
846 
847 #ifdef RE_ENABLE_I18N
848   if (pstr->mb_cur_max > 1
849       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850     return re_string_peek_byte (pstr, idx);
851 #endif
852 
853   off = pstr->cur_idx + idx;
854 #ifdef RE_ENABLE_I18N
855   if (pstr->offsets_needed)
856     off = pstr->offsets[off];
857 #endif
858 
859   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860 
861 #ifdef RE_ENABLE_I18N
862   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863      this function returns CAPITAL LETTER I instead of first byte of
864      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
865      since peek_byte_case doesn't advance cur_idx in any way.  */
866   if (pstr->offsets_needed && !isascii (ch))
867     return re_string_peek_byte (pstr, idx);
868 #endif
869 
870   return ch;
871 }
872 
873 static unsigned char
874 internal_function __attribute ((pure))
875 re_string_fetch_byte_case (re_string_t *pstr)
876 {
877   if (BE (!pstr->mbs_allocated, 1))
878     return re_string_fetch_byte (pstr);
879 
880 #ifdef RE_ENABLE_I18N
881   if (pstr->offsets_needed)
882     {
883       Idx off;
884       int ch;
885 
886       /* For tr_TR.UTF-8 [[:islower:]] there is
887 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
888 	 in that case the whole multi-byte character and return
889 	 the original letter.  On the other side, with
890 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
891 	 anything else would complicate things too much.  */
892 
893       if (!re_string_first_byte (pstr, pstr->cur_idx))
894 	return re_string_fetch_byte (pstr);
895 
896       off = pstr->offsets[pstr->cur_idx];
897       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
898 
899       if (! isascii (ch))
900 	return re_string_fetch_byte (pstr);
901 
902       re_string_skip_bytes (pstr,
903 			    re_string_char_size_at (pstr, pstr->cur_idx));
904       return ch;
905     }
906 #endif
907 
908   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
909 }
910 
911 static void
912 internal_function
913 re_string_destruct (re_string_t *pstr)
914 {
915 #ifdef RE_ENABLE_I18N
916   re_free (pstr->wcs);
917   re_free (pstr->offsets);
918 #endif /* RE_ENABLE_I18N  */
919   if (pstr->mbs_allocated)
920     re_free (pstr->mbs);
921 }
922 
923 /* Return the context at IDX in INPUT.  */
924 
925 static unsigned int
926 internal_function
927 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
928 {
929   int c;
930   if (BE (! REG_VALID_INDEX (idx), 0))
931     /* In this case, we use the value stored in input->tip_context,
932        since we can't know the character in input->mbs[-1] here.  */
933     return input->tip_context;
934   if (BE (idx == input->len, 0))
935     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
936 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
937 #ifdef RE_ENABLE_I18N
938   if (input->mb_cur_max > 1)
939     {
940       wint_t wc;
941       Idx wc_idx = idx;
942       while(input->wcs[wc_idx] == WEOF)
943 	{
944 #ifdef DEBUG
945 	  /* It must not happen.  */
946 	  assert (REG_VALID_INDEX (wc_idx));
947 #endif
948 	  --wc_idx;
949 	  if (! REG_VALID_INDEX (wc_idx))
950 	    return input->tip_context;
951 	}
952       wc = input->wcs[wc_idx];
953       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
954 	return CONTEXT_WORD;
955       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
956 	      ? CONTEXT_NEWLINE : 0);
957     }
958   else
959 #endif
960     {
961       c = re_string_byte_at (input, idx);
962       if (bitset_contain (input->word_char, c))
963 	return CONTEXT_WORD;
964       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
965     }
966 }
967 
968 /* Functions for set operation.  */
969 
970 static reg_errcode_t
971 internal_function __attribute_warn_unused_result__
972 re_node_set_alloc (re_node_set *set, Idx size)
973 {
974   set->alloc = size;
975   set->nelem = 0;
976   set->elems = re_malloc (Idx, size);
977   if (BE (set->elems == NULL, 0))
978     return REG_ESPACE;
979   return REG_NOERROR;
980 }
981 
982 static reg_errcode_t
983 internal_function __attribute_warn_unused_result__
984 re_node_set_init_1 (re_node_set *set, Idx elem)
985 {
986   set->alloc = 1;
987   set->nelem = 1;
988   set->elems = re_malloc (Idx, 1);
989   if (BE (set->elems == NULL, 0))
990     {
991       set->alloc = set->nelem = 0;
992       return REG_ESPACE;
993     }
994   set->elems[0] = elem;
995   return REG_NOERROR;
996 }
997 
998 static reg_errcode_t
999 internal_function __attribute_warn_unused_result__
1000 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1001 {
1002   set->alloc = 2;
1003   set->elems = re_malloc (Idx, 2);
1004   if (BE (set->elems == NULL, 0))
1005     return REG_ESPACE;
1006   if (elem1 == elem2)
1007     {
1008       set->nelem = 1;
1009       set->elems[0] = elem1;
1010     }
1011   else
1012     {
1013       set->nelem = 2;
1014       if (elem1 < elem2)
1015 	{
1016 	  set->elems[0] = elem1;
1017 	  set->elems[1] = elem2;
1018 	}
1019       else
1020 	{
1021 	  set->elems[0] = elem2;
1022 	  set->elems[1] = elem1;
1023 	}
1024     }
1025   return REG_NOERROR;
1026 }
1027 
1028 static reg_errcode_t
1029 internal_function __attribute_warn_unused_result__
1030 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1031 {
1032   dest->nelem = src->nelem;
1033   if (src->nelem > 0)
1034     {
1035       dest->alloc = dest->nelem;
1036       dest->elems = re_malloc (Idx, dest->alloc);
1037       if (BE (dest->elems == NULL, 0))
1038 	{
1039 	  dest->alloc = dest->nelem = 0;
1040 	  return REG_ESPACE;
1041 	}
1042       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1043     }
1044   else
1045     re_node_set_init_empty (dest);
1046   return REG_NOERROR;
1047 }
1048 
1049 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1050    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1051    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1052 
1053 static reg_errcode_t
1054 internal_function __attribute_warn_unused_result__
1055 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1056 			   const re_node_set *src2)
1057 {
1058   Idx i1, i2, is, id, delta, sbase;
1059   if (src1->nelem == 0 || src2->nelem == 0)
1060     return REG_NOERROR;
1061 
1062   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1063      conservative estimate.  */
1064   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1065     {
1066       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1067       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1068       if (BE (new_elems == NULL, 0))
1069 	return REG_ESPACE;
1070       dest->elems = new_elems;
1071       dest->alloc = new_alloc;
1072     }
1073 
1074   /* Find the items in the intersection of SRC1 and SRC2, and copy
1075      into the top of DEST those that are not already in DEST itself.  */
1076   sbase = dest->nelem + src1->nelem + src2->nelem;
1077   i1 = src1->nelem - 1;
1078   i2 = src2->nelem - 1;
1079   id = dest->nelem - 1;
1080   for (;;)
1081     {
1082       if (src1->elems[i1] == src2->elems[i2])
1083 	{
1084 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1085 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1086 	    --id;
1087 
1088           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1089             dest->elems[--sbase] = src1->elems[i1];
1090 
1091 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1092 	    break;
1093 	}
1094 
1095       /* Lower the highest of the two items.  */
1096       else if (src1->elems[i1] < src2->elems[i2])
1097 	{
1098 	  if (! REG_VALID_INDEX (--i2))
1099 	    break;
1100 	}
1101       else
1102 	{
1103 	  if (! REG_VALID_INDEX (--i1))
1104 	    break;
1105 	}
1106     }
1107 
1108   id = dest->nelem - 1;
1109   is = dest->nelem + src1->nelem + src2->nelem - 1;
1110   delta = is - sbase + 1;
1111 
1112   /* Now copy.  When DELTA becomes zero, the remaining
1113      DEST elements are already in place; this is more or
1114      less the same loop that is in re_node_set_merge.  */
1115   dest->nelem += delta;
1116   if (delta > 0 && REG_VALID_INDEX (id))
1117     for (;;)
1118       {
1119 	if (dest->elems[is] > dest->elems[id])
1120 	  {
1121 	    /* Copy from the top.  */
1122 	    dest->elems[id + delta--] = dest->elems[is--];
1123 	    if (delta == 0)
1124 	      break;
1125 	  }
1126 	else
1127 	  {
1128 	    /* Slide from the bottom.  */
1129 	    dest->elems[id + delta] = dest->elems[id];
1130 	    if (! REG_VALID_INDEX (--id))
1131 	      break;
1132 	  }
1133       }
1134 
1135   /* Copy remaining SRC elements.  */
1136   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1137 
1138   return REG_NOERROR;
1139 }
1140 
1141 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1142    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1143 
1144 static reg_errcode_t
1145 internal_function __attribute_warn_unused_result__
1146 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1147 			const re_node_set *src2)
1148 {
1149   Idx i1, i2, id;
1150   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1151     {
1152       dest->alloc = src1->nelem + src2->nelem;
1153       dest->elems = re_malloc (Idx, dest->alloc);
1154       if (BE (dest->elems == NULL, 0))
1155 	return REG_ESPACE;
1156     }
1157   else
1158     {
1159       if (src1 != NULL && src1->nelem > 0)
1160 	return re_node_set_init_copy (dest, src1);
1161       else if (src2 != NULL && src2->nelem > 0)
1162 	return re_node_set_init_copy (dest, src2);
1163       else
1164 	re_node_set_init_empty (dest);
1165       return REG_NOERROR;
1166     }
1167   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1168     {
1169       if (src1->elems[i1] > src2->elems[i2])
1170 	{
1171 	  dest->elems[id++] = src2->elems[i2++];
1172 	  continue;
1173 	}
1174       if (src1->elems[i1] == src2->elems[i2])
1175 	++i2;
1176       dest->elems[id++] = src1->elems[i1++];
1177     }
1178   if (i1 < src1->nelem)
1179     {
1180       memcpy (dest->elems + id, src1->elems + i1,
1181 	     (src1->nelem - i1) * sizeof (Idx));
1182       id += src1->nelem - i1;
1183     }
1184   else if (i2 < src2->nelem)
1185     {
1186       memcpy (dest->elems + id, src2->elems + i2,
1187 	     (src2->nelem - i2) * sizeof (Idx));
1188       id += src2->nelem - i2;
1189     }
1190   dest->nelem = id;
1191   return REG_NOERROR;
1192 }
1193 
1194 /* Calculate the union set of the sets DEST and SRC. And store it to
1195    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1196 
1197 static reg_errcode_t
1198 internal_function __attribute_warn_unused_result__
1199 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1200 {
1201   Idx is, id, sbase, delta;
1202   if (src == NULL || src->nelem == 0)
1203     return REG_NOERROR;
1204   if (dest->alloc < 2 * src->nelem + dest->nelem)
1205     {
1206       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1207       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1208       if (BE (new_buffer == NULL, 0))
1209 	return REG_ESPACE;
1210       dest->elems = new_buffer;
1211       dest->alloc = new_alloc;
1212     }
1213 
1214   if (BE (dest->nelem == 0, 0))
1215     {
1216       dest->nelem = src->nelem;
1217       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1218       return REG_NOERROR;
1219     }
1220 
1221   /* Copy into the top of DEST the items of SRC that are not
1222      found in DEST.  Maybe we could binary search in DEST?  */
1223   for (sbase = dest->nelem + 2 * src->nelem,
1224        is = src->nelem - 1, id = dest->nelem - 1;
1225        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1226     {
1227       if (dest->elems[id] == src->elems[is])
1228 	is--, id--;
1229       else if (dest->elems[id] < src->elems[is])
1230 	dest->elems[--sbase] = src->elems[is--];
1231       else /* if (dest->elems[id] > src->elems[is]) */
1232 	--id;
1233     }
1234 
1235   if (REG_VALID_INDEX (is))
1236     {
1237       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1238       sbase -= is + 1;
1239       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1240     }
1241 
1242   id = dest->nelem - 1;
1243   is = dest->nelem + 2 * src->nelem - 1;
1244   delta = is - sbase + 1;
1245   if (delta == 0)
1246     return REG_NOERROR;
1247 
1248   /* Now copy.  When DELTA becomes zero, the remaining
1249      DEST elements are already in place.  */
1250   dest->nelem += delta;
1251   for (;;)
1252     {
1253       if (dest->elems[is] > dest->elems[id])
1254 	{
1255 	  /* Copy from the top.  */
1256 	  dest->elems[id + delta--] = dest->elems[is--];
1257 	  if (delta == 0)
1258 	    break;
1259 	}
1260       else
1261 	{
1262 	  /* Slide from the bottom.  */
1263 	  dest->elems[id + delta] = dest->elems[id];
1264 	  if (! REG_VALID_INDEX (--id))
1265 	    {
1266 	      /* Copy remaining SRC elements.  */
1267 	      memcpy (dest->elems, dest->elems + sbase,
1268 		      delta * sizeof (Idx));
1269 	      break;
1270 	    }
1271 	}
1272     }
1273 
1274   return REG_NOERROR;
1275 }
1276 
1277 /* Insert the new element ELEM to the re_node_set* SET.
1278    SET should not already have ELEM.
1279    Return true if successful.  */
1280 
1281 static bool
1282 internal_function __attribute_warn_unused_result__
1283 re_node_set_insert (re_node_set *set, Idx elem)
1284 {
1285   Idx idx;
1286   /* In case the set is empty.  */
1287   if (set->alloc == 0)
1288     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1289 
1290   if (BE (set->nelem, 0) == 0)
1291     {
1292       /* We already guaranteed above that set->alloc != 0.  */
1293       set->elems[0] = elem;
1294       ++set->nelem;
1295       return true;
1296     }
1297 
1298   /* Realloc if we need.  */
1299   if (set->alloc == set->nelem)
1300     {
1301       Idx *new_elems;
1302       set->alloc = set->alloc * 2;
1303       new_elems = re_realloc (set->elems, Idx, set->alloc);
1304       if (BE (new_elems == NULL, 0))
1305 	return false;
1306       set->elems = new_elems;
1307     }
1308 
1309   /* Move the elements which follows the new element.  Test the
1310      first element separately to skip a check in the inner loop.  */
1311   if (elem < set->elems[0])
1312     {
1313       idx = 0;
1314       for (idx = set->nelem; idx > 0; idx--)
1315 	set->elems[idx] = set->elems[idx - 1];
1316     }
1317   else
1318     {
1319       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1320 	set->elems[idx] = set->elems[idx - 1];
1321     }
1322 
1323   /* Insert the new element.  */
1324   set->elems[idx] = elem;
1325   ++set->nelem;
1326   return true;
1327 }
1328 
1329 /* Insert the new element ELEM to the re_node_set* SET.
1330    SET should not already have any element greater than or equal to ELEM.
1331    Return true if successful.  */
1332 
1333 static bool
1334 internal_function __attribute_warn_unused_result__
1335 re_node_set_insert_last (re_node_set *set, Idx elem)
1336 {
1337   /* Realloc if we need.  */
1338   if (set->alloc == set->nelem)
1339     {
1340       Idx *new_elems;
1341       set->alloc = (set->alloc + 1) * 2;
1342       new_elems = re_realloc (set->elems, Idx, set->alloc);
1343       if (BE (new_elems == NULL, 0))
1344 	return false;
1345       set->elems = new_elems;
1346     }
1347 
1348   /* Insert the new element.  */
1349   set->elems[set->nelem++] = elem;
1350   return true;
1351 }
1352 
1353 /* Compare two node sets SET1 and SET2.
1354    Return true if SET1 and SET2 are equivalent.  */
1355 
1356 static bool
1357 internal_function __attribute ((pure))
1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1359 {
1360   Idx i;
1361   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1362     return false;
1363   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1364     if (set1->elems[i] != set2->elems[i])
1365       return false;
1366   return true;
1367 }
1368 
1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1370 
1371 static Idx
1372 internal_function __attribute ((pure))
1373 re_node_set_contains (const re_node_set *set, Idx elem)
1374 {
1375   __re_size_t idx, right, mid;
1376   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1377     return 0;
1378 
1379   /* Binary search the element.  */
1380   idx = 0;
1381   right = set->nelem - 1;
1382   while (idx < right)
1383     {
1384       mid = (idx + right) / 2;
1385       if (set->elems[mid] < elem)
1386 	idx = mid + 1;
1387       else
1388 	right = mid;
1389     }
1390   return set->elems[idx] == elem ? idx + 1 : 0;
1391 }
1392 
1393 static void
1394 internal_function
1395 re_node_set_remove_at (re_node_set *set, Idx idx)
1396 {
1397   verify (! TYPE_SIGNED (Idx));
1398   /* if (idx < 0)
1399      return; */
1400   if (idx >= set->nelem)
1401     return;
1402   --set->nelem;
1403   for (; idx < set->nelem; idx++)
1404     set->elems[idx] = set->elems[idx + 1];
1405 }
1406 
1407 
1408 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1409    Or return REG_MISSING if an error occurred.  */
1410 
1411 static Idx
1412 internal_function
1413 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1414 {
1415   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1416     {
1417       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1418       Idx *new_nexts, *new_indices;
1419       re_node_set *new_edests, *new_eclosures;
1420       re_token_t *new_nodes;
1421       size_t max_object_size =
1422 	MAX (sizeof (re_token_t),
1423 	     MAX (sizeof (re_node_set),
1424 		  sizeof (Idx)));
1425 
1426       /* Avoid overflows.  */
1427       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1428 	return REG_MISSING;
1429 
1430       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1431       if (BE (new_nodes == NULL, 0))
1432 	return REG_MISSING;
1433       dfa->nodes = new_nodes;
1434       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1435       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1436       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1437       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1438       if (BE (new_nexts == NULL || new_indices == NULL
1439 	      || new_edests == NULL || new_eclosures == NULL, 0))
1440 	return REG_MISSING;
1441       dfa->nexts = new_nexts;
1442       dfa->org_indices = new_indices;
1443       dfa->edests = new_edests;
1444       dfa->eclosures = new_eclosures;
1445       dfa->nodes_alloc = new_nodes_alloc;
1446     }
1447   dfa->nodes[dfa->nodes_len] = token;
1448   dfa->nodes[dfa->nodes_len].constraint = 0;
1449 #ifdef RE_ENABLE_I18N
1450   {
1451   int type = token.type;
1452   dfa->nodes[dfa->nodes_len].accept_mb =
1453     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1454   }
1455 #endif
1456   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1457   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1458   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1459   return dfa->nodes_len++;
1460 }
1461 
1462 static inline re_hashval_t
1463 internal_function
1464 calc_state_hash (const re_node_set *nodes, unsigned int context)
1465 {
1466   re_hashval_t hash = nodes->nelem + context;
1467   Idx i;
1468   for (i = 0 ; i < nodes->nelem ; i++)
1469     hash += nodes->elems[i];
1470   return hash;
1471 }
1472 
1473 /* Search for the state whose node_set is equivalent to NODES.
1474    Return the pointer to the state, if we found it in the DFA.
1475    Otherwise create the new one and return it.  In case of an error
1476    return NULL and set the error code in ERR.
1477    Note: - We assume NULL as the invalid state, then it is possible that
1478 	   return value is NULL and ERR is REG_NOERROR.
1479 	 - We never return non-NULL value in case of any errors, it is for
1480 	   optimization.  */
1481 
1482 static re_dfastate_t *
1483 internal_function __attribute_warn_unused_result__
1484 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1485 		  const re_node_set *nodes)
1486 {
1487   re_hashval_t hash;
1488   re_dfastate_t *new_state;
1489   struct re_state_table_entry *spot;
1490   Idx i;
1491 #ifdef lint
1492   /* Suppress bogus uninitialized-variable warnings.  */
1493   *err = REG_NOERROR;
1494 #endif
1495   if (BE (nodes->nelem == 0, 0))
1496     {
1497       *err = REG_NOERROR;
1498       return NULL;
1499     }
1500   hash = calc_state_hash (nodes, 0);
1501   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502 
1503   for (i = 0 ; i < spot->num ; i++)
1504     {
1505       re_dfastate_t *state = spot->array[i];
1506       if (hash != state->hash)
1507 	continue;
1508       if (re_node_set_compare (&state->nodes, nodes))
1509 	return state;
1510     }
1511 
1512   /* There are no appropriate state in the dfa, create the new one.  */
1513   new_state = create_ci_newstate (dfa, nodes, hash);
1514   if (BE (new_state == NULL, 0))
1515     *err = REG_ESPACE;
1516 
1517   return new_state;
1518 }
1519 
1520 /* Search for the state whose node_set is equivalent to NODES and
1521    whose context is equivalent to CONTEXT.
1522    Return the pointer to the state, if we found it in the DFA.
1523    Otherwise create the new one and return it.  In case of an error
1524    return NULL and set the error code in ERR.
1525    Note: - We assume NULL as the invalid state, then it is possible that
1526 	   return value is NULL and ERR is REG_NOERROR.
1527 	 - We never return non-NULL value in case of any errors, it is for
1528 	   optimization.  */
1529 
1530 static re_dfastate_t *
1531 internal_function __attribute_warn_unused_result__
1532 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1533 			  const re_node_set *nodes, unsigned int context)
1534 {
1535   re_hashval_t hash;
1536   re_dfastate_t *new_state;
1537   struct re_state_table_entry *spot;
1538   Idx i;
1539 #ifdef lint
1540   /* Suppress bogus uninitialized-variable warnings.  */
1541   *err = REG_NOERROR;
1542 #endif
1543   if (nodes->nelem == 0)
1544     {
1545       *err = REG_NOERROR;
1546       return NULL;
1547     }
1548   hash = calc_state_hash (nodes, context);
1549   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1550 
1551   for (i = 0 ; i < spot->num ; i++)
1552     {
1553       re_dfastate_t *state = spot->array[i];
1554       if (state->hash == hash
1555 	  && state->context == context
1556 	  && re_node_set_compare (state->entrance_nodes, nodes))
1557 	return state;
1558     }
1559   /* There are no appropriate state in `dfa', create the new one.  */
1560   new_state = create_cd_newstate (dfa, nodes, context, hash);
1561   if (BE (new_state == NULL, 0))
1562     *err = REG_ESPACE;
1563 
1564   return new_state;
1565 }
1566 
1567 /* Finish initialization of the new state NEWSTATE, and using its hash value
1568    HASH put in the appropriate bucket of DFA's state table.  Return value
1569    indicates the error code if failed.  */
1570 
1571 static reg_errcode_t
1572 __attribute_warn_unused_result__
1573 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1574 		re_hashval_t hash)
1575 {
1576   struct re_state_table_entry *spot;
1577   reg_errcode_t err;
1578   Idx i;
1579 
1580   newstate->hash = hash;
1581   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1582   if (BE (err != REG_NOERROR, 0))
1583     return REG_ESPACE;
1584   for (i = 0; i < newstate->nodes.nelem; i++)
1585     {
1586       Idx elem = newstate->nodes.elems[i];
1587       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1588 	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1589 	  return REG_ESPACE;
1590     }
1591 
1592   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1593   if (BE (spot->alloc <= spot->num, 0))
1594     {
1595       Idx new_alloc = 2 * spot->num + 2;
1596       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1597 					      new_alloc);
1598       if (BE (new_array == NULL, 0))
1599 	return REG_ESPACE;
1600       spot->array = new_array;
1601       spot->alloc = new_alloc;
1602     }
1603   spot->array[spot->num++] = newstate;
1604   return REG_NOERROR;
1605 }
1606 
1607 static void
1608 free_state (re_dfastate_t *state)
1609 {
1610   re_node_set_free (&state->non_eps_nodes);
1611   re_node_set_free (&state->inveclosure);
1612   if (state->entrance_nodes != &state->nodes)
1613     {
1614       re_node_set_free (state->entrance_nodes);
1615       re_free (state->entrance_nodes);
1616     }
1617   re_node_set_free (&state->nodes);
1618   re_free (state->word_trtable);
1619   re_free (state->trtable);
1620   re_free (state);
1621 }
1622 
1623 /* Create the new state which is independ of contexts.
1624    Return the new state if succeeded, otherwise return NULL.  */
1625 
1626 static re_dfastate_t *
1627 internal_function __attribute_warn_unused_result__
1628 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1629 		    re_hashval_t hash)
1630 {
1631   Idx i;
1632   reg_errcode_t err;
1633   re_dfastate_t *newstate;
1634 
1635   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1636   if (BE (newstate == NULL, 0))
1637     return NULL;
1638   err = re_node_set_init_copy (&newstate->nodes, nodes);
1639   if (BE (err != REG_NOERROR, 0))
1640     {
1641       re_free (newstate);
1642       return NULL;
1643     }
1644 
1645   newstate->entrance_nodes = &newstate->nodes;
1646   for (i = 0 ; i < nodes->nelem ; i++)
1647     {
1648       re_token_t *node = dfa->nodes + nodes->elems[i];
1649       re_token_type_t type = node->type;
1650       if (type == CHARACTER && !node->constraint)
1651 	continue;
1652 #ifdef RE_ENABLE_I18N
1653       newstate->accept_mb |= node->accept_mb;
1654 #endif /* RE_ENABLE_I18N */
1655 
1656       /* If the state has the halt node, the state is a halt state.  */
1657       if (type == END_OF_RE)
1658 	newstate->halt = 1;
1659       else if (type == OP_BACK_REF)
1660 	newstate->has_backref = 1;
1661       else if (type == ANCHOR || node->constraint)
1662 	newstate->has_constraint = 1;
1663     }
1664   err = register_state (dfa, newstate, hash);
1665   if (BE (err != REG_NOERROR, 0))
1666     {
1667       free_state (newstate);
1668       newstate = NULL;
1669     }
1670   return newstate;
1671 }
1672 
1673 /* Create the new state which is depend on the context CONTEXT.
1674    Return the new state if succeeded, otherwise return NULL.  */
1675 
1676 static re_dfastate_t *
1677 internal_function __attribute_warn_unused_result__
1678 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1679 		    unsigned int context, re_hashval_t hash)
1680 {
1681   Idx i, nctx_nodes = 0;
1682   reg_errcode_t err;
1683   re_dfastate_t *newstate;
1684 
1685   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1686   if (BE (newstate == NULL, 0))
1687     return NULL;
1688   err = re_node_set_init_copy (&newstate->nodes, nodes);
1689   if (BE (err != REG_NOERROR, 0))
1690     {
1691       re_free (newstate);
1692       return NULL;
1693     }
1694 
1695   newstate->context = context;
1696   newstate->entrance_nodes = &newstate->nodes;
1697 
1698   for (i = 0 ; i < nodes->nelem ; i++)
1699     {
1700       re_token_t *node = dfa->nodes + nodes->elems[i];
1701       re_token_type_t type = node->type;
1702       unsigned int constraint = node->constraint;
1703 
1704       if (type == CHARACTER && !constraint)
1705 	continue;
1706 #ifdef RE_ENABLE_I18N
1707       newstate->accept_mb |= node->accept_mb;
1708 #endif /* RE_ENABLE_I18N */
1709 
1710       /* If the state has the halt node, the state is a halt state.  */
1711       if (type == END_OF_RE)
1712 	newstate->halt = 1;
1713       else if (type == OP_BACK_REF)
1714 	newstate->has_backref = 1;
1715 
1716       if (constraint)
1717 	{
1718 	  if (newstate->entrance_nodes == &newstate->nodes)
1719 	    {
1720 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1721 	      if (BE (newstate->entrance_nodes == NULL, 0))
1722 		{
1723 		  free_state (newstate);
1724 		  return NULL;
1725 		}
1726 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1727 		  != REG_NOERROR)
1728 		return NULL;
1729 	      nctx_nodes = 0;
1730 	      newstate->has_constraint = 1;
1731 	    }
1732 
1733 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1734 	    {
1735 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1736 	      ++nctx_nodes;
1737 	    }
1738 	}
1739     }
1740   err = register_state (dfa, newstate, hash);
1741   if (BE (err != REG_NOERROR, 0))
1742     {
1743       free_state (newstate);
1744       newstate = NULL;
1745     }
1746   return  newstate;
1747 }
1748