1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static void re_string_construct_common (const char *str, int len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, int icase,
23 					const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  unsigned int hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  unsigned int hash) internal_function;
31 
32 #ifdef GAWK
33 #undef MAX	/* safety */
34 static int
MAX(size_t a,size_t b)35 MAX(size_t a, size_t b)
36 {
37 	return (a > b ? a : b);
38 }
39 #endif
40 
41 /* Functions for string operation.  */
42 
43 /* This function allocate the buffers.  It is necessary to call
44    re_string_reconstruct before using the object.  */
45 
46 static reg_errcode_t
47 internal_function
re_string_allocate(re_string_t * pstr,const char * str,int len,int init_len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)48 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
49 		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
50 {
51   reg_errcode_t ret;
52   int init_buf_len;
53 
54   /* Ensure at least one character fits into the buffers.  */
55   if (init_len < dfa->mb_cur_max)
56     init_len = dfa->mb_cur_max;
57   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
58   re_string_construct_common (str, len, pstr, trans, icase, dfa);
59 
60   ret = re_string_realloc_buffers (pstr, init_buf_len);
61   if (BE (ret != REG_NOERROR, 0))
62     return ret;
63 
64   pstr->word_char = dfa->word_char;
65   pstr->word_ops_used = dfa->word_ops_used;
66   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
67   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
68   pstr->valid_raw_len = pstr->valid_len;
69   return REG_NOERROR;
70 }
71 
72 /* This function allocate the buffers, and initialize them.  */
73 
74 static reg_errcode_t
75 internal_function
re_string_construct(re_string_t * pstr,const char * str,int len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)76 re_string_construct (re_string_t *pstr, const char *str, int len,
77 		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
78 {
79   reg_errcode_t ret;
80   memset (pstr, '\0', sizeof (re_string_t));
81   re_string_construct_common (str, len, pstr, trans, icase, dfa);
82 
83   if (len > 0)
84     {
85       ret = re_string_realloc_buffers (pstr, len + 1);
86       if (BE (ret != REG_NOERROR, 0))
87 	return ret;
88     }
89   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
90 
91   if (icase)
92     {
93 #ifdef RE_ENABLE_I18N
94       if (dfa->mb_cur_max > 1)
95 	{
96 	  while (1)
97 	    {
98 	      ret = build_wcs_upper_buffer (pstr);
99 	      if (BE (ret != REG_NOERROR, 0))
100 		return ret;
101 	      if (pstr->valid_raw_len >= len)
102 		break;
103 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
104 		break;
105 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
106 	      if (BE (ret != REG_NOERROR, 0))
107 		return ret;
108 	    }
109 	}
110       else
111 #endif /* RE_ENABLE_I18N  */
112 	build_upper_buffer (pstr);
113     }
114   else
115     {
116 #ifdef RE_ENABLE_I18N
117       if (dfa->mb_cur_max > 1)
118 	build_wcs_buffer (pstr);
119       else
120 #endif /* RE_ENABLE_I18N  */
121 	{
122 	  if (trans != NULL)
123 	    re_string_translate_buffer (pstr);
124 	  else
125 	    {
126 	      pstr->valid_len = pstr->bufs_len;
127 	      pstr->valid_raw_len = pstr->bufs_len;
128 	    }
129 	}
130     }
131 
132   return REG_NOERROR;
133 }
134 
135 /* Helper functions for re_string_allocate, and re_string_construct.  */
136 
137 static reg_errcode_t
138 internal_function
re_string_realloc_buffers(re_string_t * pstr,int new_buf_len)139 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
140 {
141 #ifdef RE_ENABLE_I18N
142   if (pstr->mb_cur_max > 1)
143     {
144       wint_t *new_wcs;
145 
146       /* Avoid overflow in realloc.  */
147       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
148       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
149 	return REG_ESPACE;
150 
151       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
152       if (BE (new_wcs == NULL, 0))
153 	return REG_ESPACE;
154       pstr->wcs = new_wcs;
155       if (pstr->offsets != NULL)
156 	{
157 	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
158 	  if (BE (new_offsets == NULL, 0))
159 	    return REG_ESPACE;
160 	  pstr->offsets = new_offsets;
161 	}
162     }
163 #endif /* RE_ENABLE_I18N  */
164   if (pstr->mbs_allocated)
165     {
166       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
167 					   new_buf_len);
168       if (BE (new_mbs == NULL, 0))
169 	return REG_ESPACE;
170       pstr->mbs = new_mbs;
171     }
172   pstr->bufs_len = new_buf_len;
173   return REG_NOERROR;
174 }
175 
176 
177 static void
178 internal_function
re_string_construct_common(const char * str,int len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)179 re_string_construct_common (const char *str, int len, re_string_t *pstr,
180 			    RE_TRANSLATE_TYPE trans, int icase,
181 			    const re_dfa_t *dfa)
182 {
183   pstr->raw_mbs = (const unsigned char *) str;
184   pstr->len = len;
185   pstr->raw_len = len;
186   pstr->trans = trans;
187   pstr->icase = icase ? 1 : 0;
188   pstr->mbs_allocated = (trans != NULL || icase);
189   pstr->mb_cur_max = dfa->mb_cur_max;
190   pstr->is_utf8 = dfa->is_utf8;
191   pstr->map_notascii = dfa->map_notascii;
192   pstr->stop = pstr->len;
193   pstr->raw_stop = pstr->stop;
194 }
195 
196 #ifdef RE_ENABLE_I18N
197 
198 /* Build wide character buffer PSTR->WCS.
199    If the byte sequence of the string are:
200      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201    Then wide character buffer will be:
202      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
203    We use WEOF for padding, they indicate that the position isn't
204    a first byte of a multibyte character.
205 
206    Note that this function assumes PSTR->VALID_LEN elements are already
207    built and starts from PSTR->VALID_LEN.  */
208 
209 static void
210 internal_function
build_wcs_buffer(re_string_t * pstr)211 build_wcs_buffer (re_string_t *pstr)
212 {
213 #ifdef _LIBC
214   unsigned char buf[MB_LEN_MAX];
215   assert (MB_LEN_MAX >= pstr->mb_cur_max);
216 #else
217   unsigned char buf[64];
218 #endif
219   mbstate_t prev_st;
220   int byte_idx, end_idx, remain_len;
221   size_t mbclen;
222 
223   /* Build the buffers from pstr->valid_len to either pstr->len or
224      pstr->bufs_len.  */
225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227     {
228       wchar_t wc;
229       const char *p;
230 
231       remain_len = end_idx - byte_idx;
232       prev_st = pstr->cur_state;
233       /* Apply the translation if we need.  */
234       if (BE (pstr->trans != NULL, 0))
235 	{
236 	  int i, ch;
237 
238 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 	    {
240 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 	    }
243 	  p = (const char *) buf;
244 	}
245       else
246 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248       if (BE (mbclen == (size_t) -2, 0))
249 	{
250 	  /* The buffer doesn't have enough space, finish to build.  */
251 	  pstr->cur_state = prev_st;
252 	  break;
253 	}
254       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
255 	{
256 	  /* We treat these cases as a singlebyte character.  */
257 	  mbclen = 1;
258 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
259 	  if (BE (pstr->trans != NULL, 0))
260 	    wc = pstr->trans[wc];
261 	  pstr->cur_state = prev_st;
262 	}
263 
264       /* Write wide character and padding.  */
265       pstr->wcs[byte_idx++] = wc;
266       /* Write paddings.  */
267       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
268 	pstr->wcs[byte_idx++] = WEOF;
269     }
270   pstr->valid_len = byte_idx;
271   pstr->valid_raw_len = byte_idx;
272 }
273 
274 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275    but for REG_ICASE.  */
276 
277 static reg_errcode_t
278 internal_function
build_wcs_upper_buffer(re_string_t * pstr)279 build_wcs_upper_buffer (re_string_t *pstr)
280 {
281   mbstate_t prev_st;
282   int src_idx, byte_idx, end_idx, remain_len;
283   size_t mbclen;
284 #ifdef _LIBC
285   char buf[MB_LEN_MAX];
286   assert (MB_LEN_MAX >= pstr->mb_cur_max);
287 #else
288   char buf[64];
289 #endif
290 
291   byte_idx = pstr->valid_len;
292   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293 
294   /* The following optimization assumes that ASCII characters can be
295      mapped to wide characters with a simple cast.  */
296   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297     {
298       while (byte_idx < end_idx)
299 	{
300 	  wchar_t wc;
301 
302 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 	      && mbsinit (&pstr->cur_state))
304 	    {
305 	      /* In case of a singlebyte character.  */
306 	      pstr->mbs[byte_idx]
307 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 	      /* The next step uses the assumption that wchar_t is encoded
309 		 ASCII-safe: all ASCII values can be converted like this.  */
310 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 	      ++byte_idx;
312 	      continue;
313 	    }
314 
315 	  remain_len = end_idx - byte_idx;
316 	  prev_st = pstr->cur_state;
317 	  mbclen = __mbrtowc (&wc,
318 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 			       + byte_idx), remain_len, &pstr->cur_state);
320 	  if (BE (mbclen + 2 > 2, 1))
321 	    {
322 	      wchar_t wcu = wc;
323 	      if (iswlower (wc))
324 		{
325 		  size_t mbcdlen;
326 
327 		  wcu = towupper (wc);
328 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
329 		  if (BE (mbclen == mbcdlen, 1))
330 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
331 		  else
332 		    {
333 		      src_idx = byte_idx;
334 		      goto offsets_needed;
335 		    }
336 		}
337 	      else
338 		memcpy (pstr->mbs + byte_idx,
339 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
340 	      pstr->wcs[byte_idx++] = wcu;
341 	      /* Write paddings.  */
342 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
343 		pstr->wcs[byte_idx++] = WEOF;
344 	    }
345 	  else if (mbclen == (size_t) -1 || mbclen == 0)
346 	    {
347 	      /* It is an invalid character or '\0'.  Just use the byte.  */
348 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
349 	      pstr->mbs[byte_idx] = ch;
350 	      /* And also cast it to wide char.  */
351 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
352 	      if (BE (mbclen == (size_t) -1, 0))
353 		pstr->cur_state = prev_st;
354 	    }
355 	  else
356 	    {
357 	      /* The buffer doesn't have enough space, finish to build.  */
358 	      pstr->cur_state = prev_st;
359 	      break;
360 	    }
361 	}
362       pstr->valid_len = byte_idx;
363       pstr->valid_raw_len = byte_idx;
364       return REG_NOERROR;
365     }
366   else
367     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
368       {
369 	wchar_t wc;
370 	const char *p;
371       offsets_needed:
372 	remain_len = end_idx - byte_idx;
373 	prev_st = pstr->cur_state;
374 	if (BE (pstr->trans != NULL, 0))
375 	  {
376 	    int i, ch;
377 
378 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
379 	      {
380 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
381 		buf[i] = pstr->trans[ch];
382 	      }
383 	    p = (const char *) buf;
384 	  }
385 	else
386 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
387 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
388 	if (BE (mbclen + 2 > 2, 1))
389 	  {
390 	    wchar_t wcu = wc;
391 	    if (iswlower (wc))
392 	      {
393 		size_t mbcdlen;
394 
395 		wcu = towupper (wc);
396 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
397 		if (BE (mbclen == mbcdlen, 1))
398 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
399 		else if (mbcdlen != (size_t) -1)
400 		  {
401 		    size_t i;
402 
403 		    if (byte_idx + mbcdlen > pstr->bufs_len)
404 		      {
405 			pstr->cur_state = prev_st;
406 			break;
407 		      }
408 
409 		    if (pstr->offsets == NULL)
410 		      {
411 			pstr->offsets = re_malloc (int, pstr->bufs_len);
412 
413 			if (pstr->offsets == NULL)
414 			  return REG_ESPACE;
415 		      }
416 		    if (!pstr->offsets_needed)
417 		      {
418 			for (i = 0; i < (size_t) byte_idx; ++i)
419 			  pstr->offsets[i] = i;
420 			pstr->offsets_needed = 1;
421 		      }
422 
423 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 		    pstr->wcs[byte_idx] = wcu;
425 		    pstr->offsets[byte_idx] = src_idx;
426 		    for (i = 1; i < mbcdlen; ++i)
427 		      {
428 			pstr->offsets[byte_idx + i]
429 			  = src_idx + (i < mbclen ? i : mbclen - 1);
430 			pstr->wcs[byte_idx + i] = WEOF;
431 		      }
432 		    pstr->len += mbcdlen - mbclen;
433 		    if (pstr->raw_stop > src_idx)
434 		      pstr->stop += mbcdlen - mbclen;
435 		    end_idx = (pstr->bufs_len > pstr->len)
436 			      ? pstr->len : pstr->bufs_len;
437 		    byte_idx += mbcdlen;
438 		    src_idx += mbclen;
439 		    continue;
440 		  }
441 		else
442 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
443 	      }
444 	    else
445 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
446 
447 	    if (BE (pstr->offsets_needed != 0, 0))
448 	      {
449 		size_t i;
450 		for (i = 0; i < mbclen; ++i)
451 		  pstr->offsets[byte_idx + i] = src_idx + i;
452 	      }
453 	    src_idx += mbclen;
454 
455 	    pstr->wcs[byte_idx++] = wcu;
456 	    /* Write paddings.  */
457 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 	      pstr->wcs[byte_idx++] = WEOF;
459 	  }
460 	else if (mbclen == (size_t) -1 || mbclen == 0)
461 	  {
462 	    /* It is an invalid character or '\0'.  Just use the byte.  */
463 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
464 
465 	    if (BE (pstr->trans != NULL, 0))
466 	      ch = pstr->trans [ch];
467 	    pstr->mbs[byte_idx] = ch;
468 
469 	    if (BE (pstr->offsets_needed != 0, 0))
470 	      pstr->offsets[byte_idx] = src_idx;
471 	    ++src_idx;
472 
473 	    /* And also cast it to wide char.  */
474 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
475 	    if (BE (mbclen == (size_t) -1, 0))
476 	      pstr->cur_state = prev_st;
477 	  }
478 	else
479 	  {
480 	    /* The buffer doesn't have enough space, finish to build.  */
481 	    pstr->cur_state = prev_st;
482 	    break;
483 	  }
484       }
485   pstr->valid_len = byte_idx;
486   pstr->valid_raw_len = src_idx;
487   return REG_NOERROR;
488 }
489 
490 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
491    Return the index.  */
492 
493 static int
494 internal_function
re_string_skip_chars(re_string_t * pstr,int new_raw_idx,wint_t * last_wc)495 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
496 {
497   mbstate_t prev_st;
498   int rawbuf_idx;
499   size_t mbclen;
500   wint_t wc = WEOF;
501 
502   /* Skip the characters which are not necessary to check.  */
503   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504        rawbuf_idx < new_raw_idx;)
505     {
506       wchar_t wc2;
507       int remain_len = pstr->len - rawbuf_idx;
508       prev_st = pstr->cur_state;
509       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510 			  remain_len, &pstr->cur_state);
511       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
512 	{
513 	  /* We treat these cases as a single byte character.  */
514 	  if (mbclen == 0 || remain_len == 0)
515 	    wc = L'\0';
516 	  else
517 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
518 	  mbclen = 1;
519 	  pstr->cur_state = prev_st;
520 	}
521       else
522 	wc = (wint_t) wc2;
523       /* Then proceed the next character.  */
524       rawbuf_idx += mbclen;
525     }
526   *last_wc = (wint_t) wc;
527   return rawbuf_idx;
528 }
529 #endif /* RE_ENABLE_I18N  */
530 
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532    This function is used in case of REG_ICASE.  */
533 
534 static void
535 internal_function
build_upper_buffer(re_string_t * pstr)536 build_upper_buffer (re_string_t *pstr)
537 {
538   int char_idx, end_idx;
539   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
540 
541   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
542     {
543       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544       if (BE (pstr->trans != NULL, 0))
545 	ch = pstr->trans[ch];
546       if (islower (ch))
547 	pstr->mbs[char_idx] = toupper (ch);
548       else
549 	pstr->mbs[char_idx] = ch;
550     }
551   pstr->valid_len = char_idx;
552   pstr->valid_raw_len = char_idx;
553 }
554 
555 /* Apply TRANS to the buffer in PSTR.  */
556 
557 static void
558 internal_function
re_string_translate_buffer(re_string_t * pstr)559 re_string_translate_buffer (re_string_t *pstr)
560 {
561   int buf_idx, end_idx;
562   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563 
564   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565     {
566       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567       pstr->mbs[buf_idx] = pstr->trans[ch];
568     }
569 
570   pstr->valid_len = buf_idx;
571   pstr->valid_raw_len = buf_idx;
572 }
573 
574 /* This function re-construct the buffers.
575    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576    convert to upper case in case of REG_ICASE, apply translation.  */
577 
578 static reg_errcode_t
579 internal_function
re_string_reconstruct(re_string_t * pstr,int idx,int eflags)580 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
581 {
582   int offset = idx - pstr->raw_mbs_idx;
583   if (BE (offset < 0, 0))
584     {
585       /* Reset buffer.  */
586 #ifdef RE_ENABLE_I18N
587       if (pstr->mb_cur_max > 1)
588 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590       pstr->len = pstr->raw_len;
591       pstr->stop = pstr->raw_stop;
592       pstr->valid_len = 0;
593       pstr->raw_mbs_idx = 0;
594       pstr->valid_raw_len = 0;
595       pstr->offsets_needed = 0;
596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598       if (!pstr->mbs_allocated)
599 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600       offset = idx;
601     }
602 
603   if (BE (offset != 0, 1))
604     {
605       /* Should the already checked characters be kept?  */
606       if (BE (offset < pstr->valid_raw_len, 1))
607 	{
608 	  /* Yes, move them to the front of the buffer.  */
609 #ifdef RE_ENABLE_I18N
610 	  if (BE (pstr->offsets_needed, 0))
611 	    {
612 	      int low = 0, high = pstr->valid_len, mid;
613 	      do
614 		{
615 		  mid = low + (high - low) / 2;
616 		  if (pstr->offsets[mid] > offset)
617 		    high = mid;
618 		  else if (pstr->offsets[mid] < offset)
619 		    low = mid + 1;
620 		  else
621 		    break;
622 		}
623 	      while (low < high);
624 	      if (pstr->offsets[mid] < offset)
625 		++mid;
626 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 							eflags);
628 	      /* This can be quite complicated, so handle specially
629 		 only the common and easy case where the character with
630 		 different length representation of lower and upper
631 		 case is present at or after offset.  */
632 	      if (pstr->valid_len > offset
633 		  && mid == offset && pstr->offsets[mid] == offset)
634 		{
635 		  memmove (pstr->wcs, pstr->wcs + offset,
636 			   (pstr->valid_len - offset) * sizeof (wint_t));
637 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 		  pstr->valid_len -= offset;
639 		  pstr->valid_raw_len -= offset;
640 		  for (low = 0; low < pstr->valid_len; low++)
641 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642 		}
643 	      else
644 		{
645 		  /* Otherwise, just find out how long the partial multibyte
646 		     character at offset is and fill it with WEOF/255.  */
647 		  pstr->len = pstr->raw_len - idx + offset;
648 		  pstr->stop = pstr->raw_stop - idx + offset;
649 		  pstr->offsets_needed = 0;
650 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 		    --mid;
652 		  while (mid < pstr->valid_len)
653 		    if (pstr->wcs[mid] != WEOF)
654 		      break;
655 		    else
656 		      ++mid;
657 		  if (mid == pstr->valid_len)
658 		    pstr->valid_len = 0;
659 		  else
660 		    {
661 		      pstr->valid_len = pstr->offsets[mid] - offset;
662 		      if (pstr->valid_len)
663 			{
664 			  for (low = 0; low < pstr->valid_len; ++low)
665 			    pstr->wcs[low] = WEOF;
666 			  memset (pstr->mbs, 255, pstr->valid_len);
667 			}
668 		    }
669 		  pstr->valid_raw_len = pstr->valid_len;
670 		}
671 	    }
672 	  else
673 #endif
674 	    {
675 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 							eflags);
677 #ifdef RE_ENABLE_I18N
678 	      if (pstr->mb_cur_max > 1)
679 		memmove (pstr->wcs, pstr->wcs + offset,
680 			 (pstr->valid_len - offset) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 	      if (BE (pstr->mbs_allocated, 0))
683 		memmove (pstr->mbs, pstr->mbs + offset,
684 			 pstr->valid_len - offset);
685 	      pstr->valid_len -= offset;
686 	      pstr->valid_raw_len -= offset;
687 #if DEBUG
688 	      assert (pstr->valid_len > 0);
689 #endif
690 	    }
691 	}
692       else
693 	{
694 #ifdef RE_ENABLE_I18N
695 	  /* No, skip all characters until IDX.  */
696 	  int prev_valid_len = pstr->valid_len;
697 
698 	  if (BE (pstr->offsets_needed, 0))
699 	    {
700 	      pstr->len = pstr->raw_len - idx + offset;
701 	      pstr->stop = pstr->raw_stop - idx + offset;
702 	      pstr->offsets_needed = 0;
703 	    }
704 #endif
705 	  pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 	  if (pstr->mb_cur_max > 1)
708 	    {
709 	      int wcs_idx;
710 	      wint_t wc = WEOF;
711 
712 	      if (pstr->is_utf8)
713 		{
714 		  const unsigned char *raw, *p, *end;
715 
716 		  /* Special case UTF-8.  Multi-byte chars start with any
717 		     byte other than 0x80 - 0xbf.  */
718 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 		  end = raw + (offset - pstr->mb_cur_max);
720 		  if (end < pstr->raw_mbs)
721 		    end = pstr->raw_mbs;
722 		  p = raw + offset - 1;
723 #ifdef _LIBC
724 		  /* We know the wchar_t encoding is UCS4, so for the simple
725 		     case, ASCII characters, skip the conversion step.  */
726 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
727 		    {
728 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 		      /* pstr->valid_len = 0; */
730 		      wc = (wchar_t) *p;
731 		    }
732 		  else
733 #endif
734 		    for (; p >= end; --p)
735 		      if ((*p & 0xc0) != 0x80)
736 			{
737 			  mbstate_t cur_state;
738 			  wchar_t wc2;
739 			  int mlen = raw + pstr->len - p;
740 			  unsigned char buf[6];
741 			  size_t mbclen;
742 
743 			  if (BE (pstr->trans != NULL, 0))
744 			    {
745 			      int i = mlen < 6 ? mlen : 6;
746 			      while (--i >= 0)
747 				buf[i] = pstr->trans[p[i]];
748 			    }
749 			  /* XXX Don't use mbrtowc, we know which conversion
750 			     to use (UTF-8 -> UCS4).  */
751 			  memset (&cur_state, 0, sizeof (cur_state));
752 			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
753 					      &cur_state);
754 			  if (raw + offset - p <= mbclen
755 			      && mbclen < (size_t) -2)
756 			    {
757 			      memset (&pstr->cur_state, '\0',
758 				      sizeof (mbstate_t));
759 			      pstr->valid_len = mbclen - (raw + offset - p);
760 			      wc = wc2;
761 			    }
762 			  break;
763 			}
764 		}
765 
766 	      if (wc == WEOF)
767 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 	      if (wc == WEOF)
769 		pstr->tip_context
770 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 	      else
772 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 				      && IS_WIDE_WORD_CHAR (wc))
774 				     ? CONTEXT_WORD
775 				     : ((IS_WIDE_NEWLINE (wc)
776 					 && pstr->newline_anchor)
777 					? CONTEXT_NEWLINE : 0));
778 	      if (BE (pstr->valid_len, 0))
779 		{
780 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 		    pstr->wcs[wcs_idx] = WEOF;
782 		  if (pstr->mbs_allocated)
783 		    memset (pstr->mbs, 255, pstr->valid_len);
784 		}
785 	      pstr->valid_raw_len = pstr->valid_len;
786 	    }
787 	  else
788 #endif /* RE_ENABLE_I18N */
789 	    {
790 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 	      pstr->valid_raw_len = 0;
792 	      if (pstr->trans)
793 		c = pstr->trans[c];
794 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 				   ? CONTEXT_WORD
796 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 				      ? CONTEXT_NEWLINE : 0));
798 	    }
799 	}
800       if (!BE (pstr->mbs_allocated, 0))
801 	pstr->mbs += offset;
802     }
803   pstr->raw_mbs_idx = idx;
804   pstr->len -= offset;
805   pstr->stop -= offset;
806 
807   /* Then build the buffers.  */
808 #ifdef RE_ENABLE_I18N
809   if (pstr->mb_cur_max > 1)
810     {
811       if (pstr->icase)
812 	{
813 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 	  if (BE (ret != REG_NOERROR, 0))
815 	    return ret;
816 	}
817       else
818 	build_wcs_buffer (pstr);
819     }
820   else
821 #endif /* RE_ENABLE_I18N */
822     if (BE (pstr->mbs_allocated, 0))
823       {
824 	if (pstr->icase)
825 	  build_upper_buffer (pstr);
826 	else if (pstr->trans != NULL)
827 	  re_string_translate_buffer (pstr);
828       }
829     else
830       pstr->valid_len = pstr->len;
831 
832   pstr->cur_idx = 0;
833   return REG_NOERROR;
834 }
835 
836 static unsigned char
internal_function(pure)837 internal_function __attribute ((pure))
838 re_string_peek_byte_case (const re_string_t *pstr, int idx)
839 {
840   int ch, off;
841 
842   /* Handle the common (easiest) cases first.  */
843   if (BE (!pstr->mbs_allocated, 1))
844     return re_string_peek_byte (pstr, idx);
845 
846 #ifdef RE_ENABLE_I18N
847   if (pstr->mb_cur_max > 1
848       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
849     return re_string_peek_byte (pstr, idx);
850 #endif
851 
852   off = pstr->cur_idx + idx;
853 #ifdef RE_ENABLE_I18N
854   if (pstr->offsets_needed)
855     off = pstr->offsets[off];
856 #endif
857 
858   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
859 
860 #ifdef RE_ENABLE_I18N
861   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
862      this function returns CAPITAL LETTER I instead of first byte of
863      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
864      since peek_byte_case doesn't advance cur_idx in any way.  */
865   if (pstr->offsets_needed && !isascii (ch))
866     return re_string_peek_byte (pstr, idx);
867 #endif
868 
869   return ch;
870 }
871 
872 static unsigned char
internal_function(pure)873 internal_function __attribute ((pure))
874 re_string_fetch_byte_case (re_string_t *pstr)
875 {
876   if (BE (!pstr->mbs_allocated, 1))
877     return re_string_fetch_byte (pstr);
878 
879 #ifdef RE_ENABLE_I18N
880   if (pstr->offsets_needed)
881     {
882       int off, ch;
883 
884       /* For tr_TR.UTF-8 [[:islower:]] there is
885 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
886 	 in that case the whole multi-byte character and return
887 	 the original letter.  On the other side, with
888 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 	 anything else would complicate things too much.  */
890 
891       if (!re_string_first_byte (pstr, pstr->cur_idx))
892 	return re_string_fetch_byte (pstr);
893 
894       off = pstr->offsets[pstr->cur_idx];
895       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896 
897       if (! isascii (ch))
898 	return re_string_fetch_byte (pstr);
899 
900       re_string_skip_bytes (pstr,
901 			    re_string_char_size_at (pstr, pstr->cur_idx));
902       return ch;
903     }
904 #endif
905 
906   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907 }
908 
909 static void
910 internal_function
re_string_destruct(re_string_t * pstr)911 re_string_destruct (re_string_t *pstr)
912 {
913 #ifdef RE_ENABLE_I18N
914   re_free (pstr->wcs);
915   re_free (pstr->offsets);
916 #endif /* RE_ENABLE_I18N  */
917   if (pstr->mbs_allocated)
918     re_free (pstr->mbs);
919 }
920 
921 /* Return the context at IDX in INPUT.  */
922 
923 static unsigned int
924 internal_function
re_string_context_at(const re_string_t * input,int idx,int eflags)925 re_string_context_at (const re_string_t *input, int idx, int eflags)
926 {
927   int c;
928   if (BE (idx < 0, 0))
929     /* In this case, we use the value stored in input->tip_context,
930        since we can't know the character in input->mbs[-1] here.  */
931     return input->tip_context;
932   if (BE (idx == input->len, 0))
933     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935 #ifdef RE_ENABLE_I18N
936   if (input->mb_cur_max > 1)
937     {
938       wint_t wc;
939       int wc_idx = idx;
940       while(input->wcs[wc_idx] == WEOF)
941 	{
942 #ifdef DEBUG
943 	  /* It must not happen.  */
944 	  assert (wc_idx >= 0);
945 #endif
946 	  --wc_idx;
947 	  if (wc_idx < 0)
948 	    return input->tip_context;
949 	}
950       wc = input->wcs[wc_idx];
951       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 	return CONTEXT_WORD;
953       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 	      ? CONTEXT_NEWLINE : 0);
955     }
956   else
957 #endif
958     {
959       c = re_string_byte_at (input, idx);
960       if (bitset_contain (input->word_char, c))
961 	return CONTEXT_WORD;
962       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963     }
964 }
965 
966 /* Functions for set operation.  */
967 
968 static reg_errcode_t
969 internal_function
re_node_set_alloc(re_node_set * set,int size)970 re_node_set_alloc (re_node_set *set, int size)
971 {
972   /*
973    * ADR: valgrind says size can be 0, which then doesn't
974    * free the block of size 0.  Harumph. This seems
975    * to work ok, though.
976    */
977   if (size == 0)
978     {
979        memset(set, 0, sizeof(*set));
980        return REG_NOERROR;
981     }
982   set->alloc = size;
983   set->nelem = 0;
984   set->elems = re_malloc (int, size);
985   if (BE (set->elems == NULL, 0))
986     return REG_ESPACE;
987   return REG_NOERROR;
988 }
989 
990 static reg_errcode_t
991 internal_function
re_node_set_init_1(re_node_set * set,int elem)992 re_node_set_init_1 (re_node_set *set, int elem)
993 {
994   set->alloc = 1;
995   set->nelem = 1;
996   set->elems = re_malloc (int, 1);
997   if (BE (set->elems == NULL, 0))
998     {
999       set->alloc = set->nelem = 0;
1000       return REG_ESPACE;
1001     }
1002   set->elems[0] = elem;
1003   return REG_NOERROR;
1004 }
1005 
1006 static reg_errcode_t
1007 internal_function
re_node_set_init_2(re_node_set * set,int elem1,int elem2)1008 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1009 {
1010   set->alloc = 2;
1011   set->elems = re_malloc (int, 2);
1012   if (BE (set->elems == NULL, 0))
1013     return REG_ESPACE;
1014   if (elem1 == elem2)
1015     {
1016       set->nelem = 1;
1017       set->elems[0] = elem1;
1018     }
1019   else
1020     {
1021       set->nelem = 2;
1022       if (elem1 < elem2)
1023 	{
1024 	  set->elems[0] = elem1;
1025 	  set->elems[1] = elem2;
1026 	}
1027       else
1028 	{
1029 	  set->elems[0] = elem2;
1030 	  set->elems[1] = elem1;
1031 	}
1032     }
1033   return REG_NOERROR;
1034 }
1035 
1036 static reg_errcode_t
1037 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1038 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1039 {
1040   dest->nelem = src->nelem;
1041   if (src->nelem > 0)
1042     {
1043       dest->alloc = dest->nelem;
1044       dest->elems = re_malloc (int, dest->alloc);
1045       if (BE (dest->elems == NULL, 0))
1046 	{
1047 	  dest->alloc = dest->nelem = 0;
1048 	  return REG_ESPACE;
1049 	}
1050       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1051     }
1052   else
1053     re_node_set_init_empty (dest);
1054   return REG_NOERROR;
1055 }
1056 
1057 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1058    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1059    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1060 
1061 static reg_errcode_t
1062 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1063 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1064 			   const re_node_set *src2)
1065 {
1066   int i1, i2, is, id, delta, sbase;
1067   if (src1->nelem == 0 || src2->nelem == 0)
1068     return REG_NOERROR;
1069 
1070   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1071      conservative estimate.  */
1072   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1073     {
1074       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1075       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1076       if (BE (new_elems == NULL, 0))
1077 	return REG_ESPACE;
1078       dest->elems = new_elems;
1079       dest->alloc = new_alloc;
1080     }
1081 
1082   /* Find the items in the intersection of SRC1 and SRC2, and copy
1083      into the top of DEST those that are not already in DEST itself.  */
1084   sbase = dest->nelem + src1->nelem + src2->nelem;
1085   i1 = src1->nelem - 1;
1086   i2 = src2->nelem - 1;
1087   id = dest->nelem - 1;
1088   for (;;)
1089     {
1090       if (src1->elems[i1] == src2->elems[i2])
1091 	{
1092 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1093 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1094 	    --id;
1095 
1096 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1097 	    dest->elems[--sbase] = src1->elems[i1];
1098 
1099 	  if (--i1 < 0 || --i2 < 0)
1100 	    break;
1101 	}
1102 
1103       /* Lower the highest of the two items.  */
1104       else if (src1->elems[i1] < src2->elems[i2])
1105 	{
1106 	  if (--i2 < 0)
1107 	    break;
1108 	}
1109       else
1110 	{
1111 	  if (--i1 < 0)
1112 	    break;
1113 	}
1114     }
1115 
1116   id = dest->nelem - 1;
1117   is = dest->nelem + src1->nelem + src2->nelem - 1;
1118   delta = is - sbase + 1;
1119 
1120   /* Now copy.  When DELTA becomes zero, the remaining
1121      DEST elements are already in place; this is more or
1122      less the same loop that is in re_node_set_merge.  */
1123   dest->nelem += delta;
1124   if (delta > 0 && id >= 0)
1125     for (;;)
1126       {
1127 	if (dest->elems[is] > dest->elems[id])
1128 	  {
1129 	    /* Copy from the top.  */
1130 	    dest->elems[id + delta--] = dest->elems[is--];
1131 	    if (delta == 0)
1132 	      break;
1133 	  }
1134 	else
1135 	  {
1136 	    /* Slide from the bottom.  */
1137 	    dest->elems[id + delta] = dest->elems[id];
1138 	    if (--id < 0)
1139 	      break;
1140 	  }
1141       }
1142 
1143   /* Copy remaining SRC elements.  */
1144   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1145 
1146   return REG_NOERROR;
1147 }
1148 
1149 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1150    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1151 
1152 static reg_errcode_t
1153 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1154 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1155 			const re_node_set *src2)
1156 {
1157   int i1, i2, id;
1158   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1159     {
1160       dest->alloc = src1->nelem + src2->nelem;
1161       dest->elems = re_malloc (int, dest->alloc);
1162       if (BE (dest->elems == NULL, 0))
1163 	return REG_ESPACE;
1164     }
1165   else
1166     {
1167       if (src1 != NULL && src1->nelem > 0)
1168 	return re_node_set_init_copy (dest, src1);
1169       else if (src2 != NULL && src2->nelem > 0)
1170 	return re_node_set_init_copy (dest, src2);
1171       else
1172 	re_node_set_init_empty (dest);
1173       return REG_NOERROR;
1174     }
1175   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1176     {
1177       if (src1->elems[i1] > src2->elems[i2])
1178 	{
1179 	  dest->elems[id++] = src2->elems[i2++];
1180 	  continue;
1181 	}
1182       if (src1->elems[i1] == src2->elems[i2])
1183 	++i2;
1184       dest->elems[id++] = src1->elems[i1++];
1185     }
1186   if (i1 < src1->nelem)
1187     {
1188       memcpy (dest->elems + id, src1->elems + i1,
1189 	     (src1->nelem - i1) * sizeof (int));
1190       id += src1->nelem - i1;
1191     }
1192   else if (i2 < src2->nelem)
1193     {
1194       memcpy (dest->elems + id, src2->elems + i2,
1195 	     (src2->nelem - i2) * sizeof (int));
1196       id += src2->nelem - i2;
1197     }
1198   dest->nelem = id;
1199   return REG_NOERROR;
1200 }
1201 
1202 /* Calculate the union set of the sets DEST and SRC. And store it to
1203    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1204 
1205 static reg_errcode_t
1206 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1207 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1208 {
1209   int is, id, sbase, delta;
1210   if (src == NULL || src->nelem == 0)
1211     return REG_NOERROR;
1212   if (dest->alloc < 2 * src->nelem + dest->nelem)
1213     {
1214       int new_alloc = 2 * (src->nelem + dest->alloc);
1215       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1216       if (BE (new_buffer == NULL, 0))
1217 	return REG_ESPACE;
1218       dest->elems = new_buffer;
1219       dest->alloc = new_alloc;
1220     }
1221 
1222   if (BE (dest->nelem == 0, 0))
1223     {
1224       dest->nelem = src->nelem;
1225       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1226       return REG_NOERROR;
1227     }
1228 
1229   /* Copy into the top of DEST the items of SRC that are not
1230      found in DEST.  Maybe we could binary search in DEST?  */
1231   for (sbase = dest->nelem + 2 * src->nelem,
1232        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1233     {
1234       if (dest->elems[id] == src->elems[is])
1235 	is--, id--;
1236       else if (dest->elems[id] < src->elems[is])
1237 	dest->elems[--sbase] = src->elems[is--];
1238       else /* if (dest->elems[id] > src->elems[is]) */
1239 	--id;
1240     }
1241 
1242   if (is >= 0)
1243     {
1244       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1245       sbase -= is + 1;
1246       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1247     }
1248 
1249   id = dest->nelem - 1;
1250   is = dest->nelem + 2 * src->nelem - 1;
1251   delta = is - sbase + 1;
1252   if (delta == 0)
1253     return REG_NOERROR;
1254 
1255   /* Now copy.  When DELTA becomes zero, the remaining
1256      DEST elements are already in place.  */
1257   dest->nelem += delta;
1258   for (;;)
1259     {
1260       if (dest->elems[is] > dest->elems[id])
1261 	{
1262 	  /* Copy from the top.  */
1263 	  dest->elems[id + delta--] = dest->elems[is--];
1264 	  if (delta == 0)
1265 	    break;
1266 	}
1267       else
1268 	{
1269 	  /* Slide from the bottom.  */
1270 	  dest->elems[id + delta] = dest->elems[id];
1271 	  if (--id < 0)
1272 	    {
1273 	      /* Copy remaining SRC elements.  */
1274 	      memcpy (dest->elems, dest->elems + sbase,
1275 		      delta * sizeof (int));
1276 	      break;
1277 	    }
1278 	}
1279     }
1280 
1281   return REG_NOERROR;
1282 }
1283 
1284 /* Insert the new element ELEM to the re_node_set* SET.
1285    SET should not already have ELEM.
1286    return -1 if an error has occurred, return 1 otherwise.  */
1287 
1288 static int
1289 internal_function
re_node_set_insert(re_node_set * set,int elem)1290 re_node_set_insert (re_node_set *set, int elem)
1291 {
1292   int idx;
1293   /* In case the set is empty.  */
1294   if (set->alloc == 0)
1295     {
1296       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1297 	return 1;
1298       else
1299 	return -1;
1300     }
1301 
1302   if (BE (set->nelem, 0) == 0)
1303     {
1304       /* We already guaranteed above that set->alloc != 0.  */
1305       set->elems[0] = elem;
1306       ++set->nelem;
1307       return 1;
1308     }
1309 
1310   /* Realloc if we need.  */
1311   if (set->alloc == set->nelem)
1312     {
1313       int *new_elems;
1314       set->alloc = set->alloc * 2;
1315       new_elems = re_realloc (set->elems, int, set->alloc);
1316       if (BE (new_elems == NULL, 0))
1317 	return -1;
1318       set->elems = new_elems;
1319     }
1320 
1321   /* Move the elements which follows the new element.  Test the
1322      first element separately to skip a check in the inner loop.  */
1323   if (elem < set->elems[0])
1324     {
1325       idx = 0;
1326       for (idx = set->nelem; idx > 0; idx--)
1327 	set->elems[idx] = set->elems[idx - 1];
1328     }
1329   else
1330     {
1331       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1332 	set->elems[idx] = set->elems[idx - 1];
1333     }
1334 
1335   /* Insert the new element.  */
1336   set->elems[idx] = elem;
1337   ++set->nelem;
1338   return 1;
1339 }
1340 
1341 /* Insert the new element ELEM to the re_node_set* SET.
1342    SET should not already have any element greater than or equal to ELEM.
1343    Return -1 if an error has occurred, return 1 otherwise.  */
1344 
1345 static int
1346 internal_function
re_node_set_insert_last(re_node_set * set,int elem)1347 re_node_set_insert_last (re_node_set *set, int elem)
1348 {
1349   /* Realloc if we need.  */
1350   if (set->alloc == set->nelem)
1351     {
1352       int *new_elems;
1353       set->alloc = (set->alloc + 1) * 2;
1354       new_elems = re_realloc (set->elems, int, set->alloc);
1355       if (BE (new_elems == NULL, 0))
1356 	return -1;
1357       set->elems = new_elems;
1358     }
1359 
1360   /* Insert the new element.  */
1361   set->elems[set->nelem++] = elem;
1362   return 1;
1363 }
1364 
1365 /* Compare two node sets SET1 and SET2.
1366    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1367 
1368 static int
internal_function(pure)1369 internal_function __attribute ((pure))
1370 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1371 {
1372   int i;
1373   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1374     return 0;
1375   for (i = set1->nelem ; --i >= 0 ; )
1376     if (set1->elems[i] != set2->elems[i])
1377       return 0;
1378   return 1;
1379 }
1380 
1381 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1382 
1383 static int
internal_function(pure)1384 internal_function __attribute ((pure))
1385 re_node_set_contains (const re_node_set *set, int elem)
1386 {
1387   unsigned int idx, right, mid;
1388   if (set->nelem <= 0)
1389     return 0;
1390 
1391   /* Binary search the element.  */
1392   idx = 0;
1393   right = set->nelem - 1;
1394   while (idx < right)
1395     {
1396       mid = idx + (right - idx) / 2;
1397       if (set->elems[mid] < elem)
1398 	idx = mid + 1;
1399       else
1400 	right = mid;
1401     }
1402   return set->elems[idx] == elem ? idx + 1 : 0;
1403 }
1404 
1405 static void
1406 internal_function
re_node_set_remove_at(re_node_set * set,int idx)1407 re_node_set_remove_at (re_node_set *set, int idx)
1408 {
1409   if (idx < 0 || idx >= set->nelem)
1410     return;
1411   --set->nelem;
1412   for (; idx < set->nelem; idx++)
1413     set->elems[idx] = set->elems[idx + 1];
1414 }
1415 
1416 
1417 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1418    Or return -1, if an error has occurred.  */
1419 
1420 static int
1421 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1422 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1423 {
1424   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1425     {
1426       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1427       int *new_nexts, *new_indices;
1428       re_node_set *new_edests, *new_eclosures;
1429       re_token_t *new_nodes;
1430 
1431       /* Avoid overflows in realloc.  */
1432       const size_t max_object_size = MAX (sizeof (re_token_t),
1433 					  MAX (sizeof (re_node_set),
1434 					       sizeof (int)));
1435       if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1436 	return -1;
1437 
1438       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1439       if (BE (new_nodes == NULL, 0))
1440 	return -1;
1441       dfa->nodes = new_nodes;
1442       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1443       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1444       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1445       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1446       if (BE (new_nexts == NULL || new_indices == NULL
1447 	      || new_edests == NULL || new_eclosures == NULL, 0))
1448 	return -1;
1449       dfa->nexts = new_nexts;
1450       dfa->org_indices = new_indices;
1451       dfa->edests = new_edests;
1452       dfa->eclosures = new_eclosures;
1453       dfa->nodes_alloc = new_nodes_alloc;
1454     }
1455   dfa->nodes[dfa->nodes_len] = token;
1456   dfa->nodes[dfa->nodes_len].constraint = 0;
1457 #ifdef RE_ENABLE_I18N
1458   dfa->nodes[dfa->nodes_len].accept_mb =
1459     (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1460 #endif
1461   dfa->nexts[dfa->nodes_len] = -1;
1462   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1463   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1464   return dfa->nodes_len++;
1465 }
1466 
1467 static inline unsigned int
1468 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1469 calc_state_hash (const re_node_set *nodes, unsigned int context)
1470 {
1471   unsigned int hash = nodes->nelem + context;
1472   int i;
1473   for (i = 0 ; i < nodes->nelem ; i++)
1474     hash += nodes->elems[i];
1475   return hash;
1476 }
1477 
1478 /* Search for the state whose node_set is equivalent to NODES.
1479    Return the pointer to the state, if we found it in the DFA.
1480    Otherwise create the new one and return it.  In case of an error
1481    return NULL and set the error code in ERR.
1482    Note: - We assume NULL as the invalid state, then it is possible that
1483 	   return value is NULL and ERR is REG_NOERROR.
1484 	 - We never return non-NULL value in case of any errors, it is for
1485 	   optimization.  */
1486 
1487 static re_dfastate_t *
1488 internal_function
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1489 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1490 		  const re_node_set *nodes)
1491 {
1492   unsigned int hash;
1493   re_dfastate_t *new_state;
1494   struct re_state_table_entry *spot;
1495   int i;
1496   if (BE (nodes->nelem == 0, 0))
1497     {
1498       *err = REG_NOERROR;
1499       return NULL;
1500     }
1501   hash = calc_state_hash (nodes, 0);
1502   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1503 
1504   for (i = 0 ; i < spot->num ; i++)
1505     {
1506       re_dfastate_t *state = spot->array[i];
1507       if (hash != state->hash)
1508 	continue;
1509       if (re_node_set_compare (&state->nodes, nodes))
1510 	return state;
1511     }
1512 
1513   /* There are no appropriate state in the dfa, create the new one.  */
1514   new_state = create_ci_newstate (dfa, nodes, hash);
1515   if (BE (new_state == NULL, 0))
1516     *err = REG_ESPACE;
1517 
1518   return new_state;
1519 }
1520 
1521 /* Search for the state whose node_set is equivalent to NODES and
1522    whose context is equivalent to CONTEXT.
1523    Return the pointer to the state, if we found it in the DFA.
1524    Otherwise create the new one and return it.  In case of an error
1525    return NULL and set the error code in ERR.
1526    Note: - We assume NULL as the invalid state, then it is possible that
1527 	   return value is NULL and ERR is REG_NOERROR.
1528 	 - We never return non-NULL value in case of any errors, it is for
1529 	   optimization.  */
1530 
1531 static re_dfastate_t *
1532 internal_function
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1533 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534 			  const re_node_set *nodes, unsigned int context)
1535 {
1536   unsigned int hash;
1537   re_dfastate_t *new_state;
1538   struct re_state_table_entry *spot;
1539   int i;
1540   if (nodes->nelem == 0)
1541     {
1542       *err = REG_NOERROR;
1543       return NULL;
1544     }
1545   hash = calc_state_hash (nodes, context);
1546   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547 
1548   for (i = 0 ; i < spot->num ; i++)
1549     {
1550       re_dfastate_t *state = spot->array[i];
1551       if (state->hash == hash
1552 	  && state->context == context
1553 	  && re_node_set_compare (state->entrance_nodes, nodes))
1554 	return state;
1555     }
1556   /* There are no appropriate state in `dfa', create the new one.  */
1557   new_state = create_cd_newstate (dfa, nodes, context, hash);
1558   if (BE (new_state == NULL, 0))
1559     *err = REG_ESPACE;
1560 
1561   return new_state;
1562 }
1563 
1564 /* Finish initialization of the new state NEWSTATE, and using its hash value
1565    HASH put in the appropriate bucket of DFA's state table.  Return value
1566    indicates the error code if failed.  */
1567 
1568 static reg_errcode_t
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,unsigned int hash)1569 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570 		unsigned int hash)
1571 {
1572   struct re_state_table_entry *spot;
1573   reg_errcode_t err;
1574   int i;
1575 
1576   newstate->hash = hash;
1577   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578   if (BE (err != REG_NOERROR, 0))
1579     return REG_ESPACE;
1580   for (i = 0; i < newstate->nodes.nelem; i++)
1581     {
1582       int elem = newstate->nodes.elems[i];
1583       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584 	if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1585 	  return REG_ESPACE;
1586     }
1587 
1588   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589   if (BE (spot->alloc <= spot->num, 0))
1590     {
1591       int new_alloc = 2 * spot->num + 2;
1592       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593 					      new_alloc);
1594       if (BE (new_array == NULL, 0))
1595 	return REG_ESPACE;
1596       spot->array = new_array;
1597       spot->alloc = new_alloc;
1598     }
1599   spot->array[spot->num++] = newstate;
1600   return REG_NOERROR;
1601 }
1602 
1603 static void
free_state(re_dfastate_t * state)1604 free_state (re_dfastate_t *state)
1605 {
1606   re_node_set_free (&state->non_eps_nodes);
1607   re_node_set_free (&state->inveclosure);
1608   if (state->entrance_nodes != &state->nodes)
1609     {
1610       re_node_set_free (state->entrance_nodes);
1611       re_free (state->entrance_nodes);
1612     }
1613   re_node_set_free (&state->nodes);
1614   re_free (state->word_trtable);
1615   re_free (state->trtable);
1616   re_free (state);
1617 }
1618 
1619 /* Create the new state which is independent of contexts.
1620    Return the new state if succeeded, otherwise return NULL.  */
1621 
1622 static re_dfastate_t *
1623 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int hash)1624 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625 		    unsigned int hash)
1626 {
1627   int i;
1628   reg_errcode_t err;
1629   re_dfastate_t *newstate;
1630 
1631   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632   if (BE (newstate == NULL, 0))
1633     return NULL;
1634   err = re_node_set_init_copy (&newstate->nodes, nodes);
1635   if (BE (err != REG_NOERROR, 0))
1636     {
1637       re_free (newstate);
1638       return NULL;
1639     }
1640 
1641   newstate->entrance_nodes = &newstate->nodes;
1642   for (i = 0 ; i < nodes->nelem ; i++)
1643     {
1644       re_token_t *node = dfa->nodes + nodes->elems[i];
1645       re_token_type_t type = node->type;
1646       if (type == CHARACTER && !node->constraint)
1647 	continue;
1648 #ifdef RE_ENABLE_I18N
1649       newstate->accept_mb |= node->accept_mb;
1650 #endif /* RE_ENABLE_I18N */
1651 
1652       /* If the state has the halt node, the state is a halt state.  */
1653       if (type == END_OF_RE)
1654 	newstate->halt = 1;
1655       else if (type == OP_BACK_REF)
1656 	newstate->has_backref = 1;
1657       else if (type == ANCHOR || node->constraint)
1658 	newstate->has_constraint = 1;
1659     }
1660   err = register_state (dfa, newstate, hash);
1661   if (BE (err != REG_NOERROR, 0))
1662     {
1663       free_state (newstate);
1664       newstate = NULL;
1665     }
1666   return newstate;
1667 }
1668 
1669 /* Create the new state which is depend on the context CONTEXT.
1670    Return the new state if succeeded, otherwise return NULL.  */
1671 
1672 static re_dfastate_t *
1673 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,unsigned int hash)1674 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675 		    unsigned int context, unsigned int hash)
1676 {
1677   int i, nctx_nodes = 0;
1678   reg_errcode_t err;
1679   re_dfastate_t *newstate;
1680 
1681   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682   if (BE (newstate == NULL, 0))
1683     return NULL;
1684   err = re_node_set_init_copy (&newstate->nodes, nodes);
1685   if (BE (err != REG_NOERROR, 0))
1686     {
1687       re_free (newstate);
1688       return NULL;
1689     }
1690 
1691   newstate->context = context;
1692   newstate->entrance_nodes = &newstate->nodes;
1693 
1694   for (i = 0 ; i < nodes->nelem ; i++)
1695     {
1696       re_token_t *node = dfa->nodes + nodes->elems[i];
1697       re_token_type_t type = node->type;
1698       unsigned int constraint = node->constraint;
1699 
1700       if (type == CHARACTER && !constraint)
1701 	continue;
1702 #ifdef RE_ENABLE_I18N
1703       newstate->accept_mb |= node->accept_mb;
1704 #endif /* RE_ENABLE_I18N */
1705 
1706       /* If the state has the halt node, the state is a halt state.  */
1707       if (type == END_OF_RE)
1708 	newstate->halt = 1;
1709       else if (type == OP_BACK_REF)
1710 	newstate->has_backref = 1;
1711 
1712       if (constraint)
1713 	{
1714 	  if (newstate->entrance_nodes == &newstate->nodes)
1715 	    {
1716 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717 	      if (BE (newstate->entrance_nodes == NULL, 0))
1718 		{
1719 		  free_state (newstate);
1720 		  return NULL;
1721 		}
1722 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723 		  != REG_NOERROR)
1724 		return NULL;
1725 	      nctx_nodes = 0;
1726 	      newstate->has_constraint = 1;
1727 	    }
1728 
1729 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1730 	    {
1731 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1732 	      ++nctx_nodes;
1733 	    }
1734 	}
1735     }
1736   err = register_state (dfa, newstate, hash);
1737   if (BE (err != REG_NOERROR, 0))
1738     {
1739       free_state (newstate);
1740       newstate = NULL;
1741     }
1742   return  newstate;
1743 }
1744