1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
21 static void re_string_construct_common (const char *str, int len,
22 					re_string_t *pstr,
23 					RE_TRANSLATE_TYPE trans, int icase,
24 					const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 					  const re_node_set *nodes,
27 					  unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 					  const re_node_set *nodes,
30 					  unsigned int context,
31 					  unsigned int hash) internal_function;
32 
33 /* Functions for string operation.  */
34 
35 /* This function allocate the buffers.  It is necessary to call
36    re_string_reconstruct before using the object.  */
37 
38 static reg_errcode_t
39 internal_function
re_string_allocate(re_string_t * pstr,const char * str,int len,int init_len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
42 {
43   reg_errcode_t ret;
44   int init_buf_len;
45 
46   /* Ensure at least one character fits into the buffers.  */
47   if (init_len < dfa->mb_cur_max)
48     init_len = dfa->mb_cur_max;
49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 
52   ret = re_string_realloc_buffers (pstr, init_buf_len);
53   if (BE (ret != REG_NOERROR, 0))
54     return ret;
55 
56   pstr->word_char = dfa->word_char;
57   pstr->word_ops_used = dfa->word_ops_used;
58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60   pstr->valid_raw_len = pstr->valid_len;
61   return REG_NOERROR;
62 }
63 
64 /* This function allocate the buffers, and initialize them.  */
65 
66 static reg_errcode_t
67 internal_function
re_string_construct(re_string_t * pstr,const char * str,int len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
70 {
71   reg_errcode_t ret;
72   memset (pstr, '\0', sizeof (re_string_t));
73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
74 
75   if (len > 0)
76     {
77       ret = re_string_realloc_buffers (pstr, len + 1);
78       if (BE (ret != REG_NOERROR, 0))
79 	return ret;
80     }
81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82 
83   if (icase)
84     {
85 #ifdef RE_ENABLE_I18N
86       if (dfa->mb_cur_max > 1)
87 	{
88 	  while (1)
89 	    {
90 	      ret = build_wcs_upper_buffer (pstr);
91 	      if (BE (ret != REG_NOERROR, 0))
92 		return ret;
93 	      if (pstr->valid_raw_len >= len)
94 		break;
95 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 		break;
97 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 	      if (BE (ret != REG_NOERROR, 0))
99 		return ret;
100 	    }
101 	}
102       else
103 #endif /* RE_ENABLE_I18N  */
104 	build_upper_buffer (pstr);
105     }
106   else
107     {
108 #ifdef RE_ENABLE_I18N
109       if (dfa->mb_cur_max > 1)
110 	build_wcs_buffer (pstr);
111       else
112 #endif /* RE_ENABLE_I18N  */
113 	{
114 	  if (trans != NULL)
115 	    re_string_translate_buffer (pstr);
116 	  else
117 	    {
118 	      pstr->valid_len = pstr->bufs_len;
119 	      pstr->valid_raw_len = pstr->bufs_len;
120 	    }
121 	}
122     }
123 
124   return REG_NOERROR;
125 }
126 
127 /* Helper functions for re_string_allocate, and re_string_construct.  */
128 
129 static reg_errcode_t
130 internal_function
re_string_realloc_buffers(re_string_t * pstr,int new_buf_len)131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132 {
133 #ifdef RE_ENABLE_I18N
134   if (pstr->mb_cur_max > 1)
135     {
136       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
137       if (BE (new_wcs == NULL, 0))
138 	return REG_ESPACE;
139       pstr->wcs = new_wcs;
140       if (pstr->offsets != NULL)
141 	{
142 	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
143 	  if (BE (new_offsets == NULL, 0))
144 	    return REG_ESPACE;
145 	  pstr->offsets = new_offsets;
146 	}
147     }
148 #endif /* RE_ENABLE_I18N  */
149   if (pstr->mbs_allocated)
150     {
151       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 					   new_buf_len);
153       if (BE (new_mbs == NULL, 0))
154 	return REG_ESPACE;
155       pstr->mbs = new_mbs;
156     }
157   pstr->bufs_len = new_buf_len;
158   return REG_NOERROR;
159 }
160 
161 
162 static void
163 internal_function
re_string_construct_common(const char * str,int len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
165 			    RE_TRANSLATE_TYPE trans, int icase,
166 			    const re_dfa_t *dfa)
167 {
168   pstr->raw_mbs = (const unsigned char *) str;
169   pstr->len = len;
170   pstr->raw_len = len;
171   pstr->trans = trans;
172   pstr->icase = icase ? 1 : 0;
173   pstr->mbs_allocated = (trans != NULL || icase);
174   pstr->mb_cur_max = dfa->mb_cur_max;
175   pstr->is_utf8 = dfa->is_utf8;
176   pstr->map_notascii = dfa->map_notascii;
177   pstr->stop = pstr->len;
178   pstr->raw_stop = pstr->stop;
179 }
180 
181 #ifdef RE_ENABLE_I18N
182 
183 /* Build wide character buffer PSTR->WCS.
184    If the byte sequence of the string are:
185      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
186    Then wide character buffer will be:
187      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
188    We use WEOF for padding, they indicate that the position isn't
189    a first byte of a multibyte character.
190 
191    Note that this function assumes PSTR->VALID_LEN elements are already
192    built and starts from PSTR->VALID_LEN.  */
193 
194 static void
195 internal_function
build_wcs_buffer(re_string_t * pstr)196 build_wcs_buffer (re_string_t *pstr)
197 {
198 #ifdef _LIBC
199   unsigned char buf[MB_LEN_MAX];
200   assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 #else
202   unsigned char buf[64];
203 #endif
204   mbstate_t prev_st;
205   int byte_idx, end_idx, remain_len;
206   size_t mbclen;
207 
208   /* Build the buffers from pstr->valid_len to either pstr->len or
209      pstr->bufs_len.  */
210   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
211   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
212     {
213       wchar_t wc;
214       const char *p;
215 
216       remain_len = end_idx - byte_idx;
217       prev_st = pstr->cur_state;
218       /* Apply the translation if we need.  */
219       if (BE (pstr->trans != NULL, 0))
220 	{
221 	  int i, ch;
222 
223 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 	    {
225 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
226 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 	    }
228 	  p = (const char *) buf;
229 	}
230       else
231 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
232       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
233       if (BE (mbclen == (size_t) -2, 0))
234 	{
235 	  /* The buffer doesn't have enough space, finish to build.  */
236 	  pstr->cur_state = prev_st;
237 	  break;
238 	}
239       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 	{
241 	  /* We treat these cases as a singlebyte character.  */
242 	  mbclen = 1;
243 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244 	  if (BE (pstr->trans != NULL, 0))
245 	    wc = pstr->trans[wc];
246 	  pstr->cur_state = prev_st;
247 	}
248 
249       /* Write wide character and padding.  */
250       pstr->wcs[byte_idx++] = wc;
251       /* Write paddings.  */
252       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
253 	pstr->wcs[byte_idx++] = WEOF;
254     }
255   pstr->valid_len = byte_idx;
256   pstr->valid_raw_len = byte_idx;
257 }
258 
259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
260    but for REG_ICASE.  */
261 
262 static reg_errcode_t
263 internal_function
build_wcs_upper_buffer(re_string_t * pstr)264 build_wcs_upper_buffer (re_string_t *pstr)
265 {
266   mbstate_t prev_st;
267   int src_idx, byte_idx, end_idx, remain_len;
268   size_t mbclen;
269 #ifdef _LIBC
270   char buf[MB_LEN_MAX];
271   assert (MB_LEN_MAX >= pstr->mb_cur_max);
272 #else
273   char buf[64];
274 #endif
275 
276   byte_idx = pstr->valid_len;
277   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278 
279   /* The following optimization assumes that ASCII characters can be
280      mapped to wide characters with a simple cast.  */
281   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282     {
283       while (byte_idx < end_idx)
284 	{
285 	  wchar_t wc;
286 
287 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
288 	      && mbsinit (&pstr->cur_state))
289 	    {
290 	      /* In case of a singlebyte character.  */
291 	      pstr->mbs[byte_idx]
292 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
293 	      /* The next step uses the assumption that wchar_t is encoded
294 		 ASCII-safe: all ASCII values can be converted like this.  */
295 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
296 	      ++byte_idx;
297 	      continue;
298 	    }
299 
300 	  remain_len = end_idx - byte_idx;
301 	  prev_st = pstr->cur_state;
302 	  mbclen = mbrtowc (&wc,
303 			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
304 			     + byte_idx), remain_len, &pstr->cur_state);
305 	  if (BE (mbclen + 2 > 2, 1))
306 	    {
307 	      wchar_t wcu = wc;
308 	      if (iswlower (wc))
309 		{
310 		  size_t mbcdlen;
311 
312 		  wcu = towupper (wc);
313 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
314 		  if (BE (mbclen == mbcdlen, 1))
315 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
316 		  else
317 		    {
318 		      src_idx = byte_idx;
319 		      goto offsets_needed;
320 		    }
321 		}
322 	      else
323 		memcpy (pstr->mbs + byte_idx,
324 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
325 	      pstr->wcs[byte_idx++] = wcu;
326 	      /* Write paddings.  */
327 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
328 		pstr->wcs[byte_idx++] = WEOF;
329 	    }
330 	  else if (mbclen == (size_t) -1 || mbclen == 0)
331 	    {
332 	      /* It is an invalid character or '\0'.  Just use the byte.  */
333 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
334 	      pstr->mbs[byte_idx] = ch;
335 	      /* And also cast it to wide char.  */
336 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
337 	      if (BE (mbclen == (size_t) -1, 0))
338 		pstr->cur_state = prev_st;
339 	    }
340 	  else
341 	    {
342 	      /* The buffer doesn't have enough space, finish to build.  */
343 	      pstr->cur_state = prev_st;
344 	      break;
345 	    }
346 	}
347       pstr->valid_len = byte_idx;
348       pstr->valid_raw_len = byte_idx;
349       return REG_NOERROR;
350     }
351   else
352     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
353       {
354 	wchar_t wc;
355 	const char *p;
356       offsets_needed:
357 	remain_len = end_idx - byte_idx;
358 	prev_st = pstr->cur_state;
359 	if (BE (pstr->trans != NULL, 0))
360 	  {
361 	    int i, ch;
362 
363 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 	      {
365 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
366 		buf[i] = pstr->trans[ch];
367 	      }
368 	    p = (const char *) buf;
369 	  }
370 	else
371 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
372 	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
373 	if (BE (mbclen + 2 > 2, 1))
374 	  {
375 	    wchar_t wcu = wc;
376 	    if (iswlower (wc))
377 	      {
378 		size_t mbcdlen;
379 
380 		wcu = towupper (wc);
381 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
382 		if (BE (mbclen == mbcdlen, 1))
383 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
384 		else if (mbcdlen != (size_t) -1)
385 		  {
386 		    size_t i;
387 
388 		    if (byte_idx + mbcdlen > pstr->bufs_len)
389 		      {
390 			pstr->cur_state = prev_st;
391 			break;
392 		      }
393 
394 		    if (pstr->offsets == NULL)
395 		      {
396 			pstr->offsets = re_malloc (int, pstr->bufs_len);
397 
398 			if (pstr->offsets == NULL)
399 			  return REG_ESPACE;
400 		      }
401 		    if (!pstr->offsets_needed)
402 		      {
403 			for (i = 0; i < (size_t) byte_idx; ++i)
404 			  pstr->offsets[i] = i;
405 			pstr->offsets_needed = 1;
406 		      }
407 
408 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
409 		    pstr->wcs[byte_idx] = wcu;
410 		    pstr->offsets[byte_idx] = src_idx;
411 		    for (i = 1; i < mbcdlen; ++i)
412 		      {
413 			pstr->offsets[byte_idx + i]
414 			  = src_idx + (i < mbclen ? i : mbclen - 1);
415 			pstr->wcs[byte_idx + i] = WEOF;
416 		      }
417 		    pstr->len += mbcdlen - mbclen;
418 		    if (pstr->raw_stop > src_idx)
419 		      pstr->stop += mbcdlen - mbclen;
420 		    end_idx = (pstr->bufs_len > pstr->len)
421 			      ? pstr->len : pstr->bufs_len;
422 		    byte_idx += mbcdlen;
423 		    src_idx += mbclen;
424 		    continue;
425 		  }
426                 else
427                   memcpy (pstr->mbs + byte_idx, p, mbclen);
428 	      }
429 	    else
430 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
431 
432 	    if (BE (pstr->offsets_needed != 0, 0))
433 	      {
434 		size_t i;
435 		for (i = 0; i < mbclen; ++i)
436 		  pstr->offsets[byte_idx + i] = src_idx + i;
437 	      }
438 	    src_idx += mbclen;
439 
440 	    pstr->wcs[byte_idx++] = wcu;
441 	    /* Write paddings.  */
442 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
443 	      pstr->wcs[byte_idx++] = WEOF;
444 	  }
445 	else if (mbclen == (size_t) -1 || mbclen == 0)
446 	  {
447 	    /* It is an invalid character or '\0'.  Just use the byte.  */
448 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449 
450 	    if (BE (pstr->trans != NULL, 0))
451 	      ch = pstr->trans [ch];
452 	    pstr->mbs[byte_idx] = ch;
453 
454 	    if (BE (pstr->offsets_needed != 0, 0))
455 	      pstr->offsets[byte_idx] = src_idx;
456 	    ++src_idx;
457 
458 	    /* And also cast it to wide char.  */
459 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
460 	    if (BE (mbclen == (size_t) -1, 0))
461 	      pstr->cur_state = prev_st;
462 	  }
463 	else
464 	  {
465 	    /* The buffer doesn't have enough space, finish to build.  */
466 	    pstr->cur_state = prev_st;
467 	    break;
468 	  }
469       }
470   pstr->valid_len = byte_idx;
471   pstr->valid_raw_len = src_idx;
472   return REG_NOERROR;
473 }
474 
475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
476    Return the index.  */
477 
478 static int
479 internal_function
re_string_skip_chars(re_string_t * pstr,int new_raw_idx,wint_t * last_wc)480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
481 {
482   mbstate_t prev_st;
483   int rawbuf_idx;
484   size_t mbclen;
485   wchar_t wc = WEOF;
486 
487   /* Skip the characters which are not necessary to check.  */
488   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
489        rawbuf_idx < new_raw_idx;)
490     {
491       int remain_len;
492       remain_len = pstr->len - rawbuf_idx;
493       prev_st = pstr->cur_state;
494       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
495 			remain_len, &pstr->cur_state);
496       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 	{
498 	  /* We treat these cases as a single byte character.  */
499 	  if (mbclen == 0 || remain_len == 0)
500 	    wc = L'\0';
501 	  else
502 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
503 	  mbclen = 1;
504 	  pstr->cur_state = prev_st;
505 	}
506       /* Then proceed the next character.  */
507       rawbuf_idx += mbclen;
508     }
509   *last_wc = (wint_t) wc;
510   return rawbuf_idx;
511 }
512 #endif /* RE_ENABLE_I18N  */
513 
514 /* Build the buffer PSTR->MBS, and apply the translation if we need.
515    This function is used in case of REG_ICASE.  */
516 
517 static void
518 internal_function
build_upper_buffer(re_string_t * pstr)519 build_upper_buffer (re_string_t *pstr)
520 {
521   int char_idx, end_idx;
522   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
523 
524   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
525     {
526       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
527       if (BE (pstr->trans != NULL, 0))
528 	ch = pstr->trans[ch];
529       if (islower (ch))
530 	pstr->mbs[char_idx] = toupper (ch);
531       else
532 	pstr->mbs[char_idx] = ch;
533     }
534   pstr->valid_len = char_idx;
535   pstr->valid_raw_len = char_idx;
536 }
537 
538 /* Apply TRANS to the buffer in PSTR.  */
539 
540 static void
541 internal_function
re_string_translate_buffer(re_string_t * pstr)542 re_string_translate_buffer (re_string_t *pstr)
543 {
544   int buf_idx, end_idx;
545   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
546 
547   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
548     {
549       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
550       pstr->mbs[buf_idx] = pstr->trans[ch];
551     }
552 
553   pstr->valid_len = buf_idx;
554   pstr->valid_raw_len = buf_idx;
555 }
556 
557 /* This function re-construct the buffers.
558    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
559    convert to upper case in case of REG_ICASE, apply translation.  */
560 
561 static reg_errcode_t
562 internal_function
re_string_reconstruct(re_string_t * pstr,int idx,int eflags)563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
564 {
565   int offset = idx - pstr->raw_mbs_idx;
566   if (BE (offset < 0, 0))
567     {
568       /* Reset buffer.  */
569 #ifdef RE_ENABLE_I18N
570       if (pstr->mb_cur_max > 1)
571 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
572 #endif /* RE_ENABLE_I18N */
573       pstr->len = pstr->raw_len;
574       pstr->stop = pstr->raw_stop;
575       pstr->valid_len = 0;
576       pstr->raw_mbs_idx = 0;
577       pstr->valid_raw_len = 0;
578       pstr->offsets_needed = 0;
579       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
580 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
581       if (!pstr->mbs_allocated)
582 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
583       offset = idx;
584     }
585 
586   if (BE (offset != 0, 1))
587     {
588       /* Should the already checked characters be kept?  */
589       if (BE (offset < pstr->valid_raw_len, 1))
590 	{
591 	  /* Yes, move them to the front of the buffer.  */
592 #ifdef RE_ENABLE_I18N
593 	  if (BE (pstr->offsets_needed, 0))
594 	    {
595 	      int low = 0, high = pstr->valid_len, mid;
596 	      do
597 		{
598 		  mid = (high + low) / 2;
599 		  if (pstr->offsets[mid] > offset)
600 		    high = mid;
601 		  else if (pstr->offsets[mid] < offset)
602 		    low = mid + 1;
603 		  else
604 		    break;
605 		}
606 	      while (low < high);
607 	      if (pstr->offsets[mid] < offset)
608 		++mid;
609 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
610 							eflags);
611 	      /* This can be quite complicated, so handle specially
612 		 only the common and easy case where the character with
613 		 different length representation of lower and upper
614 		 case is present at or after offset.  */
615 	      if (pstr->valid_len > offset
616 		  && mid == offset && pstr->offsets[mid] == offset)
617 		{
618 		  memmove (pstr->wcs, pstr->wcs + offset,
619 			   (pstr->valid_len - offset) * sizeof (wint_t));
620 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
621 		  pstr->valid_len -= offset;
622 		  pstr->valid_raw_len -= offset;
623 		  for (low = 0; low < pstr->valid_len; low++)
624 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
625 		}
626 	      else
627 		{
628 		  /* Otherwise, just find out how long the partial multibyte
629 		     character at offset is and fill it with WEOF/255.  */
630 		  pstr->len = pstr->raw_len - idx + offset;
631 		  pstr->stop = pstr->raw_stop - idx + offset;
632 		  pstr->offsets_needed = 0;
633 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
634 		    --mid;
635 		  while (mid < pstr->valid_len)
636 		    if (pstr->wcs[mid] != WEOF)
637 		      break;
638 		    else
639 		      ++mid;
640 		  if (mid == pstr->valid_len)
641 		    pstr->valid_len = 0;
642 		  else
643 		    {
644 		      pstr->valid_len = pstr->offsets[mid] - offset;
645 		      if (pstr->valid_len)
646 			{
647 			  for (low = 0; low < pstr->valid_len; ++low)
648 			    pstr->wcs[low] = WEOF;
649 			  memset (pstr->mbs, 255, pstr->valid_len);
650 			}
651 		    }
652 		  pstr->valid_raw_len = pstr->valid_len;
653 		}
654 	    }
655 	  else
656 #endif
657 	    {
658 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
659 							eflags);
660 #ifdef RE_ENABLE_I18N
661 	      if (pstr->mb_cur_max > 1)
662 		memmove (pstr->wcs, pstr->wcs + offset,
663 			 (pstr->valid_len - offset) * sizeof (wint_t));
664 #endif /* RE_ENABLE_I18N */
665 	      if (BE (pstr->mbs_allocated, 0))
666 		memmove (pstr->mbs, pstr->mbs + offset,
667 			 pstr->valid_len - offset);
668 	      pstr->valid_len -= offset;
669 	      pstr->valid_raw_len -= offset;
670 #if DEBUG
671 	      assert (pstr->valid_len > 0);
672 #endif
673 	    }
674 	}
675       else
676 	{
677 #ifdef RE_ENABLE_I18N
678 	  /* No, skip all characters until IDX.  */
679 	  int prev_valid_len = pstr->valid_len;
680 
681 	  if (BE (pstr->offsets_needed, 0))
682 	    {
683 	      pstr->len = pstr->raw_len - idx + offset;
684 	      pstr->stop = pstr->raw_stop - idx + offset;
685 	      pstr->offsets_needed = 0;
686 	    }
687 #endif
688 	  pstr->valid_len = 0;
689 #ifdef RE_ENABLE_I18N
690 	  if (pstr->mb_cur_max > 1)
691 	    {
692 	      int wcs_idx;
693 	      wint_t wc = WEOF;
694 
695 	      if (pstr->is_utf8)
696 		{
697 		  const unsigned char *raw, *p, *q, *end;
698 
699 		  /* Special case UTF-8.  Multi-byte chars start with any
700 		     byte other than 0x80 - 0xbf.  */
701 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
702 		  end = raw + (offset - pstr->mb_cur_max);
703 		  if (end < pstr->raw_mbs)
704 		    end = pstr->raw_mbs;
705 		  p = raw + offset - 1;
706 #ifdef _LIBC
707 		  /* We know the wchar_t encoding is UCS4, so for the simple
708 		     case, ASCII characters, skip the conversion step.  */
709 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
710 		    {
711 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
712 		      /* pstr->valid_len = 0; */
713 		      wc = (wchar_t) *p;
714 		    }
715 		  else
716 #endif
717 		    for (; p >= end; --p)
718 		      if ((*p & 0xc0) != 0x80)
719 			{
720 			  mbstate_t cur_state;
721 			  wchar_t wc2;
722 			  int mlen = raw + pstr->len - p;
723 			  unsigned char buf[6];
724 			  size_t mbclen;
725 
726 			  q = p;
727 			  if (BE (pstr->trans != NULL, 0))
728 			    {
729 			      int i = mlen < 6 ? mlen : 6;
730 			      while (--i >= 0)
731 				buf[i] = pstr->trans[p[i]];
732 			      q = buf;
733 			    }
734 			  /* XXX Don't use mbrtowc, we know which conversion
735 			     to use (UTF-8 -> UCS4).  */
736 			  memset (&cur_state, 0, sizeof (cur_state));
737 			  mbclen = mbrtowc (&wc2, (const char *) p, mlen,
738 					    &cur_state);
739 			  if (raw + offset - p <= mbclen
740 			      && mbclen < (size_t) -2)
741 			    {
742 			      memset (&pstr->cur_state, '\0',
743 				      sizeof (mbstate_t));
744 			      pstr->valid_len = mbclen - (raw + offset - p);
745 			      wc = wc2;
746 			    }
747 			  break;
748 			}
749 		}
750 
751 	      if (wc == WEOF)
752 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
753 	      if (wc == WEOF)
754 		pstr->tip_context
755 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
756 	      else
757 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
758 				      && IS_WIDE_WORD_CHAR (wc))
759 				     ? CONTEXT_WORD
760 				     : ((IS_WIDE_NEWLINE (wc)
761 					 && pstr->newline_anchor)
762 					? CONTEXT_NEWLINE : 0));
763 	      if (BE (pstr->valid_len, 0))
764 		{
765 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
766 		    pstr->wcs[wcs_idx] = WEOF;
767 		  if (pstr->mbs_allocated)
768 		    memset (pstr->mbs, 255, pstr->valid_len);
769 		}
770 	      pstr->valid_raw_len = pstr->valid_len;
771 	    }
772 	  else
773 #endif /* RE_ENABLE_I18N */
774 	    {
775 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
776 	      pstr->valid_raw_len = 0;
777 	      if (pstr->trans)
778 		c = pstr->trans[c];
779 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
780 				   ? CONTEXT_WORD
781 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
782 				      ? CONTEXT_NEWLINE : 0));
783 	    }
784 	}
785       if (!BE (pstr->mbs_allocated, 0))
786 	pstr->mbs += offset;
787     }
788   pstr->raw_mbs_idx = idx;
789   pstr->len -= offset;
790   pstr->stop -= offset;
791 
792   /* Then build the buffers.  */
793 #ifdef RE_ENABLE_I18N
794   if (pstr->mb_cur_max > 1)
795     {
796       if (pstr->icase)
797 	{
798 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
799 	  if (BE (ret != REG_NOERROR, 0))
800 	    return ret;
801 	}
802       else
803 	build_wcs_buffer (pstr);
804     }
805   else
806 #endif /* RE_ENABLE_I18N */
807     if (BE (pstr->mbs_allocated, 0))
808       {
809 	if (pstr->icase)
810 	  build_upper_buffer (pstr);
811 	else if (pstr->trans != NULL)
812 	  re_string_translate_buffer (pstr);
813       }
814     else
815       pstr->valid_len = pstr->len;
816 
817   pstr->cur_idx = 0;
818   return REG_NOERROR;
819 }
820 
821 static unsigned char
internal_function(pure)822 internal_function __attribute ((pure))
823 re_string_peek_byte_case (const re_string_t *pstr, int idx)
824 {
825   int ch, off;
826 
827   /* Handle the common (easiest) cases first.  */
828   if (BE (!pstr->mbs_allocated, 1))
829     return re_string_peek_byte (pstr, idx);
830 
831 #ifdef RE_ENABLE_I18N
832   if (pstr->mb_cur_max > 1
833       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
834     return re_string_peek_byte (pstr, idx);
835 #endif
836 
837   off = pstr->cur_idx + idx;
838 #ifdef RE_ENABLE_I18N
839   if (pstr->offsets_needed)
840     off = pstr->offsets[off];
841 #endif
842 
843   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
844 
845 #ifdef RE_ENABLE_I18N
846   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
847      this function returns CAPITAL LETTER I instead of first byte of
848      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
849      since peek_byte_case doesn't advance cur_idx in any way.  */
850   if (pstr->offsets_needed && !isascii (ch))
851     return re_string_peek_byte (pstr, idx);
852 #endif
853 
854   return ch;
855 }
856 
857 static unsigned char
internal_function(pure)858 internal_function __attribute ((pure))
859 re_string_fetch_byte_case (re_string_t *pstr)
860 {
861   if (BE (!pstr->mbs_allocated, 1))
862     return re_string_fetch_byte (pstr);
863 
864 #ifdef RE_ENABLE_I18N
865   if (pstr->offsets_needed)
866     {
867       int off, ch;
868 
869       /* For tr_TR.UTF-8 [[:islower:]] there is
870 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
871 	 in that case the whole multi-byte character and return
872 	 the original letter.  On the other side, with
873 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
874 	 anything else would complicate things too much.  */
875 
876       if (!re_string_first_byte (pstr, pstr->cur_idx))
877 	return re_string_fetch_byte (pstr);
878 
879       off = pstr->offsets[pstr->cur_idx];
880       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
881 
882       if (! isascii (ch))
883 	return re_string_fetch_byte (pstr);
884 
885       re_string_skip_bytes (pstr,
886 			    re_string_char_size_at (pstr, pstr->cur_idx));
887       return ch;
888     }
889 #endif
890 
891   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
892 }
893 
894 static void
895 internal_function
re_string_destruct(re_string_t * pstr)896 re_string_destruct (re_string_t *pstr)
897 {
898 #ifdef RE_ENABLE_I18N
899   re_free (pstr->wcs);
900   re_free (pstr->offsets);
901 #endif /* RE_ENABLE_I18N  */
902   if (pstr->mbs_allocated)
903     re_free (pstr->mbs);
904 }
905 
906 /* Return the context at IDX in INPUT.  */
907 
908 static unsigned int
909 internal_function
re_string_context_at(const re_string_t * input,int idx,int eflags)910 re_string_context_at (const re_string_t *input, int idx, int eflags)
911 {
912   int c;
913   if (BE (idx < 0, 0))
914     /* In this case, we use the value stored in input->tip_context,
915        since we can't know the character in input->mbs[-1] here.  */
916     return input->tip_context;
917   if (BE (idx == input->len, 0))
918     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
919 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
920 #ifdef RE_ENABLE_I18N
921   if (input->mb_cur_max > 1)
922     {
923       wint_t wc;
924       int wc_idx = idx;
925       while(input->wcs[wc_idx] == WEOF)
926 	{
927 #ifdef DEBUG
928 	  /* It must not happen.  */
929 	  assert (wc_idx >= 0);
930 #endif
931 	  --wc_idx;
932 	  if (wc_idx < 0)
933 	    return input->tip_context;
934 	}
935       wc = input->wcs[wc_idx];
936       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
937 	return CONTEXT_WORD;
938       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
939 	      ? CONTEXT_NEWLINE : 0);
940     }
941   else
942 #endif
943     {
944       c = re_string_byte_at (input, idx);
945       if (bitset_contain (input->word_char, c))
946 	return CONTEXT_WORD;
947       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
948     }
949 }
950 
951 /* Functions for set operation.  */
952 
953 static reg_errcode_t
954 internal_function
re_node_set_alloc(re_node_set * set,int size)955 re_node_set_alloc (re_node_set *set, int size)
956 {
957   set->alloc = size;
958   set->nelem = 0;
959   set->elems = re_malloc (int, size);
960   if (BE (set->elems == NULL, 0))
961     return REG_ESPACE;
962   return REG_NOERROR;
963 }
964 
965 static reg_errcode_t
966 internal_function
re_node_set_init_1(re_node_set * set,int elem)967 re_node_set_init_1 (re_node_set *set, int elem)
968 {
969   set->alloc = 1;
970   set->nelem = 1;
971   set->elems = re_malloc (int, 1);
972   if (BE (set->elems == NULL, 0))
973     {
974       set->alloc = set->nelem = 0;
975       return REG_ESPACE;
976     }
977   set->elems[0] = elem;
978   return REG_NOERROR;
979 }
980 
981 static reg_errcode_t
982 internal_function
re_node_set_init_2(re_node_set * set,int elem1,int elem2)983 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
984 {
985   set->alloc = 2;
986   set->elems = re_malloc (int, 2);
987   if (BE (set->elems == NULL, 0))
988     return REG_ESPACE;
989   if (elem1 == elem2)
990     {
991       set->nelem = 1;
992       set->elems[0] = elem1;
993     }
994   else
995     {
996       set->nelem = 2;
997       if (elem1 < elem2)
998 	{
999 	  set->elems[0] = elem1;
1000 	  set->elems[1] = elem2;
1001 	}
1002       else
1003 	{
1004 	  set->elems[0] = elem2;
1005 	  set->elems[1] = elem1;
1006 	}
1007     }
1008   return REG_NOERROR;
1009 }
1010 
1011 static reg_errcode_t
1012 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1013 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1014 {
1015   dest->nelem = src->nelem;
1016   if (src->nelem > 0)
1017     {
1018       dest->alloc = dest->nelem;
1019       dest->elems = re_malloc (int, dest->alloc);
1020       if (BE (dest->elems == NULL, 0))
1021 	{
1022 	  dest->alloc = dest->nelem = 0;
1023 	  return REG_ESPACE;
1024 	}
1025       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1026     }
1027   else
1028     re_node_set_init_empty (dest);
1029   return REG_NOERROR;
1030 }
1031 
1032 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1033    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1034    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1035 
1036 static reg_errcode_t
1037 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1038 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1039 			   const re_node_set *src2)
1040 {
1041   int i1, i2, is, id, delta, sbase;
1042   if (src1->nelem == 0 || src2->nelem == 0)
1043     return REG_NOERROR;
1044 
1045   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1046      conservative estimate.  */
1047   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1048     {
1049       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1050       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1051       if (BE (new_elems == NULL, 0))
1052         return REG_ESPACE;
1053       dest->elems = new_elems;
1054       dest->alloc = new_alloc;
1055     }
1056 
1057   /* Find the items in the intersection of SRC1 and SRC2, and copy
1058      into the top of DEST those that are not already in DEST itself.  */
1059   sbase = dest->nelem + src1->nelem + src2->nelem;
1060   i1 = src1->nelem - 1;
1061   i2 = src2->nelem - 1;
1062   id = dest->nelem - 1;
1063   for (;;)
1064     {
1065       if (src1->elems[i1] == src2->elems[i2])
1066 	{
1067 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1068 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1069 	    --id;
1070 
1071           if (id < 0 || dest->elems[id] != src1->elems[i1])
1072             dest->elems[--sbase] = src1->elems[i1];
1073 
1074 	  if (--i1 < 0 || --i2 < 0)
1075 	    break;
1076 	}
1077 
1078       /* Lower the highest of the two items.  */
1079       else if (src1->elems[i1] < src2->elems[i2])
1080 	{
1081 	  if (--i2 < 0)
1082 	    break;
1083 	}
1084       else
1085 	{
1086 	  if (--i1 < 0)
1087 	    break;
1088 	}
1089     }
1090 
1091   id = dest->nelem - 1;
1092   is = dest->nelem + src1->nelem + src2->nelem - 1;
1093   delta = is - sbase + 1;
1094 
1095   /* Now copy.  When DELTA becomes zero, the remaining
1096      DEST elements are already in place; this is more or
1097      less the same loop that is in re_node_set_merge.  */
1098   dest->nelem += delta;
1099   if (delta > 0 && id >= 0)
1100     for (;;)
1101       {
1102         if (dest->elems[is] > dest->elems[id])
1103           {
1104             /* Copy from the top.  */
1105             dest->elems[id + delta--] = dest->elems[is--];
1106             if (delta == 0)
1107               break;
1108           }
1109         else
1110           {
1111             /* Slide from the bottom.  */
1112             dest->elems[id + delta] = dest->elems[id];
1113             if (--id < 0)
1114               break;
1115           }
1116       }
1117 
1118   /* Copy remaining SRC elements.  */
1119   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1120 
1121   return REG_NOERROR;
1122 }
1123 
1124 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1125    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1126 
1127 static reg_errcode_t
1128 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1129 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1130 			const re_node_set *src2)
1131 {
1132   int i1, i2, id;
1133   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1134     {
1135       dest->alloc = src1->nelem + src2->nelem;
1136       dest->elems = re_malloc (int, dest->alloc);
1137       if (BE (dest->elems == NULL, 0))
1138 	return REG_ESPACE;
1139     }
1140   else
1141     {
1142       if (src1 != NULL && src1->nelem > 0)
1143 	return re_node_set_init_copy (dest, src1);
1144       else if (src2 != NULL && src2->nelem > 0)
1145 	return re_node_set_init_copy (dest, src2);
1146       else
1147 	re_node_set_init_empty (dest);
1148       return REG_NOERROR;
1149     }
1150   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1151     {
1152       if (src1->elems[i1] > src2->elems[i2])
1153 	{
1154 	  dest->elems[id++] = src2->elems[i2++];
1155 	  continue;
1156 	}
1157       if (src1->elems[i1] == src2->elems[i2])
1158 	++i2;
1159       dest->elems[id++] = src1->elems[i1++];
1160     }
1161   if (i1 < src1->nelem)
1162     {
1163       memcpy (dest->elems + id, src1->elems + i1,
1164 	     (src1->nelem - i1) * sizeof (int));
1165       id += src1->nelem - i1;
1166     }
1167   else if (i2 < src2->nelem)
1168     {
1169       memcpy (dest->elems + id, src2->elems + i2,
1170 	     (src2->nelem - i2) * sizeof (int));
1171       id += src2->nelem - i2;
1172     }
1173   dest->nelem = id;
1174   return REG_NOERROR;
1175 }
1176 
1177 /* Calculate the union set of the sets DEST and SRC. And store it to
1178    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1179 
1180 static reg_errcode_t
1181 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1182 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1183 {
1184   int is, id, sbase, delta;
1185   if (src == NULL || src->nelem == 0)
1186     return REG_NOERROR;
1187   if (dest->alloc < 2 * src->nelem + dest->nelem)
1188     {
1189       int new_alloc = 2 * (src->nelem + dest->alloc);
1190       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1191       if (BE (new_buffer == NULL, 0))
1192 	return REG_ESPACE;
1193       dest->elems = new_buffer;
1194       dest->alloc = new_alloc;
1195     }
1196 
1197   if (BE (dest->nelem == 0, 0))
1198     {
1199       dest->nelem = src->nelem;
1200       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1201       return REG_NOERROR;
1202     }
1203 
1204   /* Copy into the top of DEST the items of SRC that are not
1205      found in DEST.  Maybe we could binary search in DEST?  */
1206   for (sbase = dest->nelem + 2 * src->nelem,
1207        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1208     {
1209       if (dest->elems[id] == src->elems[is])
1210         is--, id--;
1211       else if (dest->elems[id] < src->elems[is])
1212         dest->elems[--sbase] = src->elems[is--];
1213       else /* if (dest->elems[id] > src->elems[is]) */
1214         --id;
1215     }
1216 
1217   if (is >= 0)
1218     {
1219       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1220       sbase -= is + 1;
1221       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1222     }
1223 
1224   id = dest->nelem - 1;
1225   is = dest->nelem + 2 * src->nelem - 1;
1226   delta = is - sbase + 1;
1227   if (delta == 0)
1228     return REG_NOERROR;
1229 
1230   /* Now copy.  When DELTA becomes zero, the remaining
1231      DEST elements are already in place.  */
1232   dest->nelem += delta;
1233   for (;;)
1234     {
1235       if (dest->elems[is] > dest->elems[id])
1236         {
1237 	  /* Copy from the top.  */
1238           dest->elems[id + delta--] = dest->elems[is--];
1239 	  if (delta == 0)
1240 	    break;
1241 	}
1242       else
1243         {
1244           /* Slide from the bottom.  */
1245           dest->elems[id + delta] = dest->elems[id];
1246 	  if (--id < 0)
1247 	    {
1248 	      /* Copy remaining SRC elements.  */
1249 	      memcpy (dest->elems, dest->elems + sbase,
1250 	              delta * sizeof (int));
1251 	      break;
1252 	    }
1253 	}
1254     }
1255 
1256   return REG_NOERROR;
1257 }
1258 
1259 /* Insert the new element ELEM to the re_node_set* SET.
1260    SET should not already have ELEM.
1261    return -1 if an error is occured, return 1 otherwise.  */
1262 
1263 static int
1264 internal_function
re_node_set_insert(re_node_set * set,int elem)1265 re_node_set_insert (re_node_set *set, int elem)
1266 {
1267   int idx;
1268   /* In case the set is empty.  */
1269   if (set->alloc == 0)
1270     {
1271       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1272 	return 1;
1273       else
1274 	return -1;
1275     }
1276 
1277   if (BE (set->nelem, 0) == 0)
1278     {
1279       /* We already guaranteed above that set->alloc != 0.  */
1280       set->elems[0] = elem;
1281       ++set->nelem;
1282       return 1;
1283     }
1284 
1285   /* Realloc if we need.  */
1286   if (set->alloc == set->nelem)
1287     {
1288       int *new_elems;
1289       set->alloc = set->alloc * 2;
1290       new_elems = re_realloc (set->elems, int, set->alloc);
1291       if (BE (new_elems == NULL, 0))
1292 	return -1;
1293       set->elems = new_elems;
1294     }
1295 
1296   /* Move the elements which follows the new element.  Test the
1297      first element separately to skip a check in the inner loop.  */
1298   if (elem < set->elems[0])
1299     {
1300       idx = 0;
1301       for (idx = set->nelem; idx > 0; idx--)
1302         set->elems[idx] = set->elems[idx - 1];
1303     }
1304   else
1305     {
1306       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1307         set->elems[idx] = set->elems[idx - 1];
1308     }
1309 
1310   /* Insert the new element.  */
1311   set->elems[idx] = elem;
1312   ++set->nelem;
1313   return 1;
1314 }
1315 
1316 /* Insert the new element ELEM to the re_node_set* SET.
1317    SET should not already have any element greater than or equal to ELEM.
1318    Return -1 if an error is occured, return 1 otherwise.  */
1319 
1320 static int
1321 internal_function
re_node_set_insert_last(re_node_set * set,int elem)1322 re_node_set_insert_last (re_node_set *set, int elem)
1323 {
1324   /* Realloc if we need.  */
1325   if (set->alloc == set->nelem)
1326     {
1327       int *new_elems;
1328       set->alloc = (set->alloc + 1) * 2;
1329       new_elems = re_realloc (set->elems, int, set->alloc);
1330       if (BE (new_elems == NULL, 0))
1331 	return -1;
1332       set->elems = new_elems;
1333     }
1334 
1335   /* Insert the new element.  */
1336   set->elems[set->nelem++] = elem;
1337   return 1;
1338 }
1339 
1340 /* Compare two node sets SET1 and SET2.
1341    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1342 
1343 static int
internal_function(pure)1344 internal_function __attribute ((pure))
1345 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1346 {
1347   int i;
1348   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1349     return 0;
1350   for (i = set1->nelem ; --i >= 0 ; )
1351     if (set1->elems[i] != set2->elems[i])
1352       return 0;
1353   return 1;
1354 }
1355 
1356 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1357 
1358 static int
internal_function(pure)1359 internal_function __attribute ((pure))
1360 re_node_set_contains (const re_node_set *set, int elem)
1361 {
1362   unsigned int idx, right, mid;
1363   if (set->nelem <= 0)
1364     return 0;
1365 
1366   /* Binary search the element.  */
1367   idx = 0;
1368   right = set->nelem - 1;
1369   while (idx < right)
1370     {
1371       mid = (idx + right) / 2;
1372       if (set->elems[mid] < elem)
1373 	idx = mid + 1;
1374       else
1375 	right = mid;
1376     }
1377   return set->elems[idx] == elem ? idx + 1 : 0;
1378 }
1379 
1380 static void
1381 internal_function
re_node_set_remove_at(re_node_set * set,int idx)1382 re_node_set_remove_at (re_node_set *set, int idx)
1383 {
1384   if (idx < 0 || idx >= set->nelem)
1385     return;
1386   --set->nelem;
1387   for (; idx < set->nelem; idx++)
1388     set->elems[idx] = set->elems[idx + 1];
1389 }
1390 
1391 
1392 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1393    Or return -1, if an error will be occured.  */
1394 
1395 static int
1396 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1397 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1398 {
1399 #ifdef RE_ENABLE_I18N
1400   int type = token.type;
1401 #endif
1402   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1403     {
1404       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1405       int *new_nexts, *new_indices;
1406       re_node_set *new_edests, *new_eclosures;
1407       re_token_t *new_nodes;
1408 
1409       /* Avoid overflows.  */
1410       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1411 	return -1;
1412 
1413       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1414       if (BE (new_nodes == NULL, 0))
1415 	return -1;
1416       dfa->nodes = new_nodes;
1417       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1418       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1419       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1420       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1421       if (BE (new_nexts == NULL || new_indices == NULL
1422 	      || new_edests == NULL || new_eclosures == NULL, 0))
1423 	return -1;
1424       dfa->nexts = new_nexts;
1425       dfa->org_indices = new_indices;
1426       dfa->edests = new_edests;
1427       dfa->eclosures = new_eclosures;
1428       dfa->nodes_alloc = new_nodes_alloc;
1429     }
1430   dfa->nodes[dfa->nodes_len] = token;
1431   dfa->nodes[dfa->nodes_len].constraint = 0;
1432 #ifdef RE_ENABLE_I18N
1433   dfa->nodes[dfa->nodes_len].accept_mb =
1434     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1435 #endif
1436   dfa->nexts[dfa->nodes_len] = -1;
1437   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1438   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1439   return dfa->nodes_len++;
1440 }
1441 
1442 static inline unsigned int
1443 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1444 calc_state_hash (const re_node_set *nodes, unsigned int context)
1445 {
1446   unsigned int hash = nodes->nelem + context;
1447   int i;
1448   for (i = 0 ; i < nodes->nelem ; i++)
1449     hash += nodes->elems[i];
1450   return hash;
1451 }
1452 
1453 /* Search for the state whose node_set is equivalent to NODES.
1454    Return the pointer to the state, if we found it in the DFA.
1455    Otherwise create the new one and return it.  In case of an error
1456    return NULL and set the error code in ERR.
1457    Note: - We assume NULL as the invalid state, then it is possible that
1458 	   return value is NULL and ERR is REG_NOERROR.
1459 	 - We never return non-NULL value in case of any errors, it is for
1460 	   optimization.  */
1461 
1462 static re_dfastate_t *
1463 internal_function
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1464 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1465 		  const re_node_set *nodes)
1466 {
1467   unsigned int hash;
1468   re_dfastate_t *new_state;
1469   struct re_state_table_entry *spot;
1470   int i;
1471   if (BE (nodes->nelem == 0, 0))
1472     {
1473       *err = REG_NOERROR;
1474       return NULL;
1475     }
1476   hash = calc_state_hash (nodes, 0);
1477   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1478 
1479   for (i = 0 ; i < spot->num ; i++)
1480     {
1481       re_dfastate_t *state = spot->array[i];
1482       if (hash != state->hash)
1483 	continue;
1484       if (re_node_set_compare (&state->nodes, nodes))
1485 	return state;
1486     }
1487 
1488   /* There are no appropriate state in the dfa, create the new one.  */
1489   new_state = create_ci_newstate (dfa, nodes, hash);
1490   if (BE (new_state == NULL, 0))
1491     *err = REG_ESPACE;
1492 
1493   return new_state;
1494 }
1495 
1496 /* Search for the state whose node_set is equivalent to NODES and
1497    whose context is equivalent to CONTEXT.
1498    Return the pointer to the state, if we found it in the DFA.
1499    Otherwise create the new one and return it.  In case of an error
1500    return NULL and set the error code in ERR.
1501    Note: - We assume NULL as the invalid state, then it is possible that
1502 	   return value is NULL and ERR is REG_NOERROR.
1503 	 - We never return non-NULL value in case of any errors, it is for
1504 	   optimization.  */
1505 
1506 static re_dfastate_t *
1507 internal_function
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1508 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1509 			  const re_node_set *nodes, unsigned int context)
1510 {
1511   unsigned int hash;
1512   re_dfastate_t *new_state;
1513   struct re_state_table_entry *spot;
1514   int i;
1515   if (nodes->nelem == 0)
1516     {
1517       *err = REG_NOERROR;
1518       return NULL;
1519     }
1520   hash = calc_state_hash (nodes, context);
1521   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1522 
1523   for (i = 0 ; i < spot->num ; i++)
1524     {
1525       re_dfastate_t *state = spot->array[i];
1526       if (state->hash == hash
1527 	  && state->context == context
1528 	  && re_node_set_compare (state->entrance_nodes, nodes))
1529 	return state;
1530     }
1531   /* There are no appropriate state in `dfa', create the new one.  */
1532   new_state = create_cd_newstate (dfa, nodes, context, hash);
1533   if (BE (new_state == NULL, 0))
1534     *err = REG_ESPACE;
1535 
1536   return new_state;
1537 }
1538 
1539 /* Finish initialization of the new state NEWSTATE, and using its hash value
1540    HASH put in the appropriate bucket of DFA's state table.  Return value
1541    indicates the error code if failed.  */
1542 
1543 static reg_errcode_t
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,unsigned int hash)1544 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1545 		unsigned int hash)
1546 {
1547   struct re_state_table_entry *spot;
1548   reg_errcode_t err;
1549   int i;
1550 
1551   newstate->hash = hash;
1552   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1553   if (BE (err != REG_NOERROR, 0))
1554     return REG_ESPACE;
1555   for (i = 0; i < newstate->nodes.nelem; i++)
1556     {
1557       int elem = newstate->nodes.elems[i];
1558       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1559         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1560     }
1561 
1562   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1563   if (BE (spot->alloc <= spot->num, 0))
1564     {
1565       int new_alloc = 2 * spot->num + 2;
1566       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1567 					      new_alloc);
1568       if (BE (new_array == NULL, 0))
1569 	return REG_ESPACE;
1570       spot->array = new_array;
1571       spot->alloc = new_alloc;
1572     }
1573   spot->array[spot->num++] = newstate;
1574   return REG_NOERROR;
1575 }
1576 
1577 static void
free_state(re_dfastate_t * state)1578 free_state (re_dfastate_t *state)
1579 {
1580   re_node_set_free (&state->non_eps_nodes);
1581   re_node_set_free (&state->inveclosure);
1582   if (state->entrance_nodes != &state->nodes)
1583     {
1584       re_node_set_free (state->entrance_nodes);
1585       re_free (state->entrance_nodes);
1586     }
1587   re_node_set_free (&state->nodes);
1588   re_free (state->word_trtable);
1589   re_free (state->trtable);
1590   re_free (state);
1591 }
1592 
1593 /* Create the new state which is independ of contexts.
1594    Return the new state if succeeded, otherwise return NULL.  */
1595 
1596 static re_dfastate_t *
1597 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int hash)1598 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1599 		    unsigned int hash)
1600 {
1601   int i;
1602   reg_errcode_t err;
1603   re_dfastate_t *newstate;
1604 
1605   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1606   if (BE (newstate == NULL, 0))
1607     return NULL;
1608   err = re_node_set_init_copy (&newstate->nodes, nodes);
1609   if (BE (err != REG_NOERROR, 0))
1610     {
1611       re_free (newstate);
1612       return NULL;
1613     }
1614 
1615   newstate->entrance_nodes = &newstate->nodes;
1616   for (i = 0 ; i < nodes->nelem ; i++)
1617     {
1618       re_token_t *node = dfa->nodes + nodes->elems[i];
1619       re_token_type_t type = node->type;
1620       if (type == CHARACTER && !node->constraint)
1621 	continue;
1622 #ifdef RE_ENABLE_I18N
1623       newstate->accept_mb |= node->accept_mb;
1624 #endif /* RE_ENABLE_I18N */
1625 
1626       /* If the state has the halt node, the state is a halt state.  */
1627       if (type == END_OF_RE)
1628 	newstate->halt = 1;
1629       else if (type == OP_BACK_REF)
1630 	newstate->has_backref = 1;
1631       else if (type == ANCHOR || node->constraint)
1632 	newstate->has_constraint = 1;
1633     }
1634   err = register_state (dfa, newstate, hash);
1635   if (BE (err != REG_NOERROR, 0))
1636     {
1637       free_state (newstate);
1638       newstate = NULL;
1639     }
1640   return newstate;
1641 }
1642 
1643 /* Create the new state which is depend on the context CONTEXT.
1644    Return the new state if succeeded, otherwise return NULL.  */
1645 
1646 static re_dfastate_t *
1647 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,unsigned int hash)1648 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1649 		    unsigned int context, unsigned int hash)
1650 {
1651   int i, nctx_nodes = 0;
1652   reg_errcode_t err;
1653   re_dfastate_t *newstate;
1654 
1655   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1656   if (BE (newstate == NULL, 0))
1657     return NULL;
1658   err = re_node_set_init_copy (&newstate->nodes, nodes);
1659   if (BE (err != REG_NOERROR, 0))
1660     {
1661       re_free (newstate);
1662       return NULL;
1663     }
1664 
1665   newstate->context = context;
1666   newstate->entrance_nodes = &newstate->nodes;
1667 
1668   for (i = 0 ; i < nodes->nelem ; i++)
1669     {
1670       unsigned int constraint = 0;
1671       re_token_t *node = dfa->nodes + nodes->elems[i];
1672       re_token_type_t type = node->type;
1673       if (node->constraint)
1674 	constraint = node->constraint;
1675 
1676       if (type == CHARACTER && !constraint)
1677 	continue;
1678 #ifdef RE_ENABLE_I18N
1679       newstate->accept_mb |= node->accept_mb;
1680 #endif /* RE_ENABLE_I18N */
1681 
1682       /* If the state has the halt node, the state is a halt state.  */
1683       if (type == END_OF_RE)
1684 	newstate->halt = 1;
1685       else if (type == OP_BACK_REF)
1686 	newstate->has_backref = 1;
1687       else if (type == ANCHOR)
1688 	constraint = node->opr.ctx_type;
1689 
1690       if (constraint)
1691 	{
1692 	  if (newstate->entrance_nodes == &newstate->nodes)
1693 	    {
1694 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1695 	      if (BE (newstate->entrance_nodes == NULL, 0))
1696 		{
1697 		  free_state (newstate);
1698 		  return NULL;
1699 		}
1700 	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1701 	      nctx_nodes = 0;
1702 	      newstate->has_constraint = 1;
1703 	    }
1704 
1705 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1706 	    {
1707 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1708 	      ++nctx_nodes;
1709 	    }
1710 	}
1711     }
1712   err = register_state (dfa, newstate, hash);
1713   if (BE (err != REG_NOERROR, 0))
1714     {
1715       free_state (newstate);
1716       newstate = NULL;
1717     }
1718   return  newstate;
1719 }
1720