xref: /dragonfly/contrib/tre/lib/tre-match-utils.h (revision 0ca59c34)
1 /*
2   tre-match-utils.h - TRE matcher helper definitions
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #include "tre-internal.h"
10 
11 #define str_source ((const tre_str_source*)string)
12 
13 #ifdef TRE_WCHAR
14 
15 #ifdef TRE_MULTIBYTE
16 
17 /* Wide character and multibyte support. */
18 
19 /*
20  * Because all multibyte encodings are exclusively single-shift encoding,
21  * with the shift codes having the high bit set, we can make an optimization
22  * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
23  * is detected, and just do a direct copy for ASCII characters.
24  */
25 #define GET_NEXT_WCHAR()						      \
26   do {									      \
27     prev_c = next_c;							      \
28     switch (type)							      \
29       {									      \
30       case STR_BYTE:							      \
31 	pos++;								      \
32 	if (len >= 0 && pos >= len)					      \
33 	  next_c = '\0';						      \
34 	else								      \
35 	  next_c = (unsigned char)(*str_byte++);			      \
36 	break;								      \
37       case STR_WIDE:							      \
38 	pos++;								      \
39 	if (len >= 0 && pos >= len)					      \
40 	  next_c = L'\0';						      \
41 	else								      \
42 	  next_c = *str_wide++;						      \
43 	break;								      \
44       case STR_MBS:							      \
45 	pos += pos_add_next;					      	      \
46 	if (__builtin_expect(len >= 0 && pos >= len, 0))		      \
47 	  {								      \
48 	    next_c = L'\0';						      \
49 	    pos_add_next = 1;						      \
50 	  }								      \
51 	else if (__builtin_expect(!(*str_byte & 0x80), 1))		      \
52 	  {								      \
53 	    next_c = (unsigned char)(*str_byte++);			      \
54 	    pos_add_next = 1;						      \
55 	  }								      \
56 	else								      \
57 	  {								      \
58 	    size_t w;							      \
59 	    int max;							      \
60 	    if (len >= 0)						      \
61 	      max = len - pos;						      \
62 	    else							      \
63 	      max = 32;							      \
64 	    w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,	      \
65 			      tnfa->loc);				      \
66 	    if (w == (size_t)-1 || w == (size_t)-2)			      \
67 	      return REG_ILLSEQ;					      \
68 	    if (w == 0 && len >= 0)					      \
69 	      {								      \
70 		pos_add_next = 1;					      \
71 		next_c = 0;						      \
72 		str_byte++;						      \
73 	      }								      \
74 	    else							      \
75 	      {								      \
76 		pos_add_next = w;					      \
77 		str_byte += w;						      \
78 	      }								      \
79 	  }								      \
80 	break;								      \
81       }									      \
82   } while(/*CONSTCOND*/0)
83 
84 #else /* !TRE_MULTIBYTE */
85 
86 /* Wide character support, no multibyte support. */
87 #error TRE_MULTIBYTE undefined
88 
89 #define GET_NEXT_WCHAR()						      \
90   do {									      \
91     prev_c = next_c;							      \
92     if (type == STR_BYTE)						      \
93       {									      \
94 	pos++;								      \
95 	if (len >= 0 && pos >= len)					      \
96 	  next_c = '\0';						      \
97 	else								      \
98 	  next_c = (unsigned char)(*str_byte++);			      \
99       }									      \
100     else if (type == STR_WIDE)						      \
101       {									      \
102 	pos++;								      \
103 	if (len >= 0 && pos >= len)					      \
104 	  next_c = L'\0';						      \
105 	else								      \
106 	  next_c = *str_wide++;						      \
107       }									      \
108   } while(/*CONSTCOND*/0)
109 
110 #endif /* !TRE_MULTIBYTE */
111 
112 #else /* !TRE_WCHAR */
113 
114 /* No wide character or multibyte support. */
115 #error TRE_WCHAR undefined
116 
117 #define GET_NEXT_WCHAR()						      \
118   do {									      \
119     prev_c = next_c;							      \
120     if (type == STR_BYTE)						      \
121       {									      \
122 	pos++;								      \
123 	if (len >= 0 && pos >= len)					      \
124 	  next_c = '\0';						      \
125 	else								      \
126 	  next_c = (unsigned char)(*str_byte++);			      \
127       }									      \
128   } while(/*CONSTCOND*/0)
129 
130 #endif /* !TRE_WCHAR */
131 
132 
133 
134 /* Assumes tre_tnfa_t *tnfa in scope */
135 #define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
136 
137 #define CHECK_ASSERTIONS(assertions)					      \
138   (((assertions & ASSERT_AT_BOL)					      \
139     && (pos > 0 || reg_notbol)						      \
140     && (prev_c != L'\n' || !reg_newline))				      \
141    || ((assertions & ASSERT_AT_EOL)					      \
142        && (next_c != L'\0' || reg_noteol)				      \
143        && (next_c != L'\n' || !reg_newline))				      \
144    || ((assertions & ASSERT_AT_BOW)					      \
145        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
146    || ((assertions & ASSERT_AT_EOW)					      \
147        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
148    || ((assertions & ASSERT_AT_WB)					      \
149        && (pos != 0 && next_c != L'\0'					      \
150 	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
151    || ((assertions & ASSERT_AT_WB_NEG)					      \
152        && (pos == 0 || next_c == L'\0'					      \
153 	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
154 
155 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
156   ((trans_i->assertions & ASSERT_BRACKET_MATCH)                               \
157     && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c,    \
158                                       tnfa))
159 
160 
161 
162 
163 inline static int
164 tre_tag_get(const tre_tag_t *tags, int i)
165 {
166   tags += i;
167   return tags->count > 0 ? tags->value : -1;
168 }
169 
170 inline static void
171 tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
172 {
173   tags += i;
174   if (tags->count++ == 0)
175     tags->first = val;
176   tags->value = val;
177   tags->touch = touch;
178 }
179 
180 inline static void
181 tre_tag_reset(tre_tag_t *tags, int i)
182 {
183   tags[i].count = 0;
184 }
185 
186 inline static int
187 tre_tag_touch_get(const tre_tag_t *tags, int i)
188 {
189   return tags[i].touch;
190 }
191 
192 #ifdef TRE_DEBUG
193 inline static void
194 tre_print_tags(const tre_tag_t *tags, int num_tags)
195 {
196   int i;
197   for (i = 0; i < num_tags; i++, tags++)
198     {
199       switch(tags->count)
200 	{
201 	case 0:
202 	  DPRINT(("%d:(0,-1)", i));
203 	  break;
204 	case 1:
205 	  DPRINT(("%d:(1,%d)", i, tags->first));
206 	  break;
207 	default:
208 	  DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
209 		  tags->value));
210 	  break;
211 	}
212       if (i < (num_tags - 1))
213 	DPRINT((" "));
214     }
215 }
216 
217 inline static void
218 tre_print_tags_all(const tre_tag_t *tags, int num_tags)
219 {
220   int i;
221   for (i = 0; i < num_tags; i++, tags++)
222     {
223       switch(tags->count)
224 	{
225 	case 0:
226 	  DPRINT(("%d:(0,-1)/%d", i, tags->touch));
227 	  break;
228 	case 1:
229 	  DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
230 	  break;
231 	default:
232 	  DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
233 		  tags->value, tags->touch));
234 	  break;
235 	}
236       if (i < (num_tags - 1))
237 	DPRINT((" "));
238     }
239 }
240 #endif /* TRE_DEBUG */
241 
242 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
243  * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
244 inline static int
245 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
246 		      const tre_tag_t *tags2)
247 {
248   const tre_tag_t *t1, *t2;
249 
250   t1 = tags1 + start;
251   t2 = tags2 + start;
252   /* We need both start tags to be set */
253   if (t1->count == 0 || t2->count == 0)
254     return 0;
255 
256   /* The start tags must be equal */
257   if (t1->value != t2->value)
258     return 0;
259 
260   t1 = tags1 + end;
261   t2 = tags2 + end;
262   /* For the end tags, we prefer set over unset, because unset means that
263    * the end tag is still growing */
264   if (t1->count == 0)
265     {
266       /* if t2 is set, t1 loses since it is unset */
267       if (t2->count != 0)
268 	return -1;
269     }
270   /* if t2 not set, t1 wins since it is set */
271   else if (t2->count == 0)
272     return 1;
273 
274   /* least current value wins */
275   return t2->value - t1->value;
276 }
277 
278 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
279  * (t1 loses if < 0, t1 wins if > 0) */
280 inline static int
281 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
282 		const tre_tag_t *t2)
283 {
284   int diff;
285 
286   t1 += i;
287   t2 += i;
288   switch (dir)
289     {
290     case TRE_TAG_MINIMIZE:
291       /* least current value wins (because tags are initialized to all zeros,
292        * unset wins over set; also, tre_minimal_tag_order() will have already
293        * been run, which checks for being unset) */
294       return t2->value - t1->value;
295 
296     case TRE_TAG_MAXIMIZE:
297       /* prefer set */
298       if (t1->count == 0)
299 	{
300 	  /* if neither t1 and t2 are set, try next tag */
301 	  if (t2->count == 0)
302 	    return 0;
303 	  /* t2 is set, t1 loses since it is unset */
304 	  return -1;
305 	}
306       /* if t2 not set, t1 wins since it is set */
307       else if (t2->count == 0)
308 	return 1;
309       /* greatest initial value wins */
310       if ((diff = t1->first - t2->first) != 0)
311 	return diff;
312       /* least number of times the tag was set, wins */
313       if ((diff = t2->count - t1->count) != 0)
314 	return diff;
315       /* if the tags were only set once, they only have initial values */
316       if (t1->count == 1)
317 	return 0;
318       /* greatest current value wins */
319       return t1->value - t2->value;
320 
321     case TRE_TAG_LEFT_MAXIMIZE:
322       /* prefer set */
323       if (t1->count == 0)
324 	{
325 	  /* if neither t1 and t2 are set, try next tag */
326 	  if (t2->count == 0)
327 	    return 0;
328 	  /* t2 is set, t1 loses since it is unset */
329 	  return -1;
330 	}
331       /* if t2 not set, t1 wins since it is set */
332       else if (t2->count == 0)
333 	return 1;
334       /* least initial value wins */
335       if ((diff = t2->first - t1->first) != 0)
336 	return diff;
337       /* least number of times the tag was set, wins */
338       if ((diff = t2->count - t1->count) != 0)
339 	return diff;
340       /* if the tags were only set once, they only have initial values */
341       if (t1->count == 1)
342 	return 0;
343       /* greatest current value wins */
344       return t1->value - t2->value;
345 
346     default:
347       /* Shouldn't happen: only assert if TRE_DEBUG defined */
348       assert(0);
349       break;
350     }
351   return 0;
352 }
353 
354 #ifdef TRE_DEBUG
355 #define _MORE_DEBUGGING
356 #endif /* TRE_DEBUG */
357 
358 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
359 inline static int
360 #ifdef _MORE_DEBUGGING
361 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
362 	      const tre_tag_t *t1, const tre_tag_t *t2)
363 #else /* !_MORE_DEBUGGING */
364 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
365 	      const tre_tag_t *t1, const tre_tag_t *t2)
366 #endif /* !_MORE_DEBUGGING */
367 {
368   int i, ret;
369 
370   for (i = 0; i < num_tags; i++)
371     {
372       if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
373 	return (ret > 0);
374     }
375   /*  assert(0);*/
376   return 0;
377 }
378 
379 #ifdef _MORE_DEBUGGING
380 inline static int
381 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
382 	      const tre_tag_t *t1, const tre_tag_t *t2)
383 {
384   int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
385   DPRINT(("tre_tag_order: "));
386   tre_print_tags(t1, num_tags);
387   DPRINT((" %s ", ret ? "wins" : "doesn't win"));
388   tre_print_tags(t2, num_tags);
389   DPRINT(("\n"));
390   return ret;
391 }
392 #endif /* _MORE_DEBUGGING */
393 
394 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
395 
396 inline static int
397 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
398 		  const tre_tnfa_t * __restrict tnfa)
399 {
400   int match = 0;
401   int i;
402   tre_bracket_match_t *b;
403   tre_cint_t uc, lc;
404   int we, ue, le, got_equiv = 0;
405   int icase = ((tnfa->cflags & REG_ICASE) != 0);
406 
407   DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
408   if (icase)
409     {
410       if (tre_islower_l(wc, tnfa->loc))
411 	{
412 	  lc = wc;
413 	  uc = tre_toupper_l(wc, tnfa->loc);
414 	}
415       else if (tre_isupper_l(wc, tnfa->loc))
416 	{
417 	  uc = wc;
418 	  lc = tre_tolower_l(wc, tnfa->loc);
419 	}
420       else
421 	{
422 	  icase = 0;
423 	}
424     }
425   for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
426        i++, b++)
427     {
428       switch (b->type)
429 	{
430 	case TRE_BRACKET_MATCH_TYPE_CHAR:
431 	  if (icase)
432 	    match = (b->value == uc || b->value == lc);
433 	  else
434 	    match = (b->value == wc);
435 	  break;
436 	case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
437 	  {
438 	    tre_cint_t start = b->value, end;
439 	    if (++i >= list->num_bracket_matches ||
440 		(++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
441 	      {
442 		DPRINT(("tre_bracket_match: no following range end\n"));
443 		assert(0);
444 		goto error;
445 	      }
446 	    end = b->value;
447 	    if (!got_equiv)
448 	      {
449 		if (icase)
450 		  {
451 		    ue = __collate_equiv_value(tnfa->loc, &uc, 1);
452 		    le = __collate_equiv_value(tnfa->loc, &lc, 1);
453 		  }
454 		else
455 		    we = __collate_equiv_value(tnfa->loc, &wc, 1);
456 		got_equiv = 1;
457 	      }
458 	    if (icase)
459 	      match = ((start <= ue && ue <= end) ||
460 		      (start <= le && le <= end));
461 	    else
462 	      match = (start <= we && we <= end);
463 	    break;
464 	  }
465 	case TRE_BRACKET_MATCH_TYPE_RANGE_END:
466 	  DPRINT(("tre_bracket_match: range end without preceeding start\n"));
467 	  assert(0);
468 	  break;
469 	case TRE_BRACKET_MATCH_TYPE_CLASS:
470 	  if (icase)
471 	    match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
472 		     tre_isctype_l(lc, b->value, tnfa->loc));
473 	  else
474 	    match = (tre_isctype_l(wc, b->value, tnfa->loc));
475 	  break;
476 	case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
477 	  if (!got_equiv)
478 	    {
479 	      if (icase)
480 		{
481 		  ue = __collate_equiv_value(tnfa->loc, &uc, 1);
482 		  le = __collate_equiv_value(tnfa->loc, &lc, 1);
483 		}
484 	      else
485 		  we = __collate_equiv_value(tnfa->loc, &wc, 1);
486 	      got_equiv = 1;
487 	    }
488 	  if (icase)
489 	    match = (b->value == ue || b->value == le);
490 	  else
491 	    match = (b->value == we);
492 	  break;
493 	default:
494 	  DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
495 	  assert(0);
496 	  break;
497 	}
498 	if (match)
499 	  break;
500     }
501 error:
502   if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
503     if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
504     match = !match;
505   }
506   return match;
507 }
508