1 /* 2 tre-match-utils.h - TRE matcher helper definitions 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 #include "tre-internal.h" 10 11 #define str_source ((const tre_str_source*)string) 12 13 #ifdef TRE_WCHAR 14 15 #ifdef TRE_MULTIBYTE 16 17 /* Wide character and multibyte support. */ 18 19 /* 20 * Because all multibyte encodings are exclusively single-shift encoding, 21 * with the shift codes having the high bit set, we can make an optimization 22 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character 23 * is detected, and just do a direct copy for ASCII characters. 24 */ 25 #define GET_NEXT_WCHAR() \ 26 do { \ 27 prev_c = next_c; \ 28 switch (type) \ 29 { \ 30 case STR_BYTE: \ 31 pos++; \ 32 if (len >= 0 && pos >= len) \ 33 next_c = '\0'; \ 34 else \ 35 next_c = (unsigned char)(*str_byte++); \ 36 break; \ 37 case STR_WIDE: \ 38 pos++; \ 39 if (len >= 0 && pos >= len) \ 40 next_c = L'\0'; \ 41 else \ 42 next_c = *str_wide++; \ 43 break; \ 44 case STR_MBS: \ 45 pos += pos_add_next; \ 46 if (__builtin_expect(len >= 0 && pos >= len, 0)) \ 47 { \ 48 next_c = L'\0'; \ 49 pos_add_next = 1; \ 50 } \ 51 else if (__builtin_expect(!(*str_byte & 0x80), 1)) \ 52 { \ 53 next_c = (unsigned char)(*str_byte++); \ 54 pos_add_next = 1; \ 55 } \ 56 else \ 57 { \ 58 size_t w; \ 59 int max; \ 60 if (len >= 0) \ 61 max = len - pos; \ 62 else \ 63 max = 32; \ 64 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \ 65 tnfa->loc); \ 66 if (w == (size_t)-1 || w == (size_t)-2) \ 67 return REG_ILLSEQ; \ 68 if (w == 0 && len >= 0) \ 69 { \ 70 pos_add_next = 1; \ 71 next_c = 0; \ 72 str_byte++; \ 73 } \ 74 else \ 75 { \ 76 pos_add_next = w; \ 77 str_byte += w; \ 78 } \ 79 } \ 80 break; \ 81 } \ 82 } while(/*CONSTCOND*/0) 83 84 #else /* !TRE_MULTIBYTE */ 85 86 /* Wide character support, no multibyte support. */ 87 #error TRE_MULTIBYTE undefined 88 89 #define GET_NEXT_WCHAR() \ 90 do { \ 91 prev_c = next_c; \ 92 if (type == STR_BYTE) \ 93 { \ 94 pos++; \ 95 if (len >= 0 && pos >= len) \ 96 next_c = '\0'; \ 97 else \ 98 next_c = (unsigned char)(*str_byte++); \ 99 } \ 100 else if (type == STR_WIDE) \ 101 { \ 102 pos++; \ 103 if (len >= 0 && pos >= len) \ 104 next_c = L'\0'; \ 105 else \ 106 next_c = *str_wide++; \ 107 } \ 108 } while(/*CONSTCOND*/0) 109 110 #endif /* !TRE_MULTIBYTE */ 111 112 #else /* !TRE_WCHAR */ 113 114 /* No wide character or multibyte support. */ 115 #error TRE_WCHAR undefined 116 117 #define GET_NEXT_WCHAR() \ 118 do { \ 119 prev_c = next_c; \ 120 if (type == STR_BYTE) \ 121 { \ 122 pos++; \ 123 if (len >= 0 && pos >= len) \ 124 next_c = '\0'; \ 125 else \ 126 next_c = (unsigned char)(*str_byte++); \ 127 } \ 128 } while(/*CONSTCOND*/0) 129 130 #endif /* !TRE_WCHAR */ 131 132 133 134 /* Assumes tre_tnfa_t *tnfa in scope */ 135 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc)) 136 137 #define CHECK_ASSERTIONS(assertions) \ 138 (((assertions & ASSERT_AT_BOL) \ 139 && (pos > 0 || reg_notbol) \ 140 && (prev_c != L'\n' || !reg_newline)) \ 141 || ((assertions & ASSERT_AT_EOL) \ 142 && (next_c != L'\0' || reg_noteol) \ 143 && (next_c != L'\n' || !reg_newline)) \ 144 || ((assertions & ASSERT_AT_BOW) \ 145 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ 146 || ((assertions & ASSERT_AT_EOW) \ 147 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ 148 || ((assertions & ASSERT_AT_WB) \ 149 && (pos != 0 && next_c != L'\0' \ 150 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ 151 || ((assertions & ASSERT_AT_WB_NEG) \ 152 && (pos == 0 || next_c == L'\0' \ 153 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) 154 155 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ 156 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \ 157 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \ 158 tnfa)) 159 160 161 162 163 inline static int 164 tre_tag_get(const tre_tag_t *tags, int i) 165 { 166 tags += i; 167 return tags->count > 0 ? tags->value : -1; 168 } 169 170 inline static void 171 tre_tag_set(tre_tag_t *tags, int i, int val, int touch) 172 { 173 tags += i; 174 if (tags->count++ == 0) 175 tags->first = val; 176 tags->value = val; 177 tags->touch = touch; 178 } 179 180 inline static void 181 tre_tag_reset(tre_tag_t *tags, int i) 182 { 183 tags[i].count = 0; 184 } 185 186 inline static int 187 tre_tag_touch_get(const tre_tag_t *tags, int i) 188 { 189 return tags[i].touch; 190 } 191 192 #ifdef TRE_DEBUG 193 inline static void 194 tre_print_tags(const tre_tag_t *tags, int num_tags) 195 { 196 int i; 197 for (i = 0; i < num_tags; i++, tags++) 198 { 199 switch(tags->count) 200 { 201 case 0: 202 DPRINT(("%d:(0,-1)", i)); 203 break; 204 case 1: 205 DPRINT(("%d:(1,%d)", i, tags->first)); 206 break; 207 default: 208 DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first, 209 tags->value)); 210 break; 211 } 212 if (i < (num_tags - 1)) 213 DPRINT((" ")); 214 } 215 } 216 217 inline static void 218 tre_print_tags_all(const tre_tag_t *tags, int num_tags) 219 { 220 int i; 221 for (i = 0; i < num_tags; i++, tags++) 222 { 223 switch(tags->count) 224 { 225 case 0: 226 DPRINT(("%d:(0,-1)/%d", i, tags->touch)); 227 break; 228 case 1: 229 DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch)); 230 break; 231 default: 232 DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first, 233 tags->value, tags->touch)); 234 break; 235 } 236 if (i < (num_tags - 1)) 237 DPRINT((" ")); 238 } 239 } 240 #endif /* TRE_DEBUG */ 241 242 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal 243 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */ 244 inline static int 245 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1, 246 const tre_tag_t *tags2) 247 { 248 const tre_tag_t *t1, *t2; 249 250 t1 = tags1 + start; 251 t2 = tags2 + start; 252 /* We need both start tags to be set */ 253 if (t1->count == 0 || t2->count == 0) 254 return 0; 255 256 /* The start tags must be equal */ 257 if (t1->value != t2->value) 258 return 0; 259 260 t1 = tags1 + end; 261 t2 = tags2 + end; 262 /* For the end tags, we prefer set over unset, because unset means that 263 * the end tag is still growing */ 264 if (t1->count == 0) 265 { 266 /* if t2 is set, t1 loses since it is unset */ 267 if (t2->count != 0) 268 return -1; 269 } 270 /* if t2 not set, t1 wins since it is set */ 271 else if (t2->count == 0) 272 return 1; 273 274 /* least current value wins */ 275 return t2->value - t1->value; 276 } 277 278 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare 279 * (t1 loses if < 0, t1 wins if > 0) */ 280 inline static int 281 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1, 282 const tre_tag_t *t2) 283 { 284 int diff; 285 286 t1 += i; 287 t2 += i; 288 switch (dir) 289 { 290 case TRE_TAG_MINIMIZE: 291 /* least current value wins (because tags are initialized to all zeros, 292 * unset wins over set; also, tre_minimal_tag_order() will have already 293 * been run, which checks for being unset) */ 294 return t2->value - t1->value; 295 296 case TRE_TAG_MAXIMIZE: 297 /* prefer set */ 298 if (t1->count == 0) 299 { 300 /* if neither t1 and t2 are set, try next tag */ 301 if (t2->count == 0) 302 return 0; 303 /* t2 is set, t1 loses since it is unset */ 304 return -1; 305 } 306 /* if t2 not set, t1 wins since it is set */ 307 else if (t2->count == 0) 308 return 1; 309 /* greatest initial value wins */ 310 if ((diff = t1->first - t2->first) != 0) 311 return diff; 312 /* least number of times the tag was set, wins */ 313 if ((diff = t2->count - t1->count) != 0) 314 return diff; 315 /* if the tags were only set once, they only have initial values */ 316 if (t1->count == 1) 317 return 0; 318 /* greatest current value wins */ 319 return t1->value - t2->value; 320 321 case TRE_TAG_LEFT_MAXIMIZE: 322 /* prefer set */ 323 if (t1->count == 0) 324 { 325 /* if neither t1 and t2 are set, try next tag */ 326 if (t2->count == 0) 327 return 0; 328 /* t2 is set, t1 loses since it is unset */ 329 return -1; 330 } 331 /* if t2 not set, t1 wins since it is set */ 332 else if (t2->count == 0) 333 return 1; 334 /* least initial value wins */ 335 if ((diff = t2->first - t1->first) != 0) 336 return diff; 337 /* least number of times the tag was set, wins */ 338 if ((diff = t2->count - t1->count) != 0) 339 return diff; 340 /* if the tags were only set once, they only have initial values */ 341 if (t1->count == 1) 342 return 0; 343 /* greatest current value wins */ 344 return t1->value - t2->value; 345 346 default: 347 /* Shouldn't happen: only assert if TRE_DEBUG defined */ 348 assert(0); 349 break; 350 } 351 return 0; 352 } 353 354 #ifdef TRE_DEBUG 355 #define _MORE_DEBUGGING 356 #endif /* TRE_DEBUG */ 357 358 /* Returns 1 if `t1' wins `t2', 0 otherwise. */ 359 inline static int 360 #ifdef _MORE_DEBUGGING 361 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 362 const tre_tag_t *t1, const tre_tag_t *t2) 363 #else /* !_MORE_DEBUGGING */ 364 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 365 const tre_tag_t *t1, const tre_tag_t *t2) 366 #endif /* !_MORE_DEBUGGING */ 367 { 368 int i, ret; 369 370 for (i = 0; i < num_tags; i++) 371 { 372 if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0) 373 return (ret > 0); 374 } 375 /* assert(0);*/ 376 return 0; 377 } 378 379 #ifdef _MORE_DEBUGGING 380 inline static int 381 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 382 const tre_tag_t *t1, const tre_tag_t *t2) 383 { 384 int ret = _tre_tag_order(num_tags, tag_directions, t1, t2); 385 DPRINT(("tre_tag_order: ")); 386 tre_print_tags(t1, num_tags); 387 DPRINT((" %s ", ret ? "wins" : "doesn't win")); 388 tre_print_tags(t2, num_tags); 389 DPRINT(("\n")); 390 return ret; 391 } 392 #endif /* _MORE_DEBUGGING */ 393 394 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len); 395 396 inline static int 397 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc, 398 const tre_tnfa_t * __restrict tnfa) 399 { 400 int match = 0; 401 int i; 402 tre_bracket_match_t *b; 403 tre_cint_t uc, lc; 404 int we, ue, le, got_equiv = 0; 405 int icase = ((tnfa->cflags & REG_ICASE) != 0); 406 407 DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase)); 408 if (icase) 409 { 410 if (tre_islower_l(wc, tnfa->loc)) 411 { 412 lc = wc; 413 uc = tre_toupper_l(wc, tnfa->loc); 414 } 415 else if (tre_isupper_l(wc, tnfa->loc)) 416 { 417 uc = wc; 418 lc = tre_tolower_l(wc, tnfa->loc); 419 } 420 else 421 { 422 icase = 0; 423 } 424 } 425 for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches; 426 i++, b++) 427 { 428 switch (b->type) 429 { 430 case TRE_BRACKET_MATCH_TYPE_CHAR: 431 if (icase) 432 match = (b->value == uc || b->value == lc); 433 else 434 match = (b->value == wc); 435 break; 436 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN: 437 { 438 tre_cint_t start = b->value, end; 439 if (++i >= list->num_bracket_matches || 440 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END) 441 { 442 DPRINT(("tre_bracket_match: no following range end\n")); 443 assert(0); 444 goto error; 445 } 446 end = b->value; 447 if (!got_equiv) 448 { 449 if (icase) 450 { 451 ue = __collate_equiv_value(tnfa->loc, &uc, 1); 452 le = __collate_equiv_value(tnfa->loc, &lc, 1); 453 } 454 else 455 we = __collate_equiv_value(tnfa->loc, &wc, 1); 456 got_equiv = 1; 457 } 458 if (icase) 459 match = ((start <= ue && ue <= end) || 460 (start <= le && le <= end)); 461 else 462 match = (start <= we && we <= end); 463 break; 464 } 465 case TRE_BRACKET_MATCH_TYPE_RANGE_END: 466 DPRINT(("tre_bracket_match: range end without preceeding start\n")); 467 assert(0); 468 break; 469 case TRE_BRACKET_MATCH_TYPE_CLASS: 470 if (icase) 471 match = (tre_isctype_l(uc, b->value, tnfa->loc) || 472 tre_isctype_l(lc, b->value, tnfa->loc)); 473 else 474 match = (tre_isctype_l(wc, b->value, tnfa->loc)); 475 break; 476 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE: 477 if (!got_equiv) 478 { 479 if (icase) 480 { 481 ue = __collate_equiv_value(tnfa->loc, &uc, 1); 482 le = __collate_equiv_value(tnfa->loc, &lc, 1); 483 } 484 else 485 we = __collate_equiv_value(tnfa->loc, &wc, 1); 486 got_equiv = 1; 487 } 488 if (icase) 489 match = (b->value == ue || b->value == le); 490 else 491 match = (b->value == we); 492 break; 493 default: 494 DPRINT(("tre_bracket_match: unknown type %d\n", b->type)); 495 assert(0); 496 break; 497 } 498 if (match) 499 break; 500 } 501 error: 502 if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) { 503 if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0; 504 match = !match; 505 } 506 return match; 507 } 508