1 /* 2 tre_regexec.c - TRE POSIX compatible matching functions (and more). 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 #ifdef HAVE_CONFIG_H 10 #include <config.h> 11 #endif /* HAVE_CONFIG_H */ 12 13 #ifdef TRE_USE_ALLOCA 14 /* AIX requires this to be the first thing in the file. */ 15 #ifndef __GNUC__ 16 # if HAVE_ALLOCA_H 17 # include <alloca.h> 18 # else 19 # ifdef _AIX 20 #pragma alloca 21 # else 22 # ifndef alloca /* predefined by HP cc +Olibcalls */ 23 char *alloca (); 24 # endif 25 # endif 26 # endif 27 #endif 28 #endif /* TRE_USE_ALLOCA */ 29 30 #include <assert.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #ifdef HAVE_WCHAR_H 34 #include <wchar.h> 35 #endif /* HAVE_WCHAR_H */ 36 #ifdef HAVE_WCTYPE_H 37 #include <wctype.h> 38 #endif /* HAVE_WCTYPE_H */ 39 #ifndef TRE_WCHAR 40 #include <ctype.h> 41 #endif /* !TRE_WCHAR */ 42 #ifdef HAVE_MALLOC_H 43 #include <malloc.h> 44 #endif /* HAVE_MALLOC_H */ 45 #include <limits.h> 46 47 #include "tre-internal.h" 48 #include "tre-match-utils.h" 49 #include "tre.h" 50 #include "xmalloc.h" 51 52 53 /* For each tre_last_matched_t in the lm array, find the last matched branch by 54 comparing the touch value of the cmp_tag's. For all other branches, reset 55 the corresponding tags. If reset_all is non-zero, reset all tags in all 56 branches. Recurse into the nested last matched structures, clearing tags as 57 apprpriate. */ 58 static void 59 tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm, 60 int n, int start_tag, int reset_all) 61 { 62 int max, i, reset; 63 tre_last_matched_branch_t *b; 64 65 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n", 66 n, start_tag, reset_all)); 67 for (; n-- > 0; lm++) 68 { 69 if (lm->n_branches == 1) 70 { 71 b = lm->branches; 72 if (start_tag > 0) 73 { 74 DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag, 75 tre_tag_touch_get(tags, b->cmp_tag), 76 tre_tag_touch_get(tags, start_tag))); 77 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < 78 tre_tag_touch_get(tags, start_tag)); 79 } 80 else 81 reset = 0; 82 83 if (reset) 84 { 85 int *t; 86 87 for (i = b->n_tags, t = b->tags; i > 0; i--, t++) 88 { 89 DPRINT((" Resetting t%d\n", *t)); 90 tre_tag_reset(tags, *t); 91 } 92 } 93 if (b->n_last_matched > 0) 94 tre_reset_last_matched_branches(tags, b->last_matched, 95 b->n_last_matched, 96 lm->start_tag, reset); 97 } 98 else 99 { 100 if (!reset_all) 101 { 102 #ifdef TRE_DEBUG 103 int last; 104 #endif /* TRE_DEBUG */ 105 max = 0; 106 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) 107 { 108 int t = b->cmp_tag; 109 int touch = tre_tag_touch_get(tags, t); 110 if (touch > max) 111 { 112 max = touch; 113 #ifdef TRE_DEBUG 114 last = t; 115 #endif /* TRE_DEBUG */ 116 } 117 } 118 DPRINT((" Last touched end tag t%d=%d\n", last, max)); 119 } 120 121 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) 122 { 123 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max); 124 if (reset) 125 { 126 int j; 127 int *t; 128 129 for (j = b->n_tags, t = b->tags; j > 0; j--, t++) 130 { 131 DPRINT((" Resetting t%d\n", *t)); 132 tre_tag_reset(tags, *t); 133 } 134 } 135 if (b->n_last_matched > 0) 136 tre_reset_last_matched_branches(tags, b->last_matched, 137 b->n_last_matched, 138 lm->start_tag, reset); 139 } 140 } 141 } 142 } 143 144 145 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match 146 endpoint values. */ 147 reg_errcode_t 148 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 149 const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo) 150 { 151 unsigned int i; 152 153 if (cflags & REG_NOSUB) return REG_OK; 154 155 i = 0; 156 if (match_eo >= 0 && intags) 157 { 158 const tre_tag_t *tags = intags; 159 tre_submatch_data_t *submatch_data; 160 161 if (tnfa->last_matched_branch && 162 tnfa->last_matched_branch->n_last_matched > 0) 163 { 164 tre_tag_t *t; 165 #ifdef TRE_USE_ALLOCA 166 t = alloca(sizeof(*t) * tnfa->num_tags); 167 #else /* !TRE_USE_ALLOCA */ 168 t = xmalloc(sizeof(*t) * tnfa->num_tags); 169 #endif /* !TRE_USE_ALLOCA */ 170 if (!t) return REG_ESPACE; 171 memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t)); 172 tre_reset_last_matched_branches(t, 173 tnfa->last_matched_branch->last_matched, 174 tnfa->last_matched_branch->n_last_matched, 175 0, 0); 176 tags = t; 177 } 178 /* Construct submatch offsets from the tags. */ 179 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); 180 submatch_data = tnfa->submatch_data; 181 while (i < tnfa->num_submatches && i < nmatch) 182 { 183 if (submatch_data[i].so_tag == tnfa->end_tag) 184 pmatch[i].rm_so = match_eo; 185 else 186 pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag); 187 188 if (submatch_data[i].eo_tag == tnfa->end_tag) 189 pmatch[i].rm_eo = match_eo; 190 else 191 pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag); 192 193 /* If either of the endpoints were not used, this submatch 194 was not part of the match. */ 195 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) 196 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 197 198 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i, 199 submatch_data[i].so_tag, pmatch[i].rm_so, 200 submatch_data[i].eo_tag, pmatch[i].rm_eo)); 201 i++; 202 } 203 #ifndef TRE_USE_ALLOCA 204 if (tags != intags) xfree(__DECONST(tre_tag_t *,tags)); 205 #endif /* !TRE_USE_ALLOCA */ 206 } 207 208 while (i < nmatch) 209 { 210 pmatch[i].rm_so = -1; 211 pmatch[i].rm_eo = -1; 212 i++; 213 } 214 215 return REG_OK; 216 } 217 218 219 /* 220 Wrapper functions for POSIX compatible regexp matching. 221 */ 222 223 int 224 tre_have_backrefs(const regex_t *preg) 225 { 226 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 227 return tnfa->have_backrefs; 228 } 229 230 #ifdef TRE_APPROX 231 int 232 tre_have_approx(const regex_t *preg) 233 { 234 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 235 return tnfa->have_approx; 236 } 237 #endif /* TRE_APPROX */ 238 239 static int 240 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, 241 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], 242 int eflags) 243 { 244 reg_errcode_t status; 245 tre_tag_t *tags = NULL; 246 int eo; 247 size_t offset = 0, count = 0; 248 if (tnfa->num_tags > 0 && nmatch > 0) 249 { 250 #ifdef TRE_USE_ALLOCA 251 tags = alloca(sizeof(*tags) * tnfa->num_tags); 252 #else /* !TRE_USE_ALLOCA */ 253 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 254 #endif /* !TRE_USE_ALLOCA */ 255 if (tags == NULL) 256 return REG_ESPACE; 257 } 258 259 if ( 260 (eflags & REG_STARTEND) && pmatch) 261 { 262 if (pmatch->rm_so < 0) 263 return REG_INVARG; 264 if (len == (size_t)-1) 265 { 266 if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo) 267 return REG_INVARG; 268 len = pmatch->rm_eo - pmatch->rm_so; 269 } 270 count = offset = pmatch->rm_so; 271 if (type == STR_WIDE) offset *= sizeof(wchar_t); 272 } 273 274 /* Dispatch to the appropriate matcher. */ 275 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) 276 { 277 /* The regex has back references, use the backtracking matcher. */ 278 status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type, 279 tags, eflags, &eo); 280 } 281 #ifdef TRE_APPROX 282 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) 283 { 284 /* The regex uses approximate matching, use the approximate matcher. */ 285 regamatch_t match; 286 regaparams_t params; 287 tre_regaparams_default(¶ms); 288 params.max_err = 0; 289 params.max_cost = 0; 290 status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags, 291 &match, params, eflags, &eo); 292 } 293 #endif /* TRE_APPROX */ 294 else 295 { 296 /* Exact matching, no back references, use the parallel matcher. */ 297 status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type, 298 tags, eflags, &eo); 299 } 300 301 if (status == REG_OK) 302 { 303 /* A match was found, so fill the submatch registers. */ 304 status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); 305 /* If doing REG_STARTEND, adjust the pmatch array (we can't build 306 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls 307 tre_fill_pmatch itself). */ 308 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && 309 (eflags & REG_STARTEND) && pmatch && nmatch > 0) 310 { 311 size_t i; 312 regmatch_t *p; 313 for (i = nmatch, p = pmatch; i > 0; p++, i--) 314 { 315 if (p->rm_so >= 0) p->rm_so += count; 316 if (p->rm_eo >= 0) p->rm_eo += count; 317 } 318 } 319 } 320 #ifndef TRE_USE_ALLOCA 321 if (tags) 322 xfree(tags); 323 #endif /* !TRE_USE_ALLOCA */ 324 return status; 325 } 326 327 int 328 tre_regnexec(const regex_t *preg, const char *str, size_t len, 329 size_t nmatch, regmatch_t pmatch[], int eflags) 330 { 331 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 332 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; 333 334 #ifdef TRE_USE_SYSTEM_REGEX_H 335 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 336 #endif /* TRE_USE_SYSTEM_REGEX_H */ 337 338 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); 339 } 340 341 int 342 tre_regexec(const regex_t *preg, const char *str, 343 size_t nmatch, regmatch_t pmatch[], int eflags) 344 { 345 return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); 346 } 347 348 349 #ifdef TRE_WCHAR 350 351 int 352 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, 353 size_t nmatch, regmatch_t pmatch[], int eflags) 354 { 355 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 356 357 #ifdef TRE_USE_SYSTEM_REGEX_H 358 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 359 #endif /* TRE_USE_SYSTEM_REGEX_H */ 360 361 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); 362 } 363 364 int 365 tre_regwexec(const regex_t *preg, const wchar_t *str, 366 size_t nmatch, regmatch_t pmatch[], int eflags) 367 { 368 return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); 369 } 370 371 #endif /* TRE_WCHAR */ 372 373 #ifdef TRE_APPROX 374 375 /* 376 Wrapper functions for approximate regexp matching. 377 */ 378 379 static int 380 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, 381 tre_str_type_t type, regamatch_t *match, regaparams_t params, 382 int eflags) 383 { 384 reg_errcode_t status; 385 tre_tag_t *tags = NULL; 386 int eo; 387 size_t offset = 0, count = 0; 388 389 /* If the regexp does not use approximate matching features, the 390 maximum cost is zero, and the approximate matcher isn't forced, 391 use the exact matcher instead. */ 392 if (params.max_cost == 0 && !tnfa->have_approx 393 && !(eflags & REG_APPROX_MATCHER)) 394 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, 395 eflags); 396 397 /* Back references are not supported by the approximate matcher. */ 398 if (tnfa->have_backrefs) 399 return REG_BADPAT; 400 401 if (tnfa->num_tags > 0 && match->nmatch > 0) 402 { 403 #if TRE_USE_ALLOCA 404 tags = alloca(sizeof(*tags) * tnfa->num_tags); 405 #else /* !TRE_USE_ALLOCA */ 406 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 407 #endif /* !TRE_USE_ALLOCA */ 408 if (tags == NULL) 409 return REG_ESPACE; 410 } 411 412 if ( 413 (eflags & REG_STARTEND) && match->pmatch) 414 { 415 if (match->pmatch->rm_so < 0) 416 return REG_INVARG; 417 if (len == (size_t)-1) 418 { 419 if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so > 420 match->pmatch->rm_eo) 421 return REG_INVARG; 422 len = match->pmatch->rm_eo - match->pmatch->rm_so; 423 } 424 count = offset = match->pmatch->rm_so; 425 if (type == STR_WIDE) offset *= sizeof(wchar_t); 426 } 427 428 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 429 match, params, eflags, &eo); 430 if (status == REG_OK) 431 { 432 status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, 433 tnfa, tags, eo); 434 /* If doing REG_STARTEND, adjust the pmatch array (we can't build 435 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call 436 tre_fill_pmatch itself). */ 437 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && 438 (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0) 439 { 440 size_t i; 441 regmatch_t *p; 442 for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--) 443 { 444 if (p->rm_so >= 0) p->rm_so += count; 445 if (p->rm_eo >= 0) p->rm_eo += count; 446 } 447 } 448 } 449 #ifndef TRE_USE_ALLOCA 450 if (tags) 451 xfree(tags); 452 #endif /* !TRE_USE_ALLOCA */ 453 return status; 454 } 455 456 int 457 tre_reganexec(const regex_t *preg, const char *str, size_t len, 458 regamatch_t *match, regaparams_t params, int eflags) 459 { 460 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 461 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; 462 463 #ifdef TRE_USE_SYSTEM_REGEX_H 464 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 465 #endif /* TRE_USE_SYSTEM_REGEX_H */ 466 467 return tre_match_approx(tnfa, str, len, type, match, params, eflags); 468 } 469 470 int 471 tre_regaexec(const regex_t *preg, const char *str, 472 regamatch_t *match, regaparams_t params, int eflags) 473 { 474 return tre_reganexec(preg, str, (size_t)-1, match, params, eflags); 475 } 476 477 #ifdef TRE_WCHAR 478 479 int 480 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, 481 regamatch_t *match, regaparams_t params, int eflags) 482 { 483 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 484 485 #ifdef TRE_USE_SYSTEM_REGEX_H 486 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 487 #endif /* TRE_USE_SYSTEM_REGEX_H */ 488 489 return tre_match_approx(tnfa, str, len, STR_WIDE, 490 match, params, eflags); 491 } 492 493 int 494 tre_regawexec(const regex_t *preg, const wchar_t *str, 495 regamatch_t *match, regaparams_t params, int eflags) 496 { 497 return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags); 498 } 499 500 #endif /* TRE_WCHAR */ 501 502 void 503 tre_regaparams_default(regaparams_t *params) 504 { 505 memset(params, 0, sizeof(*params)); 506 params->cost_ins = 1; 507 params->cost_del = 1; 508 params->cost_subst = 1; 509 params->max_cost = INT_MAX; 510 params->max_ins = INT_MAX; 511 params->max_del = INT_MAX; 512 params->max_subst = INT_MAX; 513 params->max_err = INT_MAX; 514 } 515 516 #endif /* TRE_APPROX */ 517 518 /* EOF */ 519