/* tre-parse.c - Regexp parser This software is released under a BSD-style license. See the file LICENSE for details and copyright. */ /* This parser is just a simple recursive descent parser for POSIX.2 regexps. The parser supports both the obsolete default syntax and the "extended" syntax, and some nonstandard extensions. */ #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include "xmalloc.h" #include "tre-mem.h" #include "tre-ast.h" #include "tre-stack.h" #include "tre-parse.h" #include "xlocale_private.h" #include "collate.h" /* BSD compatibility: Before looking up a collating symbol, check if the name matches in the character names (cnames) array; if so, use the corresponding character. Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve the implementation choice that for ERE, a non-numeric character following a left brace that would normally be a bound, causes the left brace to be literal. */ #define BSD_COMPATIBILITY #ifdef BSD_COMPATIBILITY #include "cname.h" #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND #endif /* BSD_COMPATIBILITY */ /* Characters with special meanings in regexp syntax. */ #define CHAR_PIPE L'|' #define CHAR_LPAREN L'(' #define CHAR_RPAREN L')' #define CHAR_LBRACE L'{' #define CHAR_RBRACE L'}' #define CHAR_LBRACKET L'[' #define CHAR_RBRACKET L']' #define CHAR_MINUS L'-' #define CHAR_STAR L'*' #define CHAR_QUESTIONMARK L'?' #define CHAR_PLUS L'+' #define CHAR_PERIOD L'.' #define CHAR_COLON L':' #define CHAR_EQUAL L'=' #define CHAR_COMMA L',' #define CHAR_CARET L'^' #define CHAR_DOLLAR L'$' #define CHAR_BACKSLASH L'\\' #define CHAR_HASH L'#' #define CHAR_TILDE L'~' /* Some macros for expanding \w, \s, etc. */ static const struct tre_macro_struct { const char c; const char *expansion; } tre_macros[] = { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, { 0, NULL } }; /* Expands a macro delimited by `regex' and `regex_end' to `buf', which must have at least `len' items. Sets buf[0] to zero if the there is no match in `tre_macros'. */ static void tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, tre_char_t *buf, size_t buf_len) { int i; buf[0] = 0; if (regex >= regex_end) return; for (i = 0; tre_macros[i].expansion; i++) { if (tre_macros[i].c == *regex) { unsigned int j; DPRINT(("Expanding macro '%c' => '%s'\n", tre_macros[i].c, tre_macros[i].expansion)); for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) buf[j] = tre_macros[i].expansion[j]; buf[j] = 0; break; } } } static reg_errcode_t tre_new_item(tre_mem_t mem, int type, int val, int *max_i, tre_bracket_match_list_t **items) { reg_errcode_t status = REG_OK; tre_bracket_match_list_t *array = *items; int i = array->num_bracket_matches; /* Allocate more space if necessary. */ if (i >= *max_i) { tre_bracket_match_list_t *new_items; DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i)); /* If the array is already 1024 items large, give up -- there's probably an error in the regexp (e.g. not a '\0' terminated string and missing ']') */ if (*max_i >= 1024) return REG_ESPACE; *max_i *= 2; new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i)); if (new_items == NULL) return REG_ESPACE; *items = array = new_items; } array->bracket_matches[i].type = type; array->bracket_matches[i].value = val; array->num_bracket_matches++; return status; } #ifndef TRE_USE_SYSTEM_WCTYPE /* isalnum() and the rest may be macros, so wrap them to functions. */ int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } #ifdef tre_isascii int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } #else /* !tre_isascii */ int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } #endif /* !tre_isascii */ #ifdef tre_isblank int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } #else /* !tre_isblank */ int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } #endif /* !tre_isblank */ int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } int tre_islower_func(tre_cint_t c) { return tre_islower(c); } int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); } int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } struct { char *name; int (*func)(tre_cint_t); } tre_ctype_map[] = { { "alnum", &tre_isalnum_func }, { "alpha", &tre_isalpha_func }, #ifdef tre_isascii { "ascii", &tre_isascii_func }, #endif /* tre_isascii */ #ifdef tre_isblank { "blank", &tre_isblank_func }, #endif /* tre_isblank */ { "cntrl", &tre_iscntrl_func }, { "digit", &tre_isdigit_func }, { "graph", &tre_isgraph_func }, { "lower", &tre_islower_func }, { "print", &tre_isprint_func }, { "punct", &tre_ispunct_func }, { "space", &tre_isspace_func }, { "upper", &tre_isupper_func }, { "xdigit", &tre_isxdigit_func }, { NULL, NULL} }; tre_ctype_t tre_ctype(const char *name) { int i; for (i = 0; tre_ctype_map[i].name != NULL; i++) { if (strcmp(name, tre_ctype_map[i].name) == 0) return tre_ctype_map[i].func; } return (tre_ctype_t)0; } #endif /* !TRE_USE_SYSTEM_WCTYPE */ #define REST(re) (int)(ctx->re_end - (re)), (re) #define START_COLLATING_SYMBOLS 16 #define MAX_COLLATING_SYMBOL_LEN 4 typedef struct { const tre_char_t *start; int len; } tre_collating_symbol; #ifdef BSD_COMPATIBILITY static wchar_t tre_search_cnames(const wchar_t *name, size_t len) { size_t low = 0; size_t high = NCNAMES - 1; size_t cur; int cmp; while(low <= high) { cur = (low + high) / 2; cmp = wcsncmp(name, cnames[cur].name, len); if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code; if (cmp > 0) low = cur + 1; else high = cur - 1; } return (wchar_t)-1; } #endif /* BSD_COMPATIBILITY */ /* Scan the contents of a bracket expression, and create a * tre_bracket_match_list_t encoding the bracket expression. If during * the scan, multi-character collating symbols are detected, switch * into a mode to collect those MCCSs into a tre_collating_symbol * list and pass them back. tre_parse_bracket will use that to * create a new string composed of a union of the bracket expression * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and * call tre_parse (recursive) to parse that new string (which will * call tre_parse_bracket and tre_parse_bracket_items again. */ static reg_errcode_t tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items, int *items_size, tre_collating_symbol **result) { const tre_char_t *re = ctx->re; const tre_char_t *re_end = ctx->re_end; tre_collating_symbol *col_syms = NULL; tre_collating_symbol *cp = NULL; int n_col_syms = 0; reg_errcode_t status; int max_i = *items_size; int other = 0; /* contains content other than multi-character collating * symbols */ int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */ tre_cint_t min, c; int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE); int collect_MCCS = 0; const tre_char_t *start; for ( ;re < re_end; re++) { switch (*re) { case CHAR_MINUS: /* A first hyphen */ if (re == ctx->re) { DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); min = CHAR_MINUS; other++; range = 0; break; } /* The hyphen is the end range */ if (range > 0) { DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); c = CHAR_MINUS; goto process_end_range; } if (re + 1 >= re_end) { status = REG_EBRACK; goto error; } /* The hyphen is at the end */ if (re[1] == CHAR_RBRACKET) { DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); c = CHAR_MINUS; goto process_begin_range; } /* Two ranges are not allowed to share an endpoint, or begin * range is illegal. */ if (range < 0) { status = REG_ERANGE; goto error; } range = 1; /* Expect end range */ DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); break; case CHAR_LBRACKET: if (re + 1 >= re_end) { status = REG_EBRACK; goto error; } switch (re[1]) { case CHAR_PERIOD: { re += 2; start = re; for (;; re++) { if (re >= re_end) { status = REG_ECOLLATE; goto error; } if (*re == CHAR_PERIOD) { if (re + 1 >= re_end) { status = REG_ECOLLATE; goto error; } /* Found end */ if (re[1] == CHAR_RBRACKET) { DPRINT(("tre_parse_bracket: collating " "symbol: '%.*" STRF "'\n", REST(start - 2))); /* Empty name */ if (re == start) { status = REG_ECOLLATE; goto error; } #ifdef BSD_COMPATIBILITY /* Check if the name is in cnames; if so, use the corresponding code */ c = tre_search_cnames(start, re - start); if (c != (wchar_t)-1) { re++; goto process_single_character; } #endif /* BSD_COMPATIBILITY */ /* Verify this is a known sequence */ if (__collate_equiv_value(ctx->loc, start, re - start) <= 0) { status = REG_ECOLLATE; goto error; } /* Process single character collating symbols */ if (re - start == 1) { c = *start; re++; goto process_single_character; } /* Inverted MCCSs are undefined */ if (invert) { status = REG_ECOLLATE; goto error; } /* Can't have MCCSs as an endpoint to a range */ if (range > 0) { status = REG_ERANGE; goto error; } range = -1; /* Switch into MCCS collection mode (if not * already there */ #if TRE_DEBUG if (!collect_MCCS) { collect_MCCS = 1; DPRINT(("tre_parse_bracket: Detected MCCS\n")); } #else /* !TRE_DEBUG */ collect_MCCS = 1; #endif /* !TRE_DEBUG */ /* Allocate a memory block the first time */ if (!cp) { if ((col_syms = xmalloc(sizeof(*col_syms) * (START_COLLATING_SYMBOLS + 2))) == NULL) return REG_ESPACE; cp = col_syms + 1; n_col_syms = START_COLLATING_SYMBOLS; } /* Enlarge the memory block is more is needed */ if ((cp - col_syms) - 1 >= n_col_syms) { int i = n_col_syms; tre_collating_symbol *tmp = xrealloc(col_syms, sizeof(*col_syms) * ((n_col_syms *= 2) + 2)); if (tmp == NULL) { xfree(col_syms); return REG_ESPACE; } DPRINT(("tre_list_collating_symbols: " "Enlarging col_syms to %d\n", n_col_syms)); col_syms = tmp; cp = col_syms + i + 1; } cp->start = start; cp->len = re - start; cp++; re++; break; } } } break; } case CHAR_EQUAL: case CHAR_COLON: { /* Process equivalence and character classes */ tre_char_t kind = re[1]; /* Can't have a class as an endpoint to a range */ if (range > 0) { status = REG_ERANGE; goto error; } if (!collect_MCCS && range == 0) { status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, min, &max_i, items); if (status != REG_OK) goto error; } range = -1; re += 2; start = re; for (;; re++) { if (re >= re_end) { status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; goto error; } if (*re == kind) { if (re + 1 >= re_end) { status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; goto error; } /* Found end */ if (re[1] == CHAR_RBRACKET) { if (re == start) { /* Empty class name */ status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; goto error; } /* Process equivalence class */ if (kind == CHAR_EQUAL) { int equiv; DPRINT(("tre_parse_bracket: equivalence: '%.*" STRF "'\n", REST(start - 2))); /* While we find the collation value even for multi-character collating elements , we don't (yet) match any collation values against multi-character sequences. We'd have to enumerate those multi-character sequences and like multi-character collating symbols, create a union of those sequences with the rest of the bracket expression. While doable, a bracket expression matching multiple characters, that doesn't explicitly contain multi-character sequences, might be unexpected, so we punt for now. */ if ((equiv = __collate_equiv_value(ctx->loc, start, re - start)) <= 0) { /* The standard says that if no collating element if found, we use the collating symbol itself. But __collate_equiv_value doesn't make a distinction between an element that is in a equvalence class with others, or is the only member, so we already know there is no collating symbol. (Note that in the case of a collating element whose collation value is unique, matching against the collating element itself, or against its collation value, is equivalent.) */ #ifdef BSD_COMPATIBILITY /* Check if the name is in cnames; if so, use the corresponding code */ c = tre_search_cnames(start, re - start); if (c != (wchar_t)-1) { re++; goto process_single_character; } #endif /* BSD_COMPATIBILITY */ status = REG_ECOLLATE; goto error; } if (!collect_MCCS) { status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, equiv, &max_i, items); if (status != REG_OK) goto error; } } else { /* Process character class */ DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(start - 2))); if (!collect_MCCS) { char tmp_str[64]; tre_ctype_t class; int len = MIN(re - start, 63); #ifdef TRE_WCHAR { tre_char_t tmp_wcs[64]; wcsncpy(tmp_wcs, start, (size_t)len); tmp_wcs[len] = L'\0'; #if defined HAVE_WCSRTOMBS { mbstate_t state; const tre_char_t *src = tmp_wcs; memset(&state, '\0', sizeof(state)); len = wcsrtombs_l(tmp_str, &src, sizeof(tmp_str), &state, ctx->loc); } #elif defined HAVE_WCSTOMBS len = wcstombs(tmp_str, tmp_wcs, 63); #endif /* defined HAVE_WCSTOMBS */ } #else /* !TRE_WCHAR */ strncpy(tmp_str, (const char*)start, len); #endif /* !TRE_WCHAR */ tmp_str[len] = '\0'; DPRINT((" class name: %s\n", tmp_str)); class = tre_ctype_l(tmp_str, ctx->loc); if (!class) { status = REG_ECTYPE; goto error; } status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CLASS, class, &max_i, items); if (status != REG_OK) goto error; } } re++; break; } } } other++; break; } default: DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); c = CHAR_LBRACKET; goto process_single_character; break; } break; case CHAR_RBRACKET: /* A first right bracket */ if (re == ctx->re) { DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); min = CHAR_RBRACKET; range = 0; other++; break; } /* Done */ if (collect_MCCS) { DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); if (col_syms) { /* Mark the character following the right bracket. Set len * to whether there are other things besides the * multi-character collating symbols */ col_syms->start = re + 1; col_syms->len = other; /* Mark the end of the list */ cp->start = NULL; } *result = col_syms; return REG_OK; } /* range > 0 is not possible, since we did a lookahead after the * hyphen */ if (range == 0) { status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, min, &max_i, items); if (status != REG_OK) goto error; } DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); *items_size = max_i; ctx->re = re + 1; return REG_OK; default: DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); c = *re; process_single_character: /* Process single character */ if (range > 0) { int mine, maxe; process_end_range: /* Get collation equivalence values */ mine = __collate_equiv_value(ctx->loc, &min, 1); maxe = __collate_equiv_value(ctx->loc, &c, 1); if (maxe < mine) { status = REG_ERANGE; goto error; } if (!collect_MCCS) { status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, mine, &max_i, items); if (status != REG_OK) goto error; status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_RANGE_END, maxe, &max_i, items); if (status != REG_OK) goto error; } range = -1; } else { process_begin_range: if (!collect_MCCS) { if (range == 0) { status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, min, &max_i, items); if (status != REG_OK) goto error; } min = c; } range = 0; } other++; break; } } status = REG_EBRACK; error: DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n", REST(re), status)); if (col_syms) xfree(col_syms); return status; } #ifdef TRE_DEBUG static const char *bracket_match_type_str[] = { "unused", "char", "range begin", "range end", "class", "equivalence value", }; #endif /* TRE_DEBUG */ static reg_errcode_t tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { tre_ast_node_t *node; reg_errcode_t status = REG_OK; tre_bracket_match_list_t *items; int max_i = 32; tre_collating_symbol *col_syms = NULL; /* Handle special cases [[:<:]] and [[:>:]] */ if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>') && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET && ctx->re[5] == CHAR_RBRACKET) { *result = tre_ast_new_literal(ctx->mem, ASSERTION, (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW, -1); DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ? "[[:<:]]" : "[[:>:]]")); ctx->re += 6; return *result ? REG_OK : REG_ESPACE; } /* Start off with an array of `max_i' elements. */ items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i)); if (items == NULL) return REG_ESPACE; if (*ctx->re == CHAR_CARET) { DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE; ctx->re++; } status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms); if (status != REG_OK) goto parse_bracket_done; /* If there are collating symbols, split off the multi-character ones * into a union of the bracket expression (without the collating symbols) * and the multiple-character sequences. We create an equivalent input * string and run tre_parse() recursively */ if (col_syms) { tre_char_t *str, *sp; tre_collating_symbol *cp; tre_parse_ctx_t subctx; /* Allocate a new string. We start with the size of the original * bracket expression (minus 1) and add 2 (for a leading "[" and * a trailing nil; don't need a "^", since it is illegal to have * inverted MCCSs). Since a multi-character collating symbols * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't * need to worry about the new string getting too long. */ xfree(items); str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2)); if (str == NULL) { xfree(col_syms); return REG_ESPACE; } sp = str; if (col_syms->len > 0) { /* There are other items in the bracket expression besides the * multi-character collating symbols, so create a new bracket * expression with only those other itmes. */ const tre_char_t *re; ptrdiff_t i; *sp++ = '['; re = ctx->re; for (cp = col_syms + 1; cp->start; cp++) { /* The "- 2" is to account for the "[." */ if ((i = ((cp->start - re) - 2)) > 0) { memcpy(sp, re, sizeof(*sp) * i); sp += i; } /* The "+ 2" is to account for the ".]" */ re = cp->start + cp->len + 2; } i = col_syms->start - re; /* Includes the trailing right bracket */ memcpy(sp, re, sizeof(*sp) * i); sp += i; *sp++ = '|'; } for (cp = col_syms + 1; cp->start; cp++) { memcpy(sp, cp->start, sizeof(*sp) * cp->len); sp += cp->len; if (cp[1].start) *sp++ = '|'; } *sp = 0; DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n", str)); memcpy(&subctx, ctx, sizeof(subctx)); subctx.re = str; subctx.len = sp - str; subctx.nofirstsub = 1; subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */ status = tre_parse(&subctx); xfree(str); if (status != REG_OK) { xfree(col_syms); return status; } ctx->re = col_syms->start; ctx->position = subctx.position; xfree(col_syms); *result = subctx.result; DPRINT(("tre_parse_bracket: Returning to original string\n")); return REG_OK; } DPRINT(("tre_parse_bracket: creating bracket expression literal\n")); node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position); if (node == NULL) { status = REG_ESPACE; goto parse_bracket_done; } else { tre_literal_t *l = node->obj; l->u.bracket_match_list = tre_mem_alloc(ctx->mem, SIZEOF_BRACKET_MATCH_LIST(items)); if (l->u.bracket_match_list == NULL) { status = REG_ESPACE; goto parse_bracket_done; } memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items)); } #ifdef TRE_DEBUG { int i; tre_bracket_match_t *b; DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n", items->num_bracket_matches, items->flags)); for (i = 0, b = items->bracket_matches; i < items->num_bracket_matches; i++, b++) { DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type], b->value)); } } #endif /* TRE_DEBUG */ parse_bracket_done: xfree(items); ctx->position++; *result = node; return status; } /* Parses a positive decimal integer. Returns -1 if the string does not contain a valid number. */ static int tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) { int num = -1; const tre_char_t *r = *regex; while (r < regex_end && *r >= L'0' && *r <= L'9') { if (num < 0) num = 0; num = num * 10 + *r - L'0'; r++; } *regex = r; return num; } static reg_errcode_t tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { int min, max; #ifdef TRE_APPROX int i; int cost_ins, cost_del, cost_subst, cost_max; int limit_ins, limit_del, limit_subst, limit_err; const tre_char_t *start; #endif /* TRE_APPROX */ const tre_char_t *r = ctx->re; int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; #ifdef TRE_APPROX int approx = 0; int costs_set = 0; int counts_set = 0; cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; #endif /* TRE_APPROX */ /* Parse number (minimum repetition count). */ min = -1; if (r >= ctx->re_end) #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE; #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ return REG_EBRACE; #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ if (*r >= L'0' && *r <= L'9') { DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); min = tre_parse_int(&r, ctx->re_end); } #ifndef TRE_APPROX else #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND /* For ERE, return REG_NOMATCH to signal that the lbrace should be treated as a literal */ return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR; #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ return REG_BADBR; #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ #endif /* !TRE_APPROX */ /* Parse comma and second number (maximum repetition count). */ max = min; if (r < ctx->re_end && *r == CHAR_COMMA) { r++; DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); max = tre_parse_int(&r, ctx->re_end); } /* Check that the repeat counts are sane. */ if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX) return REG_BADBR; #ifdef TRE_APPROX /* '{' optionally followed immediately by a number == minimum repcount optionally followed by , then a number == maximum repcount + then a number == maximum insertion count - then a number == maximum deletion count # then a number == maximum substitution count ~ then a number == maximum number of errors Any of +, -, # or ~ without followed by a number means that the maximum count/number of errors is infinite. An equation of the form Xi + Yd + Zs < C can be specified to set costs and the cost limit to a value different from the default value: - X is the cost of an insertion - Y is the cost of a deletion - Z is the cost of a substitution - C is the maximum cost If no count limit or cost is set for an operation, the operation is not allowed at all. */ do { int done; start = r; /* Parse count limit settings */ done = 0; if (!counts_set) while (r + 1 < ctx->re_end && !done) { switch (*r) { case CHAR_PLUS: /* Insert limit */ DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); r++; limit_ins = tre_parse_int(&r, ctx->re_end); if (limit_ins < 0) limit_ins = INT_MAX; counts_set = 1; break; case CHAR_MINUS: /* Delete limit */ DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); r++; limit_del = tre_parse_int(&r, ctx->re_end); if (limit_del < 0) limit_del = INT_MAX; counts_set = 1; break; case CHAR_HASH: /* Substitute limit */ DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); r++; limit_subst = tre_parse_int(&r, ctx->re_end); if (limit_subst < 0) limit_subst = INT_MAX; counts_set = 1; break; case CHAR_TILDE: /* Maximum number of changes */ DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); r++; limit_err = tre_parse_int(&r, ctx->re_end); if (limit_err < 0) limit_err = INT_MAX; approx = 1; break; case CHAR_COMMA: r++; break; case L' ': r++; break; case L'}': done = 1; break; default: done = 1; break; } } /* Parse cost restriction equation. */ done = 0; if (!costs_set) while (r + 1 < ctx->re_end && !done) { switch (*r) { case CHAR_PLUS: case L' ': r++; break; case L'<': DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); r++; while (*r == L' ') r++; cost_max = tre_parse_int(&r, ctx->re_end); if (cost_max < 0) cost_max = INT_MAX; else cost_max--; approx = 1; break; case CHAR_COMMA: r++; done = 1; break; default: if (*r >= L'0' && *r <= L'9') { #ifdef TRE_DEBUG const tre_char_t *sr = r; #endif /* TRE_DEBUG */ int cost = tre_parse_int(&r, ctx->re_end); /* XXX - make sure r is not past end. */ switch (*r) { case L'i': /* Insert cost */ DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", REST(sr))); r++; cost_ins = cost; costs_set = 1; break; case L'd': /* Delete cost */ DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", REST(sr))); r++; cost_del = cost; costs_set = 1; break; case L's': /* Substitute cost */ DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", REST(sr))); r++; cost_subst = cost; costs_set = 1; break; default: return REG_BADBR; } } else { done = 1; break; } } } } while (start != r); #endif /* TRE_APPROX */ /*{*//* Missing }. */ if (r >= ctx->re_end) return REG_EBRACE; /* Empty contents of {}. */ if (r == ctx->re) return REG_BADBR; /* Parse the ending '}' or '\}'.*/ if (ctx->cflags & REG_EXTENDED) { if (r >= ctx->re_end || *r != CHAR_RBRACE) return REG_BADBR; r++; /* Parse trailing '?' marking minimal repetition. */ if (r < ctx->re_end) { if (*r == CHAR_QUESTIONMARK) { /* Process the question mark only in enhanced mode. Otherwise, the question mark is an error in ERE or a literal in BRE */ if (ctx->cflags & REG_ENHANCED) { minimal = !(ctx->cflags & REG_UNGREEDY); r++; } else return REG_BADRPT; } else if (*r == CHAR_STAR || *r == CHAR_PLUS) { /* These are reserved for future extensions. */ return REG_BADRPT; } } } else { if (r + 1 >= ctx->re_end || *r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE) return REG_BADBR; r += 2; if (r < ctx->re_end && *r == CHAR_STAR) { /* This is reserved for future extensions. */ return REG_BADRPT; } } if (minimal) ctx->num_reorder_tags++; if (!result) goto parse_bound_exit; /* Create the AST node(s). */ /* Originally, if min == 0 && max == 0, we immediately replace the whole iteration with EMPTY. This unfortunately drops any submatches, and messes up setting the pmatch values (we can get tags of -1, and tag values in the billions). So we leave it and process this case as usual, and wait until tre_expand_ast() to replace with EMPTY */ #ifdef TRE_APPROX if (min < 0 && max < 0) /* Only approximate parameters set, no repetitions. */ min = max = 1; #endif /* TRE_APPROX */ *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); if (!*result) return REG_ESPACE; #ifdef TRE_APPROX /* If approximate matching parameters are set, add them to the iteration node. */ if (approx || costs_set || counts_set) { int *params; tre_iteration_t *iter = (*result)->obj; if (costs_set || counts_set) { if (limit_ins == TRE_PARAM_UNSET) { if (cost_ins == TRE_PARAM_UNSET) limit_ins = 0; else limit_ins = INT_MAX; } if (limit_del == TRE_PARAM_UNSET) { if (cost_del == TRE_PARAM_UNSET) limit_del = 0; else limit_del = INT_MAX; } if (limit_subst == TRE_PARAM_UNSET) { if (cost_subst == TRE_PARAM_UNSET) limit_subst = 0; else limit_subst = INT_MAX; } } if (cost_max == TRE_PARAM_UNSET) cost_max = INT_MAX; if (limit_err == TRE_PARAM_UNSET) limit_err = INT_MAX; ctx->have_approx = 1; params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); if (!params) return REG_ESPACE; for (i = 0; i < TRE_PARAM_LAST; i++) params[i] = TRE_PARAM_UNSET; params[TRE_PARAM_COST_INS] = cost_ins; params[TRE_PARAM_COST_DEL] = cost_del; params[TRE_PARAM_COST_SUBST] = cost_subst; params[TRE_PARAM_COST_MAX] = cost_max; params[TRE_PARAM_MAX_INS] = limit_ins; params[TRE_PARAM_MAX_DEL] = limit_del; params[TRE_PARAM_MAX_SUBST] = limit_subst; params[TRE_PARAM_MAX_ERR] = limit_err; iter->params = params; } #endif /* TRE_APPROX */ parse_bound_exit: #ifdef TRE_APPROX DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " "limits [%d,%d,%d, total %d]\n", min, max, cost_ins, cost_del, cost_subst, cost_max, limit_ins, limit_del, limit_subst, limit_err)); #else /* !TRE_APPROX */ DPRINT(("tre_parse_bound: min %d, max %d\n", min, max)); #endif /* !TRE_APPROX */ ctx->re = r; return REG_OK; } /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for non-self-contained options, like (?i), this causes ((?i)fu)bar to be treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect. Because we now set up tags for even non-capturing parenthesized subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and have it restore cflags after the subexpression, we don't need to have a separate PARSE_RESTORE_CFLAGS, and then after processing the non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE. This has the side-benefit of now matching the perl behavior: the RE foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior of foo AND (?i) (bar OR zap). */ typedef enum { PARSE_RE = 0, PARSE_ATOM, PARSE_MARK_FOR_SUBMATCH, PARSE_BRANCH, PARSE_PIECE, PARSE_CATENATION, PARSE_POST_CATENATION, PARSE_UNION, PARSE_POST_UNION, PARSE_POSTFIX, } tre_parse_re_stack_symbol_t; reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) { tre_ast_node_t *result = NULL; tre_parse_re_stack_symbol_t symbol; reg_errcode_t status = REG_OK; tre_stack_t *stack = ctx->stack; int bottom = tre_stack_num_objects(stack); int depth = 0; int temporary_cflags = 0; int bre_branch_begin; #ifdef TRE_DEBUG const tre_char_t *tmp_re; #endif DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n", ctx->len, ctx->re, ctx->len, ctx->cflags)); if (ctx->len <= 0) return REG_EMPTY; if (!ctx->nofirstsub) { STACK_PUSH(stack, int, ctx->cflags); STACK_PUSH(stack, int, ctx->submatch_id); STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); ctx->submatch_id++; } STACK_PUSH(stack, int, 0); // bre_branch_begin STACK_PUSH(stack, int, PARSE_RE); ctx->re_start = ctx->re; ctx->re_end = ctx->re + ctx->len; /* The following is basically just a recursive descent parser. I use an explicit stack instead of recursive functions mostly because of two reasons: compatibility with systems which have an overflowable call stack, and efficiency (both in lines of code and speed). */ while (tre_stack_num_objects(stack) > bottom) { symbol = tre_stack_pop_int(stack); switch (symbol) { case PARSE_RE: /* Parse a full regexp. A regexp is one or more branches, separated by the union operator `|'. */ bre_branch_begin = tre_stack_pop_int(stack); if ( #ifdef REG_LITERAL !(ctx->cflags & REG_LITERAL) && #endif /* REG_LITERAL */ ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) STACK_PUSHX(stack, int, PARSE_UNION); STACK_PUSHX(stack, int, bre_branch_begin); STACK_PUSHX(stack, int, PARSE_BRANCH); break; case PARSE_BRANCH: /* Parse a branch. A branch is one or more pieces, concatenated. A piece is an atom possibly followed by a postfix operator. */ bre_branch_begin = tre_stack_pop_int(stack); STACK_PUSHX(stack, int, PARSE_CATENATION); STACK_PUSHX(stack, int, bre_branch_begin); STACK_PUSHX(stack, int, PARSE_PIECE); break; case PARSE_PIECE: /* Parse a piece. A piece is an atom possibly followed by one or more postfix operators. */ bre_branch_begin = tre_stack_pop_int(stack); STACK_PUSHX(stack, int, PARSE_POSTFIX); STACK_PUSHX(stack, int, bre_branch_begin); STACK_PUSHX(stack, int, PARSE_ATOM); break; case PARSE_CATENATION: /* If the expression has not ended, parse another piece. */ { tre_char_t c; if (ctx->re >= ctx->re_end) break; c = *ctx->re; #ifdef REG_LITERAL if (!(ctx->cflags & REG_LITERAL)) { #endif /* REG_LITERAL */ if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) || ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_PIPE)) break; if ((ctx->cflags & REG_EXTENDED && c == CHAR_RPAREN && depth > 0) || (!(ctx->cflags & REG_EXTENDED) && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_RPAREN)) { if (!(ctx->cflags & REG_EXTENDED) && depth == 0) return REG_EPAREN; DPRINT(("tre_parse: group end: '%.*" STRF "'\n", REST(ctx->re))); depth--; if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED))) ctx->re += 2; break; } #ifdef REG_LITERAL } #endif /* REG_LITERAL */ #ifdef REG_LEFT_ASSOC if (ctx->cflags & REG_LEFT_ASSOC) { /* Left associative concatenation. */ STACK_PUSHX(stack, int, PARSE_CATENATION); STACK_PUSHX(stack, voidptr, result); STACK_PUSHX(stack, int, PARSE_POST_CATENATION); STACK_PUSHX(stack, int, 0); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_PIECE); } else #endif /* REG_LEFT_ASSOC */ { /* Default case, right associative concatenation. */ STACK_PUSHX(stack, voidptr, result); STACK_PUSHX(stack, int, PARSE_POST_CATENATION); STACK_PUSHX(stack, int, PARSE_CATENATION); STACK_PUSHX(stack, int, 0); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_PIECE); } break; } case PARSE_POST_CATENATION: { tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); tre_ast_node_t *tmp_node; tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); if (!tmp_node) return REG_ESPACE; result = tmp_node; break; } case PARSE_UNION: if (ctx->re >= ctx->re_end) break; #ifdef REG_LITERAL if (ctx->cflags & REG_LITERAL) break; #endif /* REG_LITERAL */ if (!(ctx->cflags & REG_EXTENDED)) { if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end) break; ctx->re++; } switch (*ctx->re) { case CHAR_PIPE: DPRINT(("tre_parse: union: '%.*" STRF "'\n", REST(ctx->re))); STACK_PUSHX(stack, int, PARSE_UNION); STACK_PUSHX(stack, voidptr, (void *)ctx->re); STACK_PUSHX(stack, voidptr, result); STACK_PUSHX(stack, int, PARSE_POST_UNION); /* We need to pass a boolean (eventually) to PARSE_ATOM to indicate if this is the beginning of a BRE extended branch. */ STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_BRANCH); ctx->re++; break; case CHAR_RPAREN: ctx->re++; break; default: if (!(ctx->cflags & REG_EXTENDED)) ctx->re--; break; } break; case PARSE_POST_UNION: { tre_ast_node_t *tmp_node; tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); const tre_char_t *pipechar = tre_stack_pop_voidptr(stack); /* error on empty expression at end of union */ if (pipechar == ctx->re - 1) { return REG_EMPTY; } tmp_node = tre_ast_new_union(ctx->mem, tree, result); if (!tmp_node) return REG_ESPACE; result = tmp_node; break; } case PARSE_POSTFIX: /* Parse postfix operators. */ if (ctx->re >= ctx->re_end) break; #ifdef REG_LITERAL if (ctx->cflags & REG_LITERAL) break; #endif /* REG_LITERAL */ int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; int rep_min = 0; int rep_max = -1; #ifdef TRE_DEBUG int lbrace_off; #endif switch (*ctx->re) { case CHAR_PLUS: case CHAR_QUESTIONMARK: if (!(ctx->cflags & REG_EXTENDED)) break; /*FALLTHROUGH*/ case CHAR_STAR: { tre_ast_node_t *tmp_node; #ifdef TRE_DEBUG const char *tstr = "star"; tmp_re = ctx->re; #endif handle_plus_or_question: /* error on iteration of raw assertion (not in subexpression) */ if (result->type == LITERAL && result->submatch_id < 0 && IS_ASSERTION((tre_literal_t *)result->obj)) { if (!(ctx->cflags & REG_EXTENDED)) break; return REG_BADRPT; } if (*ctx->re == CHAR_PLUS) { rep_min = 1; #ifdef TRE_DEBUG tstr = "plus"; #endif } if (*ctx->re == CHAR_QUESTIONMARK) { rep_max = 1; #ifdef TRE_DEBUG tstr = "questionmark"; #endif } if (ctx->cflags & REG_EXTENDED) { if (ctx->re + 1 < ctx->re_end) { if (*(ctx->re + 1) == CHAR_QUESTIONMARK) { /* Process the question mark only in enhanced mode. Otherwise, the question mark is an error in ERE */ if (ctx->cflags & REG_ENHANCED) { minimal = !(ctx->cflags & REG_UNGREEDY); ctx->re++; } else return REG_BADRPT; } else if (*(ctx->re + 1) == CHAR_STAR || *(ctx->re + 1) == CHAR_PLUS) { /* These are reserved for future extensions. */ return REG_BADRPT; } } } else { if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR) { /* This is reserved for future extensions. */ return REG_BADRPT; } if (ctx->re + 2 < ctx->re_end) { if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK) { /* Process the question mark only in enhanced mode. Otherwise, the question mark is a literal in BRE */ if (ctx->cflags & REG_ENHANCED) { minimal = !(ctx->cflags & REG_UNGREEDY); ctx->re += 2; } } else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS) { /* This is reserved for future extensions. */ return REG_BADRPT; } } } if (minimal) ctx->num_reorder_tags++; DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n", minimal ? " minimal" : "greedy", tstr, REST(tmp_re))); if (result == NULL) { if (ctx->cflags & REG_EXTENDED) return REG_BADRPT; else goto parse_literal; } ctx->re++; tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, minimal); if (tmp_node == NULL) return REG_ESPACE; result = tmp_node; /* Set the iterator with a submatch id in the invisible range * (which will be overridden if a real submatch is needed) */ result->submatch_id = ctx->submatch_id_invisible++; #if 0 /* We don't allow multiple postfixes, but this might be needed to support approximate matching */ STACK_PUSHX(stack, int, PARSE_POSTFIX); #endif } break; case CHAR_BACKSLASH: /* "\{" is special without REG_EXTENDED */ /* "\+" and "\?" are special with REG_ENHANCED for BRE */ if (!(ctx->cflags & REG_EXTENDED) && ctx->re + 1 < ctx->re_end) { switch (*(ctx->re + 1)) { case CHAR_LBRACE: ctx->re++; #ifdef TRE_DEBUG lbrace_off = 2; #endif goto parse_brace; case CHAR_PLUS: case CHAR_QUESTIONMARK: if (ctx->cflags & REG_ENHANCED) { #ifdef TRE_DEBUG tmp_re = ctx->re; #endif ctx->re++; goto handle_plus_or_question; } break; } break; } else break; case CHAR_LBRACE: { int raw_assertion; /* "{" is literal without REG_EXTENDED */ if (!(ctx->cflags & REG_EXTENDED)) break; #ifdef TRE_DEBUG lbrace_off = 1; #endif parse_brace: /* error on iteration of raw assertion (not in subexpression), but wait until after parsing bounds */ raw_assertion = (result->type == LITERAL && result->submatch_id < 0 && IS_ASSERTION((tre_literal_t *)result->obj)); ctx->re++; status = tre_parse_bound(ctx, &result); #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND /* For ERE, if status is REG_NOMATCH, this mean the lbrace is to be treated as a literal. */ if (status == REG_NOMATCH) { ctx->re--; break; } #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ DPRINT(("tre_parse: bound: '%.*" STRF "'\n", REST(ctx->re - lbrace_off))); if (status != REG_OK) return status; if (raw_assertion) return REG_BADRPT; /* Set the iterator with a submatch id in the invisible range * (which will be overridden if a real submatch is needed) */ if (result->type == ITERATION) result->submatch_id = ctx->submatch_id_invisible++; #if 0 /* We don't allow multiple postfixes, but this might be needed to support approximate matching */ STACK_PUSHX(stack, int, PARSE_POSTFIX); #endif break; } } break; case PARSE_ATOM: { /* Parse an atom. An atom is a regular expression enclosed in `()', an empty set of `()', a bracket expression, `.', `^', `$', a `\' followed by a character, or a single character. */ /* The stack contains a boolean value, whether PARSE_ATOM is being called just after the start of a group (left paren) in a BRE */ bre_branch_begin = tre_stack_pop_int(stack); /* End of regexp? (empty string). */ if (ctx->re >= ctx->re_end) goto parse_literal; #ifdef REG_LITERAL if (ctx->cflags & REG_LITERAL) goto parse_literal; #endif /* REG_LITERAL */ switch (*ctx->re) { case CHAR_LPAREN: /* parenthesized subexpression */ /* Handle "(?...)" extensions. They work in a way similar to Perls corresponding extensions. */ if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) == (REG_EXTENDED|REG_ENHANCED) && *(ctx->re + 1) == CHAR_QUESTIONMARK) { int new_cflags = ctx->cflags; int bit = 1; int invisible_submatch = 0; DPRINT(("tre_parse: extension: '%.*" STRF "'\n", REST(ctx->re))); ctx->re += 2; while (/*CONSTCOND*/1) { if (*ctx->re == L'i') { DPRINT(("tre_parse: icase: '%.*" STRF "'\n", REST(ctx->re))); if (bit) new_cflags |= REG_ICASE; else new_cflags &= ~REG_ICASE; ctx->re++; } else if (*ctx->re == L'n') { DPRINT(("tre_parse: newline: '%.*" STRF "'\n", REST(ctx->re))); if (bit) new_cflags |= REG_NEWLINE; else new_cflags &= ~REG_NEWLINE; ctx->re++; } #ifdef REG_LEFT_ASSOC else if (*ctx->re == L'l') { DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n", REST(ctx->re))); if (bit) new_cflags |= REG_LEFT_ASSOC; else new_cflags &= ~REG_LEFT_ASSOC; ctx->re++; } #endif /* REG_LEFT_ASSOC */ #ifdef REG_UNGREEDY else if (*ctx->re == L'U') { DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n", REST(ctx->re))); if (bit) new_cflags |= REG_UNGREEDY; else new_cflags &= ~REG_UNGREEDY; ctx->re++; } #endif /* REG_UNGREEDY */ else if (*ctx->re == CHAR_MINUS) { DPRINT(("tre_parse: turn off: '%.*" STRF "'\n", REST(ctx->re))); ctx->re++; bit = 0; } else if (*ctx->re == CHAR_COLON) { DPRINT(("tre_parse: no group: '%.*" STRF "', (invisible submatch %d)\n", REST(ctx->re), ctx->submatch_id_invisible)); ctx->re++; depth++; invisible_submatch = 1; break; } else if (*ctx->re == CHAR_HASH) { DPRINT(("tre_parse: comment: '%.*" STRF "'\n", REST(ctx->re))); /* A comment can contain any character except a right parenthesis */ while (*ctx->re != CHAR_RPAREN && ctx->re < ctx->re_end) ctx->re++; if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) { ctx->re++; break; } else return REG_BADPAT; } else if (*ctx->re == CHAR_RPAREN) { ctx->re++; break; } else return REG_BADRPT; } /* Turn on the cflags changes for the rest of the enclosing group. */ if (invisible_submatch) { STACK_PUSHX(stack, int, ctx->cflags); STACK_PUSHX(stack, int, ctx->submatch_id_invisible); STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); ctx->submatch_id_invisible++; STACK_PUSHX(stack, int, 0); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_RE); } else { STACK_PUSHX(stack, int, 0); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_ATOM); } ctx->cflags = new_cflags; break; } if (ctx->cflags & REG_EXTENDED) { parse_bre_lparen: DPRINT(("tre_parse: group begin: '%.*" STRF "', submatch %d\n", REST(ctx->re), ctx->submatch_id)); ctx->re++; /* First parse a whole RE, then mark the resulting tree for submatching. */ STACK_PUSHX(stack, int, ctx->cflags); STACK_PUSHX(stack, int, ctx->submatch_id); STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); /* We need to pass a boolean (eventually) to PARSE_ATOM to indicate if this is the beginning of a BRE group. */ STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED)); STACK_PUSHX(stack, int, PARSE_RE); ctx->submatch_id++; depth++; } else goto parse_literal; break; case CHAR_RPAREN: /* end of current subexpression */ if (ctx->cflags & REG_EXTENDED && depth > 0) { parse_bre_rparen_empty: if (!(ctx->cflags & REG_EXTENDED) && depth == 0) return REG_EPAREN; DPRINT(("tre_parse: empty: '%.*" STRF "'\n", REST(ctx->re))); /* We were expecting an atom, but instead the current subexpression was closed. POSIX leaves the meaning of this to be implementation-defined. We interpret this as an empty expression (which matches an empty string). */ result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); if (result == NULL) return REG_ESPACE; if (!(ctx->cflags & REG_EXTENDED)) ctx->re--; } else goto parse_literal; break; case CHAR_LBRACKET: /* bracket expression */ DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", REST(ctx->re))); ctx->re++; status = tre_parse_bracket(ctx, &result); if (status != REG_OK) return status; break; case CHAR_BACKSLASH: /* Deal with "\(", "\)" or "\{" for BREs */ if (!(ctx->cflags & REG_EXTENDED) && ctx->re + 1 < ctx->re_end) { if (*(ctx->re + 1) == CHAR_LPAREN) { ctx->re++; goto parse_bre_lparen; } else if (*(ctx->re + 1) == CHAR_RPAREN) { ctx->re++; goto parse_bre_rparen_empty; } if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal; } if (ctx->re + 1 >= ctx->re_end) /* Trailing backslash. */ return REG_EESCAPE; if (!(ctx->cflags & REG_ENHANCED)) { DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re))); ctx->re++; goto unenhanced_backslash; } /* If a macro is used, parse the expanded macro recursively. */ { tre_char_t buf[64]; tre_expand_macro(ctx->re + 1, ctx->re_end, buf, elementsof(buf)); if (buf[0] != 0) { tre_parse_ctx_t subctx; memcpy(&subctx, ctx, sizeof(subctx)); subctx.re = buf; subctx.len = tre_strlen(buf); subctx.nofirstsub = 1; status = tre_parse(&subctx); if (status != REG_OK) return status; ctx->re += 2; ctx->position = subctx.position; result = subctx.result; break; } } #ifdef REG_LITERAL if (*(ctx->re + 1) == L'Q') { DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", REST(ctx->re))); ctx->cflags |= REG_LITERAL; temporary_cflags |= REG_LITERAL; ctx->re += 2; STACK_PUSHX(stack, int, 0); STACK_PUSHX(stack, int, PARSE_ATOM); break; } #endif /* REG_LITERAL */ DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); ctx->re++; switch (*ctx->re) { case L'b': result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1); ctx->re++; break; case L'B': result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1); ctx->re++; break; case L'<': result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1); ctx->re++; break; case L'>': result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1); ctx->re++; break; case L'x': ctx->re++; if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) { /* 8 bit hex char. */ char tmp[3] = {0, 0, 0}; long val; DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", REST(ctx->re - 2))); if (tre_isxdigit_l(ctx->re[0], ctx->loc) && ctx->re < ctx->re_end) { tmp[0] = (char)ctx->re[0]; ctx->re++; } if (tre_isxdigit_l(ctx->re[0], ctx->loc) && ctx->re < ctx->re_end) { tmp[1] = (char)ctx->re[0]; ctx->re++; } val = strtol(tmp, NULL, 16); result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, ctx->position); ctx->position++; break; } else if (ctx->re < ctx->re_end) { /* Wide char. */ char tmp[32]; long val; int i = 0; ctx->re++; while (ctx->re_end - ctx->re >= 0) { if (ctx->re[0] == CHAR_RBRACE) break; if (tre_isxdigit_l(ctx->re[0], ctx->loc)) { tmp[i] = (char)ctx->re[0]; i++; ctx->re++; continue; } return REG_EBRACE; } ctx->re++; tmp[i] = 0; val = strtol(tmp, NULL, 16); result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, ctx->position); ctx->position++; break; } /*FALLTHROUGH*/ default: unenhanced_backslash: if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) != REG_EXTENDED && tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0') { /* Back reference (only in BRE or enhanced). */ int val = *ctx->re - L'0'; DPRINT(("tre_parse: backref: '%.*" STRF "'\n", REST(ctx->re - 1))); result = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position); if (result == NULL) return REG_ESPACE; /* Set the backref with a submatch id in the invisible * range (which will be overridden if a real submatch * is needed) */ result->submatch_id = ctx->submatch_id_invisible++; ctx->position++; ctx->num_reorder_tags++; ctx->max_backref = MAX(val, ctx->max_backref); ctx->re++; } else { /* Escaped character. */ DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", REST(ctx->re - 1))); result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, ctx->position); ctx->position++; ctx->re++; } break; } if (result == NULL) return REG_ESPACE; break; case CHAR_PERIOD: /* the any-symbol */ DPRINT(("tre_parse: any: '%.*" STRF "'\n", REST(ctx->re))); if (ctx->cflags & REG_NEWLINE) { tre_ast_node_t *tmp1; tre_ast_node_t *tmp2; tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, ctx->position); if (!tmp1) return REG_ESPACE; tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, ctx->position + 1); if (!tmp2) return REG_ESPACE; result = tre_ast_new_union(ctx->mem, tmp1, tmp2); if (!result) return REG_ESPACE; ctx->position += 2; } else { result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position); if (!result) return REG_ESPACE; ctx->position++; } ctx->re++; break; case CHAR_CARET: /* beginning of line assertion */ /* '^' has a special meaning everywhere in EREs, at the beginning of the RE and after \( is BREs. It is also special in enhanced BREs at the beginning of each branches of a union */ if (ctx->cflags & REG_EXTENDED || bre_branch_begin || ctx->re == ctx->re_start) { DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", REST(ctx->re))); result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); if (result == NULL) return REG_ESPACE; ctx->re++; } else goto parse_literal; break; case CHAR_DOLLAR: /* end of line assertion. */ /* '$' is special everywhere in EREs, and in the end of the string and before \) is BREs. */ if (ctx->cflags & REG_EXTENDED || (ctx->re + 2 < ctx->re_end && *(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_RPAREN) || ctx->re + 1 == ctx->re_end) { DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", REST(ctx->re))); result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); if (result == NULL) return REG_ESPACE; ctx->re++; } else goto parse_literal; break; default: parse_literal: if (temporary_cflags && ctx->re + 1 < ctx->re_end && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') { DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", REST(ctx->re))); ctx->cflags &= ~temporary_cflags; temporary_cflags = 0; ctx->re += 2; if (ctx->re < ctx->re_end) { STACK_PUSHX(stack, int, 0); STACK_PUSHX(stack, int, PARSE_ATOM); } else { result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); if (!result) return REG_ESPACE; } break; } /* We are expecting an atom. If the subexpression (or the whole regexp ends here, we interpret it as an empty expression (which matches an empty string), which is an error. Iterations of an empty expression is also an error. */ #ifdef REG_LITERAL if (!(ctx->cflags & REG_LITERAL)) { #endif /* REG_LITERAL */ /* error on end of string */ if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN : REG_EMPTY; /* error on unions and iterations of empty expressions */ if (ctx->cflags & REG_EXTENDED) { if (ctx->re < ctx->re_end) { if (*ctx->re == CHAR_PIPE) return REG_EMPTY; if (*ctx->re == CHAR_LBRACE) { ctx->re++; empty_parse_bound: /* We need to parse the bound first and return any error, before returning REG_BADRPT */ status = tre_parse_bound(ctx, NULL); #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND /* For ERE, if REG_NOMATCH is returned, we treat the lbrace as a literal. */ if (status == REG_NOMATCH) { ctx->re--; /* Drop down to literal-handling code */ } else { #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ if (status != REG_OK) return status; return REG_BADRPT; #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND } #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ } #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND else #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ if (*ctx->re == CHAR_STAR || *ctx->re == CHAR_PLUS || *ctx->re == CHAR_QUESTIONMARK) { return REG_BADRPT; } } } else if (ctx->re + 1 < ctx->re_end && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_LBRACE) { ctx->re += 2; goto empty_parse_bound; } #ifdef REG_LITERAL } #endif /* REG_LITERAL */ DPRINT(("tre_parse: literal: '%.*" STRF "'\n", REST(ctx->re))); /* Note that we can't use an tre_isalpha() test here, since there may be characters which are alphabetic but neither upper or lower case. */ if (ctx->cflags & REG_ICASE && (tre_isupper_l(*ctx->re, ctx->loc) || tre_islower_l(*ctx->re, ctx->loc))) { tre_ast_node_t *tmp1; tre_ast_node_t *tmp2; /* XXX - Can there be more than one opposite-case counterpoints for some character in some locale? Or more than two characters which all should be regarded the same character if case is ignored? If yes, there does not seem to be a portable way to detect it. I guess that at least for multi-character collating elements there could be several opposite-case counterpoints, but they cannot be supported portably anyway. */ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper_l(*ctx->re, ctx->loc), tre_toupper_l(*ctx->re, ctx->loc), ctx->position); if (!tmp1) return REG_ESPACE; tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower_l(*ctx->re, ctx->loc), tre_tolower_l(*ctx->re, ctx->loc), ctx->position); if (!tmp2) return REG_ESPACE; result = tre_ast_new_union(ctx->mem, tmp1, tmp2); if (!result) return REG_ESPACE; } else { result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, ctx->position); if (!result) return REG_ESPACE; } ctx->position++; ctx->re++; break; } break; } case PARSE_MARK_FOR_SUBMATCH: { int submatch_id = tre_stack_pop_int(stack); ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */ if (result->submatch_id >= 0 && result->submatch_id < SUBMATCH_ID_INVISIBLE_START) { tre_ast_node_t *n, *tmp_node; if (submatch_id >= SUBMATCH_ID_INVISIBLE_START) break; n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); if (n == NULL) return REG_ESPACE; tmp_node = tre_ast_new_catenation(ctx->mem, n, result); if (tmp_node == NULL) return REG_ESPACE; tmp_node->num_submatches = result->num_submatches; result = tmp_node; } result->submatch_id = submatch_id; if (submatch_id < SUBMATCH_ID_INVISIBLE_START) result->num_submatches++; break; } default: assert(0); break; } } /* Check for missing closing parentheses. */ if (depth > 0) return REG_EPAREN; ctx->result = result; return REG_OK; } /* EOF */