1 /* 2 * Internal interface definitions, etc., for the reg package 3 * 4 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 5 * 6 * Development of this software was funded, in part, by Cray Research Inc., 7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 8 * Corporation, none of whom are responsible for the results. The author 9 * thanks all of them. 10 * 11 * Redistribution and use in source and binary forms -- with or without 12 * modification -- are permitted for any purpose, provided that 13 * redistributions in source form retain this entire copyright notice and 14 * indicate the origin and nature of any modifications. 15 * 16 * I'd appreciate being given credit for this package in the documentation 17 * of software which uses it, but that is not a requirement. 18 * 19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 * 30 * src/include/regex/regguts.h 31 */ 32 33 34 35 /* 36 * Environmental customization. It should not (I hope) be necessary to 37 * alter the file you are now reading -- regcustom.h should handle it all, 38 * given care here and elsewhere. 39 */ 40 #include "regcustom.h" 41 42 43 44 /* 45 * Things that regcustom.h might override. 46 */ 47 48 /* assertions */ 49 #ifndef assert 50 #ifndef REG_DEBUG 51 #define NDEBUG /* no assertions */ 52 #endif 53 #include <assert.h> 54 #endif 55 56 /* voids */ 57 #ifndef DISCARD 58 #define DISCARD void /* for throwing values away */ 59 #endif 60 #ifndef VS 61 #define VS(x) ((void *)(x)) /* cast something to generic ptr */ 62 #endif 63 64 /* function-pointer declarator */ 65 #ifndef FUNCPTR 66 #define FUNCPTR(name, args) (*(name)) args 67 #endif 68 69 /* memory allocation */ 70 #ifndef MALLOC 71 #define MALLOC(n) malloc(n) 72 #endif 73 #ifndef REALLOC 74 #define REALLOC(p, n) realloc(VS(p), n) 75 #endif 76 #ifndef FREE 77 #define FREE(p) free(VS(p)) 78 #endif 79 80 /* want size of a char in bits, and max value in bounded quantifiers */ 81 #ifndef _POSIX2_RE_DUP_MAX 82 #define _POSIX2_RE_DUP_MAX 255 /* normally from <limits.h> */ 83 #endif 84 85 86 87 /* 88 * misc 89 */ 90 91 #define NOTREACHED 0 92 93 #define DUPMAX _POSIX2_RE_DUP_MAX 94 #define DUPINF (DUPMAX+1) 95 96 #define REMAGIC 0xfed7 /* magic number for main struct */ 97 98 /* Type codes for lookaround constraints */ 99 #define LATYPE_AHEAD_POS 03 /* positive lookahead */ 100 #define LATYPE_AHEAD_NEG 02 /* negative lookahead */ 101 #define LATYPE_BEHIND_POS 01 /* positive lookbehind */ 102 #define LATYPE_BEHIND_NEG 00 /* negative lookbehind */ 103 #define LATYPE_IS_POS(la) ((la) & 01) 104 #define LATYPE_IS_AHEAD(la) ((la) & 02) 105 106 107 /* 108 * debugging facilities 109 */ 110 #ifdef REG_DEBUG 111 /* FDEBUG does finite-state tracing */ 112 #define FDEBUG(arglist) { if (v->eflags®_FTRACE) printf arglist; } 113 /* MDEBUG does higher-level tracing */ 114 #define MDEBUG(arglist) { if (v->eflags®_MTRACE) printf arglist; } 115 #else 116 #define FDEBUG(arglist) {} 117 #define MDEBUG(arglist) {} 118 #endif 119 120 121 122 /* 123 * bitmap manipulation 124 */ 125 #define UBITS (CHAR_BIT * sizeof(unsigned)) 126 #define BSET(uv, sn) ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS)) 127 #define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) 128 129 130 /* 131 * known character classes 132 */ 133 enum char_classes 134 { 135 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, 136 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT, CC_WORD 137 }; 138 139 #define NUM_CCLASSES 14 140 141 142 /* 143 * As soon as possible, we map chrs into equivalence classes -- "colors" -- 144 * which are of much more manageable number. 145 * 146 * To further reduce the number of arcs in NFAs and DFAs, we also have a 147 * special RAINBOW "color" that can be assigned to an arc. This is not a 148 * real color, in that it has no entry in color maps. 149 */ 150 typedef short color; /* colors of characters */ 151 152 #define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */ 153 #define COLORLESS (-1) /* impossible color */ 154 #define RAINBOW (-2) /* represents all colors except pseudocolors */ 155 #define WHITE 0 /* default color, parent of all others */ 156 /* Note: various places in the code know that WHITE is zero */ 157 158 159 /* 160 * Per-color data structure for the compile-time color machinery 161 * 162 * If "sub" is not NOSUB then it is the number of the color's current 163 * subcolor, i.e. we are in process of dividing this color (character 164 * equivalence class) into two colors. See src/backend/regex/README for 165 * discussion of subcolors. 166 * 167 * Currently-unused colors have the FREECOL bit set and are linked into a 168 * freelist using their "sub" fields, but only if their color numbers are 169 * less than colormap.max. Any array entries beyond "max" are just garbage. 170 */ 171 struct colordesc 172 { 173 int nschrs; /* number of simple chars of this color */ 174 int nuchrs; /* number of upper map entries of this color */ 175 color sub; /* open subcolor, if any; or free-chain ptr */ 176 #define NOSUB COLORLESS /* value of "sub" when no open subcolor */ 177 struct arc *arcs; /* chain of all arcs of this color */ 178 chr firstchr; /* simple char first assigned to this color */ 179 int flags; /* bitmask of the following flags: */ 180 #define FREECOL 01 /* currently free */ 181 #define PSEUDO 02 /* pseudocolor, no real chars */ 182 #define COLMARK 04 /* temporary marker used in some functions */ 183 }; 184 185 #define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) 186 187 /* 188 * The color map itself 189 * 190 * This struct holds both data used only at compile time, and the chr to 191 * color mapping information, used at both compile and run time. The latter 192 * is the bulk of the space, so it's not really worth separating out the 193 * compile-only portion. 194 * 195 * Ideally, the mapping data would just be an array of colors indexed by 196 * chr codes; but for large character sets that's impractical. Fortunately, 197 * common characters have smaller codes, so we can use a simple array for chr 198 * codes up to MAX_SIMPLE_CHR, and do something more complex for codes above 199 * that, without much loss of performance. The "something more complex" is a 200 * 2-D array of color entries, where row indexes correspond to individual chrs 201 * or chr ranges that have been mentioned in the regex (with row zero 202 * representing all other chrs), and column indexes correspond to different 203 * sets of locale-dependent character classes such as "isalpha". The 204 * classbits[k] entry is zero if we do not care about the k'th character class 205 * in this regex, and otherwise it is the bit to be OR'd into the column index 206 * if the character in question is a member of that class. We find the color 207 * of a high-valued chr by identifying which colormaprange it is in to get 208 * the row index (use row zero if it's in none of them), identifying which of 209 * the interesting cclasses it's in to get the column index, and then indexing 210 * into the 2-D hicolormap array. 211 * 212 * The colormapranges are required to be nonempty, nonoverlapping, and to 213 * appear in increasing chr-value order. 214 */ 215 216 typedef struct colormaprange 217 { 218 chr cmin; /* range represents cmin..cmax inclusive */ 219 chr cmax; 220 int rownum; /* row index in hicolormap array (>= 1) */ 221 } colormaprange; 222 223 struct colormap 224 { 225 int magic; 226 #define CMMAGIC 0x876 227 struct vars *v; /* for compile error reporting */ 228 size_t ncds; /* allocated length of colordescs array */ 229 size_t max; /* highest color number currently in use */ 230 color free; /* beginning of free chain (if non-0) */ 231 struct colordesc *cd; /* pointer to array of colordescs */ 232 #define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) 233 234 /* mapping data for chrs <= MAX_SIMPLE_CHR: */ 235 color *locolormap; /* simple array indexed by chr code */ 236 237 /* mapping data for chrs > MAX_SIMPLE_CHR: */ 238 int classbits[NUM_CCLASSES]; /* see comment above */ 239 int numcmranges; /* number of colormapranges */ 240 colormaprange *cmranges; /* ranges of high chrs */ 241 color *hicolormap; /* 2-D array of color entries */ 242 int maxarrayrows; /* number of array rows allocated */ 243 int hiarrayrows; /* number of array rows in use */ 244 int hiarraycols; /* number of array columns (2^N) */ 245 246 /* If we need up to NINLINECDS, we store them here to save a malloc */ 247 #define NINLINECDS ((size_t) 10) 248 struct colordesc cdspace[NINLINECDS]; 249 }; 250 251 /* fetch color for chr; beware of multiple evaluation of c argument */ 252 #define GETCOLOR(cm, c) \ 253 ((c) <= MAX_SIMPLE_CHR ? (cm)->locolormap[(c) - CHR_MIN] : pg_reg_getcolor(cm, c)) 254 255 256 /* 257 * Interface definitions for locale-interface functions in regc_locale.c. 258 */ 259 260 /* 261 * Representation of a set of characters. chrs[] represents individual 262 * code points, ranges[] represents ranges in the form min..max inclusive. 263 * 264 * If the cvec represents a locale-specific character class, eg [[:alpha:]], 265 * then the chrs[] and ranges[] arrays contain only members of that class 266 * up to MAX_SIMPLE_CHR (inclusive). cclasscode is set to regc_locale.c's 267 * code for the class, rather than being -1 as it is in an ordinary cvec. 268 * 269 * Note that in cvecs gotten from newcvec() and intended to be freed by 270 * freecvec(), both arrays of chrs are after the end of the struct, not 271 * separately malloc'd; so chrspace and rangespace are effectively immutable. 272 */ 273 struct cvec 274 { 275 int nchrs; /* number of chrs */ 276 int chrspace; /* number of chrs allocated in chrs[] */ 277 chr *chrs; /* pointer to vector of chrs */ 278 int nranges; /* number of ranges (chr pairs) */ 279 int rangespace; /* number of ranges allocated in ranges[] */ 280 chr *ranges; /* pointer to vector of chr pairs */ 281 int cclasscode; /* value of "enum classes", or -1 */ 282 }; 283 284 285 /* 286 * definitions for NFA internal representation 287 */ 288 struct state; 289 290 struct arc 291 { 292 int type; /* 0 if free, else an NFA arc type code */ 293 color co; /* color the arc matches (possibly RAINBOW) */ 294 struct state *from; /* where it's from */ 295 struct state *to; /* where it's to */ 296 struct arc *outchain; /* link in *from's outs chain or free chain */ 297 struct arc *outchainRev; /* back-link in *from's outs chain */ 298 #define freechain outchain /* we do not maintain "freechainRev" */ 299 struct arc *inchain; /* link in *to's ins chain */ 300 struct arc *inchainRev; /* back-link in *to's ins chain */ 301 /* these fields are not used when co == RAINBOW: */ 302 struct arc *colorchain; /* link in color's arc chain */ 303 struct arc *colorchainRev; /* back-link in color's arc chain */ 304 }; 305 306 struct arcbatch 307 { /* for bulk allocation of arcs */ 308 struct arcbatch *next; /* chain link */ 309 size_t narcs; /* number of arcs allocated in this arcbatch */ 310 struct arc a[FLEXIBLE_ARRAY_MEMBER]; 311 }; 312 #define ARCBATCHSIZE(n) ((n) * sizeof(struct arc) + offsetof(struct arcbatch, a)) 313 /* first batch will have FIRSTABSIZE arcs; then double it until MAXABSIZE */ 314 #define FIRSTABSIZE 64 315 #define MAXABSIZE 1024 316 317 struct state 318 { 319 int no; /* state number, zero and up; or FREESTATE */ 320 #define FREESTATE (-1) 321 char flag; /* marks special states */ 322 int nins; /* number of inarcs */ 323 int nouts; /* number of outarcs */ 324 struct arc *ins; /* chain of inarcs */ 325 struct arc *outs; /* chain of outarcs */ 326 struct state *tmp; /* temporary for traversal algorithms */ 327 struct state *next; /* chain for traversing all live states */ 328 /* the "next" field is also used to chain free states together */ 329 struct state *prev; /* back-link in chain of all live states */ 330 }; 331 332 struct statebatch 333 { /* for bulk allocation of states */ 334 struct statebatch *next; /* chain link */ 335 size_t nstates; /* number of states allocated in this batch */ 336 struct state s[FLEXIBLE_ARRAY_MEMBER]; 337 }; 338 #define STATEBATCHSIZE(n) ((n) * sizeof(struct state) + offsetof(struct statebatch, s)) 339 /* first batch will have FIRSTSBSIZE states; then double it until MAXSBSIZE */ 340 #define FIRSTSBSIZE 32 341 #define MAXSBSIZE 1024 342 343 struct nfa 344 { 345 struct state *pre; /* pre-initial state */ 346 struct state *init; /* initial state */ 347 struct state *final; /* final state */ 348 struct state *post; /* post-final state */ 349 int nstates; /* for numbering states */ 350 struct state *states; /* chain of live states */ 351 struct state *slast; /* tail of the chain */ 352 struct state *freestates; /* chain of free states */ 353 struct arc *freearcs; /* chain of free arcs */ 354 struct statebatch *lastsb; /* chain of statebatches */ 355 struct arcbatch *lastab; /* chain of arcbatches */ 356 size_t lastsbused; /* number of states consumed from *lastsb */ 357 size_t lastabused; /* number of arcs consumed from *lastab */ 358 struct colormap *cm; /* the color map */ 359 color bos[2]; /* colors, if any, assigned to BOS and BOL */ 360 color eos[2]; /* colors, if any, assigned to EOS and EOL */ 361 int flags; /* flags to pass forward to cNFA */ 362 int minmatchall; /* min number of chrs to match, if matchall */ 363 int maxmatchall; /* max number of chrs to match, or DUPINF */ 364 struct vars *v; /* simplifies compile error reporting */ 365 struct nfa *parent; /* parent NFA, if any */ 366 }; 367 368 369 370 /* 371 * definitions for compacted NFA 372 * 373 * The main space savings in a compacted NFA is from making the arcs as small 374 * as possible. We store only the transition color and next-state number for 375 * each arc. The list of out arcs for each state is an array beginning at 376 * cnfa.states[statenumber], and terminated by a dummy carc struct with 377 * co == COLORLESS. 378 * 379 * The non-dummy carc structs are of two types: plain arcs and LACON arcs. 380 * Plain arcs just store the transition color number as "co". LACON arcs 381 * store the lookaround constraint number plus cnfa.ncolors as "co". LACON 382 * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. 383 * 384 * Note that in a plain arc, "co" can be RAINBOW; since that's negative, 385 * it doesn't break the rule about how to recognize LACON arcs. 386 * 387 * We have special markings for "trivial" NFAs that can match any string 388 * (possibly with limits on the number of characters therein). In such a 389 * case, flags & MATCHALL is set (and HASLACONS can't be set). Then the 390 * fields minmatchall and maxmatchall give the minimum and maximum numbers 391 * of characters to match. For example, ".*" produces minmatchall = 0 392 * and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and 393 * maxmatchall = DUPINF. 394 */ 395 struct carc 396 { 397 color co; /* COLORLESS is list terminator */ 398 int to; /* next-state number */ 399 }; 400 401 struct cnfa 402 { 403 int nstates; /* number of states */ 404 int ncolors; /* number of colors (max color in use + 1) */ 405 int flags; /* bitmask of the following flags: */ 406 #define HASLACONS 01 /* uses lookaround constraints */ 407 #define MATCHALL 02 /* matches all strings of a range of lengths */ 408 int pre; /* setup state number */ 409 int post; /* teardown state number */ 410 color bos[2]; /* colors, if any, assigned to BOS and BOL */ 411 color eos[2]; /* colors, if any, assigned to EOS and EOL */ 412 char *stflags; /* vector of per-state flags bytes */ 413 #define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */ 414 struct carc **states; /* vector of pointers to outarc lists */ 415 /* states[n] are pointers into a single malloc'd array of arcs */ 416 struct carc *arcs; /* the area for the lists */ 417 /* these fields are used only in a MATCHALL NFA (else they're -1): */ 418 int minmatchall; /* min number of chrs to match */ 419 int maxmatchall; /* max number of chrs to match, or DUPINF */ 420 }; 421 422 /* 423 * When debugging, it's helpful if an un-filled CNFA is all-zeroes. 424 * In production, though, we only require nstates to be zero. 425 */ 426 #ifdef REG_DEBUG 427 #define ZAPCNFA(cnfa) memset(&(cnfa), 0, sizeof(cnfa)) 428 #else 429 #define ZAPCNFA(cnfa) ((cnfa).nstates = 0) 430 #endif 431 #define NULLCNFA(cnfa) ((cnfa).nstates == 0) 432 433 /* 434 * This symbol limits the transient heap space used by the regex compiler, 435 * and thereby also the maximum complexity of NFAs that we'll deal with. 436 * Currently we only count NFA states and arcs against this; the other 437 * transient data is generally not large enough to notice compared to those. 438 * Note that we do not charge anything for the final output data structures 439 * (the compacted NFA and the colormap). 440 * The scaling here is based on an empirical measurement that very large 441 * NFAs tend to have about 4 arcs/state. 442 */ 443 #ifndef REG_MAX_COMPILE_SPACE 444 #define REG_MAX_COMPILE_SPACE \ 445 (500000 * (sizeof(struct state) + 4 * sizeof(struct arc))) 446 #endif 447 448 /* 449 * subexpression tree 450 * 451 * "op" is one of: 452 * '=' plain regex without interesting substructure (implemented as DFA) 453 * 'b' back-reference (has no substructure either) 454 * '(' no-op capture node: captures the match of its single child 455 * '.' concatenation: matches a match for first child, then second child 456 * '|' alternation: matches a match for any of its children 457 * '*' iteration: matches some number of matches of its single child 458 * 459 * An alternation node can have any number of children (but at least two), 460 * linked through their sibling fields. 461 * 462 * A concatenation node must have exactly two children. It might be useful 463 * to support more, but that would complicate the executor. Note that it is 464 * the first child's greediness that determines the node's preference for 465 * where to split a match. 466 * 467 * Note: when a backref is directly quantified, we stick the min/max counts 468 * into the backref rather than plastering an iteration node on top. This is 469 * for efficiency: there is no need to search for possible division points. 470 */ 471 struct subre 472 { 473 char op; /* see type codes above */ 474 char flags; 475 #define LONGER 01 /* prefers longer match */ 476 #define SHORTER 02 /* prefers shorter match */ 477 #define MIXED 04 /* mixed preference below */ 478 #define CAP 010 /* capturing parens here or below */ 479 #define BACKR 020 /* back reference here or below */ 480 #define INUSE 0100 /* in use in final tree */ 481 #define NOPROP 03 /* bits which may not propagate up */ 482 #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ 483 #define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ 484 #define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED)) 485 #define MESSY(f) ((f)&(MIXED|CAP|BACKR)) 486 #define PREF(f) ((f)&NOPROP) 487 #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) 488 #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) 489 char latype; /* LATYPE code, if lookaround constraint */ 490 int id; /* ID of subre (1..ntree-1) */ 491 int capno; /* if capture node, subno to capture into */ 492 int backno; /* if backref node, subno it refers to */ 493 short min; /* min repetitions for iteration or backref */ 494 short max; /* max repetitions for iteration or backref */ 495 struct subre *child; /* first child, if any (also freelist chain) */ 496 struct subre *sibling; /* next child of same parent, if any */ 497 struct state *begin; /* outarcs from here... */ 498 struct state *end; /* ...ending in inarcs here */ 499 struct cnfa cnfa; /* compacted NFA, if any */ 500 struct subre *chain; /* for bookkeeping and error cleanup */ 501 }; 502 503 504 505 /* 506 * table of function pointers for generic manipulation functions 507 * A regex_t's re_fns points to one of these. 508 */ 509 struct fns 510 { 511 void FUNCPTR(free, (regex_t *)); 512 int FUNCPTR(cancel_requested, (void)); 513 int FUNCPTR(stack_too_deep, (void)); 514 }; 515 516 #define CANCEL_REQUESTED(re) \ 517 ((*((struct fns *) (re)->re_fns)->cancel_requested) ()) 518 519 #define STACK_TOO_DEEP(re) \ 520 ((*((struct fns *) (re)->re_fns)->stack_too_deep) ()) 521 522 523 /* 524 * the insides of a regex_t, hidden behind a void * 525 */ 526 struct guts 527 { 528 int magic; 529 #define GUTSMAGIC 0xfed9 530 int cflags; /* copy of compile flags */ 531 long info; /* copy of re_info */ 532 size_t nsub; /* copy of re_nsub */ 533 struct subre *tree; 534 struct cnfa search; /* for fast preliminary search */ 535 int ntree; /* number of subre's, plus one */ 536 struct colormap cmap; 537 int FUNCPTR(compare, (const chr *, const chr *, size_t)); 538 struct subre *lacons; /* lookaround-constraint vector */ 539 int nlacons; /* size of lacons[]; note that only slots 540 * numbered 1 .. nlacons-1 are used */ 541 }; 542 543 544 /* prototypes for functions that are exported from regcomp.c to regexec.c */ 545 extern void pg_set_regex_collation(Oid collation); 546 extern color pg_reg_getcolor(struct colormap *cm, chr c); 547