1 /* 2 * Definitions etc. for regexp(3) routines. 3 * 4 * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], 5 * not the System V one. 6 */ 7 8 #ifndef REP_REGEXP_H 9 #define REP_REGEXP_H 10 11 #define rep_NSUBEXP 10 12 13 typedef enum rep_regtype { 14 rep_reg_string = 0, 15 rep_reg_obj 16 } rep_regtype; 17 18 typedef union rep_regsubs { 19 struct { 20 char *startp[rep_NSUBEXP]; 21 char *endp[rep_NSUBEXP]; 22 } string; 23 struct { 24 repv startp[rep_NSUBEXP]; 25 repv endp[rep_NSUBEXP]; 26 } obj; 27 } rep_regsubs; 28 29 typedef struct rep_regexp { 30 rep_regtype lasttype; 31 rep_regsubs matches; 32 33 char regstart; /* Internal use only. */ 34 char reganch; /* Internal use only. */ 35 char *regmust; /* Internal use only. */ 36 int regmlen; /* Internal use only. */ 37 int regsize; /* actual size of regexp structure */ 38 char program[1]; /* Unwarranted chumminess with compiler. */ 39 } rep_regexp; 40 41 /* Data structure used to save and restore regexp data internally */ 42 struct rep_saved_regexp_data { 43 struct rep_saved_regexp_data *next; 44 rep_regtype type; 45 repv data; 46 rep_regsubs matches; 47 }; 48 49 /* eflags for regexec2() */ 50 #define rep_REG_NOTBOL 1 /* start of input isn't start of line */ 51 #define rep_REG_NOCASE 2 /* fold upper and lower case */ 52 #define rep_REG_1LINE 4 /* for regexec_tx: only search to the 53 end of the line for the start of the 54 match. */ 55 56 #define rep_regexec(p,s) rep_regexec2(p,s,0) 57 58 extern rep_regexp *rep_regcomp(char *); 59 extern int rep_regexec2(rep_regexp *, char *, int); 60 extern int rep_regmatch_string(rep_regexp *, char *, int); 61 62 extern int rep_regexp_max_depth; 63 64 65 /* Only include the internal stuff if it's explicitly requested, since 66 it comtaminates the namespace.. */ 67 68 #ifdef rep_NEED_REGEXP_INTERNALS 69 70 /* 71 * Structure for regexp "program". This is essentially a linear encoding of 72 * a nondeterministic finite-state machine (aka syntax charts or "railroad 73 * normal form" in parsing technology). Each node is an opcode plus a "next" 74 * pointer, possibly plus an operand. "Next" pointers of all nodes except 75 * BRANCH implement concatenation; a "next" pointer with a BRANCH on both 76 * ends of it is connecting two alternatives. (Here we have one of the 77 * subtle syntax dependencies: an individual BRANCH (as opposed to a 78 * collection of them) is never concatenated with anything because of 79 * operator precedence.) The operand of some types of node is a literal 80 * string; for others, it is a node leading into a sub-FSM. In particular, 81 * the operand of a BRANCH node is the first node of the branch. (NB this is 82 * *not* a tree structure: the tail of the branch connects to the thing 83 * following the set of BRANCHes.) The opcodes are: 84 */ 85 86 /* definition number opnd? meaning */ 87 #define END 0 /* no End of program. */ 88 #define BOL 1 /* no Match "" at beginning of line. */ 89 #define EOL 2 /* no Match "" at end of line. */ 90 #define ANY 3 /* no Match any one character. */ 91 #define ANYOF 4 /* str Match any character in this string. */ 92 #define ANYBUT 5 /* str Match any character not in this 93 * string. */ 94 #define BRANCH 6 /* node Match this alternative, or the 95 * next... */ 96 #define BACK 7 /* no Match "", "next" ptr points backward. */ 97 #define EXACTLY 8 /* str Match this string. */ 98 #define NOTHING 9 /* no Match empty string. */ 99 #define STAR 10 /* node Match this (simple) thing 0 or more 100 * times. */ 101 #define PLUS 11 /* node Match this (simple) thing 1 or more 102 * times. */ 103 #define WORD 12 /* no Match alphanumeric or _ char */ 104 #define NWORD 13 /* no Match non-(alphanumeric or _) char */ 105 #define WSPC 14 /* no Match whitespace char */ 106 #define NWSPC 15 /* no Match non-whitespace char */ 107 #define DIGI 16 /* no Match digit char */ 108 #define NDIGI 17 /* no Match non-digit char */ 109 #define WEDGE 18 /* no Match "" at word boundary */ 110 #define NWEDGE 19 /* no Match "" not at word boundary */ 111 #define OPEN 20 /* no Mark this point in input as start of 112 * #n. */ 113 /* OPEN+1 is number 1, etc. */ 114 #define CLOSE 30 /* no Analogous to OPEN. */ 115 #define NGSTAR 40 /* node Match this (simple) thing 0 or more 116 times (non-greedily) */ 117 #define NGPLUS 41 /* node Match this (simple) thing 1 or more 118 times (non-greedily) */ 119 120 /* 121 * Opcode notes: 122 * 123 * BRANCH The set of branches constituting a single choice are hooked together 124 * with their "next" pointers, since precedence prevents anything being 125 * concatenated to any individual branch. The "next" pointer of the last 126 * BRANCH in a choice points to the thing following the whole choice. This 127 * is also where the final "next" pointer of each individual branch points; 128 * each branch starts with the operand node of a BRANCH node. 129 * 130 * BACK Normal "next" pointers all implicitly point forward; BACK exists to 131 * make loop structures possible. 132 * 133 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 134 * BRANCH structures using BACK. Simple cases (one character per match) are 135 * implemented with STAR and PLUS for speed and to minimize recursive 136 * plunges. 137 * 138 * OPEN,CLOSE ...are numbered at compile time. 139 */ 140 141 /* 142 * A node is one char of opcode followed by two chars of "next" pointer. 143 * "Next" pointers are stored as two 8-bit pieces, high order first. The 144 * value is a positive offset from the opcode of the node containing it. An 145 * operand, if any, simply follows the node. (Note that much of the code 146 * generation knows about this implicit relationship.) 147 * 148 * Using two bytes for the "next" pointer is vast overkill for most things, but 149 * allows patterns to get big without disasters. 150 */ 151 #define OP(p) (*(p)) 152 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 153 #define OPERAND(p) ((p) + 3) 154 155 156 /* 157 * The first byte of the regexp internal "program" is actually this magic 158 * number; the start node begins in the second byte. 159 */ 160 #define MAGIC 0234 161 162 #endif /* rep_NEED_REGEXP_INTERNALS */ 163 164 #endif /* REP_REGEXP_H */ 165