1 #ifndef _RE2C_LIB_REGEX_ 2 #define _RE2C_LIB_REGEX_ 3 4 #include <stddef.h> 5 #include <stdint.h> 6 #include <limits.h> 7 8 9 namespace re2c { 10 struct nfa_t; 11 struct dfa_t; 12 class RangeMgr; 13 } // namespace re2c 14 15 namespace re2c { 16 namespace libre2c { 17 struct rldfa_t; 18 struct regoff_trie_t; 19 } // namespace libre2c 20 } // namespace re2c 21 22 typedef ptrdiff_t regoff_t; 23 24 // regmatch_t stores an offset pair representing a capturing group. 25 // No match is represented with (-1,-1). 26 struct regmatch_t { 27 regoff_t rm_so; 28 regoff_t rm_eo; 29 }; 30 31 // subhistory_t stores subhistory of a capturing group: an array of offset pairs 32 // corresponding to all positions in the input string where this capturing group 33 // has matched. The length of the array depends on how many times the group has 34 // matched, which depends on the RE and the input string. 35 struct subhistory_t { 36 size_t size; 37 regmatch_t *offs; 38 }; 39 40 // T-string chars are 16 bits. 41 // This is aligned with the Java implementation by Angelo Borsotti. 42 typedef uint16_t tchar_t; 43 44 // A t-string is a flattened representation a parse tree where characters of the 45 // input string are interleaved with tags. Input characters occupy the lower 46 // byte of the 16-bit value, leaving the higher byte as zero. Tags are stored as 47 // numeric values shifted to the upper half of the 16-bit range. 48 struct tstring_t { 49 size_t capacity; 50 size_t length; 51 tchar_t *string; 52 }; 53 54 // Tags use the upper half of the t-string charater value range. 55 static const tchar_t TAG_BASE = 1 << (8 * sizeof(tchar_t) - 1); 56 57 // standard flags 58 static const int REG_EXTENDED = 1u << 0; 59 static const int REG_ICASE = 1u << 1; 60 static const int REG_NOSUB = 1u << 2; 61 static const int REG_NEWLINE = 1u << 3; 62 static const int REG_NOTBOL = 1u << 4; 63 static const int REG_NOTEOL = 1u << 5; 64 // extensions 65 static const int REG_NFA = 1u << 6; 66 static const int REG_LEFTMOST = 1u << 7; 67 static const int REG_TRIE = 1u << 8; 68 static const int REG_GTOP = 1u << 9; 69 static const int REG_SLOWPREC = 1u << 10; 70 static const int REG_BACKWARD = 1u << 11; 71 static const int REG_KUKLEWICZ = 1u << 12; 72 static const int REG_STADFA = 1u << 13; 73 static const int REG_REGLESS = 1u << 14; 74 static const int REG_SUBHIST = 1u << 15; 75 static const int REG_TSTRING = 1u << 16; 76 static const int REG_AUTOTAGS = 1u << 17; 77 78 struct regex_t { 79 size_t re_nsub; 80 size_t re_ntag; 81 re2c::RangeMgr *rmgr; 82 const re2c::nfa_t *nfa; 83 const re2c::dfa_t *dfa; 84 const re2c::libre2c::rldfa_t *rldfa; 85 void *simctx; 86 size_t *char2class; 87 int flags; 88 union { 89 regoff_t *regs; 90 re2c::libre2c::regoff_trie_t *regtrie; 91 }; 92 // Allow storing submatch results in RE (this is needed for Java bindings). 93 union { 94 regmatch_t *pmatch; 95 subhistory_t *psubhist; 96 mutable tstring_t tstring; 97 }; 98 }; 99 100 static const int REG_NOMATCH = INT_MAX; 101 102 // standard functions 103 int regcomp(regex_t *preg, const char *pattern, int cflags); 104 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size); 105 int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], 106 int eflags); 107 void regfree(regex_t *preg); 108 109 // The regparse() function returns parse results as an array of nmatch size, 110 // where each element is an array of offset pairs for the corresponding 111 // capturing group. If a group has matched repeatedly at different parts of the 112 // input string, then its array will contain multiple offset pairs; otherwise, a 113 // single pair. This differs from the standard POSIX API, where only the last 114 // offset pair for each group is returned. 115 // 116 // regparse() can be used if REG_SUBHIST flag has been passed to regcomp(). 117 // 118 // The allocation of memory for the parse results is done by the library (the 119 // user cannot know how much memory will be needed). The ownership of parse 120 // results is transferred to the user, who is expected to deallocate memory 121 // allocated by regparse() with regfreesub(). 122 subhistory_t *regparse(const regex_t *preg, const char *string, size_t nmatch); 123 124 // regfreesub() deallocates memory allocated with regparse(). It must be called 125 // every time after regparse() is called. 126 void regfreesub(subhistory_t *history); 127 128 // The regtstring() function constructs a t-string. 129 // 130 // It can be used if REG_TSTRING flag has been passed to regcomp(). 131 // 132 // The goal of this function is to construct parse results in the cheapest 133 // possible way. There is no standard representation of a parse tree, and any 134 // nontrivial representation incurs significant overhead on construction (the 135 // algorithm spends more time constructing parse results than doing the actual 136 // parsing). The t-string representation requires very little effort to 137 // construct, provided that the t-string fragments corresponding to the tagged 138 // paths in the epsilon-closure are prepared in advance at regcomp() time. The 139 // fragments are stored as arrays of tags, so that they can be easily copied to 140 // the resulting t-string. Importantly, this does not require unwinding of tag 141 // history stored in a trie form, which is rather slow. 142 // 143 // T-string allocation is done on the library side, and the t-string is stored 144 // in the regexp structure. The user gets an immutable view of it and is not 145 // expected to free memory after calling regtstring(). Moreover, the library 146 // reuses the same memory on each regtstring() call and reallocates it only if 147 // the new t-string does not fit into the old memory area. So each call to 148 // regtstring() invalidates the previous results returned from it. 149 // 150 // The regtstring() function is primarily intended for benchmarking the speed 151 // of a parsing algorithm. 152 const tstring_t *regtstring(const regex_t *preg, const char *string); 153 154 155 #endif // _RE2C_LIB_REGEX_ 156