1 #ifndef _RE2C_LIB_REGEX_
2 #define _RE2C_LIB_REGEX_
3 
4 #include <stddef.h>
5 #include <stdint.h>
6 #include <limits.h>
7 
8 
9 namespace re2c {
10 struct nfa_t;
11 struct dfa_t;
12 class RangeMgr;
13 } // namespace re2c
14 
15 namespace re2c {
16 namespace libre2c {
17 struct rldfa_t;
18 struct regoff_trie_t;
19 } // namespace libre2c
20 } // namespace re2c
21 
22 typedef ptrdiff_t regoff_t;
23 
24 // regmatch_t stores an offset pair representing a capturing group.
25 // No match is represented with (-1,-1).
26 struct regmatch_t {
27     regoff_t rm_so;
28     regoff_t rm_eo;
29 };
30 
31 // subhistory_t stores subhistory of a capturing group: an array of offset pairs
32 // corresponding to all positions in the input string where this capturing group
33 // has matched. The length of the array depends on how many times the group has
34 // matched, which depends on the RE and the input string.
35 struct subhistory_t {
36     size_t size;
37     regmatch_t *offs;
38 };
39 
40 // T-string chars are 16 bits.
41 // This is aligned with the Java implementation by Angelo Borsotti.
42 typedef uint16_t tchar_t;
43 
44 // A t-string is a flattened representation a parse tree where characters of the
45 // input string are interleaved with tags. Input characters occupy the lower
46 // byte of the 16-bit value, leaving the higher byte as zero. Tags are stored as
47 // numeric values shifted to the upper half of the 16-bit range.
48 struct tstring_t {
49     size_t capacity;
50     size_t length;
51     tchar_t *string;
52 };
53 
54 // Tags use the upper half of the t-string charater value range.
55 static const tchar_t TAG_BASE = 1 << (8 * sizeof(tchar_t) - 1);
56 
57 // standard flags
58 static const int REG_EXTENDED  = 1u << 0;
59 static const int REG_ICASE     = 1u << 1;
60 static const int REG_NOSUB     = 1u << 2;
61 static const int REG_NEWLINE   = 1u << 3;
62 static const int REG_NOTBOL    = 1u << 4;
63 static const int REG_NOTEOL    = 1u << 5;
64 // extensions
65 static const int REG_NFA       = 1u << 6;
66 static const int REG_LEFTMOST  = 1u << 7;
67 static const int REG_TRIE      = 1u << 8;
68 static const int REG_GTOP      = 1u << 9;
69 static const int REG_SLOWPREC  = 1u << 10;
70 static const int REG_BACKWARD  = 1u << 11;
71 static const int REG_KUKLEWICZ = 1u << 12;
72 static const int REG_STADFA    = 1u << 13;
73 static const int REG_REGLESS   = 1u << 14;
74 static const int REG_SUBHIST   = 1u << 15;
75 static const int REG_TSTRING   = 1u << 16;
76 static const int REG_AUTOTAGS  = 1u << 17;
77 
78 struct regex_t {
79     size_t re_nsub;
80     size_t re_ntag;
81     re2c::RangeMgr *rmgr;
82     const re2c::nfa_t *nfa;
83     const re2c::dfa_t *dfa;
84     const re2c::libre2c::rldfa_t *rldfa;
85     void *simctx;
86     size_t *char2class;
87     int flags;
88     union {
89         regoff_t *regs;
90         re2c::libre2c::regoff_trie_t *regtrie;
91     };
92     // Allow storing submatch results in RE (this is needed for Java bindings).
93     union {
94         regmatch_t *pmatch;
95         subhistory_t *psubhist;
96         mutable tstring_t tstring;
97     };
98 };
99 
100 static const int REG_NOMATCH = INT_MAX;
101 
102 // standard functions
103 int regcomp(regex_t *preg, const char *pattern, int cflags);
104 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);
105 int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[],
106     int eflags);
107 void regfree(regex_t *preg);
108 
109 // The regparse() function returns parse results as an array of nmatch size,
110 // where each element is an array of offset pairs for the corresponding
111 // capturing group. If a group has matched repeatedly at different parts of the
112 // input string, then its array will contain multiple offset pairs; otherwise, a
113 // single pair. This differs from the standard POSIX API, where only the last
114 // offset pair for each group is returned.
115 //
116 // regparse() can be used if REG_SUBHIST flag has been passed to regcomp().
117 //
118 // The allocation of memory for the parse results is done by the library (the
119 // user cannot know how much memory will be needed). The ownership of parse
120 // results is transferred to the user, who is expected to deallocate memory
121 // allocated by regparse() with regfreesub().
122 subhistory_t *regparse(const regex_t *preg, const char *string, size_t nmatch);
123 
124 // regfreesub() deallocates memory allocated with regparse(). It must be called
125 // every time after regparse() is called.
126 void regfreesub(subhistory_t *history);
127 
128 // The regtstring() function constructs a t-string.
129 //
130 // It can be used if REG_TSTRING flag has been passed to regcomp().
131 //
132 // The goal of this function is to construct parse results in the cheapest
133 // possible way. There is no standard representation of a parse tree, and any
134 // nontrivial representation incurs significant overhead on construction (the
135 // algorithm spends more time constructing parse results than doing the actual
136 // parsing). The t-string representation requires very little effort to
137 // construct, provided that the t-string fragments corresponding to the tagged
138 // paths in the epsilon-closure are prepared in advance at regcomp() time. The
139 // fragments are stored as arrays of tags, so that they can be easily copied to
140 // the resulting t-string. Importantly, this does not require unwinding of tag
141 // history stored in a trie form, which is rather slow.
142 //
143 // T-string allocation is done on the library side, and the t-string is stored
144 // in the regexp structure. The user gets an immutable view of it and is not
145 // expected to free memory after calling regtstring(). Moreover, the library
146 // reuses the same memory on each regtstring() call and reallocates it only if
147 // the new t-string does not fit into the old memory area. So each call to
148 // regtstring() invalidates the previous results returned from it.
149 //
150 // The regtstring() function is primarily intended for benchmarking the speed
151 // of a parsing algorithm.
152 const tstring_t *regtstring(const regex_t *preg, const char *string);
153 
154 
155 #endif // _RE2C_LIB_REGEX_
156