1 #include "src/util/c99_stdint.h"
2 #include <string.h>
3 #include <set>
4 #include <string>
5 #include <valarray>
6 #include <vector>
7
8 #include "src/dfa/dfa.h"
9 #include "src/msg/msg.h"
10 #include "src/msg/location.h"
11 #include "src/msg/warn.h"
12 #include "src/regexp/rule.h"
13 #include "src/options/opt.h"
14 #include "src/util/forbid_copy.h"
15
16
17 namespace re2c {
18
19 struct tcmd_t;
20
21 /* note [unreachable rules]
22 *
23 * DFA may contain useless final states. Such states may
24 * appear as a result of:
25 * - (part of) one rule being shadowed by another rule,
26 * e.g. rule [ab] partially shadows [ac] and completely
27 * shadows [a]
28 *
29 * - infinite rules that greedily eat all input characters
30 * and never stop (they either fail on YYFILL or crash),
31 * e.g. [^]*. This does not occur when EOF rule is used,
32 * because in that case YYFILL returns on failure.
33 *
34 * - rules that contain never-matching link, e.g. '[]'
35 * with option '--empty-class match-none'
36 *
37 * Useless final states should be eliminated so that they
38 * don't interfere with further analyses and optimizations.
39 * If all final states of a rule are useless, then the whole
40 * rule is unreachable and should be reported.
41 *
42 * In order to find out if a given final state is useless,
43 * we have to find out if all outgoing paths from this state
44 * match longer rules (otherwise, some paths go to default
45 * state and fallback to this state). We do this by finding
46 * all states that have transitions to default state and back
47 * propagation of "none-rule" from these states. As the back
48 * propagation meets the first final state on its way, it
49 * substitutes "none-rule" with the corresponding rule,
50 * which is further propagated back to the start state of DFA.
51 */
52
53
54 /* note [fallback states]
55 *
56 * Find states that are accepting, but may be shadowed
57 * by other accepting states: when the short rule matches,
58 * lexer must try to match longer rules; if this attempt is
59 * unsuccessful it must fallback to the short match.
60 *
61 * In order to find fallback states we need to know if
62 * "none-rule" is reachable from the given state, the information
63 * we have after rule liveness analyses. Fallback states are
64 * needed at different points in time (both before and after
65 * certain transformations on DFA). Fortunately, fallback states
66 * are not affected by these transformations, so we can calculate
67 * them here and save for future use.
68 */
69
70 // reversed DFA
71 struct rdfa_t
72 {
73 struct arc_t
74 {
75 size_t dest;
76 arc_t *next;
77 };
78
79 struct state_t
80 {
81 arc_t *arcs;
82 size_t rule;
83 bool fallthru;
84 };
85
86 size_t nstates;
87 size_t nrules;
88 state_t *states;
89 arc_t *arcs;
90
rdfa_tre2c::rdfa_t91 explicit rdfa_t(const dfa_t &dfa)
92 : nstates(dfa.states.size())
93 , nrules(dfa.rules.size())
94 , states(new state_t[nstates]())
95 , arcs(new arc_t[nstates * dfa.nchars])
96 {
97 // init states
98 for (size_t i = 0; i < nstates; ++i) {
99 state_t &s = states[i];
100 s.arcs = NULL;
101 const size_t r = dfa.states[i]->rule;
102 s.rule = r == Rule::NONE ? nrules : r;
103 s.fallthru = false;
104 }
105 // init arcs
106 arc_t *a = arcs;
107 for (size_t i = 0; i < nstates; ++i) {
108 dfa_state_t *s = dfa.states[i];
109 for (size_t c = 0; c < dfa.nchars; ++c) {
110 const size_t j = s->arcs[c];
111 if (j != dfa_t::NIL) {
112 a->dest = i;
113 a->next = states[j].arcs;
114 states[j].arcs = a++;
115 } else {
116 states[i].fallthru = true;
117 }
118 }
119 }
120 }
121
~rdfa_tre2c::rdfa_t122 ~rdfa_t()
123 {
124 delete[] states;
125 delete[] arcs;
126 }
127
128 FORBID_COPY(rdfa_t);
129 };
130
backprop(const rdfa_t & rdfa,bool * live,size_t rule,size_t state)131 static void backprop(const rdfa_t &rdfa, bool *live,
132 size_t rule, size_t state)
133 {
134 // "none-rule" is unreachable from final states:
135 // be careful to mask it before propagating
136 const rdfa_t::state_t &s = rdfa.states[state];
137 if (rule == rdfa.nrules) {
138 rule = s.rule;
139 }
140
141 // if the rule has already been set, than either it's a loop
142 // or another branch of back propagation has already been here,
143 // in both cases we should stop: there's nothing new to propagate
144 bool &l = live[rule * rdfa.nstates + state];
145 if (l) return;
146 l = true;
147
148 for (const rdfa_t::arc_t *a = s.arcs; a; a = a->next) {
149 backprop(rdfa, live, rule, a->dest);
150 }
151 }
152
liveness_analyses(const rdfa_t & rdfa,bool * live)153 static void liveness_analyses(const rdfa_t &rdfa, bool *live)
154 {
155 for (size_t i = 0; i < rdfa.nstates; ++i) {
156 const rdfa_t::state_t &s = rdfa.states[i];
157 if (s.fallthru) {
158 backprop(rdfa, live, s.rule, i);
159 }
160 }
161 }
162
warn_dead_rules(const dfa_t & dfa,const std::string & cond,const bool * live,Msg & msg)163 static void warn_dead_rules(const dfa_t &dfa, const std::string &cond,
164 const bool *live, Msg &msg)
165 {
166 const size_t nstates = dfa.states.size();
167 const size_t nrules = dfa.rules.size();
168
169 for (size_t i = 0; i < nstates; ++i) {
170 const size_t r = dfa.states[i]->rule;
171 if (r != Rule::NONE && !live[r * nstates + i]) {
172 // skip last rule (it's the NONE-rule)
173 for (size_t j = 0; j < nrules; ++j) {
174 if (live[j * nstates + i]) {
175 dfa.rules[r].shadow.insert(dfa.rules[j].semact->loc.line);
176 }
177 }
178 }
179 }
180
181 for (size_t i = 0; i < nrules; ++i) {
182 // default rule '*' should not be reported
183 if (i != dfa.def_rule && !live[i * nstates]) {
184 msg.warn.unreachable_rule(cond, dfa.rules[i]);
185 }
186 }
187 }
188
warn_sentinel_in_midrule(const dfa_t & dfa,const opt_t * opts,const std::string & cond,const bool * live,Msg & msg)189 static void warn_sentinel_in_midrule(const dfa_t &dfa, const opt_t *opts
190 , const std::string &cond, const bool *live, Msg &msg)
191 {
192 // perform check only in case sentinel method is used
193 if (opts->fill_use || opts->eof != NOEOF) return;
194
195 const size_t nstates = dfa.states.size();
196 const size_t nsym = dfa.nchars;
197 const size_t nrules = dfa.rules.size();
198 bool *bad = new bool[nrules];
199 memset(bad, 0, nrules);
200
201 // find character class that contains sentinel symbol
202 const uint32_t sentsym = opts->sentinel == NOEOF ? 0 : opts->sentinel;
203 uint32_t sentcls = 0;
204 DASSERT(dfa.charset.size() == nsym + 1);
205 for (; sentcls < nsym && sentsym >= dfa.charset[sentcls + 1]; ++sentcls)
206 ;
207 DASSERT(sentcls < nsym);
208
209 // check that every transition on sentinel symbol goes to an end state
210 // that has no further transitions; otherwise, give a warning or, if
211 // 're2c:sentinel' configuration is used, an error
212 for (size_t i = 0; i < nstates; ++i) {
213 const size_t j = dfa.states[i]->arcs[sentcls];
214 if (j == dfa_t::NIL) continue;
215
216 const size_t *arcs = dfa.states[j]->arcs;
217 for (size_t c = 0; c < nsym; ++c) {
218 const size_t k = arcs[c];
219 if (k == dfa_t::NIL) continue;
220
221 for (size_t r = 0; r < nrules; ++r) {
222 bad[r] |= live[r * nstates + k];
223 }
224 }
225 }
226
227 for (size_t r = 0; r < nrules; ++r) {
228 if (bad[r]) {
229 msg.warn.sentinel_in_midrule(dfa.rules[r].semact->loc, cond, opts->sentinel);
230 }
231 }
232
233 delete[] bad;
234 }
235
remove_dead_final_states(dfa_t & dfa,const bool * fallthru)236 static void remove_dead_final_states(dfa_t &dfa, const bool *fallthru)
237 {
238 const size_t
239 nstates = dfa.states.size(),
240 nsym = dfa.nchars;
241
242 for (size_t i = 0; i < nstates; ++i) {
243 dfa_state_t *s = dfa.states[i];
244 if (s->rule == Rule::NONE) continue;
245
246 // final state is useful iff there is at least one
247 // non-accepting path from this state
248 bool shadowed = true;
249 for (size_t c = 0; c < nsym; ++c) {
250 const size_t j = s->arcs[c];
251 if (j == dfa_t::NIL || fallthru[j]) {
252 shadowed = false;
253 break;
254 }
255 }
256
257 if (shadowed) {
258 s->rule = Rule::NONE;
259 s->tcmd[nsym] = NULL;
260 }
261 }
262 }
263
find_fallback_states(dfa_t & dfa,const bool * fallthru)264 static void find_fallback_states(dfa_t &dfa, const bool *fallthru)
265 {
266 const size_t nstate = dfa.states.size();
267 const size_t nsym = dfa.nchars;
268
269 for (size_t i = 0; i < nstate; ++i) {
270 dfa_state_t *s = dfa.states[i];
271 s->fallthru = fallthru[i];
272 if (s->rule == Rule::NONE) continue;
273
274 // A final state is a fallback state if there are non-accepting paths
275 // from it (i.e. paths that end with a transition to default state).
276 for (size_t c = 0; c < nsym; ++c) {
277 const size_t j = s->arcs[c];
278 if (j != dfa_t::NIL && fallthru[j]) {
279 s->fallback = true;
280 break;
281 }
282 }
283 }
284 }
285
find_fallback_states_with_eof_rule(dfa_t & dfa)286 static void find_fallback_states_with_eof_rule(dfa_t &dfa)
287 {
288 const size_t nstate = dfa.states.size();
289 const size_t nsym = dfa.nchars;
290
291 for (size_t i = 0; i < nstate; ++i) {
292 dfa_state_t *s = dfa.states[i];
293 if (s->rule == Rule::NONE || s->rule == dfa.eof_rule) continue;
294
295 // With EOF rule, a final state is a fallback state if it has outgoing
296 // transitions to any non-accepting states (even if all possible paths
297 // on those transitions are accepting, there is a possibility of YYFILL
298 // failure on such path, which adds a default quasi-transition).
299 for (size_t c = 0; c < nsym; ++c) {
300 const size_t j = s->arcs[c];
301 if (j != dfa_t::NIL && dfa.states[j]->rule == Rule::NONE) {
302 s->fallback = true;
303 break;
304 }
305 }
306 }
307 }
308
remove_dead_final_states_with_eof_rule(dfa_t & dfa)309 static void remove_dead_final_states_with_eof_rule(dfa_t &dfa)
310 {
311 // EOF is like a special symbol that is not covered by any of the rules.
312 // Therefore rules cannot be completely shadowed by other rules, with one
313 // exception: if a rule matches empty string and the initial state is not a
314 // fallback state (i.e. all outgoing paths are accepting), then this rule
315 // will never match (if it is EOF in the initial state, then EOF rule takes
316 // priority, otherwise one of the longer rules will match).
317 DASSERT(!dfa.states.empty());
318 dfa_state_t *s0 = dfa.states[0];
319 if (s0->rule != Rule::NONE && s0->rule != dfa.eof_rule && !s0->fallback) {
320 s0->rule = dfa.eof_rule;
321 }
322 }
323
cutoff_dead_rules(dfa_t & dfa,const opt_t * opts,const std::string & cond,Msg & msg)324 void cutoff_dead_rules(dfa_t &dfa, const opt_t *opts, const std::string &cond, Msg &msg)
325 {
326 if (opts->eof != NOEOF) {
327 // See note [EOF rule handling].
328 find_fallback_states_with_eof_rule(dfa);
329 remove_dead_final_states_with_eof_rule(dfa);
330 } else {
331 const rdfa_t rdfa(dfa);
332 const size_t ns = rdfa.nstates;
333 const size_t nl = (rdfa.nrules + 1) * ns;
334 bool *live = new bool[nl];
335 bool *fallthru = live + nl - ns;
336 memset(live, 0, nl * sizeof(bool));
337
338 liveness_analyses(rdfa, live);
339
340 warn_dead_rules(dfa, cond, live, msg);
341 remove_dead_final_states(dfa, fallthru);
342 warn_sentinel_in_midrule(dfa, opts, cond, live, msg);
343 find_fallback_states(dfa, fallthru);
344
345 delete[] live;
346 }
347 }
348
349 } // namespace re2c
350
351