1 #include "src/util/c99_stdint.h"
2 #include <string.h>
3 #include <set>
4 #include <string>
5 #include <valarray>
6 #include <vector>
7 
8 #include "src/dfa/dfa.h"
9 #include "src/msg/msg.h"
10 #include "src/msg/location.h"
11 #include "src/msg/warn.h"
12 #include "src/regexp/rule.h"
13 #include "src/options/opt.h"
14 #include "src/util/forbid_copy.h"
15 
16 
17 namespace re2c {
18 
19 struct tcmd_t;
20 
21 /* note [unreachable rules]
22  *
23  * DFA may contain useless final states. Such states may
24  * appear as a result of:
25  *   - (part of) one rule being shadowed by another rule,
26  *     e.g. rule [ab] partially shadows [ac] and completely
27  *     shadows [a]
28  *
29  *   - infinite rules that greedily eat all input characters
30  *     and never stop (they either fail on YYFILL or crash),
31  *     e.g. [^]*. This does not occur when EOF rule is used,
32  *     because in that case YYFILL returns on failure.
33  *
34  *   - rules that contain never-matching link, e.g. '[]'
35  *     with option '--empty-class match-none'
36  *
37  * Useless final states should be eliminated so that they
38  * don't interfere with further analyses and optimizations.
39  * If all final states of a rule are useless, then the whole
40  * rule is unreachable and should be reported.
41  *
42  * In order to find out if a given final state is useless,
43  * we have to find out if all outgoing paths from this state
44  * match longer rules (otherwise, some paths go to default
45  * state and fallback to this state). We do this by finding
46  * all states that have transitions to default state and back
47  * propagation of "none-rule" from these states. As the back
48  * propagation meets the first final state on its way, it
49  * substitutes "none-rule" with the corresponding rule,
50  * which is further propagated back to the start state of DFA.
51  */
52 
53 
54 /* note [fallback states]
55  *
56  * Find states that are accepting, but may be shadowed
57  * by other accepting states: when the short rule matches,
58  * lexer must try to match longer rules; if this attempt is
59  * unsuccessful it must fallback to the short match.
60  *
61  * In order to find fallback states we need to know if
62  * "none-rule" is reachable from the given state, the information
63  * we have after rule liveness analyses. Fallback states are
64  * needed at different points in time (both before and after
65  * certain transformations on DFA). Fortunately, fallback states
66  * are not affected by these transformations, so we can calculate
67  * them here and save for future use.
68  */
69 
70 // reversed DFA
71 struct rdfa_t
72 {
73     struct arc_t
74     {
75         size_t dest;
76         arc_t *next;
77     };
78 
79     struct state_t
80     {
81         arc_t *arcs;
82         size_t rule;
83         bool fallthru;
84     };
85 
86     size_t nstates;
87     size_t nrules;
88     state_t *states;
89     arc_t *arcs;
90 
rdfa_tre2c::rdfa_t91     explicit rdfa_t(const dfa_t &dfa)
92         : nstates(dfa.states.size())
93         , nrules(dfa.rules.size())
94         , states(new state_t[nstates]())
95         , arcs(new arc_t[nstates * dfa.nchars])
96     {
97         // init states
98         for (size_t i = 0; i < nstates; ++i) {
99             state_t &s = states[i];
100             s.arcs = NULL;
101             const size_t r = dfa.states[i]->rule;
102             s.rule = r == Rule::NONE ? nrules : r;
103             s.fallthru = false;
104         }
105         // init arcs
106         arc_t *a = arcs;
107         for (size_t i = 0; i < nstates; ++i) {
108             dfa_state_t *s = dfa.states[i];
109             for (size_t c = 0; c < dfa.nchars; ++c) {
110                 const size_t j = s->arcs[c];
111                 if (j != dfa_t::NIL) {
112                     a->dest = i;
113                     a->next = states[j].arcs;
114                     states[j].arcs = a++;
115                 } else {
116                     states[i].fallthru = true;
117                 }
118             }
119         }
120     }
121 
~rdfa_tre2c::rdfa_t122     ~rdfa_t()
123     {
124         delete[] states;
125         delete[] arcs;
126     }
127 
128     FORBID_COPY(rdfa_t);
129 };
130 
backprop(const rdfa_t & rdfa,bool * live,size_t rule,size_t state)131 static void backprop(const rdfa_t &rdfa, bool *live,
132     size_t rule, size_t state)
133 {
134     // "none-rule" is unreachable from final states:
135     // be careful to mask it before propagating
136     const rdfa_t::state_t &s = rdfa.states[state];
137     if (rule == rdfa.nrules) {
138         rule = s.rule;
139     }
140 
141     // if the rule has already been set, than either it's a loop
142     // or another branch of back propagation has already been here,
143     // in both cases we should stop: there's nothing new to propagate
144     bool &l = live[rule * rdfa.nstates + state];
145     if (l) return;
146     l = true;
147 
148     for (const rdfa_t::arc_t *a = s.arcs; a; a = a->next) {
149         backprop(rdfa, live, rule, a->dest);
150     }
151 }
152 
liveness_analyses(const rdfa_t & rdfa,bool * live)153 static void liveness_analyses(const rdfa_t &rdfa, bool *live)
154 {
155     for (size_t i = 0; i < rdfa.nstates; ++i) {
156         const rdfa_t::state_t &s = rdfa.states[i];
157         if (s.fallthru) {
158             backprop(rdfa, live, s.rule, i);
159         }
160     }
161 }
162 
warn_dead_rules(const dfa_t & dfa,const std::string & cond,const bool * live,Msg & msg)163 static void warn_dead_rules(const dfa_t &dfa, const std::string &cond,
164     const bool *live, Msg &msg)
165 {
166     const size_t nstates = dfa.states.size();
167     const size_t nrules = dfa.rules.size();
168 
169     for (size_t i = 0; i < nstates; ++i) {
170         const size_t r = dfa.states[i]->rule;
171         if (r != Rule::NONE && !live[r * nstates + i]) {
172             // skip last rule (it's the NONE-rule)
173             for (size_t j = 0; j < nrules; ++j) {
174                 if (live[j * nstates + i]) {
175                     dfa.rules[r].shadow.insert(dfa.rules[j].semact->loc.line);
176                 }
177             }
178         }
179     }
180 
181     for (size_t i = 0; i < nrules; ++i) {
182         // default rule '*' should not be reported
183         if (i != dfa.def_rule && !live[i * nstates]) {
184             msg.warn.unreachable_rule(cond, dfa.rules[i]);
185         }
186     }
187 }
188 
warn_sentinel_in_midrule(const dfa_t & dfa,const opt_t * opts,const std::string & cond,const bool * live,Msg & msg)189 static void warn_sentinel_in_midrule(const dfa_t &dfa, const opt_t *opts
190     , const std::string &cond, const bool *live, Msg &msg)
191 {
192     // perform check only in case sentinel method is used
193     if (opts->fill_use || opts->eof != NOEOF) return;
194 
195     const size_t nstates = dfa.states.size();
196     const size_t nsym = dfa.nchars;
197     const size_t nrules = dfa.rules.size();
198     bool *bad = new bool[nrules];
199     memset(bad, 0, nrules);
200 
201     // find character class that contains sentinel symbol
202     const uint32_t sentsym = opts->sentinel == NOEOF ? 0 : opts->sentinel;
203     uint32_t sentcls = 0;
204     DASSERT(dfa.charset.size() == nsym + 1);
205     for (; sentcls < nsym && sentsym >= dfa.charset[sentcls + 1]; ++sentcls)
206         ;
207     DASSERT(sentcls < nsym);
208 
209     // check that every transition on sentinel symbol goes to an end state
210     // that has no further transitions; otherwise, give a warning or, if
211     // 're2c:sentinel' configuration is used, an error
212     for (size_t i = 0; i < nstates; ++i) {
213         const size_t j = dfa.states[i]->arcs[sentcls];
214         if (j == dfa_t::NIL) continue;
215 
216         const size_t *arcs = dfa.states[j]->arcs;
217         for (size_t c = 0; c < nsym; ++c) {
218             const size_t k = arcs[c];
219             if (k == dfa_t::NIL) continue;
220 
221             for (size_t r = 0; r < nrules; ++r) {
222                 bad[r] |= live[r * nstates + k];
223             }
224         }
225     }
226 
227     for (size_t r = 0; r < nrules; ++r) {
228         if (bad[r]) {
229             msg.warn.sentinel_in_midrule(dfa.rules[r].semact->loc, cond, opts->sentinel);
230         }
231     }
232 
233     delete[] bad;
234 }
235 
remove_dead_final_states(dfa_t & dfa,const bool * fallthru)236 static void remove_dead_final_states(dfa_t &dfa, const bool *fallthru)
237 {
238     const size_t
239         nstates = dfa.states.size(),
240         nsym = dfa.nchars;
241 
242     for (size_t i = 0; i < nstates; ++i) {
243         dfa_state_t *s = dfa.states[i];
244         if (s->rule == Rule::NONE) continue;
245 
246         // final state is useful iff there is at least one
247         // non-accepting path from this state
248         bool shadowed = true;
249         for (size_t c = 0; c < nsym; ++c) {
250             const size_t j = s->arcs[c];
251             if (j == dfa_t::NIL || fallthru[j]) {
252                 shadowed = false;
253                 break;
254             }
255         }
256 
257         if (shadowed) {
258             s->rule = Rule::NONE;
259             s->tcmd[nsym] = NULL;
260         }
261     }
262 }
263 
find_fallback_states(dfa_t & dfa,const bool * fallthru)264 static void find_fallback_states(dfa_t &dfa, const bool *fallthru)
265 {
266     const size_t nstate = dfa.states.size();
267     const size_t nsym = dfa.nchars;
268 
269     for (size_t i = 0; i < nstate; ++i) {
270         dfa_state_t *s = dfa.states[i];
271         s->fallthru = fallthru[i];
272         if (s->rule == Rule::NONE) continue;
273 
274         // A final state is a fallback state if there are non-accepting paths
275         // from it (i.e. paths that end with a transition to default state).
276         for (size_t c = 0; c < nsym; ++c) {
277             const size_t j = s->arcs[c];
278             if (j != dfa_t::NIL && fallthru[j]) {
279                 s->fallback = true;
280                 break;
281             }
282         }
283     }
284 }
285 
find_fallback_states_with_eof_rule(dfa_t & dfa)286 static void find_fallback_states_with_eof_rule(dfa_t &dfa)
287 {
288     const size_t nstate = dfa.states.size();
289     const size_t nsym = dfa.nchars;
290 
291     for (size_t i = 0; i < nstate; ++i) {
292         dfa_state_t *s = dfa.states[i];
293         if (s->rule == Rule::NONE || s->rule == dfa.eof_rule) continue;
294 
295         // With EOF rule, a final state is a fallback state if it has outgoing
296         // transitions to any non-accepting states (even if all possible paths
297         // on those transitions are accepting, there is a possibility of YYFILL
298         // failure on such path, which adds a default quasi-transition).
299         for (size_t c = 0; c < nsym; ++c) {
300             const size_t j = s->arcs[c];
301             if (j != dfa_t::NIL && dfa.states[j]->rule == Rule::NONE) {
302                 s->fallback = true;
303                 break;
304             }
305         }
306     }
307 }
308 
remove_dead_final_states_with_eof_rule(dfa_t & dfa)309 static void remove_dead_final_states_with_eof_rule(dfa_t &dfa)
310 {
311     // EOF is like a special symbol that is not covered by any of the rules.
312     // Therefore rules cannot be completely shadowed by other rules, with one
313     // exception: if a rule matches empty string and the initial state is not a
314     // fallback state (i.e. all outgoing paths are accepting), then this rule
315     // will never match (if it is EOF in the initial state, then EOF rule takes
316     // priority, otherwise one of the longer rules will match).
317     DASSERT(!dfa.states.empty());
318     dfa_state_t *s0 = dfa.states[0];
319     if (s0->rule != Rule::NONE && s0->rule != dfa.eof_rule && !s0->fallback) {
320         s0->rule = dfa.eof_rule;
321     }
322 }
323 
cutoff_dead_rules(dfa_t & dfa,const opt_t * opts,const std::string & cond,Msg & msg)324 void cutoff_dead_rules(dfa_t &dfa, const opt_t *opts, const std::string &cond, Msg &msg)
325 {
326     if (opts->eof != NOEOF) {
327         // See note [EOF rule handling].
328         find_fallback_states_with_eof_rule(dfa);
329         remove_dead_final_states_with_eof_rule(dfa);
330     } else {
331         const rdfa_t rdfa(dfa);
332         const size_t ns = rdfa.nstates;
333         const size_t nl = (rdfa.nrules + 1) * ns;
334         bool *live = new bool[nl];
335         bool *fallthru = live + nl - ns;
336         memset(live, 0, nl * sizeof(bool));
337 
338         liveness_analyses(rdfa, live);
339 
340         warn_dead_rules(dfa, cond, live, msg);
341         remove_dead_final_states(dfa, fallthru);
342         warn_sentinel_in_midrule(dfa, opts, cond, live, msg);
343         find_fallback_states(dfa, fallthru);
344 
345         delete[] live;
346     }
347 }
348 
349 } // namespace re2c
350 
351