/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Utility functions related to SOM ("Start of Match").
 */
#include "ng_som_util.h"

#include "ng_depth.h"
#include "ng_execute.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph_range.h"

using namespace std;

namespace ue2 {

static
void wireSuccessorsToStart(NGHolder &g, NFAVertex u) {
    for (auto v : adjacent_vertices_range(u, g)) {
        add_edge_if_not_present(g.start, v, g);
    }
}

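/* Note: the returned vector is indexed by vertex index in g_orig, so a caller
 * looks up a vertex's depth bounds as, e.g.:
 *
 *   vector<DepthMinMax> depths = getDistancesFromSOM(g);
 *   const DepthMinMax &dm = depths.at(g[v].index);
 */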
vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) {
    // We operate on a temporary copy of the original graph here, so we don't
    // have to mutate the original.
    NGHolder g;
    unordered_map<NFAVertex, NFAVertex> vmap; // vertex in g_orig to vertex in g
    cloneHolder(g, g_orig, &vmap);

    vector<NFAVertex> vstarts;
    for (auto v : vertices_range(g)) {
        if (is_virtual_start(v, g)) {
            vstarts.push_back(v);
        }
    }
    vstarts.push_back(g.startDs);

    // wire the successors of every virtual start or startDs to g.start.
    for (auto v : vstarts) {
        wireSuccessorsToStart(g, v);
    }

    // drop the in-edges of every virtual start so that they don't participate
    // in the depth calculation.
    for (auto v : vstarts) {
        clear_in_edges(v, g);
    }

    //dumpGraph("som_depth.dot", g);

    // Find depths, indexed by vertex index in g
    auto temp_depths = calcDepthsFrom(g, g.start);

    // Transfer depths, indexed by vertex index in g_orig.
    vector<DepthMinMax> depths(num_vertices(g_orig));

    for (auto v_orig : vertices_range(g_orig)) {
        assert(contains(vmap, v_orig));
        NFAVertex v_new = vmap[v_orig];

        u32 orig_idx = g_orig[v_orig].index;

        DepthMinMax &d = depths.at(orig_idx);

        if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) {
            // StartDs and virtual starts always have zero depth.
            d = DepthMinMax(depth(0), depth(0));
        } else {
            u32 new_idx = g[v_new].index;
            d = temp_depths.at(new_idx);
        }
    }

    return depths;
}

bool firstMatchIsFirst(const NGHolder &p) {
    /* If the first match (by end offset) is not the first match (by start
     * offset) then we can't create a lock after it.
     *
     * Consider: 4009:/(foobar|ob).*bugger/s
     *
     * We don't care about races on the last byte as they can be resolved
     * easily at runtime: /(foobar|obar).*hi/
     *
     * It should be obvious that we don't care about one match being a prefix
     * of another, as they share the same start offset.
     *
     * Therefore, the case where we cannot establish that the SOM does not
     * regress is when there exist s1 and s2 in the language of p such that s2
     * is a proper infix of s1.
     *
     * It is tempting to add the further restriction that there does not exist
     * a prefix of s1 that is in the language of p (as in that case we would
     * presume the lock has already been set). However, we have no way of
     * knowing if the lock can be cleared by some characters, and if so,
     * whether it is still set. TODO: if we knew the lock's escapes, we could
     * verify that the rest of s1 does not clear the lock. (1)
     */
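
    /* Worked example of the infix condition: for the prefix p = (foobar|ob),
     * both s1 = "foobar" and s2 = "ob" are in the language of p, and s2 is a
     * proper infix of s1. Scanning "foobar", the first match by end offset is
     * "ob" (SOM 2, ending at offset 4), while the first match by start offset
     * is "foobar" (SOM 0, ending at offset 6), so a lock set after the first
     * match would record the wrong SOM. */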

    DEBUG_PRINTF("entry\n");

    /* If there are any big cycles, throw up our hands in despair */
    if (hasBigCycles(p)) {
        DEBUG_PRINTF("fail, big cycles\n");
        return false;
    }

    flat_set<NFAVertex> states;
    /* turn on all states (except starts - avoid suffix matches) */
    /* If we were doing (1) we would also except states leading to accepts -
       avoid prefix matches */
    for (auto v : vertices_range(p)) {
        assert(!is_virtual_start(v, p));
        if (!is_special(v, p)) {
            DEBUG_PRINTF("turning on %zu\n", p[v].index);
            states.insert(v);
        }
    }

    /* run the prefix through the main graph */
    states = execute_graph(p, p, states);
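
    /* A state v still on here was reached by running some word s in the
     * language of p from a mid-graph state q0, i.e. s may finish as an infix
     * of a longer potential match passing through v. If v has an edge to
     * accept, the containing match also ends at the same offset as s, so the
     * earlier SOM is available simultaneously (a last-byte race, resolvable
     * at runtime). If v has no edge to accept, s would end at an offset where
     * no containing match ends, and the SOM could regress. */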
    for (auto v : states) {
        /* need to check if this vertex may represent an infix match - i.e.
         * it does not have an edge to accept. */
        DEBUG_PRINTF("check %zu\n", p[v].index);
        if (!edge(v, p.accept, p).second) {
            DEBUG_PRINTF("fail %zu\n", p[v].index);
            return false;
        }
    }

    DEBUG_PRINTF("done first is first check\n");
    return true;
}

bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* Need to ensure that all matches of the graph g up to u contain no
     * infixes which are also matches of the graph to u.
     *
     * This is basically the same as firstMatchIsFirst, except that g is not
     * always a DAG. As we haven't gotten around to writing an execute_graph
     * that operates on general graphs, we take some (hopefully) conservative
     * shortcuts.
     *
     * Note: if u can be jumped over, we will take jump edges into account as
     * a possibility of the SOM going backwards.
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);
    if (contains(cache.smgb, u)) {
        return cache.smgb[u];
    }

    DEBUG_PRINTF("checking if som can go backwards on %zu\n", g[u].index);

    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start));

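    /* The block below is unreachable on fall-through; the exit label serves
     * as the common return path, jumped to once rv is known so that the
     * result is stored in cache.smgb before returning. */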
    bool rv;
    if (0) {
    exit:
        DEBUG_PRINTF("using cached result\n");
        cache.smgb[u] = rv;
        return rv;
    }

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %zu %zu\n", g[s].index, g[t].index);
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    /* treat virtual starts as unconditional - wire to startDs instead */
    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    /* treat u as the only accept state */
    NFAVertex c_u = orig_to_copy[u];
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);
    clear_out_edges(c_u, c_g);
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);
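
    /* Account for "jump" edges (see the note above): if a predecessor t of u
     * has an edge directly to a successor of u, a match may bypass u
     * entirely, so reaching such a t is also treated as a match by wiring its
     * copy to accept below. */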
    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                /* due to virtual starts being aliased with normal starts in the
                 * copy of the graph, we may have already added the edges. */
                add_edge_if_not_present(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    be.clear();
    boost::depth_first_search(c_g, visitor(backEdgeVisitor)
                                   .root_vertex(c_g.start));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %zu %zu\n", c_g[s].index, c_g[t].index);
        if (s != t) {
            assert(0);
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n");

    rv = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n", (int)rv);
    goto exit;
}

bool sentClearsTail(const NGHolder &g,
                    const unordered_map<NFAVertex, u32> &region_map,
                    const NGHolder &sent, u32 last_head_region,
                    u32 *bad_region) {
    /* If a subsequent match from the prefix clears the rest of the pattern,
     * we can just keep track of the last match of the prefix.
     * To see if this property holds, we could:
     *
     * 1A: turn on all states in the tail and run all strings that may match
     *     the prefix past the tail; if we are still in any states, then this
     *     property does not hold.
     *
     * 1B: turn on the initial states of the tail and run any strings which
     *     may finish any partial matches in the prefix and see if we end up
     *     with anything, which would also imply that this property does not
     *     hold.
     *
     * OR
     *
     * 2: just turn everything on and run the prefix inputs past it and see
     * what we are left with. I think that is equivalent to scheme 1 and is
     * easier to implement. TODO: ponder
     *
     * Anyway, we are going with scheme 2 until further notice.
     */

    u32 first_bad_region = ~0U;
    flat_set<NFAVertex> states;
    /* turn on all states */
    DEBUG_PRINTF("region %u is cutover\n", last_head_region);
    for (auto v : vertices_range(g)) {
        if (v != g.accept && v != g.acceptEod) {
            states.insert(v);
        }
    }

    for (UNUSED auto v : states) {
        DEBUG_PRINTF("start state: %zu\n", g[v].index);
    }

    /* run the prefix through the main graph */
    states = execute_graph(g, sent, states);
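
    /* At this point `states` holds the vertices of g that can still be
     * active after some word reportable by the prefix has been consumed from
     * the all-on starting configuration (scheme 2 above). */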

    /* .. and check if we are left with anything in the tail region */
    for (auto v : states) {
        if (v == g.start || v == g.startDs) {
            continue; /* not in tail */
        }

        DEBUG_PRINTF("v %zu is still on\n", g[v].index);
        assert(v != g.accept && v != g.acceptEod); /* no cr */

        assert(contains(region_map, v));
        const u32 v_region = region_map.at(v);
        if (v_region > last_head_region) {
            DEBUG_PRINTF("bailing, %u > %u\n", v_region, last_head_region);
            first_bad_region = min(first_bad_region, v_region);
        }
    }

    if (first_bad_region != ~0U) {
        DEBUG_PRINTF("first bad region is %u\n", first_bad_region);
        *bad_region = first_bad_region;
        return false;
    }

    return true;
}

} // namespace ue2