1 /*
2  * Copyright (c) 2015-2017, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Analysis pass to reform leading dots.
31  *
32  * We have found that many regexes found in the wild use an anchored dot-repeat
33  * to represent an unanchored pattern, particularly if they have been used with
34  * a regex engine that assumes that a pattern is anchored. This pass reforms
35  * patterns that begin with sequences of dots into a more standard form.
36  *
37  * In addition, both anchored and unanchored patterns with dot repeats as
38  * prefixes will have these prefixes reformed into a canonical form, which some
39  * later analyses depend upon.
40  */
41 #include "ng_anchored_dots.h"
42 
43 #include "grey.h"
44 #include "ng_holder.h"
45 #include "ng_util.h"
46 #include "ue2common.h"
47 #include "util/container.h"
48 #include "util/depth.h"
49 #include "util/graph_range.h"
50 
51 #include <algorithm>
52 #include <queue>
53 #include <set>
54 #include <vector>
55 
56 using namespace std;
57 
58 namespace ue2 {
59 
60 static
findStarts(const NGHolder & g,set<NFAVertex> & anchored,set<NFAVertex> & unanchored)61 bool findStarts(const NGHolder &g, set<NFAVertex> &anchored,
62                 set<NFAVertex> &unanchored) {
63     // Populate unanchored map
64     for (auto v : adjacent_vertices_range(g.startDs, g)) {
65         if (is_special(v, g)) {
66             continue;
67         }
68         unanchored.insert(v);
69     }
70 
71     // Populate anchored map
72     for (auto v : adjacent_vertices_range(g.start, g)) {
73         if (is_special(v, g)) {
74             continue;
75         }
76         anchored.insert(v);
77     }
78 
79     if (unanchored == anchored) {
80         anchored.clear();
81     } else if (!unanchored.empty() && !anchored.empty()) {
82         return false;
83     }
84 
85     return !anchored.empty() || !unanchored.empty();
86 }
87 
88 namespace {
89 class DotInfo {
90 public:
DotInfo(NFAVertex v,bool se,u32 idx)91     DotInfo(NFAVertex v, bool se, u32 idx)
92         : vertex(v), hasSelfLoop(se), index(idx) {}
93 
operator <(const DotInfo & other) const94     bool operator<(const DotInfo &other) const {
95         if (hasSelfLoop != other.hasSelfLoop)
96             return hasSelfLoop < other.hasSelfLoop;
97         // tie break with vertex id: lowest ID wins
98         return index > other.index;
99     }
100 
101     NFAVertex vertex;
102     bool hasSelfLoop;
103     u32 index;
104 };
105 }
106 
107 // Returns nullptr if all vertices in the given set are not dots.
108 // We can only pick one dot vertex, so we go for a dot-star if it exists,
109 // otherwise the dot without a self-edge with the lowest ID.
110 static
findReformable(const NGHolder & g,const set<NFAVertex> & starts,set<NFAVertex> & otherV)111 NFAVertex findReformable(const NGHolder &g, const set<NFAVertex> &starts,
112                          set<NFAVertex> &otherV) {
113     priority_queue<DotInfo> dotq;
114     for (auto v : starts) {
115         if (is_dot(v, g)) {
116             u32 idx = g[v].index;
117             dotq.push(DotInfo(v, hasSelfLoop(v, g), idx));
118         }
119     }
120 
121     if (dotq.empty()) {
122         return NGHolder::null_vertex();
123     }
124 
125     const DotInfo &dot = dotq.top();
126     otherV = starts;
127     otherV.erase(dot.vertex);
128     DEBUG_PRINTF("selected dot vertex %u (%s)\n", dot.index,
129                  dot.hasSelfLoop ? "has self-edge" : "no self-edge");
130     DEBUG_PRINTF("%zu other vertices\n", otherV.size());
131     return dot.vertex;
132 }
133 
134 // Returns true if the given vertex is only preceded by start. If start is
135 // graph.startDs (i.e. unanchored), the given vertex can also be connected to
136 // graph.start. If selfLoopIsAcceptable is set, self-loops are ignored.
137 static
isStartNode(NFAVertex v,NFAVertex start,const NGHolder & g,bool selfLoopIsAcceptable)138 bool isStartNode(NFAVertex v, NFAVertex start, const NGHolder &g,
139                  bool selfLoopIsAcceptable) {
140     for (auto u : inv_adjacent_vertices_range(v, g)) {
141         if (selfLoopIsAcceptable && u == v) {
142             continue;
143         } else if (u == start) {
144             continue;
145         } else if (start == g.startDs && u == g.start) {
146             continue;
147         } else {
148             return false;
149         }
150     }
151     return true;
152 }
153 
154 // Note: this will only remove the anchored first dot in the chain -- any other
155 // removable nodes will be handled by the unanchored case below.
156 static
reformAnchoredRepeatsComponent(NGHolder & g,set<NFAVertex> & compAnchoredStarts,set<NFAVertex> & compUnanchoredStarts,set<NFAVertex> & dead,depth * startBegin,depth * startEnd)157 void reformAnchoredRepeatsComponent(NGHolder &g,
158                                     set<NFAVertex> &compAnchoredStarts,
159                                     set<NFAVertex> &compUnanchoredStarts,
160                                     set<NFAVertex> &dead, depth *startBegin,
161                                     depth *startEnd) {
162     // anchored cases can not have any unanchored starts
163     if (!compUnanchoredStarts.empty()) {
164         DEBUG_PRINTF("we have unanchored starts, skipping\n");
165         return;
166     }
167 
168     NFAVertex dotV = NGHolder::null_vertex();
169     set<NFAVertex> otherV;
170     dotV = findReformable(g, compAnchoredStarts, otherV);
171     if (dotV == NGHolder::null_vertex()) {
172         DEBUG_PRINTF("no candidate reformable dot found.\n");
173         return;
174     }
175 
176     NFAEdge loopEdge;
177     bool selfLoop = false;
178     bool bustOut = false;
179 
180     for (const auto &e : out_edges_range(dotV, g)) {
181         NFAVertex t = target(e, g);
182         if (t == dotV) {
183             selfLoop = true;
184             loopEdge = e;
185             continue;
186         }
187 
188         if (is_special(t, g)) {
189             bustOut = true;
190             break;
191         }
192 
193         if (!otherV.empty() && otherV.find(t) == otherV.end()) {
194             bustOut = true;
195             break;
196         }
197     }
198 
199     if (bustOut) {
200         DEBUG_PRINTF("busting out\n");
201         return;
202     }
203 
204     if (!isStartNode(dotV, g.start, g, true)) {
205         DEBUG_PRINTF("fleeing: vertex %zu has other preds\n", g[dotV].index);
206         return;
207     }
208 
209     /* get bounds */
210     depth min;
211     depth max(1);
212 
213     if (selfLoop) {
214         // A self-loop indicates that this is a '.+' or '.*'
215         max = depth::infinity();
216     }
217 
218     if (!otherV.empty()) {
219         /* We require that the successors of the dot node are are the same
220          * as the start vertex. TODO: remember why.
221          */
222         if (selfLoop) {
223             if (otherV.size() != out_degree(dotV, g) - 1) {
224                 return;
225             }
226         } else {
227             if (otherV.size() != out_degree(dotV, g)) {
228                 return;
229             }
230         }
231 
232         min = depth(0);
233     } else {
234         min = depth(1);
235     }
236 
237     *startBegin = min;
238     *startEnd = max;
239 
240     for (auto t : adjacent_vertices_range(dotV, g)) {
241         if (t != dotV) {
242             add_edge_if_not_present(g.startDs, t, g);
243             add_edge_if_not_present(g.start, t, g);
244             compUnanchoredStarts.insert(t);
245         }
246     }
247 
248     for (auto v : otherV) {
249         remove_edge(g.start, v, g);
250     }
251 
252     DEBUG_PRINTF("removing vertex %zu\n", g[dotV].index);
253     clear_vertex(dotV, g);
254     dead.insert(dotV);
255     compAnchoredStarts.erase(dotV);
256 }
257 
258 static
reformUnanchoredRepeatsComponent(NGHolder & g,set<NFAVertex> & compAnchoredStarts,set<NFAVertex> & compUnanchoredStarts,set<NFAVertex> & dead,depth * startBegin,depth * startEnd)259 void reformUnanchoredRepeatsComponent(NGHolder &g,
260                                       set<NFAVertex> &compAnchoredStarts,
261                                       set<NFAVertex> &compUnanchoredStarts,
262                                       set<NFAVertex> &dead,
263                                       depth *startBegin, depth *startEnd) {
264     // unanchored cases can not have any anchored starts
265     if (!compAnchoredStarts.empty()) {
266         DEBUG_PRINTF("we have anchored starts, skipping\n");
267         return;
268     }
269 
270     while (true) {
271         NFAVertex dotV = NGHolder::null_vertex();
272         set<NFAVertex> otherV;
273         dotV = findReformable(g, compUnanchoredStarts, otherV);
274         if (dotV == NGHolder::null_vertex()) {
275             DEBUG_PRINTF("no candidate reformable dot found.\n");
276             return;
277         }
278 
279         NFAEdge loopEdge;
280         bool selfLoop = false;
281         bool bustOut = false;
282 
283         for (const auto &e : out_edges_range(dotV, g)) {
284             NFAVertex t = target(e, g);
285 
286             if (t == dotV) {
287                 selfLoop = true;
288                 loopEdge = e;
289                 continue;
290             }
291 
292             if (is_special(t, g)) {
293                 bustOut = true;
294                 break;
295             }
296 
297             if (!otherV.empty() && otherV.find(t) == otherV.end()) {
298                 bustOut = true;
299                 break;
300             }
301         }
302 
303         if (bustOut) {
304             DEBUG_PRINTF("busting out\n");
305             if (!selfLoop) {
306                 return;
307             }
308 
309             for (auto v : otherV) {
310                 if (!edge(dotV, v, g).second) {
311                     return;
312                 }
313             }
314 
315             // A self-loop indicates that this is a '.+' or '.*'
316             DEBUG_PRINTF("self-loop detected on %zu\n", g[dotV].index);
317             *startEnd = depth::infinity();
318             remove_edge(dotV, dotV, g);
319             return;
320         }
321 
322         if (!isStartNode(dotV, g.startDs, g, true)) {
323             DEBUG_PRINTF("fleeing: vertex %zu has other preds\n",
324                          g[dotV].index);
325             return;
326         }
327 
328         /* get bounds */
329         depth min(1);
330         depth max(1);
331 
332         if (selfLoop) {
333             // A self-loop indicates that this is a '.+' or '.*'
334             DEBUG_PRINTF("self-loop detected\n");
335             max = depth::infinity();
336         }
337 
338         if (!otherV.empty()) {
339             if (!selfLoop && otherV.size() != out_degree(dotV, g)) {
340                 return;
341             }
342 
343             if (selfLoop && otherV.size() != out_degree(dotV, g) - 1) {
344                 return;
345             }
346 
347             if (min > depth(1)) {
348                 /* this is not a case we can handle */
349                 DEBUG_PRINTF("min greater than one, skipping\n");
350                 return;
351             }
352             min = depth(0);
353         }
354 
355         *startBegin += min;
356         *startEnd += max;
357 
358         for (auto v : otherV) {
359             remove_edge(g.start, v, g);
360             remove_edge(g.startDs, v, g);
361         }
362 
363         compUnanchoredStarts.clear();
364         for (auto t : adjacent_vertices_range(dotV, g)) {
365             if (t != dotV) {
366                 DEBUG_PRINTF("connecting sds -> %zu\n", g[t].index);
367                 add_edge(g.startDs, t, g);
368                 add_edge(g.start, t, g);
369                 compUnanchoredStarts.insert(t);
370             }
371         }
372 
373         DEBUG_PRINTF("removing vertex %zu\n", g[dotV].index);
374         dead.insert(dotV);
375         clear_vertex(dotV, g);
376         compUnanchoredStarts.erase(dotV);
377     }
378 }
379 
380 // for t to be another optional dot, it must have only in-edges from v and from
381 // starts
382 static
isOptionalDot(NFAVertex t,NFAVertex v,const NGHolder & g)383 bool isOptionalDot(NFAVertex t, NFAVertex v, const NGHolder &g) {
384     if (!is_dot(t, g)) {
385         return false;
386     }
387 
388     bool found_v = false, found_start = false;
389 
390     for (auto u : inv_adjacent_vertices_range(t, g)) {
391         if (u == v) {
392             found_v = true;
393         } else if (u == g.start || u == g.startDs) {
394             found_start = true;
395         } else {
396             return false;
397         }
398     }
399 
400     return found_v && found_start;
401 }
402 
403 static
gatherParticipants(const NGHolder & g,NFAVertex start,NFAVertex initialDot,set<NFAVertex> & dots,set<NFAVertex> & succ)404 bool gatherParticipants(const NGHolder &g,
405                         NFAVertex start, NFAVertex initialDot,
406                         set<NFAVertex> &dots, set<NFAVertex> &succ) {
407     // Walk the graph downwards from the initial dot; each dot will have:
408     //      1) a single optional dot successor, or
409     //      2) N successors (our terminating case)
410     dots.insert(initialDot);
411     NFAVertex v = initialDot;
412 
413     while (out_degree(v, g) == 1) {
414         NFAVertex t = *(adjacent_vertices(v, g).first);
415         // for t to be another optional dot, it must have only in-edges from v
416         // and from starts
417         if (isOptionalDot(t, v, g)) {
418             // another dot; bail if we've seen it once already
419             if (dots.find(t) != dots.end()) {
420                 DEBUG_PRINTF("cycle detected at vertex %zu\n", g[t].index);
421                 return false;
422             }
423             dots.insert(t);
424             v = t;
425             continue;
426         }
427         // otherwise, we found a terminating dot state
428         break;
429     }
430 
431     // Our terminating states are the successors of v.
432     // All of these MUST have an edge from start as well.
433     for (auto w : adjacent_vertices_range(v, g)) {
434         succ.insert(w);
435         if (!edge(start, w, g).second) {
436             DEBUG_PRINTF("failing, vertex %zu does not have edge from start\n",
437                          g[w].index);
438             return false;
439         }
440     }
441 
442     /* All the non chained v connected to start must be in succ as well
443      * TODO: remember why (and document). */
444     for (auto u : adjacent_vertices_range(start, g)) {
445         if (is_special(u, g)) {
446             continue;
447         }
448         if (!contains(dots, u) && !contains(succ, u)) {
449             return false;
450         }
451     }
452 
453     return !succ.empty();
454 }
455 
456 static
collapseVariableDotRepeat(NGHolder & g,NFAVertex start,set<NFAVertex> & dead,UNUSED depth * startBegin,depth * startEnd)457 void collapseVariableDotRepeat(NGHolder &g, NFAVertex start,
458                                set<NFAVertex> &dead, UNUSED depth *startBegin,
459                                depth *startEnd) {
460     // Handle optional dot repeat prefixes, e.g.
461     //     /^.{0,30}foo/s, /^.{0,5}foo/s, unanchored equivs
462     // Note that this code assumes that fixed repeats ('^.{5,20}') have been
463     // pruned already, down (in this case) to '^.{0,15}'.
464 
465     // The first of our optional dots must be connected to start. The jump edge
466     // past it will be verified in gatherParticipants(). If start is
467     // graph.start, it should not be connected to startDs.
468     NFAVertex initialDot = NGHolder::null_vertex();
469     for (auto v : adjacent_vertices_range(start, g)) {
470         if (is_special(v, g)) {
471             continue;
472         }
473         if (is_dot(v, g) && isStartNode(v, start, g, false)) {
474             if (initialDot) {
475                 return;
476             }
477             initialDot = v;
478             DEBUG_PRINTF("initial dot vertex is %zu\n", g[v].index);
479         }
480     }
481 
482     if (!initialDot) {
483         return;
484     }
485 
486     // Collect all the other optional dot vertices and the successor vertices
487     // by walking down the graph from initialDot
488     set<NFAVertex> dots, succ;
489     if (!gatherParticipants(g, start, initialDot, dots, succ)) {
490         DEBUG_PRINTF("gatherParticipants failed\n");
491         return;
492     }
493 
494     DEBUG_PRINTF("optional dot repeat with %zu participants, "
495                  "terminating in %zu non-dot nodes\n",
496                  dots.size(), succ.size());
497 
498     // Remove all the participants and set the start offset
499     dead.insert(dots.begin(), dots.end());
500 
501     DEBUG_PRINTF("current offsets: %s-%s\n", startBegin->str().c_str(),
502                  startEnd->str().c_str());
503 
504     if (start == g.start && startEnd->is_infinite()) {
505         *startEnd = depth(dots.size());
506     } else if (startEnd->is_finite()) {
507         *startEnd += dots.size();
508     }
509     assert(startEnd->is_reachable());
510 
511     // Connect our successor vertices to both start and startDs.
512     for (auto v : succ) {
513         add_edge_if_not_present(g.start, v, g);
514         add_edge_if_not_present(g.startDs, v, g);
515     }
516 }
517 
518 static
deleteVertices(set<NFAVertex> & dead,NGHolder & g)519 void deleteVertices(set<NFAVertex> &dead, NGHolder &g) {
520     if (!dead.empty()) {
521         DEBUG_PRINTF("pruning %zu vertices\n", dead.size());
522         remove_vertices(dead, g);
523     }
524     dead.clear();
525 }
526 
527 static
reformAnchoredRepeats(NGHolder & g,depth * startBegin,depth * startEnd)528 void reformAnchoredRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
529     DEBUG_PRINTF("component\n");
530     set<NFAVertex> anchored, unanchored, dead;
531     if (!findStarts(g, anchored, unanchored)) {
532         DEBUG_PRINTF("no starts\n");
533         return;
534     }
535 
536     reformAnchoredRepeatsComponent(g, anchored, unanchored, dead, startBegin,
537                                    startEnd);
538     deleteVertices(dead, g);
539 
540     reformUnanchoredRepeatsComponent(g, anchored, unanchored, dead, startBegin,
541                                      startEnd);
542     deleteVertices(dead, g);
543 }
544 
545 static
collapseVariableRepeats(NGHolder & g,depth * startBegin,depth * startEnd)546 void collapseVariableRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
547     DEBUG_PRINTF("collapseVariableRepeats\n");
548     set<NFAVertex> dead;
549 
550     collapseVariableDotRepeat(g, g.start, dead, startBegin, startEnd);
551     deleteVertices(dead, g);
552 
553     collapseVariableDotRepeat(g, g.startDs, dead, startBegin, startEnd);
554     deleteVertices(dead, g);
555 }
556 
557 static
addDotsBetween(NGHolder & g,NFAVertex lhs,vector<NFAVertex> & rhs,depth min_repeat,depth max_repeat)558 void addDotsBetween(NGHolder &g, NFAVertex lhs, vector<NFAVertex> &rhs,
559                     depth min_repeat, depth max_repeat) {
560     const bool unbounded = max_repeat.is_infinite();
561     if (unbounded) {
562         max_repeat = min_repeat;
563     }
564 
565     assert(max_repeat.is_finite());
566 
567     NFAVertex u = lhs;
568 
569     if (!min_repeat && unbounded) {
570         NFAVertex v = add_vertex(g);
571         add_edge(u, v, g);
572         g[v].char_reach.setall();
573 
574         for (auto w : rhs) {
575             add_edge(lhs, w, g);
576         }
577     }
578 
579     for (u32 i = 0; i < min_repeat; i++) {
580         NFAVertex v = add_vertex(g);
581         add_edge(u, v, g);
582         g[v].char_reach.setall();
583         u = v;
584     }
585 
586     NFAVertex split = u;
587     /* lhs now split point for optional */
588     for (u32 i = min_repeat; i < max_repeat; i++) {
589         NFAVertex v = add_vertex(g);
590         add_edge(u, v, g);
591         if (u != split) {
592             add_edge(split, v, g);
593         }
594         g[v].char_reach.setall();
595         u = v;
596     }
597 
598     if (unbounded) {
599         add_edge(u, u, g);
600     }
601 
602     for (auto w : rhs) {
603         add_edge(u, w, g);
604         if (split != u) {
605             add_edge(split, w, g);
606         }
607     }
608 }
609 
610 static
restoreLeadingDots(NGHolder & g,const depth & startBegin,const depth & startEnd)611 void restoreLeadingDots(NGHolder &g, const depth &startBegin,
612                         const depth &startEnd) {
613     if (startBegin == depth(0) && startEnd.is_infinite()) {
614         return;
615     }
616     DEBUG_PRINTF("ungobble (%s, %s)\n", startBegin.str().c_str(),
617                  startEnd.str().c_str());
618 
619     for (UNUSED auto v : adjacent_vertices_range(g.start, g)) {
620         assert(edge(g.startDs, v, g).second);
621     }
622     clear_out_edges(g.start, g);
623     add_edge(g.start, g.startDs, g);
624 
625     const bool unbounded = startEnd.is_infinite();
626 
627     NFAVertex root = unbounded ? g.startDs : g.start;
628 
629     vector<NFAVertex> rhs;
630     insert(&rhs, rhs.end(), adjacent_vertices(g.startDs, g));
631     rhs.erase(remove(rhs.begin(), rhs.end(), g.startDs), rhs.end());
632     for (auto v : rhs) {
633         remove_edge(g.startDs, v, g);
634     }
635 
636     addDotsBetween(g, root, rhs, startBegin, startEnd);
637     renumber_vertices(g);
638     renumber_edges(g);
639 }
640 
641 // Entry point.
reformLeadingDots(NGHolder & g)642 void reformLeadingDots(NGHolder &g) {
643     depth startBegin(0);
644     depth startEnd = depth::infinity();
645 
646     reformAnchoredRepeats(g, &startBegin, &startEnd);
647     collapseVariableRepeats(g, &startBegin, &startEnd);
648     restoreLeadingDots(g, startBegin, startEnd);
649 }
650 
651 } // namespace ue2
652