1 /*
2  * Copyright (c) 2015-2018, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Stop Alphabet calculation.
31  */
32 #include "ng_stop.h"
33 
34 #include "ng_depth.h"
35 #include "ng_holder.h"
36 #include "ng_misc_opt.h"
37 #include "ng_util.h"
38 #include "ue2common.h"
39 #include "nfa/castlecompile.h"
40 #include "som/som.h"
41 #include "util/charreach.h"
42 #include "util/container.h"
43 #include "util/dump_charclass.h"
44 #include "util/graph.h"
45 #include "util/graph_range.h"
46 #include "util/verify_types.h"
47 
48 #include <map>
49 #include <set>
50 #include <vector>
51 
52 using namespace std;
53 
54 namespace ue2 {
55 
56 /** Stop alphabet depth threshold. */
57 static const u32 MAX_STOP_DEPTH = 8;
58 
59 namespace {
60 
61 /** Depths from start, startDs for this graph. */
62 struct InitDepths {
InitDepthsue2::__anon07f050fb0111::InitDepths63     explicit InitDepths(const NGHolder &g)
64         : start(calcDepthsFrom(g, g.start)),
65           startDs(calcDepthsFrom(g, g.startDs)) {}
66 
maxDistue2::__anon07f050fb0111::InitDepths67     depth maxDist(const NGHolder &g, NFAVertex v) const {
68         u32 idx = g[v].index;
69         assert(idx < start.size() && idx < startDs.size());
70         const depth &d_start = start.at(idx).max;
71         const depth &d_startDs = startDs.at(idx).max;
72         if (d_start.is_unreachable()) {
73             return d_startDs;
74         } else if (d_startDs.is_unreachable()) {
75             return d_start;
76         }
77         return max(d_start, d_startDs);
78     }
79 
80 private:
81     vector<DepthMinMax> start;
82     vector<DepthMinMax> startDs;
83 };
84 
85 } // namespace
86 
87 /** Find the set of characters that are not present in the reachability of
88  * graph \p g after a certain depth (currently 8). If a character in this set
89  * is encountered, it means that the NFA is either dead or has not progressed
90  * more than 8 characters from its start states.
91  *
92  * This is only used to guide merging heuristics, use
93  * findLeftOffsetStopAlphabet for real uses.
94  */
findStopAlphabet(const NGHolder & g,som_type som)95 CharReach findStopAlphabet(const NGHolder &g, som_type som) {
96     const depth max_depth(MAX_STOP_DEPTH);
97     const InitDepths depths(g);
98     const map<NFAVertex, BoundedRepeatSummary> no_vertices;
99 
100     CharReach stopcr;
101 
102     for (auto v : vertices_range(g)) {
103         if (is_special(v, g)) {
104             continue;
105         }
106 
107         if (depths.maxDist(g, v) >= max_depth) {
108             if (som == SOM_NONE) {
109                 stopcr |= reduced_cr(v, g, no_vertices);
110             } else {
111                 stopcr |= g[v].char_reach;
112             }
113         }
114     }
115 
116     // Turn alphabet into stops.
117     stopcr.flip();
118 
119     return stopcr;
120 }
121 
122 /** Calculate the stop alphabet for each depth from 0 to MAX_STOP_DEPTH. Then
123  * build an eight-bit mask per character C, with each bit representing the
124  * depth before the location of character C (if encountered) that the NFA would
125  * be in a predictable start state. */
findLeftOffsetStopAlphabet(const NGHolder & g,som_type som)126 vector<u8> findLeftOffsetStopAlphabet(const NGHolder &g, som_type som) {
127     const depth max_depth(MAX_STOP_DEPTH);
128     const InitDepths depths(g);
129     const map<NFAVertex, BoundedRepeatSummary> no_vertices;
130 
131     vector<CharReach> reach(MAX_STOP_DEPTH);
132 
133     for (auto v : vertices_range(g)) {
134         if (is_special(v, g)) {
135             continue;
136         }
137         CharReach v_cr;
138         if (som == SOM_NONE) {
139             v_cr = reduced_cr(v, g, no_vertices);
140         } else {
141             v_cr = g[v].char_reach;
142         }
143 
144         u32 d = min(max_depth, depths.maxDist(g, v));
145         for (u32 i = 0; i < d; i++) {
146             reach[i] |= v_cr;
147         }
148     }
149 
150 #ifdef DEBUG
151     for (u32 i = 0; i < MAX_STOP_DEPTH; i++) {
152         DEBUG_PRINTF("depth %u, stop chars: ", i);
153         describeClass(stdout, ~reach[i], 20, CC_OUT_TEXT);
154         printf("\n");
155     }
156 #endif
157 
158     vector<u8> stop(N_CHARS, 0);
159 
160     for (u32 i = 0; i < MAX_STOP_DEPTH; i++) {
161         CharReach cr = ~reach[i]; // invert reach for stop chars.
162         const u8 mask = 1U << i;
163         for (size_t c = cr.find_first(); c != cr.npos; c = cr.find_next(c)) {
164             stop[c] |= mask;
165         }
166     }
167 
168     return stop;
169 }
170 
findLeftOffsetStopAlphabet(const CastleProto & castle,UNUSED som_type som)171 vector<u8> findLeftOffsetStopAlphabet(const CastleProto &castle,
172                                       UNUSED som_type som) {
173     const depth max_width = findMaxWidth(castle);
174     DEBUG_PRINTF("castle has reach %s and max width %s\n",
175                   describeClass(castle.reach()).c_str(),
176                   max_width.str().c_str());
177 
178     const CharReach escape = ~castle.reach(); // invert reach for stop chars.
179 
180     u32 d = min(max_width, depth(MAX_STOP_DEPTH));
181     const u8 mask = verify_u8((1U << d) - 1);
182 
183     vector<u8> stop(N_CHARS, 0);
184 
185     for (size_t c = escape.find_first(); c != escape.npos;
186          c = escape.find_next(c)) {
187         stop[c] |= mask;
188     }
189 
190     return stop;
191 }
192 
193 } // namespace ue2
194