1 /*
2  * Copyright (c) 2015-2017, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /**
30  * \file
31  * \brief Build code for DFA minimization.
32  */
33 
34 /**
35  * /Summary of the Hopcroft minimisation algorithm/
36  *
37  * partition := {F, Q \ F};
38  * work_queue := {F};
39  * while (work_queue is not empty) do
40  *    choose and remove a set A from work_queue
41  *    for each c in . do
42  *         let X be the set of states for which a transition on c
43  *                                                  leads to a state in A
44  *         for each set Y in partition for which X . Y is nonempty and
45  *                                                  Y \ X is nonempty do
46  *              replace Y in partition by the two sets X . Y and Y \ X
47  *              if Y is in work_queue
48  *                   replace Y in work_queue by the same two sets
49  *              else
50  *                   if |X . Y| <= |Y \ X|
51  *                        add X . Y to work_queue
52  *                   else
53  *                        add Y \ X to work_queue
54  *         end;
55  *    end;
56  * end;
57  */
58 
59 #include "dfa_min.h"
60 
61 #include "grey.h"
62 #include "mcclellancompile_util.h"
63 #include "rdfa.h"
64 #include "ue2common.h"
65 #include "util/container.h"
66 #include "util/flat_containers.h"
67 #include "util/noncopyable.h"
68 #include "util/partitioned_set.h"
69 
70 #include <algorithm>
71 #include <functional>
72 #include <iterator>
73 #include <map>
74 #include <queue>
75 #include <set>
76 #include <vector>
77 
78 using namespace std;
79 
80 namespace ue2 {
81 
82 namespace {
83 
84 struct hopcroft_state_info {
hopcroft_state_infoue2::__anone072aabd0111::hopcroft_state_info85     explicit hopcroft_state_info(size_t alpha_size) : prev(alpha_size) {}
86 
87     /** \brief Mapping from symbol to a list of predecessors that transition to
88      * this state on that symbol. */
89     vector<vector<dstate_id_t>> prev;
90 };
91 
92 struct HopcroftInfo : noncopyable {
93     size_t alpha_size; //!< Size of DFA alphabet.
94     queue<size_t> work_queue; //!< Hopcroft work queue of partition indices.
95     partitioned_set<dstate_id_t> partition; //!< Partition set of DFA states.
96     vector<hopcroft_state_info> states; //!< Pre-calculated state info (preds)
97 
98     explicit HopcroftInfo(const raw_dfa &rdfa);
99 };
100 
101 } // namespace
102 
103 /**
104  * \brief Create an initial partitioning and work_queue.
105  *
106  * Initial partition contains {accepting states..., Non-accepting states}
107  * Initial work_queue contains accepting state subsets
108  *
109  * The initial partitioning needs to distinguish between the different
110  * reporting behaviours (unlike standard Hopcroft) --> more than one subset
111  * possible for the accepting states.
112  *
113  * Look for accepting states in both reports and reports_eod.
114  * Creates a map with a key(reports, reports_eod) and an id.
115  * Reports of each state are searched against the map and
116  * added to the corresponding id -> partition[id] and work_queue[id].
117  * Non Accept states are added to partition[id+1].
118  */
119 static
create_map(const raw_dfa & rdfa,queue<size_t> & work_queue)120 vector<size_t> create_map(const raw_dfa &rdfa, queue<size_t> &work_queue) {
121     using ReportKey = pair<flat_set<ReportID>, flat_set<ReportID>>;
122     map<ReportKey, size_t> subset_map;
123     vector<size_t> state_to_subset(rdfa.states.size(), INVALID_SUBSET);
124 
125     for (size_t i = 0; i < rdfa.states.size(); i++) {
126         const auto &ds = rdfa.states[i];
127         if (!ds.reports.empty() || !ds.reports_eod.empty()) {
128             ReportKey key(ds.reports, ds.reports_eod);
129             if (contains(subset_map, key)) {
130                 state_to_subset[i] = subset_map[key];
131             } else {
132                 size_t sub = subset_map.size();
133                 subset_map.emplace(std::move(key), sub);
134                 state_to_subset[i] = sub;
135                 work_queue.push(sub);
136             }
137         }
138     }
139 
140     /* Give non-accept states their own subset. */
141     size_t non_accept_sub = subset_map.size();
142     replace(state_to_subset.begin(), state_to_subset.end(), INVALID_SUBSET,
143             non_accept_sub);
144 
145     return state_to_subset;
146 }
147 
HopcroftInfo(const raw_dfa & rdfa)148 HopcroftInfo::HopcroftInfo(const raw_dfa &rdfa)
149     : alpha_size(rdfa.alpha_size), partition(create_map(rdfa, work_queue)),
150       states(rdfa.states.size(), hopcroft_state_info(alpha_size)) {
151     /* Construct predecessor lists for each state, indexed by symbol. */
152     for (size_t i = 0; i < states.size(); i++) { // i is the previous state
153         for (size_t sym = 0; sym < alpha_size; sym++) {
154             dstate_id_t present_state = rdfa.states[i].next[sym];
155             states[present_state].prev[sym].push_back(i);
156         }
157     }
158 }
159 
160 /**
161  * For a split set X, each subset S (given by part_index) in the partition, two
162  * sets are created: v_inter (X intersection S) and v_sub (S - X).
163  *
164  * For each subset S in the partition that could be split (v_inter is nonempty
165  * and v_sub is nonempty):
166  *  - replace S in partition by the two sets v_inter and v_sub.
167  *  - if S is in work_queue:
168  *      - replace S in work_queue by the two subsets.
169  *  - else:
170  *      - replace S in work_queue by the smaller of the two sets.
171  */
172 static
split_and_replace_set(const size_t part_index,HopcroftInfo & info,const flat_set<dstate_id_t> & splitter)173 void split_and_replace_set(const size_t part_index, HopcroftInfo &info,
174                            const flat_set<dstate_id_t> &splitter) {
175     /* singleton sets cannot be split */
176     if (info.partition[part_index].size() == 1) {
177         return;
178     }
179 
180     size_t small_index = info.partition.split(part_index, splitter);
181 
182     if (small_index == INVALID_SUBSET) {
183         /* the set could not be split */
184         return;
185     }
186 
187     /* larger subset remains at the input subset index, if the input subset was
188      * already in the work queue then the larger subset will remain there. */
189 
190     info.work_queue.push(small_index);
191 }
192 
193 /**
194  * \brief Core of the Hopcroft minimisation algorithm.
195  */
196 static
dfa_min(HopcroftInfo & info)197 void dfa_min(HopcroftInfo &info) {
198     flat_set<dstate_id_t> curr, sym_preds;
199     vector<size_t> cand_subsets;
200 
201     while (!info.work_queue.empty()) {
202         /* Choose and remove a set of states (curr, or A in the description
203          * above) from the work queue. Note that we copy the set because the
204          * partition may be split by the loop below. */
205         curr.clear();
206         insert(&curr, info.partition[info.work_queue.front()]);
207         info.work_queue.pop();
208 
209         for (size_t sym = 0; sym < info.alpha_size; sym++) {
210             /* Find the set of states sym_preds for which a transition on the
211              * given symbol leads to a state in curr. */
212             sym_preds.clear();
213             for (dstate_id_t s : curr) {
214                 insert(&sym_preds, info.states[s].prev[sym]);
215             }
216 
217             if (sym_preds.empty()) {
218                 continue;
219             }
220 
221             /* we only need to consider subsets with at least one member in
222              * sym_preds for splitting */
223             cand_subsets.clear();
224             info.partition.find_overlapping(sym_preds, &cand_subsets);
225 
226             for (size_t sub : cand_subsets) {
227                 split_and_replace_set(sub, info, sym_preds);
228             }
229         }
230     }
231 }
232 
233 /**
234  * \brief Build the new DFA state table.
235  */
236 static
mapping_new_states(const HopcroftInfo & info,vector<dstate_id_t> & old_to_new,raw_dfa & rdfa)237 void mapping_new_states(const HopcroftInfo &info,
238                         vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
239     const size_t num_partitions = info.partition.size();
240 
241     // Mapping from equiv class's first state to equiv class index.
242     map<dstate_id_t, size_t> ordering;
243 
244     // New state id for each equiv class.
245     vector<dstate_id_t> eq_state(num_partitions);
246 
247     for (size_t i = 0; i < num_partitions; i++) {
248         ordering[*info.partition[i].begin()] = i;
249     }
250 
251     dstate_id_t new_id = 0;
252     for (const auto &m : ordering) {
253         eq_state[m.second] = new_id++;
254     }
255 
256     for (size_t t = 0; t < info.partition.size(); t++) {
257         for (dstate_id_t id : info.partition[t]) {
258             old_to_new[id] = eq_state[t];
259         }
260     }
261 
262     vector<dstate> new_states;
263     new_states.reserve(num_partitions);
264 
265     for (const auto &m : ordering) {
266         new_states.push_back(rdfa.states[m.first]);
267     }
268     rdfa.states = std::move(new_states);
269 }
270 
271 static
renumber_new_states(const HopcroftInfo & info,const vector<dstate_id_t> & old_to_new,raw_dfa & rdfa)272 void renumber_new_states(const HopcroftInfo &info,
273                          const vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
274     for (size_t i = 0; i < info.partition.size(); i++) {
275         for (size_t sym = 0; sym < info.alpha_size; sym++) {
276             dstate_id_t output = rdfa.states[i].next[sym];
277             rdfa.states[i].next[sym] = old_to_new[output];
278         }
279         dstate_id_t dad = rdfa.states[i].daddy;
280         rdfa.states[i].daddy = old_to_new[dad];
281     }
282 
283     rdfa.start_floating = old_to_new[rdfa.start_floating];
284     rdfa.start_anchored = old_to_new[rdfa.start_anchored];
285 }
286 
287 static
new_dfa(raw_dfa & rdfa,const HopcroftInfo & info)288 void new_dfa(raw_dfa &rdfa, const HopcroftInfo &info) {
289     if (info.partition.size() == info.states.size()) {
290         return;
291     }
292 
293     vector<dstate_id_t> old_to_new(info.states.size());
294     mapping_new_states(info, old_to_new, rdfa);
295     renumber_new_states(info, old_to_new, rdfa);
296 }
297 
minimize_hopcroft(raw_dfa & rdfa,const Grey & grey)298 void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
299     if (!grey.minimizeDFA) {
300         return;
301     }
302 
303     if (is_dead(rdfa)) {
304         DEBUG_PRINTF("dfa is empty\n");
305     }
306 
307     UNUSED const size_t states_before = rdfa.states.size();
308 
309     HopcroftInfo info(rdfa);
310 
311     dfa_min(info);
312     new_dfa(rdfa, info);
313 
314     DEBUG_PRINTF("reduced from %zu to %zu states\n", states_before,
315                  rdfa.states.size());
316 }
317 
318 } // namespace ue2
319