/*
 * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * \file
 * \brief Main NFA build code.
 */

#include "limex_compile.h"

#include "accel.h"
#include "accelcompile.h"
#include "grey.h"
#include "limex_internal.h"
#include "limex_limits.h"
#include "nfa_build_util.h"
#include "nfagraph/ng_dominators.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_limex_accel.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_squash.h"
#include "nfagraph/ng_util.h"
#include "ue2common.h"
#include "repeatcompile.h"
#include "util/alloc.h"
#include "util/bitutils.h"
#include "util/bytecode_ptr.h"
#include "util/charreach.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/graph_small_color_map.h"
#include "util/order_check.h"
#include "util/unordered.h"
#include "util/verify_types.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <map>
#include <set>
#include <vector>

#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/depth_first_search.hpp>
#include <boost/range/adaptor/map.hpp>

using namespace std;
using boost::adaptors::map_values;

namespace ue2 {

/**
 * \brief Special state index value meaning that the vertex will not
 * participate in an (NFA/DFA/etc) implementation.
 */
static constexpr u32 NO_STATE = ~0;

/* Maximum number of states for an NFA to be considered 'small' */
static constexpr u32 MAX_SMALL_NFA_STATES = 64;

/* Maximum bounded repeat upper bound for an NFA to be considered fast */
static constexpr u64a MAX_REPEAT_SIZE = 200;

/* Maximum bounded repeat char reach size for an NFA to be considered fast */
static constexpr u32 MAX_REPEAT_CHAR_REACH = 26;

/* Minimum bounded repeat trigger distance for an NFA to be considered fast */
static constexpr u8 MIN_REPEAT_TRIGGER_DISTANCE = 6;

namespace {

struct precalcAccel {
    precalcAccel() : single_offset(0), double_offset(0) {}
    CharReach single_cr;
    u32 single_offset;

    CharReach double_cr;
    flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
    u32 double_offset;
};

struct limex_accel_info {
    unordered_set<NFAVertex> accelerable;
    map<NFAStateSet, precalcAccel> precalc;
    unordered_map<NFAVertex, flat_set<NFAVertex>> friends;
    unordered_map<NFAVertex, AccelScheme> accel_map;
};

static
unordered_map<NFAVertex, NFAStateSet>
reindexByStateId(const unordered_map<NFAVertex, NFAStateSet> &in,
                 const NGHolder &g,
                 const unordered_map<NFAVertex, u32> &state_ids,
                 const u32 num_states) {
    unordered_map<NFAVertex, NFAStateSet> out;
    out.reserve(in.size());

    vector<u32> indexToState(num_vertices(g), NO_STATE);
    for (const auto &m : state_ids) {
        u32 vert_id = g[m.first].index;
        assert(vert_id < indexToState.size());
        indexToState[vert_id] = m.second;
    }

    for (const auto &m : in) {
        NFAVertex v = m.first;
        assert(m.second.size() <= indexToState.size());

        NFAStateSet mask(num_states);
        for (size_t i = m.second.find_first(); i != m.second.npos;
             i = m.second.find_next(i)) {
            u32 state_id = indexToState[i];
            if (state_id == NO_STATE) {
                continue;
            }
            mask.set(state_id);
        }
        out.emplace(v, mask);
    }

    return out;
}
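
// Worked example (illustrative values only): if the vertex with graph index
// 42 was assigned state id 3, then an input mask in 'in' with bit 42 set
// comes out as a mask with bit 3 set; bits for vertices that hold no state
// (NO_STATE) are simply dropped.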

struct build_info {
    build_info(NGHolder &hi,
               const unordered_map<NFAVertex, u32> &states_in,
               const vector<BoundedRepeatData> &ri,
               const unordered_map<NFAVertex, NFAStateSet> &rsmi,
               const unordered_map<NFAVertex, NFAStateSet> &smi,
               const map<u32, set<NFAVertex>> &ti, const set<NFAVertex> &zi,
               bool dai, bool sci, const CompileContext &cci, u32 nsi)
        : h(hi), state_ids(states_in), repeats(ri), tops(ti), tugs(nsi),
          zombies(zi), do_accel(dai), stateCompression(sci), cc(cci),
          num_states(nsi) {
        for (const auto &br : repeats) {
            for (auto v : br.tug_triggers) {
                assert(state_ids.at(v) != NO_STATE);
                tugs.set(state_ids.at(v));
            }
            br_cyclic[br.cyclic] =
                BoundedRepeatSummary(br.repeatMin, br.repeatMax);
        }

        // Convert squash maps to be indexed by state index rather than
        // vertex_index.
        squashMap = reindexByStateId(smi, h, state_ids, num_states);
        reportSquashMap = reindexByStateId(rsmi, h, state_ids, num_states);
    }

    NGHolder &h;
    const unordered_map<NFAVertex, u32> &state_ids;
    const vector<BoundedRepeatData> &repeats;

    // Squash maps; state sets are indexed by state_id.
    unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
    unordered_map<NFAVertex, NFAStateSet> squashMap;

    const map<u32, set<NFAVertex>> &tops;
    NFAStateSet tugs;
    map<NFAVertex, BoundedRepeatSummary> br_cyclic;
    const set<NFAVertex> &zombies;
    bool do_accel;
    bool stateCompression;
    const CompileContext &cc;
    u32 num_states;
    limex_accel_info accel;
};

#define LAST_LIMEX_NFA LIMEX_NFA_512

// Constants for scoring mechanism
const int SHIFT_COST = 10; // limex: cost per shift mask
const int EXCEPTION_COST = 4; // limex: per exception

template<NFAEngineType t> struct NFATraits { };

template<template<NFAEngineType t> class sfunc, typename rv_t, typename arg_t,
         NFAEngineType lb>
struct DISPATCH_BY_LIMEX_TYPE_INT {
    static rv_t doOp(NFAEngineType i, const arg_t &arg) {
        if (i == lb) {
            return sfunc<lb>::call(arg);
        } else {
            return DISPATCH_BY_LIMEX_TYPE_INT<sfunc, rv_t, arg_t,
                                            (NFAEngineType)(lb + 1)>
                     ::doOp(i, arg);
        }
    }
};

template<template<NFAEngineType t> class sfunc, typename rv_t, typename arg_t>
struct DISPATCH_BY_LIMEX_TYPE_INT<sfunc, rv_t, arg_t,
                                  (NFAEngineType)(LAST_LIMEX_NFA + 1)> {
    // dummy
    static rv_t doOp(NFAEngineType, const arg_t &) {
        assert(0);
        throw std::logic_error("Unreachable");
    }
};

#define DISPATCH_BY_LIMEX_TYPE(i, op, arg)                                     \
    DISPATCH_BY_LIMEX_TYPE_INT<op, decltype(op<(NFAEngineType)0>::call(arg)),  \
                               decltype(arg), (NFAEngineType)0>::doOp(i, arg)
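
// For instance (illustrative sketch, assuming a functor template 'op' with a
// per-engine static call(arg)), DISPATCH_BY_LIMEX_TYPE(i, op, arg) unrolls at
// compile time into the equivalent of:
//
//   if (i == (NFAEngineType)0) return op<(NFAEngineType)0>::call(arg);
//   if (i == (NFAEngineType)1) return op<(NFAEngineType)1>::call(arg);
//   ... and so on up to LAST_LIMEX_NFA ...
//   assert(0); // the end-of-recursion specialisation above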

// Given a number of states, find the size of the smallest container NFA it
// will fit in. We support NFAs of the following sizes: 32, 64, 128, 256, 384,
// 512.
size_t findContainerSize(size_t states) {
    if (states > 256 && states <= 384) {
        return 384;
    }
    return 1ULL << (lg2(states - 1) + 1);
}
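
// Worked example (illustrative values): findContainerSize(33) computes
// 1 << (lg2(32) + 1) == 64; 300 states fall into the special (256, 384]
// band and return 384; 500 states round up to 512.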

bool isLimitedTransition(int from, int to, int maxshift) {
    int diff = to - from;

    // within our shift?
    if (diff < 0 || diff > maxshift) {
        return false;
    }

    // can't jump over a bollard
    return (from & ~63) == (to & ~63);
}
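
// Worked example (illustrative values): with maxshift = 7, the transition
// 3 -> 8 (diff 5, same 64-state word) is limited, but 60 -> 65 is not: 60
// and 65 sit either side of a 64-state boundary ((60 & ~63) == 0 while
// (65 & ~63) == 64), and a shift cannot carry across that bollard.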

// Fill a bit mask
template<class Mask>
void maskFill(Mask &m, u8 c) {
    memset(&m, c, sizeof(m));
}

// Clear a bit mask.
template<class Mask>
void maskClear(Mask &m) {
    memset(&m, 0, sizeof(m));
}

template<class Mask>
u8 *maskGetByte(Mask &m, u32 bit) {
    assert(bit < sizeof(m)*8);
    u8 *m8 = (u8 *)&m;

    return m8 + bit/8;
}

// Set a bit in a mask, starting from the little end.
template<class Mask>
void maskSetBit(Mask &m, const unsigned int bit) {
    u8 *byte = maskGetByte(m, bit);
    *byte |= 1U << (bit % 8);
}

template<class Mask>
void maskSetBits(Mask &m, const NFAStateSet &bits) {
    for (size_t i = bits.find_first(); i != bits.npos; i = bits.find_next(i)) {
        maskSetBit(m, i);
    }
}

template<class Mask>
bool isMaskZero(Mask &m) {
    u8 *m8 = (u8 *)&m;
    for (u32 i = 0; i < sizeof(m); i++) {
        if (m8[i]) {
            return false;
        }
    }
    return true;
}

// Sets an entire byte in a mask to the given value
template<class Mask>
void maskSetByte(Mask &m, const unsigned int idx, const char val) {
    assert(idx < sizeof(m));
    char *m8 = (char *)&m;
    char &byte = m8[idx];
    byte = val;
}

// Clear a bit in the mask, starting from the little end.
template<class Mask>
void maskClearBit(Mask &m, const u32 bit) {
    u8 *byte = maskGetByte(m, bit);
    *byte &= ~(1U << (bit % 8));
}
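
// A minimal sketch of how these helpers compose (illustrative only; 'm128'
// here stands in for whatever mask type the chosen LimEx model uses):
//
//   m128 m;
//   maskClear(m);        // all bits off
//   maskSetBit(m, 70);   // bit 70 -> byte 8, bit 6 (little-endian order)
//   assert(!isMaskZero(m));
//   maskClearBit(m, 70);
//   assert(isMaskZero(m));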

/*
 * Common code: the following code operates on parts of the NFA that are common
 * to both the (defunct) General and the LimEx models.
 */

static
void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
                       vector<u8> &reachMap) {
    const NGHolder &h = args.h;
    const auto &state_ids = args.state_ids;

    // Build a list of vertices with a state index assigned.
    vector<NFAVertex> verts;
    verts.reserve(args.num_states);
    for (auto v : vertices_range(h)) {
        if (state_ids.at(v) != NO_STATE) {
            verts.push_back(v);
        }
    }

    // Build a mapping from set-of-states -> reachability.
    map<NFAStateSet, CharReach> mapping;
    NFAStateSet states(args.num_states);
    for (size_t i = 0; i < N_CHARS; i++) {
        states.reset();
        for (auto v : verts) {
            const CharReach &cr = h[v].char_reach;
            if (cr.test(i)) {
                u32 state_id = state_ids.at(v);
                states.set(state_id);
            }
        }
        mapping[states].set(i);
    }

    DEBUG_PRINTF("%zu distinct reachability entries\n", mapping.size());
    assert(!mapping.empty());

    // Build a vector of distinct reachability entries and a mapping from every
    // character to one of those entries.

    reach.reserve(mapping.size());
    reachMap.assign(N_CHARS, 0);

    u8 num = 0;
    for (auto mi = mapping.begin(), me = mapping.end(); mi != me; ++mi, ++num) {
        // Reach entry.
        reach.push_back(mi->first);

        // Character mapping.
        const CharReach &cr = mi->second;
        for (size_t i = cr.find_first(); i != CharReach::npos;
             i = cr.find_next(i)) {
            reachMap[i] = num;
        }
    }
}
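
// Worked example (illustrative values): if states {0, 2} are exactly the
// states whose char_reach includes 'a', and the same is true for 'b', then
// both characters map to a single shared entry: reachMap['a'] ==
// reachMap['b'], and reach[reachMap['a']] has just bits 0 and 2 set.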

struct AccelBuild {
    AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0) {}
    NFAVertex v;
    u32 state;
    u32 offset; // offset correction to apply
    CharReach stop1; // single-byte accel stop literals
    flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
};

static
void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
    u32 state = bi.state_ids.at(v);
    build.v = v;
    build.state = state;
    NFAStateSet ss(bi.num_states);
    ss.set(state);

    if (!contains(bi.accel.precalc, ss)) {
        build.stop1 = CharReach::dot();
    } else {
        const precalcAccel &precalc = bi.accel.precalc.at(ss);
        if (precalc.double_lits.empty()) {
            build.stop1 = precalc.single_cr;
            build.offset = precalc.single_offset;
        } else {
            build.stop1 = precalc.double_cr;
            build.stop2 = precalc.double_lits;
            build.offset = precalc.double_offset;
        }
    }

#ifdef DEBUG
    printf("state %u stop1:", state);
    for (size_t j = build.stop1.find_first(); j != build.stop1.npos;
         j = build.stop1.find_next(j)) {
        printf(" 0x%02x", (u32)j);
    }
    printf("\n");
    printf("state %u stop2:", state);
    for (auto it = build.stop2.begin(); it != build.stop2.end(); ++it) {
        printf(" 0x%02hhx%02hhx", it->first, it->second);
    }
    printf("\n");
#endif
}

// Generate all the data we need for at most NFA_MAX_ACCEL_STATES accelerable
// states.
static
void gatherAccelStates(const build_info &bi, vector<AccelBuild> &accelStates) {
    for (auto v : bi.accel.accelerable) {
        DEBUG_PRINTF("state %u is accelerable\n", bi.state_ids.at(v));
        AccelBuild a;
        findStopLiterals(bi, v, a);
        accelStates.push_back(a);
    }

    // AccelStates should be sorted by state number, so that we build our accel
    // masks correctly.
    sort(accelStates.begin(), accelStates.end(),
         [](const AccelBuild &a, const AccelBuild &b) {
             return a.state < b.state;
         });

    // Our caller shouldn't have fed us too many accel states.
    assert(accelStates.size() <= NFA_MAX_ACCEL_STATES);
    if (accelStates.size() > NFA_MAX_ACCEL_STATES) {
        accelStates.resize(NFA_MAX_ACCEL_STATES);
    }
}

static
void combineAccel(const AccelBuild &in, AccelBuild &out) {
    // stop1 and stop2 union
    out.stop1 |= in.stop1;
    out.stop2.insert(in.stop2.begin(), in.stop2.end());
    // offset is maximum of the two
    out.offset = max(out.offset, in.offset);
}

static
void minimiseAccel(AccelBuild &build) {
    flat_set<pair<u8, u8>> new_stop2;
    // Any two-byte accels beginning with a one-byte accel should be removed
    for (const auto &si : build.stop2) {
        if (!build.stop1.test(si.first)) {
            new_stop2.insert(si);
        }
    }
    build.stop2 = new_stop2;
}
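
// Worked example (illustrative values): if stop1 already contains 'a', the
// double-byte stop literal ('a', 'b') is redundant - single-byte
// acceleration already stops on the leading 'a' - so minimiseAccel() drops
// it from stop2.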

struct AccelAuxCmp {
    explicit AccelAuxCmp(const AccelAux &aux_in) : aux(aux_in) {}
    bool operator()(const AccelAux &a) const {
        return !memcmp(&a, &aux, sizeof(AccelAux));
    }
private:
    const AccelAux &aux;
};

static
bool allow_wide_accel(NFAVertex v, const NGHolder &g, NFAVertex sds_or_proxy) {
    return v == sds_or_proxy || edge(g.start, v, g).second;
}

static
bool allow_wide_accel(const vector<NFAVertex> &vv, const NGHolder &g,
                      NFAVertex sds_or_proxy) {
    for (auto v : vv) {
        if (allow_wide_accel(v, g, sds_or_proxy)) {
            return true;
        }
    }

    return false;
}

// identify and mark states that we feel are accelerable (for a limex NFA)
/* Note: leftfix nfas allow accepts to be accelerated */
static
void nfaFindAccelSchemes(const NGHolder &g,
                         const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
                         unordered_map<NFAVertex, AccelScheme> *out) {
    vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);

    NFAVertex sds_or_proxy = get_sds_or_proxy(g);

    for (auto v : vertices_range(g)) {
        // We want to skip any vertices that don't lead to at least one other
        // (self-loops don't count) vertex.
        if (!has_proper_successor(v, g)) {
            DEBUG_PRINTF("skipping vertex %zu\n", g[v].index);
            continue;
        }

        bool allow_wide = allow_wide_accel(v, g, sds_or_proxy);

        AccelScheme as;
        if (nfaCheckAccel(g, v, refined_cr, br_cyclic, &as, allow_wide)) {
            DEBUG_PRINTF("graph vertex %zu is accelerable with offset %u.\n",
                          g[v].index, as.offset);
            (*out)[v] = as;
        }
    }
}

struct fas_visitor : public boost::default_bfs_visitor {
    fas_visitor(const unordered_map<NFAVertex, AccelScheme> &am_in,
                unordered_map<NFAVertex, AccelScheme> *out_in)
        : accel_map(am_in), out(out_in) {}

    void discover_vertex(NFAVertex v, const NGHolder &) {
        if (accel_map.find(v) != accel_map.end()) {
            (*out)[v] = accel_map.find(v)->second;
        }
        if (out->size() >= NFA_MAX_ACCEL_STATES) {
            throw this; /* done */
        }
    }
    const unordered_map<NFAVertex, AccelScheme> &accel_map;
    unordered_map<NFAVertex, AccelScheme> *out;
};

static
void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,
                       unordered_map<NFAVertex, AccelScheme> *accel_map) {
    /* We want the NFA_MAX_ACCEL_STATES best acceleration states, everything
     * else should be ditched. We use a simple BFS to choose accel states near
     * the start. */

    vector<NFAEdge> tempEdges;
    for (const auto &vv : tops | map_values) {
        for (NFAVertex v : vv) {
            if (!edge(g.start, v, g).second) {
                tempEdges.push_back(add_edge(g.start, v, g).first);
            }
        }
    }

    // Similarly, connect (start, startDs) if necessary.
    if (!edge(g.start, g.startDs, g).second) {
        NFAEdge e = add_edge(g.start, g.startDs, g);
        tempEdges.push_back(e); // Remove edge later.
    }

    unordered_map<NFAVertex, AccelScheme> out;

    try {
        boost::breadth_first_search(g, g.start,
                                    visitor(fas_visitor(*accel_map, &out))
                                        .color_map(make_small_color_map(g)));
    } catch (fas_visitor *) {
        ; /* found max accel_states */
    }

    remove_edges(tempEdges, g);

    assert(out.size() <= NFA_MAX_ACCEL_STATES);
    accel_map->swap(out);
}

static
bool containsBadSubset(const limex_accel_info &accel,
                       const NFAStateSet &state_set, const u32 effective_sds) {
    NFAStateSet subset(state_set.size());
    for (size_t j = state_set.find_first(); j != state_set.npos;
         j = state_set.find_next(j)) {
        subset = state_set;
        subset.reset(j);

        if (effective_sds != NO_STATE && subset.count() == 1 &&
            subset.test(effective_sds)) {
            continue;
        }

        if (subset.any() && !contains(accel.precalc, subset)) {
            return true;
        }
    }
    return false;
}
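
// For example (illustrative): for state_set == {s1, s2}, the loop above
// tests the one-bit subsets {s1} and {s2}. If either is missing from
// accel.precalc because no scheme could be built for it earlier, the
// combined set is rejected too, since the merged scheme would fail for the
// same reason.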

static
bool is_too_wide(const AccelScheme &as) {
    return as.cr.count() > MAX_MERGED_ACCEL_STOPS;
}

static
void fillAccelInfo(build_info &bi) {
    if (!bi.do_accel) {
        return;
    }

    NGHolder &g = bi.h;
    limex_accel_info &accel = bi.accel;
    unordered_map<NFAVertex, AccelScheme> &accel_map = accel.accel_map;
    const map<NFAVertex, BoundedRepeatSummary> &br_cyclic = bi.br_cyclic;
    const unordered_map<NFAVertex, u32> &state_ids = bi.state_ids;
    const u32 num_states = bi.num_states;

    nfaFindAccelSchemes(g, br_cyclic, &accel_map);
    filterAccelStates(g, bi.tops, &accel_map);

    assert(accel_map.size() <= NFA_MAX_ACCEL_STATES);

    vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);

    vector<NFAVertex> astates;
    for (const auto &m : accel_map) {
        astates.push_back(m.first);
    }

    NFAStateSet useful(num_states);
    NFAStateSet state_set(num_states);
    vector<NFAVertex> states;

    NFAVertex sds_or_proxy = get_sds_or_proxy(g);
    const u32 effective_sds = state_ids.at(sds_or_proxy);

    /* for each subset of the accel keys need to find an accel scheme */
    assert(astates.size() < 32);
    sort(astates.begin(), astates.end());

    for (u32 i = 1, i_end = 1U << astates.size(); i < i_end; i++) {
        DEBUG_PRINTF("saving info for accel %u\n", i);
        states.clear();
        state_set.reset();
        for (u32 j = 0, j_end = astates.size(); j < j_end; j++) {
            if (i & (1U << j)) {
                NFAVertex v = astates[j];
                states.push_back(v);
                state_set.set(state_ids.at(v));
            }
        }

        if (containsBadSubset(accel, state_set, effective_sds)) {
            DEBUG_PRINTF("accel %u has bad subset\n", i);
            continue; /* if a subset failed to build we would too */
        }

        const bool allow_wide = allow_wide_accel(states, g, sds_or_proxy);

        AccelScheme as = nfaFindAccel(g, states, refined_cr, br_cyclic,
                                      allow_wide, true);
        if (is_too_wide(as)) {
            DEBUG_PRINTF("accel %u too wide (%zu, %d)\n", i,
                         as.cr.count(), MAX_MERGED_ACCEL_STOPS);
            continue;
        }

        DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset,
                     as.double_offset);

        precalcAccel &pa = accel.precalc[state_set];
        pa.single_offset = as.offset;
        pa.single_cr = as.cr;

        if (as.double_byte.size() != 0) {
            pa.double_offset = as.double_offset;
            pa.double_lits = as.double_byte;
            pa.double_cr = as.double_cr;
        }

        useful |= state_set;
    }

    for (const auto &m : accel_map) {
        NFAVertex v = m.first;
        const u32 state_id = state_ids.at(v);

        /* if we were unable to make a scheme out of the state in any context,
         * there is no point marking it as accelerable */
        if (!useful.test(state_id)) {
            continue;
        }

        u32 offset = 0;
        state_set.reset();
        state_set.set(state_id);

        accel.accelerable.insert(v);
        findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
    }
}
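
// Worked example (illustrative): with three accelerable states,
// fillAccelInfo() walks the seven non-empty subsets (masks 1 through 7) and
// stores a precalc entry keyed by the corresponding NFAStateSet for each
// subset whose scheme passes the is_too_wide() check.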

/** The AccelAux structure has large alignment specified, and this makes some
 * compilers do odd things unless we specify a custom allocator. */
typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)>>
    AccelAuxVector;

#define IMPOSSIBLE_ACCEL_MASK (~0U)

static
u32 getEffectiveAccelStates(const build_info &args,
                            const unordered_map<NFAVertex, NFAVertex> &dom_map,
                            u32 active_accel_mask,
                            const vector<AccelBuild> &accelStates) {
    /* accelStates is indexed by the acceleration bit index and contains a
     * reference to the original vertex & state_id */

    /* Cases to consider:
     *
     * 1: Accel states a and b are on and b can squash a
     *    --> we can ignore a. This will result in a no longer being accurately
     *        modelled - we may miss escapes turning it off and we may also miss
     *        its successors being activated.
     *
     * 2: Accel state b is on but accel state a is off and a is .* and must be
     *    seen before b is reached (and would not be covered by (1))
     *    --> if a is squashable (or may die unexpectedly) we should continue
     *        as is
     *    --> if a is not squashable we can treat this as a+b or as a no accel,
     *        impossible case
     *    --> this case could be extended to handle non dot reaches by
     *        effectively creating something similar to squash masks for the
     *        reverse graph
     *
     *
     * Other cases:
     *
     * 3: Accel states a and b are on but have incompatible reaches
     *    --> we should treat this as an impossible case. Actually, this case
     *        is unlikely to arise as we pick states with wide reaches to
     *        accelerate so an empty intersection is unlikely.
     *
     * Note: we need to be careful when dealing with accel states corresponding
     * to bounded repeat cyclics - they may 'turn off' based on a max bound and
     * so we may still need earlier states to be accurately modelled.
     */
    const NGHolder &h = args.h;

    /* map from accel_id to mask of accel_ids that it is dominated by */
    vector<u32> dominated_by(accelStates.size());

    map<NFAVertex, u32> accel_id_map;
    for (u32 accel_id = 0; accel_id < accelStates.size(); accel_id++) {
        NFAVertex v = accelStates[accel_id].v;
        accel_id_map[v] = accel_id;
    }

    /* Note: we want a slightly less strict defn of dominate as skip edges
     * prevent .* 'truly' dominating */
    for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
        u32 accel_id = findAndClearLSB_32(&local_accel_mask);
        assert(accel_id < accelStates.size());
        NFAVertex v = accelStates[accel_id].v;
        while (contains(dom_map, v) && dom_map.at(v)) {
            v = dom_map.at(v);
            if (contains(accel_id_map, v)) {
                dominated_by[accel_id] |= 1U << accel_id_map[v];
            }
            /* TODO: could also look at inv_adj vertices to handle fan-in */
            for (NFAVertex a : adjacent_vertices_range(v, h)) {
                if (a == v || !contains(accel_id_map, a)
                    || a == accelStates[accel_id].v /* not likely */) {
                    continue;
                }
                if (!is_subset_of(h[v].reports, h[a].reports)) {
                    continue;
                }
                auto v_succ = succs(v, h);
                auto a_succ = succs(a, h);
                if (is_subset_of(v_succ, a_succ)) {
                    dominated_by[accel_id] |= 1U << accel_id_map[a];
                }
            }
        }
    }

    u32 may_turn_off = 0; /* BR with max bound, non-dots, squashed, etc */
    for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
        u32 accel_id = findAndClearLSB_32(&local_accel_mask);
        NFAVertex v = accelStates[accel_id].v;
        u32 state_id = accelStates[accel_id].state;
        assert(contains(args.accel.accelerable, v));
        if (!h[v].char_reach.all()) {
            may_turn_off |= 1U << accel_id;
            continue;
        }
        if (contains(args.br_cyclic, v)
            && args.br_cyclic.at(v).repeatMax != depth::infinity()) {
            may_turn_off |= 1U << accel_id;
            continue;
        }
        for (const auto &s_mask : args.squashMap | map_values) {
            if (!s_mask.test(state_id)) {
                may_turn_off |= 1U << accel_id;
                break;
            }
        }
        for (const auto &s_mask : args.reportSquashMap | map_values) {
            if (!s_mask.test(state_id)) {
                may_turn_off |= 1U << accel_id;
                break;
            }
        }
    }

    /* Case 1: */
    u32 ignored = 0;
    for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
        u32 accel_id_b = findAndClearLSB_32(&local_accel_mask);
        NFAVertex v = accelStates[accel_id_b].v;
        if (!contains(args.squashMap, v)) {
            continue;
        }
        assert(!contains(args.br_cyclic, v)
               || args.br_cyclic.at(v).repeatMax == depth::infinity());
        NFAStateSet squashed = args.squashMap.at(v);
        squashed.flip(); /* default sense for mask of survivors */

        for (u32 local_accel_mask2 = active_accel_mask; local_accel_mask2; ) {
            u32 accel_id_a = findAndClearLSB_32(&local_accel_mask2);
            if (squashed.test(accelStates[accel_id_a].state)) {
                ignored |= 1U << accel_id_a;
            }
        }
    }

    /* Case 2: */
    for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
        u32 accel_id = findAndClearLSB_32(&local_accel_mask);

        u32 stuck_dominators = dominated_by[accel_id] & ~may_turn_off;
        if ((stuck_dominators & active_accel_mask) != stuck_dominators) {
            DEBUG_PRINTF("only %08x on, but we require %08x\n",
                         active_accel_mask, stuck_dominators);
            return IMPOSSIBLE_ACCEL_MASK;
        }
    }

    if (ignored) {
        DEBUG_PRINTF("in %08x, ignoring %08x\n", active_accel_mask, ignored);
    }

    return active_accel_mask & ~ignored;
}

static
void buildAccel(const build_info &args, NFAStateSet &accelMask,
                NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec,
                vector<u8> &accelTable) {
    const limex_accel_info &accel = args.accel;

    // Init, all zeroes.
    accelMask.resize(args.num_states);
    accelFriendsMask.resize(args.num_states);

    if (!args.do_accel) {
        return;
    }

    vector<AccelBuild> accelStates;
    gatherAccelStates(args, accelStates);

    if (accelStates.empty()) {
        DEBUG_PRINTF("no accelerable states\n");
        return;
    }

    const auto dom_map = findDominators(args.h);

    // We have 2^n different accel entries, one for each possible
    // combination of accelerable states.
    assert(accelStates.size() < 32);
    const u32 accelCount = 1U << accelStates.size();
    assert(accelCount <= 256);

    // Set up a unioned AccelBuild for every possible combination of the set
    // bits in accelStates.
    vector<AccelBuild> accelOuts(accelCount);
    vector<u32> effective_accel_set;
    effective_accel_set.push_back(0); /* empty is effectively empty */

    for (u32 i = 1; i < accelCount; i++) {
        u32 effective_i = getEffectiveAccelStates(args, dom_map, i,
                                                  accelStates);
        effective_accel_set.push_back(effective_i);

        if (effective_i == IMPOSSIBLE_ACCEL_MASK) {
            DEBUG_PRINTF("this combination of accel states is not possible\n");
            accelOuts[i].stop1 = CharReach::dot();
            continue;
        }

        while (effective_i) {
            u32 base_accel_state = findAndClearLSB_32(&effective_i);
            combineAccel(accelStates[base_accel_state], accelOuts[i]);
        }
        minimiseAccel(accelOuts[i]);
    }

    accelTable.resize(accelCount);

    // We dedupe our AccelAux structures here, so that we only write one copy
    // of each unique accel scheme into the bytecode, using the accelTable as
    // an index.

    // Start with the NONE case.
    auxvec.push_back(AccelAux());
    memset(&auxvec[0], 0, sizeof(AccelAux));
    auxvec[0].accel_type = ACCEL_NONE; // no states on.

    AccelAux aux;
    for (u32 i = 1; i < accelCount; i++) {
        memset(&aux, 0, sizeof(aux));

        NFAStateSet effective_states(args.num_states);
        u32 effective_i = effective_accel_set[i];

        AccelInfo ainfo;
        ainfo.double_offset = accelOuts[i].offset;
        ainfo.double_stop1 = accelOuts[i].stop1;
        ainfo.double_stop2 = accelOuts[i].stop2;

        if (effective_i != IMPOSSIBLE_ACCEL_MASK) {
            while (effective_i) {
                u32 base_accel_id = findAndClearLSB_32(&effective_i);
                effective_states.set(accelStates[base_accel_id].state);
            }

            if (contains(accel.precalc, effective_states)) {
                const auto &precalc = accel.precalc.at(effective_states);
                ainfo.single_offset = precalc.single_offset;
                ainfo.single_stops = precalc.single_cr;
            }
        }

        buildAccelAux(ainfo, &aux);

        // FIXME: We may want a faster way to find AccelAux structures that
        // we've already built before.
        auto it = find_if(auxvec.begin(), auxvec.end(), AccelAuxCmp(aux));
        if (it == auxvec.end()) {
            accelTable[i] = verify_u8(auxvec.size());
            auxvec.push_back(aux);
        } else {
            accelTable[i] = verify_u8(it - auxvec.begin());
        }
    }

    DEBUG_PRINTF("%zu unique accel schemes (of max %u)\n", auxvec.size(),
                 accelCount);

    // XXX: ACCEL_NONE?
    for (const auto &as : accelStates) {
        NFAVertex v = as.v;
        assert(v && args.state_ids.at(v) == as.state);

        accelMask.set(as.state);
        accelFriendsMask.set(as.state);

        if (!contains(accel.friends, v)) {
            continue;
        }
        // Add the friends of this state to the friends mask.
        const flat_set<NFAVertex> &friends = accel.friends.at(v);
        DEBUG_PRINTF("%u has %zu friends\n", as.state, friends.size());
        for (auto friend_v : friends) {
            u32 state_id = args.state_ids.at(friend_v);
            DEBUG_PRINTF("--> %u\n", state_id);
            accelFriendsMask.set(state_id);
        }
    }
}

static
u32 addSquashMask(const build_info &args, const NFAVertex &v,
                  vector<NFAStateSet> &squash) {
    auto sit = args.reportSquashMap.find(v);
    if (sit == args.reportSquashMap.end()) {
        return MO_INVALID_IDX;
    }

    // This state has a squash mask. Paw through the existing vector to
    // see if we've already seen it, otherwise add a new one.
    auto it = find(squash.begin(), squash.end(), sit->second);
    if (it != squash.end()) {
        return verify_u32(std::distance(squash.begin(), it));
    }
    u32 idx = verify_u32(squash.size());
    squash.push_back(sit->second);
    return idx;
}

using ReportListCache = ue2_unordered_map<vector<ReportID>, u32>;

static
u32 addReports(const flat_set<ReportID> &r, vector<ReportID> &reports,
               ReportListCache &reports_cache) {
    assert(!r.empty());

    vector<ReportID> my_reports(begin(r), end(r));
    my_reports.push_back(MO_INVALID_IDX); // sentinel

    auto cache_it = reports_cache.find(my_reports);
    if (cache_it != end(reports_cache)) {
        u32 offset = cache_it->second;
        DEBUG_PRINTF("reusing cached report list at %u\n", offset);
        return offset;
    }

    auto it = search(begin(reports), end(reports), begin(my_reports),
                     end(my_reports));
    if (it != end(reports)) {
        u32 offset = verify_u32(std::distance(begin(reports), it));
        DEBUG_PRINTF("reusing found report list at %u\n", offset);
        return offset;
    }

    u32 offset = verify_u32(reports.size());
    insert(&reports, reports.end(), my_reports);
    reports_cache.emplace(move(my_reports), offset);
    return offset;
}
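
// For example (illustrative values): adding the report set {7, 9} appends
// the run [7, 9, MO_INVALID_IDX] to 'reports'; a second call with the same
// set hits the cache (or the std::search() scan) and returns the original
// offset instead of appending a duplicate run.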

static
void buildAcceptsList(const build_info &args, ReportListCache &reports_cache,
                      vector<NFAVertex> &verts, vector<NFAAccept> &accepts,
                      vector<ReportID> &reports, vector<NFAStateSet> &squash) {
    if (verts.empty()) {
        return;
    }

    DEBUG_PRINTF("building accept lists for %zu states\n", verts.size());

    auto cmp_state_id = [&args](NFAVertex a, NFAVertex b) {
        u32 a_state = args.state_ids.at(a);
        u32 b_state = args.state_ids.at(b);
        assert(a_state != b_state || a == b);
        return a_state < b_state;
    };

    sort(begin(verts), end(verts), cmp_state_id);

    const NGHolder &h = args.h;
    for (const auto &v : verts) {
        DEBUG_PRINTF("state=%u, reports: [%s]\n", args.state_ids.at(v),
                     as_string_list(h[v].reports).c_str());
        NFAAccept a;
        memset(&a, 0, sizeof(a));
        assert(!h[v].reports.empty());
        if (h[v].reports.size() == 1) {
            a.single_report = 1;
            a.reports = *h[v].reports.begin();
        } else {
            a.single_report = 0;
            a.reports = addReports(h[v].reports, reports, reports_cache);
        }
        a.squash = addSquashMask(args, v, squash);
        accepts.push_back(move(a));
    }
}

static
void buildAccepts(const build_info &args, ReportListCache &reports_cache,
                  NFAStateSet &acceptMask, NFAStateSet &acceptEodMask,
                  vector<NFAAccept> &accepts, vector<NFAAccept> &acceptsEod,
                  vector<ReportID> &reports, vector<NFAStateSet> &squash) {
    const NGHolder &h = args.h;

    acceptMask.resize(args.num_states);
    acceptEodMask.resize(args.num_states);

    vector<NFAVertex> verts_accept, verts_accept_eod;

    for (auto v : vertices_range(h)) {
        u32 state_id = args.state_ids.at(v);

        if (state_id == NO_STATE || !is_match_vertex(v, h)) {
            continue;
        }

        if (edge(v, h.accept, h).second) {
            acceptMask.set(state_id);
            verts_accept.push_back(v);
        } else {
            assert(edge(v, h.acceptEod, h).second);
            acceptEodMask.set(state_id);
            verts_accept_eod.push_back(v);
        }
    }

    buildAcceptsList(args, reports_cache, verts_accept, accepts, reports,
                     squash);
    buildAcceptsList(args, reports_cache, verts_accept_eod, acceptsEod, reports,
                     squash);
}

static
void buildTopMasks(const build_info &args, vector<NFAStateSet> &topMasks) {
    if (args.tops.empty()) {
        return; // No tops, probably an outfix NFA.
    }

    u32 numMasks = args.tops.rbegin()->first + 1; // max mask index
    DEBUG_PRINTF("we have %u top masks\n", numMasks);

    topMasks.assign(numMasks, NFAStateSet(args.num_states)); // all zeroes

    for (const auto &m : args.tops) {
        u32 mask_idx = m.first;
        for (NFAVertex v : m.second) {
            u32 state_id = args.state_ids.at(v);
            DEBUG_PRINTF("state %u is in top mask %u\n", state_id, mask_idx);

            assert(mask_idx < numMasks);
            assert(state_id != NO_STATE);

            topMasks[mask_idx].set(state_id);
        }
    }
}

static
u32 uncompressedStateSize(u32 num_states) {
    // Number of bytes required to store all our states.
    return ROUNDUP_N(num_states, 8)/8;
}

static
u32 compressedStateSize(const NGHolder &h, const NFAStateSet &maskedStates,
                        const unordered_map<NFAVertex, u32> &state_ids) {
    // Shrink state requirement to enough to fit the compressed largest reach.
    vector<u32> allreach(N_CHARS, 0);

    for (auto v : vertices_range(h)) {
        u32 i = state_ids.at(v);
        if (i == NO_STATE || maskedStates.test(i)) {
            continue;
        }
        const CharReach &cr = h[v].char_reach;
        for (size_t j = cr.find_first(); j != cr.npos; j = cr.find_next(j)) {
            allreach[j]++; // state 'i' can reach character 'j'.
        }
    }

    u32 maxreach = *max_element(allreach.begin(), allreach.end());
    DEBUG_PRINTF("max reach is %u\n", maxreach);
    return (maxreach + 7) / 8;
}
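
// Worked example (illustrative values): if no single character is reachable
// by more than 30 unmasked states, the compressed state fits in
// (30 + 7) / 8 == 4 bytes, even for an NFA with, say, 40 states (5 bytes
// uncompressed).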

static
bool hasSquashableInitDs(const build_info &args) {
    const NGHolder &h = args.h;

    if (args.squashMap.empty()) {
        DEBUG_PRINTF("squash map is empty\n");
        return false;
    }

    NFAStateSet initDs(args.num_states);
    u32 sds_state = args.state_ids.at(h.startDs);
    if (sds_state == NO_STATE) {
        DEBUG_PRINTF("no states in initds\n");
        return false;
    }

    initDs.set(sds_state);

    /* TODO: simplify */

    // Check normal squash map.
    for (const auto &m : args.squashMap) {
        DEBUG_PRINTF("checking squash mask for state %u\n",
                     args.state_ids.at(m.first));
        NFAStateSet squashed = ~(m.second); // flip mask
        assert(squashed.size() == initDs.size());
        if (squashed.intersects(initDs)) {
            DEBUG_PRINTF("state %u squashes initds states\n",
                         args.state_ids.at(m.first));
            return true;
        }
    }

    // Check report squash map.
    for (const auto &m : args.reportSquashMap) {
        DEBUG_PRINTF("checking report squash mask for state %u\n",
                     args.state_ids.at(m.first));
        NFAStateSet squashed = ~(m.second); // flip mask
        assert(squashed.size() == initDs.size());
        if (squashed.intersects(initDs)) {
            DEBUG_PRINTF("state %u squashes initds states\n",
                         args.state_ids.at(m.first));
            return true;
        }
    }

    return false;
}

static
bool hasInitDsStates(const NGHolder &h,
                     const unordered_map<NFAVertex, u32> &state_ids) {
    if (state_ids.at(h.startDs) != NO_STATE) {
        return true;
    }

    if (is_triggered(h) && state_ids.at(h.start) != NO_STATE) {
        return true;
    }

    return false;
}

static
void findMaskedCompressionStates(const build_info &args,
                                 NFAStateSet &maskedStates) {
    const NGHolder &h = args.h;
    if (!generates_callbacks(h)) {
        // Rose leftfixes can mask out initds, which is worth doing if it will
        // stay on forever (i.e. it's not squashable).
        u32 sds_i = args.state_ids.at(h.startDs);
        if (sds_i != NO_STATE && !hasSquashableInitDs(args)) {
            maskedStates.set(sds_i);
            DEBUG_PRINTF("masking out initds state\n");
        }
    }

    // Suffixes and outfixes can mask out leaf states, which should all be
    // accepts. Right now we can only do this when there is nothing in initDs,
    // as we switch that on unconditionally in the expand call.
    if (!inspects_states_for_accepts(h)
        && !hasInitDsStates(h, args.state_ids)) {
        NFAStateSet nonleaf(args.num_states);
        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
            if (from == NO_STATE) {
                continue;
            }

            // We cannot mask out EOD accepts, as they have to perform an
            // action after they're switched on that may be delayed until the
            // next stream write.
            if (to == NO_STATE && target(e, h) != h.acceptEod) {
                continue;
            }

            nonleaf.set(from);
        }

        for (u32 i = 0; i < args.num_states; i++) {
            if (!nonleaf.test(i)) {
                maskedStates.set(i);
            }
        }

        DEBUG_PRINTF("masking out %zu leaf states\n", maskedStates.count());
    }
}

/** \brief Sets a given flag in the LimEx structure. */
template<class implNFA_t>
static
void setLimexFlag(implNFA_t *limex, u32 flag) {
    assert(flag);
    assert((flag & (flag - 1)) == 0);
    limex->flags |= flag;
}

/** \brief Sets a given flag in the NFA structure */
static
void setNfaFlag(NFA *nfa, u32 flag) {
    assert(flag);
    assert((flag & (flag - 1)) == 0);
    nfa->flags |= flag;
}

// Some of our NFA types support compressing the state down if we're not using
// all of it.
template<class implNFA_t>
static
void findStateSize(const build_info &args, implNFA_t *limex) {
    // Nothing is masked off by default.
    maskFill(limex->compressMask, 0xff);

    u32 sizeUncompressed = uncompressedStateSize(args.num_states);
    assert(sizeUncompressed <= sizeof(limex->compressMask));

    if (!args.stateCompression) {
        DEBUG_PRINTF("compression disabled, uncompressed state size %u\n",
                     sizeUncompressed);
        limex->stateSize = sizeUncompressed;
        return;
    }

    NFAStateSet maskedStates(args.num_states);
    findMaskedCompressionStates(args, maskedStates);

    u32 sizeCompressed = compressedStateSize(args.h, maskedStates, args.state_ids);
    assert(sizeCompressed <= sizeof(limex->compressMask));

    DEBUG_PRINTF("compressed=%u, uncompressed=%u\n", sizeCompressed,
                 sizeUncompressed);

    // Must be at least a 10% saving.
    if ((sizeCompressed * 100) <= (sizeUncompressed * 90)) {
        DEBUG_PRINTF("using compression, state size %u\n",
                     sizeCompressed);
        setLimexFlag(limex, LIMEX_FLAG_COMPRESS_STATE);
        limex->stateSize = sizeCompressed;

        if (maskedStates.any()) {
            DEBUG_PRINTF("masking %zu states\n", maskedStates.count());
            setLimexFlag(limex, LIMEX_FLAG_COMPRESS_MASKED);
            for (size_t i = maskedStates.find_first(); i != NFAStateSet::npos;
                    i = maskedStates.find_next(i)) {
                maskClearBit(limex->compressMask, i);
            }
        }
    } else {
        DEBUG_PRINTF("not using compression, state size %u\n",
                     sizeUncompressed);
        limex->stateSize = sizeUncompressed;
    }
}
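
// Continuing the example numbers above (illustrative): 4 bytes compressed
// against 5 bytes uncompressed satisfies (4 * 100) <= (5 * 90), a 20%
// saving, so LIMEX_FLAG_COMPRESS_STATE is set and the 4-byte size is used.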

/*
 * LimEx NFA: code for building NFAs in the Limited+Exceptional model. Most
 * transitions are limited, with transitions outside the constraints of our
 * shifts taken care of as 'exceptions'. Exceptions are also used to handle
 * accepts and squash behaviour.
 */

/**
 * \brief Prototype exception class.
 *
 * Used to build up the map of exceptions before being converted to real
 * NFAException32 (etc) structures.
 */
struct ExceptionProto {
    u32 reports_index = MO_INVALID_IDX;
    NFAStateSet succ_states;
    NFAStateSet squash_states;
    u32 repeat_index = MO_INVALID_IDX;
    enum LimExTrigger trigger = LIMEX_TRIGGER_NONE;
    enum LimExSquash squash = LIMEX_SQUASH_NONE;

    explicit ExceptionProto(u32 num_states)
        : succ_states(num_states), squash_states(num_states) {
        // Squash states are represented as the set of states to leave on,
        // so we start with all-ones.
        squash_states.set();
    }

    bool operator<(const ExceptionProto &b) const {
        const ExceptionProto &a = *this;

        ORDER_CHECK(reports_index);
        ORDER_CHECK(repeat_index);
        ORDER_CHECK(trigger);
        ORDER_CHECK(squash);
        ORDER_CHECK(succ_states);
        ORDER_CHECK(squash_states);

        return false;
    }
};
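
// Note: ORDER_CHECK (from util/order_check.h) returns early on the first
// field that differs, so operator< above is a lexicographic comparison over
// (reports_index, repeat_index, trigger, squash, succ_states,
// squash_states) - the strict weak ordering that the
// map<ExceptionProto, vector<u32>> keying below relies on.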
1376 
static
u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
                      const unordered_set<NFAEdge> &exceptional,
                      map<ExceptionProto, vector<u32>> &exceptionMap,
                      vector<ReportID> &reportList) {
    const NGHolder &h = args.h;
    const u32 num_states = args.num_states;
    u32 exceptionCount = 0;

    unordered_map<NFAVertex, u32> pos_trigger;
    unordered_map<NFAVertex, u32> tug_trigger;

    for (u32 i = 0; i < args.repeats.size(); i++) {
        const BoundedRepeatData &br = args.repeats[i];
        assert(!contains(pos_trigger, br.pos_trigger));
        pos_trigger[br.pos_trigger] = i;
        for (auto v : br.tug_triggers) {
            assert(!contains(tug_trigger, v));
            tug_trigger[v] = i;
        }
    }

    for (auto v : vertices_range(h)) {
        const u32 i = args.state_ids.at(v);

        if (i == NO_STATE) {
            continue;
        }

        bool addMe = false;
        ExceptionProto e(num_states);

        if (edge(v, h.accept, h).second && generates_callbacks(h)) {
            /* if the nfa is never used to produce callbacks, there is no
             * need to mark states as exceptional */
            const auto &reports = h[v].reports;

            DEBUG_PRINTF("state %u is exceptional due to accept "
                         "(%zu reports)\n", i, reports.size());

            if (reports.empty()) {
                e.reports_index = MO_INVALID_IDX;
            } else {
                e.reports_index =
                    addReports(reports, reportList, reports_cache);
            }

            // We may be applying a report squash too.
            auto mi = args.reportSquashMap.find(v);
            if (mi != args.reportSquashMap.end()) {
                DEBUG_PRINTF("report squashes states\n");
                assert(e.squash_states.size() == mi->second.size());
                e.squash_states = mi->second;
                e.squash = LIMEX_SQUASH_REPORT;
            }

            addMe = true;
        }

        if (contains(pos_trigger, v)) {
            u32 repeat_index = pos_trigger[v];
            assert(e.trigger == LIMEX_TRIGGER_NONE);
            e.trigger = LIMEX_TRIGGER_POS;
            e.repeat_index = repeat_index;
            DEBUG_PRINTF("state %u has pos trigger for repeat %u\n", i,
                         repeat_index);
            addMe = true;
        }

        if (contains(tug_trigger, v)) {
            u32 repeat_index = tug_trigger[v];
            assert(e.trigger == LIMEX_TRIGGER_NONE);
            e.trigger = LIMEX_TRIGGER_TUG;
            e.repeat_index = repeat_index;

            // TUG triggers can squash the preceding cyclic state.
            u32 cyclic = args.state_ids.at(args.repeats[repeat_index].cyclic);
            e.squash_states.reset(cyclic);
            e.squash = LIMEX_SQUASH_TUG;
            DEBUG_PRINTF("state %u has tug trigger for repeat %u, can squash "
                         "state %u\n", i, repeat_index, cyclic);
            addMe = true;
        }

        // are we a non-limited transition?
        for (const auto &oe : out_edges_range(v, h)) {
            if (contains(exceptional, oe)) {
                NFAVertex w = target(oe, h);
                u32 w_idx = args.state_ids.at(w);
                assert(w_idx != NO_STATE);
                e.succ_states.set(w_idx);
                DEBUG_PRINTF("exceptional transition %u->%u\n", i, w_idx);
                addMe = true;
            }
        }

        // do we lead SOLELY to a squasher state? (we use the successors as
        // a proxy for the out-edge here, so there must be only one for us
        // to do this safely)
        /* The above comment is IMHO bogus and would result in all squashing
         * being disabled around stars */
        if (e.trigger != LIMEX_TRIGGER_TUG) {
            for (auto w : adjacent_vertices_range(v, h)) {
                if (w == v) {
                    continue;
                }
                u32 j = args.state_ids.at(w);
                if (j == NO_STATE) {
                    continue;
                }
                DEBUG_PRINTF("we are checking if succ %u is a squasher\n", j);
                auto mi = args.squashMap.find(w);
                if (mi != args.squashMap.end()) {
                    DEBUG_PRINTF("squasher edge (%u, %u)\n", i, j);
                    DEBUG_PRINTF("e.squash_states.size() == %zu, "
                                 "mi->second.size() = %zu\n",
                                 e.squash_states.size(), mi->second.size());
                    assert(e.squash_states.size() == mi->second.size());
                    e.squash_states = mi->second;

                    // NOTE: this might be being combined with the report
                    // squashing above.

                    e.squash = LIMEX_SQUASH_CYCLIC;
                    DEBUG_PRINTF("squashing succ %u (turns off %zu states)\n",
                                 j, mi->second.size() - mi->second.count());
                    addMe = true;
                }
            }
        }

        if (addMe) {
            // Add 'e' if it isn't in the map, and push state i on to its list
            // of states.
            assert(e.succ_states.size() == num_states);
            assert(e.squash_states.size() == num_states);
            exceptionMap[e].push_back(i);
            exceptionCount++;
        }
    }

    DEBUG_PRINTF("%u exceptions found (%zu unique)\n", exceptionCount,
                 exceptionMap.size());
    return exceptionCount;
}

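/** \brief Convert a reachable depth to a u32, mapping infinite depths to the
 * REPEAT_INF sentinel. */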
static
u32 depth_to_u32(const depth &d) {
    assert(d.is_reachable());
    if (d.is_infinite()) {
        return REPEAT_INF;
    }

    u32 d_val = d;
    assert(d_val < REPEAT_INF);
    return d_val;
}

static
bool isExceptionalTransition(u32 from, u32 to, const build_info &args,
                             u32 maxShift) {
    if (!isLimitedTransition(from, to, maxShift)) {
        return true;
    }

    // All transitions out of a tug trigger are exceptional.
    if (args.tugs.test(from)) {
        return true;
    }
    return false;
}

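/**
 * \brief Find the largest shift distance used by an nShifts-shift scheme.
 *
 * Gathers the distinct shift distances (to - from) of all limited
 * transitions into a bitmask, then pops the nShifts smallest distances; the
 * last one popped is the maximum shift the scheme must support.
 */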
static
u32 findMaxVarShift(const build_info &args, u32 nShifts) {
    const NGHolder &h = args.h;
    u32 shiftMask = 0;
    for (const auto &e : edges_range(h)) {
        u32 from = args.state_ids.at(source(e, h));
        u32 to = args.state_ids.at(target(e, h));
        if (from == NO_STATE || to == NO_STATE) {
            continue;
        }
        if (!isExceptionalTransition(from, to, args, MAX_SHIFT_AMOUNT)) {
            shiftMask |= (1UL << (to - from));
        }
    }

    u32 maxVarShift = 0;
    for (u32 shiftCnt = 0; shiftMask != 0 && shiftCnt < nShifts; shiftCnt++) {
        maxVarShift = findAndClearLSB_32(&shiftMask);
    }

    return maxVarShift;
}

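/**
 * \brief Cost model for implementing this graph with nShifts shifts.
 *
 * Each shift in the scheme costs SHIFT_COST, and each state with at least
 * one exceptional transition costs EXCEPTION_COST; lower scores are better.
 */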
static
int getLimexScore(const build_info &args, u32 nShifts) {
    const NGHolder &h = args.h;
    u32 maxVarShift = nShifts;
    int score = 0;

    score += SHIFT_COST * nShifts;
    maxVarShift = findMaxVarShift(args, nShifts);

    NFAStateSet exceptionalStates(args.num_states);
    for (const auto &e : edges_range(h)) {
        u32 from = args.state_ids.at(source(e, h));
        u32 to = args.state_ids.at(target(e, h));
        if (from == NO_STATE || to == NO_STATE) {
            continue;
        }
        if (isExceptionalTransition(from, to, args, maxVarShift)) {
            exceptionalStates.set(from);
        }
    }
    score += EXCEPTION_COST * exceptionalStates.count();
    return score;
}

// This function finds the shift scheme with the best (lowest) cost score.
// Returns the number of shifts for that scheme and writes the score out via
// bestScoreRet; returns zero if no appropriate scheme was found.
static
u32 findBestNumOfVarShifts(const build_info &args,
                           int *bestScoreRet = nullptr) {
    u32 bestNumOfVarShifts = 0;
    int bestScore = INT_MAX;
    for (u32 shiftCount = 1; shiftCount <= MAX_SHIFT_COUNT; shiftCount++) {
        int score = getLimexScore(args, shiftCount);
        if (score < bestScore) {
            bestScore = score;
            bestNumOfVarShifts = shiftCount;
        }
    }
    if (bestScoreRet != nullptr) {
        *bestScoreRet = bestScore;
    }
    return bestNumOfVarShifts;
}

static
bool cannotDie(const build_info &args, const set<NFAVertex> &tops) {
    const auto &h = args.h;

    // When this top is activated, all of the vertices in 'tops' are switched
    // on. If any of those lead to a graph that cannot die, then this top
    // cannot die.

    // For each top, we use a depth-first search to traverse the graph from the
    // top, looking for a cyclic path consisting of vertices of dot reach. If
    // one exists, then the NFA cannot die after this top is triggered.

    auto colour_map = make_small_color_map(h);

    struct CycleFound {};
    struct CannotDieVisitor : public boost::default_dfs_visitor {
        void back_edge(const NFAEdge &e, const NGHolder &g) const {
            DEBUG_PRINTF("back-edge %zu,%zu\n", g[source(e, g)].index,
                         g[target(e, g)].index);
            if (g[target(e, g)].char_reach.all()) {
                assert(g[source(e, g)].char_reach.all());
                throw CycleFound();
            }
        }
    };

    try {
        for (const auto &top : tops) {
            DEBUG_PRINTF("checking top vertex %zu\n", h[top].index);

            // Constrain the search to the top vertices and any dot vertices it
            // can reach.
            auto term_func = [&](NFAVertex v, const NGHolder &g) {
                if (v == top) {
                    return false;
                }
                if (!g[v].char_reach.all()) {
                    return true;
                }
                if (contains(args.br_cyclic, v) &&
                    args.br_cyclic.at(v).repeatMax != depth::infinity()) {
                    // Bounded repeat vertices without inf max can be turned
                    // off.
                    return true;
                }
                return false;
            };

            boost::depth_first_visit(h, top, CannotDieVisitor(), colour_map,
                                     term_func);
        }
    } catch (const CycleFound &) {
        DEBUG_PRINTF("cycle found\n");
        return true;
    }

    return false;
}

/** \brief True if this NFA cannot ever be in no states at all. */
static
bool cannotDie(const build_info &args) {
    const auto &h = args.h;
    const auto &state_ids = args.state_ids;

    // If we have a startDs we're actually using, we can't die.
    if (state_ids.at(h.startDs) != NO_STATE) {
        DEBUG_PRINTF("is using startDs\n");
        return true;
    }

    return all_of_in(args.tops | map_values, [&](const set<NFAVertex> &verts) {
        return cannotDie(args, verts);
    });
}

template<NFAEngineType dtype>
struct Factory {
    // typedefs for readability, for types derived from traits
    typedef typename NFATraits<dtype>::exception_t exception_t;
    typedef typename NFATraits<dtype>::implNFA_t implNFA_t;
    typedef typename NFATraits<dtype>::tableRow_t tableRow_t;

    static
    void allocState(NFA *nfa, u32 repeatscratchStateSize,
                    u32 repeatStreamState) {
        implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa);

        // LimEx NFAs now store the following in state:
        // 1. state bitvector (always present)
        // 2. space associated with repeats
        // This function just needs to size these correctly.

        u32 stateSize = limex->stateSize;

        DEBUG_PRINTF("bitvector=%zu/%u, repeat full=%u, stream=%u\n",
                     sizeof(limex->init), stateSize, repeatscratchStateSize,
                     repeatStreamState);

        size_t scratchStateSize = NFATraits<dtype>::scratch_state_size;

        if (repeatscratchStateSize) {
            scratchStateSize
                = ROUNDUP_N(scratchStateSize, alignof(RepeatControl));
            scratchStateSize += repeatscratchStateSize;
        }
        size_t streamStateSize = stateSize + repeatStreamState;

        nfa->scratchStateSize = verify_u32(scratchStateSize);
        nfa->streamStateSize = verify_u32(streamStateSize);
    }

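    // Computes the allocation size for one repeat's info block. Layout:
    // NFARepeatInfo header, RepeatInfo, an optional u64a-aligned sparse
    // lookup table (REPEAT_SPARSE_OPTIMAL_P only), then a tableRow_t-aligned
    // tug mask, with the whole block rounded up to a cache line.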
    static
    size_t repeatAllocSize(const BoundedRepeatData &br, u32 *tableOffset,
                           u32 *tugMaskOffset) {
        size_t len = sizeof(NFARepeatInfo) + sizeof(RepeatInfo);

        // sparse lookup table.
        if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
            len = ROUNDUP_N(len, alignof(u64a));
            *tableOffset = verify_u32(len);
            len += sizeof(u64a) * (br.repeatMax + 1);
        } else {
            *tableOffset = 0;
        }

        // tug mask.
        len = ROUNDUP_N(len, alignof(tableRow_t));
        *tugMaskOffset = verify_u32(len);
        len += sizeof(tableRow_t);

        // to simplify layout.
        len = ROUNDUP_CL(len);

        return len;
    }

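    // For each bounded repeat, allocates and fills an NFARepeatInfo block
    // (header, RepeatInfo, optional sparse table, tug mask) and accumulates
    // the stream state and scratch state the repeats will need at runtime.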
    static
    void buildRepeats(const build_info &args,
                      vector<bytecode_ptr<NFARepeatInfo>> &out,
                      u32 *scratchStateSize, u32 *streamState) {
        out.reserve(args.repeats.size());

        u32 repeat_idx = 0;
        for (auto it = args.repeats.begin(), ite = args.repeats.end();
             it != ite; ++it, ++repeat_idx) {
            const BoundedRepeatData &br = *it;
            assert(args.state_ids.at(br.cyclic) != NO_STATE);

            u32 tableOffset, tugMaskOffset;
            size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset);
            auto info = make_zeroed_bytecode_ptr<NFARepeatInfo>(len);
            char *info_ptr = (char *)info.get();

            // Collect state space info.
            RepeatStateInfo rsi(br.type, br.repeatMin, br.repeatMax, br.minPeriod);
            u32 streamStateLen = rsi.packedCtrlSize + rsi.stateSize;

            // Fill the NFARepeatInfo structure.
            info->cyclicState = args.state_ids.at(br.cyclic);
            info->ctrlIndex = repeat_idx;
            info->packedCtrlOffset = *streamState;
            info->stateOffset = *streamState + rsi.packedCtrlSize;
            info->stateSize = streamStateLen;
            info->tugMaskOffset = tugMaskOffset;

            // Fill the RepeatInfo structure.
            RepeatInfo *repeat =
                (RepeatInfo *)(info_ptr + sizeof(NFARepeatInfo));
            repeat->type = br.type;
            repeat->repeatMin = depth_to_u32(br.repeatMin);
            repeat->repeatMax = depth_to_u32(br.repeatMax);
            repeat->horizon = rsi.horizon;
            repeat->packedCtrlSize = rsi.packedCtrlSize;
            repeat->stateSize = rsi.stateSize;
            copy_bytes(repeat->packedFieldSizes, rsi.packedFieldSizes);
            repeat->patchCount = rsi.patchCount;
            repeat->patchSize = rsi.patchSize;
            repeat->encodingSize = rsi.encodingSize;
            repeat->patchesOffset = rsi.patchesOffset;

            u32 repeat_len = sizeof(RepeatInfo);
            if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
                repeat_len += sizeof(u64a) * (rsi.patchSize + 1);
            }
            repeat->length = repeat_len;

            // Copy in the sparse lookup table.
            if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
                assert(!rsi.table.empty());
                copy_bytes(info_ptr + tableOffset, rsi.table);
            }

            // Fill the tug mask.
            tableRow_t *tugMask = (tableRow_t *)(info_ptr + tugMaskOffset);
            for (auto v : br.tug_triggers) {
                u32 state_id = args.state_ids.at(v);
                assert(state_id != NO_STATE);
                maskSetBit(*tugMask, state_id);
            }

            assert(streamStateLen);
            *streamState += streamStateLen;
            *scratchStateSize += sizeof(RepeatControl);

            out.emplace_back(move(info));
        }
    }

    static
    void writeLimexMasks(const build_info &args, implNFA_t *limex) {
        const NGHolder &h = args.h;

        // Init masks.
        u32 s_i = args.state_ids.at(h.start);
        u32 sds_i = args.state_ids.at(h.startDs);

        if (s_i != NO_STATE) {
            maskSetBit(limex->init, s_i);
            if (is_triggered(h)) {
                maskSetBit(limex->initDS, s_i);
            }
        }

        if (sds_i != NO_STATE) {
            maskSetBit(limex->init, sds_i);
            maskSetBit(limex->initDS, sds_i);
        }

        // Zombie mask.
        for (auto v : args.zombies) {
            u32 state_id = args.state_ids.at(v);
            assert(state_id != NO_STATE);
            maskSetBit(limex->zombieMask, state_id);
        }

        // Repeat cyclic mask.
        for (const auto &br : args.repeats) {
            u32 cyclic = args.state_ids.at(br.cyclic);
            assert(cyclic != NO_STATE);
            maskSetBit(limex->repeatCyclicMask, cyclic);
        }
        /* also include tugs in repeat cyclic mask */
        for (size_t i = args.tugs.find_first(); i != args.tugs.npos;
             i = args.tugs.find_next(i)) {
            maskSetBit(limex->repeatCyclicMask, i);
        }
    }

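    // Builds the shiftAmount[] table, one entry per distinct shift distance
    // used by a limited transition, and for each distance a mask of the
    // source states whose transitions move by that amount.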
    static
    void writeShiftMasks(const build_info &args, implNFA_t *limex) {
        const NGHolder &h = args.h;
        u32 maxShift = findMaxVarShift(args, limex->shiftCount);
        u32 shiftMask = 0;
        int shiftMaskIdx = 0;

        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
            if (from == NO_STATE || to == NO_STATE) {
                continue;
            }

            // We check for exceptional transitions here, as we don't want tug
            // trigger transitions emitted as limited transitions (even if they
            // could be in this model).
            if (!isExceptionalTransition(from, to, args, maxShift)) {
                u32 shift = to - from;
                if ((shiftMask & (1UL << shift)) == 0UL) {
                    shiftMask |= (1UL << shift);
                    limex->shiftAmount[shiftMaskIdx++] = (u8)shift;
                }
                assert(limex->shiftCount <= MAX_SHIFT_COUNT);
                for (u32 i = 0; i < limex->shiftCount; i++) {
                    if (limex->shiftAmount[i] == (u8)shift) {
                        maskSetBit(limex->shift[i], from);
                        break;
                    }
                }
            }
        }
        if (maxShift && limex->shiftCount > 1) {
            for (u32 i = 0; i < limex->shiftCount; i++) {
                assert(!isMaskZero(limex->shift[i]));
            }
        }
    }

    static
    void findExceptionalTransitions(const build_info &args,
                                    unordered_set<NFAEdge> &exceptional,
                                    u32 maxShift) {
        const NGHolder &h = args.h;

        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
            if (from == NO_STATE || to == NO_STATE) {
                continue;
            }

            if (isExceptionalTransition(from, to, args, maxShift)) {
                exceptional.insert(e);
            }
        }
    }

    static
    void writeExceptions(const build_info &args,
                         const map<ExceptionProto, vector<u32>> &exceptionMap,
                         const vector<u32> &repeatOffsets, implNFA_t *limex,
                         const u32 exceptionsOffset,
                         const u32 reportListOffset) {
        DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset);

        exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset);
        assert(ISALIGNED(etable));

        map<u32, ExceptionProto> exception_by_state;
        for (const auto &m : exceptionMap) {
            const ExceptionProto &proto = m.first;
            const vector<u32> &states = m.second;
            for (u32 i : states) {
                assert(!contains(exception_by_state, i));
                exception_by_state.emplace(i, proto);
            }
        }

        u32 ecount = 0;
        for (const auto &m : exception_by_state) {
            const ExceptionProto &proto = m.second;
            u32 state_id = m.first;
            DEBUG_PRINTF("exception %u, triggered by state %u\n", ecount,
                         state_id);

            // Write the exception entry.
            exception_t &e = etable[ecount];
            maskSetBits(e.squash, proto.squash_states);
            maskSetBits(e.successors, proto.succ_states);
            if (proto.reports_index == MO_INVALID_IDX) {
                e.reports = MO_INVALID_IDX;
            } else {
                e.reports = reportListOffset +
                            proto.reports_index * sizeof(ReportID);
            }
            e.hasSquash = verify_u8(proto.squash);
            e.trigger = verify_u8(proto.trigger);
            u32 repeat_offset = proto.repeat_index == MO_INVALID_IDX
                                    ? MO_INVALID_IDX
                                    : repeatOffsets[proto.repeat_index];
            e.repeatOffset = repeat_offset;

            // for the state that can switch it on
            // set this bit in the exception mask
            maskSetBit(limex->exceptionMask, state_id);

            ecount++;
        }

        limex->exceptionOffset = exceptionsOffset;
        limex->exceptionCount = ecount;

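        // On AVX512VBMI targets, wide LimEx models can extract exception
        // bits with a byte shuffle: shufMask selects the bytes of the state
        // vector that hold exceptional states, while bitMask/andMask isolate
        // the individual bits within those bytes. The scheme only works when
        // the number of exception bits fits the limit checked below, in
        // which case we set LIMEX_FLAG_EXTRACT_EXP.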
        if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
            const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask);
            u8 *shufMask = (u8 *)&limex->exceptionShufMask;
            u8 *bitMask = (u8 *)&limex->exceptionBitMask;
            u8 *andMask = (u8 *)&limex->exceptionAndMask;

            u32 tot_cnt = 0;
            u32 pos = 0;
            bool valid = true;
            size_t tot = sizeof(limex->exceptionMask);
            size_t base = 0;

            // We normally have up to 64 exceptions to handle, but treat the
            // 384-state LimEx differently to simplify operations.
            size_t limit = 64;
            if (args.num_states > 256 && args.num_states <= 384) {
                limit = 48;
            }

            for (size_t i = 0; i < tot; i++) {
                if (!exceptionMask[i]) {
                    continue;
                }
                u32 bit_cnt = popcount32(exceptionMask[i]);

                tot_cnt += bit_cnt;
                if (tot_cnt > limit) {
                    valid = false;
                    break;
                }

                u32 emsk = exceptionMask[i];
                while (emsk) {
                    u32 t = findAndClearLSB_32(&emsk);
                    bitMask[pos] = 1U << t;
                    andMask[pos] = 1U << t;
                    shufMask[pos++] = i + base;

                    if (pos == 32 &&
                        (args.num_states > 128 && args.num_states <= 256)) {
                        base += 32;
                    }
                }
            }
            // Avoid matching unused bytes
            for (u32 i = pos; i < 64; i++) {
                bitMask[i] = 0xff;
            }
            if (valid) {
                setLimexFlag(limex, LIMEX_FLAG_EXTRACT_EXP);
            }
        }
    }

    static
    void writeReachMapping(const vector<NFAStateSet> &reach,
                           const vector<u8> &reachMap, implNFA_t *limex,
                           const u32 reachOffset) {
        DEBUG_PRINTF("reachOffset=%u\n", reachOffset);

        // Reach mapping is inside the LimEx structure.
        copy(reachMap.begin(), reachMap.end(), &limex->reachMap[0]);

        // Reach table is right after the LimEx structure.
        tableRow_t *reachMask = (tableRow_t *)((char *)limex + reachOffset);
        assert(ISALIGNED(reachMask));
        for (size_t i = 0, end = reach.size(); i < end; i++) {
            maskSetBits(reachMask[i], reach[i]);
        }
        limex->reachSize = verify_u32(reach.size());
    }

    static
    void writeTopMasks(const vector<NFAStateSet> &tops, implNFA_t *limex,
                       const u32 topsOffset) {
        DEBUG_PRINTF("topsOffset=%u\n", topsOffset);

        limex->topOffset = topsOffset;
        tableRow_t *topMasks = (tableRow_t *)((char *)limex + topsOffset);
        assert(ISALIGNED(topMasks));

        for (size_t i = 0, end = tops.size(); i < end; i++) {
            maskSetBits(topMasks[i], tops[i]);
        }

        limex->topCount = verify_u32(tops.size());
    }

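    // For each accelerable state, writes the PSHUFB permute/compare byte
    // pair used to gather accel state bits: the permute byte selects the
    // byte of the 128-bit lane holding the state's bit, and the compare
    // byte is the complement of that bit within its byte.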
    static
    void writeAccelSsse3Masks(const NFAStateSet &accelMask, implNFA_t *limex) {
        char *perm_base = (char *)&limex->accelPermute;
        char *comp_base = (char *)&limex->accelCompare;

        u32 num = 0; // index in accel table.
        for (size_t i = accelMask.find_first(); i != accelMask.npos;
             i = accelMask.find_next(i), ++num) {
            u32 state_id = verify_u32(i);
            DEBUG_PRINTF("accel num=%u, state=%u\n", num, state_id);

            // PSHUFB permute and compare masks
            size_t mask_idx = sizeof(u_128) * (state_id / 128U);
            DEBUG_PRINTF("mask_idx=%zu\n", mask_idx);
            u_128 *perm = (u_128 *)(perm_base + mask_idx);
            u_128 *comp = (u_128 *)(comp_base + mask_idx);
            maskSetByte(*perm, num, ((state_id % 128U) / 8U));
            maskSetByte(*comp, num, ~(1U << (state_id % 8U)));
        }
    }

    static
    void writeAccel(const NFAStateSet &accelMask,
                    const NFAStateSet &accelFriendsMask,
                    const AccelAuxVector &accelAux,
                    const vector<u8> &accelTable, implNFA_t *limex,
                    const u32 accelTableOffset, const u32 accelAuxOffset) {
        DEBUG_PRINTF("accelTableOffset=%u, accelAuxOffset=%u\n",
                      accelTableOffset, accelAuxOffset);

        // Write accel lookup table.
        limex->accelTableOffset = accelTableOffset;
        copy(accelTable.begin(), accelTable.end(),
             (u8 *)((char *)limex + accelTableOffset));

        // Write accel aux structures.
        limex->accelAuxOffset = accelAuxOffset;
        AccelAux *auxTable = (AccelAux *)((char *)limex + accelAuxOffset);
        assert(ISALIGNED(auxTable));
        copy(accelAux.begin(), accelAux.end(), auxTable);

        // Write LimEx structure members.
        limex->accelCount = verify_u32(accelTable.size());
        // FIXME: accelAuxCount is unused?
        limex->accelAuxCount = verify_u32(accelAux.size());

        // Write LimEx masks.
        maskSetBits(limex->accel, accelMask);
        maskSetBits(limex->accel_and_friends, accelFriendsMask);

        // We can use PSHUFB-based shuffles for models >= 128 states. These
        // require some additional masks in the bytecode.
        maskClear(limex->accelCompare);
        maskFill(limex->accelPermute, (char)0x80);
        if (NFATraits<dtype>::maxStates >= 128) {
            writeAccelSsse3Masks(accelMask, limex);
        }
    }

    static
    void writeAccepts(const NFAStateSet &acceptMask,
                      const NFAStateSet &acceptEodMask,
                      const vector<NFAAccept> &accepts,
                      const vector<NFAAccept> &acceptsEod,
                      const vector<NFAStateSet> &squash, implNFA_t *limex,
                      const u32 acceptsOffset, const u32 acceptsEodOffset,
                      const u32 squashOffset, const u32 reportListOffset) {
        char *limex_base = (char *)limex;

        DEBUG_PRINTF("acceptsOffset=%u, acceptsEodOffset=%u, squashOffset=%u\n",
                     acceptsOffset, acceptsEodOffset, squashOffset);

        // LimEx masks (in structure)
        maskSetBits(limex->accept, acceptMask);
        maskSetBits(limex->acceptAtEOD, acceptEodMask);

        // Transforms the indices (report list, squash mask) into offsets
        // relative to the base of the limex.
        auto transform_offset_fn = [&](NFAAccept a) {
            if (!a.single_report) {
                a.reports = reportListOffset + a.reports * sizeof(ReportID);
            }
            a.squash = squashOffset + a.squash * sizeof(tableRow_t);
            return a;
        };

        // Write accept table.
        limex->acceptOffset = acceptsOffset;
        limex->acceptCount = verify_u32(accepts.size());
        DEBUG_PRINTF("NFA has %zu accepts\n", accepts.size());
        NFAAccept *acceptsTable = (NFAAccept *)(limex_base + acceptsOffset);
        assert(ISALIGNED(acceptsTable));
        transform(accepts.begin(), accepts.end(), acceptsTable,
                  transform_offset_fn);

        // Write eod accept table.
        limex->acceptEodOffset = acceptsEodOffset;
        limex->acceptEodCount = verify_u32(acceptsEod.size());
        DEBUG_PRINTF("NFA has %zu EOD accepts\n", acceptsEod.size());
        NFAAccept *acceptsEodTable = (NFAAccept *)(limex_base + acceptsEodOffset);
        assert(ISALIGNED(acceptsEodTable));
        transform(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable,
                  transform_offset_fn);

        // Write squash mask table.
        limex->squashCount = verify_u32(squash.size());
        limex->squashOffset = squashOffset;
        DEBUG_PRINTF("NFA has %zu report squash masks\n", squash.size());
        tableRow_t *mask = (tableRow_t *)(limex_base + squashOffset);
        assert(ISALIGNED(mask));
        for (size_t i = 0, end = squash.size(); i < end; i++) {
            maskSetBits(mask[i], squash[i]);
        }
    }

    static
    void writeRepeats(const vector<bytecode_ptr<NFARepeatInfo>> &repeats,
                      vector<u32> &repeatOffsets, implNFA_t *limex,
                      const u32 repeatOffsetsOffset, const u32 repeatOffset) {
        const u32 num_repeats = verify_u32(repeats.size());

        DEBUG_PRINTF("repeatOffsetsOffset=%u, repeatOffset=%u\n",
                      repeatOffsetsOffset, repeatOffset);

        repeatOffsets.resize(num_repeats);
        u32 offset = repeatOffset;

        for (u32 i = 0; i < num_repeats; i++) {
            repeatOffsets[i] = offset;
            assert(repeats[i]);
            memcpy((char *)limex + offset, repeats[i].get(), repeats[i].size());
            offset += repeats[i].size();
        }

        // Write repeat offset lookup table.
        assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32)));
        copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets);

        limex->repeatOffset = repeatOffsetsOffset;
        limex->repeatCount = num_repeats;
    }

    static
    void writeReportList(const vector<ReportID> &reports, implNFA_t *limex,
                         const u32 reportListOffset) {
        DEBUG_PRINTF("reportListOffset=%u\n", reportListOffset);
        assert(ISALIGNED_N((char *)limex + reportListOffset,
                           alignof(ReportID)));
        copy_bytes((char *)limex + reportListOffset, reports);
    }

    static
    bytecode_ptr<NFA> generateNfa(const build_info &args) {
        if (args.num_states > NFATraits<dtype>::maxStates) {
            return nullptr;
        }

        // Build bounded repeat structures.
        vector<bytecode_ptr<NFARepeatInfo>> repeats;
        u32 repeats_full_state = 0;
        u32 repeats_stream_state = 0;
        buildRepeats(args, repeats, &repeats_full_state, &repeats_stream_state);
        size_t repeatSize = 0;
        for (size_t i = 0; i < repeats.size(); i++) {
            repeatSize += repeats[i].size();
        }

        // We track report lists that have already been written into the global
        // list in case we can reuse them.
        ReportListCache reports_cache;

        unordered_set<NFAEdge> exceptional;
        u32 shiftCount = findBestNumOfVarShifts(args);
        assert(shiftCount);
        u32 maxShift = findMaxVarShift(args, shiftCount);
        findExceptionalTransitions(args, exceptional, maxShift);

        map<ExceptionProto, vector<u32>> exceptionMap;
        vector<ReportID> reportList;

        u32 exceptionCount = buildExceptionMap(args, reports_cache, exceptional,
                                               exceptionMap, reportList);

        assert(exceptionCount <= args.num_states);

        // Build reach table and character mapping.
        vector<NFAStateSet> reach;
        vector<u8> reachMap;
        buildReachMapping(args, reach, reachMap);

        // Build top masks.
        vector<NFAStateSet> tops;
        buildTopMasks(args, tops);

        // Build all our accept info.
        NFAStateSet acceptMask, acceptEodMask;
        vector<NFAAccept> accepts, acceptsEod;
        vector<NFAStateSet> squash;
        buildAccepts(args, reports_cache, acceptMask, acceptEodMask, accepts,
                     acceptsEod, reportList, squash);

        // Build all our accel info.
        NFAStateSet accelMask, accelFriendsMask;
        AccelAuxVector accelAux;
        vector<u8> accelTable;
        buildAccel(args, accelMask, accelFriendsMask, accelAux, accelTable);

        // Compute the offsets in the bytecode for this LimEx NFA for all of
        // our structures. First, the NFA and LimEx structures. All other
        // offsets are relative to the start of the LimEx struct, starting with
        // the reach table.
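        // Layout order after the LimEx struct: reach table, top masks, accel
        // table, accel aux structures, accepts, EOD accepts, squash masks,
        // exception table, report list, repeat offset table, then the repeat
        // info blocks themselves.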
        u32 offset = sizeof(implNFA_t);

        const u32 reachOffset = offset;
        offset += sizeof(tableRow_t) * reach.size();

        const u32 topsOffset = offset;
        offset += sizeof(tableRow_t) * tops.size();

        const u32 accelTableOffset = offset;
        offset += sizeof(u8) * accelTable.size();

        offset = ROUNDUP_N(offset, alignof(AccelAux));
        const u32 accelAuxOffset = offset;
        offset += sizeof(AccelAux) * accelAux.size();

        offset = ROUNDUP_N(offset, alignof(NFAAccept));
        const u32 acceptsOffset = offset;
        offset += sizeof(NFAAccept) * accepts.size();
        const u32 acceptsEodOffset = offset;
        offset += sizeof(NFAAccept) * acceptsEod.size();

        offset = ROUNDUP_CL(offset);
        const u32 squashOffset = offset;
        offset += sizeof(tableRow_t) * squash.size();

        offset = ROUNDUP_CL(offset);
        const u32 exceptionsOffset = offset;
        offset += sizeof(exception_t) * exceptionCount;

        const u32 reportListOffset = offset;
        offset += sizeof(ReportID) * reportList.size();

        const u32 repeatOffsetsOffset = offset;
        offset += sizeof(u32) * args.repeats.size();

        offset = ROUNDUP_CL(offset);
        const u32 repeatsOffset = offset;
        offset += repeatSize;

        // Now we can allocate space for the NFA and get to work on layout.

        size_t nfaSize = sizeof(NFA) + offset;
        DEBUG_PRINTF("nfa size %zu\n", nfaSize);
        auto nfa = make_zeroed_bytecode_ptr<NFA>(nfaSize);
        assert(nfa); // otherwise we would have thrown std::bad_alloc

        implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa.get());
        assert(ISALIGNED(limex));

        writeReachMapping(reach, reachMap, limex, reachOffset);

        writeTopMasks(tops, limex, topsOffset);

        writeAccel(accelMask, accelFriendsMask, accelAux, accelTable,
                   limex, accelTableOffset, accelAuxOffset);

        writeAccepts(acceptMask, acceptEodMask, accepts, acceptsEod, squash,
                     limex, acceptsOffset, acceptsEodOffset, squashOffset,
                     reportListOffset);

        limex->shiftCount = shiftCount;
        writeShiftMasks(args, limex);

        if (cannotDie(args)) {
            DEBUG_PRINTF("nfa cannot die\n");
            setLimexFlag(limex, LIMEX_FLAG_CANNOT_DIE);
        }

        // Determine the state required for our state vector.
        findStateSize(args, limex);

        writeReportList(reportList, limex, reportListOffset);

        // Repeat structures and offset table.
        vector<u32> repeatOffsets;
        writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset,
                     repeatsOffset);

        writeExceptions(args, exceptionMap, repeatOffsets, limex, exceptionsOffset,
                        reportListOffset);

        writeLimexMasks(args, limex);

        allocState(nfa.get(), repeats_full_state, repeats_stream_state);

        nfa->type = dtype;
        nfa->length = verify_u32(nfaSize);
        nfa->nPositions = args.num_states;

        if (!args.zombies.empty()) {
            setNfaFlag(nfa.get(), NFA_ZOMBIE);
        }
        if (!acceptsEod.empty()) {
            setNfaFlag(nfa.get(), NFA_ACCEPTS_EOD);
        }

        return nfa;
    }

    static int score(const build_info &args) {
        // LimEx NFAs are available in sizes from 32 to 512 bits.
        size_t num_states = args.num_states;

        size_t sz = findContainerSize(num_states);
        if (sz < 32) {
            sz = 32;
        }

        if (args.cc.grey.nfaForceSize) {
            sz = args.cc.grey.nfaForceSize;
        }

        if (sz != NFATraits<dtype>::maxStates) {
            return -1; // fail, size not appropriate
        }

        // We are of the right size, calculate a score based on the number
        // of exceptions and the number of shifts used by this LimEx.
        int score;
        u32 shiftCount = findBestNumOfVarShifts(args, &score);
        if (shiftCount == 0) {
            return -1;
        }
        return score;
    }
};

template<NFAEngineType dtype>
struct generateNfa {
    static bytecode_ptr<NFA> call(const build_info &args) {
        return Factory<dtype>::generateNfa(args);
    }
};

template<NFAEngineType dtype>
struct scoreNfa {
    static int call(const build_info &args) {
        return Factory<dtype>::score(args);
    }
};

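// Instantiates NFATraits for each LimEx model size, binding the
// implementation structure, state-vector row type and exception type for
// that width. Note that the 64-bit model sizes its scratch state as an m128.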
#define MAKE_LIMEX_TRAITS(mlt_size)                                            \
    template<> struct NFATraits<LIMEX_NFA_##mlt_size> {                        \
        typedef LimExNFA##mlt_size implNFA_t;                                  \
        typedef u_##mlt_size tableRow_t;                                       \
        typedef NFAException##mlt_size exception_t;                            \
        static const size_t maxStates = mlt_size;                              \
        static const size_t scratch_state_size = mlt_size == 64 ? sizeof(m128) \
                                                 : sizeof(tableRow_t);         \
    };

MAKE_LIMEX_TRAITS(32)
MAKE_LIMEX_TRAITS(64)
MAKE_LIMEX_TRAITS(128)
MAKE_LIMEX_TRAITS(256)
MAKE_LIMEX_TRAITS(384)
MAKE_LIMEX_TRAITS(512)

} // namespace

#ifndef NDEBUG
// Some sanity tests, called by an assertion in generate().
static UNUSED
bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
            const unordered_map<NFAVertex, u32> &state_ids,
            u32 num_states) {
    unordered_set<u32> seen;
    unordered_set<NFAVertex> top_starts;
    for (const auto &vv : tops | map_values) {
        insert(&top_starts, vv);
    }

    for (auto v : vertices_range(h)) {
        if (!contains(state_ids, v)) {
            DEBUG_PRINTF("no entry for vertex %zu in state map\n", h[v].index);
            return false;
        }
        const u32 i = state_ids.at(v);
        if (i == NO_STATE) {
            continue;
        }

        DEBUG_PRINTF("checking vertex %zu (state %u)\n", h[v].index, i);

        if (i >= num_states || contains(seen, i)) {
            DEBUG_PRINTF("vertex %u/%u has invalid state\n", i, num_states);
            return false;
        }
        seen.insert(i);

        // All our states should be reachable and have a state assigned.
        if (h[v].char_reach.none()) {
            DEBUG_PRINTF("vertex %zu has empty reachability\n", h[v].index);
            return false;
        }

        // Every state that isn't a start state (or top, in triggered NFAs)
        // must have at least one predecessor that is not itself.
        if (v != h.start && v != h.startDs && !contains(top_starts, v)
            && !proper_in_degree(v, h)) {
            DEBUG_PRINTF("vertex %zu has no pred\n", h[v].index);
            return false;
        }
    }

    if (seen.size() != num_states) {
        return false;
    }

    return true;
}
#endif // NDEBUG

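/**
 * \brief Heuristic: is this NFA likely to be fast?
 *
 * Small NFAs with no bounded repeats are considered fast. Otherwise we walk
 * breadth-first from the top vertices for up to MIN_REPEAT_TRIGGER_DISTANCE
 * levels: finding a pos trigger that close to the start whose repeat bound
 * exceeds MAX_REPEAT_SIZE and whose reach is wider than
 * MAX_REPEAT_CHAR_REACH marks the NFA as slow.
 */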
static
bool isFast(const build_info &args) {
    const NGHolder &h = args.h;
    const u32 num_states = args.num_states;

    if (num_states > MAX_SMALL_NFA_STATES) {
        return false;
    }

    unordered_map<NFAVertex, bool> pos_trigger;
    for (u32 i = 0; i < args.repeats.size(); i++) {
        const BoundedRepeatData &br = args.repeats[i];
        assert(!contains(pos_trigger, br.pos_trigger));
        pos_trigger[br.pos_trigger] = br.repeatMax <= MAX_REPEAT_SIZE;
    }

    // Small NFA without bounded repeat should be fast.
    if (pos_trigger.empty()) {
        return true;
    }

    vector<NFAVertex> cur;
    unordered_set<NFAVertex> visited;
    for (const auto &m : args.tops) {
        for (NFAVertex v : m.second) {
            cur.push_back(v);
            visited.insert(v);
        }
    }

    u8 pos_dist = 0;
    while (!cur.empty()) {
        vector<NFAVertex> next;
        for (const auto &v : cur) {
            if (contains(pos_trigger, v)) {
                const CharReach &cr = h[v].char_reach;
                if (!pos_trigger[v] && cr.count() > MAX_REPEAT_CHAR_REACH) {
                    return false;
                }
            }
            for (const auto &w : adjacent_vertices_range(v, h)) {
                if (w == v) {
                    continue;
                }
                u32 j = args.state_ids.at(w);
                if (j == NO_STATE) {
                    continue;
                }
                if (!contains(visited, w)) {
                    next.push_back(w);
                    visited.insert(w);
                }
            }
        }
        if (++pos_dist >= MIN_REPEAT_TRIGGER_DISTANCE) {
            break;
        }
        swap(cur, next);
    }
    return true;
}

static
u32 max_state(const unordered_map<NFAVertex, u32> &state_ids) {
    u32 rv = 0;
    for (const auto &m : state_ids) {
        DEBUG_PRINTF("state %u\n", m.second);
        if (m.second != NO_STATE) {
            rv = max(m.second, rv);
        }
    }
    DEBUG_PRINTF("max %u\n", rv);
    return rv;
}

bytecode_ptr<NFA> generate(NGHolder &h,
                const unordered_map<NFAVertex, u32> &states,
                const vector<BoundedRepeatData> &repeats,
                const unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
                const unordered_map<NFAVertex, NFAStateSet> &squashMap,
                const map<u32, set<NFAVertex>> &tops,
                const set<NFAVertex> &zombies, bool do_accel,
                bool stateCompression, bool &fast, u32 hint,
                const CompileContext &cc) {
    const u32 num_states = max_state(states) + 1;
    DEBUG_PRINTF("total states: %u\n", num_states);

    if (!cc.grey.allowLimExNFA) {
        DEBUG_PRINTF("limex not allowed\n");
        return nullptr;
    }

    // If you ask for a particular type, it had better be an NFA.
    assert(hint == INVALID_NFA || hint <= LAST_LIMEX_NFA);
    DEBUG_PRINTF("hint=%u\n", hint);

    // Sanity check the input data.
    assert(isSane(h, tops, states, num_states));

    // Build arguments used in the rest of this file.
    build_info arg(h, states, repeats, reportSquashMap, squashMap, tops,
                   zombies, do_accel, stateCompression, cc, num_states);

    // Acceleration analysis.
    fillAccelInfo(arg);

    vector<pair<int, NFAEngineType>> scores;

    if (hint != INVALID_NFA) {
        // The caller has told us what to (attempt to) build.
        scores.emplace_back(0, (NFAEngineType)hint);
    } else {
        for (size_t i = 0; i <= LAST_LIMEX_NFA; i++) {
            NFAEngineType ntype = (NFAEngineType)i;
            int score = DISPATCH_BY_LIMEX_TYPE(ntype, scoreNfa, arg);
            if (score >= 0) {
                DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score);
                scores.emplace_back(score, ntype);
            }
        }
    }

    if (scores.empty()) {
        DEBUG_PRINTF("No NFA returned a valid score for this case.\n");
        return nullptr;
    }

    // Sort acceptable models in priority order, lowest score first.
    sort(scores.begin(), scores.end());

    for (const auto &elem : scores) {
        assert(elem.first >= 0);
        NFAEngineType limex_model = elem.second;
        auto nfa = DISPATCH_BY_LIMEX_TYPE(limex_model, generateNfa, arg);
        if (nfa) {
            DEBUG_PRINTF("successful build with NFA engine: %s\n",
                         nfa_type_name(limex_model));
            fast = isFast(arg);
            return nfa;
        }
    }

    DEBUG_PRINTF("NFA build failed.\n");
    return nullptr;
}

u32 countAccelStates(NGHolder &h,
                const unordered_map<NFAVertex, u32> &states,
                const vector<BoundedRepeatData> &repeats,
                const unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
                const unordered_map<NFAVertex, NFAStateSet> &squashMap,
                const map<u32, set<NFAVertex>> &tops,
                const set<NFAVertex> &zombies,
                const CompileContext &cc) {
    const u32 num_states = max_state(states) + 1;
    DEBUG_PRINTF("total states: %u\n", num_states);

    if (!cc.grey.allowLimExNFA) {
        DEBUG_PRINTF("limex not allowed\n");
        return 0;
    }

    // Sanity check the input data.
    assert(isSane(h, tops, states, num_states));

    const bool do_accel = true;
    const bool state_compression = false;

    // Build arguments used in the rest of this file.
    build_info bi(h, states, repeats, reportSquashMap, squashMap, tops, zombies,
                  do_accel, state_compression, cc, num_states);

    // Acceleration analysis.
    nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);

    u32 num_accel = verify_u32(bi.accel.accel_map.size());
    DEBUG_PRINTF("found %u accel states\n", num_accel);
    return num_accel;
}

} // namespace ue2