1 /*
2  * Copyright (c) 2015-2018, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief NG and graph handling.
31  */
32 #include "ng.h"
33 
34 #include "grey.h"
35 #include "ng_anchored_acyclic.h"
36 #include "ng_anchored_dots.h"
37 #include "ng_asserts.h"
38 #include "ng_calc_components.h"
39 #include "ng_cyclic_redundancy.h"
40 #include "ng_dump.h"
41 #include "ng_edge_redundancy.h"
42 #include "ng_equivalence.h"
43 #include "ng_extparam.h"
44 #include "ng_fixed_width.h"
45 #include "ng_fuzzy.h"
46 #include "ng_haig.h"
47 #include "ng_literal_component.h"
48 #include "ng_literal_decorated.h"
49 #include "ng_misc_opt.h"
50 #include "ng_puff.h"
51 #include "ng_prefilter.h"
52 #include "ng_prune.h"
53 #include "ng_redundancy.h"
54 #include "ng_region.h"
55 #include "ng_region_redundancy.h"
56 #include "ng_reports.h"
57 #include "ng_sep.h"
58 #include "ng_small_literal_set.h"
59 #include "ng_som.h"
60 #include "ng_vacuous.h"
61 #include "ng_violet.h"
62 #include "ng_utf8.h"
63 #include "ng_util.h"
64 #include "ng_width.h"
65 #include "ue2common.h"
66 #include "compiler/compiler.h"
67 #include "nfa/goughcompile.h"
68 #include "rose/rose_build.h"
69 #include "smallwrite/smallwrite_build.h"
70 #include "util/compile_error.h"
71 #include "util/container.h"
72 #include "util/depth.h"
73 #include "util/graph_range.h"
74 #include "util/make_unique.h"
75 #include "util/ue2string.h"
76 
77 using namespace std;
78 
79 namespace ue2 {
80 
NG(const CompileContext & in_cc,size_t num_patterns,unsigned in_somPrecision)81 NG::NG(const CompileContext &in_cc, size_t num_patterns,
82        unsigned in_somPrecision)
83     : maxSomRevHistoryAvailable(in_cc.grey.somMaxRevNfaLength),
84       minWidth(depth::infinity()),
85       rm(in_cc.grey),
86       ssm(in_somPrecision),
87       cc(in_cc),
88       smwr(makeSmallWriteBuilder(num_patterns, rm, cc)),
89       rose(makeRoseBuilder(rm, ssm, *smwr, cc, boundary)) {
90 }
91 
~NG()92 NG::~NG() {
93     // empty
94 }
95 
96 /** \brief SOM handling code, called by \ref addComponent.
97  *
98  * \return true if the component was handled completely by something (e.g. a
99  * Haig outfix), false if SOM could be established but implementation via an
100  * engine will be required.
101  *
102  * \throw CompileError if SOM cannot be supported for the component.
103  */
104 static
addComponentSom(NG & ng,NGHolder & g,const ExpressionInfo & expr,const som_type som,const u32 comp_id)105 bool addComponentSom(NG &ng, NGHolder &g, const ExpressionInfo &expr,
106                      const som_type som, const u32 comp_id) {
107     DEBUG_PRINTF("doing som\n");
108     dumpComponent(g, "03_presom", expr.index, comp_id, ng.cc.grey);
109     assert(hasCorrectlyNumberedVertices(g));
110     assert(allMatchStatesHaveReports(g));
111 
112     // First, we try the "SOM chain" support in ng_som.cpp.
113 
114     sombe_rv rv = doSom(ng, g, expr, comp_id, som);
115     if (rv == SOMBE_HANDLED_INTERNAL) {
116         return false;
117     } else if (rv == SOMBE_HANDLED_ALL) {
118         return true;
119     }
120     assert(rv == SOMBE_FAIL);
121 
122     /* Next, Sombe style approaches */
123     rv = doSomWithHaig(ng, g, expr, comp_id, som);
124     if (rv == SOMBE_HANDLED_INTERNAL) {
125         return false;
126     } else if (rv == SOMBE_HANDLED_ALL) {
127         return true;
128     }
129     assert(rv == SOMBE_FAIL);
130 
131     // If the previous approach could not support this pattern, we try treating
132     // it monolithically, as a Haig outfix.
133 
134     vector<vector<CharReach> > triggers; /* empty for outfix */
135 
136     assert(g.kind == NFA_OUTFIX);
137     dumpComponent(g, "haig", expr.index, comp_id, ng.cc.grey);
138     makeReportsSomPass(ng.rm, g);
139     auto haig = attemptToBuildHaig(g, som, ng.ssm.somPrecision(), triggers,
140                                    ng.cc.grey);
141     if (haig) {
142         DEBUG_PRINTF("built haig outfix\n");
143         ng.rose->addOutfix(g, *haig);
144         return true;
145     }
146 
147     /* Our various strategies for supporting SOM for this pattern have failed.
148      * Provide a generic pattern not supported/too large return value as it is
149      * unclear what the meaning of a specific SOM error would be */
150     throw CompileError(expr.index, "Pattern is too large.");
151 
152     assert(0); // unreachable
153     return false;
154 }
155 
reduceGraph(NGHolder & g,som_type som,bool utf8,const CompileContext & cc)156 void reduceGraph(NGHolder &g, som_type som, bool utf8,
157                  const CompileContext &cc) {
158     if (!cc.grey.performGraphSimplification) {
159         return;
160     }
161 
162     // We run reduction passes until either the graph stops changing or we hit
163     // a (small) limit.
164 
165     if (!som) {
166         mergeCyclicDotStars(g);
167     }
168 
169     const unsigned MAX_PASSES = 3;
170     for (unsigned pass = 1; pass <= MAX_PASSES; pass++) {
171         bool changed = false;
172         DEBUG_PRINTF("reduce pass %u/%u\n", pass, MAX_PASSES);
173         changed |= removeEdgeRedundancy(g, som, cc);
174         changed |= reduceGraphEquivalences(g, cc);
175         changed |= removeRedundancy(g, som);
176         changed |= removeCyclicPathRedundancy(g);
177         if (!changed) {
178             DEBUG_PRINTF("graph unchanged after pass %u, stopping\n", pass);
179             break;
180         }
181     }
182 
183     if (utf8) {
184         utf8DotRestoration(g, som);
185     }
186 
187     /* Minor non-redundancy improvements */
188     if (improveGraph(g, som)) {
189         /* may be some more edges to remove */
190         removeEdgeRedundancy(g, som, cc);
191     }
192 
193     removeCyclicDominated(g, som);
194 
195     if (!som) {
196         mergeCyclicDotStars(g);
197     }
198 
199     if (!som) {
200         removeSiblingsOfStartDotStar(g);
201     }
202 }
203 
204 static
addComponent(NG & ng,NGHolder & g,const ExpressionInfo & expr,const som_type som,const u32 comp_id)205 bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
206                   const som_type som, const u32 comp_id) {
207     const CompileContext &cc = ng.cc;
208     assert(hasCorrectlyNumberedVertices(g));
209 
210     DEBUG_PRINTF("expr=%u, comp=%u: %zu vertices, %zu edges\n",
211                  expr.index, comp_id, num_vertices(g), num_edges(g));
212 
213     dumpComponent(g, "01_begin", expr.index, comp_id, ng.cc.grey);
214 
215     assert(allMatchStatesHaveReports(g));
216 
217     reduceExtendedParams(g, ng.rm, som);
218     reduceGraph(g, som, expr.utf8, cc);
219 
220     dumpComponent(g, "02_reduced", expr.index, comp_id, ng.cc.grey);
221 
222     // There may be redundant regions that we can remove
223     if (cc.grey.performGraphSimplification) {
224         removeRegionRedundancy(g, som);
225     }
226 
227     // We might be done at this point: if we've run out of vertices, we can
228     // stop processing.
229     if (num_vertices(g) == N_SPECIALS) {
230         DEBUG_PRINTF("all vertices claimed\n");
231         return true;
232     }
233 
234     // "Short Exhaustible Passthrough" patterns always become outfixes.
235     if (!som && isSEP(g, ng.rm, cc.grey)) {
236         DEBUG_PRINTF("graph is SEP\n");
237         if (ng.rose->addOutfix(g)) {
238             return true;
239         }
240     }
241 
242     // Start Of Match handling.
243     if (som) {
244         if (addComponentSom(ng, g, expr, som, comp_id)) {
245             return true;
246         }
247     }
248 
249     assert(allMatchStatesHaveReports(g));
250 
251     if (splitOffAnchoredAcyclic(*ng.rose, g, cc)) {
252         return true;
253     }
254 
255     if (handleSmallLiteralSets(*ng.rose, g, cc)
256         || handleFixedWidth(*ng.rose, g, cc.grey)) {
257         return true;
258     }
259 
260     if (handleDecoratedLiterals(*ng.rose, g, cc)) {
261         return true;
262     }
263 
264     if (doViolet(*ng.rose, g, expr.prefilter, false, ng.rm, cc)) {
265         return true;
266     }
267 
268     if (splitOffPuffs(*ng.rose, ng.rm, g, expr.prefilter, cc)) {
269         return true;
270     }
271 
272     if (handleSmallLiteralSets(*ng.rose, g, cc)
273         || handleFixedWidth(*ng.rose, g, cc.grey)) {
274         return true;
275     }
276 
277     if (handleDecoratedLiterals(*ng.rose, g, cc)) {
278         return true;
279     }
280 
281     if (doViolet(*ng.rose, g, expr.prefilter, true, ng.rm, cc)) {
282         return true;
283     }
284 
285     DEBUG_PRINTF("testing for outfix\n");
286     assert(allMatchStatesHaveReports(g));
287     if (ng.rose->addOutfix(g)) {
288         return true;
289     }
290 
291     return false;
292 }
293 
294 // Returns true if all components have been added.
295 static
processComponents(NG & ng,ExpressionInfo & expr,deque<unique_ptr<NGHolder>> & g_comp,const som_type som)296 bool processComponents(NG &ng, ExpressionInfo &expr,
297                        deque<unique_ptr<NGHolder>> &g_comp,
298                        const som_type som) {
299     const u32 num_components = g_comp.size();
300 
301     u32 failed = 0;
302     for (u32 i = 0; i < num_components; i++) {
303         if (!g_comp[i]) {
304             continue;
305         }
306         if (addComponent(ng, *g_comp[i], expr, som, i)) {
307             g_comp[i].reset();
308             continue;
309         }
310 
311         if (som) { /* bail immediately */
312             return false;
313         }
314         failed++;
315     }
316 
317     if (!failed) {
318         DEBUG_PRINTF("all components claimed\n");
319         return true;
320     }
321 
322     DEBUG_PRINTF("%u components still remain\n", failed);
323     return false;
324 }
325 
addGraph(ExpressionInfo & expr,unique_ptr<NGHolder> g_ptr)326 bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
327     assert(g_ptr);
328     NGHolder &g = *g_ptr;
329 
330     // remove reports that aren't on vertices connected to accept.
331     clearReports(g);
332 
333     som_type som = expr.som;
334     if (som && isVacuous(g)) {
335         throw CompileError(expr.index, "Start of match is not "
336                            "currently supported for patterns which match an "
337                            "empty buffer.");
338     }
339 
340     dumpDotWrapper(g, expr, "01_initial", cc.grey);
341     assert(allMatchStatesHaveReports(g));
342 
343     /* ensure utf8 starts at cp boundary */
344     ensureCodePointStart(rm, g, expr);
345 
346     if (can_never_match(g)) {
347         throw CompileError(expr.index, "Pattern can never match.");
348     }
349 
350     bool hamming = expr.hamm_distance > 0;
351     u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
352 
353     DEBUG_PRINTF("edit distance = %u hamming = %s\n", e_dist, hamming ? "true" : "false");
354 
355     // validate graph's suitability for fuzzing before resolving asserts
356     validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
357 
358     resolveAsserts(rm, g, expr);
359     dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey);
360     assert(allMatchStatesHaveReports(g));
361 
362     make_fuzzy(g, e_dist, hamming, cc.grey);
363     dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey);
364 
365     pruneUseless(g);
366     pruneEmptyVertices(g);
367 
368     if (can_never_match(g)) {
369         throw CompileError(expr.index, "Pattern can never match.");
370     }
371 
372     optimiseVirtualStarts(g); /* good for som */
373 
374     propagateExtendedParams(g, expr, rm);
375     reduceExtendedParams(g, rm, som);
376 
377     // We may have removed all the edges to accept, in which case this
378     // expression cannot match.
379     if (can_never_match(g)) {
380         throw CompileError(expr.index, "Extended parameter constraints can not "
381                                        "be satisfied for any match from this "
382                                        "expression.");
383     }
384 
385     if (any_of_in(all_reports(g), [&](ReportID id) {
386             return rm.getReport(id).minLength;
387         })) {
388         // We have at least one report with a minimum length constraint, which
389         // we currently use SOM to satisfy.
390         som = SOM_LEFT;
391         ssm.somPrecision(8);
392     }
393 
394     if (som) {
395         rose->setSom();
396     }
397 
398     // first, we can perform graph work that can be done on an individual
399     // expression basis.
400 
401     if (expr.utf8) {
402         relaxForbiddenUtf8(g, expr);
403     }
404 
405     if (all_of_in(all_reports(g), [&](ReportID id) {
406             const auto &report = rm.getReport(id);
407             return report.ekey != INVALID_EKEY && !report.minLength &&
408                    !report.minOffset;
409         })) {
410         // In highlander mode: if we don't have constraints on our reports that
411         // may prevent us accepting our first match (i.e. extended params) we
412         // can prune the other out-edges of all vertices connected to accept.
413         // TODO: shift the report checking down into pruneHighlanderAccepts()
414         // to allow us to handle the parts we can in mixed cases.
415         pruneHighlanderAccepts(g, rm);
416     }
417 
418     dumpDotWrapper(g, expr, "02b_fairly_early", cc.grey);
419 
420     // If we're a vacuous pattern, we can handle this early.
421     if (splitOffVacuous(boundary, rm, g, expr)) {
422         DEBUG_PRINTF("split off vacuous\n");
423     }
424 
425     // We might be done at this point: if we've run out of vertices, we can
426     // stop processing.
427     if (num_vertices(g) == N_SPECIALS) {
428         DEBUG_PRINTF("all vertices claimed by vacuous handling\n");
429         return true;
430     }
431 
432     // Now that vacuous edges have been removed, update the min width exclusive
433     // of boundary reports.
434     minWidth = min(minWidth, findMinWidth(g));
435 
436     // Add the pattern to the small write builder.
437     smwr->add(g, expr);
438 
439     if (!som) {
440         removeSiblingsOfStartDotStar(g);
441     }
442 
443     dumpDotWrapper(g, expr, "03_early", cc.grey);
444 
445     // Perform a reduction pass to merge sibling character classes together.
446     if (cc.grey.performGraphSimplification) {
447         removeRedundancy(g, som);
448         prunePathsRedundantWithSuccessorOfCyclics(g, som);
449     }
450 
451     dumpDotWrapper(g, expr, "04_reduced", cc.grey);
452 
453     // If we've got some literals that span the graph from start to accept, we
454     // can split them off into Rose from here.
455     if (!som) {
456         if (splitOffLiterals(*this, g)) {
457             DEBUG_PRINTF("some vertices claimed by literals\n");
458         }
459     }
460 
461     // We might be done at this point: if we've run out of vertices, we can
462     // stop processing.
463     if (num_vertices(g) == N_SPECIALS) {
464         DEBUG_PRINTF("all vertices claimed before calc components\n");
465         return true;
466     }
467 
468     // Split the graph into a set of connected components and process those.
469     // Note: this invalidates g_ptr.
470 
471     auto g_comp = calcComponents(std::move(g_ptr), cc.grey);
472     assert(!g_comp.empty());
473 
474     if (!som) {
475         for (auto &gc : g_comp) {
476             assert(gc);
477             reformLeadingDots(*gc);
478         }
479 
480         recalcComponents(g_comp, cc.grey);
481     }
482 
483     if (processComponents(*this, expr, g_comp, som)) {
484         return true;
485     }
486 
487     // If we're in prefiltering mode, we can run the prefilter reductions and
488     // have another shot at accepting the graph.
489 
490     if (cc.grey.prefilterReductions && expr.prefilter) {
491         for (auto &gc : g_comp) {
492             if (!gc) {
493                 continue;
494             }
495             prefilterReductions(*gc, cc);
496         }
497 
498         if (processComponents(*this, expr, g_comp, som)) {
499             return true;
500         }
501     }
502 
503     // We must have components that could not be compiled.
504     for (u32 i = 0; i < g_comp.size(); i++) {
505         if (g_comp[i]) {
506             DEBUG_PRINTF("could not compile component %u with %zu vertices\n",
507                          i, num_vertices(*g_comp[i]));
508             throw CompileError(expr.index, "Pattern is too large.");
509         }
510     }
511 
512     assert(0); // should have thrown.
513     return false;
514 }
515 
516 /** \brief Used from SOM mode to add an arbitrary NGHolder as an engine. */
addHolder(NGHolder & g)517 bool NG::addHolder(NGHolder &g) {
518     DEBUG_PRINTF("adding holder of %zu states\n", num_vertices(g));
519     assert(allMatchStatesHaveReports(g));
520     assert(hasCorrectlyNumberedVertices(g));
521 
522     /* We don't update the global minWidth here as we care about the min width
523      * of the whole pattern - not a just a prefix of it. */
524 
525     bool prefilter = false;
526     //dumpDotComp(comp, g, *this, 20, "prefix_init");
527 
528     som_type som = SOM_NONE; /* the prefixes created by the SOM code do not
529                                 themselves track som */
530     bool utf8 = false; // handling done earlier
531     reduceGraph(g, som, utf8, cc);
532 
533     // There may be redundant regions that we can remove
534     if (cc.grey.performGraphSimplification) {
535         removeRegionRedundancy(g, som);
536     }
537 
538     // "Short Exhaustible Passthrough" patterns always become outfixes.
539     if (isSEP(g, rm, cc.grey)) {
540         DEBUG_PRINTF("graph is SEP\n");
541         if (rose->addOutfix(g)) {
542             return true;
543         }
544     }
545 
546     if (splitOffAnchoredAcyclic(*rose, g, cc)) {
547         return true;
548     }
549 
550     if (handleSmallLiteralSets(*rose, g, cc)
551         || handleFixedWidth(*rose, g, cc.grey)) {
552         return true;
553     }
554 
555     if (handleDecoratedLiterals(*rose, g, cc)) {
556         return true;
557     }
558 
559     if (doViolet(*rose, g, prefilter, false, rm, cc)) {
560         return true;
561     }
562     if (splitOffPuffs(*rose, rm, g, prefilter, cc)) {
563         return true;
564     }
565     if (doViolet(*rose, g, prefilter, true, rm, cc)) {
566         return true;
567     }
568 
569     DEBUG_PRINTF("trying for outfix\n");
570     if (rose->addOutfix(g)) {
571         DEBUG_PRINTF("ok\n");
572         return true;
573     }
574     DEBUG_PRINTF("trying for outfix - failed\n");
575     DEBUG_PRINTF("nobody would take us\n");
576     return false;
577 }
578 
addLiteral(const ue2_literal & literal,u32 expr_index,u32 external_report,bool highlander,som_type som,bool quiet)579 bool NG::addLiteral(const ue2_literal &literal, u32 expr_index,
580                     u32 external_report, bool highlander, som_type som,
581                     bool quiet) {
582     assert(!literal.empty());
583 
584     if (!cc.grey.shortcutLiterals) {
585         return false;
586     }
587 
588     // We can't natively handle arbitrary literals with mixed case sensitivity
589     // in Rose -- they require mechanisms like benefits masks, which have
590     // length limits etc. Better to let those go through full graph processing.
591     if (mixed_sensitivity(literal)) {
592         DEBUG_PRINTF("mixed sensitivity\n");
593         return false;
594     }
595 
596     // Register external report and validate highlander constraints.
597     rm.registerExtReport(external_report,
598                          external_report_info(highlander, expr_index));
599 
600     ReportID id;
601     if (som) {
602         assert(!highlander); // not allowed, checked earlier.
603         Report r = makeSomRelativeCallback(external_report, 0, literal.length());
604         id = rm.getInternalId(r);
605         rose->setSom();
606     } else {
607         u32 ekey = highlander ? rm.getExhaustibleKey(external_report)
608                               : INVALID_EKEY;
609         Report r = makeECallback(external_report, 0, ekey, quiet);
610         id = rm.getInternalId(r);
611     }
612 
613     DEBUG_PRINTF("success: graph is literal '%s', report ID %u\n",
614                  dumpString(literal).c_str(), id);
615 
616     rose->add(false, false, literal, {id});
617 
618     minWidth = min(minWidth, depth(literal.length()));
619 
620     /* inform small write handler about this literal */
621     smwr->add(literal, id);
622 
623     return true;
624 }
625 
626 } // namespace ue2
627