1 /*
2  * Copyright (c) 2015-2019, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Shortcut literal pass: directly add literal components to Rose.
31  */
32 #include "AsciiComponentClass.h"
33 #include "Utf8ComponentClass.h"
34 #include "ComponentAssertion.h"
35 #include "ComponentAtomicGroup.h"
36 #include "ComponentBackReference.h"
37 #include "ComponentBoundary.h"
38 #include "ComponentClass.h"
39 #include "ComponentCondReference.h"
40 #include "ComponentRepeat.h"
41 #include "ComponentSequence.h"
42 #include "ComponentVisitor.h"
43 #include "ComponentWordBoundary.h"
44 #include "ConstComponentVisitor.h"
45 #include "parse_error.h"
46 #include "shortcut_literal.h"
47 #include "grey.h"
48 #include "nfagraph/ng.h"
49 #include "compiler/compiler.h"
50 #include "util/ue2string.h"
51 #include "ue2common.h"
52 
53 #include <stack>
54 
55 using namespace std;
56 
57 namespace ue2 {
58 
59 /**
60  * \brief Visitor that constructs a ue2_literal from a component tree.
61  *
62  * If a component that can't be part of a literal is encountered, this visitor
63  * will throw ConstructLiteralVisitor::NotLiteral.
64  */
65 class ConstructLiteralVisitor : public ConstComponentVisitor {
66 public:
67     ~ConstructLiteralVisitor() override;
68 
69     /** \brief Thrown if this component does not represent a literal. */
70     struct NotLiteral {};
71 
pre(const AsciiComponentClass & c)72     void pre(const AsciiComponentClass &c) override {
73         const CharReach &cr = c.cr;
74         const size_t width = cr.count();
75         if (width == 1) {
76             lit.push_back(cr.find_first(), false);
77         } else if (width == 2 && cr.isCaselessChar()) {
78             lit.push_back(cr.find_first(), true);
79         } else {
80             throw NotLiteral();
81         }
82     }
83 
pre(const ComponentRepeat & c)84     void pre(const ComponentRepeat &c) override {
85         if (c.m_min == 0 || c.m_min != c.m_max) {
86             throw NotLiteral();
87         }
88 
89         if (c.m_max < ComponentRepeat::NoLimit && c.m_max > 32767) {
90             throw ParseError("Bounded repeat is too large.");
91         }
92 
93         // Store the current length of the literal; in this repeat's post()
94         // call we will append N-1 more copies of [index..end].
95         repeat_stack.push(lit.length());
96     }
97 
post(const ComponentRepeat & c)98     void post(const ComponentRepeat &c) override {
99         // Add N-1 copies of the string between the entry to the repeat and the
100         // current end of the literal.
101         assert(!repeat_stack.empty());
102         const ue2_literal suffix = lit.substr(repeat_stack.top());
103         repeat_stack.pop();
104 
105         for (unsigned i = 1; i < c.m_min; i++) {
106             lit += suffix;
107         }
108     }
109 
pre(const ComponentSequence &)110     void pre(const ComponentSequence &) override {
111         // Pass through.
112     }
113 
pre(const ComponentAlternation &)114     void pre(const ComponentAlternation &) override { throw NotLiteral(); }
pre(const ComponentAssertion &)115     void pre(const ComponentAssertion &) override { throw NotLiteral(); }
pre(const ComponentAtomicGroup &)116     void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); }
pre(const ComponentBackReference &)117     void pre(const ComponentBackReference &) override { throw NotLiteral(); }
pre(const ComponentBoundary &)118     void pre(const ComponentBoundary &) override { throw NotLiteral(); }
pre(const ComponentByte &)119     void pre(const ComponentByte &) override { throw NotLiteral(); }
pre(const ComponentCondReference &)120     void pre(const ComponentCondReference &) override { throw NotLiteral(); }
pre(const ComponentEmpty &)121     void pre(const ComponentEmpty &) override { throw NotLiteral(); }
pre(const ComponentEUS &)122     void pre(const ComponentEUS &) override { throw NotLiteral(); }
pre(const ComponentWordBoundary &)123     void pre(const ComponentWordBoundary &) override { throw NotLiteral(); }
pre(const UTF8ComponentClass &)124     void pre(const UTF8ComponentClass &) override { throw NotLiteral(); }
125 
during(const AsciiComponentClass &)126     void during(const AsciiComponentClass &) override {}
during(const ComponentAlternation &)127     void during(const ComponentAlternation &) override {}
during(const ComponentAssertion &)128     void during(const ComponentAssertion &) override {}
during(const ComponentAtomicGroup &)129     void during(const ComponentAtomicGroup &) override {}
during(const ComponentBackReference &)130     void during(const ComponentBackReference &) override {}
during(const ComponentBoundary &)131     void during(const ComponentBoundary &) override {}
during(const ComponentByte &)132     void during(const ComponentByte &) override {}
during(const ComponentCondReference &)133     void during(const ComponentCondReference &) override {}
during(const ComponentEmpty &)134     void during(const ComponentEmpty &) override {}
during(const ComponentEUS &)135     void during(const ComponentEUS &) override {}
during(const ComponentRepeat &)136     void during(const ComponentRepeat &) override {}
during(const ComponentSequence &)137     void during(const ComponentSequence &) override {}
during(const ComponentWordBoundary &)138     void during(const ComponentWordBoundary &) override {}
during(const UTF8ComponentClass &)139     void during(const UTF8ComponentClass &) override {}
140 
post(const AsciiComponentClass &)141     void post(const AsciiComponentClass &) override {}
post(const ComponentAlternation &)142     void post(const ComponentAlternation &) override {}
post(const ComponentAssertion &)143     void post(const ComponentAssertion &) override {}
post(const ComponentAtomicGroup &)144     void post(const ComponentAtomicGroup &) override {}
post(const ComponentBackReference &)145     void post(const ComponentBackReference &) override {}
post(const ComponentBoundary &)146     void post(const ComponentBoundary &) override {}
post(const ComponentByte &)147     void post(const ComponentByte &) override {}
post(const ComponentCondReference &)148     void post(const ComponentCondReference &) override {}
post(const ComponentEmpty &)149     void post(const ComponentEmpty &) override {}
post(const ComponentEUS &)150     void post(const ComponentEUS &) override {}
post(const ComponentSequence &)151     void post(const ComponentSequence &) override {}
post(const ComponentWordBoundary &)152     void post(const ComponentWordBoundary &) override {}
post(const UTF8ComponentClass &)153     void post(const UTF8ComponentClass &) override {}
154 
155     ue2_literal lit;
156     stack<size_t> repeat_stack; //!< index of entry to repeat.
157 };
158 
~ConstructLiteralVisitor()159 ConstructLiteralVisitor::~ConstructLiteralVisitor() {}
160 
161 /** \brief True if the literal expression \a expr could be added to Rose. */
shortcutLiteral(NG & ng,const ParsedExpression & pe)162 bool shortcutLiteral(NG &ng, const ParsedExpression &pe) {
163     assert(pe.component);
164 
165     if (!ng.cc.grey.allowLiteral) {
166         return false;
167     }
168 
169     const auto &expr = pe.expr;
170 
171     // XXX: don't shortcut literals with extended params (yet)
172     if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length ||
173         expr.edit_distance || expr.hamm_distance) {
174         DEBUG_PRINTF("extended params not allowed\n");
175         return false;
176     }
177 
178     ConstructLiteralVisitor vis;
179     try {
180         assert(pe.component);
181         pe.component->accept(vis);
182         assert(vis.repeat_stack.empty());
183     } catch (const ConstructLiteralVisitor::NotLiteral&) {
184         DEBUG_PRINTF("not a literal\n");
185         return false;
186     }
187 
188     const ue2_literal &lit = vis.lit;
189 
190     if (lit.empty()) {
191         DEBUG_PRINTF("empty literal\n");
192         return false;
193     }
194 
195     if (expr.highlander && lit.length() <= 1) {
196         DEBUG_PRINTF("not shortcutting SEP literal\n");
197         return false;
198     }
199 
200     DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str());
201     return ng.addLiteral(lit, expr.index, expr.report, expr.highlander,
202                          expr.som, expr.quiet);
203 }
204 
205 } // namespace ue2
206