1 /*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /** \file
30 * \brief Shortcut literal pass: directly add literal components to Rose.
31 */
32 #include "AsciiComponentClass.h"
33 #include "Utf8ComponentClass.h"
34 #include "ComponentAssertion.h"
35 #include "ComponentAtomicGroup.h"
36 #include "ComponentBackReference.h"
37 #include "ComponentBoundary.h"
38 #include "ComponentClass.h"
39 #include "ComponentCondReference.h"
40 #include "ComponentRepeat.h"
41 #include "ComponentSequence.h"
42 #include "ComponentVisitor.h"
43 #include "ComponentWordBoundary.h"
44 #include "ConstComponentVisitor.h"
45 #include "parse_error.h"
46 #include "shortcut_literal.h"
47 #include "grey.h"
48 #include "nfagraph/ng.h"
49 #include "compiler/compiler.h"
50 #include "util/ue2string.h"
51 #include "ue2common.h"
52
53 #include <stack>
54
55 using namespace std;
56
57 namespace ue2 {
58
59 /**
60 * \brief Visitor that constructs a ue2_literal from a component tree.
61 *
62 * If a component that can't be part of a literal is encountered, this visitor
63 * will throw ConstructLiteralVisitor::NotLiteral.
64 */
65 class ConstructLiteralVisitor : public ConstComponentVisitor {
66 public:
67 ~ConstructLiteralVisitor() override;
68
69 /** \brief Thrown if this component does not represent a literal. */
70 struct NotLiteral {};
71
pre(const AsciiComponentClass & c)72 void pre(const AsciiComponentClass &c) override {
73 const CharReach &cr = c.cr;
74 const size_t width = cr.count();
75 if (width == 1) {
76 lit.push_back(cr.find_first(), false);
77 } else if (width == 2 && cr.isCaselessChar()) {
78 lit.push_back(cr.find_first(), true);
79 } else {
80 throw NotLiteral();
81 }
82 }
83
pre(const ComponentRepeat & c)84 void pre(const ComponentRepeat &c) override {
85 if (c.m_min == 0 || c.m_min != c.m_max) {
86 throw NotLiteral();
87 }
88
89 if (c.m_max < ComponentRepeat::NoLimit && c.m_max > 32767) {
90 throw ParseError("Bounded repeat is too large.");
91 }
92
93 // Store the current length of the literal; in this repeat's post()
94 // call we will append N-1 more copies of [index..end].
95 repeat_stack.push(lit.length());
96 }
97
post(const ComponentRepeat & c)98 void post(const ComponentRepeat &c) override {
99 // Add N-1 copies of the string between the entry to the repeat and the
100 // current end of the literal.
101 assert(!repeat_stack.empty());
102 const ue2_literal suffix = lit.substr(repeat_stack.top());
103 repeat_stack.pop();
104
105 for (unsigned i = 1; i < c.m_min; i++) {
106 lit += suffix;
107 }
108 }
109
pre(const ComponentSequence &)110 void pre(const ComponentSequence &) override {
111 // Pass through.
112 }
113
pre(const ComponentAlternation &)114 void pre(const ComponentAlternation &) override { throw NotLiteral(); }
pre(const ComponentAssertion &)115 void pre(const ComponentAssertion &) override { throw NotLiteral(); }
pre(const ComponentAtomicGroup &)116 void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); }
pre(const ComponentBackReference &)117 void pre(const ComponentBackReference &) override { throw NotLiteral(); }
pre(const ComponentBoundary &)118 void pre(const ComponentBoundary &) override { throw NotLiteral(); }
pre(const ComponentByte &)119 void pre(const ComponentByte &) override { throw NotLiteral(); }
pre(const ComponentCondReference &)120 void pre(const ComponentCondReference &) override { throw NotLiteral(); }
pre(const ComponentEmpty &)121 void pre(const ComponentEmpty &) override { throw NotLiteral(); }
pre(const ComponentEUS &)122 void pre(const ComponentEUS &) override { throw NotLiteral(); }
pre(const ComponentWordBoundary &)123 void pre(const ComponentWordBoundary &) override { throw NotLiteral(); }
pre(const UTF8ComponentClass &)124 void pre(const UTF8ComponentClass &) override { throw NotLiteral(); }
125
during(const AsciiComponentClass &)126 void during(const AsciiComponentClass &) override {}
during(const ComponentAlternation &)127 void during(const ComponentAlternation &) override {}
during(const ComponentAssertion &)128 void during(const ComponentAssertion &) override {}
during(const ComponentAtomicGroup &)129 void during(const ComponentAtomicGroup &) override {}
during(const ComponentBackReference &)130 void during(const ComponentBackReference &) override {}
during(const ComponentBoundary &)131 void during(const ComponentBoundary &) override {}
during(const ComponentByte &)132 void during(const ComponentByte &) override {}
during(const ComponentCondReference &)133 void during(const ComponentCondReference &) override {}
during(const ComponentEmpty &)134 void during(const ComponentEmpty &) override {}
during(const ComponentEUS &)135 void during(const ComponentEUS &) override {}
during(const ComponentRepeat &)136 void during(const ComponentRepeat &) override {}
during(const ComponentSequence &)137 void during(const ComponentSequence &) override {}
during(const ComponentWordBoundary &)138 void during(const ComponentWordBoundary &) override {}
during(const UTF8ComponentClass &)139 void during(const UTF8ComponentClass &) override {}
140
post(const AsciiComponentClass &)141 void post(const AsciiComponentClass &) override {}
post(const ComponentAlternation &)142 void post(const ComponentAlternation &) override {}
post(const ComponentAssertion &)143 void post(const ComponentAssertion &) override {}
post(const ComponentAtomicGroup &)144 void post(const ComponentAtomicGroup &) override {}
post(const ComponentBackReference &)145 void post(const ComponentBackReference &) override {}
post(const ComponentBoundary &)146 void post(const ComponentBoundary &) override {}
post(const ComponentByte &)147 void post(const ComponentByte &) override {}
post(const ComponentCondReference &)148 void post(const ComponentCondReference &) override {}
post(const ComponentEmpty &)149 void post(const ComponentEmpty &) override {}
post(const ComponentEUS &)150 void post(const ComponentEUS &) override {}
post(const ComponentSequence &)151 void post(const ComponentSequence &) override {}
post(const ComponentWordBoundary &)152 void post(const ComponentWordBoundary &) override {}
post(const UTF8ComponentClass &)153 void post(const UTF8ComponentClass &) override {}
154
155 ue2_literal lit;
156 stack<size_t> repeat_stack; //!< index of entry to repeat.
157 };
158
~ConstructLiteralVisitor()159 ConstructLiteralVisitor::~ConstructLiteralVisitor() {}
160
161 /** \brief True if the literal expression \a expr could be added to Rose. */
shortcutLiteral(NG & ng,const ParsedExpression & pe)162 bool shortcutLiteral(NG &ng, const ParsedExpression &pe) {
163 assert(pe.component);
164
165 if (!ng.cc.grey.allowLiteral) {
166 return false;
167 }
168
169 const auto &expr = pe.expr;
170
171 // XXX: don't shortcut literals with extended params (yet)
172 if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length ||
173 expr.edit_distance || expr.hamm_distance) {
174 DEBUG_PRINTF("extended params not allowed\n");
175 return false;
176 }
177
178 ConstructLiteralVisitor vis;
179 try {
180 assert(pe.component);
181 pe.component->accept(vis);
182 assert(vis.repeat_stack.empty());
183 } catch (const ConstructLiteralVisitor::NotLiteral&) {
184 DEBUG_PRINTF("not a literal\n");
185 return false;
186 }
187
188 const ue2_literal &lit = vis.lit;
189
190 if (lit.empty()) {
191 DEBUG_PRINTF("empty literal\n");
192 return false;
193 }
194
195 if (expr.highlander && lit.length() <= 1) {
196 DEBUG_PRINTF("not shortcutting SEP literal\n");
197 return false;
198 }
199
200 DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str());
201 return ng.addLiteral(lit, expr.index, expr.report, expr.highlander,
202 expr.som, expr.quiet);
203 }
204
205 } // namespace ue2
206