1 /*
2  * Copyright (c) 2015-2016, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Prefiltering component tree transformation.
31  */
32 #include "ComponentAssertion.h"
33 #include "ComponentAtomicGroup.h"
34 #include "ComponentBackReference.h"
35 #include "ComponentBoundary.h"
36 #include "ComponentClass.h"
37 #include "ComponentCondReference.h"
38 #include "ComponentRepeat.h"
39 #include "ComponentSequence.h"
40 #include "ComponentVisitor.h"
41 #include "ComponentWordBoundary.h"
42 #include "ConstComponentVisitor.h"
43 #include "Parser.h"
44 #include "prefilter.h"
45 
46 #include <algorithm>
47 #include <stack>
48 
49 using namespace std;
50 
51 namespace ue2 {
52 
53 /** \brief Max number of positions a referent can have to be considered safe to
54  * replace a reference in prefiltering mode. */
55 static const size_t MAX_REFERENT_POSITIONS = 1;
56 
57 /** \brief Constructs a \ref ComponentClass that matches a dot (any
58  * byte/codepoint, depending on whether UTF-8). */
59 static
makeDotClass(const ParseMode & mode_in)60 unique_ptr<ComponentClass> makeDotClass(const ParseMode &mode_in) {
61     ParseMode mode(mode_in);
62     mode.dotall = true;
63     return generateComponent(CLASS_ANY, false, mode);
64 }
65 
66 namespace {
67 
68 /**
69  * \brief Visitor used to determine if a given referent component is safe to
70  * replace its reference in prefiltering mode. Throws
71  * SafeReferentVisitor::Unsafe to terminate early on unsafe cases. */
72 class SafeReferentVisitor : public DefaultConstComponentVisitor {
73 public:
74     struct Unsafe {};
75 
SafeReferentVisitor()76     SafeReferentVisitor() : numPositions(0) {}
77 
is_safe() const78     bool is_safe() const {
79         DEBUG_PRINTF("numPositions = %zu\n", numPositions);
80         return numPositions <= MAX_REFERENT_POSITIONS;
81     }
82 
83     using DefaultConstComponentVisitor::pre;
84     using DefaultConstComponentVisitor::post;
85 
pre(const AsciiComponentClass &)86     void pre(const AsciiComponentClass &) override {
87         numPositions++;
88     }
89 
pre(const UTF8ComponentClass &)90     void pre(const UTF8ComponentClass &) override {
91         // FIXME: we should be able to tell precisely how many positions this
92         // class will use. Right now, use the worst case.
93         numPositions += 4;
94     }
95 
pre(const ComponentBoundary &)96     void pre(const ComponentBoundary &) override {
97         numPositions++;
98     }
99 
pre(const ComponentByte &)100     void pre(const ComponentByte &) override {
101         numPositions++;
102     }
103 
pre(const ComponentEUS &)104     void pre(const ComponentEUS &) override {
105         numPositions++;
106     }
107 
pre(const ComponentRepeat &)108     void pre(const ComponentRepeat &) override {
109         // Record the number of positions used before we visit the contents of
110         // the repeat.
111         countStack.push(numPositions);
112     }
113 
post(const ComponentRepeat & c)114     void post(const ComponentRepeat &c) override {
115         assert(!countStack.empty());
116         size_t before = countStack.top();
117         countStack.pop();
118         assert(before <= numPositions);
119 
120         std::pair<u32, u32> bounds = c.getBounds();
121         size_t subPositions = numPositions - before;
122         size_t copies = bounds.second < ComponentRepeat::NoLimit
123                             ? bounds.second
124                             : max(bounds.first, 1U);
125         numPositions = before + (subPositions * copies);
126     }
127 
pre(const ComponentWordBoundary &)128     void pre(const ComponentWordBoundary &) override {
129         // not quite accurate, as these are expanded out in assert
130         // resolution...
131         numPositions++;
132     }
133 
pre(const ComponentBackReference &)134     void pre(const ComponentBackReference &) override {
135         throw Unsafe();
136     }
137 
pre(const ComponentCondReference &)138     void pre(const ComponentCondReference &) override {
139         throw Unsafe();
140     }
141 
142 private:
143     size_t numPositions;
144 
145     // For temporary use
146     std::stack<size_t> countStack;
147 };
148 
149 static
isSafeReferent(const Component & c)150 bool isSafeReferent(const Component &c) {
151     try {
152         SafeReferentVisitor vis;
153         c.accept(vis);
154         return vis.is_safe();
155     }
156     catch (const SafeReferentVisitor::Unsafe &) {
157         return false;
158     }
159 }
160 
161 /**
162  * \brief Visitor to find the \ref ComponentSequence with a given reference ID
163  * or name: if found, the visitor will throw a const ptr to it.
164  */
165 class FindSequenceVisitor : public DefaultConstComponentVisitor {
166 public:
FindSequenceVisitor(unsigned ref_id)167     explicit FindSequenceVisitor(unsigned ref_id) : id(ref_id) {}
FindSequenceVisitor(const std::string & s)168     explicit FindSequenceVisitor(const std::string &s) : name(s) {}
169 
170     using DefaultConstComponentVisitor::pre;
171 
pre(const ComponentSequence & c)172     void pre(const ComponentSequence &c) override {
173         if (!name.empty()) {
174             if (c.getCaptureName() == name) {
175                 throw &c;
176             }
177         } else if (c.getCaptureIndex() == id) {
178             throw &c;
179         }
180     }
181 private:
182     const std::string name;
183     const unsigned id = 0;
184 };
185 
186 static
findCapturingGroup(const Component * root,FindSequenceVisitor & vis)187 const ComponentSequence *findCapturingGroup(const Component *root,
188                                             FindSequenceVisitor &vis) {
189     try {
190         root->accept(vis);
191         DEBUG_PRINTF("group not found\n");
192         return nullptr;
193     } catch (const ComponentSequence *seq) {
194         return seq;
195     }
196 }
197 
198 } // namespace
199 
200 /**
201  * \brief Visitor to apply prefilter reductions, swapping components for which
202  * we don't have real implementations with implementable ones. Any such
203  * replacement should produce a superset of the matches that would be produced
204  * by the original.
205  */
206 class PrefilterVisitor : public DefaultComponentVisitor {
207 public:
PrefilterVisitor(Component * c,const ParseMode & m)208     PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {}
209     ~PrefilterVisitor() override;
210 
211     using DefaultComponentVisitor::visit;
212 
213     /** \brief Calls the visitor (recursively) on a new replacement component
214      * we've just created. Takes care of freeing it if the sequence is itself
215      * replaced. */
216     template<class T>
visit_replacement(T * r)217     Component *visit_replacement(T *r) {
218         Component *c = r->accept(*this);
219         if (c != r) {
220             delete r;
221         }
222         return c;
223     }
224 
visit(ComponentBackReference * c)225     Component *visit(ComponentBackReference *c) override {
226         assert(c);
227 
228         // If the referent is simple (represents a single position), then we
229         // replace the back-reference with a copy of it.
230         const ComponentSequence *ref = nullptr;
231         const std::string &ref_name = c->getRefName();
232         const unsigned ref_id = c->getRefID();
233         if (!ref_name.empty()) {
234             FindSequenceVisitor vis(ref_name);
235             ref = findCapturingGroup(root, vis);
236         } else if (ref_id > 0) {
237             FindSequenceVisitor vis(ref_id);
238             ref = findCapturingGroup(root, vis);
239         }
240 
241         if (ref && isSafeReferent(*ref)) {
242             DEBUG_PRINTF("found safe ref %p\n", ref);
243             ComponentSequence *seq = ref->clone();
244             // Remove labels from cloned sequence.
245             seq->setCaptureName("");
246             seq->setCaptureIndex(ComponentSequence::NOT_CAPTURED);
247 
248             return visit_replacement(seq);
249         }
250 
251         // Replace with ".*".
252         auto rep = makeComponentRepeat(makeDotClass(mode), 0,
253                                        ComponentRepeat::NoLimit,
254                                        ComponentRepeat::REPEAT_GREEDY);
255         return rep.release(); // FIXME: owning raw ptr
256     }
257 
visit(UNUSED ComponentAssertion * c)258     Component *visit(UNUSED ComponentAssertion *c) override {
259         assert(c);
260         // Replace with an empty sequence.
261         return new ComponentSequence();
262     }
263 
visit(ComponentRepeat * c)264     Component *visit(ComponentRepeat *c) override {
265         assert(c);
266         // Possessive repeats become greedy.
267         if (c->type == ComponentRepeat::REPEAT_POSSESSIVE) {
268             c->type = ComponentRepeat::REPEAT_GREEDY;
269         }
270         return c;
271     }
272 
visit(ComponentAtomicGroup * c)273     Component *visit(ComponentAtomicGroup *c) override {
274         assert(c);
275         // Replace with a plain sequence containing the atomic group's
276         // children.
277         ComponentSequence *seq = new ComponentSequence();
278         const auto &children = c->getChildren();
279         for (const auto &child : children) {
280             assert(child);
281             seq->addComponent(unique_ptr<Component>(child->clone()));
282         }
283 
284         return visit_replacement(seq);
285     }
286 
visit(UNUSED ComponentEUS * c)287     Component *visit(UNUSED ComponentEUS *c) override {
288         assert(c);
289         // Replace with ".+".
290         auto rep = makeComponentRepeat(makeDotClass(mode), 1,
291                                        ComponentRepeat::NoLimit,
292                                        ComponentRepeat::REPEAT_GREEDY);
293         return rep.release(); // FIXME: owning raw ptr
294     }
295 
visit(ComponentWordBoundary * c)296     Component *visit(ComponentWordBoundary *c) override {
297         assert(c);
298 
299         // TODO: Right now, we do not have correct code for resolving these
300         // when prefiltering is on, UCP is on, and UTF-8 is *off*. For now, we
301         // just replace with an empty sequence (as that will return a superset
302         // of matches).
303         if (mode.ucp && !mode.utf8) {
304             return new ComponentSequence();
305         }
306 
307         // All other cases can be prefiltered.
308         c->setPrefilter(true);
309         return c;
310     }
311 
visit(ComponentCondReference * c)312     Component *visit(ComponentCondReference *c) override {
313         assert(c);
314         // Replace with a plain sequence containing the conditional reference's
315         // children.
316         ComponentSequence *seq = new ComponentSequence();
317         const auto &children = c->getChildren();
318 
319         // Empty children is accepted by PCRE as a "do nothing" case.
320         if (children.empty()) {
321             return seq;
322         }
323 
324         for (const auto &child : children) {
325             assert(child);
326             seq->addComponent(unique_ptr<Component>(child->clone()));
327         }
328 
329         // If the conditional reference had just a YES branch, we want this to
330         // be an alternation with an empty sequence (the NO branch).
331         if (!c->hasBothBranches) {
332             seq->addAlternation();
333             seq->finalize();
334         }
335 
336         return visit_replacement(seq);
337     }
338 
339 private:
340     Component *root;
341     const ParseMode &mode;
342 };
343 
~PrefilterVisitor()344 PrefilterVisitor::~PrefilterVisitor() {}
345 
prefilterTree(unique_ptr<Component> & root,const ParseMode & mode)346 void prefilterTree(unique_ptr<Component> &root, const ParseMode &mode) {
347     assert(root);
348     PrefilterVisitor vis(root.get(), mode);
349 
350     Component *c = root->accept(vis);
351     if (c != root.get()) {
352         root.reset(c);
353     }
354 }
355 
356 } // namespace ue2
357