1 /*
2  * Copyright (c) 2015, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Repeats ('*', '+', '?', '{M,N}', etc)
31  */
32 
33 
34 #include "ComponentRepeat.h"
35 
36 #include "buildstate.h"
37 #include "nfagraph/ng_builder.h"
38 #include "parse_error.h"
39 #include "Parser.h"
40 #include "position.h"
41 #include "position_dump.h"
42 #include "position_info.h"
43 #include "ue2common.h"
44 #include "util/make_unique.h"
45 
46 #include <algorithm>
47 #include <cassert>
48 
49 using namespace std;
50 
51 namespace ue2 {
52 
53 /** \brief Hard limit on the maximum repeat for bounded repeats. */
54 static constexpr u32 MAX_REPEAT = 32767;
55 
56 /** \brief If expanding a repeat would lead to this many positions being
57  * generated, we fail the pattern. */
58 static constexpr u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge
59 
60 /* no edge priorities means that if our subcomponent can be empty, our min
61  * extent is effectively zero. */
ComponentRepeat(unique_ptr<Component> sub_comp_in,u32 min,u32 max,enum RepeatType t)62 ComponentRepeat::ComponentRepeat(unique_ptr<Component> sub_comp_in, u32 min,
63                                  u32 max, enum RepeatType t)
64     : type(t), sub_comp(move(sub_comp_in)), m_min(min), m_max(max),
65       posFirst(GlushkovBuildState::POS_UNINITIALIZED),
66       posLast(GlushkovBuildState::POS_UNINITIALIZED) {
67     assert(sub_comp);
68     assert(max > 0);
69     assert(m_min <= m_max);
70 
71     if (m_min > MAX_REPEAT) {
72         throw ParseError("Bounded repeat is too large.");
73     }
74     if (m_max != NoLimit && m_max > MAX_REPEAT) {
75         throw ParseError("Bounded repeat is too large.");
76     }
77 }
78 
~ComponentRepeat()79 ComponentRepeat::~ComponentRepeat() {}
80 
clone() const81 ComponentRepeat *ComponentRepeat::clone() const {
82     return new ComponentRepeat(*this);
83 }
84 
ComponentRepeat(const ComponentRepeat & other)85 ComponentRepeat::ComponentRepeat(const ComponentRepeat &other)
86     : Component(other),
87       type(other.type), sub_comp(unique_ptr<Component>(other.sub_comp->clone())),
88       m_min(other.m_min), m_max(other.m_max),
89       m_firsts(other.m_firsts), m_lasts(other.m_lasts),
90       posFirst(other.posFirst), posLast(other.posLast) {}
91 
empty() const92 bool ComponentRepeat::empty() const {
93     return m_min == 0 || sub_comp->empty();
94 }
95 
repeatable() const96 bool ComponentRepeat::repeatable() const {
97     return false;
98 }
99 
100 static
addBase(Position base,vector<PositionInfo> & firsts,vector<PositionInfo> & lasts)101 void addBase(Position base, vector<PositionInfo> &firsts,
102              vector<PositionInfo> &lasts) {
103     for (auto &e : firsts) {
104         if (e.pos != GlushkovBuildState::POS_EPSILON) {
105             e.pos += base;
106         }
107     }
108     for (auto &e : lasts) {
109         e.pos += base;
110     }
111 }
112 
113 static
checkPositions(vector<PositionInfo> & v,const GlushkovBuildState & bs)114 void checkPositions(vector<PositionInfo> &v, const GlushkovBuildState &bs) {
115     const NFABuilder& builder = bs.getBuilder();
116     for (const auto &e : v) {
117         if (builder.isSpecialState(e.pos)) {
118             throw ParseError("Embedded anchors not supported.");
119         }
120     }
121 }
122 
notePositions(GlushkovBuildState & bs)123 void ComponentRepeat::notePositions(GlushkovBuildState &bs) {
124     assert(m_max > 0);
125     assert(m_max == NoLimit || m_max < MAX_REPEAT);
126 
127     /* Note: We can construct smaller subgraphs if we're not maintaining edge
128      * priorities. */
129 
130     // We create one copy only through a recursive call to notePositions(),
131     // first() and last(). Then we clone its positions and store the
132     // appropriate firsts and lasts values for the copies.
133     posFirst = bs.getBuilder().numVertices();
134     sub_comp->notePositions(bs);
135 
136     u32 copies = m_max < NoLimit ? m_max : MAX(m_min, 1);
137     DEBUG_PRINTF("building %u copies of repeated region\n", copies);
138     m_firsts.clear();
139     m_lasts.clear();
140     m_firsts.resize(copies);
141     m_lasts.resize(copies);
142 
143     m_firsts[0] = sub_comp->first();
144     m_lasts[0] = sub_comp->last();
145 
146     postSubNotePositionHook();
147 
148     posLast = bs.getBuilder().numVertices() - 1;
149     u32 vcount = posLast + 1 - posFirst;
150 
151     // If we're making more than one copy, then our firsts and lasts must only
152     // contain vertices inside [posFirst, posLast]: anything else means we have
153     // an embedded anchor or otherwise weird situation.
154     if (copies > 1) {
155         checkPositions(m_firsts[0], bs);
156         checkPositions(m_lasts[0], bs);
157     }
158 
159     // Avoid enormous expansions
160     if (vcount * copies > MAX_POSITIONS_EXPANDED) {
161         throw ParseError("Bounded repeat is too large.");
162     }
163 
164     // Add positions for the rest of the copies
165     size_t copyPositions = vcount * (copies - 1);
166     bs.getBuilder().makePositions(copyPositions);
167 
168     // Calculate our firsts and lasts for the copies
169     for (u32 i = 1; i < copies; ++i) {
170         m_firsts[i] = m_firsts[0];
171         m_lasts[i] = m_lasts[0];
172         u32 base = i * vcount;
173         addBase(base, m_firsts[i], m_lasts[i]);
174     }
175 
176     recordPosBounds(posFirst, bs.getBuilder().numVertices());
177 
178     // Each optional repeat has an epsilon at the end of its firsts list.
179     for (u32 i = m_min; i < m_firsts.size(); i++) {
180         m_firsts[i].push_back(GlushkovBuildState::POS_EPSILON);
181     }
182 
183 }
184 
first() const185 vector<PositionInfo> ComponentRepeat::first() const {
186     if (!m_max) {
187         return {};
188     }
189 
190     assert(!m_firsts.empty()); // notePositions should already have run
191     const vector<PositionInfo> &firsts = m_firsts.front();
192     DEBUG_PRINTF("firsts = %s\n",
193                  dumpPositions(begin(firsts), end(firsts)).c_str());
194     return firsts;
195 }
196 
buildFollowSet(GlushkovBuildState & bs,const vector<PositionInfo> & lastPos)197 void ComponentRepeat::buildFollowSet(GlushkovBuildState &bs,
198                                      const vector<PositionInfo> &lastPos) {
199     if (!m_max) {
200         return;
201     }
202     DEBUG_PRINTF("enter\n");
203 
204     // Wire up the first (the "real") entry
205 
206     DEBUG_PRINTF("initial repeat\n");
207     sub_comp->buildFollowSet(bs, lastPos);
208 
209     // Clone the subgraph we just added N times, where N is the minimum extent
210     // of the graph minus one, wiring them up in a linear sequence
211 
212     u32 copies = m_firsts.size();
213     DEBUG_PRINTF("cloning %u copies of repeat\n", copies - 1);
214     for (u32 rep = 1; rep < copies; rep++) {
215         u32 offset = (posLast + 1 - posFirst) * rep;
216         if (offset > 0) {
217             bs.cloneFollowSet(posFirst, posLast, offset);
218         }
219     }
220 
221     wireRepeats(bs);
222 
223     DEBUG_PRINTF("leave\n");
224 }
225 
optimise(bool connected_to_sds)226 void ComponentRepeat::optimise(bool connected_to_sds) {
227     DEBUG_PRINTF("opt %d\n", (int)connected_to_sds);
228     if (!connected_to_sds) {
229         return;
230     }
231 
232     DEBUG_PRINTF("setting m_max to %u\n", m_min);
233     m_max = m_min;
234 }
235 
vacuous_everywhere() const236 bool ComponentRepeat::vacuous_everywhere() const {
237     return !m_min || sub_comp->vacuous_everywhere();
238 }
239 
checkEmbeddedStartAnchor(bool at_start) const240 bool ComponentRepeat::checkEmbeddedStartAnchor(bool at_start) const {
241     at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
242 
243     if (m_max > 1) {
244         at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
245     }
246 
247     return at_start;
248 }
249 
checkEmbeddedEndAnchor(bool at_end) const250 bool ComponentRepeat::checkEmbeddedEndAnchor(bool at_end) const {
251     at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
252 
253     if (m_max > 1) {
254         at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
255     }
256 
257     return at_end;
258 }
259 
accept(ComponentVisitor & v)260 Component *ComponentRepeat::accept(ComponentVisitor &v) {
261     Component *c = v.visit(this);
262     if (c != this) {
263         v.post(this);
264         return c;
265     }
266 
267     c = sub_comp->accept(v);
268     if (c != sub_comp.get()) {
269         sub_comp.reset(c);
270     }
271 
272     v.post(this);
273     return !sub_comp ? nullptr : this;
274 }
275 
accept(ConstComponentVisitor & v) const276 void ComponentRepeat::accept(ConstComponentVisitor &v) const {
277     v.pre(*this);
278     sub_comp->accept(v);
279     v.post(*this);
280 }
281 
last() const282 vector<PositionInfo> ComponentRepeat::last() const {
283     vector<PositionInfo> lasts;
284     if (!m_max) {
285         return lasts;
286     }
287 
288     assert(!m_firsts.empty()); // notePositions should already have run
289     assert(!m_lasts.empty());
290 
291     const auto &l = m_min ? m_lasts[m_min - 1] : m_lasts[0];
292     lasts.insert(lasts.end(), l.begin(), l.end());
293 
294     if (!m_min || m_min != m_lasts.size()) {
295         lasts.insert(lasts.end(), m_lasts.back().begin(), m_lasts.back().end());
296     }
297 
298     DEBUG_PRINTF("lasts = %s\n",
299                  dumpPositions(lasts.begin(), lasts.end()).c_str());
300     return lasts;
301 }
302 
wireRepeats(GlushkovBuildState & bs)303 void ComponentRepeat::wireRepeats(GlushkovBuildState &bs) {
304     /* note: m_lasts[0] already valid */
305     u32 copies = m_firsts.size();
306     const bool isEmpty = sub_comp->empty();
307     const vector<PositionInfo> &optLasts =
308         m_min ? m_lasts[m_min - 1] : m_lasts[0];
309 
310     if (!copies) {
311         goto inf_check;
312     }
313 
314     DEBUG_PRINTF("wiring up %u mand repeats\n", m_min);
315     for (u32 rep = 1; rep < m_min; rep++) {
316         bs.connectRegions(m_lasts[rep - 1], m_firsts[rep]);
317 
318         if (isEmpty) {
319             m_lasts[rep].insert(m_lasts[rep].end(), m_lasts[rep - 1].begin(),
320                                 m_lasts[rep - 1].end());
321         }
322     }
323 
324     DEBUG_PRINTF("wiring up %d optional repeats\n", copies - m_min);
325     for (u32 rep = MAX(m_min, 1); rep < copies; rep++) {
326         vector<PositionInfo> lasts = m_lasts[rep - 1];
327         if (rep != m_min) {
328             lasts.insert(lasts.end(), optLasts.begin(), optLasts.end());
329             sort(lasts.begin(), lasts.end());
330             lasts.erase(unique(lasts.begin(), lasts.end()), lasts.end());
331         }
332         bs.connectRegions(lasts, m_firsts[rep]);
333     }
334 
335 inf_check:
336     // If we have no max bound, we need a self-loop as well.
337     if (m_max == NoLimit) {
338         DEBUG_PRINTF("final repeat self-loop\n");
339         bs.connectRegions(m_lasts.back(), m_firsts.back());
340     }
341 }
342 
343 static
hasPositionFlags(const Component & c)344 bool hasPositionFlags(const Component &c) {
345     for (const auto &e : c.first()) {
346         if (e.flags) {
347             return true;
348         }
349     }
350     return false;
351 }
352 
postSubNotePositionHook()353 void ComponentRepeat::postSubNotePositionHook() {
354     // UE-444 optimization: we can REWRITE m_min under various circumstances,
355     // so that we create smaller NFA graphs. Note that this is _not_ possible
356     // if our subcomponent contains a flagged position, e.g. nofloat.
357     if (!hasPositionFlags(*sub_comp) && sub_comp->empty()) {
358         m_min = 0;
359     }
360 }
361 
makeComponentRepeat(unique_ptr<Component> sub_comp,u32 min,u32 max,ComponentRepeat::RepeatType t)362 unique_ptr<ComponentRepeat> makeComponentRepeat(unique_ptr<Component> sub_comp,
363                                                 u32 min, u32 max,
364                                                 ComponentRepeat::RepeatType t) {
365     return ue2::make_unique<ComponentRepeat>(move(sub_comp), min, max, t);
366 }
367 
368 } // namespace ue2
369