1 /********************* */
2 /*! \file regexp_elim.cpp
3 ** \verbatim
4 ** Top contributors (to current version):
5 ** Andrew Reynolds, Tianyi Liang
6 ** This file is part of the CVC4 project.
7 ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
8 ** in the top-level source directory) and their institutional affiliations.
9 ** All rights reserved. See the file COPYING in the top-level source
10 ** directory for licensing information.\endverbatim
11 **
12 ** \brief Implementation of techniques for eliminating regular expressions
13 **
14 **/
15
16 #include "theory/strings/regexp_elim.h"
17
18 #include "options/strings_options.h"
19 #include "theory/strings/theory_strings_rewriter.h"
20
21 using namespace CVC4;
22 using namespace CVC4::kind;
23 using namespace CVC4::theory;
24 using namespace CVC4::theory::strings;
25
RegExpElimination()26 RegExpElimination::RegExpElimination()
27 {
28 d_zero = NodeManager::currentNM()->mkConst(Rational(0));
29 d_one = NodeManager::currentNM()->mkConst(Rational(1));
30 d_neg_one = NodeManager::currentNM()->mkConst(Rational(-1));
31 }
32
eliminate(Node atom)33 Node RegExpElimination::eliminate(Node atom)
34 {
35 Assert(atom.getKind() == STRING_IN_REGEXP);
36 if (atom[1].getKind() == REGEXP_CONCAT)
37 {
38 return eliminateConcat(atom);
39 }
40 else if (atom[1].getKind() == REGEXP_STAR)
41 {
42 return eliminateStar(atom);
43 }
44 return Node::null();
45 }
46
eliminateConcat(Node atom)47 Node RegExpElimination::eliminateConcat(Node atom)
48 {
49 NodeManager* nm = NodeManager::currentNM();
50 Node x = atom[0];
51 Node lenx = nm->mkNode(STRING_LENGTH, x);
52 Node re = atom[1];
53 std::vector<Node> children;
54 TheoryStringsRewriter::getConcat(re, children);
55
56 // If it can be reduced to memberships in fixed length regular expressions.
57 // This includes concatenations where at most one child is of the form
58 // (re.* re.allchar), which we abbreviate _* below, and all other children
59 // have a fixed length.
60 // The intuition why this is a "non-aggressive" rewrite is that membership
61 // into fixed length regular expressions are easy to handle.
62 bool hasFixedLength = true;
63 // the index of _* in re
64 unsigned pivotIndex = 0;
65 bool hasPivotIndex = false;
66 std::vector<Node> childLengths;
67 std::vector<Node> childLengthsPostPivot;
68 for (unsigned i = 0, size = children.size(); i < size; i++)
69 {
70 Node c = children[i];
71 Node fl = TheoryStringsRewriter::getFixedLengthForRegexp(c);
72 if (fl.isNull())
73 {
74 if (!hasPivotIndex && c.getKind() == REGEXP_STAR
75 && c[0].getKind() == REGEXP_SIGMA)
76 {
77 hasPivotIndex = true;
78 pivotIndex = i;
79 // set to zero for the sum below
80 fl = d_zero;
81 }
82 else
83 {
84 hasFixedLength = false;
85 break;
86 }
87 }
88 childLengths.push_back(fl);
89 if (hasPivotIndex)
90 {
91 childLengthsPostPivot.push_back(fl);
92 }
93 }
94 if (hasFixedLength)
95 {
96 Assert(re.getNumChildren() == children.size());
97 Node sum = nm->mkNode(PLUS, childLengths);
98 std::vector<Node> conc;
99 conc.push_back(nm->mkNode(hasPivotIndex ? GEQ : EQUAL, lenx, sum));
100 Node currEnd = d_zero;
101 for (unsigned i = 0, size = childLengths.size(); i < size; i++)
102 {
103 if (hasPivotIndex && i == pivotIndex)
104 {
105 Node ppSum = childLengthsPostPivot.size() == 1
106 ? childLengthsPostPivot[0]
107 : nm->mkNode(PLUS, childLengthsPostPivot);
108 currEnd = nm->mkNode(MINUS, lenx, ppSum);
109 }
110 else
111 {
112 Node curr = nm->mkNode(STRING_SUBSTR, x, currEnd, childLengths[i]);
113 // We do not need to include memberships of the form
114 // (str.substr x n 1) in re.allchar
115 // since we know that by construction, n < len( x ).
116 if (re[i].getKind() != REGEXP_SIGMA)
117 {
118 Node currMem = nm->mkNode(STRING_IN_REGEXP, curr, re[i]);
119 conc.push_back(currMem);
120 }
121 currEnd = nm->mkNode(PLUS, currEnd, childLengths[i]);
122 currEnd = Rewriter::rewrite(currEnd);
123 }
124 }
125 Node res = nm->mkNode(AND, conc);
126 // For example:
127 // x in re.++(re.union(re.range("A", "J"), re.range("N", "Z")), "AB") -->
128 // len( x ) = 3 ^
129 // substr(x,0,1) in re.union(re.range("A", "J"), re.range("N", "Z")) ^
130 // substr(x,1,2) in "AB"
131 // An example with a pivot index:
132 // x in re.++( "AB" ++ _* ++ "C" ) -->
133 // len( x ) >= 3 ^
134 // substr( x, 0, 2 ) in "AB" ^
135 // substr( x, len( x ) - 1, 1 ) in "C"
136 return returnElim(atom, res, "concat-fixed-len");
137 }
138
139 // memberships of the form x in re.++ * s1 * ... * sn *, where * are
140 // any number of repetitions (exact or indefinite) of re.allchar.
141 Trace("re-elim-debug") << "Try re concat with gaps " << atom << std::endl;
142 bool onlySigmasAndConsts = true;
143 std::vector<Node> sep_children;
144 std::vector<unsigned> gap_minsize;
145 std::vector<bool> gap_exact;
146 // the first gap is initially strict zero
147 gap_minsize.push_back(0);
148 gap_exact.push_back(true);
149 for (const Node& c : children)
150 {
151 Trace("re-elim-debug") << " " << c << std::endl;
152 onlySigmasAndConsts = false;
153 if (c.getKind() == STRING_TO_REGEXP)
154 {
155 onlySigmasAndConsts = true;
156 sep_children.push_back(c[0]);
157 // the next gap is initially strict zero
158 gap_minsize.push_back(0);
159 gap_exact.push_back(true);
160 }
161 else if (c.getKind() == REGEXP_STAR && c[0].getKind() == REGEXP_SIGMA)
162 {
163 // found a gap of any size
164 onlySigmasAndConsts = true;
165 gap_exact[gap_exact.size() - 1] = false;
166 }
167 else if (c.getKind() == REGEXP_SIGMA)
168 {
169 // add one to the minimum size of the gap
170 onlySigmasAndConsts = true;
171 gap_minsize[gap_minsize.size() - 1]++;
172 }
173 if (!onlySigmasAndConsts)
174 {
175 Trace("re-elim-debug") << "...cannot handle " << c << std::endl;
176 break;
177 }
178 }
179 // we should always rewrite concatenations that are purely re.allchar
180 // and re.*( re.allchar ).
181 Assert(!onlySigmasAndConsts || !sep_children.empty());
182 if (onlySigmasAndConsts && !sep_children.empty())
183 {
184 bool canProcess = true;
185 std::vector<Node> conj;
186 // The following constructs a set of constraints that encodes that a
187 // set of string terms are found, in order, in string x.
188 // prev_end stores the current (symbolic) index in x that we are
189 // searching.
190 Node prev_end = d_zero;
191 // the symbolic index we start searching, for each child in sep_children.
192 std::vector<Node> prev_ends;
193 unsigned gap_minsize_end = gap_minsize.back();
194 bool gap_exact_end = gap_exact.back();
195 std::vector<Node> non_greedy_find_vars;
196 for (unsigned i = 0, size = sep_children.size(); i < size; i++)
197 {
198 if (gap_minsize[i] > 0)
199 {
200 // the gap to this child is at least gap_minsize[i]
201 prev_end =
202 nm->mkNode(PLUS, prev_end, nm->mkConst(Rational(gap_minsize[i])));
203 }
204 prev_ends.push_back(prev_end);
205 Node sc = sep_children[i];
206 Node lensc = nm->mkNode(STRING_LENGTH, sc);
207 if (gap_exact[i])
208 {
209 // if the gap is exact, it is a substring constraint
210 Node curr = prev_end;
211 Node ss = nm->mkNode(STRING_SUBSTR, x, curr, lensc);
212 conj.push_back(ss.eqNode(sc));
213 prev_end = nm->mkNode(PLUS, curr, lensc);
214 }
215 else
216 {
217 // otherwise, we can use indexof to represent some next occurrence
218 if (gap_exact[i + 1] && i + 1 != size)
219 {
220 if (!options::regExpElimAgg())
221 {
222 canProcess = false;
223 break;
224 }
225 // if the gap after this one is strict, we need a non-greedy find
226 // thus, we add a symbolic constant
227 Node k = nm->mkBoundVar(nm->integerType());
228 non_greedy_find_vars.push_back(k);
229 prev_end = nm->mkNode(PLUS, prev_end, k);
230 }
231 Node curr = nm->mkNode(STRING_STRIDOF, x, sc, prev_end);
232 Node idofFind = curr.eqNode(d_neg_one).negate();
233 conj.push_back(idofFind);
234 prev_end = nm->mkNode(PLUS, curr, lensc);
235 }
236 }
237
238 if (canProcess)
239 {
240 // since sep_children is non-empty, conj is non-empty
241 Assert(!conj.empty());
242 // Process the last gap, if necessary.
243 // Notice that if the last gap is not exact and its minsize is zero,
244 // then the last indexof/substr constraint entails the following
245 // constraint, so it is not necessary to add.
246 // Below, we may write "A" for (str.to.re "A") and _ for re.allchar:
247 Node cEnd = nm->mkConst(Rational(gap_minsize_end));
248 if (gap_exact_end)
249 {
250 Assert(!sep_children.empty());
251 // if it is strict, it corresponds to a substr case.
252 // For example:
253 // x in (re.++ "A" (re.* _) "B" _ _) --->
254 // ... ^ "B" = substr( x, len( x ) - 3, 1 ) ^ ...
255 Node sc = sep_children.back();
256 Node lenSc = nm->mkNode(STRING_LENGTH, sc);
257 Node loc = nm->mkNode(MINUS, lenx, nm->mkNode(PLUS, lenSc, cEnd));
258 Node scc = sc.eqNode(nm->mkNode(STRING_SUBSTR, x, loc, lenSc));
259 // We also must ensure that we fit. This constraint is necessary in
260 // addition to the constraint above. Take this example:
261 // x in (re.++ "A" _ (re.* _) "B" _) --->
262 // substr( x, 0, 1 ) = "A" ^ // find "A"
263 // indexof( x, "B", 2 ) != -1 ^ // find "B" >=1 after "A"
264 // substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2.
265 // indexof( x, "B", 2 ) <= len( x ) - 2
266 // The last constaint ensures that the second and third constraints
267 // may refer to the same "B". If it were not for the last constraint, it
268 // would have been the case than "ABB" would be a model for x, where
269 // the second constraint refers to the third position, and the third
270 // constraint refers to the second position.
271 //
272 // With respect to the above example, the following is an optimization.
273 // For that example, we instead produce:
274 // x in (re.++ "A" _ (re.* _) "B" _) --->
275 // substr( x, 0, 1 ) = "A" ^ // find "A"
276 // substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2
277 // 2 <= len( x ) - 2
278 // The intuition is that above, there are two constraints that insist
279 // that "B" is found, whereas we only need one. The last constraint
280 // above says that the "B" we find at end-2 can be found >=1 after
281 // the "A".
282 conj.pop_back();
283 Node fit = nm->mkNode(gap_exact[sep_children.size() - 1] ? EQUAL : LEQ,
284 prev_ends.back(),
285 loc);
286
287 conj.push_back(scc);
288 conj.push_back(fit);
289 }
290 else if (gap_minsize_end > 0)
291 {
292 // if it is non-strict, we are in a "greedy find" situtation where
293 // we just need to ensure that the next occurrence fits.
294 // For example:
295 // x in (re.++ "A" (re.* _) "B" _ _ (re.* _)) --->
296 // ... ^ indexof( x, "B", 1 ) + 2 <= len( x )
297 Node fit = nm->mkNode(LEQ, nm->mkNode(PLUS, prev_end, cEnd), lenx);
298 conj.push_back(fit);
299 }
300 Node res = conj.size() == 1 ? conj[0] : nm->mkNode(AND, conj);
301 // process the non-greedy find variables
302 if (!non_greedy_find_vars.empty())
303 {
304 std::vector<Node> children;
305 for (const Node& v : non_greedy_find_vars)
306 {
307 Node bound = nm->mkNode(
308 AND, nm->mkNode(LEQ, d_zero, v), nm->mkNode(LT, v, lenx));
309 children.push_back(bound);
310 }
311 children.push_back(res);
312 Node body = nm->mkNode(AND, children);
313 Node bvl = nm->mkNode(BOUND_VAR_LIST, non_greedy_find_vars);
314 res = nm->mkNode(EXISTS, bvl, body);
315 }
316 // Examples of this elimination:
317 // x in (re.++ "A" (re.* _) "B" (re.* _)) --->
318 // substr(x,0,1)="A" ^ indexof(x,"B",1)!=-1
319 // x in (re.++ (re.* _) "A" _ _ _ (re.* _) "B" _ _ (re.* _)) --->
320 // indexof(x,"A",0)!=-1 ^
321 // indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 ) != -1 ^
322 // indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 )+1+2 <= len(x)
323
324 // An example of a non-greedy find:
325 // x in re.++( re.*( _ ), "A", _, "B", re.*( _ ) ) --->
326 // exists k. 0 <= k < len( x ) ^
327 // indexof( x, "A", k ) != -1 ^
328 // substr( x, indexof( x, "A", k )+2, 1 ) = "B"
329 return returnElim(atom, res, "concat-with-gaps");
330 }
331 }
332
333 if (!options::regExpElimAgg())
334 {
335 return Node::null();
336 }
337 // only aggressive rewrites below here
338
339 // if the first or last child is constant string, we can split the membership
340 // into a conjunction of two memberships.
341 Node sStartIndex = d_zero;
342 Node sLength = lenx;
343 std::vector<Node> sConstraints;
344 std::vector<Node> rexpElimChildren;
345 unsigned nchildren = children.size();
346 Assert(nchildren > 1);
347 for (unsigned r = 0; r < 2; r++)
348 {
349 unsigned index = r == 0 ? 0 : nchildren - 1;
350 Assert(children[index + (r == 0 ? 1 : -1)].getKind() != STRING_TO_REGEXP);
351 Node c = children[index];
352 if (c.getKind() == STRING_TO_REGEXP)
353 {
354 Node s = c[0];
355 Node lens = nm->mkNode(STRING_LENGTH, s);
356 Node sss = r == 0 ? d_zero : nm->mkNode(MINUS, lenx, lens);
357 Node ss = nm->mkNode(STRING_SUBSTR, x, sss, lens);
358 sConstraints.push_back(ss.eqNode(s));
359 if (r == 0)
360 {
361 sStartIndex = lens;
362 }
363 sLength = nm->mkNode(MINUS, sLength, lens);
364 }
365 if (r == 1 && !sConstraints.empty())
366 {
367 // add the middle children
368 for (unsigned i = 1; i < (nchildren - 1); i++)
369 {
370 rexpElimChildren.push_back(children[i]);
371 }
372 }
373 if (c.getKind() != STRING_TO_REGEXP)
374 {
375 rexpElimChildren.push_back(c);
376 }
377 }
378 Assert(rexpElimChildren.size() + sConstraints.size() == nchildren);
379 if (!sConstraints.empty())
380 {
381 Node ss = nm->mkNode(STRING_SUBSTR, x, sStartIndex, sLength);
382 Assert(!rexpElimChildren.empty());
383 Node regElim =
384 TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rexpElimChildren);
385 sConstraints.push_back(nm->mkNode(STRING_IN_REGEXP, ss, regElim));
386 Node ret = nm->mkNode(AND, sConstraints);
387 // e.g.
388 // x in re.++( "A", R ) ---> substr(x,0,1)="A" ^ substr(x,1,len(x)-1) in R
389 return returnElim(atom, ret, "concat-splice");
390 }
391 Assert(nchildren > 1);
392 for (unsigned i = 0; i < nchildren; i++)
393 {
394 if (children[i].getKind() == STRING_TO_REGEXP)
395 {
396 Node s = children[i][0];
397 Node lens = nm->mkNode(STRING_LENGTH, s);
398 // there exists an index in this string such that the substring is this
399 Node k;
400 std::vector<Node> echildren;
401 if (i == 0)
402 {
403 k = d_zero;
404 }
405 else if (i + 1 == nchildren)
406 {
407 k = nm->mkNode(MINUS, lenx, lens);
408 }
409 else
410 {
411 k = nm->mkBoundVar(nm->integerType());
412 Node bound =
413 nm->mkNode(AND,
414 nm->mkNode(LEQ, d_zero, k),
415 nm->mkNode(LT, k, nm->mkNode(MINUS, lenx, lens)));
416 echildren.push_back(bound);
417 }
418 Node substrEq = nm->mkNode(STRING_SUBSTR, x, k, lens).eqNode(s);
419 echildren.push_back(substrEq);
420 if (i > 0)
421 {
422 std::vector<Node> rprefix;
423 rprefix.insert(rprefix.end(), children.begin(), children.begin() + i);
424 Node rpn = TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rprefix);
425 Node substrPrefix = nm->mkNode(
426 STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, d_zero, k), rpn);
427 echildren.push_back(substrPrefix);
428 }
429 if (i + 1 < nchildren)
430 {
431 std::vector<Node> rsuffix;
432 rsuffix.insert(rsuffix.end(), children.begin() + i + 1, children.end());
433 Node rps = TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rsuffix);
434 Node ks = nm->mkNode(PLUS, k, lens);
435 Node substrSuffix = nm->mkNode(
436 STRING_IN_REGEXP,
437 nm->mkNode(STRING_SUBSTR, x, ks, nm->mkNode(MINUS, lenx, ks)),
438 rps);
439 echildren.push_back(substrSuffix);
440 }
441 Node body = nm->mkNode(AND, echildren);
442 if (k.getKind() == BOUND_VARIABLE)
443 {
444 Node bvl = nm->mkNode(BOUND_VAR_LIST, k);
445 body = nm->mkNode(EXISTS, bvl, body);
446 }
447 // e.g. x in re.++( R1, "AB", R2 ) --->
448 // exists k.
449 // 0 <= k <= (len(x)-2) ^
450 // substr( x, k, 2 ) = "AB" ^
451 // substr( x, 0, k ) in R1 ^
452 // substr( x, k+2, len(x)-(k+2) ) in R2
453 return returnElim(atom, body, "concat-find");
454 }
455 }
456 return Node::null();
457 }
458
eliminateStar(Node atom)459 Node RegExpElimination::eliminateStar(Node atom)
460 {
461 if (!options::regExpElimAgg())
462 {
463 return Node::null();
464 }
465 // only aggressive rewrites below here
466
467 NodeManager* nm = NodeManager::currentNM();
468 Node x = atom[0];
469 Node lenx = nm->mkNode(STRING_LENGTH, x);
470 Node re = atom[1];
471 // for regular expression star,
472 // if the period is a fixed constant, we can turn it into a bounded
473 // quantifier
474 std::vector<Node> disj;
475 if (re[0].getKind() == REGEXP_UNION)
476 {
477 for (const Node& r : re[0])
478 {
479 disj.push_back(r);
480 }
481 }
482 else
483 {
484 disj.push_back(re[0]);
485 }
486 bool lenOnePeriod = true;
487 std::vector<Node> char_constraints;
488 Node index = nm->mkBoundVar(nm->integerType());
489 Node substr_ch = nm->mkNode(STRING_SUBSTR, x, index, d_one);
490 substr_ch = Rewriter::rewrite(substr_ch);
491 // handle the case where it is purely characters
492 for (const Node& r : disj)
493 {
494 Assert(r.getKind() != REGEXP_UNION);
495 Assert(r.getKind() != REGEXP_SIGMA);
496 lenOnePeriod = false;
497 // lenOnePeriod is true if this regular expression is a single character
498 // regular expression
499 if (r.getKind() == STRING_TO_REGEXP)
500 {
501 Node s = r[0];
502 if (s.isConst() && s.getConst<String>().size() == 1)
503 {
504 lenOnePeriod = true;
505 }
506 }
507 else if (r.getKind() == REGEXP_RANGE)
508 {
509 lenOnePeriod = true;
510 }
511 if (!lenOnePeriod)
512 {
513 break;
514 }
515 else
516 {
517 Node regexp_ch = nm->mkNode(STRING_IN_REGEXP, substr_ch, r);
518 regexp_ch = Rewriter::rewrite(regexp_ch);
519 Assert(regexp_ch.getKind() != STRING_IN_REGEXP);
520 char_constraints.push_back(regexp_ch);
521 }
522 }
523 if (lenOnePeriod)
524 {
525 Assert(!char_constraints.empty());
526 Node bound = nm->mkNode(
527 AND, nm->mkNode(LEQ, d_zero, index), nm->mkNode(LT, index, lenx));
528 Node conc = char_constraints.size() == 1 ? char_constraints[0]
529 : nm->mkNode(OR, char_constraints);
530 Node body = nm->mkNode(OR, bound.negate(), conc);
531 Node bvl = nm->mkNode(BOUND_VAR_LIST, index);
532 Node res = nm->mkNode(FORALL, bvl, body);
533 // e.g.
534 // x in (re.* (re.union "A" "B" )) --->
535 // forall k. 0<=k<len(x) => (substr(x,k,1) in "A" OR substr(x,k,1) in "B")
536 return returnElim(atom, res, "star-char");
537 }
538 // otherwise, for stars of constant length these are periodic
539 if (disj.size() == 1)
540 {
541 Node r = disj[0];
542 if (r.getKind() == STRING_TO_REGEXP)
543 {
544 Node s = r[0];
545 if (s.isConst())
546 {
547 Node lens = nm->mkNode(STRING_LENGTH, s);
548 lens = Rewriter::rewrite(lens);
549 Assert(lens.isConst());
550 std::vector<Node> conj;
551 Node bound = nm->mkNode(
552 AND,
553 nm->mkNode(LEQ, d_zero, index),
554 nm->mkNode(LT, index, nm->mkNode(INTS_DIVISION, lenx, lens)));
555 Node conc =
556 nm->mkNode(STRING_SUBSTR, x, nm->mkNode(MULT, index, lens), lens)
557 .eqNode(s);
558 Node body = nm->mkNode(OR, bound.negate(), conc);
559 Node bvl = nm->mkNode(BOUND_VAR_LIST, index);
560 Node res = nm->mkNode(FORALL, bvl, body);
561 res = nm->mkNode(
562 AND, nm->mkNode(INTS_MODULUS, lenx, lens).eqNode(d_zero), res);
563 // e.g.
564 // x in ("abc")* --->
565 // forall k. 0 <= k < (len( x ) div 3) => substr(x,3*k,3) = "abc" ^
566 // len(x) mod 3 = 0
567 return returnElim(atom, res, "star-constant");
568 }
569 }
570 }
571 return Node::null();
572 }
573
returnElim(Node atom,Node atomElim,const char * id)574 Node RegExpElimination::returnElim(Node atom, Node atomElim, const char* id)
575 {
576 Trace("re-elim") << "re-elim: " << atom << " to " << atomElim << " by " << id
577 << "." << std::endl;
578 return atomElim;
579 }
580