1 /*********************                                                        */
2 /*! \file regexp_elim.cpp
3  ** \verbatim
4  ** Top contributors (to current version):
5  **   Andrew Reynolds, Tianyi Liang
6  ** This file is part of the CVC4 project.
7  ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
8  ** in the top-level source directory) and their institutional affiliations.
9  ** All rights reserved.  See the file COPYING in the top-level source
10  ** directory for licensing information.\endverbatim
11  **
12  ** \brief Implementation of techniques for eliminating regular expressions
13  **
14  **/
15 
16 #include "theory/strings/regexp_elim.h"
17 
18 #include "options/strings_options.h"
19 #include "theory/strings/theory_strings_rewriter.h"
20 
21 using namespace CVC4;
22 using namespace CVC4::kind;
23 using namespace CVC4::theory;
24 using namespace CVC4::theory::strings;
25 
RegExpElimination()26 RegExpElimination::RegExpElimination()
27 {
28   d_zero = NodeManager::currentNM()->mkConst(Rational(0));
29   d_one = NodeManager::currentNM()->mkConst(Rational(1));
30   d_neg_one = NodeManager::currentNM()->mkConst(Rational(-1));
31 }
32 
eliminate(Node atom)33 Node RegExpElimination::eliminate(Node atom)
34 {
35   Assert(atom.getKind() == STRING_IN_REGEXP);
36   if (atom[1].getKind() == REGEXP_CONCAT)
37   {
38     return eliminateConcat(atom);
39   }
40   else if (atom[1].getKind() == REGEXP_STAR)
41   {
42     return eliminateStar(atom);
43   }
44   return Node::null();
45 }
46 
eliminateConcat(Node atom)47 Node RegExpElimination::eliminateConcat(Node atom)
48 {
49   NodeManager* nm = NodeManager::currentNM();
50   Node x = atom[0];
51   Node lenx = nm->mkNode(STRING_LENGTH, x);
52   Node re = atom[1];
53   std::vector<Node> children;
54   TheoryStringsRewriter::getConcat(re, children);
55 
56   // If it can be reduced to memberships in fixed length regular expressions.
57   // This includes concatenations where at most one child is of the form
58   // (re.* re.allchar), which we abbreviate _* below, and all other children
59   // have a fixed length.
60   // The intuition why this is a "non-aggressive" rewrite is that membership
61   // into fixed length regular expressions are easy to handle.
62   bool hasFixedLength = true;
63   // the index of _* in re
64   unsigned pivotIndex = 0;
65   bool hasPivotIndex = false;
66   std::vector<Node> childLengths;
67   std::vector<Node> childLengthsPostPivot;
68   for (unsigned i = 0, size = children.size(); i < size; i++)
69   {
70     Node c = children[i];
71     Node fl = TheoryStringsRewriter::getFixedLengthForRegexp(c);
72     if (fl.isNull())
73     {
74       if (!hasPivotIndex && c.getKind() == REGEXP_STAR
75           && c[0].getKind() == REGEXP_SIGMA)
76       {
77         hasPivotIndex = true;
78         pivotIndex = i;
79         // set to zero for the sum below
80         fl = d_zero;
81       }
82       else
83       {
84         hasFixedLength = false;
85         break;
86       }
87     }
88     childLengths.push_back(fl);
89     if (hasPivotIndex)
90     {
91       childLengthsPostPivot.push_back(fl);
92     }
93   }
94   if (hasFixedLength)
95   {
96     Assert(re.getNumChildren() == children.size());
97     Node sum = nm->mkNode(PLUS, childLengths);
98     std::vector<Node> conc;
99     conc.push_back(nm->mkNode(hasPivotIndex ? GEQ : EQUAL, lenx, sum));
100     Node currEnd = d_zero;
101     for (unsigned i = 0, size = childLengths.size(); i < size; i++)
102     {
103       if (hasPivotIndex && i == pivotIndex)
104       {
105         Node ppSum = childLengthsPostPivot.size() == 1
106                          ? childLengthsPostPivot[0]
107                          : nm->mkNode(PLUS, childLengthsPostPivot);
108         currEnd = nm->mkNode(MINUS, lenx, ppSum);
109       }
110       else
111       {
112         Node curr = nm->mkNode(STRING_SUBSTR, x, currEnd, childLengths[i]);
113         // We do not need to include memberships of the form
114         //   (str.substr x n 1) in re.allchar
115         // since we know that by construction, n < len( x ).
116         if (re[i].getKind() != REGEXP_SIGMA)
117         {
118           Node currMem = nm->mkNode(STRING_IN_REGEXP, curr, re[i]);
119           conc.push_back(currMem);
120         }
121         currEnd = nm->mkNode(PLUS, currEnd, childLengths[i]);
122         currEnd = Rewriter::rewrite(currEnd);
123       }
124     }
125     Node res = nm->mkNode(AND, conc);
126     // For example:
127     //   x in re.++(re.union(re.range("A", "J"), re.range("N", "Z")), "AB") -->
128     //   len( x ) = 3 ^
129     //   substr(x,0,1) in re.union(re.range("A", "J"), re.range("N", "Z")) ^
130     //   substr(x,1,2) in "AB"
131     // An example with a pivot index:
132     //   x in re.++( "AB" ++ _* ++ "C" ) -->
133     //   len( x ) >= 3 ^
134     //   substr( x, 0, 2 ) in "AB" ^
135     //   substr( x, len( x ) - 1, 1 ) in "C"
136     return returnElim(atom, res, "concat-fixed-len");
137   }
138 
139   // memberships of the form x in re.++ * s1 * ... * sn *, where * are
140   // any number of repetitions (exact or indefinite) of re.allchar.
141   Trace("re-elim-debug") << "Try re concat with gaps " << atom << std::endl;
142   bool onlySigmasAndConsts = true;
143   std::vector<Node> sep_children;
144   std::vector<unsigned> gap_minsize;
145   std::vector<bool> gap_exact;
146   // the first gap is initially strict zero
147   gap_minsize.push_back(0);
148   gap_exact.push_back(true);
149   for (const Node& c : children)
150   {
151     Trace("re-elim-debug") << "  " << c << std::endl;
152     onlySigmasAndConsts = false;
153     if (c.getKind() == STRING_TO_REGEXP)
154     {
155       onlySigmasAndConsts = true;
156       sep_children.push_back(c[0]);
157       // the next gap is initially strict zero
158       gap_minsize.push_back(0);
159       gap_exact.push_back(true);
160     }
161     else if (c.getKind() == REGEXP_STAR && c[0].getKind() == REGEXP_SIGMA)
162     {
163       // found a gap of any size
164       onlySigmasAndConsts = true;
165       gap_exact[gap_exact.size() - 1] = false;
166     }
167     else if (c.getKind() == REGEXP_SIGMA)
168     {
169       // add one to the minimum size of the gap
170       onlySigmasAndConsts = true;
171       gap_minsize[gap_minsize.size() - 1]++;
172     }
173     if (!onlySigmasAndConsts)
174     {
175       Trace("re-elim-debug") << "...cannot handle " << c << std::endl;
176       break;
177     }
178   }
179   // we should always rewrite concatenations that are purely re.allchar
180   // and re.*( re.allchar ).
181   Assert(!onlySigmasAndConsts || !sep_children.empty());
182   if (onlySigmasAndConsts && !sep_children.empty())
183   {
184     bool canProcess = true;
185     std::vector<Node> conj;
186     // The following constructs a set of constraints that encodes that a
187     // set of string terms are found, in order, in string x.
188     // prev_end stores the current (symbolic) index in x that we are
189     // searching.
190     Node prev_end = d_zero;
191     // the symbolic index we start searching, for each child in sep_children.
192     std::vector<Node> prev_ends;
193     unsigned gap_minsize_end = gap_minsize.back();
194     bool gap_exact_end = gap_exact.back();
195     std::vector<Node> non_greedy_find_vars;
196     for (unsigned i = 0, size = sep_children.size(); i < size; i++)
197     {
198       if (gap_minsize[i] > 0)
199       {
200         // the gap to this child is at least gap_minsize[i]
201         prev_end =
202             nm->mkNode(PLUS, prev_end, nm->mkConst(Rational(gap_minsize[i])));
203       }
204       prev_ends.push_back(prev_end);
205       Node sc = sep_children[i];
206       Node lensc = nm->mkNode(STRING_LENGTH, sc);
207       if (gap_exact[i])
208       {
209         // if the gap is exact, it is a substring constraint
210         Node curr = prev_end;
211         Node ss = nm->mkNode(STRING_SUBSTR, x, curr, lensc);
212         conj.push_back(ss.eqNode(sc));
213         prev_end = nm->mkNode(PLUS, curr, lensc);
214       }
215       else
216       {
217         // otherwise, we can use indexof to represent some next occurrence
218         if (gap_exact[i + 1] && i + 1 != size)
219         {
220           if (!options::regExpElimAgg())
221           {
222             canProcess = false;
223             break;
224           }
225           // if the gap after this one is strict, we need a non-greedy find
226           // thus, we add a symbolic constant
227           Node k = nm->mkBoundVar(nm->integerType());
228           non_greedy_find_vars.push_back(k);
229           prev_end = nm->mkNode(PLUS, prev_end, k);
230         }
231         Node curr = nm->mkNode(STRING_STRIDOF, x, sc, prev_end);
232         Node idofFind = curr.eqNode(d_neg_one).negate();
233         conj.push_back(idofFind);
234         prev_end = nm->mkNode(PLUS, curr, lensc);
235       }
236     }
237 
238     if (canProcess)
239     {
240       // since sep_children is non-empty, conj is non-empty
241       Assert(!conj.empty());
242       // Process the last gap, if necessary.
243       // Notice that if the last gap is not exact and its minsize is zero,
244       // then the last indexof/substr constraint entails the following
245       // constraint, so it is not necessary to add.
246       // Below, we may write "A" for (str.to.re "A") and _ for re.allchar:
247       Node cEnd = nm->mkConst(Rational(gap_minsize_end));
248       if (gap_exact_end)
249       {
250         Assert(!sep_children.empty());
251         // if it is strict, it corresponds to a substr case.
252         // For example:
253         //     x in (re.++ "A" (re.* _) "B" _ _) --->
254         //        ... ^ "B" = substr( x, len( x ) - 3, 1 )  ^ ...
255         Node sc = sep_children.back();
256         Node lenSc = nm->mkNode(STRING_LENGTH, sc);
257         Node loc = nm->mkNode(MINUS, lenx, nm->mkNode(PLUS, lenSc, cEnd));
258         Node scc = sc.eqNode(nm->mkNode(STRING_SUBSTR, x, loc, lenSc));
259         // We also must ensure that we fit. This constraint is necessary in
260         // addition to the constraint above. Take this example:
261         //     x in (re.++ "A" _ (re.* _) "B" _) --->
262         //       substr( x, 0, 1 ) = "A" ^             // find "A"
263         //       indexof( x, "B", 2 ) != -1 ^          // find "B" >=1 after "A"
264         //       substr( x, len(x)-2, 1 ) = "B" ^      // "B" is at end - 2.
265         //       indexof( x, "B", 2 ) <= len( x ) - 2
266         // The last constaint ensures that the second and third constraints
267         // may refer to the same "B". If it were not for the last constraint, it
268         // would have been the case than "ABB" would be a model for x, where
269         // the second constraint refers to the third position, and the third
270         // constraint refers to the second position.
271         //
272         // With respect to the above example, the following is an optimization.
273         // For that example, we instead produce:
274         //     x in (re.++ "A" _ (re.* _) "B" _) --->
275         //       substr( x, 0, 1 ) = "A" ^          // find "A"
276         //       substr( x, len(x)-2, 1 ) = "B" ^   // "B" is at end - 2
277         //       2 <= len( x ) - 2
278         // The intuition is that above, there are two constraints that insist
279         // that "B" is found, whereas we only need one. The last constraint
280         // above says that the "B" we find at end-2 can be found >=1 after
281         // the "A".
282         conj.pop_back();
283         Node fit = nm->mkNode(gap_exact[sep_children.size() - 1] ? EQUAL : LEQ,
284                               prev_ends.back(),
285                               loc);
286 
287         conj.push_back(scc);
288         conj.push_back(fit);
289       }
290       else if (gap_minsize_end > 0)
291       {
292         // if it is non-strict, we are in a "greedy find" situtation where
293         // we just need to ensure that the next occurrence fits.
294         // For example:
295         //     x in (re.++ "A" (re.* _) "B" _ _ (re.* _)) --->
296         //        ... ^ indexof( x, "B", 1 ) + 2 <= len( x )
297         Node fit = nm->mkNode(LEQ, nm->mkNode(PLUS, prev_end, cEnd), lenx);
298         conj.push_back(fit);
299       }
300       Node res = conj.size() == 1 ? conj[0] : nm->mkNode(AND, conj);
301       // process the non-greedy find variables
302       if (!non_greedy_find_vars.empty())
303       {
304         std::vector<Node> children;
305         for (const Node& v : non_greedy_find_vars)
306         {
307           Node bound = nm->mkNode(
308               AND, nm->mkNode(LEQ, d_zero, v), nm->mkNode(LT, v, lenx));
309           children.push_back(bound);
310         }
311         children.push_back(res);
312         Node body = nm->mkNode(AND, children);
313         Node bvl = nm->mkNode(BOUND_VAR_LIST, non_greedy_find_vars);
314         res = nm->mkNode(EXISTS, bvl, body);
315       }
316       // Examples of this elimination:
317       //   x in (re.++ "A" (re.* _) "B" (re.* _)) --->
318       //     substr(x,0,1)="A" ^ indexof(x,"B",1)!=-1
319       //   x in (re.++ (re.* _) "A" _ _ _ (re.* _) "B" _ _ (re.* _)) --->
320       //     indexof(x,"A",0)!=-1 ^
321       //     indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 ) != -1 ^
322       //     indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 )+1+2 <= len(x)
323 
324       // An example of a non-greedy find:
325       //   x in re.++( re.*( _ ), "A", _, "B", re.*( _ ) ) --->
326       //     exists k. 0 <= k < len( x ) ^
327       //               indexof( x, "A", k ) != -1 ^
328       //               substr( x, indexof( x, "A", k )+2, 1 ) = "B"
329       return returnElim(atom, res, "concat-with-gaps");
330     }
331   }
332 
333   if (!options::regExpElimAgg())
334   {
335     return Node::null();
336   }
337   // only aggressive rewrites below here
338 
339   // if the first or last child is constant string, we can split the membership
340   // into a conjunction of two memberships.
341   Node sStartIndex = d_zero;
342   Node sLength = lenx;
343   std::vector<Node> sConstraints;
344   std::vector<Node> rexpElimChildren;
345   unsigned nchildren = children.size();
346   Assert(nchildren > 1);
347   for (unsigned r = 0; r < 2; r++)
348   {
349     unsigned index = r == 0 ? 0 : nchildren - 1;
350     Assert(children[index + (r == 0 ? 1 : -1)].getKind() != STRING_TO_REGEXP);
351     Node c = children[index];
352     if (c.getKind() == STRING_TO_REGEXP)
353     {
354       Node s = c[0];
355       Node lens = nm->mkNode(STRING_LENGTH, s);
356       Node sss = r == 0 ? d_zero : nm->mkNode(MINUS, lenx, lens);
357       Node ss = nm->mkNode(STRING_SUBSTR, x, sss, lens);
358       sConstraints.push_back(ss.eqNode(s));
359       if (r == 0)
360       {
361         sStartIndex = lens;
362       }
363       sLength = nm->mkNode(MINUS, sLength, lens);
364     }
365     if (r == 1 && !sConstraints.empty())
366     {
367       // add the middle children
368       for (unsigned i = 1; i < (nchildren - 1); i++)
369       {
370         rexpElimChildren.push_back(children[i]);
371       }
372     }
373     if (c.getKind() != STRING_TO_REGEXP)
374     {
375       rexpElimChildren.push_back(c);
376     }
377   }
378   Assert(rexpElimChildren.size() + sConstraints.size() == nchildren);
379   if (!sConstraints.empty())
380   {
381     Node ss = nm->mkNode(STRING_SUBSTR, x, sStartIndex, sLength);
382     Assert(!rexpElimChildren.empty());
383     Node regElim =
384         TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rexpElimChildren);
385     sConstraints.push_back(nm->mkNode(STRING_IN_REGEXP, ss, regElim));
386     Node ret = nm->mkNode(AND, sConstraints);
387     // e.g.
388     // x in re.++( "A", R ) ---> substr(x,0,1)="A" ^ substr(x,1,len(x)-1) in R
389     return returnElim(atom, ret, "concat-splice");
390   }
391   Assert(nchildren > 1);
392   for (unsigned i = 0; i < nchildren; i++)
393   {
394     if (children[i].getKind() == STRING_TO_REGEXP)
395     {
396       Node s = children[i][0];
397       Node lens = nm->mkNode(STRING_LENGTH, s);
398       // there exists an index in this string such that the substring is this
399       Node k;
400       std::vector<Node> echildren;
401       if (i == 0)
402       {
403         k = d_zero;
404       }
405       else if (i + 1 == nchildren)
406       {
407         k = nm->mkNode(MINUS, lenx, lens);
408       }
409       else
410       {
411         k = nm->mkBoundVar(nm->integerType());
412         Node bound =
413             nm->mkNode(AND,
414                        nm->mkNode(LEQ, d_zero, k),
415                        nm->mkNode(LT, k, nm->mkNode(MINUS, lenx, lens)));
416         echildren.push_back(bound);
417       }
418       Node substrEq = nm->mkNode(STRING_SUBSTR, x, k, lens).eqNode(s);
419       echildren.push_back(substrEq);
420       if (i > 0)
421       {
422         std::vector<Node> rprefix;
423         rprefix.insert(rprefix.end(), children.begin(), children.begin() + i);
424         Node rpn = TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rprefix);
425         Node substrPrefix = nm->mkNode(
426             STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, d_zero, k), rpn);
427         echildren.push_back(substrPrefix);
428       }
429       if (i + 1 < nchildren)
430       {
431         std::vector<Node> rsuffix;
432         rsuffix.insert(rsuffix.end(), children.begin() + i + 1, children.end());
433         Node rps = TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rsuffix);
434         Node ks = nm->mkNode(PLUS, k, lens);
435         Node substrSuffix = nm->mkNode(
436             STRING_IN_REGEXP,
437             nm->mkNode(STRING_SUBSTR, x, ks, nm->mkNode(MINUS, lenx, ks)),
438             rps);
439         echildren.push_back(substrSuffix);
440       }
441       Node body = nm->mkNode(AND, echildren);
442       if (k.getKind() == BOUND_VARIABLE)
443       {
444         Node bvl = nm->mkNode(BOUND_VAR_LIST, k);
445         body = nm->mkNode(EXISTS, bvl, body);
446       }
447       // e.g. x in re.++( R1, "AB", R2 ) --->
448       //  exists k.
449       //    0 <= k <= (len(x)-2) ^
450       //    substr( x, k, 2 ) = "AB" ^
451       //    substr( x, 0, k ) in R1 ^
452       //    substr( x, k+2, len(x)-(k+2) ) in R2
453       return returnElim(atom, body, "concat-find");
454     }
455   }
456   return Node::null();
457 }
458 
eliminateStar(Node atom)459 Node RegExpElimination::eliminateStar(Node atom)
460 {
461   if (!options::regExpElimAgg())
462   {
463     return Node::null();
464   }
465   // only aggressive rewrites below here
466 
467   NodeManager* nm = NodeManager::currentNM();
468   Node x = atom[0];
469   Node lenx = nm->mkNode(STRING_LENGTH, x);
470   Node re = atom[1];
471   // for regular expression star,
472   // if the period is a fixed constant, we can turn it into a bounded
473   // quantifier
474   std::vector<Node> disj;
475   if (re[0].getKind() == REGEXP_UNION)
476   {
477     for (const Node& r : re[0])
478     {
479       disj.push_back(r);
480     }
481   }
482   else
483   {
484     disj.push_back(re[0]);
485   }
486   bool lenOnePeriod = true;
487   std::vector<Node> char_constraints;
488   Node index = nm->mkBoundVar(nm->integerType());
489   Node substr_ch = nm->mkNode(STRING_SUBSTR, x, index, d_one);
490   substr_ch = Rewriter::rewrite(substr_ch);
491   // handle the case where it is purely characters
492   for (const Node& r : disj)
493   {
494     Assert(r.getKind() != REGEXP_UNION);
495     Assert(r.getKind() != REGEXP_SIGMA);
496     lenOnePeriod = false;
497     // lenOnePeriod is true if this regular expression is a single character
498     // regular expression
499     if (r.getKind() == STRING_TO_REGEXP)
500     {
501       Node s = r[0];
502       if (s.isConst() && s.getConst<String>().size() == 1)
503       {
504         lenOnePeriod = true;
505       }
506     }
507     else if (r.getKind() == REGEXP_RANGE)
508     {
509       lenOnePeriod = true;
510     }
511     if (!lenOnePeriod)
512     {
513       break;
514     }
515     else
516     {
517       Node regexp_ch = nm->mkNode(STRING_IN_REGEXP, substr_ch, r);
518       regexp_ch = Rewriter::rewrite(regexp_ch);
519       Assert(regexp_ch.getKind() != STRING_IN_REGEXP);
520       char_constraints.push_back(regexp_ch);
521     }
522   }
523   if (lenOnePeriod)
524   {
525     Assert(!char_constraints.empty());
526     Node bound = nm->mkNode(
527         AND, nm->mkNode(LEQ, d_zero, index), nm->mkNode(LT, index, lenx));
528     Node conc = char_constraints.size() == 1 ? char_constraints[0]
529                                              : nm->mkNode(OR, char_constraints);
530     Node body = nm->mkNode(OR, bound.negate(), conc);
531     Node bvl = nm->mkNode(BOUND_VAR_LIST, index);
532     Node res = nm->mkNode(FORALL, bvl, body);
533     // e.g.
534     //   x in (re.* (re.union "A" "B" )) --->
535     //   forall k. 0<=k<len(x) => (substr(x,k,1) in "A" OR substr(x,k,1) in "B")
536     return returnElim(atom, res, "star-char");
537   }
538   // otherwise, for stars of constant length these are periodic
539   if (disj.size() == 1)
540   {
541     Node r = disj[0];
542     if (r.getKind() == STRING_TO_REGEXP)
543     {
544       Node s = r[0];
545       if (s.isConst())
546       {
547         Node lens = nm->mkNode(STRING_LENGTH, s);
548         lens = Rewriter::rewrite(lens);
549         Assert(lens.isConst());
550         std::vector<Node> conj;
551         Node bound = nm->mkNode(
552             AND,
553             nm->mkNode(LEQ, d_zero, index),
554             nm->mkNode(LT, index, nm->mkNode(INTS_DIVISION, lenx, lens)));
555         Node conc =
556             nm->mkNode(STRING_SUBSTR, x, nm->mkNode(MULT, index, lens), lens)
557                 .eqNode(s);
558         Node body = nm->mkNode(OR, bound.negate(), conc);
559         Node bvl = nm->mkNode(BOUND_VAR_LIST, index);
560         Node res = nm->mkNode(FORALL, bvl, body);
561         res = nm->mkNode(
562             AND, nm->mkNode(INTS_MODULUS, lenx, lens).eqNode(d_zero), res);
563         // e.g.
564         //    x in ("abc")* --->
565         //    forall k. 0 <= k < (len( x ) div 3) => substr(x,3*k,3) = "abc" ^
566         //    len(x) mod 3 = 0
567         return returnElim(atom, res, "star-constant");
568       }
569     }
570   }
571   return Node::null();
572 }
573 
returnElim(Node atom,Node atomElim,const char * id)574 Node RegExpElimination::returnElim(Node atom, Node atomElim, const char* id)
575 {
576   Trace("re-elim") << "re-elim: " << atom << " to " << atomElim << " by " << id
577                    << "." << std::endl;
578   return atomElim;
579 }
580