1 /*++
2   Module Name:
3 
4   theory_str_regex.cpp
5 
6   Abstract:
7 
8   Regular expression components for Z3str3 (theory_str).
9 
10   Author:
11 
12   Murphy Berzish (2019-10-25)
13 
14   Revision History:
15 
16   --*/
17 
18 #include "smt/theory_str.h"
19 
20 namespace smt {
21 
22     // saturating unsigned addition
_qadd(unsigned a,unsigned b)23     unsigned inline _qadd(unsigned a, unsigned b) {
24         if (a == UINT_MAX || b == UINT_MAX) {
25             return UINT_MAX;
26         }
27         unsigned result = a + b;
28         if (result < a || result < b) {
29             return UINT_MAX;
30         }
31         return result;
32     }
33 
34     // saturating unsigned multiply
_qmul(unsigned a,unsigned b)35     unsigned inline _qmul(unsigned a, unsigned b) {
36         if (a == UINT_MAX || b == UINT_MAX) {
37             return UINT_MAX;
38         }
39         uint64_t result = static_cast<uint64_t>(a) * static_cast<uint64_t>(b);
40         if (result > UINT_MAX) {
41             return UINT_MAX;
42         }
43         return static_cast<unsigned>(result);
44     }
45 
46     // Returns false if we need to give up solving, e.g. because we found symbolic expressions in an automaton.
solve_regex_automata()47     bool theory_str::solve_regex_automata() {
48         for (auto str_in_re : regex_terms) {
49             expr * str = nullptr;
50             expr * re = nullptr;
51             u.str.is_in_re(str_in_re, str, re);
52             if (!ctx.b_internalized(str_in_re)) {
53                 TRACE("str", tout << "regex term " << mk_pp(str_in_re, m) << " not internalized; fixing and continuing" << std::endl;);
54                 ctx.internalize(str_in_re, false);
55                 finalCheckProgressIndicator = true;
56                 continue;
57             }
58             lbool current_assignment = ctx.get_assignment(str_in_re);
59             TRACE("str", tout << "regex term: " << mk_pp(str, m) << " in " << mk_pp(re, m) << " : " << current_assignment << std::endl;);
60             if (current_assignment == l_undef) {
61                 continue;
62             }
63 
64             if (!regex_terms_with_length_constraints.contains(str_in_re)) {
65                 if (current_assignment == l_true && check_regex_length_linearity(re)) {
66                     TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;);
67 
68                     if (regex_term_to_length_constraint.contains(str_in_re)) {
69                         // use existing length constraint
70                         expr * top_level_length_constraint = nullptr;
71                         regex_term_to_length_constraint.find(str_in_re, top_level_length_constraint);
72 
73                         ptr_vector<expr> extra_length_vars;
74                         regex_term_to_extra_length_vars.find(str_in_re, extra_length_vars);
75 
76                         assert_axiom(top_level_length_constraint);
77                         for(auto v : extra_length_vars) {
78                             refresh_theory_var(v);
79                             expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m);
80                             assert_axiom(len_constraint);
81                         }
82                     } else {
83                         // generate new length constraint
84                         expr_ref_vector extra_length_vars(m);
85                         expr_ref _top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars);
86                         expr_ref premise(str_in_re, m);
87                         expr_ref top_level_length_constraint(m.mk_implies(premise, _top_level_length_constraint), m);
88                         th_rewriter rw(m);
89                         rw(top_level_length_constraint);
90                         TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;);
91                         // assert and track length constraint
92                         assert_axiom(top_level_length_constraint);
93                         for(auto v : extra_length_vars) {
94                             expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m);
95                             assert_axiom(len_constraint);
96                         }
97 
98                         regex_term_to_length_constraint.insert(str_in_re, top_level_length_constraint);
99                         ptr_vector<expr> vtmp;
100                         for(auto v : extra_length_vars) {
101                             vtmp.push_back(v);
102                         }
103                         regex_term_to_extra_length_vars.insert(str_in_re, vtmp);
104                     }
105 
106                     regex_terms_with_length_constraints.insert(str_in_re);
107                     m_trail_stack.push(insert_obj_trail<expr>(regex_terms_with_length_constraints, str_in_re));
108                 }
109             } // re not in regex_terms_with_length_constraints
110 
111             rational exact_length_value;
112             if (get_len_value(str, exact_length_value)) {
113                 TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;);
114 
115                 if (regex_terms_with_path_constraints.contains(str_in_re)) {
116                     TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;);
117                     continue;
118                 }
119 
120                 // find a consistent automaton for this term
121                 bool found = false;
122                 regex_automaton_under_assumptions assumption;
123                 if (regex_automaton_assumptions.contains(re) &&
124                         !regex_automaton_assumptions[re].empty()){
125                     for (auto autA : regex_automaton_assumptions[re]) {
126                         rational assumed_upper_bound, assumed_lower_bound;
127                         bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
128                         bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
129                         if (!assumes_upper_bound && !assumes_lower_bound) {
130                             // automaton with no assumptions is always usable
131                             assumption = autA;
132                             found = true;
133                             break;
134                         }
135                         // TODO check consistency of bounds assumptions
136                     } // foreach(a in regex_automaton_assumptions)
137                 }
138                 if (found) {
139                     if (exact_length_value.is_zero()) {
140                         // check consistency of 0-length solution with automaton
141                         eautomaton * aut = assumption.get_automaton();
142                         bool zero_solution = false;
143                         unsigned initial_state = aut->init();
144                         if (aut->is_final_state(initial_state)) {
145                             zero_solution = true;
146                         } else {
147                             unsigned_vector eps_states;
148                             aut->get_epsilon_closure(initial_state, eps_states);
149                             for (unsigned_vector::iterator it = eps_states.begin(); it != eps_states.end(); ++it) {
150                                 unsigned state = *it;
151                                 if (aut->is_final_state(state)) {
152                                     zero_solution = true;
153                                     break;
154                                 }
155                             }
156                         }
157 
158                         // now check polarity of automaton wrt. original term
159                         if ( (current_assignment == l_true && !assumption.get_polarity())
160                                 || (current_assignment == l_false && assumption.get_polarity())) {
161                             // invert sense
162                             zero_solution = !zero_solution;
163                         }
164 
165                         if (zero_solution) {
166                             TRACE("str", tout << "zero-length solution OK -- asserting empty path constraint" << std::endl;);
167                             expr_ref_vector lhs_terms(m);
168                             if (current_assignment == l_true) {
169                                 lhs_terms.push_back(str_in_re);
170                             } else {
171                                 lhs_terms.push_back(m.mk_not(str_in_re));
172                             }
173                             lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
174                             expr_ref lhs(mk_and(lhs_terms), m);
175                             expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m);
176                             assert_implication(lhs, rhs);
177                             regex_terms_with_path_constraints.insert(str_in_re);
178                             m_trail_stack.push(insert_obj_trail<expr>(regex_terms_with_path_constraints, str_in_re));
179                         } else {
180                             TRACE("str", tout << "zero-length solution not admitted by this automaton -- asserting conflict clause" << std::endl;);
181                             expr_ref_vector lhs_terms(m);
182                             if (current_assignment == l_true) {
183                                 lhs_terms.push_back(str_in_re);
184                             } else {
185                                 lhs_terms.push_back(m.mk_not(str_in_re));
186                             }
187                             lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
188                             expr_ref lhs(mk_and(lhs_terms), m);
189                             expr_ref conflict(m.mk_not(lhs), m);
190                             assert_axiom(conflict);
191                         }
192                         regex_inc_counter(regex_length_attempt_count, re);
193                         continue;
194                     } else {
195                         // fixed-length model construction handles path constraints on our behalf, and with a better reduction
196                         continue;
197                     }
198                 } else {
199                     // no automata available, or else all bounds assumptions are invalid
200                     unsigned expected_complexity = estimate_regex_complexity(re);
201                     if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
202                         CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
203                                 tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
204                         eautomaton * aut = m_mk_aut(re);
205                         if (aut == nullptr) {
206                             TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;);
207                             return false;
208                         }
209                         aut->compress();
210                         regex_automata.push_back(aut);
211                         regex_automaton_under_assumptions new_aut(re, aut, true);
212                         if (!regex_automaton_assumptions.contains(re)) {
213                             regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
214                         }
215                         regex_automaton_assumptions[re].push_back(new_aut);
216                         TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
217                         find_automaton_initial_bounds(str_in_re, aut);
218                     } else {
219                         regex_inc_counter(regex_fail_count, str_in_re);
220                     }
221                     continue;
222                 }
223             } // get_len_value()
224             expr_ref str_len(mk_strlen(str), m);
225             rational lower_bound_value;
226             rational upper_bound_value;
227             bool lower_bound_exists = lower_bound(str_len, lower_bound_value);
228             bool upper_bound_exists = upper_bound(str_len, upper_bound_value);
229             CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;);
230             CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;);
231 
232             bool new_lower_bound_info = true;
233             bool new_upper_bound_info = true;
234             // check last seen lower/upper bound to avoid performing duplicate work
235             if (regex_last_lower_bound.contains(str)) {
236                 rational last_lb_value;
237                 regex_last_lower_bound.find(str, last_lb_value);
238                 if (last_lb_value == lower_bound_value) {
239                     new_lower_bound_info = false;
240                 }
241             }
242             if (regex_last_upper_bound.contains(str)) {
243                 rational last_ub_value;
244                 regex_last_upper_bound.find(str, last_ub_value);
245                 if (last_ub_value == upper_bound_value) {
246                     new_upper_bound_info = false;
247                 }
248             }
249 
250             if (new_lower_bound_info) {
251                 regex_last_lower_bound.insert(str, lower_bound_value);
252             }
253             if (new_upper_bound_info) {
254                 regex_last_upper_bound.insert(str, upper_bound_value);
255             }
256 
257             if (upper_bound_exists && new_upper_bound_info) {
258                 // check current assumptions
259                 if (regex_automaton_assumptions.contains(re) &&
260                         !regex_automaton_assumptions[re].empty()){
261                     // one or more existing assumptions.
262                     // see if the (current best) upper bound can be refined
263                     // (note that if we have an automaton with no assumption,
264                     // this automatically counts as best)
265                     bool need_assumption = true;
266                     regex_automaton_under_assumptions last_assumption;
267                     rational last_ub = rational::minus_one();
268                     for (auto autA : regex_automaton_assumptions[re]) {
269                         if ((current_assignment == l_true && autA.get_polarity() == false)
270                                 || (current_assignment == l_false && autA.get_polarity() == true)) {
271                             // automaton uses incorrect polarity
272                             continue;
273                         }
274                         rational this_ub;
275                         if (autA.get_upper_bound(this_ub)) {
276                             if (last_ub == rational::minus_one() || this_ub < last_ub) {
277                                 last_ub = this_ub;
278                                 last_assumption = autA;
279                             }
280                         } else {
281                             need_assumption = false;
282                             last_assumption = autA;
283                             break;
284                         }
285                     }
286                     if (!last_ub.is_minus_one() || !need_assumption) {
287                         CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;);
288                         CTRACE("str", need_assumption, tout << "using automaton with assumed upper bound of " << last_ub << std::endl;);
289 
290                         rational refined_upper_bound;
291                         bool solution_at_upper_bound = refine_automaton_upper_bound(last_assumption.get_automaton(),
292                                 upper_bound_value, refined_upper_bound);
293                         TRACE("str", tout << "refined upper bound is " << refined_upper_bound <<
294                                 (solution_at_upper_bound?", solution at upper bound":", no solution at upper bound") << std::endl;);
295 
296                         expr_ref_vector lhs(m);
297                         if (current_assignment == l_false) {
298                             lhs.push_back(m.mk_not(str_in_re));
299                         } else {
300                             lhs.push_back(str_in_re);
301                         }
302                         if (need_assumption) {
303                             lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(last_ub, true)));
304                         }
305                         lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true)));
306 
307                         expr_ref_vector rhs(m);
308 
309                         if (solution_at_upper_bound) {
310                             if (refined_upper_bound.is_minus_one()) {
311                                 // If there are solutions at the upper bound but not below it, make the bound exact.
312                                 rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)));
313                             } else {
314                                 // If there are solutions at and below the upper bound, add an additional bound.
315                                 rhs.push_back(m.mk_or(
316                                         ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)),
317                                         m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true))
318                                         ));
319                             }
320                         } else {
321                             if (refined_upper_bound.is_minus_one()) {
322                                 // If there are no solutions at or below the upper bound, assert a conflict clause.
323                                 rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true))));
324                             } else {
325                                 // If there are solutions below the upper bound but not at it, refine the bound.
326                                 rhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true)));
327                             }
328                         }
329 
330                         if (!rhs.empty()) {
331                             expr_ref lhs_terms(mk_and(lhs), m);
332                             expr_ref rhs_terms(mk_and(rhs), m);
333                             assert_implication(lhs_terms, rhs_terms);
334                         }
335                     }
336                 } else {
337                     // no existing automata/assumptions.
338                     // if it's easy to construct a full automaton for R, do so
339                     unsigned expected_complexity = estimate_regex_complexity(re);
340                     bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
341                     if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) {
342                         eautomaton * aut = m_mk_aut(re);
343                         if (aut == nullptr) {
344                             TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;);
345                             return false;
346                         }
347                         aut->compress();
348                         regex_automata.push_back(aut);
349                         regex_automaton_under_assumptions new_aut(re, aut, true);
350                         if (!regex_automaton_assumptions.contains(re)) {
351                             regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
352                         }
353                         regex_automaton_assumptions[re].push_back(new_aut);
354                         TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
355                         find_automaton_initial_bounds(str_in_re, aut);
356                     } else {
357                         regex_inc_counter(regex_fail_count, str_in_re);
358                     }
359                     continue;
360                 }
361             } else { // !upper_bound_exists
362                 // no upper bound information
363                 if (lower_bound_exists && !lower_bound_value.is_zero() && new_lower_bound_info) {
364                     // nonzero lower bound, no upper bound
365 
366                     // check current assumptions
367                     if (regex_automaton_assumptions.contains(re) &&
368                             !regex_automaton_assumptions[re].empty()){
369                         // one or more existing assumptions.
370                         // see if the (current best) lower bound can be refined
371                         // (note that if we have an automaton with no assumption,
372                         // this automatically counts as best)
373                         bool need_assumption = true;
374                         regex_automaton_under_assumptions last_assumption;
375                         rational last_lb = rational::zero(); // the default
376                         for (auto autA : regex_automaton_assumptions[re]) {
377                             if ((current_assignment == l_true && autA.get_polarity() == false)
378                                     || (current_assignment == l_false && autA.get_polarity() == true)) {
379                                 // automaton uses incorrect polarity
380                                 continue;
381                             }
382                             rational this_lb;
383                             if (autA.get_lower_bound(this_lb)) {
384                                 if (this_lb > last_lb) {
385                                     last_lb = this_lb;
386                                     last_assumption = autA;
387                                 }
388                             } else {
389                                 need_assumption = false;
390                                 last_assumption = autA;
391                                 break;
392                             }
393                         }
394                         if (!last_lb.is_zero() || !need_assumption) {
395                             CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;);
396                             CTRACE("str", need_assumption, tout << "using automaton with assumed lower bound of " << last_lb << std::endl;);
397                             rational refined_lower_bound;
398                             bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(),
399                                     lower_bound_value, refined_lower_bound);
400                             TRACE("str", tout << "refined lower bound is " << refined_lower_bound <<
401                                     (solution_at_lower_bound?", solution at lower bound":", no solution at lower bound") << std::endl;);
402 
403                             expr_ref_vector lhs(m);
404                             if (current_assignment == l_false) {
405                                 lhs.push_back(m.mk_not(str_in_re));
406                             } else {
407                                 lhs.push_back(str_in_re);
408                             }
409                             if (need_assumption) {
410                                 lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(last_lb, true)));
411                             }
412                             lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true)));
413 
414                             expr_ref_vector rhs(m);
415 
416                             if (solution_at_lower_bound) {
417                                 if (refined_lower_bound.is_minus_one()) {
418                                     // If there are solutions at the lower bound but not above it, make the bound exact.
419                                     rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)));
420                                 } else {
421                                     // If there are solutions at and above the lower bound, add an additional bound.
422                                     // DISABLED as this is causing non-termination in the integer solver. --mtrberzi
423                                     /*
424                                     rhs.push_back(m.mk_or(
425                                             ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)),
426                                             m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true))
427                                     ));
428                                     */
429                                 }
430                             } else {
431                                 if (refined_lower_bound.is_minus_one()) {
432                                     // If there are no solutions at or above the lower bound, assert a conflict clause.
433                                     rhs.push_back(m.mk_not(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true))));
434                                 } else {
435                                     // If there are solutions above the lower bound but not at it, refine the bound.
436                                     rhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true)));
437                                 }
438                             }
439 
440                             if (!rhs.empty()) {
441                                 expr_ref lhs_terms(mk_and(lhs), m);
442                                 expr_ref rhs_terms(mk_and(rhs), m);
443                                 assert_implication(lhs_terms, rhs_terms);
444                             }
445                         }
446                     } else {
447                         // no existing automata/assumptions.
448                         // if it's easy to construct a full automaton for R, do so
449                         unsigned expected_complexity = estimate_regex_complexity(re);
450                         bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
451                         if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) {
452                             eautomaton * aut = m_mk_aut(re);
453                             if (aut == nullptr) {
454                                 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;);
455                                 return false;
456                             }
457                             aut->compress();
458                             regex_automata.push_back(aut);
459                             regex_automaton_under_assumptions new_aut(re, aut, true);
460                             if (!regex_automaton_assumptions.contains(re)) {
461                                 regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
462                             }
463                             regex_automaton_assumptions[re].push_back(new_aut);
464                             TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
465                             find_automaton_initial_bounds(str_in_re, aut);
466                         } else {
467                             // TODO check negation?
468                             // TODO construct a partial automaton for R to the given lower bound?
469                             if (false) {
470 
471                             } else {
472                                 regex_inc_counter(regex_fail_count, str_in_re);
473                             }
474                         }
475                         continue;
476                     }
477                 } else { // !lower_bound_exists
478                     // no bounds information
479                     // check for existing automata;
480                     // try to construct an automaton if we don't have one yet
481                     // and doing so without bounds is not difficult
482                     bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty());
483                     bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
484                     if (!existingAutomata) {
485                         unsigned expected_complexity = estimate_regex_complexity(re);
486                         if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold
487                                 || failureThresholdExceeded) {
488                             eautomaton * aut = m_mk_aut(re);
489                             if (aut == nullptr) {
490                                 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;);
491                                 return false;
492                             }
493                             aut->compress();
494                             regex_automata.push_back(aut);
495                             regex_automaton_under_assumptions new_aut(re, aut, true);
496                             if (!regex_automaton_assumptions.contains(re)) {
497                                 regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
498                             }
499                             regex_automaton_assumptions[re].push_back(new_aut);
500                             TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
501                             find_automaton_initial_bounds(str_in_re, aut);
502                         } else {
503                             regex_inc_counter(regex_fail_count, str_in_re);
504                         }
505                     } else {
506                         regex_inc_counter(regex_fail_count, str_in_re);
507                     }
508                 }
509             }
510         } // foreach (entry in regex_terms)
511 
512         for (auto entry : regex_terms_by_string) {
513             // TODO do we need to check equivalence classes of strings here?
514 
515             expr* str = entry.m_key;
516             ptr_vector<expr> str_in_re_terms = entry.m_value;
517 
518             svector<regex_automaton_under_assumptions> intersect_constraints;
519             // we may find empty intersection before checking every constraint;
520             // this vector keeps track of which ones actually take part in intersection
521             svector<regex_automaton_under_assumptions> used_intersect_constraints;
522 
523             // choose an automaton/assumption for each assigned (str.in.re)
524             // that's consistent with the current length information
525             for (auto str_in_re_term : str_in_re_terms) {
526                 expr * _unused = nullptr;
527                 expr * re = nullptr;
528                 SASSERT(u.str.is_in_re(str_in_re_term));
529                 u.str.is_in_re(str_in_re_term, _unused, re);
530 
531                 rational exact_len;
532                 bool has_exact_len = get_len_value(str, exact_len);
533 
534                 rational lb, ub;
535                 bool has_lower_bound = lower_bound(mk_strlen(str), lb);
536                 bool has_upper_bound = upper_bound(mk_strlen(str), ub);
537 
538                 if (regex_automaton_assumptions.contains(re) &&
539                                                 !regex_automaton_assumptions[re].empty()){
540                     for (auto aut : regex_automaton_assumptions[re]) {
541                         rational aut_ub;
542                         bool assume_ub = aut.get_upper_bound(aut_ub);
543                         rational aut_lb;
544                         bool assume_lb = aut.get_lower_bound(aut_lb);
545                         bool consistent = true;
546 
547                         if (assume_ub) {
548                             // check consistency of assumed upper bound
549                             if (has_exact_len) {
550                                 if (exact_len > aut_ub) {
551                                     consistent = false;
552                                 }
553                             } else {
554                                 if (has_upper_bound && ub > aut_ub) {
555                                     consistent = false;
556                                 }
557                             }
558                         }
559 
560                         if (assume_lb) {
561                             // check consistency of assumed lower bound
562                             if (has_exact_len) {
563                                 if (exact_len < aut_lb) {
564                                     consistent = false;
565                                 }
566                             } else {
567                                 if (has_lower_bound && lb < aut_lb) {
568                                     consistent = false;
569                                 }
570                             }
571                         }
572 
573                         if (consistent) {
574                             intersect_constraints.push_back(aut);
575                             break;
576                         }
577                     }
578                 }
579             } // foreach(term in str_in_re_terms)
580 
581             eautomaton * aut_inter = nullptr;
582             CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;);
583             for (auto aut : intersect_constraints) {
584                 TRACE("str",
585                       {
586                           unsigned v = regex_get_counter(regex_length_attempt_count, aut.get_regex_term());
587                           tout << "length attempt count of " << mk_pp(aut.get_regex_term(), m) << " is " << v
588                                << ", threshold is " << m_params.m_RegexAutomata_LengthAttemptThreshold << std::endl;
589                       });
590 
591                 if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) {
592                     unsigned intersectionDifficulty = 0;
593                     if (aut_inter != nullptr) {
594                         intersectionDifficulty = estimate_automata_intersection_difficulty(aut_inter, aut.get_automaton());
595                     }
596                     TRACE("str", tout << "intersection difficulty is " << intersectionDifficulty << std::endl;);
597                     if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold
598                             || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) {
599 
600                         expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
601                         lbool current_assignment = ctx.get_assignment(str_in_re_term);
602                         // if the assignment is consistent with our assumption, use the automaton directly;
603                         // otherwise, complement it (and save that automaton for next time)
604                         // TODO we should cache these intermediate results
605                         // TODO do we need to push the intermediates into a vector for deletion anyway?
606                         if ( (current_assignment == l_true && aut.get_polarity())
607                                 || (current_assignment == l_false && !aut.get_polarity())) {
608                             if (aut_inter == nullptr) {
609                                 aut_inter = aut.get_automaton();
610                             } else {
611                                 aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton());
612                                 m_automata.push_back(aut_inter);
613                             }
614                         } else {
615                             // need to complement first
616                             expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m);
617                             eautomaton * aut_c = m_mk_aut(rc);
618                             if (aut_c == nullptr) {
619                                 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;);
620                                 return false;
621                             }
622                             regex_automata.push_back(aut_c);
623                             // TODO is there any way to build a complement automaton from an existing one?
624                             // this discards length information
625                             if (aut_inter == nullptr) {
626                                 aut_inter = aut_c;
627                             } else {
628                                 aut_inter = m_mk_aut.mk_product(aut_inter, aut_c);
629                                 m_automata.push_back(aut_inter);
630                             }
631                         }
632                         used_intersect_constraints.push_back(aut);
633                         if (aut_inter->is_empty()) {
634                             break;
635                         }
636                     } else {
637                         // failed intersection
638                         regex_inc_counter(regex_intersection_fail_count, aut.get_regex_term());
639                     }
640                 }
641             } // foreach(entry in intersect_constraints)
642             if (aut_inter != nullptr) {
643                 aut_inter->compress();
644             }
645             TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;);
646 
647             expr_ref_vector conflict_terms(m);
648             expr_ref conflict_lhs(m);
649             for (auto aut : used_intersect_constraints) {
650                 expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
651                 lbool current_assignment = ctx.get_assignment(str_in_re_term);
652                 if (current_assignment == l_true) {
653                     conflict_terms.push_back(str_in_re_term);
654                 } else if (current_assignment == l_false) {
655                     conflict_terms.push_back(m.mk_not(str_in_re_term));
656                 }
657                 // add length assumptions, if any
658                 rational ub;
659                 if (aut.get_upper_bound(ub)) {
660                     expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m);
661                     conflict_terms.push_back(ub_term);
662                 }
663                 rational lb;
664                 if (aut.get_lower_bound(lb)) {
665                     expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m);
666                     conflict_terms.push_back(lb_term);
667                 }
668             }
669             conflict_lhs = mk_and(conflict_terms);
670             TRACE("str", tout << "conflict lhs: " << mk_pp(conflict_lhs, m) << std::endl;);
671 
672             if (used_intersect_constraints.size() > 1 && aut_inter != nullptr) {
673                 // check whether the intersection is only the empty string
674                 unsigned initial_state = aut_inter->init();
675                 if (aut_inter->final_states().size() == 1 && aut_inter->is_final_state(initial_state)) {
676                     // initial state is final and it is the only final state
677                     // if there are no moves from the initial state,
678                     // the only solution is the empty string
679                     if (aut_inter->get_moves_from(initial_state).empty()) {
680                         TRACE("str", tout << "product automaton only accepts empty string" << std::endl;);
681                         expr_ref rhs1(ctx.mk_eq_atom(str, mk_string("")), m);
682                         expr_ref rhs2(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m);
683                         expr_ref rhs(m.mk_and(rhs1, rhs2), m);
684                         assert_implication(conflict_lhs, rhs);
685                     }
686                 }
687             }
688 
689             if (aut_inter != nullptr && aut_inter->is_empty()) {
690                 TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
691                 expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m);
692                 assert_axiom(conflict_clause);
693                 add_persisted_axiom(conflict_clause);
694             }
695         } // foreach (entry in regex_terms_by_string)
696         return true;
697     }
698 
estimate_regex_complexity(expr * re)699     unsigned theory_str::estimate_regex_complexity(expr * re) {
700         ENSURE(u.is_re(re));
701         expr * sub1;
702         expr * sub2;
703         unsigned lo, hi;
704         if (u.re.is_to_re(re, sub1)) {
705             if (!u.str.is_string(sub1))
706                 throw default_exception("regular expressions must be built from string literals");
707             zstring str;
708             u.str.is_string(sub1, str);
709             return str.length();
710         } else if (u.re.is_complement(re, sub1)) {
711             return estimate_regex_complexity_under_complement(sub1);
712         } else if (u.re.is_concat(re, sub1, sub2)) {
713             unsigned cx1 = estimate_regex_complexity(sub1);
714             unsigned cx2 = estimate_regex_complexity(sub2);
715             return _qadd(cx1, cx2);
716         } else if (u.re.is_union(re, sub1, sub2)) {
717             unsigned cx1 = estimate_regex_complexity(sub1);
718             unsigned cx2 = estimate_regex_complexity(sub2);
719             return _qadd(cx1, cx2);
720         } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
721             unsigned cx = estimate_regex_complexity(sub1);
722             return _qmul(2, cx);
723         } else if (u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) {
724             unsigned cx = estimate_regex_complexity(sub1);
725             return _qadd(lo, cx);
726         } else if (u.re.is_range(re, sub1, sub2)) {
727             SASSERT(u.str.is_string(sub1));
728             SASSERT(u.str.is_string(sub2));
729             zstring str1, str2;
730             u.str.is_string(sub1, str1);
731             u.str.is_string(sub2, str2);
732             if (str1.length() == 1 && str2.length() == 1) {
733                 return 1 + str2[0] - str1[0];
734             } else {
735                 return 1;
736             }
737         } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) {
738             return 1;
739         } else {
740             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
741             return 1;
742         }
743     }
744 
estimate_regex_complexity_under_complement(expr * re)745     unsigned theory_str::estimate_regex_complexity_under_complement(expr * re) {
746         ENSURE(u.is_re(re));
747         expr * sub1;
748         expr * sub2;
749         zstring str;
750         unsigned lo, hi;
751         if (u.re.is_to_re(re, sub1) && u.str.is_string(sub1)) {
752             return str.length();
753         } else if (u.re.is_complement(re, sub1)) {
754             // Why don't we return the regular complexity here?
755             // We could, but this might be called from under another complemented subexpression.
756             // It's better to give a worst-case complexity.
757             return estimate_regex_complexity_under_complement(sub1);
758         } else if (u.re.is_concat(re, sub1, sub2)) {
759             unsigned cx1 = estimate_regex_complexity_under_complement(sub1);
760             unsigned cx2 = estimate_regex_complexity_under_complement(sub2);
761             return _qadd(_qmul(2, cx1), cx2);
762         } else if (u.re.is_union(re, sub1, sub2)) {
763             unsigned cx1 = estimate_regex_complexity_under_complement(sub1);
764             unsigned cx2 = estimate_regex_complexity_under_complement(sub2);
765             return _qmul(cx1, cx2);
766         } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1) || u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) {
767             unsigned cx = estimate_regex_complexity_under_complement(sub1);
768             return _qmul(2, cx);
769         } else if (u.re.is_range(re, sub1, sub2)) {
770             SASSERT(u.str.is_string(sub1));
771             SASSERT(u.str.is_string(sub2));
772             zstring str1, str2;
773             u.str.is_string(sub1, str1);
774             u.str.is_string(sub2, str2);
775             SASSERT(str1.length() == 1);
776             SASSERT(str2.length() == 1);
777             return 1 + str2[0] - str1[0];
778         } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) {
779             return 1;
780         } else {
781             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
782             return 1;
783         }
784     }
785 
estimate_automata_intersection_difficulty(eautomaton * aut1,eautomaton * aut2)786     unsigned theory_str::estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2) {
787         ENSURE(aut1 != nullptr);
788         ENSURE(aut2 != nullptr);
789         return _qmul(aut1->num_states(), aut2->num_states());
790     }
791 
792     // Check whether a regex translates well to a linear set of length constraints.
check_regex_length_linearity(expr * re)793     bool theory_str::check_regex_length_linearity(expr * re) {
794         return check_regex_length_linearity_helper(re, false);
795     }
796 
check_regex_length_linearity_helper(expr * re,bool already_star)797     bool theory_str::check_regex_length_linearity_helper(expr * re, bool already_star) {
798         expr * sub1;
799         expr * sub2;
800         unsigned lo, hi;
801         if (u.re.is_to_re(re)) {
802             return true;
803         } else if (u.re.is_concat(re, sub1, sub2)) {
804             return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star);
805         } else if (u.re.is_union(re, sub1, sub2)) {
806             return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star);
807         } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
808             if (already_star) {
809                 return false;
810             } else {
811                 return check_regex_length_linearity_helper(sub1, true);
812             }
813         } else if (u.re.is_range(re)) {
814             return true;
815         } else if (u.re.is_full_char(re)) {
816             return true;
817         } else if (u.re.is_full_seq(re)) {
818             return true;
819         } else if (u.re.is_complement(re)) {
820             // TODO can we do better?
821             return false;
822         } else if (u.re.is_intersection(re)) {
823             return false;
824         } else if (u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) {
825             return check_regex_length_linearity_helper(sub1, already_star);
826         } else {
827             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
828             return false;
829         }
830     }
831 
832     // note: returns an empty set `lens` if something went wrong
check_subterm_lengths(expr * re,integer_set & lens)833     void theory_str::check_subterm_lengths(expr * re, integer_set & lens) {
834         expr * sub1;
835         expr * sub2;
836         unsigned lo, hi;
837         if (u.re.is_to_re(re, sub1)) {
838             SASSERT(u.str.is_string(sub1));
839             zstring str;
840             u.str.is_string(sub1, str);
841             lens.insert(str.length());
842         } else if (u.re.is_concat(re, sub1, sub2)) {
843             integer_set lens_1, lens_2;
844             check_subterm_lengths(sub1, lens_1);
845             check_subterm_lengths(sub2, lens_2);
846             if (lens_1.empty() || lens_2.empty()) {
847                 lens.reset();
848             } else {
849                 // take all pairwise lengths
850                 for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) {
851                     for(integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) {
852                         int l1 = *it1;
853                         int l2 = *it2;
854                         lens.insert(l1 + l2);
855                     }
856                 }
857             }
858         } else if (u.re.is_union(re, sub1, sub2)) {
859             integer_set lens_1, lens_2;
860             check_subterm_lengths(sub1, lens_1);
861             check_subterm_lengths(sub2, lens_2);
862             if (lens_1.empty() || lens_2.empty()) {
863                 lens.reset();
864             } else {
865                 // take all possibilities from either side
866                 for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) {
867                     lens.insert(*it1);
868                 }
869                 for (integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) {
870                     lens.insert(*it2);
871                 }
872             }
873         } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
874             // this is bad -- term generation requires this not to appear
875             lens.reset();
876         } else if (u.re.is_range(re, sub1, sub2)) {
877             SASSERT(u.str.is_string(sub1));
878             SASSERT(u.str.is_string(sub2));
879             zstring str1, str2;
880             u.str.is_string(sub1, str1);
881             u.str.is_string(sub2, str2);
882             // re.range is a language of singleton strings if both of its arguments are;
883             // otherwise it is the empty language
884             if (str1.length() == 1 && str2.length() == 1) {
885                 lens.insert(1);
886             } else {
887                 lens.insert(0);
888             }
889         } else if (u.re.is_full_char(re)) {
890             lens.insert(1);
891         } else if (u.re.is_full_seq(re)) {
892             lens.reset();
893         } else if (u.re.is_complement(re)) {
894             lens.reset();
895         } else if (u.re.is_loop(re, sub1, lo, hi)) {
896             integer_set lens_1;
897             check_subterm_lengths(sub1, lens_1);
898             for (unsigned i = lo; i <= hi; ++i) {
899                 for (auto j : lens_1) {
900                     lens.insert(i * j);
901                 }
902             }
903         } else {
904             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
905             lens.reset();
906         }
907     }
908 
909     /*
910      * Infer all length constraints implied by the given regular expression `re`
911      * in order to constrain `lenVar` (which must be of sort Int).
912      * This assumes that `re` appears in a positive context.
913      * Returns a Boolean formula expressing the appropriate constraints over `lenVar`.
914      * In some cases, the returned formula requires one or more free integer variables to be created.
915      * These variables are returned in the reference parameter `freeVariables`.
916      * Extra assertions should be made for these free variables constraining them to be non-negative.
917      */
infer_all_regex_lengths(expr * lenVar,expr * re,expr_ref_vector & freeVariables)918     expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) {
919         ENSURE(u.is_re(re));
920         expr * sub1;
921         expr * sub2;
922         unsigned lo, hi;
923         if (u.re.is_to_re(re, sub1)) {
924             if (!u.str.is_string(sub1))
925                 throw default_exception("regular expressions must be built from string literals");
926             zstring str;
927             u.str.is_string(sub1, str);
928             rational strlen(str.length());
929             expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(strlen, true)), m);
930             return retval;
931         } else if (u.re.is_union(re, sub1, sub2)) {
932             expr_ref r1 = infer_all_regex_lengths(lenVar, sub1, freeVariables);
933             expr_ref r2 = infer_all_regex_lengths(lenVar, sub2, freeVariables);
934             expr_ref retval(m.mk_or(r1, r2), m);
935             return retval;
936         } else if (u.re.is_concat(re, sub1, sub2)) {
937             expr * v1 = mk_int_var("rlen1");
938             expr * v2 = mk_int_var("rlen2");
939             freeVariables.push_back(v1);
940             freeVariables.push_back(v2);
941             expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables);
942             expr_ref r2 = infer_all_regex_lengths(v2, sub2, freeVariables);
943             expr_ref_vector finalResult(m);
944             finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_add(v1, v2)));
945             finalResult.push_back(r1);
946             finalResult.push_back(r2);
947             expr_ref retval(mk_and(finalResult), m);
948             return retval;
949         } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
950             // stars are generated as a linear combination of all possible subterm lengths;
951             // this requires that there are no stars under this one
952             /*
953                 expr * v = mk_int_var("rlen");
954                 expr * n = mk_int_var("rstar");
955                 freeVariables.push_back(v);
956                 freeVariables.push_back(n);
957                 expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables);
958                 expr_ref_vector finalResult(m);
959                 finalResult.push_back(rsub);
960                 finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n)));
961                 expr_ref retval(mk_and(finalResult), m);
962                 return retval;
963              */
964             integer_set subterm_lens;
965             check_subterm_lengths(sub1, subterm_lens);
966             if (subterm_lens.empty()) {
967                 // somehow generation was impossible
968                 expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
969                 return retval;
970             } else {
971                 TRACE("str", tout << "subterm lengths:";
972                 for(integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) {
973                     tout << " " << *it;
974                 }
975                 tout << std::endl;);
976                 expr_ref_vector sum_terms(m);
977                 for (integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) {
978                     rational lenOption(*it);
979                     expr * n = mk_int_var("rstar");
980                     freeVariables.push_back(n);
981                     expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m);
982                     expr_ref term2(term, m);
983                     if (u.re.is_plus(re)) {
984                         // n effectively starts at 1
985                         term2 = m_autil.mk_add(m_autil.mk_numeral(lenOption, true), term);
986                     }
987                     sum_terms.push_back(term2);
988                 }
989                 expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m);
990                 return retval;
991             }
992         } else if (u.re.is_loop(re, sub1, lo, hi)) {
993             expr * v1 = mk_int_var("rlen");
994             freeVariables.push_back(v1);
995             expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables);
996             expr_ref_vector v1_choices(m);
997             for (unsigned i = lo; i <= hi; ++i) {
998                 rational rI(i);
999                 expr_ref v1_i(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(m_autil.mk_numeral(rI, true), v1)), m);
1000                 v1_choices.push_back(v1_i);
1001             }
1002             expr_ref_vector finalResult(m);
1003             finalResult.push_back(r1);
1004             finalResult.push_back(mk_or(v1_choices));
1005             expr_ref retval(mk_and(finalResult), m);
1006             SASSERT(retval);
1007             return retval;
1008         } else if (u.re.is_range(re, sub1, sub2)) {
1009             SASSERT(u.str.is_string(sub1));
1010             SASSERT(u.str.is_string(sub2));
1011             zstring str1, str2;
1012             u.str.is_string(sub1, str1);
1013             u.str.is_string(sub2, str2);
1014             SASSERT(str1.length() == 1);
1015             SASSERT(str2.length() == 1);
1016             expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m);
1017             return retval;
1018         } else if (u.re.is_full_char(re)) {
1019             expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m);
1020             return retval;
1021         } else if (u.re.is_full_seq(re)) {
1022             // match any unbounded string
1023             expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
1024             return retval;
1025         } else if (u.re.is_complement(re)) {
1026             // skip complement for now, in general this is difficult to predict
1027             expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
1028             return retval;
1029         } else {
1030             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, m) << std::endl;);
1031             expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
1032             return retval;
1033         }
1034     }
1035 
1036     /*
1037      * Assert initial lower and upper bounds for the positive constraint (str in re) corresponding
1038      * to the automaton `aut`.
1039      * This asserts a constraint of the form:
1040      *   str_in_re --> (len(str) ?= 0 OR len(str) >= lb) AND len(str) <= ub
1041      * where the upper bound clause is omitted if the upper bound doesn't exist
1042      * and the equality with 0 is based on whether solutions of length 0 are allowed.
1043      */
find_automaton_initial_bounds(expr * str_in_re,eautomaton * aut)1044     void theory_str::find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut) {
1045         ENSURE(aut != nullptr);
1046 
1047         expr_ref_vector rhs(m);
1048         expr * str = nullptr;
1049         expr * re = nullptr;
1050         u.str.is_in_re(str_in_re, str, re);
1051         expr_ref strlen(mk_strlen(str), m);
1052 
1053         // lower bound first
1054         rational nonzero_lower_bound;
1055         bool zero_sol_exists = refine_automaton_lower_bound(aut, rational::zero(), nonzero_lower_bound);
1056         if (zero_sol_exists) {
1057             regex_last_lower_bound.insert(str, rational::zero());
1058             // solution at 0
1059             if (!nonzero_lower_bound.is_minus_one()) {
1060                 expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m);
1061                 expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m);
1062                 rhs.push_back(m.mk_or(rhs1, rhs2));
1063             } else {
1064                 // length of solution can ONLY be 0
1065                 expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m);
1066                 rhs.push_back(rhs1);
1067             }
1068         } else {
1069             // no solution at 0
1070             if (!nonzero_lower_bound.is_minus_one()) {
1071                 regex_last_lower_bound.insert(str, nonzero_lower_bound);
1072                 expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m);
1073                 rhs.push_back(rhs2);
1074             } else {
1075                 // probably no solutions at all; just assume that 0 is a (safe) lower bound
1076                 regex_last_lower_bound.insert(str, rational::zero());
1077                 rhs.reset();
1078             }
1079         }
1080         // TODO upper bound check
1081 
1082         if (!rhs.empty()) {
1083             expr_ref lhs(str_in_re, m);
1084             expr_ref _rhs(mk_and(rhs), m);
1085             assert_implication(lhs, _rhs);
1086         }
1087     }
1088 
1089     /*
1090      * Refine the lower bound on the length of a solution to a given automaton.
1091      * The method returns TRUE if a solution of length `current_lower_bound` exists,
1092      * and FALSE otherwise. In addition, the reference parameter `refined_lower_bound`
1093      * is assigned the length of the shortest solution longer than `current_lower_bound`
1094      * if it exists, or -1 otherwise.
1095      */
refine_automaton_lower_bound(eautomaton * aut,rational current_lower_bound,rational & refined_lower_bound)1096     bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) {
1097         ENSURE(aut != nullptr);
1098 
1099         if (aut->final_states().empty()) {
1100             // no solutions at all
1101             refined_lower_bound = rational::minus_one();
1102             return false;
1103         }
1104 
1105         // from here we assume that there is a final state reachable from the initial state
1106 
1107         unsigned_vector search_queue;
1108         // populate search_queue with all states reachable from the epsilon-closure of start state
1109         aut->get_epsilon_closure(aut->init(), search_queue);
1110 
1111         unsigned search_depth = 0;
1112         hashtable<unsigned, unsigned_hash, default_eq<unsigned>> next_states;
1113         unsigned_vector next_search_queue;
1114 
1115         bool found_solution_at_lower_bound = false;
1116 
1117         while (!search_queue.empty()) {
1118             // if we are at the lower bound, check for final states
1119             if (search_depth == current_lower_bound.get_unsigned()) {
1120                 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
1121                     unsigned state = *it;
1122                     if (aut->is_final_state(state)) {
1123                         found_solution_at_lower_bound = true;
1124                         break;
1125                     }
1126                 }
1127                 // end phase 1
1128                 break;
1129             }
1130             next_states.reset();
1131             next_search_queue.clear();
1132             // move one step along all states
1133             for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
1134                 unsigned src = *it;
1135                 eautomaton::moves next_moves;
1136                 aut->get_moves_from(src, next_moves, true);
1137                 for (eautomaton::moves::iterator move_it = next_moves.begin();
1138                         move_it != next_moves.end(); ++move_it) {
1139                     unsigned dst = move_it->dst();
1140                     if (!next_states.contains(dst)) {
1141                         next_states.insert(dst);
1142                         next_search_queue.push_back(dst);
1143                     }
1144                 }
1145             }
1146             search_queue.clear();
1147             search_queue.append(next_search_queue);
1148             search_depth += 1;
1149         } // !search_queue.empty()
1150 
1151         // if we got here before reaching the lower bound,
1152         // there aren't any solutions at or above it, so stop
1153         if (search_depth < current_lower_bound.get_unsigned()) {
1154             refined_lower_bound = rational::minus_one();
1155             return false;
1156         }
1157 
1158         // phase 2: continue exploring the automaton above the lower bound
1159         SASSERT(search_depth == current_lower_bound.get_unsigned());
1160 
1161         while (!search_queue.empty()) {
1162             if (search_depth > current_lower_bound.get_unsigned()) {
1163                 // check if we have found a solution above the lower bound
1164                 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
1165                     unsigned state = *it;
1166                     if (aut->is_final_state(state)) {
1167                         // this is a solution at a depth higher than the lower bound
1168                         refined_lower_bound = rational(search_depth);
1169                         return found_solution_at_lower_bound;
1170                     }
1171                 }
1172             }
1173             next_states.reset();
1174             next_search_queue.clear();
1175             // move one step along all states
1176             for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
1177                 unsigned src = *it;
1178                 eautomaton::moves next_moves;
1179                 aut->get_moves_from(src, next_moves, true);
1180                 for (eautomaton::moves::iterator move_it = next_moves.begin();
1181                         move_it != next_moves.end(); ++move_it) {
1182                     unsigned dst = move_it->dst();
1183                     if (!next_states.contains(dst)) {
1184                         next_states.insert(dst);
1185                         next_search_queue.push_back(dst);
1186                     }
1187                 }
1188             }
1189             search_queue.clear();
1190             search_queue.append(next_search_queue);
1191             search_depth += 1;
1192         }
1193         // if we reached this point, we explored the whole automaton and didn't find any
1194         // solutions above the lower bound
1195         refined_lower_bound = rational::minus_one();
1196         return found_solution_at_lower_bound;
1197     }
1198 
1199     /*
1200      * Refine the upper bound on the length of a solution to a given automaton.
1201      * The method returns TRUE if a solution of length `current_upper_bound` exists,
1202      * and FALSE otherwise. In addition, the reference parameter `refined_upper_bound`
1203      * is assigned the length of the longest solution shorter than `current_upper_bound`,
1204      * if a shorter solution exists, or -1 otherwise.
1205      */
refine_automaton_upper_bound(eautomaton * aut,rational current_upper_bound,rational & refined_upper_bound)1206     bool theory_str::refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound) {
1207         ENSURE(aut != nullptr);
1208 
1209         if (aut->final_states().empty()) {
1210             // no solutions at all!
1211             refined_upper_bound = rational::minus_one();
1212             return false;
1213         }
1214 
1215         // from here we assume there is a final state reachable from the initial state
1216         unsigned_vector search_queue;
1217         // populate search queue with all states reachable from the epsilon-closure of the start state
1218         aut->get_epsilon_closure(aut->init(), search_queue);
1219 
1220         rational last_solution_depth = rational::minus_one();
1221         bool found_solution_at_upper_bound = false;
1222 
1223         unsigned search_depth = 0;
1224         hashtable<unsigned, unsigned_hash, default_eq<unsigned> > next_states;
1225         unsigned_vector next_search_queue;
1226 
1227         while(!search_queue.empty()) {
1228             // see if any of the current states are final
1229             for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
1230                 unsigned src = *it;
1231                 if (aut->is_final_state(src)) {
1232                     if (search_depth == current_upper_bound.get_unsigned()) {
1233                         found_solution_at_upper_bound = true;
1234                     } else {
1235                         last_solution_depth = rational(search_depth);
1236                     }
1237                     break;
1238                 }
1239             }
1240 
1241             if (search_depth == current_upper_bound.get_unsigned()) {
1242                 break;
1243             }
1244 
1245             next_states.reset();
1246             next_search_queue.clear();
1247             // move one step along all states
1248             for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
1249                 unsigned src = *it;
1250                 eautomaton::moves next_moves;
1251                 aut->get_moves_from(src, next_moves, true);
1252                 for (eautomaton::moves::iterator moves_it = next_moves.begin();
1253                         moves_it != next_moves.end(); ++moves_it) {
1254                     unsigned dst = moves_it->dst();
1255                     if (!next_states.contains(dst)) {
1256                         next_states.insert(dst);
1257                         next_search_queue.push_back(dst);
1258                     }
1259                 }
1260             }
1261             search_queue.clear();
1262             search_queue.append(next_search_queue);
1263             search_depth += 1;
1264         } //!search_queue.empty()
1265 
1266         refined_upper_bound = last_solution_depth;
1267         return found_solution_at_upper_bound;
1268     }
1269 
aut_path_add_next(u_map<expr * > & next,expr_ref_vector & trail,unsigned idx,expr * cond)1270     void theory_str::aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond) {
1271         expr* acc;
1272         if (!get_manager().is_true(cond) && next.find(idx, acc)) {
1273             expr* args[2] = { cond, acc };
1274             cond = mk_or(get_manager(), 2, args);
1275         }
1276         trail.push_back(cond);
1277         next.insert(idx, cond);
1278     }
1279 
aut_path_rewrite_constraint(expr * cond,expr * ch_var)1280     expr_ref theory_str::aut_path_rewrite_constraint(expr * cond, expr * ch_var) {
1281 
1282         expr_ref retval(m);
1283 
1284         unsigned char_val = 0;
1285 
1286         expr * lhs;
1287         expr * rhs;
1288 
1289         if (u.is_const_char(cond, char_val)) {
1290             SASSERT(char_val < 256);
1291             TRACE("str", tout << "rewrite character constant " << char_val << std::endl;);
1292             zstring str_const(char_val);
1293             retval = u.str.mk_string(str_const);
1294             return retval;
1295         } else if (is_var(cond)) {
1296             TRACE("str", tout << "substitute var" << std::endl;);
1297             retval = ch_var;
1298             return retval;
1299         } else if (m.is_eq(cond, lhs, rhs)) {
1300             // handle this specially because the sort of the equality will change
1301             expr_ref new_lhs(aut_path_rewrite_constraint(lhs, ch_var), m);
1302             SASSERT(new_lhs);
1303             expr_ref new_rhs(aut_path_rewrite_constraint(rhs, ch_var), m);
1304             SASSERT(new_rhs);
1305             retval = ctx.mk_eq_atom(new_lhs, new_rhs);
1306             return retval;
1307         } else if (m.is_bool(cond)) {
1308             TRACE("str", tout << "rewrite boolean term " << mk_pp(cond, m) << std::endl;);
1309             app * a_cond = to_app(cond);
1310             expr_ref_vector rewritten_args(m);
1311             for (unsigned i = 0; i < a_cond->get_num_args(); ++i) {
1312                 expr * argI = a_cond->get_arg(i);
1313                 expr_ref new_arg(aut_path_rewrite_constraint(argI, ch_var), m);
1314                 SASSERT(new_arg);
1315                 rewritten_args.push_back(new_arg);
1316             }
1317             retval = m.mk_app(a_cond->get_decl(), rewritten_args.data());
1318             TRACE("str", tout << "final rewritten term is " << mk_pp(retval, m) << std::endl;);
1319             return retval;
1320         } else {
1321             TRACE("str", tout << "ERROR: unrecognized automaton path constraint " << mk_pp(cond, m) << ", cannot translate" << std::endl;);
1322             retval = nullptr;
1323             return retval;
1324         }
1325     }
1326 
1327     /*
1328      * Create finite path constraints for the string variable `str` with respect to the automaton `aut`.
1329      * The returned expression is the right-hand side of a constraint of the form
1330      * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints).
1331      * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...),
1332      * are returned in `characterConstraints`.
1333      */
generate_regex_path_constraints(expr * stringTerm,eautomaton * aut,rational lenVal,expr_ref & characterConstraints)1334     expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) {
1335         ENSURE(aut != nullptr);
1336 
1337         if (lenVal.is_zero()) {
1338             // if any state in the epsilon-closure of the start state is accepting,
1339             // then the empty string is in this language
1340             unsigned_vector states;
1341             bool has_final = false;
1342             aut->get_epsilon_closure(aut->init(), states);
1343             for (unsigned i = 0; i < states.size() && !has_final; ++i) {
1344                 has_final = aut->is_final_state(states[i]);
1345             }
1346             if (has_final) {
1347                 // empty string is OK, assert axiom
1348                 expr_ref rhs(ctx.mk_eq_atom(stringTerm, mk_string("")), m);
1349                 SASSERT(rhs);
1350                 //regex_automata_assertions.insert(stringTerm, final_axiom);
1351                 //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
1352                 return rhs;
1353             } else {
1354                 // negate -- the empty string isn't in the language
1355                 //expr_ref conflict(m.mk_not(mk_and(toplevel_lhs)), m);
1356                 //assert_axiom(conflict);
1357                 expr_ref conflict(m.mk_false(), m);
1358                 return conflict;
1359             }
1360         } // lenVal.is_zero()
1361 
1362         expr_ref_vector pathChars(m);
1363         expr_ref_vector pathChars_len_constraints(m);
1364 
1365         // reuse character terms over the same string
1366         if (string_chars.contains(stringTerm)) {
1367             // find out whether we have enough characters already
1368             ptr_vector<expr> old_chars;
1369             string_chars.find(stringTerm, old_chars);
1370             if (old_chars.size() < lenVal.get_unsigned()) {
1371                 for (unsigned i = old_chars.size(); i < lenVal.get_unsigned(); ++i) {
1372                     std::stringstream ss;
1373                     ss << "ch" << i;
1374                     expr_ref ch(mk_str_var(ss.str()), m);
1375                     m_trail.push_back(ch);
1376                     old_chars.push_back(ch);
1377                 }
1378             }
1379             string_chars.insert(stringTerm, old_chars);
1380             // now we're guaranteed to have at least the right number of characters in old_chars
1381             for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
1382                 expr_ref ch(old_chars.get(i), m);
1383                 refresh_theory_var(ch);
1384                 pathChars.push_back(ch);
1385                 pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
1386             }
1387         } else {
1388             ptr_vector<expr> new_chars;
1389             for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
1390                 std::stringstream ss;
1391                 ss << "ch" << i;
1392                 expr_ref ch(mk_str_var(ss.str()), m);
1393                 pathChars.push_back(ch);
1394                 pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
1395                 new_chars.push_back(ch);
1396             }
1397             string_chars.insert(stringTerm, new_chars);
1398         }
1399 
1400         // modification of code in seq_rewriter::mk_str_in_regexp()
1401         expr_ref_vector trail(m);
1402         u_map<expr*> maps[2];
1403         bool select_map = false;
1404         expr_ref ch(m), cond(m);
1405         eautomaton::moves mvs;
1406         maps[0].insert(aut->init(), m.mk_true());
1407         // is_accepted(a, aut) & some state in frontier is final.
1408         for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
1409             u_map<expr*>& frontier = maps[select_map];
1410             u_map<expr*>& next = maps[!select_map];
1411             select_map = !select_map;
1412             ch = pathChars.get(i);
1413             next.reset();
1414             u_map<expr*>::iterator it = frontier.begin(), end = frontier.end();
1415             for (; it != end; ++it) {
1416                 mvs.reset();
1417                 unsigned state = it->m_key;
1418                 expr*    acc  = it->m_value;
1419                 aut->get_moves_from(state, mvs, false);
1420                 for (unsigned j = 0; j < mvs.size(); ++j) {
1421                     eautomaton::move const& mv = mvs[j];
1422                     SASSERT(mv.t());
1423                     if (mv.t()->is_char() && m.is_value(mv.t()->get_char())) {
1424                         // change this to a string constraint
1425                         expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_char(), ch);
1426                         SASSERT(cond_rhs);
1427                         cond = ctx.mk_eq_atom(ch, cond_rhs);
1428                         SASSERT(cond);
1429                         expr * args[2] = {cond, acc};
1430                         cond = mk_and(m, 2, args);
1431                         aut_path_add_next(next, trail, mv.dst(), cond);
1432                     } else if (mv.t()->is_range()) {
1433                         expr_ref range_lo(mv.t()->get_lo(), m);
1434                         expr_ref range_hi(mv.t()->get_hi(), m);
1435 
1436                         unsigned lo_val, hi_val;
1437 
1438                         if (u.is_const_char(range_lo, lo_val) && u.is_const_char(range_hi, hi_val)) {
1439                             TRACE("str", tout << "make range predicate from " << lo_val << " to " << hi_val << std::endl;);
1440                             expr_ref cond_rhs(m);
1441                             expr_ref_vector cond_rhs_terms(m);
1442                             for (unsigned i = lo_val; i <= hi_val; ++i) {
1443                                 zstring str_const(i);
1444                                 expr_ref str_expr(u.str.mk_string(str_const), m);
1445                                 cond_rhs_terms.push_back(ctx.mk_eq_atom(ch, str_expr));
1446                             }
1447                             cond_rhs = mk_or(cond_rhs_terms);
1448                             SASSERT(cond_rhs);
1449                             expr * args[2] = {cond_rhs, acc};
1450                             cond = mk_and(m, 2, args);
1451                             aut_path_add_next(next, trail, mv.dst(), cond);
1452                         } else {
1453                             TRACE("str", tout << "warning: non-bitvectors in automaton range predicate" << std::endl;);
1454                             UNREACHABLE();
1455                         }
1456                     } else if (mv.t()->is_pred()) {
1457                         // rewrite this constraint over string terms
1458                         expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_pred(), ch);
1459                         SASSERT(cond_rhs);
1460 
1461                         if (m.is_false(cond_rhs)) {
1462                             continue;
1463                         } else if (m.is_true(cond_rhs)) {
1464                             aut_path_add_next(next, trail, mv.dst(), acc);
1465                             continue;
1466                         }
1467                         expr * args[2] = {cond_rhs, acc};
1468                         cond = mk_and(m, 2, args);
1469                         aut_path_add_next(next, trail, mv.dst(), cond);
1470                     }
1471                 }
1472             }
1473         }
1474         u_map<expr*> const& frontier = maps[select_map];
1475         u_map<expr*>::iterator it = frontier.begin(), end = frontier.end();
1476         expr_ref_vector ors(m);
1477         for (; it != end; ++it) {
1478             unsigned_vector states;
1479             bool has_final = false;
1480             aut->get_epsilon_closure(it->m_key, states);
1481             for (unsigned i = 0; i < states.size() && !has_final; ++i) {
1482                 has_final = aut->is_final_state(states[i]);
1483             }
1484             if (has_final) {
1485                 ors.push_back(it->m_value);
1486             }
1487         }
1488         expr_ref result(mk_or(ors));
1489         TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";);
1490 
1491         expr_ref concat_rhs(m);
1492         if (pathChars.size() == 1) {
1493             concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0));
1494         } else {
1495             expr_ref acc(pathChars.get(0), m);
1496             for (unsigned i = 1; i < pathChars.size(); ++i) {
1497                 acc = mk_concat(acc, pathChars.get(i));
1498             }
1499             concat_rhs = ctx.mk_eq_atom(stringTerm, acc);
1500         }
1501 
1502         //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m);
1503         characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs);
1504         //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m);
1505         //regex_automata_assertions.insert(stringTerm, final_axiom);
1506         //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
1507         return result;
1508     }
1509 
regex_inc_counter(obj_map<expr,unsigned> & counter_map,expr * key)1510     void theory_str::regex_inc_counter(obj_map<expr, unsigned> & counter_map, expr * key) {
1511         unsigned old_v;
1512         if (counter_map.find(key, old_v)) {
1513             unsigned new_v = old_v += 1;
1514             counter_map.insert(key, new_v);
1515         } else {
1516             counter_map.insert(key, 1);
1517         }
1518     }
1519 
regex_get_counter(obj_map<expr,unsigned> & counter_map,expr * key)1520     unsigned theory_str::regex_get_counter(obj_map<expr, unsigned> & counter_map, expr * key) {
1521         unsigned v;
1522         if (counter_map.find(key, v)) {
1523             return v;
1524         } else {
1525             counter_map.insert(key, 0);
1526             return 0;
1527         }
1528     }
1529 
1530 }; /* namespace smt */
1531