1 /*++ 2 Module Name: 3 4 theory_str_regex.cpp 5 6 Abstract: 7 8 Regular expression components for Z3str3 (theory_str). 9 10 Author: 11 12 Murphy Berzish (2019-10-25) 13 14 Revision History: 15 16 --*/ 17 18 #include "smt/theory_str.h" 19 20 namespace smt { 21 22 // saturating unsigned addition _qadd(unsigned a,unsigned b)23 unsigned inline _qadd(unsigned a, unsigned b) { 24 if (a == UINT_MAX || b == UINT_MAX) { 25 return UINT_MAX; 26 } 27 unsigned result = a + b; 28 if (result < a || result < b) { 29 return UINT_MAX; 30 } 31 return result; 32 } 33 34 // saturating unsigned multiply _qmul(unsigned a,unsigned b)35 unsigned inline _qmul(unsigned a, unsigned b) { 36 if (a == UINT_MAX || b == UINT_MAX) { 37 return UINT_MAX; 38 } 39 uint64_t result = static_cast<uint64_t>(a) * static_cast<uint64_t>(b); 40 if (result > UINT_MAX) { 41 return UINT_MAX; 42 } 43 return static_cast<unsigned>(result); 44 } 45 46 // Returns false if we need to give up solving, e.g. because we found symbolic expressions in an automaton. solve_regex_automata()47 bool theory_str::solve_regex_automata() { 48 for (auto str_in_re : regex_terms) { 49 expr * str = nullptr; 50 expr * re = nullptr; 51 u.str.is_in_re(str_in_re, str, re); 52 if (!ctx.b_internalized(str_in_re)) { 53 TRACE("str", tout << "regex term " << mk_pp(str_in_re, m) << " not internalized; fixing and continuing" << std::endl;); 54 ctx.internalize(str_in_re, false); 55 finalCheckProgressIndicator = true; 56 continue; 57 } 58 lbool current_assignment = ctx.get_assignment(str_in_re); 59 TRACE("str", tout << "regex term: " << mk_pp(str, m) << " in " << mk_pp(re, m) << " : " << current_assignment << std::endl;); 60 if (current_assignment == l_undef) { 61 continue; 62 } 63 64 if (!regex_terms_with_length_constraints.contains(str_in_re)) { 65 if (current_assignment == l_true && check_regex_length_linearity(re)) { 66 TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;); 67 68 if (regex_term_to_length_constraint.contains(str_in_re)) { 69 // use existing length constraint 70 expr * top_level_length_constraint = nullptr; 71 regex_term_to_length_constraint.find(str_in_re, top_level_length_constraint); 72 73 ptr_vector<expr> extra_length_vars; 74 regex_term_to_extra_length_vars.find(str_in_re, extra_length_vars); 75 76 assert_axiom(top_level_length_constraint); 77 for(auto v : extra_length_vars) { 78 refresh_theory_var(v); 79 expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m); 80 assert_axiom(len_constraint); 81 } 82 } else { 83 // generate new length constraint 84 expr_ref_vector extra_length_vars(m); 85 expr_ref _top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars); 86 expr_ref premise(str_in_re, m); 87 expr_ref top_level_length_constraint(m.mk_implies(premise, _top_level_length_constraint), m); 88 th_rewriter rw(m); 89 rw(top_level_length_constraint); 90 TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;); 91 // assert and track length constraint 92 assert_axiom(top_level_length_constraint); 93 for(auto v : extra_length_vars) { 94 expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m); 95 assert_axiom(len_constraint); 96 } 97 98 regex_term_to_length_constraint.insert(str_in_re, top_level_length_constraint); 99 ptr_vector<expr> vtmp; 100 for(auto v : extra_length_vars) { 101 vtmp.push_back(v); 102 } 103 regex_term_to_extra_length_vars.insert(str_in_re, vtmp); 104 } 105 106 regex_terms_with_length_constraints.insert(str_in_re); 107 m_trail_stack.push(insert_obj_trail<expr>(regex_terms_with_length_constraints, str_in_re)); 108 } 109 } // re not in regex_terms_with_length_constraints 110 111 rational exact_length_value; 112 if (get_len_value(str, exact_length_value)) { 113 TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); 114 115 if (regex_terms_with_path_constraints.contains(str_in_re)) { 116 TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;); 117 continue; 118 } 119 120 // find a consistent automaton for this term 121 bool found = false; 122 regex_automaton_under_assumptions assumption; 123 if (regex_automaton_assumptions.contains(re) && 124 !regex_automaton_assumptions[re].empty()){ 125 for (auto autA : regex_automaton_assumptions[re]) { 126 rational assumed_upper_bound, assumed_lower_bound; 127 bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); 128 bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); 129 if (!assumes_upper_bound && !assumes_lower_bound) { 130 // automaton with no assumptions is always usable 131 assumption = autA; 132 found = true; 133 break; 134 } 135 // TODO check consistency of bounds assumptions 136 } // foreach(a in regex_automaton_assumptions) 137 } 138 if (found) { 139 if (exact_length_value.is_zero()) { 140 // check consistency of 0-length solution with automaton 141 eautomaton * aut = assumption.get_automaton(); 142 bool zero_solution = false; 143 unsigned initial_state = aut->init(); 144 if (aut->is_final_state(initial_state)) { 145 zero_solution = true; 146 } else { 147 unsigned_vector eps_states; 148 aut->get_epsilon_closure(initial_state, eps_states); 149 for (unsigned_vector::iterator it = eps_states.begin(); it != eps_states.end(); ++it) { 150 unsigned state = *it; 151 if (aut->is_final_state(state)) { 152 zero_solution = true; 153 break; 154 } 155 } 156 } 157 158 // now check polarity of automaton wrt. original term 159 if ( (current_assignment == l_true && !assumption.get_polarity()) 160 || (current_assignment == l_false && assumption.get_polarity())) { 161 // invert sense 162 zero_solution = !zero_solution; 163 } 164 165 if (zero_solution) { 166 TRACE("str", tout << "zero-length solution OK -- asserting empty path constraint" << std::endl;); 167 expr_ref_vector lhs_terms(m); 168 if (current_assignment == l_true) { 169 lhs_terms.push_back(str_in_re); 170 } else { 171 lhs_terms.push_back(m.mk_not(str_in_re)); 172 } 173 lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); 174 expr_ref lhs(mk_and(lhs_terms), m); 175 expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m); 176 assert_implication(lhs, rhs); 177 regex_terms_with_path_constraints.insert(str_in_re); 178 m_trail_stack.push(insert_obj_trail<expr>(regex_terms_with_path_constraints, str_in_re)); 179 } else { 180 TRACE("str", tout << "zero-length solution not admitted by this automaton -- asserting conflict clause" << std::endl;); 181 expr_ref_vector lhs_terms(m); 182 if (current_assignment == l_true) { 183 lhs_terms.push_back(str_in_re); 184 } else { 185 lhs_terms.push_back(m.mk_not(str_in_re)); 186 } 187 lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); 188 expr_ref lhs(mk_and(lhs_terms), m); 189 expr_ref conflict(m.mk_not(lhs), m); 190 assert_axiom(conflict); 191 } 192 regex_inc_counter(regex_length_attempt_count, re); 193 continue; 194 } else { 195 // fixed-length model construction handles path constraints on our behalf, and with a better reduction 196 continue; 197 } 198 } else { 199 // no automata available, or else all bounds assumptions are invalid 200 unsigned expected_complexity = estimate_regex_complexity(re); 201 if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { 202 CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, 203 tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); 204 eautomaton * aut = m_mk_aut(re); 205 if (aut == nullptr) { 206 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;); 207 return false; 208 } 209 aut->compress(); 210 regex_automata.push_back(aut); 211 regex_automaton_under_assumptions new_aut(re, aut, true); 212 if (!regex_automaton_assumptions.contains(re)) { 213 regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>()); 214 } 215 regex_automaton_assumptions[re].push_back(new_aut); 216 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); 217 find_automaton_initial_bounds(str_in_re, aut); 218 } else { 219 regex_inc_counter(regex_fail_count, str_in_re); 220 } 221 continue; 222 } 223 } // get_len_value() 224 expr_ref str_len(mk_strlen(str), m); 225 rational lower_bound_value; 226 rational upper_bound_value; 227 bool lower_bound_exists = lower_bound(str_len, lower_bound_value); 228 bool upper_bound_exists = upper_bound(str_len, upper_bound_value); 229 CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;); 230 CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;); 231 232 bool new_lower_bound_info = true; 233 bool new_upper_bound_info = true; 234 // check last seen lower/upper bound to avoid performing duplicate work 235 if (regex_last_lower_bound.contains(str)) { 236 rational last_lb_value; 237 regex_last_lower_bound.find(str, last_lb_value); 238 if (last_lb_value == lower_bound_value) { 239 new_lower_bound_info = false; 240 } 241 } 242 if (regex_last_upper_bound.contains(str)) { 243 rational last_ub_value; 244 regex_last_upper_bound.find(str, last_ub_value); 245 if (last_ub_value == upper_bound_value) { 246 new_upper_bound_info = false; 247 } 248 } 249 250 if (new_lower_bound_info) { 251 regex_last_lower_bound.insert(str, lower_bound_value); 252 } 253 if (new_upper_bound_info) { 254 regex_last_upper_bound.insert(str, upper_bound_value); 255 } 256 257 if (upper_bound_exists && new_upper_bound_info) { 258 // check current assumptions 259 if (regex_automaton_assumptions.contains(re) && 260 !regex_automaton_assumptions[re].empty()){ 261 // one or more existing assumptions. 262 // see if the (current best) upper bound can be refined 263 // (note that if we have an automaton with no assumption, 264 // this automatically counts as best) 265 bool need_assumption = true; 266 regex_automaton_under_assumptions last_assumption; 267 rational last_ub = rational::minus_one(); 268 for (auto autA : regex_automaton_assumptions[re]) { 269 if ((current_assignment == l_true && autA.get_polarity() == false) 270 || (current_assignment == l_false && autA.get_polarity() == true)) { 271 // automaton uses incorrect polarity 272 continue; 273 } 274 rational this_ub; 275 if (autA.get_upper_bound(this_ub)) { 276 if (last_ub == rational::minus_one() || this_ub < last_ub) { 277 last_ub = this_ub; 278 last_assumption = autA; 279 } 280 } else { 281 need_assumption = false; 282 last_assumption = autA; 283 break; 284 } 285 } 286 if (!last_ub.is_minus_one() || !need_assumption) { 287 CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;); 288 CTRACE("str", need_assumption, tout << "using automaton with assumed upper bound of " << last_ub << std::endl;); 289 290 rational refined_upper_bound; 291 bool solution_at_upper_bound = refine_automaton_upper_bound(last_assumption.get_automaton(), 292 upper_bound_value, refined_upper_bound); 293 TRACE("str", tout << "refined upper bound is " << refined_upper_bound << 294 (solution_at_upper_bound?", solution at upper bound":", no solution at upper bound") << std::endl;); 295 296 expr_ref_vector lhs(m); 297 if (current_assignment == l_false) { 298 lhs.push_back(m.mk_not(str_in_re)); 299 } else { 300 lhs.push_back(str_in_re); 301 } 302 if (need_assumption) { 303 lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(last_ub, true))); 304 } 305 lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true))); 306 307 expr_ref_vector rhs(m); 308 309 if (solution_at_upper_bound) { 310 if (refined_upper_bound.is_minus_one()) { 311 // If there are solutions at the upper bound but not below it, make the bound exact. 312 rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true))); 313 } else { 314 // If there are solutions at and below the upper bound, add an additional bound. 315 rhs.push_back(m.mk_or( 316 ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)), 317 m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true)) 318 )); 319 } 320 } else { 321 if (refined_upper_bound.is_minus_one()) { 322 // If there are no solutions at or below the upper bound, assert a conflict clause. 323 rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true)))); 324 } else { 325 // If there are solutions below the upper bound but not at it, refine the bound. 326 rhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true))); 327 } 328 } 329 330 if (!rhs.empty()) { 331 expr_ref lhs_terms(mk_and(lhs), m); 332 expr_ref rhs_terms(mk_and(rhs), m); 333 assert_implication(lhs_terms, rhs_terms); 334 } 335 } 336 } else { 337 // no existing automata/assumptions. 338 // if it's easy to construct a full automaton for R, do so 339 unsigned expected_complexity = estimate_regex_complexity(re); 340 bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); 341 if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { 342 eautomaton * aut = m_mk_aut(re); 343 if (aut == nullptr) { 344 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;); 345 return false; 346 } 347 aut->compress(); 348 regex_automata.push_back(aut); 349 regex_automaton_under_assumptions new_aut(re, aut, true); 350 if (!regex_automaton_assumptions.contains(re)) { 351 regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>()); 352 } 353 regex_automaton_assumptions[re].push_back(new_aut); 354 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); 355 find_automaton_initial_bounds(str_in_re, aut); 356 } else { 357 regex_inc_counter(regex_fail_count, str_in_re); 358 } 359 continue; 360 } 361 } else { // !upper_bound_exists 362 // no upper bound information 363 if (lower_bound_exists && !lower_bound_value.is_zero() && new_lower_bound_info) { 364 // nonzero lower bound, no upper bound 365 366 // check current assumptions 367 if (regex_automaton_assumptions.contains(re) && 368 !regex_automaton_assumptions[re].empty()){ 369 // one or more existing assumptions. 370 // see if the (current best) lower bound can be refined 371 // (note that if we have an automaton with no assumption, 372 // this automatically counts as best) 373 bool need_assumption = true; 374 regex_automaton_under_assumptions last_assumption; 375 rational last_lb = rational::zero(); // the default 376 for (auto autA : regex_automaton_assumptions[re]) { 377 if ((current_assignment == l_true && autA.get_polarity() == false) 378 || (current_assignment == l_false && autA.get_polarity() == true)) { 379 // automaton uses incorrect polarity 380 continue; 381 } 382 rational this_lb; 383 if (autA.get_lower_bound(this_lb)) { 384 if (this_lb > last_lb) { 385 last_lb = this_lb; 386 last_assumption = autA; 387 } 388 } else { 389 need_assumption = false; 390 last_assumption = autA; 391 break; 392 } 393 } 394 if (!last_lb.is_zero() || !need_assumption) { 395 CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;); 396 CTRACE("str", need_assumption, tout << "using automaton with assumed lower bound of " << last_lb << std::endl;); 397 rational refined_lower_bound; 398 bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(), 399 lower_bound_value, refined_lower_bound); 400 TRACE("str", tout << "refined lower bound is " << refined_lower_bound << 401 (solution_at_lower_bound?", solution at lower bound":", no solution at lower bound") << std::endl;); 402 403 expr_ref_vector lhs(m); 404 if (current_assignment == l_false) { 405 lhs.push_back(m.mk_not(str_in_re)); 406 } else { 407 lhs.push_back(str_in_re); 408 } 409 if (need_assumption) { 410 lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(last_lb, true))); 411 } 412 lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true))); 413 414 expr_ref_vector rhs(m); 415 416 if (solution_at_lower_bound) { 417 if (refined_lower_bound.is_minus_one()) { 418 // If there are solutions at the lower bound but not above it, make the bound exact. 419 rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true))); 420 } else { 421 // If there are solutions at and above the lower bound, add an additional bound. 422 // DISABLED as this is causing non-termination in the integer solver. --mtrberzi 423 /* 424 rhs.push_back(m.mk_or( 425 ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)), 426 m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true)) 427 )); 428 */ 429 } 430 } else { 431 if (refined_lower_bound.is_minus_one()) { 432 // If there are no solutions at or above the lower bound, assert a conflict clause. 433 rhs.push_back(m.mk_not(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true)))); 434 } else { 435 // If there are solutions above the lower bound but not at it, refine the bound. 436 rhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true))); 437 } 438 } 439 440 if (!rhs.empty()) { 441 expr_ref lhs_terms(mk_and(lhs), m); 442 expr_ref rhs_terms(mk_and(rhs), m); 443 assert_implication(lhs_terms, rhs_terms); 444 } 445 } 446 } else { 447 // no existing automata/assumptions. 448 // if it's easy to construct a full automaton for R, do so 449 unsigned expected_complexity = estimate_regex_complexity(re); 450 bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); 451 if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { 452 eautomaton * aut = m_mk_aut(re); 453 if (aut == nullptr) { 454 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;); 455 return false; 456 } 457 aut->compress(); 458 regex_automata.push_back(aut); 459 regex_automaton_under_assumptions new_aut(re, aut, true); 460 if (!regex_automaton_assumptions.contains(re)) { 461 regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>()); 462 } 463 regex_automaton_assumptions[re].push_back(new_aut); 464 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); 465 find_automaton_initial_bounds(str_in_re, aut); 466 } else { 467 // TODO check negation? 468 // TODO construct a partial automaton for R to the given lower bound? 469 if (false) { 470 471 } else { 472 regex_inc_counter(regex_fail_count, str_in_re); 473 } 474 } 475 continue; 476 } 477 } else { // !lower_bound_exists 478 // no bounds information 479 // check for existing automata; 480 // try to construct an automaton if we don't have one yet 481 // and doing so without bounds is not difficult 482 bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()); 483 bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); 484 if (!existingAutomata) { 485 unsigned expected_complexity = estimate_regex_complexity(re); 486 if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold 487 || failureThresholdExceeded) { 488 eautomaton * aut = m_mk_aut(re); 489 if (aut == nullptr) { 490 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;); 491 return false; 492 } 493 aut->compress(); 494 regex_automata.push_back(aut); 495 regex_automaton_under_assumptions new_aut(re, aut, true); 496 if (!regex_automaton_assumptions.contains(re)) { 497 regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>()); 498 } 499 regex_automaton_assumptions[re].push_back(new_aut); 500 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); 501 find_automaton_initial_bounds(str_in_re, aut); 502 } else { 503 regex_inc_counter(regex_fail_count, str_in_re); 504 } 505 } else { 506 regex_inc_counter(regex_fail_count, str_in_re); 507 } 508 } 509 } 510 } // foreach (entry in regex_terms) 511 512 for (auto entry : regex_terms_by_string) { 513 // TODO do we need to check equivalence classes of strings here? 514 515 expr* str = entry.m_key; 516 ptr_vector<expr> str_in_re_terms = entry.m_value; 517 518 svector<regex_automaton_under_assumptions> intersect_constraints; 519 // we may find empty intersection before checking every constraint; 520 // this vector keeps track of which ones actually take part in intersection 521 svector<regex_automaton_under_assumptions> used_intersect_constraints; 522 523 // choose an automaton/assumption for each assigned (str.in.re) 524 // that's consistent with the current length information 525 for (auto str_in_re_term : str_in_re_terms) { 526 expr * _unused = nullptr; 527 expr * re = nullptr; 528 SASSERT(u.str.is_in_re(str_in_re_term)); 529 u.str.is_in_re(str_in_re_term, _unused, re); 530 531 rational exact_len; 532 bool has_exact_len = get_len_value(str, exact_len); 533 534 rational lb, ub; 535 bool has_lower_bound = lower_bound(mk_strlen(str), lb); 536 bool has_upper_bound = upper_bound(mk_strlen(str), ub); 537 538 if (regex_automaton_assumptions.contains(re) && 539 !regex_automaton_assumptions[re].empty()){ 540 for (auto aut : regex_automaton_assumptions[re]) { 541 rational aut_ub; 542 bool assume_ub = aut.get_upper_bound(aut_ub); 543 rational aut_lb; 544 bool assume_lb = aut.get_lower_bound(aut_lb); 545 bool consistent = true; 546 547 if (assume_ub) { 548 // check consistency of assumed upper bound 549 if (has_exact_len) { 550 if (exact_len > aut_ub) { 551 consistent = false; 552 } 553 } else { 554 if (has_upper_bound && ub > aut_ub) { 555 consistent = false; 556 } 557 } 558 } 559 560 if (assume_lb) { 561 // check consistency of assumed lower bound 562 if (has_exact_len) { 563 if (exact_len < aut_lb) { 564 consistent = false; 565 } 566 } else { 567 if (has_lower_bound && lb < aut_lb) { 568 consistent = false; 569 } 570 } 571 } 572 573 if (consistent) { 574 intersect_constraints.push_back(aut); 575 break; 576 } 577 } 578 } 579 } // foreach(term in str_in_re_terms) 580 581 eautomaton * aut_inter = nullptr; 582 CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;); 583 for (auto aut : intersect_constraints) { 584 TRACE("str", 585 { 586 unsigned v = regex_get_counter(regex_length_attempt_count, aut.get_regex_term()); 587 tout << "length attempt count of " << mk_pp(aut.get_regex_term(), m) << " is " << v 588 << ", threshold is " << m_params.m_RegexAutomata_LengthAttemptThreshold << std::endl; 589 }); 590 591 if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) { 592 unsigned intersectionDifficulty = 0; 593 if (aut_inter != nullptr) { 594 intersectionDifficulty = estimate_automata_intersection_difficulty(aut_inter, aut.get_automaton()); 595 } 596 TRACE("str", tout << "intersection difficulty is " << intersectionDifficulty << std::endl;); 597 if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold 598 || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) { 599 600 expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); 601 lbool current_assignment = ctx.get_assignment(str_in_re_term); 602 // if the assignment is consistent with our assumption, use the automaton directly; 603 // otherwise, complement it (and save that automaton for next time) 604 // TODO we should cache these intermediate results 605 // TODO do we need to push the intermediates into a vector for deletion anyway? 606 if ( (current_assignment == l_true && aut.get_polarity()) 607 || (current_assignment == l_false && !aut.get_polarity())) { 608 if (aut_inter == nullptr) { 609 aut_inter = aut.get_automaton(); 610 } else { 611 aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton()); 612 m_automata.push_back(aut_inter); 613 } 614 } else { 615 // need to complement first 616 expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m); 617 eautomaton * aut_c = m_mk_aut(rc); 618 if (aut_c == nullptr) { 619 TRACE("str", tout << "ERROR: symbolic automaton construction failed, likely due to non-constant term in regex" << std::endl;); 620 return false; 621 } 622 regex_automata.push_back(aut_c); 623 // TODO is there any way to build a complement automaton from an existing one? 624 // this discards length information 625 if (aut_inter == nullptr) { 626 aut_inter = aut_c; 627 } else { 628 aut_inter = m_mk_aut.mk_product(aut_inter, aut_c); 629 m_automata.push_back(aut_inter); 630 } 631 } 632 used_intersect_constraints.push_back(aut); 633 if (aut_inter->is_empty()) { 634 break; 635 } 636 } else { 637 // failed intersection 638 regex_inc_counter(regex_intersection_fail_count, aut.get_regex_term()); 639 } 640 } 641 } // foreach(entry in intersect_constraints) 642 if (aut_inter != nullptr) { 643 aut_inter->compress(); 644 } 645 TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;); 646 647 expr_ref_vector conflict_terms(m); 648 expr_ref conflict_lhs(m); 649 for (auto aut : used_intersect_constraints) { 650 expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); 651 lbool current_assignment = ctx.get_assignment(str_in_re_term); 652 if (current_assignment == l_true) { 653 conflict_terms.push_back(str_in_re_term); 654 } else if (current_assignment == l_false) { 655 conflict_terms.push_back(m.mk_not(str_in_re_term)); 656 } 657 // add length assumptions, if any 658 rational ub; 659 if (aut.get_upper_bound(ub)) { 660 expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m); 661 conflict_terms.push_back(ub_term); 662 } 663 rational lb; 664 if (aut.get_lower_bound(lb)) { 665 expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m); 666 conflict_terms.push_back(lb_term); 667 } 668 } 669 conflict_lhs = mk_and(conflict_terms); 670 TRACE("str", tout << "conflict lhs: " << mk_pp(conflict_lhs, m) << std::endl;); 671 672 if (used_intersect_constraints.size() > 1 && aut_inter != nullptr) { 673 // check whether the intersection is only the empty string 674 unsigned initial_state = aut_inter->init(); 675 if (aut_inter->final_states().size() == 1 && aut_inter->is_final_state(initial_state)) { 676 // initial state is final and it is the only final state 677 // if there are no moves from the initial state, 678 // the only solution is the empty string 679 if (aut_inter->get_moves_from(initial_state).empty()) { 680 TRACE("str", tout << "product automaton only accepts empty string" << std::endl;); 681 expr_ref rhs1(ctx.mk_eq_atom(str, mk_string("")), m); 682 expr_ref rhs2(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m); 683 expr_ref rhs(m.mk_and(rhs1, rhs2), m); 684 assert_implication(conflict_lhs, rhs); 685 } 686 } 687 } 688 689 if (aut_inter != nullptr && aut_inter->is_empty()) { 690 TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); 691 expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m); 692 assert_axiom(conflict_clause); 693 add_persisted_axiom(conflict_clause); 694 } 695 } // foreach (entry in regex_terms_by_string) 696 return true; 697 } 698 estimate_regex_complexity(expr * re)699 unsigned theory_str::estimate_regex_complexity(expr * re) { 700 ENSURE(u.is_re(re)); 701 expr * sub1; 702 expr * sub2; 703 unsigned lo, hi; 704 if (u.re.is_to_re(re, sub1)) { 705 if (!u.str.is_string(sub1)) 706 throw default_exception("regular expressions must be built from string literals"); 707 zstring str; 708 u.str.is_string(sub1, str); 709 return str.length(); 710 } else if (u.re.is_complement(re, sub1)) { 711 return estimate_regex_complexity_under_complement(sub1); 712 } else if (u.re.is_concat(re, sub1, sub2)) { 713 unsigned cx1 = estimate_regex_complexity(sub1); 714 unsigned cx2 = estimate_regex_complexity(sub2); 715 return _qadd(cx1, cx2); 716 } else if (u.re.is_union(re, sub1, sub2)) { 717 unsigned cx1 = estimate_regex_complexity(sub1); 718 unsigned cx2 = estimate_regex_complexity(sub2); 719 return _qadd(cx1, cx2); 720 } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { 721 unsigned cx = estimate_regex_complexity(sub1); 722 return _qmul(2, cx); 723 } else if (u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) { 724 unsigned cx = estimate_regex_complexity(sub1); 725 return _qadd(lo, cx); 726 } else if (u.re.is_range(re, sub1, sub2)) { 727 SASSERT(u.str.is_string(sub1)); 728 SASSERT(u.str.is_string(sub2)); 729 zstring str1, str2; 730 u.str.is_string(sub1, str1); 731 u.str.is_string(sub2, str2); 732 if (str1.length() == 1 && str2.length() == 1) { 733 return 1 + str2[0] - str1[0]; 734 } else { 735 return 1; 736 } 737 } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) { 738 return 1; 739 } else { 740 TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); 741 return 1; 742 } 743 } 744 estimate_regex_complexity_under_complement(expr * re)745 unsigned theory_str::estimate_regex_complexity_under_complement(expr * re) { 746 ENSURE(u.is_re(re)); 747 expr * sub1; 748 expr * sub2; 749 zstring str; 750 unsigned lo, hi; 751 if (u.re.is_to_re(re, sub1) && u.str.is_string(sub1)) { 752 return str.length(); 753 } else if (u.re.is_complement(re, sub1)) { 754 // Why don't we return the regular complexity here? 755 // We could, but this might be called from under another complemented subexpression. 756 // It's better to give a worst-case complexity. 757 return estimate_regex_complexity_under_complement(sub1); 758 } else if (u.re.is_concat(re, sub1, sub2)) { 759 unsigned cx1 = estimate_regex_complexity_under_complement(sub1); 760 unsigned cx2 = estimate_regex_complexity_under_complement(sub2); 761 return _qadd(_qmul(2, cx1), cx2); 762 } else if (u.re.is_union(re, sub1, sub2)) { 763 unsigned cx1 = estimate_regex_complexity_under_complement(sub1); 764 unsigned cx2 = estimate_regex_complexity_under_complement(sub2); 765 return _qmul(cx1, cx2); 766 } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1) || u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) { 767 unsigned cx = estimate_regex_complexity_under_complement(sub1); 768 return _qmul(2, cx); 769 } else if (u.re.is_range(re, sub1, sub2)) { 770 SASSERT(u.str.is_string(sub1)); 771 SASSERT(u.str.is_string(sub2)); 772 zstring str1, str2; 773 u.str.is_string(sub1, str1); 774 u.str.is_string(sub2, str2); 775 SASSERT(str1.length() == 1); 776 SASSERT(str2.length() == 1); 777 return 1 + str2[0] - str1[0]; 778 } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) { 779 return 1; 780 } else { 781 TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); 782 return 1; 783 } 784 } 785 estimate_automata_intersection_difficulty(eautomaton * aut1,eautomaton * aut2)786 unsigned theory_str::estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2) { 787 ENSURE(aut1 != nullptr); 788 ENSURE(aut2 != nullptr); 789 return _qmul(aut1->num_states(), aut2->num_states()); 790 } 791 792 // Check whether a regex translates well to a linear set of length constraints. check_regex_length_linearity(expr * re)793 bool theory_str::check_regex_length_linearity(expr * re) { 794 return check_regex_length_linearity_helper(re, false); 795 } 796 check_regex_length_linearity_helper(expr * re,bool already_star)797 bool theory_str::check_regex_length_linearity_helper(expr * re, bool already_star) { 798 expr * sub1; 799 expr * sub2; 800 unsigned lo, hi; 801 if (u.re.is_to_re(re)) { 802 return true; 803 } else if (u.re.is_concat(re, sub1, sub2)) { 804 return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star); 805 } else if (u.re.is_union(re, sub1, sub2)) { 806 return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star); 807 } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { 808 if (already_star) { 809 return false; 810 } else { 811 return check_regex_length_linearity_helper(sub1, true); 812 } 813 } else if (u.re.is_range(re)) { 814 return true; 815 } else if (u.re.is_full_char(re)) { 816 return true; 817 } else if (u.re.is_full_seq(re)) { 818 return true; 819 } else if (u.re.is_complement(re)) { 820 // TODO can we do better? 821 return false; 822 } else if (u.re.is_intersection(re)) { 823 return false; 824 } else if (u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) { 825 return check_regex_length_linearity_helper(sub1, already_star); 826 } else { 827 TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); 828 return false; 829 } 830 } 831 832 // note: returns an empty set `lens` if something went wrong check_subterm_lengths(expr * re,integer_set & lens)833 void theory_str::check_subterm_lengths(expr * re, integer_set & lens) { 834 expr * sub1; 835 expr * sub2; 836 unsigned lo, hi; 837 if (u.re.is_to_re(re, sub1)) { 838 SASSERT(u.str.is_string(sub1)); 839 zstring str; 840 u.str.is_string(sub1, str); 841 lens.insert(str.length()); 842 } else if (u.re.is_concat(re, sub1, sub2)) { 843 integer_set lens_1, lens_2; 844 check_subterm_lengths(sub1, lens_1); 845 check_subterm_lengths(sub2, lens_2); 846 if (lens_1.empty() || lens_2.empty()) { 847 lens.reset(); 848 } else { 849 // take all pairwise lengths 850 for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) { 851 for(integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) { 852 int l1 = *it1; 853 int l2 = *it2; 854 lens.insert(l1 + l2); 855 } 856 } 857 } 858 } else if (u.re.is_union(re, sub1, sub2)) { 859 integer_set lens_1, lens_2; 860 check_subterm_lengths(sub1, lens_1); 861 check_subterm_lengths(sub2, lens_2); 862 if (lens_1.empty() || lens_2.empty()) { 863 lens.reset(); 864 } else { 865 // take all possibilities from either side 866 for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) { 867 lens.insert(*it1); 868 } 869 for (integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) { 870 lens.insert(*it2); 871 } 872 } 873 } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { 874 // this is bad -- term generation requires this not to appear 875 lens.reset(); 876 } else if (u.re.is_range(re, sub1, sub2)) { 877 SASSERT(u.str.is_string(sub1)); 878 SASSERT(u.str.is_string(sub2)); 879 zstring str1, str2; 880 u.str.is_string(sub1, str1); 881 u.str.is_string(sub2, str2); 882 // re.range is a language of singleton strings if both of its arguments are; 883 // otherwise it is the empty language 884 if (str1.length() == 1 && str2.length() == 1) { 885 lens.insert(1); 886 } else { 887 lens.insert(0); 888 } 889 } else if (u.re.is_full_char(re)) { 890 lens.insert(1); 891 } else if (u.re.is_full_seq(re)) { 892 lens.reset(); 893 } else if (u.re.is_complement(re)) { 894 lens.reset(); 895 } else if (u.re.is_loop(re, sub1, lo, hi)) { 896 integer_set lens_1; 897 check_subterm_lengths(sub1, lens_1); 898 for (unsigned i = lo; i <= hi; ++i) { 899 for (auto j : lens_1) { 900 lens.insert(i * j); 901 } 902 } 903 } else { 904 TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); 905 lens.reset(); 906 } 907 } 908 909 /* 910 * Infer all length constraints implied by the given regular expression `re` 911 * in order to constrain `lenVar` (which must be of sort Int). 912 * This assumes that `re` appears in a positive context. 913 * Returns a Boolean formula expressing the appropriate constraints over `lenVar`. 914 * In some cases, the returned formula requires one or more free integer variables to be created. 915 * These variables are returned in the reference parameter `freeVariables`. 916 * Extra assertions should be made for these free variables constraining them to be non-negative. 917 */ infer_all_regex_lengths(expr * lenVar,expr * re,expr_ref_vector & freeVariables)918 expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) { 919 ENSURE(u.is_re(re)); 920 expr * sub1; 921 expr * sub2; 922 unsigned lo, hi; 923 if (u.re.is_to_re(re, sub1)) { 924 if (!u.str.is_string(sub1)) 925 throw default_exception("regular expressions must be built from string literals"); 926 zstring str; 927 u.str.is_string(sub1, str); 928 rational strlen(str.length()); 929 expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(strlen, true)), m); 930 return retval; 931 } else if (u.re.is_union(re, sub1, sub2)) { 932 expr_ref r1 = infer_all_regex_lengths(lenVar, sub1, freeVariables); 933 expr_ref r2 = infer_all_regex_lengths(lenVar, sub2, freeVariables); 934 expr_ref retval(m.mk_or(r1, r2), m); 935 return retval; 936 } else if (u.re.is_concat(re, sub1, sub2)) { 937 expr * v1 = mk_int_var("rlen1"); 938 expr * v2 = mk_int_var("rlen2"); 939 freeVariables.push_back(v1); 940 freeVariables.push_back(v2); 941 expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables); 942 expr_ref r2 = infer_all_regex_lengths(v2, sub2, freeVariables); 943 expr_ref_vector finalResult(m); 944 finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_add(v1, v2))); 945 finalResult.push_back(r1); 946 finalResult.push_back(r2); 947 expr_ref retval(mk_and(finalResult), m); 948 return retval; 949 } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { 950 // stars are generated as a linear combination of all possible subterm lengths; 951 // this requires that there are no stars under this one 952 /* 953 expr * v = mk_int_var("rlen"); 954 expr * n = mk_int_var("rstar"); 955 freeVariables.push_back(v); 956 freeVariables.push_back(n); 957 expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables); 958 expr_ref_vector finalResult(m); 959 finalResult.push_back(rsub); 960 finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n))); 961 expr_ref retval(mk_and(finalResult), m); 962 return retval; 963 */ 964 integer_set subterm_lens; 965 check_subterm_lengths(sub1, subterm_lens); 966 if (subterm_lens.empty()) { 967 // somehow generation was impossible 968 expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); 969 return retval; 970 } else { 971 TRACE("str", tout << "subterm lengths:"; 972 for(integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) { 973 tout << " " << *it; 974 } 975 tout << std::endl;); 976 expr_ref_vector sum_terms(m); 977 for (integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) { 978 rational lenOption(*it); 979 expr * n = mk_int_var("rstar"); 980 freeVariables.push_back(n); 981 expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m); 982 expr_ref term2(term, m); 983 if (u.re.is_plus(re)) { 984 // n effectively starts at 1 985 term2 = m_autil.mk_add(m_autil.mk_numeral(lenOption, true), term); 986 } 987 sum_terms.push_back(term2); 988 } 989 expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m); 990 return retval; 991 } 992 } else if (u.re.is_loop(re, sub1, lo, hi)) { 993 expr * v1 = mk_int_var("rlen"); 994 freeVariables.push_back(v1); 995 expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables); 996 expr_ref_vector v1_choices(m); 997 for (unsigned i = lo; i <= hi; ++i) { 998 rational rI(i); 999 expr_ref v1_i(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(m_autil.mk_numeral(rI, true), v1)), m); 1000 v1_choices.push_back(v1_i); 1001 } 1002 expr_ref_vector finalResult(m); 1003 finalResult.push_back(r1); 1004 finalResult.push_back(mk_or(v1_choices)); 1005 expr_ref retval(mk_and(finalResult), m); 1006 SASSERT(retval); 1007 return retval; 1008 } else if (u.re.is_range(re, sub1, sub2)) { 1009 SASSERT(u.str.is_string(sub1)); 1010 SASSERT(u.str.is_string(sub2)); 1011 zstring str1, str2; 1012 u.str.is_string(sub1, str1); 1013 u.str.is_string(sub2, str2); 1014 SASSERT(str1.length() == 1); 1015 SASSERT(str2.length() == 1); 1016 expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); 1017 return retval; 1018 } else if (u.re.is_full_char(re)) { 1019 expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); 1020 return retval; 1021 } else if (u.re.is_full_seq(re)) { 1022 // match any unbounded string 1023 expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); 1024 return retval; 1025 } else if (u.re.is_complement(re)) { 1026 // skip complement for now, in general this is difficult to predict 1027 expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); 1028 return retval; 1029 } else { 1030 TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, m) << std::endl;); 1031 expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); 1032 return retval; 1033 } 1034 } 1035 1036 /* 1037 * Assert initial lower and upper bounds for the positive constraint (str in re) corresponding 1038 * to the automaton `aut`. 1039 * This asserts a constraint of the form: 1040 * str_in_re --> (len(str) ?= 0 OR len(str) >= lb) AND len(str) <= ub 1041 * where the upper bound clause is omitted if the upper bound doesn't exist 1042 * and the equality with 0 is based on whether solutions of length 0 are allowed. 1043 */ find_automaton_initial_bounds(expr * str_in_re,eautomaton * aut)1044 void theory_str::find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut) { 1045 ENSURE(aut != nullptr); 1046 1047 expr_ref_vector rhs(m); 1048 expr * str = nullptr; 1049 expr * re = nullptr; 1050 u.str.is_in_re(str_in_re, str, re); 1051 expr_ref strlen(mk_strlen(str), m); 1052 1053 // lower bound first 1054 rational nonzero_lower_bound; 1055 bool zero_sol_exists = refine_automaton_lower_bound(aut, rational::zero(), nonzero_lower_bound); 1056 if (zero_sol_exists) { 1057 regex_last_lower_bound.insert(str, rational::zero()); 1058 // solution at 0 1059 if (!nonzero_lower_bound.is_minus_one()) { 1060 expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m); 1061 expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m); 1062 rhs.push_back(m.mk_or(rhs1, rhs2)); 1063 } else { 1064 // length of solution can ONLY be 0 1065 expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m); 1066 rhs.push_back(rhs1); 1067 } 1068 } else { 1069 // no solution at 0 1070 if (!nonzero_lower_bound.is_minus_one()) { 1071 regex_last_lower_bound.insert(str, nonzero_lower_bound); 1072 expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m); 1073 rhs.push_back(rhs2); 1074 } else { 1075 // probably no solutions at all; just assume that 0 is a (safe) lower bound 1076 regex_last_lower_bound.insert(str, rational::zero()); 1077 rhs.reset(); 1078 } 1079 } 1080 // TODO upper bound check 1081 1082 if (!rhs.empty()) { 1083 expr_ref lhs(str_in_re, m); 1084 expr_ref _rhs(mk_and(rhs), m); 1085 assert_implication(lhs, _rhs); 1086 } 1087 } 1088 1089 /* 1090 * Refine the lower bound on the length of a solution to a given automaton. 1091 * The method returns TRUE if a solution of length `current_lower_bound` exists, 1092 * and FALSE otherwise. In addition, the reference parameter `refined_lower_bound` 1093 * is assigned the length of the shortest solution longer than `current_lower_bound` 1094 * if it exists, or -1 otherwise. 1095 */ refine_automaton_lower_bound(eautomaton * aut,rational current_lower_bound,rational & refined_lower_bound)1096 bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) { 1097 ENSURE(aut != nullptr); 1098 1099 if (aut->final_states().empty()) { 1100 // no solutions at all 1101 refined_lower_bound = rational::minus_one(); 1102 return false; 1103 } 1104 1105 // from here we assume that there is a final state reachable from the initial state 1106 1107 unsigned_vector search_queue; 1108 // populate search_queue with all states reachable from the epsilon-closure of start state 1109 aut->get_epsilon_closure(aut->init(), search_queue); 1110 1111 unsigned search_depth = 0; 1112 hashtable<unsigned, unsigned_hash, default_eq<unsigned>> next_states; 1113 unsigned_vector next_search_queue; 1114 1115 bool found_solution_at_lower_bound = false; 1116 1117 while (!search_queue.empty()) { 1118 // if we are at the lower bound, check for final states 1119 if (search_depth == current_lower_bound.get_unsigned()) { 1120 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { 1121 unsigned state = *it; 1122 if (aut->is_final_state(state)) { 1123 found_solution_at_lower_bound = true; 1124 break; 1125 } 1126 } 1127 // end phase 1 1128 break; 1129 } 1130 next_states.reset(); 1131 next_search_queue.clear(); 1132 // move one step along all states 1133 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { 1134 unsigned src = *it; 1135 eautomaton::moves next_moves; 1136 aut->get_moves_from(src, next_moves, true); 1137 for (eautomaton::moves::iterator move_it = next_moves.begin(); 1138 move_it != next_moves.end(); ++move_it) { 1139 unsigned dst = move_it->dst(); 1140 if (!next_states.contains(dst)) { 1141 next_states.insert(dst); 1142 next_search_queue.push_back(dst); 1143 } 1144 } 1145 } 1146 search_queue.clear(); 1147 search_queue.append(next_search_queue); 1148 search_depth += 1; 1149 } // !search_queue.empty() 1150 1151 // if we got here before reaching the lower bound, 1152 // there aren't any solutions at or above it, so stop 1153 if (search_depth < current_lower_bound.get_unsigned()) { 1154 refined_lower_bound = rational::minus_one(); 1155 return false; 1156 } 1157 1158 // phase 2: continue exploring the automaton above the lower bound 1159 SASSERT(search_depth == current_lower_bound.get_unsigned()); 1160 1161 while (!search_queue.empty()) { 1162 if (search_depth > current_lower_bound.get_unsigned()) { 1163 // check if we have found a solution above the lower bound 1164 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { 1165 unsigned state = *it; 1166 if (aut->is_final_state(state)) { 1167 // this is a solution at a depth higher than the lower bound 1168 refined_lower_bound = rational(search_depth); 1169 return found_solution_at_lower_bound; 1170 } 1171 } 1172 } 1173 next_states.reset(); 1174 next_search_queue.clear(); 1175 // move one step along all states 1176 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { 1177 unsigned src = *it; 1178 eautomaton::moves next_moves; 1179 aut->get_moves_from(src, next_moves, true); 1180 for (eautomaton::moves::iterator move_it = next_moves.begin(); 1181 move_it != next_moves.end(); ++move_it) { 1182 unsigned dst = move_it->dst(); 1183 if (!next_states.contains(dst)) { 1184 next_states.insert(dst); 1185 next_search_queue.push_back(dst); 1186 } 1187 } 1188 } 1189 search_queue.clear(); 1190 search_queue.append(next_search_queue); 1191 search_depth += 1; 1192 } 1193 // if we reached this point, we explored the whole automaton and didn't find any 1194 // solutions above the lower bound 1195 refined_lower_bound = rational::minus_one(); 1196 return found_solution_at_lower_bound; 1197 } 1198 1199 /* 1200 * Refine the upper bound on the length of a solution to a given automaton. 1201 * The method returns TRUE if a solution of length `current_upper_bound` exists, 1202 * and FALSE otherwise. In addition, the reference parameter `refined_upper_bound` 1203 * is assigned the length of the longest solution shorter than `current_upper_bound`, 1204 * if a shorter solution exists, or -1 otherwise. 1205 */ refine_automaton_upper_bound(eautomaton * aut,rational current_upper_bound,rational & refined_upper_bound)1206 bool theory_str::refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound) { 1207 ENSURE(aut != nullptr); 1208 1209 if (aut->final_states().empty()) { 1210 // no solutions at all! 1211 refined_upper_bound = rational::minus_one(); 1212 return false; 1213 } 1214 1215 // from here we assume there is a final state reachable from the initial state 1216 unsigned_vector search_queue; 1217 // populate search queue with all states reachable from the epsilon-closure of the start state 1218 aut->get_epsilon_closure(aut->init(), search_queue); 1219 1220 rational last_solution_depth = rational::minus_one(); 1221 bool found_solution_at_upper_bound = false; 1222 1223 unsigned search_depth = 0; 1224 hashtable<unsigned, unsigned_hash, default_eq<unsigned> > next_states; 1225 unsigned_vector next_search_queue; 1226 1227 while(!search_queue.empty()) { 1228 // see if any of the current states are final 1229 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { 1230 unsigned src = *it; 1231 if (aut->is_final_state(src)) { 1232 if (search_depth == current_upper_bound.get_unsigned()) { 1233 found_solution_at_upper_bound = true; 1234 } else { 1235 last_solution_depth = rational(search_depth); 1236 } 1237 break; 1238 } 1239 } 1240 1241 if (search_depth == current_upper_bound.get_unsigned()) { 1242 break; 1243 } 1244 1245 next_states.reset(); 1246 next_search_queue.clear(); 1247 // move one step along all states 1248 for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { 1249 unsigned src = *it; 1250 eautomaton::moves next_moves; 1251 aut->get_moves_from(src, next_moves, true); 1252 for (eautomaton::moves::iterator moves_it = next_moves.begin(); 1253 moves_it != next_moves.end(); ++moves_it) { 1254 unsigned dst = moves_it->dst(); 1255 if (!next_states.contains(dst)) { 1256 next_states.insert(dst); 1257 next_search_queue.push_back(dst); 1258 } 1259 } 1260 } 1261 search_queue.clear(); 1262 search_queue.append(next_search_queue); 1263 search_depth += 1; 1264 } //!search_queue.empty() 1265 1266 refined_upper_bound = last_solution_depth; 1267 return found_solution_at_upper_bound; 1268 } 1269 aut_path_add_next(u_map<expr * > & next,expr_ref_vector & trail,unsigned idx,expr * cond)1270 void theory_str::aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond) { 1271 expr* acc; 1272 if (!get_manager().is_true(cond) && next.find(idx, acc)) { 1273 expr* args[2] = { cond, acc }; 1274 cond = mk_or(get_manager(), 2, args); 1275 } 1276 trail.push_back(cond); 1277 next.insert(idx, cond); 1278 } 1279 aut_path_rewrite_constraint(expr * cond,expr * ch_var)1280 expr_ref theory_str::aut_path_rewrite_constraint(expr * cond, expr * ch_var) { 1281 1282 expr_ref retval(m); 1283 1284 unsigned char_val = 0; 1285 1286 expr * lhs; 1287 expr * rhs; 1288 1289 if (u.is_const_char(cond, char_val)) { 1290 SASSERT(char_val < 256); 1291 TRACE("str", tout << "rewrite character constant " << char_val << std::endl;); 1292 zstring str_const(char_val); 1293 retval = u.str.mk_string(str_const); 1294 return retval; 1295 } else if (is_var(cond)) { 1296 TRACE("str", tout << "substitute var" << std::endl;); 1297 retval = ch_var; 1298 return retval; 1299 } else if (m.is_eq(cond, lhs, rhs)) { 1300 // handle this specially because the sort of the equality will change 1301 expr_ref new_lhs(aut_path_rewrite_constraint(lhs, ch_var), m); 1302 SASSERT(new_lhs); 1303 expr_ref new_rhs(aut_path_rewrite_constraint(rhs, ch_var), m); 1304 SASSERT(new_rhs); 1305 retval = ctx.mk_eq_atom(new_lhs, new_rhs); 1306 return retval; 1307 } else if (m.is_bool(cond)) { 1308 TRACE("str", tout << "rewrite boolean term " << mk_pp(cond, m) << std::endl;); 1309 app * a_cond = to_app(cond); 1310 expr_ref_vector rewritten_args(m); 1311 for (unsigned i = 0; i < a_cond->get_num_args(); ++i) { 1312 expr * argI = a_cond->get_arg(i); 1313 expr_ref new_arg(aut_path_rewrite_constraint(argI, ch_var), m); 1314 SASSERT(new_arg); 1315 rewritten_args.push_back(new_arg); 1316 } 1317 retval = m.mk_app(a_cond->get_decl(), rewritten_args.data()); 1318 TRACE("str", tout << "final rewritten term is " << mk_pp(retval, m) << std::endl;); 1319 return retval; 1320 } else { 1321 TRACE("str", tout << "ERROR: unrecognized automaton path constraint " << mk_pp(cond, m) << ", cannot translate" << std::endl;); 1322 retval = nullptr; 1323 return retval; 1324 } 1325 } 1326 1327 /* 1328 * Create finite path constraints for the string variable `str` with respect to the automaton `aut`. 1329 * The returned expression is the right-hand side of a constraint of the form 1330 * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints). 1331 * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...), 1332 * are returned in `characterConstraints`. 1333 */ generate_regex_path_constraints(expr * stringTerm,eautomaton * aut,rational lenVal,expr_ref & characterConstraints)1334 expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) { 1335 ENSURE(aut != nullptr); 1336 1337 if (lenVal.is_zero()) { 1338 // if any state in the epsilon-closure of the start state is accepting, 1339 // then the empty string is in this language 1340 unsigned_vector states; 1341 bool has_final = false; 1342 aut->get_epsilon_closure(aut->init(), states); 1343 for (unsigned i = 0; i < states.size() && !has_final; ++i) { 1344 has_final = aut->is_final_state(states[i]); 1345 } 1346 if (has_final) { 1347 // empty string is OK, assert axiom 1348 expr_ref rhs(ctx.mk_eq_atom(stringTerm, mk_string("")), m); 1349 SASSERT(rhs); 1350 //regex_automata_assertions.insert(stringTerm, final_axiom); 1351 //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) ); 1352 return rhs; 1353 } else { 1354 // negate -- the empty string isn't in the language 1355 //expr_ref conflict(m.mk_not(mk_and(toplevel_lhs)), m); 1356 //assert_axiom(conflict); 1357 expr_ref conflict(m.mk_false(), m); 1358 return conflict; 1359 } 1360 } // lenVal.is_zero() 1361 1362 expr_ref_vector pathChars(m); 1363 expr_ref_vector pathChars_len_constraints(m); 1364 1365 // reuse character terms over the same string 1366 if (string_chars.contains(stringTerm)) { 1367 // find out whether we have enough characters already 1368 ptr_vector<expr> old_chars; 1369 string_chars.find(stringTerm, old_chars); 1370 if (old_chars.size() < lenVal.get_unsigned()) { 1371 for (unsigned i = old_chars.size(); i < lenVal.get_unsigned(); ++i) { 1372 std::stringstream ss; 1373 ss << "ch" << i; 1374 expr_ref ch(mk_str_var(ss.str()), m); 1375 m_trail.push_back(ch); 1376 old_chars.push_back(ch); 1377 } 1378 } 1379 string_chars.insert(stringTerm, old_chars); 1380 // now we're guaranteed to have at least the right number of characters in old_chars 1381 for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { 1382 expr_ref ch(old_chars.get(i), m); 1383 refresh_theory_var(ch); 1384 pathChars.push_back(ch); 1385 pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); 1386 } 1387 } else { 1388 ptr_vector<expr> new_chars; 1389 for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { 1390 std::stringstream ss; 1391 ss << "ch" << i; 1392 expr_ref ch(mk_str_var(ss.str()), m); 1393 pathChars.push_back(ch); 1394 pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); 1395 new_chars.push_back(ch); 1396 } 1397 string_chars.insert(stringTerm, new_chars); 1398 } 1399 1400 // modification of code in seq_rewriter::mk_str_in_regexp() 1401 expr_ref_vector trail(m); 1402 u_map<expr*> maps[2]; 1403 bool select_map = false; 1404 expr_ref ch(m), cond(m); 1405 eautomaton::moves mvs; 1406 maps[0].insert(aut->init(), m.mk_true()); 1407 // is_accepted(a, aut) & some state in frontier is final. 1408 for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { 1409 u_map<expr*>& frontier = maps[select_map]; 1410 u_map<expr*>& next = maps[!select_map]; 1411 select_map = !select_map; 1412 ch = pathChars.get(i); 1413 next.reset(); 1414 u_map<expr*>::iterator it = frontier.begin(), end = frontier.end(); 1415 for (; it != end; ++it) { 1416 mvs.reset(); 1417 unsigned state = it->m_key; 1418 expr* acc = it->m_value; 1419 aut->get_moves_from(state, mvs, false); 1420 for (unsigned j = 0; j < mvs.size(); ++j) { 1421 eautomaton::move const& mv = mvs[j]; 1422 SASSERT(mv.t()); 1423 if (mv.t()->is_char() && m.is_value(mv.t()->get_char())) { 1424 // change this to a string constraint 1425 expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_char(), ch); 1426 SASSERT(cond_rhs); 1427 cond = ctx.mk_eq_atom(ch, cond_rhs); 1428 SASSERT(cond); 1429 expr * args[2] = {cond, acc}; 1430 cond = mk_and(m, 2, args); 1431 aut_path_add_next(next, trail, mv.dst(), cond); 1432 } else if (mv.t()->is_range()) { 1433 expr_ref range_lo(mv.t()->get_lo(), m); 1434 expr_ref range_hi(mv.t()->get_hi(), m); 1435 1436 unsigned lo_val, hi_val; 1437 1438 if (u.is_const_char(range_lo, lo_val) && u.is_const_char(range_hi, hi_val)) { 1439 TRACE("str", tout << "make range predicate from " << lo_val << " to " << hi_val << std::endl;); 1440 expr_ref cond_rhs(m); 1441 expr_ref_vector cond_rhs_terms(m); 1442 for (unsigned i = lo_val; i <= hi_val; ++i) { 1443 zstring str_const(i); 1444 expr_ref str_expr(u.str.mk_string(str_const), m); 1445 cond_rhs_terms.push_back(ctx.mk_eq_atom(ch, str_expr)); 1446 } 1447 cond_rhs = mk_or(cond_rhs_terms); 1448 SASSERT(cond_rhs); 1449 expr * args[2] = {cond_rhs, acc}; 1450 cond = mk_and(m, 2, args); 1451 aut_path_add_next(next, trail, mv.dst(), cond); 1452 } else { 1453 TRACE("str", tout << "warning: non-bitvectors in automaton range predicate" << std::endl;); 1454 UNREACHABLE(); 1455 } 1456 } else if (mv.t()->is_pred()) { 1457 // rewrite this constraint over string terms 1458 expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_pred(), ch); 1459 SASSERT(cond_rhs); 1460 1461 if (m.is_false(cond_rhs)) { 1462 continue; 1463 } else if (m.is_true(cond_rhs)) { 1464 aut_path_add_next(next, trail, mv.dst(), acc); 1465 continue; 1466 } 1467 expr * args[2] = {cond_rhs, acc}; 1468 cond = mk_and(m, 2, args); 1469 aut_path_add_next(next, trail, mv.dst(), cond); 1470 } 1471 } 1472 } 1473 } 1474 u_map<expr*> const& frontier = maps[select_map]; 1475 u_map<expr*>::iterator it = frontier.begin(), end = frontier.end(); 1476 expr_ref_vector ors(m); 1477 for (; it != end; ++it) { 1478 unsigned_vector states; 1479 bool has_final = false; 1480 aut->get_epsilon_closure(it->m_key, states); 1481 for (unsigned i = 0; i < states.size() && !has_final; ++i) { 1482 has_final = aut->is_final_state(states[i]); 1483 } 1484 if (has_final) { 1485 ors.push_back(it->m_value); 1486 } 1487 } 1488 expr_ref result(mk_or(ors)); 1489 TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";); 1490 1491 expr_ref concat_rhs(m); 1492 if (pathChars.size() == 1) { 1493 concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); 1494 } else { 1495 expr_ref acc(pathChars.get(0), m); 1496 for (unsigned i = 1; i < pathChars.size(); ++i) { 1497 acc = mk_concat(acc, pathChars.get(i)); 1498 } 1499 concat_rhs = ctx.mk_eq_atom(stringTerm, acc); 1500 } 1501 1502 //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); 1503 characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs); 1504 //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); 1505 //regex_automata_assertions.insert(stringTerm, final_axiom); 1506 //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) ); 1507 return result; 1508 } 1509 regex_inc_counter(obj_map<expr,unsigned> & counter_map,expr * key)1510 void theory_str::regex_inc_counter(obj_map<expr, unsigned> & counter_map, expr * key) { 1511 unsigned old_v; 1512 if (counter_map.find(key, old_v)) { 1513 unsigned new_v = old_v += 1; 1514 counter_map.insert(key, new_v); 1515 } else { 1516 counter_map.insert(key, 1); 1517 } 1518 } 1519 regex_get_counter(obj_map<expr,unsigned> & counter_map,expr * key)1520 unsigned theory_str::regex_get_counter(obj_map<expr, unsigned> & counter_map, expr * key) { 1521 unsigned v; 1522 if (counter_map.find(key, v)) { 1523 return v; 1524 } else { 1525 counter_map.insert(key, 0); 1526 return 0; 1527 } 1528 } 1529 1530 }; /* namespace smt */ 1531