1 /*------------------------------------------------------------------------- 2 * 3 * like_support.c 4 * Planner support functions for LIKE, regex, and related operators. 5 * 6 * These routines handle special optimization of operators that can be 7 * used with index scans even though they are not known to the executor's 8 * indexscan machinery. The key idea is that these operators allow us 9 * to derive approximate indexscan qual clauses, such that any tuples 10 * that pass the operator clause itself must also satisfy the simpler 11 * indexscan condition(s). Then we can use the indexscan machinery 12 * to avoid scanning as much of the table as we'd otherwise have to, 13 * while applying the original operator as a qpqual condition to ensure 14 * we deliver only the tuples we want. (In essence, we're using a regular 15 * index as if it were a lossy index.) 16 * 17 * An example of what we're doing is 18 * textfield LIKE 'abc%def' 19 * from which we can generate the indexscanable conditions 20 * textfield >= 'abc' AND textfield < 'abd' 21 * which allow efficient scanning of an index on textfield. 22 * (In reality, character set and collation issues make the transformation 23 * from LIKE to indexscan limits rather harder than one might think ... 24 * but that's the basic idea.) 25 * 26 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 27 * Portions Copyright (c) 1994, Regents of the University of California 28 * 29 * 30 * IDENTIFICATION 31 * src/backend/utils/adt/like_support.c 32 * 33 *------------------------------------------------------------------------- 34 */ 35 #include "postgres.h" 36 37 #include <math.h> 38 39 #include "access/htup_details.h" 40 #include "access/stratnum.h" 41 #include "catalog/pg_collation.h" 42 #include "catalog/pg_operator.h" 43 #include "catalog/pg_opfamily.h" 44 #include "catalog/pg_statistic.h" 45 #include "catalog/pg_type.h" 46 #include "mb/pg_wchar.h" 47 #include "nodes/makefuncs.h" 48 #include "nodes/nodeFuncs.h" 49 #include "nodes/supportnodes.h" 50 #include "utils/builtins.h" 51 #include "utils/datum.h" 52 #include "utils/lsyscache.h" 53 #include "utils/pg_locale.h" 54 #include "utils/selfuncs.h" 55 #include "utils/varlena.h" 56 57 58 typedef enum 59 { 60 Pattern_Type_Like, 61 Pattern_Type_Like_IC, 62 Pattern_Type_Regex, 63 Pattern_Type_Regex_IC, 64 Pattern_Type_Prefix 65 } Pattern_Type; 66 67 typedef enum 68 { 69 Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact 70 } Pattern_Prefix_Status; 71 72 static Node *like_regex_support(Node *rawreq, Pattern_Type ptype); 73 static List *match_pattern_prefix(Node *leftop, 74 Node *rightop, 75 Pattern_Type ptype, 76 Oid expr_coll, 77 Oid opfamily, 78 Oid indexcollation); 79 static double patternsel_common(PlannerInfo *root, 80 Oid oprid, 81 Oid opfuncid, 82 List *args, 83 int varRelid, 84 Oid collation, 85 Pattern_Type ptype, 86 bool negate); 87 static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, 88 Pattern_Type ptype, 89 Oid collation, 90 Const **prefix, 91 Selectivity *rest_selec); 92 static Selectivity prefix_selectivity(PlannerInfo *root, 93 VariableStatData *vardata, 94 Oid eqopr, Oid ltopr, Oid geopr, 95 Oid collation, 96 Const *prefixcon); 97 static Selectivity like_selectivity(const char *patt, int pattlen, 98 bool case_insensitive); 99 static Selectivity regex_selectivity(const char *patt, int pattlen, 100 bool case_insensitive, 101 int fixed_prefix_len); 102 static int pattern_char_isalpha(char c, bool is_multibyte, 103 pg_locale_t locale, bool locale_is_c); 104 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc, 105 Oid collation); 106 static Datum string_to_datum(const char *str, Oid datatype); 107 static Const *string_to_const(const char *str, Oid datatype); 108 static Const *string_to_bytea_const(const char *str, size_t str_len); 109 110 111 /* 112 * Planner support functions for LIKE, regex, and related operators 113 */ 114 Datum 115 textlike_support(PG_FUNCTION_ARGS) 116 { 117 Node *rawreq = (Node *) PG_GETARG_POINTER(0); 118 119 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like)); 120 } 121 122 Datum 123 texticlike_support(PG_FUNCTION_ARGS) 124 { 125 Node *rawreq = (Node *) PG_GETARG_POINTER(0); 126 127 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC)); 128 } 129 130 Datum 131 textregexeq_support(PG_FUNCTION_ARGS) 132 { 133 Node *rawreq = (Node *) PG_GETARG_POINTER(0); 134 135 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex)); 136 } 137 138 Datum 139 texticregexeq_support(PG_FUNCTION_ARGS) 140 { 141 Node *rawreq = (Node *) PG_GETARG_POINTER(0); 142 143 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC)); 144 } 145 146 /* Common code for the above */ 147 static Node * 148 like_regex_support(Node *rawreq, Pattern_Type ptype) 149 { 150 Node *ret = NULL; 151 152 if (IsA(rawreq, SupportRequestSelectivity)) 153 { 154 /* 155 * Make a selectivity estimate for a function call, just as we'd do if 156 * the call was via the corresponding operator. 157 */ 158 SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq; 159 Selectivity s1; 160 161 if (req->is_join) 162 { 163 /* 164 * For the moment we just punt. If patternjoinsel is ever 165 * improved to do better, this should be made to call it. 166 */ 167 s1 = DEFAULT_MATCH_SEL; 168 } 169 else 170 { 171 /* Share code with operator restriction selectivity functions */ 172 s1 = patternsel_common(req->root, 173 InvalidOid, 174 req->funcid, 175 req->args, 176 req->varRelid, 177 req->inputcollid, 178 ptype, 179 false); 180 } 181 req->selectivity = s1; 182 ret = (Node *) req; 183 } 184 else if (IsA(rawreq, SupportRequestIndexCondition)) 185 { 186 /* Try to convert operator/function call to index conditions */ 187 SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq; 188 189 /* 190 * Currently we have no "reverse" match operators with the pattern on 191 * the left, so we only need consider cases with the indexkey on the 192 * left. 193 */ 194 if (req->indexarg != 0) 195 return NULL; 196 197 if (is_opclause(req->node)) 198 { 199 OpExpr *clause = (OpExpr *) req->node; 200 201 Assert(list_length(clause->args) == 2); 202 ret = (Node *) 203 match_pattern_prefix((Node *) linitial(clause->args), 204 (Node *) lsecond(clause->args), 205 ptype, 206 clause->inputcollid, 207 req->opfamily, 208 req->indexcollation); 209 } 210 else if (is_funcclause(req->node)) /* be paranoid */ 211 { 212 FuncExpr *clause = (FuncExpr *) req->node; 213 214 Assert(list_length(clause->args) == 2); 215 ret = (Node *) 216 match_pattern_prefix((Node *) linitial(clause->args), 217 (Node *) lsecond(clause->args), 218 ptype, 219 clause->inputcollid, 220 req->opfamily, 221 req->indexcollation); 222 } 223 } 224 225 return ret; 226 } 227 228 /* 229 * match_pattern_prefix 230 * Try to generate an indexqual for a LIKE or regex operator. 231 */ 232 static List * 233 match_pattern_prefix(Node *leftop, 234 Node *rightop, 235 Pattern_Type ptype, 236 Oid expr_coll, 237 Oid opfamily, 238 Oid indexcollation) 239 { 240 List *result; 241 Const *patt; 242 Const *prefix; 243 Pattern_Prefix_Status pstatus; 244 Oid ldatatype; 245 Oid rdatatype; 246 Oid eqopr; 247 Oid ltopr; 248 Oid geopr; 249 bool collation_aware; 250 Expr *expr; 251 FmgrInfo ltproc; 252 Const *greaterstr; 253 254 /* 255 * Can't do anything with a non-constant or NULL pattern argument. 256 * 257 * Note that since we restrict ourselves to cases with a hard constant on 258 * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry 259 * about verifying that. 260 */ 261 if (!IsA(rightop, Const) || 262 ((Const *) rightop)->constisnull) 263 return NIL; 264 patt = (Const *) rightop; 265 266 /* 267 * Not supported if the expression collation is nondeterministic. The 268 * optimized equality or prefix tests use bytewise comparisons, which is 269 * not consistent with nondeterministic collations. The actual 270 * pattern-matching implementation functions will later error out that 271 * pattern-matching is not supported with nondeterministic collations. (We 272 * could also error out here, but by doing it later we get more precise 273 * error messages.) (It should be possible to support at least 274 * Pattern_Prefix_Exact, but no point as long as the actual 275 * pattern-matching implementations don't support it.) 276 * 277 * expr_coll is not set for a non-collation-aware data type such as bytea. 278 */ 279 if (expr_coll && !get_collation_isdeterministic(expr_coll)) 280 return NIL; 281 282 /* 283 * Try to extract a fixed prefix from the pattern. 284 */ 285 pstatus = pattern_fixed_prefix(patt, ptype, expr_coll, 286 &prefix, NULL); 287 288 /* fail if no fixed prefix */ 289 if (pstatus == Pattern_Prefix_None) 290 return NIL; 291 292 /* 293 * Identify the operators we want to use, based on the type of the 294 * left-hand argument. Usually these are just the type's regular 295 * comparison operators, but if we are considering one of the semi-legacy 296 * "pattern" opclasses, use the "pattern" operators instead. Those are 297 * not collation-sensitive but always use C collation, as we want. The 298 * selected operators also determine the needed type of the prefix 299 * constant. 300 */ 301 ldatatype = exprType(leftop); 302 switch (ldatatype) 303 { 304 case TEXTOID: 305 if (opfamily == TEXT_PATTERN_BTREE_FAM_OID || 306 opfamily == TEXT_SPGIST_FAM_OID) 307 { 308 eqopr = TextEqualOperator; 309 ltopr = TextPatternLessOperator; 310 geopr = TextPatternGreaterEqualOperator; 311 collation_aware = false; 312 } 313 else 314 { 315 eqopr = TextEqualOperator; 316 ltopr = TextLessOperator; 317 geopr = TextGreaterEqualOperator; 318 collation_aware = true; 319 } 320 rdatatype = TEXTOID; 321 break; 322 case NAMEOID: 323 324 /* 325 * Note that here, we need the RHS type to be text, so that the 326 * comparison value isn't improperly truncated to NAMEDATALEN. 327 */ 328 eqopr = NameEqualTextOperator; 329 ltopr = NameLessTextOperator; 330 geopr = NameGreaterEqualTextOperator; 331 collation_aware = true; 332 rdatatype = TEXTOID; 333 break; 334 case BPCHAROID: 335 if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID) 336 { 337 eqopr = BpcharEqualOperator; 338 ltopr = BpcharPatternLessOperator; 339 geopr = BpcharPatternGreaterEqualOperator; 340 collation_aware = false; 341 } 342 else 343 { 344 eqopr = BpcharEqualOperator; 345 ltopr = BpcharLessOperator; 346 geopr = BpcharGreaterEqualOperator; 347 collation_aware = true; 348 } 349 rdatatype = BPCHAROID; 350 break; 351 case BYTEAOID: 352 eqopr = ByteaEqualOperator; 353 ltopr = ByteaLessOperator; 354 geopr = ByteaGreaterEqualOperator; 355 collation_aware = false; 356 rdatatype = BYTEAOID; 357 break; 358 default: 359 /* Can't get here unless we're attached to the wrong operator */ 360 return NIL; 361 } 362 363 /* 364 * If necessary, verify that the index's collation behavior is compatible. 365 * For an exact-match case, we don't have to be picky. Otherwise, insist 366 * that the index collation be "C". Note that here we are looking at the 367 * index's collation, not the expression's collation -- this test is *not* 368 * dependent on the LIKE/regex operator's collation. 369 */ 370 if (collation_aware) 371 { 372 if (!(pstatus == Pattern_Prefix_Exact || 373 lc_collate_is_c(indexcollation))) 374 return NIL; 375 } 376 377 /* 378 * If necessary, coerce the prefix constant to the right type. The given 379 * prefix constant is either text or bytea type, therefore the only case 380 * where we need to do anything is when converting text to bpchar. Those 381 * two types are binary-compatible, so relabeling the Const node is 382 * sufficient. 383 */ 384 if (prefix->consttype != rdatatype) 385 { 386 Assert(prefix->consttype == TEXTOID && 387 rdatatype == BPCHAROID); 388 prefix->consttype = rdatatype; 389 } 390 391 /* 392 * If we found an exact-match pattern, generate an "=" indexqual. 393 * 394 * Here and below, check to see whether the desired operator is actually 395 * supported by the index opclass, and fail quietly if not. This allows 396 * us to not be concerned with specific opclasses (except for the legacy 397 * "pattern" cases); any index that correctly implements the operators 398 * will work. 399 */ 400 if (pstatus == Pattern_Prefix_Exact) 401 { 402 if (!op_in_opfamily(eqopr, opfamily)) 403 return NIL; 404 expr = make_opclause(eqopr, BOOLOID, false, 405 (Expr *) leftop, (Expr *) prefix, 406 InvalidOid, indexcollation); 407 result = list_make1(expr); 408 return result; 409 } 410 411 /* 412 * Otherwise, we have a nonempty required prefix of the values. 413 * 414 * We can always say "x >= prefix". 415 */ 416 if (!op_in_opfamily(geopr, opfamily)) 417 return NIL; 418 expr = make_opclause(geopr, BOOLOID, false, 419 (Expr *) leftop, (Expr *) prefix, 420 InvalidOid, indexcollation); 421 result = list_make1(expr); 422 423 /*------- 424 * If we can create a string larger than the prefix, we can say 425 * "x < greaterstr". NB: we rely on make_greater_string() to generate 426 * a guaranteed-greater string, not just a probably-greater string. 427 * In general this is only guaranteed in C locale, so we'd better be 428 * using a C-locale index collation. 429 *------- 430 */ 431 if (!op_in_opfamily(ltopr, opfamily)) 432 return result; 433 fmgr_info(get_opcode(ltopr), <proc); 434 greaterstr = make_greater_string(prefix, <proc, indexcollation); 435 if (greaterstr) 436 { 437 expr = make_opclause(ltopr, BOOLOID, false, 438 (Expr *) leftop, (Expr *) greaterstr, 439 InvalidOid, indexcollation); 440 result = lappend(result, expr); 441 } 442 443 return result; 444 } 445 446 447 /* 448 * patternsel_common - generic code for pattern-match restriction selectivity. 449 * 450 * To support using this from either the operator or function paths, caller 451 * may pass either operator OID or underlying function OID; we look up the 452 * latter from the former if needed. (We could just have patternsel() call 453 * get_opcode(), but the work would be wasted if we don't have a need to 454 * compare a fixed prefix to the pg_statistic data.) 455 * 456 * Note that oprid and/or opfuncid should be for the positive-match operator 457 * even when negate is true. 458 */ 459 static double 460 patternsel_common(PlannerInfo *root, 461 Oid oprid, 462 Oid opfuncid, 463 List *args, 464 int varRelid, 465 Oid collation, 466 Pattern_Type ptype, 467 bool negate) 468 { 469 VariableStatData vardata; 470 Node *other; 471 bool varonleft; 472 Datum constval; 473 Oid consttype; 474 Oid vartype; 475 Oid rdatatype; 476 Oid eqopr; 477 Oid ltopr; 478 Oid geopr; 479 Pattern_Prefix_Status pstatus; 480 Const *patt; 481 Const *prefix = NULL; 482 Selectivity rest_selec = 0; 483 double nullfrac = 0.0; 484 double result; 485 486 /* 487 * Initialize result to the appropriate default estimate depending on 488 * whether it's a match or not-match operator. 489 */ 490 if (negate) 491 result = 1.0 - DEFAULT_MATCH_SEL; 492 else 493 result = DEFAULT_MATCH_SEL; 494 495 /* 496 * If expression is not variable op constant, then punt and return the 497 * default estimate. 498 */ 499 if (!get_restriction_variable(root, args, varRelid, 500 &vardata, &other, &varonleft)) 501 return result; 502 if (!varonleft || !IsA(other, Const)) 503 { 504 ReleaseVariableStats(vardata); 505 return result; 506 } 507 508 /* 509 * If the constant is NULL, assume operator is strict and return zero, ie, 510 * operator will never return TRUE. (It's zero even for a negator op.) 511 */ 512 if (((Const *) other)->constisnull) 513 { 514 ReleaseVariableStats(vardata); 515 return 0.0; 516 } 517 constval = ((Const *) other)->constvalue; 518 consttype = ((Const *) other)->consttype; 519 520 /* 521 * The right-hand const is type text or bytea for all supported operators. 522 * We do not expect to see binary-compatible types here, since 523 * const-folding should have relabeled the const to exactly match the 524 * operator's declared type. 525 */ 526 if (consttype != TEXTOID && consttype != BYTEAOID) 527 { 528 ReleaseVariableStats(vardata); 529 return result; 530 } 531 532 /* 533 * Similarly, the exposed type of the left-hand side should be one of 534 * those we know. (Do not look at vardata.atttype, which might be 535 * something binary-compatible but different.) We can use it to identify 536 * the comparison operators and the required type of the comparison 537 * constant, much as in match_pattern_prefix(). 538 */ 539 vartype = vardata.vartype; 540 541 switch (vartype) 542 { 543 case TEXTOID: 544 eqopr = TextEqualOperator; 545 ltopr = TextLessOperator; 546 geopr = TextGreaterEqualOperator; 547 rdatatype = TEXTOID; 548 break; 549 case NAMEOID: 550 551 /* 552 * Note that here, we need the RHS type to be text, so that the 553 * comparison value isn't improperly truncated to NAMEDATALEN. 554 */ 555 eqopr = NameEqualTextOperator; 556 ltopr = NameLessTextOperator; 557 geopr = NameGreaterEqualTextOperator; 558 rdatatype = TEXTOID; 559 break; 560 case BPCHAROID: 561 eqopr = BpcharEqualOperator; 562 ltopr = BpcharLessOperator; 563 geopr = BpcharGreaterEqualOperator; 564 rdatatype = BPCHAROID; 565 break; 566 case BYTEAOID: 567 eqopr = ByteaEqualOperator; 568 ltopr = ByteaLessOperator; 569 geopr = ByteaGreaterEqualOperator; 570 rdatatype = BYTEAOID; 571 break; 572 default: 573 /* Can't get here unless we're attached to the wrong operator */ 574 ReleaseVariableStats(vardata); 575 return result; 576 } 577 578 /* 579 * Grab the nullfrac for use below. 580 */ 581 if (HeapTupleIsValid(vardata.statsTuple)) 582 { 583 Form_pg_statistic stats; 584 585 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); 586 nullfrac = stats->stanullfrac; 587 } 588 589 /* 590 * Pull out any fixed prefix implied by the pattern, and estimate the 591 * fractional selectivity of the remainder of the pattern. Unlike many 592 * other selectivity estimators, we use the pattern operator's actual 593 * collation for this step. This is not because we expect the collation 594 * to make a big difference in the selectivity estimate (it seldom would), 595 * but because we want to be sure we cache compiled regexps under the 596 * right cache key, so that they can be re-used at runtime. 597 */ 598 patt = (Const *) other; 599 pstatus = pattern_fixed_prefix(patt, ptype, collation, 600 &prefix, &rest_selec); 601 602 /* 603 * If necessary, coerce the prefix constant to the right type. The only 604 * case where we need to do anything is when converting text to bpchar. 605 * Those two types are binary-compatible, so relabeling the Const node is 606 * sufficient. 607 */ 608 if (prefix && prefix->consttype != rdatatype) 609 { 610 Assert(prefix->consttype == TEXTOID && 611 rdatatype == BPCHAROID); 612 prefix->consttype = rdatatype; 613 } 614 615 if (pstatus == Pattern_Prefix_Exact) 616 { 617 /* 618 * Pattern specifies an exact match, so estimate as for '=' 619 */ 620 result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue, 621 false, true, false); 622 } 623 else 624 { 625 /* 626 * Not exact-match pattern. If we have a sufficiently large 627 * histogram, estimate selectivity for the histogram part of the 628 * population by counting matches in the histogram. If not, estimate 629 * selectivity of the fixed prefix and remainder of pattern 630 * separately, then combine the two to get an estimate of the 631 * selectivity for the part of the column population represented by 632 * the histogram. (For small histograms, we combine these 633 * approaches.) 634 * 635 * We then add up data for any most-common-values values; these are 636 * not in the histogram population, and we can get exact answers for 637 * them by applying the pattern operator, so there's no reason to 638 * approximate. (If the MCVs cover a significant part of the total 639 * population, this gives us a big leg up in accuracy.) 640 */ 641 Selectivity selec; 642 int hist_size; 643 FmgrInfo opproc; 644 double mcv_selec, 645 sumcommon; 646 647 /* Try to use the histogram entries to get selectivity */ 648 if (!OidIsValid(opfuncid)) 649 opfuncid = get_opcode(oprid); 650 fmgr_info(opfuncid, &opproc); 651 652 selec = histogram_selectivity(&vardata, &opproc, collation, 653 constval, true, 654 10, 1, &hist_size); 655 656 /* If not at least 100 entries, use the heuristic method */ 657 if (hist_size < 100) 658 { 659 Selectivity heursel; 660 Selectivity prefixsel; 661 662 if (pstatus == Pattern_Prefix_Partial) 663 prefixsel = prefix_selectivity(root, &vardata, 664 eqopr, ltopr, geopr, 665 collation, 666 prefix); 667 else 668 prefixsel = 1.0; 669 heursel = prefixsel * rest_selec; 670 671 if (selec < 0) /* fewer than 10 histogram entries? */ 672 selec = heursel; 673 else 674 { 675 /* 676 * For histogram sizes from 10 to 100, we combine the 677 * histogram and heuristic selectivities, putting increasingly 678 * more trust in the histogram for larger sizes. 679 */ 680 double hist_weight = hist_size / 100.0; 681 682 selec = selec * hist_weight + heursel * (1.0 - hist_weight); 683 } 684 } 685 686 /* In any case, don't believe extremely small or large estimates. */ 687 if (selec < 0.0001) 688 selec = 0.0001; 689 else if (selec > 0.9999) 690 selec = 0.9999; 691 692 /* 693 * If we have most-common-values info, add up the fractions of the MCV 694 * entries that satisfy MCV OP PATTERN. These fractions contribute 695 * directly to the result selectivity. Also add up the total fraction 696 * represented by MCV entries. 697 */ 698 mcv_selec = mcv_selectivity(&vardata, &opproc, collation, 699 constval, true, 700 &sumcommon); 701 702 /* 703 * Now merge the results from the MCV and histogram calculations, 704 * realizing that the histogram covers only the non-null values that 705 * are not listed in MCV. 706 */ 707 selec *= 1.0 - nullfrac - sumcommon; 708 selec += mcv_selec; 709 result = selec; 710 } 711 712 /* now adjust if we wanted not-match rather than match */ 713 if (negate) 714 result = 1.0 - result - nullfrac; 715 716 /* result should be in range, but make sure... */ 717 CLAMP_PROBABILITY(result); 718 719 if (prefix) 720 { 721 pfree(DatumGetPointer(prefix->constvalue)); 722 pfree(prefix); 723 } 724 725 ReleaseVariableStats(vardata); 726 727 return result; 728 } 729 730 /* 731 * Fix impedance mismatch between SQL-callable functions and patternsel_common 732 */ 733 static double 734 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) 735 { 736 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); 737 Oid operator = PG_GETARG_OID(1); 738 List *args = (List *) PG_GETARG_POINTER(2); 739 int varRelid = PG_GETARG_INT32(3); 740 Oid collation = PG_GET_COLLATION(); 741 742 /* 743 * If this is for a NOT LIKE or similar operator, get the corresponding 744 * positive-match operator and work with that. 745 */ 746 if (negate) 747 { 748 operator = get_negator(operator); 749 if (!OidIsValid(operator)) 750 elog(ERROR, "patternsel called for operator without a negator"); 751 } 752 753 return patternsel_common(root, 754 operator, 755 InvalidOid, 756 args, 757 varRelid, 758 collation, 759 ptype, 760 negate); 761 } 762 763 /* 764 * regexeqsel - Selectivity of regular-expression pattern match. 765 */ 766 Datum 767 regexeqsel(PG_FUNCTION_ARGS) 768 { 769 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false)); 770 } 771 772 /* 773 * icregexeqsel - Selectivity of case-insensitive regex match. 774 */ 775 Datum 776 icregexeqsel(PG_FUNCTION_ARGS) 777 { 778 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false)); 779 } 780 781 /* 782 * likesel - Selectivity of LIKE pattern match. 783 */ 784 Datum 785 likesel(PG_FUNCTION_ARGS) 786 { 787 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false)); 788 } 789 790 /* 791 * prefixsel - selectivity of prefix operator 792 */ 793 Datum 794 prefixsel(PG_FUNCTION_ARGS) 795 { 796 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false)); 797 } 798 799 /* 800 * 801 * iclikesel - Selectivity of ILIKE pattern match. 802 */ 803 Datum 804 iclikesel(PG_FUNCTION_ARGS) 805 { 806 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false)); 807 } 808 809 /* 810 * regexnesel - Selectivity of regular-expression pattern non-match. 811 */ 812 Datum 813 regexnesel(PG_FUNCTION_ARGS) 814 { 815 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true)); 816 } 817 818 /* 819 * icregexnesel - Selectivity of case-insensitive regex non-match. 820 */ 821 Datum 822 icregexnesel(PG_FUNCTION_ARGS) 823 { 824 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true)); 825 } 826 827 /* 828 * nlikesel - Selectivity of LIKE pattern non-match. 829 */ 830 Datum 831 nlikesel(PG_FUNCTION_ARGS) 832 { 833 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true)); 834 } 835 836 /* 837 * icnlikesel - Selectivity of ILIKE pattern non-match. 838 */ 839 Datum 840 icnlikesel(PG_FUNCTION_ARGS) 841 { 842 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true)); 843 } 844 845 /* 846 * patternjoinsel - Generic code for pattern-match join selectivity. 847 */ 848 static double 849 patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) 850 { 851 /* For the moment we just punt. */ 852 return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL; 853 } 854 855 /* 856 * regexeqjoinsel - Join selectivity of regular-expression pattern match. 857 */ 858 Datum 859 regexeqjoinsel(PG_FUNCTION_ARGS) 860 { 861 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false)); 862 } 863 864 /* 865 * icregexeqjoinsel - Join selectivity of case-insensitive regex match. 866 */ 867 Datum 868 icregexeqjoinsel(PG_FUNCTION_ARGS) 869 { 870 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false)); 871 } 872 873 /* 874 * likejoinsel - Join selectivity of LIKE pattern match. 875 */ 876 Datum 877 likejoinsel(PG_FUNCTION_ARGS) 878 { 879 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false)); 880 } 881 882 /* 883 * prefixjoinsel - Join selectivity of prefix operator 884 */ 885 Datum 886 prefixjoinsel(PG_FUNCTION_ARGS) 887 { 888 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false)); 889 } 890 891 /* 892 * iclikejoinsel - Join selectivity of ILIKE pattern match. 893 */ 894 Datum 895 iclikejoinsel(PG_FUNCTION_ARGS) 896 { 897 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false)); 898 } 899 900 /* 901 * regexnejoinsel - Join selectivity of regex non-match. 902 */ 903 Datum 904 regexnejoinsel(PG_FUNCTION_ARGS) 905 { 906 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true)); 907 } 908 909 /* 910 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match. 911 */ 912 Datum 913 icregexnejoinsel(PG_FUNCTION_ARGS) 914 { 915 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true)); 916 } 917 918 /* 919 * nlikejoinsel - Join selectivity of LIKE pattern non-match. 920 */ 921 Datum 922 nlikejoinsel(PG_FUNCTION_ARGS) 923 { 924 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true)); 925 } 926 927 /* 928 * icnlikejoinsel - Join selectivity of ILIKE pattern non-match. 929 */ 930 Datum 931 icnlikejoinsel(PG_FUNCTION_ARGS) 932 { 933 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true)); 934 } 935 936 937 /*------------------------------------------------------------------------- 938 * 939 * Pattern analysis functions 940 * 941 * These routines support analysis of LIKE and regular-expression patterns 942 * by the planner/optimizer. It's important that they agree with the 943 * regular-expression code in backend/regex/ and the LIKE code in 944 * backend/utils/adt/like.c. Also, the computation of the fixed prefix 945 * must be conservative: if we report a string longer than the true fixed 946 * prefix, the query may produce actually wrong answers, rather than just 947 * getting a bad selectivity estimate! 948 * 949 *------------------------------------------------------------------------- 950 */ 951 952 /* 953 * Extract the fixed prefix, if any, for a pattern. 954 * 955 * *prefix is set to a palloc'd prefix string (in the form of a Const node), 956 * or to NULL if no fixed prefix exists for the pattern. 957 * If rest_selec is not NULL, *rest_selec is set to an estimate of the 958 * selectivity of the remainder of the pattern (without any fixed prefix). 959 * The prefix Const has the same type (TEXT or BYTEA) as the input pattern. 960 * 961 * The return value distinguishes no fixed prefix, a partial prefix, 962 * or an exact-match-only pattern. 963 */ 964 965 static Pattern_Prefix_Status 966 like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, 967 Const **prefix_const, Selectivity *rest_selec) 968 { 969 char *match; 970 char *patt; 971 int pattlen; 972 Oid typeid = patt_const->consttype; 973 int pos, 974 match_pos; 975 bool is_multibyte = (pg_database_encoding_max_length() > 1); 976 pg_locale_t locale = 0; 977 bool locale_is_c = false; 978 979 /* the right-hand const is type text or bytea */ 980 Assert(typeid == BYTEAOID || typeid == TEXTOID); 981 982 if (case_insensitive) 983 { 984 if (typeid == BYTEAOID) 985 ereport(ERROR, 986 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 987 errmsg("case insensitive matching not supported on type bytea"))); 988 989 /* If case-insensitive, we need locale info */ 990 if (lc_ctype_is_c(collation)) 991 locale_is_c = true; 992 else if (collation != DEFAULT_COLLATION_OID) 993 { 994 if (!OidIsValid(collation)) 995 { 996 /* 997 * This typically means that the parser could not resolve a 998 * conflict of implicit collations, so report it that way. 999 */ 1000 ereport(ERROR, 1001 (errcode(ERRCODE_INDETERMINATE_COLLATION), 1002 errmsg("could not determine which collation to use for ILIKE"), 1003 errhint("Use the COLLATE clause to set the collation explicitly."))); 1004 } 1005 locale = pg_newlocale_from_collation(collation); 1006 } 1007 } 1008 1009 if (typeid != BYTEAOID) 1010 { 1011 patt = TextDatumGetCString(patt_const->constvalue); 1012 pattlen = strlen(patt); 1013 } 1014 else 1015 { 1016 bytea *bstr = DatumGetByteaPP(patt_const->constvalue); 1017 1018 pattlen = VARSIZE_ANY_EXHDR(bstr); 1019 patt = (char *) palloc(pattlen); 1020 memcpy(patt, VARDATA_ANY(bstr), pattlen); 1021 Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue)); 1022 } 1023 1024 match = palloc(pattlen + 1); 1025 match_pos = 0; 1026 for (pos = 0; pos < pattlen; pos++) 1027 { 1028 /* % and _ are wildcard characters in LIKE */ 1029 if (patt[pos] == '%' || 1030 patt[pos] == '_') 1031 break; 1032 1033 /* Backslash escapes the next character */ 1034 if (patt[pos] == '\\') 1035 { 1036 pos++; 1037 if (pos >= pattlen) 1038 break; 1039 } 1040 1041 /* Stop if case-varying character (it's sort of a wildcard) */ 1042 if (case_insensitive && 1043 pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) 1044 break; 1045 1046 match[match_pos++] = patt[pos]; 1047 } 1048 1049 match[match_pos] = '\0'; 1050 1051 if (typeid != BYTEAOID) 1052 *prefix_const = string_to_const(match, typeid); 1053 else 1054 *prefix_const = string_to_bytea_const(match, match_pos); 1055 1056 if (rest_selec != NULL) 1057 *rest_selec = like_selectivity(&patt[pos], pattlen - pos, 1058 case_insensitive); 1059 1060 pfree(patt); 1061 pfree(match); 1062 1063 /* in LIKE, an empty pattern is an exact match! */ 1064 if (pos == pattlen) 1065 return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ 1066 1067 if (match_pos > 0) 1068 return Pattern_Prefix_Partial; 1069 1070 return Pattern_Prefix_None; 1071 } 1072 1073 static Pattern_Prefix_Status 1074 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, 1075 Const **prefix_const, Selectivity *rest_selec) 1076 { 1077 Oid typeid = patt_const->consttype; 1078 char *prefix; 1079 bool exact; 1080 1081 /* 1082 * Should be unnecessary, there are no bytea regex operators defined. As 1083 * such, it should be noted that the rest of this function has *not* been 1084 * made safe for binary (possibly NULL containing) strings. 1085 */ 1086 if (typeid == BYTEAOID) 1087 ereport(ERROR, 1088 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 1089 errmsg("regular-expression matching not supported on type bytea"))); 1090 1091 /* Use the regexp machinery to extract the prefix, if any */ 1092 prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), 1093 case_insensitive, collation, 1094 &exact); 1095 1096 if (prefix == NULL) 1097 { 1098 *prefix_const = NULL; 1099 1100 if (rest_selec != NULL) 1101 { 1102 char *patt = TextDatumGetCString(patt_const->constvalue); 1103 1104 *rest_selec = regex_selectivity(patt, strlen(patt), 1105 case_insensitive, 1106 0); 1107 pfree(patt); 1108 } 1109 1110 return Pattern_Prefix_None; 1111 } 1112 1113 *prefix_const = string_to_const(prefix, typeid); 1114 1115 if (rest_selec != NULL) 1116 { 1117 if (exact) 1118 { 1119 /* Exact match, so there's no additional selectivity */ 1120 *rest_selec = 1.0; 1121 } 1122 else 1123 { 1124 char *patt = TextDatumGetCString(patt_const->constvalue); 1125 1126 *rest_selec = regex_selectivity(patt, strlen(patt), 1127 case_insensitive, 1128 strlen(prefix)); 1129 pfree(patt); 1130 } 1131 } 1132 1133 pfree(prefix); 1134 1135 if (exact) 1136 return Pattern_Prefix_Exact; /* pattern specifies exact match */ 1137 else 1138 return Pattern_Prefix_Partial; 1139 } 1140 1141 static Pattern_Prefix_Status 1142 pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, 1143 Const **prefix, Selectivity *rest_selec) 1144 { 1145 Pattern_Prefix_Status result; 1146 1147 switch (ptype) 1148 { 1149 case Pattern_Type_Like: 1150 result = like_fixed_prefix(patt, false, collation, 1151 prefix, rest_selec); 1152 break; 1153 case Pattern_Type_Like_IC: 1154 result = like_fixed_prefix(patt, true, collation, 1155 prefix, rest_selec); 1156 break; 1157 case Pattern_Type_Regex: 1158 result = regex_fixed_prefix(patt, false, collation, 1159 prefix, rest_selec); 1160 break; 1161 case Pattern_Type_Regex_IC: 1162 result = regex_fixed_prefix(patt, true, collation, 1163 prefix, rest_selec); 1164 break; 1165 case Pattern_Type_Prefix: 1166 /* Prefix type work is trivial. */ 1167 result = Pattern_Prefix_Partial; 1168 *rest_selec = 1.0; /* all */ 1169 *prefix = makeConst(patt->consttype, 1170 patt->consttypmod, 1171 patt->constcollid, 1172 patt->constlen, 1173 datumCopy(patt->constvalue, 1174 patt->constbyval, 1175 patt->constlen), 1176 patt->constisnull, 1177 patt->constbyval); 1178 break; 1179 default: 1180 elog(ERROR, "unrecognized ptype: %d", (int) ptype); 1181 result = Pattern_Prefix_None; /* keep compiler quiet */ 1182 break; 1183 } 1184 return result; 1185 } 1186 1187 /* 1188 * Estimate the selectivity of a fixed prefix for a pattern match. 1189 * 1190 * A fixed prefix "foo" is estimated as the selectivity of the expression 1191 * "variable >= 'foo' AND variable < 'fop'". 1192 * 1193 * The selectivity estimate is with respect to the portion of the column 1194 * population represented by the histogram --- the caller must fold this 1195 * together with info about MCVs and NULLs. 1196 * 1197 * We use the given comparison operators and collation to do the estimation. 1198 * The given variable and Const must be of the associated datatype(s). 1199 * 1200 * XXX Note: we make use of the upper bound to estimate operator selectivity 1201 * even if the locale is such that we cannot rely on the upper-bound string. 1202 * The selectivity only needs to be approximately right anyway, so it seems 1203 * more useful to use the upper-bound code than not. 1204 */ 1205 static Selectivity 1206 prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, 1207 Oid eqopr, Oid ltopr, Oid geopr, 1208 Oid collation, 1209 Const *prefixcon) 1210 { 1211 Selectivity prefixsel; 1212 FmgrInfo opproc; 1213 Const *greaterstrcon; 1214 Selectivity eq_sel; 1215 1216 /* Estimate the selectivity of "x >= prefix" */ 1217 fmgr_info(get_opcode(geopr), &opproc); 1218 1219 prefixsel = ineq_histogram_selectivity(root, vardata, 1220 geopr, &opproc, true, true, 1221 collation, 1222 prefixcon->constvalue, 1223 prefixcon->consttype); 1224 1225 if (prefixsel < 0.0) 1226 { 1227 /* No histogram is present ... return a suitable default estimate */ 1228 return DEFAULT_MATCH_SEL; 1229 } 1230 1231 /* 1232 * If we can create a string larger than the prefix, say "x < greaterstr". 1233 */ 1234 fmgr_info(get_opcode(ltopr), &opproc); 1235 greaterstrcon = make_greater_string(prefixcon, &opproc, collation); 1236 if (greaterstrcon) 1237 { 1238 Selectivity topsel; 1239 1240 topsel = ineq_histogram_selectivity(root, vardata, 1241 ltopr, &opproc, false, false, 1242 collation, 1243 greaterstrcon->constvalue, 1244 greaterstrcon->consttype); 1245 1246 /* ineq_histogram_selectivity worked before, it shouldn't fail now */ 1247 Assert(topsel >= 0.0); 1248 1249 /* 1250 * Merge the two selectivities in the same way as for a range query 1251 * (see clauselist_selectivity()). Note that we don't need to worry 1252 * about double-exclusion of nulls, since ineq_histogram_selectivity 1253 * doesn't count those anyway. 1254 */ 1255 prefixsel = topsel + prefixsel - 1.0; 1256 } 1257 1258 /* 1259 * If the prefix is long then the two bounding values might be too close 1260 * together for the histogram to distinguish them usefully, resulting in a 1261 * zero estimate (plus or minus roundoff error). To avoid returning a 1262 * ridiculously small estimate, compute the estimated selectivity for 1263 * "variable = 'foo'", and clamp to that. (Obviously, the resultant 1264 * estimate should be at least that.) 1265 * 1266 * We apply this even if we couldn't make a greater string. That case 1267 * suggests that the prefix is near the maximum possible, and thus 1268 * probably off the end of the histogram, and thus we probably got a very 1269 * small estimate from the >= condition; so we still need to clamp. 1270 */ 1271 eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue, 1272 false, true, false); 1273 1274 prefixsel = Max(prefixsel, eq_sel); 1275 1276 return prefixsel; 1277 } 1278 1279 1280 /* 1281 * Estimate the selectivity of a pattern of the specified type. 1282 * Note that any fixed prefix of the pattern will have been removed already, 1283 * so actually we may be looking at just a fragment of the pattern. 1284 * 1285 * For now, we use a very simplistic approach: fixed characters reduce the 1286 * selectivity a good deal, character ranges reduce it a little, 1287 * wildcards (such as % for LIKE or .* for regex) increase it. 1288 */ 1289 1290 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */ 1291 #define CHAR_RANGE_SEL 0.25 1292 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */ 1293 #define FULL_WILDCARD_SEL 5.0 1294 #define PARTIAL_WILDCARD_SEL 2.0 1295 1296 static Selectivity 1297 like_selectivity(const char *patt, int pattlen, bool case_insensitive) 1298 { 1299 Selectivity sel = 1.0; 1300 int pos; 1301 1302 /* Skip any leading wildcard; it's already factored into initial sel */ 1303 for (pos = 0; pos < pattlen; pos++) 1304 { 1305 if (patt[pos] != '%' && patt[pos] != '_') 1306 break; 1307 } 1308 1309 for (; pos < pattlen; pos++) 1310 { 1311 /* % and _ are wildcard characters in LIKE */ 1312 if (patt[pos] == '%') 1313 sel *= FULL_WILDCARD_SEL; 1314 else if (patt[pos] == '_') 1315 sel *= ANY_CHAR_SEL; 1316 else if (patt[pos] == '\\') 1317 { 1318 /* Backslash quotes the next character */ 1319 pos++; 1320 if (pos >= pattlen) 1321 break; 1322 sel *= FIXED_CHAR_SEL; 1323 } 1324 else 1325 sel *= FIXED_CHAR_SEL; 1326 } 1327 /* Could get sel > 1 if multiple wildcards */ 1328 if (sel > 1.0) 1329 sel = 1.0; 1330 return sel; 1331 } 1332 1333 static Selectivity 1334 regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) 1335 { 1336 Selectivity sel = 1.0; 1337 int paren_depth = 0; 1338 int paren_pos = 0; /* dummy init to keep compiler quiet */ 1339 int pos; 1340 1341 for (pos = 0; pos < pattlen; pos++) 1342 { 1343 if (patt[pos] == '(') 1344 { 1345 if (paren_depth == 0) 1346 paren_pos = pos; /* remember start of parenthesized item */ 1347 paren_depth++; 1348 } 1349 else if (patt[pos] == ')' && paren_depth > 0) 1350 { 1351 paren_depth--; 1352 if (paren_depth == 0) 1353 sel *= regex_selectivity_sub(patt + (paren_pos + 1), 1354 pos - (paren_pos + 1), 1355 case_insensitive); 1356 } 1357 else if (patt[pos] == '|' && paren_depth == 0) 1358 { 1359 /* 1360 * If unquoted | is present at paren level 0 in pattern, we have 1361 * multiple alternatives; sum their probabilities. 1362 */ 1363 sel += regex_selectivity_sub(patt + (pos + 1), 1364 pattlen - (pos + 1), 1365 case_insensitive); 1366 break; /* rest of pattern is now processed */ 1367 } 1368 else if (patt[pos] == '[') 1369 { 1370 bool negclass = false; 1371 1372 if (patt[++pos] == '^') 1373 { 1374 negclass = true; 1375 pos++; 1376 } 1377 if (patt[pos] == ']') /* ']' at start of class is not special */ 1378 pos++; 1379 while (pos < pattlen && patt[pos] != ']') 1380 pos++; 1381 if (paren_depth == 0) 1382 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL); 1383 } 1384 else if (patt[pos] == '.') 1385 { 1386 if (paren_depth == 0) 1387 sel *= ANY_CHAR_SEL; 1388 } 1389 else if (patt[pos] == '*' || 1390 patt[pos] == '?' || 1391 patt[pos] == '+') 1392 { 1393 /* Ought to be smarter about quantifiers... */ 1394 if (paren_depth == 0) 1395 sel *= PARTIAL_WILDCARD_SEL; 1396 } 1397 else if (patt[pos] == '{') 1398 { 1399 while (pos < pattlen && patt[pos] != '}') 1400 pos++; 1401 if (paren_depth == 0) 1402 sel *= PARTIAL_WILDCARD_SEL; 1403 } 1404 else if (patt[pos] == '\\') 1405 { 1406 /* backslash quotes the next character */ 1407 pos++; 1408 if (pos >= pattlen) 1409 break; 1410 if (paren_depth == 0) 1411 sel *= FIXED_CHAR_SEL; 1412 } 1413 else 1414 { 1415 if (paren_depth == 0) 1416 sel *= FIXED_CHAR_SEL; 1417 } 1418 } 1419 /* Could get sel > 1 if multiple wildcards */ 1420 if (sel > 1.0) 1421 sel = 1.0; 1422 return sel; 1423 } 1424 1425 static Selectivity 1426 regex_selectivity(const char *patt, int pattlen, bool case_insensitive, 1427 int fixed_prefix_len) 1428 { 1429 Selectivity sel; 1430 1431 /* If patt doesn't end with $, consider it to have a trailing wildcard */ 1432 if (pattlen > 0 && patt[pattlen - 1] == '$' && 1433 (pattlen == 1 || patt[pattlen - 2] != '\\')) 1434 { 1435 /* has trailing $ */ 1436 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive); 1437 } 1438 else 1439 { 1440 /* no trailing $ */ 1441 sel = regex_selectivity_sub(patt, pattlen, case_insensitive); 1442 sel *= FULL_WILDCARD_SEL; 1443 } 1444 1445 /* 1446 * If there's a fixed prefix, discount its selectivity. We have to be 1447 * careful here since a very long prefix could result in pow's result 1448 * underflowing to zero (in which case "sel" probably has as well). 1449 */ 1450 if (fixed_prefix_len > 0) 1451 { 1452 double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len); 1453 1454 if (prefixsel > 0.0) 1455 sel /= prefixsel; 1456 } 1457 1458 /* Make sure result stays in range */ 1459 CLAMP_PROBABILITY(sel); 1460 return sel; 1461 } 1462 1463 /* 1464 * Check whether char is a letter (and, hence, subject to case-folding) 1465 * 1466 * In multibyte character sets or with ICU, we can't use isalpha, and it does 1467 * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha. 1468 * Instead, just assume any non-ASCII char is potentially case-varying, and 1469 * hard-wire knowledge of which ASCII chars are letters. 1470 */ 1471 static int 1472 pattern_char_isalpha(char c, bool is_multibyte, 1473 pg_locale_t locale, bool locale_is_c) 1474 { 1475 if (locale_is_c) 1476 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); 1477 else if (is_multibyte && IS_HIGHBIT_SET(c)) 1478 return true; 1479 else if (locale && locale->provider == COLLPROVIDER_ICU) 1480 return IS_HIGHBIT_SET(c) || 1481 (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); 1482 #ifdef HAVE_LOCALE_T 1483 else if (locale && locale->provider == COLLPROVIDER_LIBC) 1484 return isalpha_l((unsigned char) c, locale->info.lt); 1485 #endif 1486 else 1487 return isalpha((unsigned char) c); 1488 } 1489 1490 1491 /* 1492 * For bytea, the increment function need only increment the current byte 1493 * (there are no multibyte characters to worry about). 1494 */ 1495 static bool 1496 byte_increment(unsigned char *ptr, int len) 1497 { 1498 if (*ptr >= 255) 1499 return false; 1500 (*ptr)++; 1501 return true; 1502 } 1503 1504 /* 1505 * Try to generate a string greater than the given string or any 1506 * string it is a prefix of. If successful, return a palloc'd string 1507 * in the form of a Const node; else return NULL. 1508 * 1509 * The caller must provide the appropriate "less than" comparison function 1510 * for testing the strings, along with the collation to use. 1511 * 1512 * The key requirement here is that given a prefix string, say "foo", 1513 * we must be able to generate another string "fop" that is greater than 1514 * all strings "foobar" starting with "foo". We can test that we have 1515 * generated a string greater than the prefix string, but in non-C collations 1516 * that is not a bulletproof guarantee that an extension of the string might 1517 * not sort after it; an example is that "foo " is less than "foo!", but it 1518 * is not clear that a "dictionary" sort ordering will consider "foo!" less 1519 * than "foo bar". CAUTION: Therefore, this function should be used only for 1520 * estimation purposes when working in a non-C collation. 1521 * 1522 * To try to catch most cases where an extended string might otherwise sort 1523 * before the result value, we determine which of the strings "Z", "z", "y", 1524 * and "9" is seen as largest by the collation, and append that to the given 1525 * prefix before trying to find a string that compares as larger. 1526 * 1527 * To search for a greater string, we repeatedly "increment" the rightmost 1528 * character, using an encoding-specific character incrementer function. 1529 * When it's no longer possible to increment the last character, we truncate 1530 * off that character and start incrementing the next-to-rightmost. 1531 * For example, if "z" were the last character in the sort order, then we 1532 * could produce "foo" as a string greater than "fonz". 1533 * 1534 * This could be rather slow in the worst case, but in most cases we 1535 * won't have to try more than one or two strings before succeeding. 1536 * 1537 * Note that it's important for the character incrementer not to be too anal 1538 * about producing every possible character code, since in some cases the only 1539 * way to get a larger string is to increment a previous character position. 1540 * So we don't want to spend too much time trying every possible character 1541 * code at the last position. A good rule of thumb is to be sure that we 1542 * don't try more than 256*K values for a K-byte character (and definitely 1543 * not 256^K, which is what an exhaustive search would approach). 1544 */ 1545 static Const * 1546 make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) 1547 { 1548 Oid datatype = str_const->consttype; 1549 char *workstr; 1550 int len; 1551 Datum cmpstr; 1552 char *cmptxt = NULL; 1553 mbcharacter_incrementer charinc; 1554 1555 /* 1556 * Get a modifiable copy of the prefix string in C-string format, and set 1557 * up the string we will compare to as a Datum. In C locale this can just 1558 * be the given prefix string, otherwise we need to add a suffix. Type 1559 * BYTEA sorts bytewise so it never needs a suffix either. 1560 */ 1561 if (datatype == BYTEAOID) 1562 { 1563 bytea *bstr = DatumGetByteaPP(str_const->constvalue); 1564 1565 len = VARSIZE_ANY_EXHDR(bstr); 1566 workstr = (char *) palloc(len); 1567 memcpy(workstr, VARDATA_ANY(bstr), len); 1568 Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue)); 1569 cmpstr = str_const->constvalue; 1570 } 1571 else 1572 { 1573 if (datatype == NAMEOID) 1574 workstr = DatumGetCString(DirectFunctionCall1(nameout, 1575 str_const->constvalue)); 1576 else 1577 workstr = TextDatumGetCString(str_const->constvalue); 1578 len = strlen(workstr); 1579 if (lc_collate_is_c(collation) || len == 0) 1580 cmpstr = str_const->constvalue; 1581 else 1582 { 1583 /* If first time through, determine the suffix to use */ 1584 static char suffixchar = 0; 1585 static Oid suffixcollation = 0; 1586 1587 if (!suffixchar || suffixcollation != collation) 1588 { 1589 char *best; 1590 1591 best = "Z"; 1592 if (varstr_cmp(best, 1, "z", 1, collation) < 0) 1593 best = "z"; 1594 if (varstr_cmp(best, 1, "y", 1, collation) < 0) 1595 best = "y"; 1596 if (varstr_cmp(best, 1, "9", 1, collation) < 0) 1597 best = "9"; 1598 suffixchar = *best; 1599 suffixcollation = collation; 1600 } 1601 1602 /* And build the string to compare to */ 1603 if (datatype == NAMEOID) 1604 { 1605 cmptxt = palloc(len + 2); 1606 memcpy(cmptxt, workstr, len); 1607 cmptxt[len] = suffixchar; 1608 cmptxt[len + 1] = '\0'; 1609 cmpstr = PointerGetDatum(cmptxt); 1610 } 1611 else 1612 { 1613 cmptxt = palloc(VARHDRSZ + len + 1); 1614 SET_VARSIZE(cmptxt, VARHDRSZ + len + 1); 1615 memcpy(VARDATA(cmptxt), workstr, len); 1616 *(VARDATA(cmptxt) + len) = suffixchar; 1617 cmpstr = PointerGetDatum(cmptxt); 1618 } 1619 } 1620 } 1621 1622 /* Select appropriate character-incrementer function */ 1623 if (datatype == BYTEAOID) 1624 charinc = byte_increment; 1625 else 1626 charinc = pg_database_encoding_character_incrementer(); 1627 1628 /* And search ... */ 1629 while (len > 0) 1630 { 1631 int charlen; 1632 unsigned char *lastchar; 1633 1634 /* Identify the last character --- for bytea, just the last byte */ 1635 if (datatype == BYTEAOID) 1636 charlen = 1; 1637 else 1638 charlen = len - pg_mbcliplen(workstr, len, len - 1); 1639 lastchar = (unsigned char *) (workstr + len - charlen); 1640 1641 /* 1642 * Try to generate a larger string by incrementing the last character 1643 * (for BYTEA, we treat each byte as a character). 1644 * 1645 * Note: the incrementer function is expected to return true if it's 1646 * generated a valid-per-the-encoding new character, otherwise false. 1647 * The contents of the character on false return are unspecified. 1648 */ 1649 while (charinc(lastchar, charlen)) 1650 { 1651 Const *workstr_const; 1652 1653 if (datatype == BYTEAOID) 1654 workstr_const = string_to_bytea_const(workstr, len); 1655 else 1656 workstr_const = string_to_const(workstr, datatype); 1657 1658 if (DatumGetBool(FunctionCall2Coll(ltproc, 1659 collation, 1660 cmpstr, 1661 workstr_const->constvalue))) 1662 { 1663 /* Successfully made a string larger than cmpstr */ 1664 if (cmptxt) 1665 pfree(cmptxt); 1666 pfree(workstr); 1667 return workstr_const; 1668 } 1669 1670 /* No good, release unusable value and try again */ 1671 pfree(DatumGetPointer(workstr_const->constvalue)); 1672 pfree(workstr_const); 1673 } 1674 1675 /* 1676 * No luck here, so truncate off the last character and try to 1677 * increment the next one. 1678 */ 1679 len -= charlen; 1680 workstr[len] = '\0'; 1681 } 1682 1683 /* Failed... */ 1684 if (cmptxt) 1685 pfree(cmptxt); 1686 pfree(workstr); 1687 1688 return NULL; 1689 } 1690 1691 /* 1692 * Generate a Datum of the appropriate type from a C string. 1693 * Note that all of the supported types are pass-by-ref, so the 1694 * returned value should be pfree'd if no longer needed. 1695 */ 1696 static Datum 1697 string_to_datum(const char *str, Oid datatype) 1698 { 1699 Assert(str != NULL); 1700 1701 /* 1702 * We cheat a little by assuming that CStringGetTextDatum() will do for 1703 * bpchar and varchar constants too... 1704 */ 1705 if (datatype == NAMEOID) 1706 return DirectFunctionCall1(namein, CStringGetDatum(str)); 1707 else if (datatype == BYTEAOID) 1708 return DirectFunctionCall1(byteain, CStringGetDatum(str)); 1709 else 1710 return CStringGetTextDatum(str); 1711 } 1712 1713 /* 1714 * Generate a Const node of the appropriate type from a C string. 1715 */ 1716 static Const * 1717 string_to_const(const char *str, Oid datatype) 1718 { 1719 Datum conval = string_to_datum(str, datatype); 1720 Oid collation; 1721 int constlen; 1722 1723 /* 1724 * We only need to support a few datatypes here, so hard-wire properties 1725 * instead of incurring the expense of catalog lookups. 1726 */ 1727 switch (datatype) 1728 { 1729 case TEXTOID: 1730 case VARCHAROID: 1731 case BPCHAROID: 1732 collation = DEFAULT_COLLATION_OID; 1733 constlen = -1; 1734 break; 1735 1736 case NAMEOID: 1737 collation = C_COLLATION_OID; 1738 constlen = NAMEDATALEN; 1739 break; 1740 1741 case BYTEAOID: 1742 collation = InvalidOid; 1743 constlen = -1; 1744 break; 1745 1746 default: 1747 elog(ERROR, "unexpected datatype in string_to_const: %u", 1748 datatype); 1749 return NULL; 1750 } 1751 1752 return makeConst(datatype, -1, collation, constlen, 1753 conval, false, false); 1754 } 1755 1756 /* 1757 * Generate a Const node of bytea type from a binary C string and a length. 1758 */ 1759 static Const * 1760 string_to_bytea_const(const char *str, size_t str_len) 1761 { 1762 bytea *bstr = palloc(VARHDRSZ + str_len); 1763 Datum conval; 1764 1765 memcpy(VARDATA(bstr), str, str_len); 1766 SET_VARSIZE(bstr, VARHDRSZ + str_len); 1767 conval = PointerGetDatum(bstr); 1768 1769 return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false); 1770 } 1771