1 /*-------------------------------------------------------------------------
2 *
3 * like_support.c
4 * Planner support functions for LIKE, regex, and related operators.
5 *
6 * These routines handle special optimization of operators that can be
7 * used with index scans even though they are not known to the executor's
8 * indexscan machinery. The key idea is that these operators allow us
9 * to derive approximate indexscan qual clauses, such that any tuples
10 * that pass the operator clause itself must also satisfy the simpler
11 * indexscan condition(s). Then we can use the indexscan machinery
12 * to avoid scanning as much of the table as we'd otherwise have to,
13 * while applying the original operator as a qpqual condition to ensure
14 * we deliver only the tuples we want. (In essence, we're using a regular
15 * index as if it were a lossy index.)
16 *
17 * An example of what we're doing is
18 * textfield LIKE 'abc%def'
19 * from which we can generate the indexscanable conditions
20 * textfield >= 'abc' AND textfield < 'abd'
21 * which allow efficient scanning of an index on textfield.
22 * (In reality, character set and collation issues make the transformation
23 * from LIKE to indexscan limits rather harder than one might think ...
24 * but that's the basic idea.)
25 *
26 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/adt/like_support.c
32 *
33 *-------------------------------------------------------------------------
34 */
35 #include "postgres.h"
36
37 #include <math.h>
38
39 #include "access/htup_details.h"
40 #include "access/stratnum.h"
41 #include "catalog/pg_collation.h"
42 #include "catalog/pg_operator.h"
43 #include "catalog/pg_opfamily.h"
44 #include "catalog/pg_statistic.h"
45 #include "catalog/pg_type.h"
46 #include "mb/pg_wchar.h"
47 #include "nodes/makefuncs.h"
48 #include "nodes/nodeFuncs.h"
49 #include "nodes/supportnodes.h"
50 #include "utils/builtins.h"
51 #include "utils/datum.h"
52 #include "utils/lsyscache.h"
53 #include "utils/pg_locale.h"
54 #include "utils/selfuncs.h"
55 #include "utils/varlena.h"
56
57
58 typedef enum
59 {
60 Pattern_Type_Like,
61 Pattern_Type_Like_IC,
62 Pattern_Type_Regex,
63 Pattern_Type_Regex_IC,
64 Pattern_Type_Prefix
65 } Pattern_Type;
66
67 typedef enum
68 {
69 Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
70 } Pattern_Prefix_Status;
71
72 static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
73 static List *match_pattern_prefix(Node *leftop,
74 Node *rightop,
75 Pattern_Type ptype,
76 Oid expr_coll,
77 Oid opfamily,
78 Oid indexcollation);
79 static double patternsel_common(PlannerInfo *root,
80 Oid oprid,
81 Oid opfuncid,
82 List *args,
83 int varRelid,
84 Oid collation,
85 Pattern_Type ptype,
86 bool negate);
87 static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
88 Pattern_Type ptype,
89 Oid collation,
90 Const **prefix,
91 Selectivity *rest_selec);
92 static Selectivity prefix_selectivity(PlannerInfo *root,
93 VariableStatData *vardata,
94 Oid eqopr, Oid ltopr, Oid geopr,
95 Oid collation,
96 Const *prefixcon);
97 static Selectivity like_selectivity(const char *patt, int pattlen,
98 bool case_insensitive);
99 static Selectivity regex_selectivity(const char *patt, int pattlen,
100 bool case_insensitive,
101 int fixed_prefix_len);
102 static int pattern_char_isalpha(char c, bool is_multibyte,
103 pg_locale_t locale, bool locale_is_c);
104 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
105 Oid collation);
106 static Datum string_to_datum(const char *str, Oid datatype);
107 static Const *string_to_const(const char *str, Oid datatype);
108 static Const *string_to_bytea_const(const char *str, size_t str_len);
109
110
111 /*
112 * Planner support functions for LIKE, regex, and related operators
113 */
114 Datum
textlike_support(PG_FUNCTION_ARGS)115 textlike_support(PG_FUNCTION_ARGS)
116 {
117 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
118
119 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
120 }
121
122 Datum
texticlike_support(PG_FUNCTION_ARGS)123 texticlike_support(PG_FUNCTION_ARGS)
124 {
125 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
126
127 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
128 }
129
130 Datum
textregexeq_support(PG_FUNCTION_ARGS)131 textregexeq_support(PG_FUNCTION_ARGS)
132 {
133 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
134
135 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
136 }
137
138 Datum
texticregexeq_support(PG_FUNCTION_ARGS)139 texticregexeq_support(PG_FUNCTION_ARGS)
140 {
141 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
142
143 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
144 }
145
146 /* Common code for the above */
147 static Node *
like_regex_support(Node * rawreq,Pattern_Type ptype)148 like_regex_support(Node *rawreq, Pattern_Type ptype)
149 {
150 Node *ret = NULL;
151
152 if (IsA(rawreq, SupportRequestSelectivity))
153 {
154 /*
155 * Make a selectivity estimate for a function call, just as we'd do if
156 * the call was via the corresponding operator.
157 */
158 SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
159 Selectivity s1;
160
161 if (req->is_join)
162 {
163 /*
164 * For the moment we just punt. If patternjoinsel is ever
165 * improved to do better, this should be made to call it.
166 */
167 s1 = DEFAULT_MATCH_SEL;
168 }
169 else
170 {
171 /* Share code with operator restriction selectivity functions */
172 s1 = patternsel_common(req->root,
173 InvalidOid,
174 req->funcid,
175 req->args,
176 req->varRelid,
177 req->inputcollid,
178 ptype,
179 false);
180 }
181 req->selectivity = s1;
182 ret = (Node *) req;
183 }
184 else if (IsA(rawreq, SupportRequestIndexCondition))
185 {
186 /* Try to convert operator/function call to index conditions */
187 SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
188
189 /*
190 * Currently we have no "reverse" match operators with the pattern on
191 * the left, so we only need consider cases with the indexkey on the
192 * left.
193 */
194 if (req->indexarg != 0)
195 return NULL;
196
197 if (is_opclause(req->node))
198 {
199 OpExpr *clause = (OpExpr *) req->node;
200
201 Assert(list_length(clause->args) == 2);
202 ret = (Node *)
203 match_pattern_prefix((Node *) linitial(clause->args),
204 (Node *) lsecond(clause->args),
205 ptype,
206 clause->inputcollid,
207 req->opfamily,
208 req->indexcollation);
209 }
210 else if (is_funcclause(req->node)) /* be paranoid */
211 {
212 FuncExpr *clause = (FuncExpr *) req->node;
213
214 Assert(list_length(clause->args) == 2);
215 ret = (Node *)
216 match_pattern_prefix((Node *) linitial(clause->args),
217 (Node *) lsecond(clause->args),
218 ptype,
219 clause->inputcollid,
220 req->opfamily,
221 req->indexcollation);
222 }
223 }
224
225 return ret;
226 }
227
228 /*
229 * match_pattern_prefix
230 * Try to generate an indexqual for a LIKE or regex operator.
231 */
232 static List *
match_pattern_prefix(Node * leftop,Node * rightop,Pattern_Type ptype,Oid expr_coll,Oid opfamily,Oid indexcollation)233 match_pattern_prefix(Node *leftop,
234 Node *rightop,
235 Pattern_Type ptype,
236 Oid expr_coll,
237 Oid opfamily,
238 Oid indexcollation)
239 {
240 List *result;
241 Const *patt;
242 Const *prefix;
243 Pattern_Prefix_Status pstatus;
244 Oid ldatatype;
245 Oid rdatatype;
246 Oid eqopr;
247 Oid ltopr;
248 Oid geopr;
249 bool collation_aware;
250 Expr *expr;
251 FmgrInfo ltproc;
252 Const *greaterstr;
253
254 /*
255 * Can't do anything with a non-constant or NULL pattern argument.
256 *
257 * Note that since we restrict ourselves to cases with a hard constant on
258 * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
259 * about verifying that.
260 */
261 if (!IsA(rightop, Const) ||
262 ((Const *) rightop)->constisnull)
263 return NIL;
264 patt = (Const *) rightop;
265
266 /*
267 * Not supported if the expression collation is nondeterministic. The
268 * optimized equality or prefix tests use bytewise comparisons, which is
269 * not consistent with nondeterministic collations. The actual
270 * pattern-matching implementation functions will later error out that
271 * pattern-matching is not supported with nondeterministic collations. (We
272 * could also error out here, but by doing it later we get more precise
273 * error messages.) (It should be possible to support at least
274 * Pattern_Prefix_Exact, but no point as long as the actual
275 * pattern-matching implementations don't support it.)
276 *
277 * expr_coll is not set for a non-collation-aware data type such as bytea.
278 */
279 if (expr_coll && !get_collation_isdeterministic(expr_coll))
280 return NIL;
281
282 /*
283 * Try to extract a fixed prefix from the pattern.
284 */
285 pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
286 &prefix, NULL);
287
288 /* fail if no fixed prefix */
289 if (pstatus == Pattern_Prefix_None)
290 return NIL;
291
292 /*
293 * Identify the operators we want to use, based on the type of the
294 * left-hand argument. Usually these are just the type's regular
295 * comparison operators, but if we are considering one of the semi-legacy
296 * "pattern" opclasses, use the "pattern" operators instead. Those are
297 * not collation-sensitive but always use C collation, as we want. The
298 * selected operators also determine the needed type of the prefix
299 * constant.
300 */
301 ldatatype = exprType(leftop);
302 switch (ldatatype)
303 {
304 case TEXTOID:
305 if (opfamily == TEXT_PATTERN_BTREE_FAM_OID ||
306 opfamily == TEXT_SPGIST_FAM_OID)
307 {
308 eqopr = TextEqualOperator;
309 ltopr = TextPatternLessOperator;
310 geopr = TextPatternGreaterEqualOperator;
311 collation_aware = false;
312 }
313 else
314 {
315 eqopr = TextEqualOperator;
316 ltopr = TextLessOperator;
317 geopr = TextGreaterEqualOperator;
318 collation_aware = true;
319 }
320 rdatatype = TEXTOID;
321 break;
322 case NAMEOID:
323
324 /*
325 * Note that here, we need the RHS type to be text, so that the
326 * comparison value isn't improperly truncated to NAMEDATALEN.
327 */
328 eqopr = NameEqualTextOperator;
329 ltopr = NameLessTextOperator;
330 geopr = NameGreaterEqualTextOperator;
331 collation_aware = true;
332 rdatatype = TEXTOID;
333 break;
334 case BPCHAROID:
335 if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
336 {
337 eqopr = BpcharEqualOperator;
338 ltopr = BpcharPatternLessOperator;
339 geopr = BpcharPatternGreaterEqualOperator;
340 collation_aware = false;
341 }
342 else
343 {
344 eqopr = BpcharEqualOperator;
345 ltopr = BpcharLessOperator;
346 geopr = BpcharGreaterEqualOperator;
347 collation_aware = true;
348 }
349 rdatatype = BPCHAROID;
350 break;
351 case BYTEAOID:
352 eqopr = ByteaEqualOperator;
353 ltopr = ByteaLessOperator;
354 geopr = ByteaGreaterEqualOperator;
355 collation_aware = false;
356 rdatatype = BYTEAOID;
357 break;
358 default:
359 /* Can't get here unless we're attached to the wrong operator */
360 return NIL;
361 }
362
363 /*
364 * If necessary, verify that the index's collation behavior is compatible.
365 * For an exact-match case, we don't have to be picky. Otherwise, insist
366 * that the index collation be "C". Note that here we are looking at the
367 * index's collation, not the expression's collation -- this test is *not*
368 * dependent on the LIKE/regex operator's collation.
369 */
370 if (collation_aware)
371 {
372 if (!(pstatus == Pattern_Prefix_Exact ||
373 lc_collate_is_c(indexcollation)))
374 return NIL;
375 }
376
377 /*
378 * If necessary, coerce the prefix constant to the right type. The given
379 * prefix constant is either text or bytea type, therefore the only case
380 * where we need to do anything is when converting text to bpchar. Those
381 * two types are binary-compatible, so relabeling the Const node is
382 * sufficient.
383 */
384 if (prefix->consttype != rdatatype)
385 {
386 Assert(prefix->consttype == TEXTOID &&
387 rdatatype == BPCHAROID);
388 prefix->consttype = rdatatype;
389 }
390
391 /*
392 * If we found an exact-match pattern, generate an "=" indexqual.
393 *
394 * Here and below, check to see whether the desired operator is actually
395 * supported by the index opclass, and fail quietly if not. This allows
396 * us to not be concerned with specific opclasses (except for the legacy
397 * "pattern" cases); any index that correctly implements the operators
398 * will work.
399 */
400 if (pstatus == Pattern_Prefix_Exact)
401 {
402 if (!op_in_opfamily(eqopr, opfamily))
403 return NIL;
404 expr = make_opclause(eqopr, BOOLOID, false,
405 (Expr *) leftop, (Expr *) prefix,
406 InvalidOid, indexcollation);
407 result = list_make1(expr);
408 return result;
409 }
410
411 /*
412 * Otherwise, we have a nonempty required prefix of the values.
413 *
414 * We can always say "x >= prefix".
415 */
416 if (!op_in_opfamily(geopr, opfamily))
417 return NIL;
418 expr = make_opclause(geopr, BOOLOID, false,
419 (Expr *) leftop, (Expr *) prefix,
420 InvalidOid, indexcollation);
421 result = list_make1(expr);
422
423 /*-------
424 * If we can create a string larger than the prefix, we can say
425 * "x < greaterstr". NB: we rely on make_greater_string() to generate
426 * a guaranteed-greater string, not just a probably-greater string.
427 * In general this is only guaranteed in C locale, so we'd better be
428 * using a C-locale index collation.
429 *-------
430 */
431 if (!op_in_opfamily(ltopr, opfamily))
432 return result;
433 fmgr_info(get_opcode(ltopr), <proc);
434 greaterstr = make_greater_string(prefix, <proc, indexcollation);
435 if (greaterstr)
436 {
437 expr = make_opclause(ltopr, BOOLOID, false,
438 (Expr *) leftop, (Expr *) greaterstr,
439 InvalidOid, indexcollation);
440 result = lappend(result, expr);
441 }
442
443 return result;
444 }
445
446
447 /*
448 * patternsel_common - generic code for pattern-match restriction selectivity.
449 *
450 * To support using this from either the operator or function paths, caller
451 * may pass either operator OID or underlying function OID; we look up the
452 * latter from the former if needed. (We could just have patternsel() call
453 * get_opcode(), but the work would be wasted if we don't have a need to
454 * compare a fixed prefix to the pg_statistic data.)
455 *
456 * Note that oprid and/or opfuncid should be for the positive-match operator
457 * even when negate is true.
458 */
459 static double
patternsel_common(PlannerInfo * root,Oid oprid,Oid opfuncid,List * args,int varRelid,Oid collation,Pattern_Type ptype,bool negate)460 patternsel_common(PlannerInfo *root,
461 Oid oprid,
462 Oid opfuncid,
463 List *args,
464 int varRelid,
465 Oid collation,
466 Pattern_Type ptype,
467 bool negate)
468 {
469 VariableStatData vardata;
470 Node *other;
471 bool varonleft;
472 Datum constval;
473 Oid consttype;
474 Oid vartype;
475 Oid rdatatype;
476 Oid eqopr;
477 Oid ltopr;
478 Oid geopr;
479 Pattern_Prefix_Status pstatus;
480 Const *patt;
481 Const *prefix = NULL;
482 Selectivity rest_selec = 0;
483 double nullfrac = 0.0;
484 double result;
485
486 /*
487 * Initialize result to the appropriate default estimate depending on
488 * whether it's a match or not-match operator.
489 */
490 if (negate)
491 result = 1.0 - DEFAULT_MATCH_SEL;
492 else
493 result = DEFAULT_MATCH_SEL;
494
495 /*
496 * If expression is not variable op constant, then punt and return the
497 * default estimate.
498 */
499 if (!get_restriction_variable(root, args, varRelid,
500 &vardata, &other, &varonleft))
501 return result;
502 if (!varonleft || !IsA(other, Const))
503 {
504 ReleaseVariableStats(vardata);
505 return result;
506 }
507
508 /*
509 * If the constant is NULL, assume operator is strict and return zero, ie,
510 * operator will never return TRUE. (It's zero even for a negator op.)
511 */
512 if (((Const *) other)->constisnull)
513 {
514 ReleaseVariableStats(vardata);
515 return 0.0;
516 }
517 constval = ((Const *) other)->constvalue;
518 consttype = ((Const *) other)->consttype;
519
520 /*
521 * The right-hand const is type text or bytea for all supported operators.
522 * We do not expect to see binary-compatible types here, since
523 * const-folding should have relabeled the const to exactly match the
524 * operator's declared type.
525 */
526 if (consttype != TEXTOID && consttype != BYTEAOID)
527 {
528 ReleaseVariableStats(vardata);
529 return result;
530 }
531
532 /*
533 * Similarly, the exposed type of the left-hand side should be one of
534 * those we know. (Do not look at vardata.atttype, which might be
535 * something binary-compatible but different.) We can use it to identify
536 * the comparison operators and the required type of the comparison
537 * constant, much as in match_pattern_prefix().
538 */
539 vartype = vardata.vartype;
540
541 switch (vartype)
542 {
543 case TEXTOID:
544 eqopr = TextEqualOperator;
545 ltopr = TextLessOperator;
546 geopr = TextGreaterEqualOperator;
547 rdatatype = TEXTOID;
548 break;
549 case NAMEOID:
550
551 /*
552 * Note that here, we need the RHS type to be text, so that the
553 * comparison value isn't improperly truncated to NAMEDATALEN.
554 */
555 eqopr = NameEqualTextOperator;
556 ltopr = NameLessTextOperator;
557 geopr = NameGreaterEqualTextOperator;
558 rdatatype = TEXTOID;
559 break;
560 case BPCHAROID:
561 eqopr = BpcharEqualOperator;
562 ltopr = BpcharLessOperator;
563 geopr = BpcharGreaterEqualOperator;
564 rdatatype = BPCHAROID;
565 break;
566 case BYTEAOID:
567 eqopr = ByteaEqualOperator;
568 ltopr = ByteaLessOperator;
569 geopr = ByteaGreaterEqualOperator;
570 rdatatype = BYTEAOID;
571 break;
572 default:
573 /* Can't get here unless we're attached to the wrong operator */
574 ReleaseVariableStats(vardata);
575 return result;
576 }
577
578 /*
579 * Grab the nullfrac for use below.
580 */
581 if (HeapTupleIsValid(vardata.statsTuple))
582 {
583 Form_pg_statistic stats;
584
585 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
586 nullfrac = stats->stanullfrac;
587 }
588
589 /*
590 * Pull out any fixed prefix implied by the pattern, and estimate the
591 * fractional selectivity of the remainder of the pattern. Unlike many
592 * other selectivity estimators, we use the pattern operator's actual
593 * collation for this step. This is not because we expect the collation
594 * to make a big difference in the selectivity estimate (it seldom would),
595 * but because we want to be sure we cache compiled regexps under the
596 * right cache key, so that they can be re-used at runtime.
597 */
598 patt = (Const *) other;
599 pstatus = pattern_fixed_prefix(patt, ptype, collation,
600 &prefix, &rest_selec);
601
602 /*
603 * If necessary, coerce the prefix constant to the right type. The only
604 * case where we need to do anything is when converting text to bpchar.
605 * Those two types are binary-compatible, so relabeling the Const node is
606 * sufficient.
607 */
608 if (prefix && prefix->consttype != rdatatype)
609 {
610 Assert(prefix->consttype == TEXTOID &&
611 rdatatype == BPCHAROID);
612 prefix->consttype = rdatatype;
613 }
614
615 if (pstatus == Pattern_Prefix_Exact)
616 {
617 /*
618 * Pattern specifies an exact match, so estimate as for '='
619 */
620 result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
621 false, true, false);
622 }
623 else
624 {
625 /*
626 * Not exact-match pattern. If we have a sufficiently large
627 * histogram, estimate selectivity for the histogram part of the
628 * population by counting matches in the histogram. If not, estimate
629 * selectivity of the fixed prefix and remainder of pattern
630 * separately, then combine the two to get an estimate of the
631 * selectivity for the part of the column population represented by
632 * the histogram. (For small histograms, we combine these
633 * approaches.)
634 *
635 * We then add up data for any most-common-values values; these are
636 * not in the histogram population, and we can get exact answers for
637 * them by applying the pattern operator, so there's no reason to
638 * approximate. (If the MCVs cover a significant part of the total
639 * population, this gives us a big leg up in accuracy.)
640 */
641 Selectivity selec;
642 int hist_size;
643 FmgrInfo opproc;
644 double mcv_selec,
645 sumcommon;
646
647 /* Try to use the histogram entries to get selectivity */
648 if (!OidIsValid(opfuncid))
649 opfuncid = get_opcode(oprid);
650 fmgr_info(opfuncid, &opproc);
651
652 selec = histogram_selectivity(&vardata, &opproc, collation,
653 constval, true,
654 10, 1, &hist_size);
655
656 /* If not at least 100 entries, use the heuristic method */
657 if (hist_size < 100)
658 {
659 Selectivity heursel;
660 Selectivity prefixsel;
661
662 if (pstatus == Pattern_Prefix_Partial)
663 prefixsel = prefix_selectivity(root, &vardata,
664 eqopr, ltopr, geopr,
665 collation,
666 prefix);
667 else
668 prefixsel = 1.0;
669 heursel = prefixsel * rest_selec;
670
671 if (selec < 0) /* fewer than 10 histogram entries? */
672 selec = heursel;
673 else
674 {
675 /*
676 * For histogram sizes from 10 to 100, we combine the
677 * histogram and heuristic selectivities, putting increasingly
678 * more trust in the histogram for larger sizes.
679 */
680 double hist_weight = hist_size / 100.0;
681
682 selec = selec * hist_weight + heursel * (1.0 - hist_weight);
683 }
684 }
685
686 /* In any case, don't believe extremely small or large estimates. */
687 if (selec < 0.0001)
688 selec = 0.0001;
689 else if (selec > 0.9999)
690 selec = 0.9999;
691
692 /*
693 * If we have most-common-values info, add up the fractions of the MCV
694 * entries that satisfy MCV OP PATTERN. These fractions contribute
695 * directly to the result selectivity. Also add up the total fraction
696 * represented by MCV entries.
697 */
698 mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
699 constval, true,
700 &sumcommon);
701
702 /*
703 * Now merge the results from the MCV and histogram calculations,
704 * realizing that the histogram covers only the non-null values that
705 * are not listed in MCV.
706 */
707 selec *= 1.0 - nullfrac - sumcommon;
708 selec += mcv_selec;
709 result = selec;
710 }
711
712 /* now adjust if we wanted not-match rather than match */
713 if (negate)
714 result = 1.0 - result - nullfrac;
715
716 /* result should be in range, but make sure... */
717 CLAMP_PROBABILITY(result);
718
719 if (prefix)
720 {
721 pfree(DatumGetPointer(prefix->constvalue));
722 pfree(prefix);
723 }
724
725 ReleaseVariableStats(vardata);
726
727 return result;
728 }
729
730 /*
731 * Fix impedance mismatch between SQL-callable functions and patternsel_common
732 */
733 static double
patternsel(PG_FUNCTION_ARGS,Pattern_Type ptype,bool negate)734 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
735 {
736 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
737 Oid operator = PG_GETARG_OID(1);
738 List *args = (List *) PG_GETARG_POINTER(2);
739 int varRelid = PG_GETARG_INT32(3);
740 Oid collation = PG_GET_COLLATION();
741
742 /*
743 * If this is for a NOT LIKE or similar operator, get the corresponding
744 * positive-match operator and work with that.
745 */
746 if (negate)
747 {
748 operator = get_negator(operator);
749 if (!OidIsValid(operator))
750 elog(ERROR, "patternsel called for operator without a negator");
751 }
752
753 return patternsel_common(root,
754 operator,
755 InvalidOid,
756 args,
757 varRelid,
758 collation,
759 ptype,
760 negate);
761 }
762
763 /*
764 * regexeqsel - Selectivity of regular-expression pattern match.
765 */
766 Datum
regexeqsel(PG_FUNCTION_ARGS)767 regexeqsel(PG_FUNCTION_ARGS)
768 {
769 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
770 }
771
772 /*
773 * icregexeqsel - Selectivity of case-insensitive regex match.
774 */
775 Datum
icregexeqsel(PG_FUNCTION_ARGS)776 icregexeqsel(PG_FUNCTION_ARGS)
777 {
778 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
779 }
780
781 /*
782 * likesel - Selectivity of LIKE pattern match.
783 */
784 Datum
likesel(PG_FUNCTION_ARGS)785 likesel(PG_FUNCTION_ARGS)
786 {
787 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
788 }
789
790 /*
791 * prefixsel - selectivity of prefix operator
792 */
793 Datum
prefixsel(PG_FUNCTION_ARGS)794 prefixsel(PG_FUNCTION_ARGS)
795 {
796 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
797 }
798
799 /*
800 *
801 * iclikesel - Selectivity of ILIKE pattern match.
802 */
803 Datum
iclikesel(PG_FUNCTION_ARGS)804 iclikesel(PG_FUNCTION_ARGS)
805 {
806 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
807 }
808
809 /*
810 * regexnesel - Selectivity of regular-expression pattern non-match.
811 */
812 Datum
regexnesel(PG_FUNCTION_ARGS)813 regexnesel(PG_FUNCTION_ARGS)
814 {
815 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
816 }
817
818 /*
819 * icregexnesel - Selectivity of case-insensitive regex non-match.
820 */
821 Datum
icregexnesel(PG_FUNCTION_ARGS)822 icregexnesel(PG_FUNCTION_ARGS)
823 {
824 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
825 }
826
827 /*
828 * nlikesel - Selectivity of LIKE pattern non-match.
829 */
830 Datum
nlikesel(PG_FUNCTION_ARGS)831 nlikesel(PG_FUNCTION_ARGS)
832 {
833 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
834 }
835
836 /*
837 * icnlikesel - Selectivity of ILIKE pattern non-match.
838 */
839 Datum
icnlikesel(PG_FUNCTION_ARGS)840 icnlikesel(PG_FUNCTION_ARGS)
841 {
842 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
843 }
844
845 /*
846 * patternjoinsel - Generic code for pattern-match join selectivity.
847 */
848 static double
patternjoinsel(PG_FUNCTION_ARGS,Pattern_Type ptype,bool negate)849 patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
850 {
851 /* For the moment we just punt. */
852 return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
853 }
854
855 /*
856 * regexeqjoinsel - Join selectivity of regular-expression pattern match.
857 */
858 Datum
regexeqjoinsel(PG_FUNCTION_ARGS)859 regexeqjoinsel(PG_FUNCTION_ARGS)
860 {
861 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
862 }
863
864 /*
865 * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
866 */
867 Datum
icregexeqjoinsel(PG_FUNCTION_ARGS)868 icregexeqjoinsel(PG_FUNCTION_ARGS)
869 {
870 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
871 }
872
873 /*
874 * likejoinsel - Join selectivity of LIKE pattern match.
875 */
876 Datum
likejoinsel(PG_FUNCTION_ARGS)877 likejoinsel(PG_FUNCTION_ARGS)
878 {
879 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
880 }
881
882 /*
883 * prefixjoinsel - Join selectivity of prefix operator
884 */
885 Datum
prefixjoinsel(PG_FUNCTION_ARGS)886 prefixjoinsel(PG_FUNCTION_ARGS)
887 {
888 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
889 }
890
891 /*
892 * iclikejoinsel - Join selectivity of ILIKE pattern match.
893 */
894 Datum
iclikejoinsel(PG_FUNCTION_ARGS)895 iclikejoinsel(PG_FUNCTION_ARGS)
896 {
897 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
898 }
899
900 /*
901 * regexnejoinsel - Join selectivity of regex non-match.
902 */
903 Datum
regexnejoinsel(PG_FUNCTION_ARGS)904 regexnejoinsel(PG_FUNCTION_ARGS)
905 {
906 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
907 }
908
909 /*
910 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
911 */
912 Datum
icregexnejoinsel(PG_FUNCTION_ARGS)913 icregexnejoinsel(PG_FUNCTION_ARGS)
914 {
915 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
916 }
917
918 /*
919 * nlikejoinsel - Join selectivity of LIKE pattern non-match.
920 */
921 Datum
nlikejoinsel(PG_FUNCTION_ARGS)922 nlikejoinsel(PG_FUNCTION_ARGS)
923 {
924 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
925 }
926
927 /*
928 * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
929 */
930 Datum
icnlikejoinsel(PG_FUNCTION_ARGS)931 icnlikejoinsel(PG_FUNCTION_ARGS)
932 {
933 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
934 }
935
936
937 /*-------------------------------------------------------------------------
938 *
939 * Pattern analysis functions
940 *
941 * These routines support analysis of LIKE and regular-expression patterns
942 * by the planner/optimizer. It's important that they agree with the
943 * regular-expression code in backend/regex/ and the LIKE code in
944 * backend/utils/adt/like.c. Also, the computation of the fixed prefix
945 * must be conservative: if we report a string longer than the true fixed
946 * prefix, the query may produce actually wrong answers, rather than just
947 * getting a bad selectivity estimate!
948 *
949 *-------------------------------------------------------------------------
950 */
951
952 /*
953 * Extract the fixed prefix, if any, for a pattern.
954 *
955 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
956 * or to NULL if no fixed prefix exists for the pattern.
957 * If rest_selec is not NULL, *rest_selec is set to an estimate of the
958 * selectivity of the remainder of the pattern (without any fixed prefix).
959 * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
960 *
961 * The return value distinguishes no fixed prefix, a partial prefix,
962 * or an exact-match-only pattern.
963 */
964
965 static Pattern_Prefix_Status
like_fixed_prefix(Const * patt_const,bool case_insensitive,Oid collation,Const ** prefix_const,Selectivity * rest_selec)966 like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
967 Const **prefix_const, Selectivity *rest_selec)
968 {
969 char *match;
970 char *patt;
971 int pattlen;
972 Oid typeid = patt_const->consttype;
973 int pos,
974 match_pos;
975 bool is_multibyte = (pg_database_encoding_max_length() > 1);
976 pg_locale_t locale = 0;
977 bool locale_is_c = false;
978
979 /* the right-hand const is type text or bytea */
980 Assert(typeid == BYTEAOID || typeid == TEXTOID);
981
982 if (case_insensitive)
983 {
984 if (typeid == BYTEAOID)
985 ereport(ERROR,
986 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
987 errmsg("case insensitive matching not supported on type bytea")));
988
989 /* If case-insensitive, we need locale info */
990 if (lc_ctype_is_c(collation))
991 locale_is_c = true;
992 else if (collation != DEFAULT_COLLATION_OID)
993 {
994 if (!OidIsValid(collation))
995 {
996 /*
997 * This typically means that the parser could not resolve a
998 * conflict of implicit collations, so report it that way.
999 */
1000 ereport(ERROR,
1001 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1002 errmsg("could not determine which collation to use for ILIKE"),
1003 errhint("Use the COLLATE clause to set the collation explicitly.")));
1004 }
1005 locale = pg_newlocale_from_collation(collation);
1006 }
1007 }
1008
1009 if (typeid != BYTEAOID)
1010 {
1011 patt = TextDatumGetCString(patt_const->constvalue);
1012 pattlen = strlen(patt);
1013 }
1014 else
1015 {
1016 bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1017
1018 pattlen = VARSIZE_ANY_EXHDR(bstr);
1019 patt = (char *) palloc(pattlen);
1020 memcpy(patt, VARDATA_ANY(bstr), pattlen);
1021 Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1022 }
1023
1024 match = palloc(pattlen + 1);
1025 match_pos = 0;
1026 for (pos = 0; pos < pattlen; pos++)
1027 {
1028 /* % and _ are wildcard characters in LIKE */
1029 if (patt[pos] == '%' ||
1030 patt[pos] == '_')
1031 break;
1032
1033 /* Backslash escapes the next character */
1034 if (patt[pos] == '\\')
1035 {
1036 pos++;
1037 if (pos >= pattlen)
1038 break;
1039 }
1040
1041 /* Stop if case-varying character (it's sort of a wildcard) */
1042 if (case_insensitive &&
1043 pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1044 break;
1045
1046 match[match_pos++] = patt[pos];
1047 }
1048
1049 match[match_pos] = '\0';
1050
1051 if (typeid != BYTEAOID)
1052 *prefix_const = string_to_const(match, typeid);
1053 else
1054 *prefix_const = string_to_bytea_const(match, match_pos);
1055
1056 if (rest_selec != NULL)
1057 *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1058 case_insensitive);
1059
1060 pfree(patt);
1061 pfree(match);
1062
1063 /* in LIKE, an empty pattern is an exact match! */
1064 if (pos == pattlen)
1065 return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1066
1067 if (match_pos > 0)
1068 return Pattern_Prefix_Partial;
1069
1070 return Pattern_Prefix_None;
1071 }
1072
1073 static Pattern_Prefix_Status
regex_fixed_prefix(Const * patt_const,bool case_insensitive,Oid collation,Const ** prefix_const,Selectivity * rest_selec)1074 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1075 Const **prefix_const, Selectivity *rest_selec)
1076 {
1077 Oid typeid = patt_const->consttype;
1078 char *prefix;
1079 bool exact;
1080
1081 /*
1082 * Should be unnecessary, there are no bytea regex operators defined. As
1083 * such, it should be noted that the rest of this function has *not* been
1084 * made safe for binary (possibly NULL containing) strings.
1085 */
1086 if (typeid == BYTEAOID)
1087 ereport(ERROR,
1088 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1089 errmsg("regular-expression matching not supported on type bytea")));
1090
1091 /* Use the regexp machinery to extract the prefix, if any */
1092 prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1093 case_insensitive, collation,
1094 &exact);
1095
1096 if (prefix == NULL)
1097 {
1098 *prefix_const = NULL;
1099
1100 if (rest_selec != NULL)
1101 {
1102 char *patt = TextDatumGetCString(patt_const->constvalue);
1103
1104 *rest_selec = regex_selectivity(patt, strlen(patt),
1105 case_insensitive,
1106 0);
1107 pfree(patt);
1108 }
1109
1110 return Pattern_Prefix_None;
1111 }
1112
1113 *prefix_const = string_to_const(prefix, typeid);
1114
1115 if (rest_selec != NULL)
1116 {
1117 if (exact)
1118 {
1119 /* Exact match, so there's no additional selectivity */
1120 *rest_selec = 1.0;
1121 }
1122 else
1123 {
1124 char *patt = TextDatumGetCString(patt_const->constvalue);
1125
1126 *rest_selec = regex_selectivity(patt, strlen(patt),
1127 case_insensitive,
1128 strlen(prefix));
1129 pfree(patt);
1130 }
1131 }
1132
1133 pfree(prefix);
1134
1135 if (exact)
1136 return Pattern_Prefix_Exact; /* pattern specifies exact match */
1137 else
1138 return Pattern_Prefix_Partial;
1139 }
1140
1141 static Pattern_Prefix_Status
pattern_fixed_prefix(Const * patt,Pattern_Type ptype,Oid collation,Const ** prefix,Selectivity * rest_selec)1142 pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1143 Const **prefix, Selectivity *rest_selec)
1144 {
1145 Pattern_Prefix_Status result;
1146
1147 switch (ptype)
1148 {
1149 case Pattern_Type_Like:
1150 result = like_fixed_prefix(patt, false, collation,
1151 prefix, rest_selec);
1152 break;
1153 case Pattern_Type_Like_IC:
1154 result = like_fixed_prefix(patt, true, collation,
1155 prefix, rest_selec);
1156 break;
1157 case Pattern_Type_Regex:
1158 result = regex_fixed_prefix(patt, false, collation,
1159 prefix, rest_selec);
1160 break;
1161 case Pattern_Type_Regex_IC:
1162 result = regex_fixed_prefix(patt, true, collation,
1163 prefix, rest_selec);
1164 break;
1165 case Pattern_Type_Prefix:
1166 /* Prefix type work is trivial. */
1167 result = Pattern_Prefix_Partial;
1168 *rest_selec = 1.0; /* all */
1169 *prefix = makeConst(patt->consttype,
1170 patt->consttypmod,
1171 patt->constcollid,
1172 patt->constlen,
1173 datumCopy(patt->constvalue,
1174 patt->constbyval,
1175 patt->constlen),
1176 patt->constisnull,
1177 patt->constbyval);
1178 break;
1179 default:
1180 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1181 result = Pattern_Prefix_None; /* keep compiler quiet */
1182 break;
1183 }
1184 return result;
1185 }
1186
1187 /*
1188 * Estimate the selectivity of a fixed prefix for a pattern match.
1189 *
1190 * A fixed prefix "foo" is estimated as the selectivity of the expression
1191 * "variable >= 'foo' AND variable < 'fop'".
1192 *
1193 * The selectivity estimate is with respect to the portion of the column
1194 * population represented by the histogram --- the caller must fold this
1195 * together with info about MCVs and NULLs.
1196 *
1197 * We use the given comparison operators and collation to do the estimation.
1198 * The given variable and Const must be of the associated datatype(s).
1199 *
1200 * XXX Note: we make use of the upper bound to estimate operator selectivity
1201 * even if the locale is such that we cannot rely on the upper-bound string.
1202 * The selectivity only needs to be approximately right anyway, so it seems
1203 * more useful to use the upper-bound code than not.
1204 */
1205 static Selectivity
prefix_selectivity(PlannerInfo * root,VariableStatData * vardata,Oid eqopr,Oid ltopr,Oid geopr,Oid collation,Const * prefixcon)1206 prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1207 Oid eqopr, Oid ltopr, Oid geopr,
1208 Oid collation,
1209 Const *prefixcon)
1210 {
1211 Selectivity prefixsel;
1212 FmgrInfo opproc;
1213 Const *greaterstrcon;
1214 Selectivity eq_sel;
1215
1216 /* Estimate the selectivity of "x >= prefix" */
1217 fmgr_info(get_opcode(geopr), &opproc);
1218
1219 prefixsel = ineq_histogram_selectivity(root, vardata,
1220 geopr, &opproc, true, true,
1221 collation,
1222 prefixcon->constvalue,
1223 prefixcon->consttype);
1224
1225 if (prefixsel < 0.0)
1226 {
1227 /* No histogram is present ... return a suitable default estimate */
1228 return DEFAULT_MATCH_SEL;
1229 }
1230
1231 /*
1232 * If we can create a string larger than the prefix, say "x < greaterstr".
1233 */
1234 fmgr_info(get_opcode(ltopr), &opproc);
1235 greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1236 if (greaterstrcon)
1237 {
1238 Selectivity topsel;
1239
1240 topsel = ineq_histogram_selectivity(root, vardata,
1241 ltopr, &opproc, false, false,
1242 collation,
1243 greaterstrcon->constvalue,
1244 greaterstrcon->consttype);
1245
1246 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1247 Assert(topsel >= 0.0);
1248
1249 /*
1250 * Merge the two selectivities in the same way as for a range query
1251 * (see clauselist_selectivity()). Note that we don't need to worry
1252 * about double-exclusion of nulls, since ineq_histogram_selectivity
1253 * doesn't count those anyway.
1254 */
1255 prefixsel = topsel + prefixsel - 1.0;
1256 }
1257
1258 /*
1259 * If the prefix is long then the two bounding values might be too close
1260 * together for the histogram to distinguish them usefully, resulting in a
1261 * zero estimate (plus or minus roundoff error). To avoid returning a
1262 * ridiculously small estimate, compute the estimated selectivity for
1263 * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1264 * estimate should be at least that.)
1265 *
1266 * We apply this even if we couldn't make a greater string. That case
1267 * suggests that the prefix is near the maximum possible, and thus
1268 * probably off the end of the histogram, and thus we probably got a very
1269 * small estimate from the >= condition; so we still need to clamp.
1270 */
1271 eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1272 false, true, false);
1273
1274 prefixsel = Max(prefixsel, eq_sel);
1275
1276 return prefixsel;
1277 }
1278
1279
1280 /*
1281 * Estimate the selectivity of a pattern of the specified type.
1282 * Note that any fixed prefix of the pattern will have been removed already,
1283 * so actually we may be looking at just a fragment of the pattern.
1284 *
1285 * For now, we use a very simplistic approach: fixed characters reduce the
1286 * selectivity a good deal, character ranges reduce it a little,
1287 * wildcards (such as % for LIKE or .* for regex) increase it.
1288 */
1289
1290 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1291 #define CHAR_RANGE_SEL 0.25
1292 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1293 #define FULL_WILDCARD_SEL 5.0
1294 #define PARTIAL_WILDCARD_SEL 2.0
1295
1296 static Selectivity
like_selectivity(const char * patt,int pattlen,bool case_insensitive)1297 like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1298 {
1299 Selectivity sel = 1.0;
1300 int pos;
1301
1302 /* Skip any leading wildcard; it's already factored into initial sel */
1303 for (pos = 0; pos < pattlen; pos++)
1304 {
1305 if (patt[pos] != '%' && patt[pos] != '_')
1306 break;
1307 }
1308
1309 for (; pos < pattlen; pos++)
1310 {
1311 /* % and _ are wildcard characters in LIKE */
1312 if (patt[pos] == '%')
1313 sel *= FULL_WILDCARD_SEL;
1314 else if (patt[pos] == '_')
1315 sel *= ANY_CHAR_SEL;
1316 else if (patt[pos] == '\\')
1317 {
1318 /* Backslash quotes the next character */
1319 pos++;
1320 if (pos >= pattlen)
1321 break;
1322 sel *= FIXED_CHAR_SEL;
1323 }
1324 else
1325 sel *= FIXED_CHAR_SEL;
1326 }
1327 /* Could get sel > 1 if multiple wildcards */
1328 if (sel > 1.0)
1329 sel = 1.0;
1330 return sel;
1331 }
1332
1333 static Selectivity
regex_selectivity_sub(const char * patt,int pattlen,bool case_insensitive)1334 regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1335 {
1336 Selectivity sel = 1.0;
1337 int paren_depth = 0;
1338 int paren_pos = 0; /* dummy init to keep compiler quiet */
1339 int pos;
1340
1341 for (pos = 0; pos < pattlen; pos++)
1342 {
1343 if (patt[pos] == '(')
1344 {
1345 if (paren_depth == 0)
1346 paren_pos = pos; /* remember start of parenthesized item */
1347 paren_depth++;
1348 }
1349 else if (patt[pos] == ')' && paren_depth > 0)
1350 {
1351 paren_depth--;
1352 if (paren_depth == 0)
1353 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1354 pos - (paren_pos + 1),
1355 case_insensitive);
1356 }
1357 else if (patt[pos] == '|' && paren_depth == 0)
1358 {
1359 /*
1360 * If unquoted | is present at paren level 0 in pattern, we have
1361 * multiple alternatives; sum their probabilities.
1362 */
1363 sel += regex_selectivity_sub(patt + (pos + 1),
1364 pattlen - (pos + 1),
1365 case_insensitive);
1366 break; /* rest of pattern is now processed */
1367 }
1368 else if (patt[pos] == '[')
1369 {
1370 bool negclass = false;
1371
1372 if (patt[++pos] == '^')
1373 {
1374 negclass = true;
1375 pos++;
1376 }
1377 if (patt[pos] == ']') /* ']' at start of class is not special */
1378 pos++;
1379 while (pos < pattlen && patt[pos] != ']')
1380 pos++;
1381 if (paren_depth == 0)
1382 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1383 }
1384 else if (patt[pos] == '.')
1385 {
1386 if (paren_depth == 0)
1387 sel *= ANY_CHAR_SEL;
1388 }
1389 else if (patt[pos] == '*' ||
1390 patt[pos] == '?' ||
1391 patt[pos] == '+')
1392 {
1393 /* Ought to be smarter about quantifiers... */
1394 if (paren_depth == 0)
1395 sel *= PARTIAL_WILDCARD_SEL;
1396 }
1397 else if (patt[pos] == '{')
1398 {
1399 while (pos < pattlen && patt[pos] != '}')
1400 pos++;
1401 if (paren_depth == 0)
1402 sel *= PARTIAL_WILDCARD_SEL;
1403 }
1404 else if (patt[pos] == '\\')
1405 {
1406 /* backslash quotes the next character */
1407 pos++;
1408 if (pos >= pattlen)
1409 break;
1410 if (paren_depth == 0)
1411 sel *= FIXED_CHAR_SEL;
1412 }
1413 else
1414 {
1415 if (paren_depth == 0)
1416 sel *= FIXED_CHAR_SEL;
1417 }
1418 }
1419 /* Could get sel > 1 if multiple wildcards */
1420 if (sel > 1.0)
1421 sel = 1.0;
1422 return sel;
1423 }
1424
1425 static Selectivity
regex_selectivity(const char * patt,int pattlen,bool case_insensitive,int fixed_prefix_len)1426 regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1427 int fixed_prefix_len)
1428 {
1429 Selectivity sel;
1430
1431 /* If patt doesn't end with $, consider it to have a trailing wildcard */
1432 if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1433 (pattlen == 1 || patt[pattlen - 2] != '\\'))
1434 {
1435 /* has trailing $ */
1436 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1437 }
1438 else
1439 {
1440 /* no trailing $ */
1441 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1442 sel *= FULL_WILDCARD_SEL;
1443 }
1444
1445 /*
1446 * If there's a fixed prefix, discount its selectivity. We have to be
1447 * careful here since a very long prefix could result in pow's result
1448 * underflowing to zero (in which case "sel" probably has as well).
1449 */
1450 if (fixed_prefix_len > 0)
1451 {
1452 double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1453
1454 if (prefixsel > 0.0)
1455 sel /= prefixsel;
1456 }
1457
1458 /* Make sure result stays in range */
1459 CLAMP_PROBABILITY(sel);
1460 return sel;
1461 }
1462
1463 /*
1464 * Check whether char is a letter (and, hence, subject to case-folding)
1465 *
1466 * In multibyte character sets or with ICU, we can't use isalpha, and it does
1467 * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1468 * Instead, just assume any non-ASCII char is potentially case-varying, and
1469 * hard-wire knowledge of which ASCII chars are letters.
1470 */
1471 static int
pattern_char_isalpha(char c,bool is_multibyte,pg_locale_t locale,bool locale_is_c)1472 pattern_char_isalpha(char c, bool is_multibyte,
1473 pg_locale_t locale, bool locale_is_c)
1474 {
1475 if (locale_is_c)
1476 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1477 else if (is_multibyte && IS_HIGHBIT_SET(c))
1478 return true;
1479 else if (locale && locale->provider == COLLPROVIDER_ICU)
1480 return IS_HIGHBIT_SET(c) ||
1481 (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1482 #ifdef HAVE_LOCALE_T
1483 else if (locale && locale->provider == COLLPROVIDER_LIBC)
1484 return isalpha_l((unsigned char) c, locale->info.lt);
1485 #endif
1486 else
1487 return isalpha((unsigned char) c);
1488 }
1489
1490
1491 /*
1492 * For bytea, the increment function need only increment the current byte
1493 * (there are no multibyte characters to worry about).
1494 */
1495 static bool
byte_increment(unsigned char * ptr,int len)1496 byte_increment(unsigned char *ptr, int len)
1497 {
1498 if (*ptr >= 255)
1499 return false;
1500 (*ptr)++;
1501 return true;
1502 }
1503
1504 /*
1505 * Try to generate a string greater than the given string or any
1506 * string it is a prefix of. If successful, return a palloc'd string
1507 * in the form of a Const node; else return NULL.
1508 *
1509 * The caller must provide the appropriate "less than" comparison function
1510 * for testing the strings, along with the collation to use.
1511 *
1512 * The key requirement here is that given a prefix string, say "foo",
1513 * we must be able to generate another string "fop" that is greater than
1514 * all strings "foobar" starting with "foo". We can test that we have
1515 * generated a string greater than the prefix string, but in non-C collations
1516 * that is not a bulletproof guarantee that an extension of the string might
1517 * not sort after it; an example is that "foo " is less than "foo!", but it
1518 * is not clear that a "dictionary" sort ordering will consider "foo!" less
1519 * than "foo bar". CAUTION: Therefore, this function should be used only for
1520 * estimation purposes when working in a non-C collation.
1521 *
1522 * To try to catch most cases where an extended string might otherwise sort
1523 * before the result value, we determine which of the strings "Z", "z", "y",
1524 * and "9" is seen as largest by the collation, and append that to the given
1525 * prefix before trying to find a string that compares as larger.
1526 *
1527 * To search for a greater string, we repeatedly "increment" the rightmost
1528 * character, using an encoding-specific character incrementer function.
1529 * When it's no longer possible to increment the last character, we truncate
1530 * off that character and start incrementing the next-to-rightmost.
1531 * For example, if "z" were the last character in the sort order, then we
1532 * could produce "foo" as a string greater than "fonz".
1533 *
1534 * This could be rather slow in the worst case, but in most cases we
1535 * won't have to try more than one or two strings before succeeding.
1536 *
1537 * Note that it's important for the character incrementer not to be too anal
1538 * about producing every possible character code, since in some cases the only
1539 * way to get a larger string is to increment a previous character position.
1540 * So we don't want to spend too much time trying every possible character
1541 * code at the last position. A good rule of thumb is to be sure that we
1542 * don't try more than 256*K values for a K-byte character (and definitely
1543 * not 256^K, which is what an exhaustive search would approach).
1544 */
1545 static Const *
make_greater_string(const Const * str_const,FmgrInfo * ltproc,Oid collation)1546 make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1547 {
1548 Oid datatype = str_const->consttype;
1549 char *workstr;
1550 int len;
1551 Datum cmpstr;
1552 char *cmptxt = NULL;
1553 mbcharacter_incrementer charinc;
1554
1555 /*
1556 * Get a modifiable copy of the prefix string in C-string format, and set
1557 * up the string we will compare to as a Datum. In C locale this can just
1558 * be the given prefix string, otherwise we need to add a suffix. Type
1559 * BYTEA sorts bytewise so it never needs a suffix either.
1560 */
1561 if (datatype == BYTEAOID)
1562 {
1563 bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1564
1565 len = VARSIZE_ANY_EXHDR(bstr);
1566 workstr = (char *) palloc(len);
1567 memcpy(workstr, VARDATA_ANY(bstr), len);
1568 Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1569 cmpstr = str_const->constvalue;
1570 }
1571 else
1572 {
1573 if (datatype == NAMEOID)
1574 workstr = DatumGetCString(DirectFunctionCall1(nameout,
1575 str_const->constvalue));
1576 else
1577 workstr = TextDatumGetCString(str_const->constvalue);
1578 len = strlen(workstr);
1579 if (lc_collate_is_c(collation) || len == 0)
1580 cmpstr = str_const->constvalue;
1581 else
1582 {
1583 /* If first time through, determine the suffix to use */
1584 static char suffixchar = 0;
1585 static Oid suffixcollation = 0;
1586
1587 if (!suffixchar || suffixcollation != collation)
1588 {
1589 char *best;
1590
1591 best = "Z";
1592 if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1593 best = "z";
1594 if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1595 best = "y";
1596 if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1597 best = "9";
1598 suffixchar = *best;
1599 suffixcollation = collation;
1600 }
1601
1602 /* And build the string to compare to */
1603 if (datatype == NAMEOID)
1604 {
1605 cmptxt = palloc(len + 2);
1606 memcpy(cmptxt, workstr, len);
1607 cmptxt[len] = suffixchar;
1608 cmptxt[len + 1] = '\0';
1609 cmpstr = PointerGetDatum(cmptxt);
1610 }
1611 else
1612 {
1613 cmptxt = palloc(VARHDRSZ + len + 1);
1614 SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1615 memcpy(VARDATA(cmptxt), workstr, len);
1616 *(VARDATA(cmptxt) + len) = suffixchar;
1617 cmpstr = PointerGetDatum(cmptxt);
1618 }
1619 }
1620 }
1621
1622 /* Select appropriate character-incrementer function */
1623 if (datatype == BYTEAOID)
1624 charinc = byte_increment;
1625 else
1626 charinc = pg_database_encoding_character_incrementer();
1627
1628 /* And search ... */
1629 while (len > 0)
1630 {
1631 int charlen;
1632 unsigned char *lastchar;
1633
1634 /* Identify the last character --- for bytea, just the last byte */
1635 if (datatype == BYTEAOID)
1636 charlen = 1;
1637 else
1638 charlen = len - pg_mbcliplen(workstr, len, len - 1);
1639 lastchar = (unsigned char *) (workstr + len - charlen);
1640
1641 /*
1642 * Try to generate a larger string by incrementing the last character
1643 * (for BYTEA, we treat each byte as a character).
1644 *
1645 * Note: the incrementer function is expected to return true if it's
1646 * generated a valid-per-the-encoding new character, otherwise false.
1647 * The contents of the character on false return are unspecified.
1648 */
1649 while (charinc(lastchar, charlen))
1650 {
1651 Const *workstr_const;
1652
1653 if (datatype == BYTEAOID)
1654 workstr_const = string_to_bytea_const(workstr, len);
1655 else
1656 workstr_const = string_to_const(workstr, datatype);
1657
1658 if (DatumGetBool(FunctionCall2Coll(ltproc,
1659 collation,
1660 cmpstr,
1661 workstr_const->constvalue)))
1662 {
1663 /* Successfully made a string larger than cmpstr */
1664 if (cmptxt)
1665 pfree(cmptxt);
1666 pfree(workstr);
1667 return workstr_const;
1668 }
1669
1670 /* No good, release unusable value and try again */
1671 pfree(DatumGetPointer(workstr_const->constvalue));
1672 pfree(workstr_const);
1673 }
1674
1675 /*
1676 * No luck here, so truncate off the last character and try to
1677 * increment the next one.
1678 */
1679 len -= charlen;
1680 workstr[len] = '\0';
1681 }
1682
1683 /* Failed... */
1684 if (cmptxt)
1685 pfree(cmptxt);
1686 pfree(workstr);
1687
1688 return NULL;
1689 }
1690
1691 /*
1692 * Generate a Datum of the appropriate type from a C string.
1693 * Note that all of the supported types are pass-by-ref, so the
1694 * returned value should be pfree'd if no longer needed.
1695 */
1696 static Datum
string_to_datum(const char * str,Oid datatype)1697 string_to_datum(const char *str, Oid datatype)
1698 {
1699 Assert(str != NULL);
1700
1701 /*
1702 * We cheat a little by assuming that CStringGetTextDatum() will do for
1703 * bpchar and varchar constants too...
1704 */
1705 if (datatype == NAMEOID)
1706 return DirectFunctionCall1(namein, CStringGetDatum(str));
1707 else if (datatype == BYTEAOID)
1708 return DirectFunctionCall1(byteain, CStringGetDatum(str));
1709 else
1710 return CStringGetTextDatum(str);
1711 }
1712
1713 /*
1714 * Generate a Const node of the appropriate type from a C string.
1715 */
1716 static Const *
string_to_const(const char * str,Oid datatype)1717 string_to_const(const char *str, Oid datatype)
1718 {
1719 Datum conval = string_to_datum(str, datatype);
1720 Oid collation;
1721 int constlen;
1722
1723 /*
1724 * We only need to support a few datatypes here, so hard-wire properties
1725 * instead of incurring the expense of catalog lookups.
1726 */
1727 switch (datatype)
1728 {
1729 case TEXTOID:
1730 case VARCHAROID:
1731 case BPCHAROID:
1732 collation = DEFAULT_COLLATION_OID;
1733 constlen = -1;
1734 break;
1735
1736 case NAMEOID:
1737 collation = C_COLLATION_OID;
1738 constlen = NAMEDATALEN;
1739 break;
1740
1741 case BYTEAOID:
1742 collation = InvalidOid;
1743 constlen = -1;
1744 break;
1745
1746 default:
1747 elog(ERROR, "unexpected datatype in string_to_const: %u",
1748 datatype);
1749 return NULL;
1750 }
1751
1752 return makeConst(datatype, -1, collation, constlen,
1753 conval, false, false);
1754 }
1755
1756 /*
1757 * Generate a Const node of bytea type from a binary C string and a length.
1758 */
1759 static Const *
string_to_bytea_const(const char * str,size_t str_len)1760 string_to_bytea_const(const char *str, size_t str_len)
1761 {
1762 bytea *bstr = palloc(VARHDRSZ + str_len);
1763 Datum conval;
1764
1765 memcpy(VARDATA(bstr), str, str_len);
1766 SET_VARSIZE(bstr, VARHDRSZ + str_len);
1767 conval = PointerGetDatum(bstr);
1768
1769 return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1770 }
1771