1 /*-------------------------------------------------------------------------
2  *
3  * like_support.c
4  *	  Planner support functions for LIKE, regex, and related operators.
5  *
6  * These routines handle special optimization of operators that can be
7  * used with index scans even though they are not known to the executor's
8  * indexscan machinery.  The key idea is that these operators allow us
9  * to derive approximate indexscan qual clauses, such that any tuples
10  * that pass the operator clause itself must also satisfy the simpler
11  * indexscan condition(s).  Then we can use the indexscan machinery
12  * to avoid scanning as much of the table as we'd otherwise have to,
13  * while applying the original operator as a qpqual condition to ensure
14  * we deliver only the tuples we want.  (In essence, we're using a regular
15  * index as if it were a lossy index.)
16  *
17  * An example of what we're doing is
18  *			textfield LIKE 'abc%def'
19  * from which we can generate the indexscanable conditions
20  *			textfield >= 'abc' AND textfield < 'abd'
21  * which allow efficient scanning of an index on textfield.
22  * (In reality, character set and collation issues make the transformation
23  * from LIKE to indexscan limits rather harder than one might think ...
24  * but that's the basic idea.)
25  *
26  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  *
30  * IDENTIFICATION
31  *	  src/backend/utils/adt/like_support.c
32  *
33  *-------------------------------------------------------------------------
34  */
35 #include "postgres.h"
36 
37 #include <math.h>
38 
39 #include "access/htup_details.h"
40 #include "access/stratnum.h"
41 #include "catalog/pg_collation.h"
42 #include "catalog/pg_operator.h"
43 #include "catalog/pg_opfamily.h"
44 #include "catalog/pg_statistic.h"
45 #include "catalog/pg_type.h"
46 #include "mb/pg_wchar.h"
47 #include "nodes/makefuncs.h"
48 #include "nodes/nodeFuncs.h"
49 #include "nodes/supportnodes.h"
50 #include "utils/builtins.h"
51 #include "utils/datum.h"
52 #include "utils/lsyscache.h"
53 #include "utils/pg_locale.h"
54 #include "utils/selfuncs.h"
55 #include "utils/varlena.h"
56 
57 
58 typedef enum
59 {
60 	Pattern_Type_Like,
61 	Pattern_Type_Like_IC,
62 	Pattern_Type_Regex,
63 	Pattern_Type_Regex_IC,
64 	Pattern_Type_Prefix
65 } Pattern_Type;
66 
67 typedef enum
68 {
69 	Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
70 } Pattern_Prefix_Status;
71 
72 static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
73 static List *match_pattern_prefix(Node *leftop,
74 								  Node *rightop,
75 								  Pattern_Type ptype,
76 								  Oid expr_coll,
77 								  Oid opfamily,
78 								  Oid indexcollation);
79 static double patternsel_common(PlannerInfo *root,
80 								Oid oprid,
81 								Oid opfuncid,
82 								List *args,
83 								int varRelid,
84 								Oid collation,
85 								Pattern_Type ptype,
86 								bool negate);
87 static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
88 												  Pattern_Type ptype,
89 												  Oid collation,
90 												  Const **prefix,
91 												  Selectivity *rest_selec);
92 static Selectivity prefix_selectivity(PlannerInfo *root,
93 									  VariableStatData *vardata,
94 									  Oid eqopr, Oid ltopr, Oid geopr,
95 									  Oid collation,
96 									  Const *prefixcon);
97 static Selectivity like_selectivity(const char *patt, int pattlen,
98 									bool case_insensitive);
99 static Selectivity regex_selectivity(const char *patt, int pattlen,
100 									 bool case_insensitive,
101 									 int fixed_prefix_len);
102 static int	pattern_char_isalpha(char c, bool is_multibyte,
103 								 pg_locale_t locale, bool locale_is_c);
104 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
105 								  Oid collation);
106 static Datum string_to_datum(const char *str, Oid datatype);
107 static Const *string_to_const(const char *str, Oid datatype);
108 static Const *string_to_bytea_const(const char *str, size_t str_len);
109 
110 
111 /*
112  * Planner support functions for LIKE, regex, and related operators
113  */
114 Datum
textlike_support(PG_FUNCTION_ARGS)115 textlike_support(PG_FUNCTION_ARGS)
116 {
117 	Node	   *rawreq = (Node *) PG_GETARG_POINTER(0);
118 
119 	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
120 }
121 
122 Datum
texticlike_support(PG_FUNCTION_ARGS)123 texticlike_support(PG_FUNCTION_ARGS)
124 {
125 	Node	   *rawreq = (Node *) PG_GETARG_POINTER(0);
126 
127 	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
128 }
129 
130 Datum
textregexeq_support(PG_FUNCTION_ARGS)131 textregexeq_support(PG_FUNCTION_ARGS)
132 {
133 	Node	   *rawreq = (Node *) PG_GETARG_POINTER(0);
134 
135 	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
136 }
137 
138 Datum
texticregexeq_support(PG_FUNCTION_ARGS)139 texticregexeq_support(PG_FUNCTION_ARGS)
140 {
141 	Node	   *rawreq = (Node *) PG_GETARG_POINTER(0);
142 
143 	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
144 }
145 
146 /* Common code for the above */
147 static Node *
like_regex_support(Node * rawreq,Pattern_Type ptype)148 like_regex_support(Node *rawreq, Pattern_Type ptype)
149 {
150 	Node	   *ret = NULL;
151 
152 	if (IsA(rawreq, SupportRequestSelectivity))
153 	{
154 		/*
155 		 * Make a selectivity estimate for a function call, just as we'd do if
156 		 * the call was via the corresponding operator.
157 		 */
158 		SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
159 		Selectivity s1;
160 
161 		if (req->is_join)
162 		{
163 			/*
164 			 * For the moment we just punt.  If patternjoinsel is ever
165 			 * improved to do better, this should be made to call it.
166 			 */
167 			s1 = DEFAULT_MATCH_SEL;
168 		}
169 		else
170 		{
171 			/* Share code with operator restriction selectivity functions */
172 			s1 = patternsel_common(req->root,
173 								   InvalidOid,
174 								   req->funcid,
175 								   req->args,
176 								   req->varRelid,
177 								   req->inputcollid,
178 								   ptype,
179 								   false);
180 		}
181 		req->selectivity = s1;
182 		ret = (Node *) req;
183 	}
184 	else if (IsA(rawreq, SupportRequestIndexCondition))
185 	{
186 		/* Try to convert operator/function call to index conditions */
187 		SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
188 
189 		/*
190 		 * Currently we have no "reverse" match operators with the pattern on
191 		 * the left, so we only need consider cases with the indexkey on the
192 		 * left.
193 		 */
194 		if (req->indexarg != 0)
195 			return NULL;
196 
197 		if (is_opclause(req->node))
198 		{
199 			OpExpr	   *clause = (OpExpr *) req->node;
200 
201 			Assert(list_length(clause->args) == 2);
202 			ret = (Node *)
203 				match_pattern_prefix((Node *) linitial(clause->args),
204 									 (Node *) lsecond(clause->args),
205 									 ptype,
206 									 clause->inputcollid,
207 									 req->opfamily,
208 									 req->indexcollation);
209 		}
210 		else if (is_funcclause(req->node))	/* be paranoid */
211 		{
212 			FuncExpr   *clause = (FuncExpr *) req->node;
213 
214 			Assert(list_length(clause->args) == 2);
215 			ret = (Node *)
216 				match_pattern_prefix((Node *) linitial(clause->args),
217 									 (Node *) lsecond(clause->args),
218 									 ptype,
219 									 clause->inputcollid,
220 									 req->opfamily,
221 									 req->indexcollation);
222 		}
223 	}
224 
225 	return ret;
226 }
227 
228 /*
229  * match_pattern_prefix
230  *	  Try to generate an indexqual for a LIKE or regex operator.
231  */
232 static List *
match_pattern_prefix(Node * leftop,Node * rightop,Pattern_Type ptype,Oid expr_coll,Oid opfamily,Oid indexcollation)233 match_pattern_prefix(Node *leftop,
234 					 Node *rightop,
235 					 Pattern_Type ptype,
236 					 Oid expr_coll,
237 					 Oid opfamily,
238 					 Oid indexcollation)
239 {
240 	List	   *result;
241 	Const	   *patt;
242 	Const	   *prefix;
243 	Pattern_Prefix_Status pstatus;
244 	Oid			ldatatype;
245 	Oid			rdatatype;
246 	Oid			eqopr;
247 	Oid			ltopr;
248 	Oid			geopr;
249 	bool		collation_aware;
250 	Expr	   *expr;
251 	FmgrInfo	ltproc;
252 	Const	   *greaterstr;
253 
254 	/*
255 	 * Can't do anything with a non-constant or NULL pattern argument.
256 	 *
257 	 * Note that since we restrict ourselves to cases with a hard constant on
258 	 * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
259 	 * about verifying that.
260 	 */
261 	if (!IsA(rightop, Const) ||
262 		((Const *) rightop)->constisnull)
263 		return NIL;
264 	patt = (Const *) rightop;
265 
266 	/*
267 	 * Not supported if the expression collation is nondeterministic.  The
268 	 * optimized equality or prefix tests use bytewise comparisons, which is
269 	 * not consistent with nondeterministic collations.  The actual
270 	 * pattern-matching implementation functions will later error out that
271 	 * pattern-matching is not supported with nondeterministic collations. (We
272 	 * could also error out here, but by doing it later we get more precise
273 	 * error messages.)  (It should be possible to support at least
274 	 * Pattern_Prefix_Exact, but no point as long as the actual
275 	 * pattern-matching implementations don't support it.)
276 	 *
277 	 * expr_coll is not set for a non-collation-aware data type such as bytea.
278 	 */
279 	if (expr_coll && !get_collation_isdeterministic(expr_coll))
280 		return NIL;
281 
282 	/*
283 	 * Try to extract a fixed prefix from the pattern.
284 	 */
285 	pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
286 								   &prefix, NULL);
287 
288 	/* fail if no fixed prefix */
289 	if (pstatus == Pattern_Prefix_None)
290 		return NIL;
291 
292 	/*
293 	 * Identify the operators we want to use, based on the type of the
294 	 * left-hand argument.  Usually these are just the type's regular
295 	 * comparison operators, but if we are considering one of the semi-legacy
296 	 * "pattern" opclasses, use the "pattern" operators instead.  Those are
297 	 * not collation-sensitive but always use C collation, as we want.  The
298 	 * selected operators also determine the needed type of the prefix
299 	 * constant.
300 	 */
301 	ldatatype = exprType(leftop);
302 	switch (ldatatype)
303 	{
304 		case TEXTOID:
305 			if (opfamily == TEXT_PATTERN_BTREE_FAM_OID ||
306 				opfamily == TEXT_SPGIST_FAM_OID)
307 			{
308 				eqopr = TextEqualOperator;
309 				ltopr = TextPatternLessOperator;
310 				geopr = TextPatternGreaterEqualOperator;
311 				collation_aware = false;
312 			}
313 			else
314 			{
315 				eqopr = TextEqualOperator;
316 				ltopr = TextLessOperator;
317 				geopr = TextGreaterEqualOperator;
318 				collation_aware = true;
319 			}
320 			rdatatype = TEXTOID;
321 			break;
322 		case NAMEOID:
323 
324 			/*
325 			 * Note that here, we need the RHS type to be text, so that the
326 			 * comparison value isn't improperly truncated to NAMEDATALEN.
327 			 */
328 			eqopr = NameEqualTextOperator;
329 			ltopr = NameLessTextOperator;
330 			geopr = NameGreaterEqualTextOperator;
331 			collation_aware = true;
332 			rdatatype = TEXTOID;
333 			break;
334 		case BPCHAROID:
335 			if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
336 			{
337 				eqopr = BpcharEqualOperator;
338 				ltopr = BpcharPatternLessOperator;
339 				geopr = BpcharPatternGreaterEqualOperator;
340 				collation_aware = false;
341 			}
342 			else
343 			{
344 				eqopr = BpcharEqualOperator;
345 				ltopr = BpcharLessOperator;
346 				geopr = BpcharGreaterEqualOperator;
347 				collation_aware = true;
348 			}
349 			rdatatype = BPCHAROID;
350 			break;
351 		case BYTEAOID:
352 			eqopr = ByteaEqualOperator;
353 			ltopr = ByteaLessOperator;
354 			geopr = ByteaGreaterEqualOperator;
355 			collation_aware = false;
356 			rdatatype = BYTEAOID;
357 			break;
358 		default:
359 			/* Can't get here unless we're attached to the wrong operator */
360 			return NIL;
361 	}
362 
363 	/*
364 	 * If necessary, verify that the index's collation behavior is compatible.
365 	 * For an exact-match case, we don't have to be picky.  Otherwise, insist
366 	 * that the index collation be "C".  Note that here we are looking at the
367 	 * index's collation, not the expression's collation -- this test is *not*
368 	 * dependent on the LIKE/regex operator's collation.
369 	 */
370 	if (collation_aware)
371 	{
372 		if (!(pstatus == Pattern_Prefix_Exact ||
373 			  lc_collate_is_c(indexcollation)))
374 			return NIL;
375 	}
376 
377 	/*
378 	 * If necessary, coerce the prefix constant to the right type.  The given
379 	 * prefix constant is either text or bytea type, therefore the only case
380 	 * where we need to do anything is when converting text to bpchar.  Those
381 	 * two types are binary-compatible, so relabeling the Const node is
382 	 * sufficient.
383 	 */
384 	if (prefix->consttype != rdatatype)
385 	{
386 		Assert(prefix->consttype == TEXTOID &&
387 			   rdatatype == BPCHAROID);
388 		prefix->consttype = rdatatype;
389 	}
390 
391 	/*
392 	 * If we found an exact-match pattern, generate an "=" indexqual.
393 	 *
394 	 * Here and below, check to see whether the desired operator is actually
395 	 * supported by the index opclass, and fail quietly if not.  This allows
396 	 * us to not be concerned with specific opclasses (except for the legacy
397 	 * "pattern" cases); any index that correctly implements the operators
398 	 * will work.
399 	 */
400 	if (pstatus == Pattern_Prefix_Exact)
401 	{
402 		if (!op_in_opfamily(eqopr, opfamily))
403 			return NIL;
404 		expr = make_opclause(eqopr, BOOLOID, false,
405 							 (Expr *) leftop, (Expr *) prefix,
406 							 InvalidOid, indexcollation);
407 		result = list_make1(expr);
408 		return result;
409 	}
410 
411 	/*
412 	 * Otherwise, we have a nonempty required prefix of the values.
413 	 *
414 	 * We can always say "x >= prefix".
415 	 */
416 	if (!op_in_opfamily(geopr, opfamily))
417 		return NIL;
418 	expr = make_opclause(geopr, BOOLOID, false,
419 						 (Expr *) leftop, (Expr *) prefix,
420 						 InvalidOid, indexcollation);
421 	result = list_make1(expr);
422 
423 	/*-------
424 	 * If we can create a string larger than the prefix, we can say
425 	 * "x < greaterstr".  NB: we rely on make_greater_string() to generate
426 	 * a guaranteed-greater string, not just a probably-greater string.
427 	 * In general this is only guaranteed in C locale, so we'd better be
428 	 * using a C-locale index collation.
429 	 *-------
430 	 */
431 	if (!op_in_opfamily(ltopr, opfamily))
432 		return result;
433 	fmgr_info(get_opcode(ltopr), &ltproc);
434 	greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
435 	if (greaterstr)
436 	{
437 		expr = make_opclause(ltopr, BOOLOID, false,
438 							 (Expr *) leftop, (Expr *) greaterstr,
439 							 InvalidOid, indexcollation);
440 		result = lappend(result, expr);
441 	}
442 
443 	return result;
444 }
445 
446 
447 /*
448  * patternsel_common - generic code for pattern-match restriction selectivity.
449  *
450  * To support using this from either the operator or function paths, caller
451  * may pass either operator OID or underlying function OID; we look up the
452  * latter from the former if needed.  (We could just have patternsel() call
453  * get_opcode(), but the work would be wasted if we don't have a need to
454  * compare a fixed prefix to the pg_statistic data.)
455  *
456  * Note that oprid and/or opfuncid should be for the positive-match operator
457  * even when negate is true.
458  */
459 static double
patternsel_common(PlannerInfo * root,Oid oprid,Oid opfuncid,List * args,int varRelid,Oid collation,Pattern_Type ptype,bool negate)460 patternsel_common(PlannerInfo *root,
461 				  Oid oprid,
462 				  Oid opfuncid,
463 				  List *args,
464 				  int varRelid,
465 				  Oid collation,
466 				  Pattern_Type ptype,
467 				  bool negate)
468 {
469 	VariableStatData vardata;
470 	Node	   *other;
471 	bool		varonleft;
472 	Datum		constval;
473 	Oid			consttype;
474 	Oid			vartype;
475 	Oid			rdatatype;
476 	Oid			eqopr;
477 	Oid			ltopr;
478 	Oid			geopr;
479 	Pattern_Prefix_Status pstatus;
480 	Const	   *patt;
481 	Const	   *prefix = NULL;
482 	Selectivity rest_selec = 0;
483 	double		nullfrac = 0.0;
484 	double		result;
485 
486 	/*
487 	 * Initialize result to the appropriate default estimate depending on
488 	 * whether it's a match or not-match operator.
489 	 */
490 	if (negate)
491 		result = 1.0 - DEFAULT_MATCH_SEL;
492 	else
493 		result = DEFAULT_MATCH_SEL;
494 
495 	/*
496 	 * If expression is not variable op constant, then punt and return the
497 	 * default estimate.
498 	 */
499 	if (!get_restriction_variable(root, args, varRelid,
500 								  &vardata, &other, &varonleft))
501 		return result;
502 	if (!varonleft || !IsA(other, Const))
503 	{
504 		ReleaseVariableStats(vardata);
505 		return result;
506 	}
507 
508 	/*
509 	 * If the constant is NULL, assume operator is strict and return zero, ie,
510 	 * operator will never return TRUE.  (It's zero even for a negator op.)
511 	 */
512 	if (((Const *) other)->constisnull)
513 	{
514 		ReleaseVariableStats(vardata);
515 		return 0.0;
516 	}
517 	constval = ((Const *) other)->constvalue;
518 	consttype = ((Const *) other)->consttype;
519 
520 	/*
521 	 * The right-hand const is type text or bytea for all supported operators.
522 	 * We do not expect to see binary-compatible types here, since
523 	 * const-folding should have relabeled the const to exactly match the
524 	 * operator's declared type.
525 	 */
526 	if (consttype != TEXTOID && consttype != BYTEAOID)
527 	{
528 		ReleaseVariableStats(vardata);
529 		return result;
530 	}
531 
532 	/*
533 	 * Similarly, the exposed type of the left-hand side should be one of
534 	 * those we know.  (Do not look at vardata.atttype, which might be
535 	 * something binary-compatible but different.)	We can use it to identify
536 	 * the comparison operators and the required type of the comparison
537 	 * constant, much as in match_pattern_prefix().
538 	 */
539 	vartype = vardata.vartype;
540 
541 	switch (vartype)
542 	{
543 		case TEXTOID:
544 			eqopr = TextEqualOperator;
545 			ltopr = TextLessOperator;
546 			geopr = TextGreaterEqualOperator;
547 			rdatatype = TEXTOID;
548 			break;
549 		case NAMEOID:
550 
551 			/*
552 			 * Note that here, we need the RHS type to be text, so that the
553 			 * comparison value isn't improperly truncated to NAMEDATALEN.
554 			 */
555 			eqopr = NameEqualTextOperator;
556 			ltopr = NameLessTextOperator;
557 			geopr = NameGreaterEqualTextOperator;
558 			rdatatype = TEXTOID;
559 			break;
560 		case BPCHAROID:
561 			eqopr = BpcharEqualOperator;
562 			ltopr = BpcharLessOperator;
563 			geopr = BpcharGreaterEqualOperator;
564 			rdatatype = BPCHAROID;
565 			break;
566 		case BYTEAOID:
567 			eqopr = ByteaEqualOperator;
568 			ltopr = ByteaLessOperator;
569 			geopr = ByteaGreaterEqualOperator;
570 			rdatatype = BYTEAOID;
571 			break;
572 		default:
573 			/* Can't get here unless we're attached to the wrong operator */
574 			ReleaseVariableStats(vardata);
575 			return result;
576 	}
577 
578 	/*
579 	 * Grab the nullfrac for use below.
580 	 */
581 	if (HeapTupleIsValid(vardata.statsTuple))
582 	{
583 		Form_pg_statistic stats;
584 
585 		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
586 		nullfrac = stats->stanullfrac;
587 	}
588 
589 	/*
590 	 * Pull out any fixed prefix implied by the pattern, and estimate the
591 	 * fractional selectivity of the remainder of the pattern.  Unlike many
592 	 * other selectivity estimators, we use the pattern operator's actual
593 	 * collation for this step.  This is not because we expect the collation
594 	 * to make a big difference in the selectivity estimate (it seldom would),
595 	 * but because we want to be sure we cache compiled regexps under the
596 	 * right cache key, so that they can be re-used at runtime.
597 	 */
598 	patt = (Const *) other;
599 	pstatus = pattern_fixed_prefix(patt, ptype, collation,
600 								   &prefix, &rest_selec);
601 
602 	/*
603 	 * If necessary, coerce the prefix constant to the right type.  The only
604 	 * case where we need to do anything is when converting text to bpchar.
605 	 * Those two types are binary-compatible, so relabeling the Const node is
606 	 * sufficient.
607 	 */
608 	if (prefix && prefix->consttype != rdatatype)
609 	{
610 		Assert(prefix->consttype == TEXTOID &&
611 			   rdatatype == BPCHAROID);
612 		prefix->consttype = rdatatype;
613 	}
614 
615 	if (pstatus == Pattern_Prefix_Exact)
616 	{
617 		/*
618 		 * Pattern specifies an exact match, so estimate as for '='
619 		 */
620 		result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
621 							  false, true, false);
622 	}
623 	else
624 	{
625 		/*
626 		 * Not exact-match pattern.  If we have a sufficiently large
627 		 * histogram, estimate selectivity for the histogram part of the
628 		 * population by counting matches in the histogram.  If not, estimate
629 		 * selectivity of the fixed prefix and remainder of pattern
630 		 * separately, then combine the two to get an estimate of the
631 		 * selectivity for the part of the column population represented by
632 		 * the histogram.  (For small histograms, we combine these
633 		 * approaches.)
634 		 *
635 		 * We then add up data for any most-common-values values; these are
636 		 * not in the histogram population, and we can get exact answers for
637 		 * them by applying the pattern operator, so there's no reason to
638 		 * approximate.  (If the MCVs cover a significant part of the total
639 		 * population, this gives us a big leg up in accuracy.)
640 		 */
641 		Selectivity selec;
642 		int			hist_size;
643 		FmgrInfo	opproc;
644 		double		mcv_selec,
645 					sumcommon;
646 
647 		/* Try to use the histogram entries to get selectivity */
648 		if (!OidIsValid(opfuncid))
649 			opfuncid = get_opcode(oprid);
650 		fmgr_info(opfuncid, &opproc);
651 
652 		selec = histogram_selectivity(&vardata, &opproc, collation,
653 									  constval, true,
654 									  10, 1, &hist_size);
655 
656 		/* If not at least 100 entries, use the heuristic method */
657 		if (hist_size < 100)
658 		{
659 			Selectivity heursel;
660 			Selectivity prefixsel;
661 
662 			if (pstatus == Pattern_Prefix_Partial)
663 				prefixsel = prefix_selectivity(root, &vardata,
664 											   eqopr, ltopr, geopr,
665 											   collation,
666 											   prefix);
667 			else
668 				prefixsel = 1.0;
669 			heursel = prefixsel * rest_selec;
670 
671 			if (selec < 0)		/* fewer than 10 histogram entries? */
672 				selec = heursel;
673 			else
674 			{
675 				/*
676 				 * For histogram sizes from 10 to 100, we combine the
677 				 * histogram and heuristic selectivities, putting increasingly
678 				 * more trust in the histogram for larger sizes.
679 				 */
680 				double		hist_weight = hist_size / 100.0;
681 
682 				selec = selec * hist_weight + heursel * (1.0 - hist_weight);
683 			}
684 		}
685 
686 		/* In any case, don't believe extremely small or large estimates. */
687 		if (selec < 0.0001)
688 			selec = 0.0001;
689 		else if (selec > 0.9999)
690 			selec = 0.9999;
691 
692 		/*
693 		 * If we have most-common-values info, add up the fractions of the MCV
694 		 * entries that satisfy MCV OP PATTERN.  These fractions contribute
695 		 * directly to the result selectivity.  Also add up the total fraction
696 		 * represented by MCV entries.
697 		 */
698 		mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
699 									constval, true,
700 									&sumcommon);
701 
702 		/*
703 		 * Now merge the results from the MCV and histogram calculations,
704 		 * realizing that the histogram covers only the non-null values that
705 		 * are not listed in MCV.
706 		 */
707 		selec *= 1.0 - nullfrac - sumcommon;
708 		selec += mcv_selec;
709 		result = selec;
710 	}
711 
712 	/* now adjust if we wanted not-match rather than match */
713 	if (negate)
714 		result = 1.0 - result - nullfrac;
715 
716 	/* result should be in range, but make sure... */
717 	CLAMP_PROBABILITY(result);
718 
719 	if (prefix)
720 	{
721 		pfree(DatumGetPointer(prefix->constvalue));
722 		pfree(prefix);
723 	}
724 
725 	ReleaseVariableStats(vardata);
726 
727 	return result;
728 }
729 
730 /*
731  * Fix impedance mismatch between SQL-callable functions and patternsel_common
732  */
733 static double
patternsel(PG_FUNCTION_ARGS,Pattern_Type ptype,bool negate)734 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
735 {
736 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
737 	Oid			operator = PG_GETARG_OID(1);
738 	List	   *args = (List *) PG_GETARG_POINTER(2);
739 	int			varRelid = PG_GETARG_INT32(3);
740 	Oid			collation = PG_GET_COLLATION();
741 
742 	/*
743 	 * If this is for a NOT LIKE or similar operator, get the corresponding
744 	 * positive-match operator and work with that.
745 	 */
746 	if (negate)
747 	{
748 		operator = get_negator(operator);
749 		if (!OidIsValid(operator))
750 			elog(ERROR, "patternsel called for operator without a negator");
751 	}
752 
753 	return patternsel_common(root,
754 							 operator,
755 							 InvalidOid,
756 							 args,
757 							 varRelid,
758 							 collation,
759 							 ptype,
760 							 negate);
761 }
762 
763 /*
764  *		regexeqsel		- Selectivity of regular-expression pattern match.
765  */
766 Datum
regexeqsel(PG_FUNCTION_ARGS)767 regexeqsel(PG_FUNCTION_ARGS)
768 {
769 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
770 }
771 
772 /*
773  *		icregexeqsel	- Selectivity of case-insensitive regex match.
774  */
775 Datum
icregexeqsel(PG_FUNCTION_ARGS)776 icregexeqsel(PG_FUNCTION_ARGS)
777 {
778 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
779 }
780 
781 /*
782  *		likesel			- Selectivity of LIKE pattern match.
783  */
784 Datum
likesel(PG_FUNCTION_ARGS)785 likesel(PG_FUNCTION_ARGS)
786 {
787 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
788 }
789 
790 /*
791  *		prefixsel			- selectivity of prefix operator
792  */
793 Datum
prefixsel(PG_FUNCTION_ARGS)794 prefixsel(PG_FUNCTION_ARGS)
795 {
796 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
797 }
798 
799 /*
800  *
801  *		iclikesel			- Selectivity of ILIKE pattern match.
802  */
803 Datum
iclikesel(PG_FUNCTION_ARGS)804 iclikesel(PG_FUNCTION_ARGS)
805 {
806 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
807 }
808 
809 /*
810  *		regexnesel		- Selectivity of regular-expression pattern non-match.
811  */
812 Datum
regexnesel(PG_FUNCTION_ARGS)813 regexnesel(PG_FUNCTION_ARGS)
814 {
815 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
816 }
817 
818 /*
819  *		icregexnesel	- Selectivity of case-insensitive regex non-match.
820  */
821 Datum
icregexnesel(PG_FUNCTION_ARGS)822 icregexnesel(PG_FUNCTION_ARGS)
823 {
824 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
825 }
826 
827 /*
828  *		nlikesel		- Selectivity of LIKE pattern non-match.
829  */
830 Datum
nlikesel(PG_FUNCTION_ARGS)831 nlikesel(PG_FUNCTION_ARGS)
832 {
833 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
834 }
835 
836 /*
837  *		icnlikesel		- Selectivity of ILIKE pattern non-match.
838  */
839 Datum
icnlikesel(PG_FUNCTION_ARGS)840 icnlikesel(PG_FUNCTION_ARGS)
841 {
842 	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
843 }
844 
845 /*
846  * patternjoinsel		- Generic code for pattern-match join selectivity.
847  */
848 static double
patternjoinsel(PG_FUNCTION_ARGS,Pattern_Type ptype,bool negate)849 patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
850 {
851 	/* For the moment we just punt. */
852 	return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
853 }
854 
855 /*
856  *		regexeqjoinsel	- Join selectivity of regular-expression pattern match.
857  */
858 Datum
regexeqjoinsel(PG_FUNCTION_ARGS)859 regexeqjoinsel(PG_FUNCTION_ARGS)
860 {
861 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
862 }
863 
864 /*
865  *		icregexeqjoinsel	- Join selectivity of case-insensitive regex match.
866  */
867 Datum
icregexeqjoinsel(PG_FUNCTION_ARGS)868 icregexeqjoinsel(PG_FUNCTION_ARGS)
869 {
870 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
871 }
872 
873 /*
874  *		likejoinsel			- Join selectivity of LIKE pattern match.
875  */
876 Datum
likejoinsel(PG_FUNCTION_ARGS)877 likejoinsel(PG_FUNCTION_ARGS)
878 {
879 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
880 }
881 
882 /*
883  *		prefixjoinsel			- Join selectivity of prefix operator
884  */
885 Datum
prefixjoinsel(PG_FUNCTION_ARGS)886 prefixjoinsel(PG_FUNCTION_ARGS)
887 {
888 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
889 }
890 
891 /*
892  *		iclikejoinsel			- Join selectivity of ILIKE pattern match.
893  */
894 Datum
iclikejoinsel(PG_FUNCTION_ARGS)895 iclikejoinsel(PG_FUNCTION_ARGS)
896 {
897 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
898 }
899 
900 /*
901  *		regexnejoinsel	- Join selectivity of regex non-match.
902  */
903 Datum
regexnejoinsel(PG_FUNCTION_ARGS)904 regexnejoinsel(PG_FUNCTION_ARGS)
905 {
906 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
907 }
908 
909 /*
910  *		icregexnejoinsel	- Join selectivity of case-insensitive regex non-match.
911  */
912 Datum
icregexnejoinsel(PG_FUNCTION_ARGS)913 icregexnejoinsel(PG_FUNCTION_ARGS)
914 {
915 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
916 }
917 
918 /*
919  *		nlikejoinsel		- Join selectivity of LIKE pattern non-match.
920  */
921 Datum
nlikejoinsel(PG_FUNCTION_ARGS)922 nlikejoinsel(PG_FUNCTION_ARGS)
923 {
924 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
925 }
926 
927 /*
928  *		icnlikejoinsel		- Join selectivity of ILIKE pattern non-match.
929  */
930 Datum
icnlikejoinsel(PG_FUNCTION_ARGS)931 icnlikejoinsel(PG_FUNCTION_ARGS)
932 {
933 	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
934 }
935 
936 
937 /*-------------------------------------------------------------------------
938  *
939  * Pattern analysis functions
940  *
941  * These routines support analysis of LIKE and regular-expression patterns
942  * by the planner/optimizer.  It's important that they agree with the
943  * regular-expression code in backend/regex/ and the LIKE code in
944  * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
945  * must be conservative: if we report a string longer than the true fixed
946  * prefix, the query may produce actually wrong answers, rather than just
947  * getting a bad selectivity estimate!
948  *
949  *-------------------------------------------------------------------------
950  */
951 
952 /*
953  * Extract the fixed prefix, if any, for a pattern.
954  *
955  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
956  *	or to NULL if no fixed prefix exists for the pattern.
957  * If rest_selec is not NULL, *rest_selec is set to an estimate of the
958  *	selectivity of the remainder of the pattern (without any fixed prefix).
959  * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
960  *
961  * The return value distinguishes no fixed prefix, a partial prefix,
962  * or an exact-match-only pattern.
963  */
964 
965 static Pattern_Prefix_Status
like_fixed_prefix(Const * patt_const,bool case_insensitive,Oid collation,Const ** prefix_const,Selectivity * rest_selec)966 like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
967 				  Const **prefix_const, Selectivity *rest_selec)
968 {
969 	char	   *match;
970 	char	   *patt;
971 	int			pattlen;
972 	Oid			typeid = patt_const->consttype;
973 	int			pos,
974 				match_pos;
975 	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
976 	pg_locale_t locale = 0;
977 	bool		locale_is_c = false;
978 
979 	/* the right-hand const is type text or bytea */
980 	Assert(typeid == BYTEAOID || typeid == TEXTOID);
981 
982 	if (case_insensitive)
983 	{
984 		if (typeid == BYTEAOID)
985 			ereport(ERROR,
986 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
987 					 errmsg("case insensitive matching not supported on type bytea")));
988 
989 		/* If case-insensitive, we need locale info */
990 		if (lc_ctype_is_c(collation))
991 			locale_is_c = true;
992 		else if (collation != DEFAULT_COLLATION_OID)
993 		{
994 			if (!OidIsValid(collation))
995 			{
996 				/*
997 				 * This typically means that the parser could not resolve a
998 				 * conflict of implicit collations, so report it that way.
999 				 */
1000 				ereport(ERROR,
1001 						(errcode(ERRCODE_INDETERMINATE_COLLATION),
1002 						 errmsg("could not determine which collation to use for ILIKE"),
1003 						 errhint("Use the COLLATE clause to set the collation explicitly.")));
1004 			}
1005 			locale = pg_newlocale_from_collation(collation);
1006 		}
1007 	}
1008 
1009 	if (typeid != BYTEAOID)
1010 	{
1011 		patt = TextDatumGetCString(patt_const->constvalue);
1012 		pattlen = strlen(patt);
1013 	}
1014 	else
1015 	{
1016 		bytea	   *bstr = DatumGetByteaPP(patt_const->constvalue);
1017 
1018 		pattlen = VARSIZE_ANY_EXHDR(bstr);
1019 		patt = (char *) palloc(pattlen);
1020 		memcpy(patt, VARDATA_ANY(bstr), pattlen);
1021 		Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1022 	}
1023 
1024 	match = palloc(pattlen + 1);
1025 	match_pos = 0;
1026 	for (pos = 0; pos < pattlen; pos++)
1027 	{
1028 		/* % and _ are wildcard characters in LIKE */
1029 		if (patt[pos] == '%' ||
1030 			patt[pos] == '_')
1031 			break;
1032 
1033 		/* Backslash escapes the next character */
1034 		if (patt[pos] == '\\')
1035 		{
1036 			pos++;
1037 			if (pos >= pattlen)
1038 				break;
1039 		}
1040 
1041 		/* Stop if case-varying character (it's sort of a wildcard) */
1042 		if (case_insensitive &&
1043 			pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1044 			break;
1045 
1046 		match[match_pos++] = patt[pos];
1047 	}
1048 
1049 	match[match_pos] = '\0';
1050 
1051 	if (typeid != BYTEAOID)
1052 		*prefix_const = string_to_const(match, typeid);
1053 	else
1054 		*prefix_const = string_to_bytea_const(match, match_pos);
1055 
1056 	if (rest_selec != NULL)
1057 		*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1058 									   case_insensitive);
1059 
1060 	pfree(patt);
1061 	pfree(match);
1062 
1063 	/* in LIKE, an empty pattern is an exact match! */
1064 	if (pos == pattlen)
1065 		return Pattern_Prefix_Exact;	/* reached end of pattern, so exact */
1066 
1067 	if (match_pos > 0)
1068 		return Pattern_Prefix_Partial;
1069 
1070 	return Pattern_Prefix_None;
1071 }
1072 
1073 static Pattern_Prefix_Status
regex_fixed_prefix(Const * patt_const,bool case_insensitive,Oid collation,Const ** prefix_const,Selectivity * rest_selec)1074 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1075 				   Const **prefix_const, Selectivity *rest_selec)
1076 {
1077 	Oid			typeid = patt_const->consttype;
1078 	char	   *prefix;
1079 	bool		exact;
1080 
1081 	/*
1082 	 * Should be unnecessary, there are no bytea regex operators defined. As
1083 	 * such, it should be noted that the rest of this function has *not* been
1084 	 * made safe for binary (possibly NULL containing) strings.
1085 	 */
1086 	if (typeid == BYTEAOID)
1087 		ereport(ERROR,
1088 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1089 				 errmsg("regular-expression matching not supported on type bytea")));
1090 
1091 	/* Use the regexp machinery to extract the prefix, if any */
1092 	prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1093 								 case_insensitive, collation,
1094 								 &exact);
1095 
1096 	if (prefix == NULL)
1097 	{
1098 		*prefix_const = NULL;
1099 
1100 		if (rest_selec != NULL)
1101 		{
1102 			char	   *patt = TextDatumGetCString(patt_const->constvalue);
1103 
1104 			*rest_selec = regex_selectivity(patt, strlen(patt),
1105 											case_insensitive,
1106 											0);
1107 			pfree(patt);
1108 		}
1109 
1110 		return Pattern_Prefix_None;
1111 	}
1112 
1113 	*prefix_const = string_to_const(prefix, typeid);
1114 
1115 	if (rest_selec != NULL)
1116 	{
1117 		if (exact)
1118 		{
1119 			/* Exact match, so there's no additional selectivity */
1120 			*rest_selec = 1.0;
1121 		}
1122 		else
1123 		{
1124 			char	   *patt = TextDatumGetCString(patt_const->constvalue);
1125 
1126 			*rest_selec = regex_selectivity(patt, strlen(patt),
1127 											case_insensitive,
1128 											strlen(prefix));
1129 			pfree(patt);
1130 		}
1131 	}
1132 
1133 	pfree(prefix);
1134 
1135 	if (exact)
1136 		return Pattern_Prefix_Exact;	/* pattern specifies exact match */
1137 	else
1138 		return Pattern_Prefix_Partial;
1139 }
1140 
1141 static Pattern_Prefix_Status
pattern_fixed_prefix(Const * patt,Pattern_Type ptype,Oid collation,Const ** prefix,Selectivity * rest_selec)1142 pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1143 					 Const **prefix, Selectivity *rest_selec)
1144 {
1145 	Pattern_Prefix_Status result;
1146 
1147 	switch (ptype)
1148 	{
1149 		case Pattern_Type_Like:
1150 			result = like_fixed_prefix(patt, false, collation,
1151 									   prefix, rest_selec);
1152 			break;
1153 		case Pattern_Type_Like_IC:
1154 			result = like_fixed_prefix(patt, true, collation,
1155 									   prefix, rest_selec);
1156 			break;
1157 		case Pattern_Type_Regex:
1158 			result = regex_fixed_prefix(patt, false, collation,
1159 										prefix, rest_selec);
1160 			break;
1161 		case Pattern_Type_Regex_IC:
1162 			result = regex_fixed_prefix(patt, true, collation,
1163 										prefix, rest_selec);
1164 			break;
1165 		case Pattern_Type_Prefix:
1166 			/* Prefix type work is trivial.  */
1167 			result = Pattern_Prefix_Partial;
1168 			*rest_selec = 1.0;	/* all */
1169 			*prefix = makeConst(patt->consttype,
1170 								patt->consttypmod,
1171 								patt->constcollid,
1172 								patt->constlen,
1173 								datumCopy(patt->constvalue,
1174 										  patt->constbyval,
1175 										  patt->constlen),
1176 								patt->constisnull,
1177 								patt->constbyval);
1178 			break;
1179 		default:
1180 			elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1181 			result = Pattern_Prefix_None;	/* keep compiler quiet */
1182 			break;
1183 	}
1184 	return result;
1185 }
1186 
1187 /*
1188  * Estimate the selectivity of a fixed prefix for a pattern match.
1189  *
1190  * A fixed prefix "foo" is estimated as the selectivity of the expression
1191  * "variable >= 'foo' AND variable < 'fop'".
1192  *
1193  * The selectivity estimate is with respect to the portion of the column
1194  * population represented by the histogram --- the caller must fold this
1195  * together with info about MCVs and NULLs.
1196  *
1197  * We use the given comparison operators and collation to do the estimation.
1198  * The given variable and Const must be of the associated datatype(s).
1199  *
1200  * XXX Note: we make use of the upper bound to estimate operator selectivity
1201  * even if the locale is such that we cannot rely on the upper-bound string.
1202  * The selectivity only needs to be approximately right anyway, so it seems
1203  * more useful to use the upper-bound code than not.
1204  */
1205 static Selectivity
prefix_selectivity(PlannerInfo * root,VariableStatData * vardata,Oid eqopr,Oid ltopr,Oid geopr,Oid collation,Const * prefixcon)1206 prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1207 				   Oid eqopr, Oid ltopr, Oid geopr,
1208 				   Oid collation,
1209 				   Const *prefixcon)
1210 {
1211 	Selectivity prefixsel;
1212 	FmgrInfo	opproc;
1213 	Const	   *greaterstrcon;
1214 	Selectivity eq_sel;
1215 
1216 	/* Estimate the selectivity of "x >= prefix" */
1217 	fmgr_info(get_opcode(geopr), &opproc);
1218 
1219 	prefixsel = ineq_histogram_selectivity(root, vardata,
1220 										   geopr, &opproc, true, true,
1221 										   collation,
1222 										   prefixcon->constvalue,
1223 										   prefixcon->consttype);
1224 
1225 	if (prefixsel < 0.0)
1226 	{
1227 		/* No histogram is present ... return a suitable default estimate */
1228 		return DEFAULT_MATCH_SEL;
1229 	}
1230 
1231 	/*
1232 	 * If we can create a string larger than the prefix, say "x < greaterstr".
1233 	 */
1234 	fmgr_info(get_opcode(ltopr), &opproc);
1235 	greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1236 	if (greaterstrcon)
1237 	{
1238 		Selectivity topsel;
1239 
1240 		topsel = ineq_histogram_selectivity(root, vardata,
1241 											ltopr, &opproc, false, false,
1242 											collation,
1243 											greaterstrcon->constvalue,
1244 											greaterstrcon->consttype);
1245 
1246 		/* ineq_histogram_selectivity worked before, it shouldn't fail now */
1247 		Assert(topsel >= 0.0);
1248 
1249 		/*
1250 		 * Merge the two selectivities in the same way as for a range query
1251 		 * (see clauselist_selectivity()).  Note that we don't need to worry
1252 		 * about double-exclusion of nulls, since ineq_histogram_selectivity
1253 		 * doesn't count those anyway.
1254 		 */
1255 		prefixsel = topsel + prefixsel - 1.0;
1256 	}
1257 
1258 	/*
1259 	 * If the prefix is long then the two bounding values might be too close
1260 	 * together for the histogram to distinguish them usefully, resulting in a
1261 	 * zero estimate (plus or minus roundoff error). To avoid returning a
1262 	 * ridiculously small estimate, compute the estimated selectivity for
1263 	 * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1264 	 * estimate should be at least that.)
1265 	 *
1266 	 * We apply this even if we couldn't make a greater string.  That case
1267 	 * suggests that the prefix is near the maximum possible, and thus
1268 	 * probably off the end of the histogram, and thus we probably got a very
1269 	 * small estimate from the >= condition; so we still need to clamp.
1270 	 */
1271 	eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1272 						  false, true, false);
1273 
1274 	prefixsel = Max(prefixsel, eq_sel);
1275 
1276 	return prefixsel;
1277 }
1278 
1279 
1280 /*
1281  * Estimate the selectivity of a pattern of the specified type.
1282  * Note that any fixed prefix of the pattern will have been removed already,
1283  * so actually we may be looking at just a fragment of the pattern.
1284  *
1285  * For now, we use a very simplistic approach: fixed characters reduce the
1286  * selectivity a good deal, character ranges reduce it a little,
1287  * wildcards (such as % for LIKE or .* for regex) increase it.
1288  */
1289 
1290 #define FIXED_CHAR_SEL	0.20	/* about 1/5 */
1291 #define CHAR_RANGE_SEL	0.25
1292 #define ANY_CHAR_SEL	0.9		/* not 1, since it won't match end-of-string */
1293 #define FULL_WILDCARD_SEL 5.0
1294 #define PARTIAL_WILDCARD_SEL 2.0
1295 
1296 static Selectivity
like_selectivity(const char * patt,int pattlen,bool case_insensitive)1297 like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1298 {
1299 	Selectivity sel = 1.0;
1300 	int			pos;
1301 
1302 	/* Skip any leading wildcard; it's already factored into initial sel */
1303 	for (pos = 0; pos < pattlen; pos++)
1304 	{
1305 		if (patt[pos] != '%' && patt[pos] != '_')
1306 			break;
1307 	}
1308 
1309 	for (; pos < pattlen; pos++)
1310 	{
1311 		/* % and _ are wildcard characters in LIKE */
1312 		if (patt[pos] == '%')
1313 			sel *= FULL_WILDCARD_SEL;
1314 		else if (patt[pos] == '_')
1315 			sel *= ANY_CHAR_SEL;
1316 		else if (patt[pos] == '\\')
1317 		{
1318 			/* Backslash quotes the next character */
1319 			pos++;
1320 			if (pos >= pattlen)
1321 				break;
1322 			sel *= FIXED_CHAR_SEL;
1323 		}
1324 		else
1325 			sel *= FIXED_CHAR_SEL;
1326 	}
1327 	/* Could get sel > 1 if multiple wildcards */
1328 	if (sel > 1.0)
1329 		sel = 1.0;
1330 	return sel;
1331 }
1332 
1333 static Selectivity
regex_selectivity_sub(const char * patt,int pattlen,bool case_insensitive)1334 regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1335 {
1336 	Selectivity sel = 1.0;
1337 	int			paren_depth = 0;
1338 	int			paren_pos = 0;	/* dummy init to keep compiler quiet */
1339 	int			pos;
1340 
1341 	for (pos = 0; pos < pattlen; pos++)
1342 	{
1343 		if (patt[pos] == '(')
1344 		{
1345 			if (paren_depth == 0)
1346 				paren_pos = pos;	/* remember start of parenthesized item */
1347 			paren_depth++;
1348 		}
1349 		else if (patt[pos] == ')' && paren_depth > 0)
1350 		{
1351 			paren_depth--;
1352 			if (paren_depth == 0)
1353 				sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1354 											 pos - (paren_pos + 1),
1355 											 case_insensitive);
1356 		}
1357 		else if (patt[pos] == '|' && paren_depth == 0)
1358 		{
1359 			/*
1360 			 * If unquoted | is present at paren level 0 in pattern, we have
1361 			 * multiple alternatives; sum their probabilities.
1362 			 */
1363 			sel += regex_selectivity_sub(patt + (pos + 1),
1364 										 pattlen - (pos + 1),
1365 										 case_insensitive);
1366 			break;				/* rest of pattern is now processed */
1367 		}
1368 		else if (patt[pos] == '[')
1369 		{
1370 			bool		negclass = false;
1371 
1372 			if (patt[++pos] == '^')
1373 			{
1374 				negclass = true;
1375 				pos++;
1376 			}
1377 			if (patt[pos] == ']')	/* ']' at start of class is not special */
1378 				pos++;
1379 			while (pos < pattlen && patt[pos] != ']')
1380 				pos++;
1381 			if (paren_depth == 0)
1382 				sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1383 		}
1384 		else if (patt[pos] == '.')
1385 		{
1386 			if (paren_depth == 0)
1387 				sel *= ANY_CHAR_SEL;
1388 		}
1389 		else if (patt[pos] == '*' ||
1390 				 patt[pos] == '?' ||
1391 				 patt[pos] == '+')
1392 		{
1393 			/* Ought to be smarter about quantifiers... */
1394 			if (paren_depth == 0)
1395 				sel *= PARTIAL_WILDCARD_SEL;
1396 		}
1397 		else if (patt[pos] == '{')
1398 		{
1399 			while (pos < pattlen && patt[pos] != '}')
1400 				pos++;
1401 			if (paren_depth == 0)
1402 				sel *= PARTIAL_WILDCARD_SEL;
1403 		}
1404 		else if (patt[pos] == '\\')
1405 		{
1406 			/* backslash quotes the next character */
1407 			pos++;
1408 			if (pos >= pattlen)
1409 				break;
1410 			if (paren_depth == 0)
1411 				sel *= FIXED_CHAR_SEL;
1412 		}
1413 		else
1414 		{
1415 			if (paren_depth == 0)
1416 				sel *= FIXED_CHAR_SEL;
1417 		}
1418 	}
1419 	/* Could get sel > 1 if multiple wildcards */
1420 	if (sel > 1.0)
1421 		sel = 1.0;
1422 	return sel;
1423 }
1424 
1425 static Selectivity
regex_selectivity(const char * patt,int pattlen,bool case_insensitive,int fixed_prefix_len)1426 regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1427 				  int fixed_prefix_len)
1428 {
1429 	Selectivity sel;
1430 
1431 	/* If patt doesn't end with $, consider it to have a trailing wildcard */
1432 	if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1433 		(pattlen == 1 || patt[pattlen - 2] != '\\'))
1434 	{
1435 		/* has trailing $ */
1436 		sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1437 	}
1438 	else
1439 	{
1440 		/* no trailing $ */
1441 		sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1442 		sel *= FULL_WILDCARD_SEL;
1443 	}
1444 
1445 	/*
1446 	 * If there's a fixed prefix, discount its selectivity.  We have to be
1447 	 * careful here since a very long prefix could result in pow's result
1448 	 * underflowing to zero (in which case "sel" probably has as well).
1449 	 */
1450 	if (fixed_prefix_len > 0)
1451 	{
1452 		double		prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1453 
1454 		if (prefixsel > 0.0)
1455 			sel /= prefixsel;
1456 	}
1457 
1458 	/* Make sure result stays in range */
1459 	CLAMP_PROBABILITY(sel);
1460 	return sel;
1461 }
1462 
1463 /*
1464  * Check whether char is a letter (and, hence, subject to case-folding)
1465  *
1466  * In multibyte character sets or with ICU, we can't use isalpha, and it does
1467  * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1468  * Instead, just assume any non-ASCII char is potentially case-varying, and
1469  * hard-wire knowledge of which ASCII chars are letters.
1470  */
1471 static int
pattern_char_isalpha(char c,bool is_multibyte,pg_locale_t locale,bool locale_is_c)1472 pattern_char_isalpha(char c, bool is_multibyte,
1473 					 pg_locale_t locale, bool locale_is_c)
1474 {
1475 	if (locale_is_c)
1476 		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1477 	else if (is_multibyte && IS_HIGHBIT_SET(c))
1478 		return true;
1479 	else if (locale && locale->provider == COLLPROVIDER_ICU)
1480 		return IS_HIGHBIT_SET(c) ||
1481 			(c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1482 #ifdef HAVE_LOCALE_T
1483 	else if (locale && locale->provider == COLLPROVIDER_LIBC)
1484 		return isalpha_l((unsigned char) c, locale->info.lt);
1485 #endif
1486 	else
1487 		return isalpha((unsigned char) c);
1488 }
1489 
1490 
1491 /*
1492  * For bytea, the increment function need only increment the current byte
1493  * (there are no multibyte characters to worry about).
1494  */
1495 static bool
byte_increment(unsigned char * ptr,int len)1496 byte_increment(unsigned char *ptr, int len)
1497 {
1498 	if (*ptr >= 255)
1499 		return false;
1500 	(*ptr)++;
1501 	return true;
1502 }
1503 
1504 /*
1505  * Try to generate a string greater than the given string or any
1506  * string it is a prefix of.  If successful, return a palloc'd string
1507  * in the form of a Const node; else return NULL.
1508  *
1509  * The caller must provide the appropriate "less than" comparison function
1510  * for testing the strings, along with the collation to use.
1511  *
1512  * The key requirement here is that given a prefix string, say "foo",
1513  * we must be able to generate another string "fop" that is greater than
1514  * all strings "foobar" starting with "foo".  We can test that we have
1515  * generated a string greater than the prefix string, but in non-C collations
1516  * that is not a bulletproof guarantee that an extension of the string might
1517  * not sort after it; an example is that "foo " is less than "foo!", but it
1518  * is not clear that a "dictionary" sort ordering will consider "foo!" less
1519  * than "foo bar".  CAUTION: Therefore, this function should be used only for
1520  * estimation purposes when working in a non-C collation.
1521  *
1522  * To try to catch most cases where an extended string might otherwise sort
1523  * before the result value, we determine which of the strings "Z", "z", "y",
1524  * and "9" is seen as largest by the collation, and append that to the given
1525  * prefix before trying to find a string that compares as larger.
1526  *
1527  * To search for a greater string, we repeatedly "increment" the rightmost
1528  * character, using an encoding-specific character incrementer function.
1529  * When it's no longer possible to increment the last character, we truncate
1530  * off that character and start incrementing the next-to-rightmost.
1531  * For example, if "z" were the last character in the sort order, then we
1532  * could produce "foo" as a string greater than "fonz".
1533  *
1534  * This could be rather slow in the worst case, but in most cases we
1535  * won't have to try more than one or two strings before succeeding.
1536  *
1537  * Note that it's important for the character incrementer not to be too anal
1538  * about producing every possible character code, since in some cases the only
1539  * way to get a larger string is to increment a previous character position.
1540  * So we don't want to spend too much time trying every possible character
1541  * code at the last position.  A good rule of thumb is to be sure that we
1542  * don't try more than 256*K values for a K-byte character (and definitely
1543  * not 256^K, which is what an exhaustive search would approach).
1544  */
1545 static Const *
make_greater_string(const Const * str_const,FmgrInfo * ltproc,Oid collation)1546 make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1547 {
1548 	Oid			datatype = str_const->consttype;
1549 	char	   *workstr;
1550 	int			len;
1551 	Datum		cmpstr;
1552 	char	   *cmptxt = NULL;
1553 	mbcharacter_incrementer charinc;
1554 
1555 	/*
1556 	 * Get a modifiable copy of the prefix string in C-string format, and set
1557 	 * up the string we will compare to as a Datum.  In C locale this can just
1558 	 * be the given prefix string, otherwise we need to add a suffix.  Type
1559 	 * BYTEA sorts bytewise so it never needs a suffix either.
1560 	 */
1561 	if (datatype == BYTEAOID)
1562 	{
1563 		bytea	   *bstr = DatumGetByteaPP(str_const->constvalue);
1564 
1565 		len = VARSIZE_ANY_EXHDR(bstr);
1566 		workstr = (char *) palloc(len);
1567 		memcpy(workstr, VARDATA_ANY(bstr), len);
1568 		Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1569 		cmpstr = str_const->constvalue;
1570 	}
1571 	else
1572 	{
1573 		if (datatype == NAMEOID)
1574 			workstr = DatumGetCString(DirectFunctionCall1(nameout,
1575 														  str_const->constvalue));
1576 		else
1577 			workstr = TextDatumGetCString(str_const->constvalue);
1578 		len = strlen(workstr);
1579 		if (lc_collate_is_c(collation) || len == 0)
1580 			cmpstr = str_const->constvalue;
1581 		else
1582 		{
1583 			/* If first time through, determine the suffix to use */
1584 			static char suffixchar = 0;
1585 			static Oid	suffixcollation = 0;
1586 
1587 			if (!suffixchar || suffixcollation != collation)
1588 			{
1589 				char	   *best;
1590 
1591 				best = "Z";
1592 				if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1593 					best = "z";
1594 				if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1595 					best = "y";
1596 				if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1597 					best = "9";
1598 				suffixchar = *best;
1599 				suffixcollation = collation;
1600 			}
1601 
1602 			/* And build the string to compare to */
1603 			if (datatype == NAMEOID)
1604 			{
1605 				cmptxt = palloc(len + 2);
1606 				memcpy(cmptxt, workstr, len);
1607 				cmptxt[len] = suffixchar;
1608 				cmptxt[len + 1] = '\0';
1609 				cmpstr = PointerGetDatum(cmptxt);
1610 			}
1611 			else
1612 			{
1613 				cmptxt = palloc(VARHDRSZ + len + 1);
1614 				SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1615 				memcpy(VARDATA(cmptxt), workstr, len);
1616 				*(VARDATA(cmptxt) + len) = suffixchar;
1617 				cmpstr = PointerGetDatum(cmptxt);
1618 			}
1619 		}
1620 	}
1621 
1622 	/* Select appropriate character-incrementer function */
1623 	if (datatype == BYTEAOID)
1624 		charinc = byte_increment;
1625 	else
1626 		charinc = pg_database_encoding_character_incrementer();
1627 
1628 	/* And search ... */
1629 	while (len > 0)
1630 	{
1631 		int			charlen;
1632 		unsigned char *lastchar;
1633 
1634 		/* Identify the last character --- for bytea, just the last byte */
1635 		if (datatype == BYTEAOID)
1636 			charlen = 1;
1637 		else
1638 			charlen = len - pg_mbcliplen(workstr, len, len - 1);
1639 		lastchar = (unsigned char *) (workstr + len - charlen);
1640 
1641 		/*
1642 		 * Try to generate a larger string by incrementing the last character
1643 		 * (for BYTEA, we treat each byte as a character).
1644 		 *
1645 		 * Note: the incrementer function is expected to return true if it's
1646 		 * generated a valid-per-the-encoding new character, otherwise false.
1647 		 * The contents of the character on false return are unspecified.
1648 		 */
1649 		while (charinc(lastchar, charlen))
1650 		{
1651 			Const	   *workstr_const;
1652 
1653 			if (datatype == BYTEAOID)
1654 				workstr_const = string_to_bytea_const(workstr, len);
1655 			else
1656 				workstr_const = string_to_const(workstr, datatype);
1657 
1658 			if (DatumGetBool(FunctionCall2Coll(ltproc,
1659 											   collation,
1660 											   cmpstr,
1661 											   workstr_const->constvalue)))
1662 			{
1663 				/* Successfully made a string larger than cmpstr */
1664 				if (cmptxt)
1665 					pfree(cmptxt);
1666 				pfree(workstr);
1667 				return workstr_const;
1668 			}
1669 
1670 			/* No good, release unusable value and try again */
1671 			pfree(DatumGetPointer(workstr_const->constvalue));
1672 			pfree(workstr_const);
1673 		}
1674 
1675 		/*
1676 		 * No luck here, so truncate off the last character and try to
1677 		 * increment the next one.
1678 		 */
1679 		len -= charlen;
1680 		workstr[len] = '\0';
1681 	}
1682 
1683 	/* Failed... */
1684 	if (cmptxt)
1685 		pfree(cmptxt);
1686 	pfree(workstr);
1687 
1688 	return NULL;
1689 }
1690 
1691 /*
1692  * Generate a Datum of the appropriate type from a C string.
1693  * Note that all of the supported types are pass-by-ref, so the
1694  * returned value should be pfree'd if no longer needed.
1695  */
1696 static Datum
string_to_datum(const char * str,Oid datatype)1697 string_to_datum(const char *str, Oid datatype)
1698 {
1699 	Assert(str != NULL);
1700 
1701 	/*
1702 	 * We cheat a little by assuming that CStringGetTextDatum() will do for
1703 	 * bpchar and varchar constants too...
1704 	 */
1705 	if (datatype == NAMEOID)
1706 		return DirectFunctionCall1(namein, CStringGetDatum(str));
1707 	else if (datatype == BYTEAOID)
1708 		return DirectFunctionCall1(byteain, CStringGetDatum(str));
1709 	else
1710 		return CStringGetTextDatum(str);
1711 }
1712 
1713 /*
1714  * Generate a Const node of the appropriate type from a C string.
1715  */
1716 static Const *
string_to_const(const char * str,Oid datatype)1717 string_to_const(const char *str, Oid datatype)
1718 {
1719 	Datum		conval = string_to_datum(str, datatype);
1720 	Oid			collation;
1721 	int			constlen;
1722 
1723 	/*
1724 	 * We only need to support a few datatypes here, so hard-wire properties
1725 	 * instead of incurring the expense of catalog lookups.
1726 	 */
1727 	switch (datatype)
1728 	{
1729 		case TEXTOID:
1730 		case VARCHAROID:
1731 		case BPCHAROID:
1732 			collation = DEFAULT_COLLATION_OID;
1733 			constlen = -1;
1734 			break;
1735 
1736 		case NAMEOID:
1737 			collation = C_COLLATION_OID;
1738 			constlen = NAMEDATALEN;
1739 			break;
1740 
1741 		case BYTEAOID:
1742 			collation = InvalidOid;
1743 			constlen = -1;
1744 			break;
1745 
1746 		default:
1747 			elog(ERROR, "unexpected datatype in string_to_const: %u",
1748 				 datatype);
1749 			return NULL;
1750 	}
1751 
1752 	return makeConst(datatype, -1, collation, constlen,
1753 					 conval, false, false);
1754 }
1755 
1756 /*
1757  * Generate a Const node of bytea type from a binary C string and a length.
1758  */
1759 static Const *
string_to_bytea_const(const char * str,size_t str_len)1760 string_to_bytea_const(const char *str, size_t str_len)
1761 {
1762 	bytea	   *bstr = palloc(VARHDRSZ + str_len);
1763 	Datum		conval;
1764 
1765 	memcpy(VARDATA(bstr), str, str_len);
1766 	SET_VARSIZE(bstr, VARHDRSZ + str_len);
1767 	conval = PointerGetDatum(bstr);
1768 
1769 	return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1770 }
1771