1 /*-------------------------------------------------------------------------
2  *
3  * ts_selfuncs.c
4  *	  Selectivity estimation functions for text search operators.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/ts_selfuncs.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "access/htup_details.h"
17 #include "catalog/pg_statistic.h"
18 #include "catalog/pg_type.h"
19 #include "miscadmin.h"
20 #include "nodes/nodes.h"
21 #include "tsearch/ts_type.h"
22 #include "utils/builtins.h"
23 #include "utils/lsyscache.h"
24 #include "utils/selfuncs.h"
25 #include "utils/syscache.h"
26 
27 
28 /*
29  * The default text search selectivity is chosen to be small enough to
30  * encourage indexscans for typical table densities.  See selfuncs.h and
31  * DEFAULT_EQ_SEL for details.
32  */
33 #define DEFAULT_TS_MATCH_SEL 0.005
34 
35 /* lookup table type for binary searching through MCELEMs */
36 typedef struct
37 {
38 	text	   *element;
39 	float4		frequency;
40 } TextFreq;
41 
42 /* type of keys for bsearch'ing through an array of TextFreqs */
43 typedef struct
44 {
45 	char	   *lexeme;
46 	int			length;
47 } LexemeKey;
48 
49 static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
50 static Selectivity mcelem_tsquery_selec(TSQuery query,
51 					 Datum *mcelem, int nmcelem,
52 					 float4 *numbers, int nnumbers);
53 static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
54 				  TextFreq *lookup, int length, float4 minfreq);
55 static int	compare_lexeme_textfreq(const void *e1, const void *e2);
56 
57 #define tsquery_opr_selec_no_stats(query) \
58 	tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
59 
60 
61 /*
62  *	tsmatchsel -- Selectivity of "@@"
63  *
64  * restriction selectivity function for tsvector @@ tsquery and
65  * tsquery @@ tsvector
66  */
67 Datum
tsmatchsel(PG_FUNCTION_ARGS)68 tsmatchsel(PG_FUNCTION_ARGS)
69 {
70 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
71 
72 #ifdef NOT_USED
73 	Oid			operator = PG_GETARG_OID(1);
74 #endif
75 	List	   *args = (List *) PG_GETARG_POINTER(2);
76 	int			varRelid = PG_GETARG_INT32(3);
77 	VariableStatData vardata;
78 	Node	   *other;
79 	bool		varonleft;
80 	Selectivity selec;
81 
82 	/*
83 	 * If expression is not variable = something or something = variable, then
84 	 * punt and return a default estimate.
85 	 */
86 	if (!get_restriction_variable(root, args, varRelid,
87 								  &vardata, &other, &varonleft))
88 		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
89 
90 	/*
91 	 * Can't do anything useful if the something is not a constant, either.
92 	 */
93 	if (!IsA(other, Const))
94 	{
95 		ReleaseVariableStats(vardata);
96 		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
97 	}
98 
99 	/*
100 	 * The "@@" operator is strict, so we can cope with NULL right away
101 	 */
102 	if (((Const *) other)->constisnull)
103 	{
104 		ReleaseVariableStats(vardata);
105 		PG_RETURN_FLOAT8(0.0);
106 	}
107 
108 	/*
109 	 * OK, there's a Var and a Const we're dealing with here.  We need the
110 	 * Const to be a TSQuery, else we can't do anything useful.  We have to
111 	 * check this because the Var might be the TSQuery not the TSVector.
112 	 */
113 	if (((Const *) other)->consttype == TSQUERYOID)
114 	{
115 		/* tsvector @@ tsquery or the other way around */
116 		Assert(vardata.vartype == TSVECTOROID);
117 
118 		selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
119 	}
120 	else
121 	{
122 		/* If we can't see the query structure, must punt */
123 		selec = DEFAULT_TS_MATCH_SEL;
124 	}
125 
126 	ReleaseVariableStats(vardata);
127 
128 	CLAMP_PROBABILITY(selec);
129 
130 	PG_RETURN_FLOAT8((float8) selec);
131 }
132 
133 
134 /*
135  *	tsmatchjoinsel -- join selectivity of "@@"
136  *
137  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
138  */
139 Datum
tsmatchjoinsel(PG_FUNCTION_ARGS)140 tsmatchjoinsel(PG_FUNCTION_ARGS)
141 {
142 	/* for the moment we just punt */
143 	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
144 }
145 
146 
147 /*
148  * @@ selectivity for tsvector var vs tsquery constant
149  */
150 static Selectivity
tsquerysel(VariableStatData * vardata,Datum constval)151 tsquerysel(VariableStatData *vardata, Datum constval)
152 {
153 	Selectivity selec;
154 	TSQuery		query;
155 
156 	/* The caller made sure the const is a TSQuery, so get it now */
157 	query = DatumGetTSQuery(constval);
158 
159 	/* Empty query matches nothing */
160 	if (query->size == 0)
161 		return (Selectivity) 0.0;
162 
163 	if (HeapTupleIsValid(vardata->statsTuple))
164 	{
165 		Form_pg_statistic stats;
166 		AttStatsSlot sslot;
167 
168 		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
169 
170 		/* MCELEM will be an array of TEXT elements for a tsvector column */
171 		if (get_attstatsslot(&sslot, vardata->statsTuple,
172 							 STATISTIC_KIND_MCELEM, InvalidOid,
173 							 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
174 		{
175 			/*
176 			 * There is a most-common-elements slot for the tsvector Var, so
177 			 * use that.
178 			 */
179 			selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
180 										 sslot.numbers, sslot.nnumbers);
181 			free_attstatsslot(&sslot);
182 		}
183 		else
184 		{
185 			/* No most-common-elements info, so do without */
186 			selec = tsquery_opr_selec_no_stats(query);
187 		}
188 
189 		/*
190 		 * MCE stats count only non-null rows, so adjust for null rows.
191 		 */
192 		selec *= (1.0 - stats->stanullfrac);
193 	}
194 	else
195 	{
196 		/* No stats at all, so do without */
197 		selec = tsquery_opr_selec_no_stats(query);
198 		/* we assume no nulls here, so no stanullfrac correction */
199 	}
200 
201 	return selec;
202 }
203 
204 /*
205  * Extract data from the pg_statistic arrays into useful format.
206  */
207 static Selectivity
mcelem_tsquery_selec(TSQuery query,Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers)208 mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
209 					 float4 *numbers, int nnumbers)
210 {
211 	float4		minfreq;
212 	TextFreq   *lookup;
213 	Selectivity selec;
214 	int			i;
215 
216 	/*
217 	 * There should be two more Numbers than Values, because the last two
218 	 * cells are taken for minimal and maximal frequency.  Punt if not.
219 	 *
220 	 * (Note: the MCELEM statistics slot definition allows for a third extra
221 	 * number containing the frequency of nulls, but we're not expecting that
222 	 * to appear for a tsvector column.)
223 	 */
224 	if (nnumbers != nmcelem + 2)
225 		return tsquery_opr_selec_no_stats(query);
226 
227 	/*
228 	 * Transpose the data into a single array so we can use bsearch().
229 	 */
230 	lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
231 	for (i = 0; i < nmcelem; i++)
232 	{
233 		/*
234 		 * The text Datums came from an array, so it cannot be compressed or
235 		 * stored out-of-line -- it's safe to use VARSIZE_ANY*.
236 		 */
237 		Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
238 		lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
239 		lookup[i].frequency = numbers[i];
240 	}
241 
242 	/*
243 	 * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
244 	 * the one before the last cell of the Numbers array. See ts_typanalyze.c
245 	 */
246 	minfreq = numbers[nnumbers - 2];
247 
248 	selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
249 							  nmcelem, minfreq);
250 
251 	pfree(lookup);
252 
253 	return selec;
254 }
255 
256 /*
257  * Traverse the tsquery in preorder, calculating selectivity as:
258  *
259  *	 selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
260  *
261  *	 selec(left_oper) + selec(right_oper) -
262  *		selec(left_oper) * selec(right_oper) in OR nodes,
263  *
264  *	 1 - select(oper) in NOT nodes
265  *
266  *	 histogram-based estimation in prefix VAL nodes
267  *
268  *	 freq[val] in exact VAL nodes, if the value is in MCELEM
269  *	 min(freq[MCELEM]) / 2 in VAL nodes, if it is not
270  *
271  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
272  * binary search for determining freq[MCELEM].
273  *
274  * If we don't have stats for the tsvector, we still use this logic,
275  * except we use default estimates for VAL nodes.  This case is signaled
276  * by lookup == NULL.
277  */
278 static Selectivity
tsquery_opr_selec(QueryItem * item,char * operand,TextFreq * lookup,int length,float4 minfreq)279 tsquery_opr_selec(QueryItem *item, char *operand,
280 				  TextFreq *lookup, int length, float4 minfreq)
281 {
282 	Selectivity selec;
283 
284 	/* since this function recurses, it could be driven to stack overflow */
285 	check_stack_depth();
286 
287 	if (item->type == QI_VAL)
288 	{
289 		QueryOperand *oper = (QueryOperand *) item;
290 		LexemeKey	key;
291 
292 		/*
293 		 * Prepare the key for bsearch().
294 		 */
295 		key.lexeme = operand + oper->distance;
296 		key.length = oper->length;
297 
298 		if (oper->prefix)
299 		{
300 			/* Prefix match, ie the query item is lexeme:* */
301 			Selectivity matched,
302 						allmces;
303 			int			i,
304 						n_matched;
305 
306 			/*
307 			 * Our strategy is to scan through the MCELEM list and combine the
308 			 * frequencies of the ones that match the prefix.  We then
309 			 * extrapolate the fraction of matching MCELEMs to the remaining
310 			 * rows, assuming that the MCELEMs are representative of the whole
311 			 * lexeme population in this respect.  (Compare
312 			 * histogram_selectivity().)  Note that these are most common
313 			 * elements not most common values, so they're not mutually
314 			 * exclusive.  We treat occurrences as independent events.
315 			 *
316 			 * This is only a good plan if we have a pretty fair number of
317 			 * MCELEMs available; we set the threshold at 100.  If no stats or
318 			 * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
319 			 */
320 			if (lookup == NULL || length < 100)
321 				return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
322 
323 			matched = allmces = 0;
324 			n_matched = 0;
325 			for (i = 0; i < length; i++)
326 			{
327 				TextFreq   *t = lookup + i;
328 				int			tlen = VARSIZE_ANY_EXHDR(t->element);
329 
330 				if (tlen >= key.length &&
331 					strncmp(key.lexeme, VARDATA_ANY(t->element),
332 							key.length) == 0)
333 				{
334 					matched += t->frequency - matched * t->frequency;
335 					n_matched++;
336 				}
337 				allmces += t->frequency - allmces * t->frequency;
338 			}
339 
340 			/* Clamp to ensure sanity in the face of roundoff error */
341 			CLAMP_PROBABILITY(matched);
342 			CLAMP_PROBABILITY(allmces);
343 
344 			selec = matched + (1.0 - allmces) * ((double) n_matched / length);
345 
346 			/*
347 			 * In any case, never believe that a prefix match has selectivity
348 			 * less than we would assign for a non-MCELEM lexeme.  This
349 			 * preserves the property that "word:*" should be estimated to
350 			 * match at least as many rows as "word" would be.
351 			 */
352 			selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
353 		}
354 		else
355 		{
356 			/* Regular exact lexeme match */
357 			TextFreq   *searchres;
358 
359 			/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
360 			if (lookup == NULL)
361 				return (Selectivity) DEFAULT_TS_MATCH_SEL;
362 
363 			searchres = (TextFreq *) bsearch(&key, lookup, length,
364 											 sizeof(TextFreq),
365 											 compare_lexeme_textfreq);
366 
367 			if (searchres)
368 			{
369 				/*
370 				 * The element is in MCELEM.  Return precise selectivity (or
371 				 * at least as precise as ANALYZE could find out).
372 				 */
373 				selec = searchres->frequency;
374 			}
375 			else
376 			{
377 				/*
378 				 * The element is not in MCELEM.  Punt, but assume that the
379 				 * selectivity cannot be more than minfreq / 2.
380 				 */
381 				selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
382 			}
383 		}
384 	}
385 	else
386 	{
387 		/* Current TSQuery node is an operator */
388 		Selectivity s1,
389 					s2;
390 
391 		switch (item->qoperator.oper)
392 		{
393 			case OP_NOT:
394 				selec = 1.0 - tsquery_opr_selec(item + 1, operand,
395 												lookup, length, minfreq);
396 				break;
397 
398 			case OP_PHRASE:
399 			case OP_AND:
400 				s1 = tsquery_opr_selec(item + 1, operand,
401 									   lookup, length, minfreq);
402 				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
403 									   lookup, length, minfreq);
404 				selec = s1 * s2;
405 				break;
406 
407 			case OP_OR:
408 				s1 = tsquery_opr_selec(item + 1, operand,
409 									   lookup, length, minfreq);
410 				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
411 									   lookup, length, minfreq);
412 				selec = s1 + s2 - s1 * s2;
413 				break;
414 
415 			default:
416 				elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
417 				selec = 0;		/* keep compiler quiet */
418 				break;
419 		}
420 	}
421 
422 	/* Clamp intermediate results to stay sane despite roundoff error */
423 	CLAMP_PROBABILITY(selec);
424 
425 	return selec;
426 }
427 
428 /*
429  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
430  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
431  * how ANALYZE code sorted data before storing it in a statistic tuple.
432  * See ts_typanalyze.c for details.
433  */
434 static int
compare_lexeme_textfreq(const void * e1,const void * e2)435 compare_lexeme_textfreq(const void *e1, const void *e2)
436 {
437 	const LexemeKey *key = (const LexemeKey *) e1;
438 	const TextFreq *t = (const TextFreq *) e2;
439 	int			len1,
440 				len2;
441 
442 	len1 = key->length;
443 	len2 = VARSIZE_ANY_EXHDR(t->element);
444 
445 	/* Compare lengths first, possibly avoiding a strncmp call */
446 	if (len1 > len2)
447 		return 1;
448 	else if (len1 < len2)
449 		return -1;
450 
451 	/* Fall back on byte-for-byte comparison */
452 	return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
453 }
454