1 /*-------------------------------------------------------------------------
2  *
3  * tsquery_cleanup.c
4  *	 Cleanup query from NOT values and/or stopword
5  *	 Utility functions to correct work.
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/tsquery_cleanup.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "miscadmin.h"
19 #include "tsearch/ts_utils.h"
20 
21 typedef struct NODE
22 {
23 	struct NODE *left;
24 	struct NODE *right;
25 	QueryItem  *valnode;
26 } NODE;
27 
28 /*
29  * make query tree from plain view of query
30  */
31 static NODE *
maketree(QueryItem * in)32 maketree(QueryItem *in)
33 {
34 	NODE	   *node = (NODE *) palloc(sizeof(NODE));
35 
36 	/* since this function recurses, it could be driven to stack overflow. */
37 	check_stack_depth();
38 
39 	node->valnode = in;
40 	node->right = node->left = NULL;
41 	if (in->type == QI_OPR)
42 	{
43 		node->right = maketree(in + 1);
44 		if (in->qoperator.oper != OP_NOT)
45 			node->left = maketree(in + in->qoperator.left);
46 	}
47 	return node;
48 }
49 
50 /*
51  * Internal state for plaintree and plainnode
52  */
53 typedef struct
54 {
55 	QueryItem  *ptr;
56 	int			len;			/* allocated size of ptr */
57 	int			cur;			/* number of elements in ptr */
58 } PLAINTREE;
59 
60 static void
plainnode(PLAINTREE * state,NODE * node)61 plainnode(PLAINTREE *state, NODE *node)
62 {
63 	/* since this function recurses, it could be driven to stack overflow. */
64 	check_stack_depth();
65 
66 	if (state->cur == state->len)
67 	{
68 		state->len *= 2;
69 		state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
70 	}
71 	memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
72 	if (node->valnode->type == QI_VAL)
73 		state->cur++;
74 	else if (node->valnode->qoperator.oper == OP_NOT)
75 	{
76 		state->ptr[state->cur].qoperator.left = 1;
77 		state->cur++;
78 		plainnode(state, node->right);
79 	}
80 	else
81 	{
82 		int			cur = state->cur;
83 
84 		state->cur++;
85 		plainnode(state, node->right);
86 		state->ptr[cur].qoperator.left = state->cur - cur;
87 		plainnode(state, node->left);
88 	}
89 	pfree(node);
90 }
91 
92 /*
93  * make plain view of tree from a NODE-tree representation
94  */
95 static QueryItem *
plaintree(NODE * root,int * len)96 plaintree(NODE *root, int *len)
97 {
98 	PLAINTREE	pl;
99 
100 	pl.cur = 0;
101 	pl.len = 16;
102 	if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
103 	{
104 		pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
105 		plainnode(&pl, root);
106 	}
107 	else
108 		pl.ptr = NULL;
109 	*len = pl.cur;
110 	return pl.ptr;
111 }
112 
113 static void
freetree(NODE * node)114 freetree(NODE *node)
115 {
116 	/* since this function recurses, it could be driven to stack overflow. */
117 	check_stack_depth();
118 
119 	if (!node)
120 		return;
121 	if (node->left)
122 		freetree(node->left);
123 	if (node->right)
124 		freetree(node->right);
125 	pfree(node);
126 }
127 
128 /*
129  * clean tree for ! operator.
130  * It's useful for debug, but in
131  * other case, such view is used with search in index.
132  * Operator ! always return TRUE
133  */
134 static NODE *
clean_NOT_intree(NODE * node)135 clean_NOT_intree(NODE *node)
136 {
137 	/* since this function recurses, it could be driven to stack overflow. */
138 	check_stack_depth();
139 
140 	if (node->valnode->type == QI_VAL)
141 		return node;
142 
143 	if (node->valnode->qoperator.oper == OP_NOT)
144 	{
145 		freetree(node);
146 		return NULL;
147 	}
148 
149 	/* operator & or | */
150 	if (node->valnode->qoperator.oper == OP_OR)
151 	{
152 		if ((node->left = clean_NOT_intree(node->left)) == NULL ||
153 			(node->right = clean_NOT_intree(node->right)) == NULL)
154 		{
155 			freetree(node);
156 			return NULL;
157 		}
158 	}
159 	else
160 	{
161 		NODE	   *res = node;
162 
163 		Assert(node->valnode->qoperator.oper == OP_AND ||
164 			   node->valnode->qoperator.oper == OP_PHRASE);
165 
166 		node->left = clean_NOT_intree(node->left);
167 		node->right = clean_NOT_intree(node->right);
168 		if (node->left == NULL && node->right == NULL)
169 		{
170 			pfree(node);
171 			res = NULL;
172 		}
173 		else if (node->left == NULL)
174 		{
175 			res = node->right;
176 			pfree(node);
177 		}
178 		else if (node->right == NULL)
179 		{
180 			res = node->left;
181 			pfree(node);
182 		}
183 		return res;
184 	}
185 	return node;
186 }
187 
188 QueryItem *
clean_NOT(QueryItem * ptr,int * len)189 clean_NOT(QueryItem *ptr, int *len)
190 {
191 	NODE	   *root = maketree(ptr);
192 
193 	return plaintree(clean_NOT_intree(root), len);
194 }
195 
196 
197 /*
198  * Remove QI_VALSTOP (stopword) nodes from query tree.
199  *
200  * Returns NULL if the query degenerates to nothing.  Input must not be NULL.
201  *
202  * When we remove a phrase operator due to removing one or both of its
203  * arguments, we might need to adjust the distance of a parent phrase
204  * operator.  For example, 'a' is a stopword, so:
205  *		(b <-> a) <-> c  should become	b <2> c
206  *		b <-> (a <-> c)  should become	b <2> c
207  *		(b <-> (a <-> a)) <-> c  should become	b <3> c
208  *		b <-> ((a <-> a) <-> c)  should become	b <3> c
209  * To handle that, we define two output parameters:
210  *		ladd: amount to add to a phrase distance to the left of this node
211  *		radd: amount to add to a phrase distance to the right of this node
212  * We need two outputs because we could need to bubble up adjustments to two
213  * different parent phrase operators.  Consider
214  *		w <-> (((a <-> x) <2> (y <3> a)) <-> z)
215  * After we've removed the two a's and are considering the <2> node (which is
216  * now just x <2> y), we have an ladd distance of 1 that needs to propagate
217  * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
218  * propagate to the rightmost <->, so that we'll end up with
219  *		w <2> ((x <2> y) <4> z)
220  * Near the bottom of the tree, we may have subtrees consisting only of
221  * stopwords.  The distances of any phrase operators within such a subtree are
222  * summed and propagated to both ladd and radd, since we don't know which side
223  * of the lowest surviving phrase operator we are in.  The rule is that any
224  * subtree that degenerates to NULL must return equal values of ladd and radd,
225  * and the parent node dealing with it should incorporate only one of those.
226  *
227  * Currently, we only implement this adjustment for adjacent phrase operators.
228  * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
229  * isn't ideal, but there is no way to represent the really desired semantics
230  * without some redesign of the tsquery structure.  Certainly it would not be
231  * any better to convert that to 'x <2> (y | z)'.  Since this is such a weird
232  * corner case, let it go for now.  But we can fix it in cases where the
233  * intervening non-phrase operator also gets removed, for example
234  * '((x <-> a) | a) <-> y' will become 'x <2> y'.
235  */
236 static NODE *
clean_stopword_intree(NODE * node,int * ladd,int * radd)237 clean_stopword_intree(NODE *node, int *ladd, int *radd)
238 {
239 	/* since this function recurses, it could be driven to stack overflow. */
240 	check_stack_depth();
241 
242 	/* default output parameters indicate no change in parent distance */
243 	*ladd = *radd = 0;
244 
245 	if (node->valnode->type == QI_VAL)
246 		return node;
247 	else if (node->valnode->type == QI_VALSTOP)
248 	{
249 		pfree(node);
250 		return NULL;
251 	}
252 
253 	Assert(node->valnode->type == QI_OPR);
254 
255 	if (node->valnode->qoperator.oper == OP_NOT)
256 	{
257 		/* NOT doesn't change pattern width, so just report child distances */
258 		node->right = clean_stopword_intree(node->right, ladd, radd);
259 		if (!node->right)
260 		{
261 			freetree(node);
262 			return NULL;
263 		}
264 	}
265 	else
266 	{
267 		NODE	   *res = node;
268 		bool		isphrase;
269 		int			ndistance,
270 					lladd,
271 					lradd,
272 					rladd,
273 					rradd;
274 
275 		/* First, recurse */
276 		node->left = clean_stopword_intree(node->left, &lladd, &lradd);
277 		node->right = clean_stopword_intree(node->right, &rladd, &rradd);
278 
279 		/* Check if current node is OP_PHRASE, get its distance */
280 		isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
281 		ndistance = isphrase ? node->valnode->qoperator.distance : 0;
282 
283 		if (node->left == NULL && node->right == NULL)
284 		{
285 			/*
286 			 * When we collapse out a phrase node entirely, propagate its own
287 			 * distance into both *ladd and *radd; it is the responsibility of
288 			 * the parent node to count it only once.  Also, for a phrase
289 			 * node, distances coming from children are summed and propagated
290 			 * up to parent (we assume lladd == lradd and rladd == rradd, else
291 			 * rule was broken at a lower level).  But if this isn't a phrase
292 			 * node, take the larger of the two child distances; that
293 			 * corresponds to what TS_execute will do in non-stopword cases.
294 			 */
295 			if (isphrase)
296 				*ladd = *radd = lladd + ndistance + rladd;
297 			else
298 				*ladd = *radd = Max(lladd, rladd);
299 			freetree(node);
300 			return NULL;
301 		}
302 		else if (node->left == NULL)
303 		{
304 			/* Removing this operator and left subnode */
305 			/* lladd and lradd are equal/redundant, don't count both */
306 			if (isphrase)
307 			{
308 				/* operator's own distance must propagate to left */
309 				*ladd = lladd + ndistance + rladd;
310 				*radd = rradd;
311 			}
312 			else
313 			{
314 				/* at non-phrase op, just forget the left subnode entirely */
315 				*ladd = rladd;
316 				*radd = rradd;
317 			}
318 			res = node->right;
319 			pfree(node);
320 		}
321 		else if (node->right == NULL)
322 		{
323 			/* Removing this operator and right subnode */
324 			/* rladd and rradd are equal/redundant, don't count both */
325 			if (isphrase)
326 			{
327 				/* operator's own distance must propagate to right */
328 				*ladd = lladd;
329 				*radd = lradd + ndistance + rradd;
330 			}
331 			else
332 			{
333 				/* at non-phrase op, just forget the right subnode entirely */
334 				*ladd = lladd;
335 				*radd = lradd;
336 			}
337 			res = node->left;
338 			pfree(node);
339 		}
340 		else if (isphrase)
341 		{
342 			/* Absorb appropriate corrections at this level */
343 			node->valnode->qoperator.distance += lradd + rladd;
344 			/* Propagate up any unaccounted-for corrections */
345 			*ladd = lladd;
346 			*radd = rradd;
347 		}
348 		else
349 		{
350 			/* We're keeping a non-phrase operator, so ladd/radd remain 0 */
351 		}
352 
353 		return res;
354 	}
355 	return node;
356 }
357 
358 /*
359  * Number of elements in query tree
360  */
361 static int32
calcstrlen(NODE * node)362 calcstrlen(NODE *node)
363 {
364 	int32		size = 0;
365 
366 	if (node->valnode->type == QI_VAL)
367 	{
368 		size = node->valnode->qoperand.length + 1;
369 	}
370 	else
371 	{
372 		Assert(node->valnode->type == QI_OPR);
373 
374 		size = calcstrlen(node->right);
375 		if (node->valnode->qoperator.oper != OP_NOT)
376 			size += calcstrlen(node->left);
377 	}
378 
379 	return size;
380 }
381 
382 /*
383  * Remove QI_VALSTOP (stopword) nodes from TSQuery.
384  */
385 TSQuery
cleanup_tsquery_stopwords(TSQuery in)386 cleanup_tsquery_stopwords(TSQuery in)
387 {
388 	int32		len,
389 				lenstr,
390 				commonlen,
391 				i;
392 	NODE	   *root;
393 	int			ladd,
394 				radd;
395 	TSQuery		out;
396 	QueryItem  *items;
397 	char	   *operands;
398 
399 	if (in->size == 0)
400 		return in;
401 
402 	/* eliminate stop words */
403 	root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
404 	if (root == NULL)
405 	{
406 		ereport(NOTICE,
407 				(errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
408 		out = palloc(HDRSIZETQ);
409 		out->size = 0;
410 		SET_VARSIZE(out, HDRSIZETQ);
411 		return out;
412 	}
413 
414 	/*
415 	 * Build TSQuery from plain view
416 	 */
417 
418 	lenstr = calcstrlen(root);
419 	items = plaintree(root, &len);
420 	commonlen = COMPUTESIZE(len, lenstr);
421 
422 	out = palloc(commonlen);
423 	SET_VARSIZE(out, commonlen);
424 	out->size = len;
425 
426 	memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
427 
428 	items = GETQUERY(out);
429 	operands = GETOPERAND(out);
430 	for (i = 0; i < out->size; i++)
431 	{
432 		QueryOperand *op = (QueryOperand *) &items[i];
433 
434 		if (op->type != QI_VAL)
435 			continue;
436 
437 		memcpy(operands, GETOPERAND(in) + op->distance, op->length);
438 		operands[op->length] = '\0';
439 		op->distance = operands - GETOPERAND(out);
440 		operands += op->length + 1;
441 	}
442 
443 	return out;
444 }
445