1 /*------------------------------------------------------------------------- 2 * 3 * tsquery_cleanup.c 4 * Cleanup query from NOT values and/or stopword 5 * Utility functions to correct work. 6 * 7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group 8 * 9 * 10 * IDENTIFICATION 11 * src/backend/utils/adt/tsquery_cleanup.c 12 * 13 *------------------------------------------------------------------------- 14 */ 15 16 #include "postgres.h" 17 18 #include "miscadmin.h" 19 #include "tsearch/ts_utils.h" 20 21 typedef struct NODE 22 { 23 struct NODE *left; 24 struct NODE *right; 25 QueryItem *valnode; 26 } NODE; 27 28 /* 29 * make query tree from plain view of query 30 */ 31 static NODE * 32 maketree(QueryItem *in) 33 { 34 NODE *node = (NODE *) palloc(sizeof(NODE)); 35 36 /* since this function recurses, it could be driven to stack overflow. */ 37 check_stack_depth(); 38 39 node->valnode = in; 40 node->right = node->left = NULL; 41 if (in->type == QI_OPR) 42 { 43 node->right = maketree(in + 1); 44 if (in->qoperator.oper != OP_NOT) 45 node->left = maketree(in + in->qoperator.left); 46 } 47 return node; 48 } 49 50 /* 51 * Internal state for plaintree and plainnode 52 */ 53 typedef struct 54 { 55 QueryItem *ptr; 56 int len; /* allocated size of ptr */ 57 int cur; /* number of elements in ptr */ 58 } PLAINTREE; 59 60 static void 61 plainnode(PLAINTREE *state, NODE *node) 62 { 63 /* since this function recurses, it could be driven to stack overflow. */ 64 check_stack_depth(); 65 66 if (state->cur == state->len) 67 { 68 state->len *= 2; 69 state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem)); 70 } 71 memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem)); 72 if (node->valnode->type == QI_VAL) 73 state->cur++; 74 else if (node->valnode->qoperator.oper == OP_NOT) 75 { 76 state->ptr[state->cur].qoperator.left = 1; 77 state->cur++; 78 plainnode(state, node->right); 79 } 80 else 81 { 82 int cur = state->cur; 83 84 state->cur++; 85 plainnode(state, node->right); 86 state->ptr[cur].qoperator.left = state->cur - cur; 87 plainnode(state, node->left); 88 } 89 pfree(node); 90 } 91 92 /* 93 * make plain view of tree from a NODE-tree representation 94 */ 95 static QueryItem * 96 plaintree(NODE *root, int *len) 97 { 98 PLAINTREE pl; 99 100 pl.cur = 0; 101 pl.len = 16; 102 if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR)) 103 { 104 pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem)); 105 plainnode(&pl, root); 106 } 107 else 108 pl.ptr = NULL; 109 *len = pl.cur; 110 return pl.ptr; 111 } 112 113 static void 114 freetree(NODE *node) 115 { 116 /* since this function recurses, it could be driven to stack overflow. */ 117 check_stack_depth(); 118 119 if (!node) 120 return; 121 if (node->left) 122 freetree(node->left); 123 if (node->right) 124 freetree(node->right); 125 pfree(node); 126 } 127 128 /* 129 * clean tree for ! operator. 130 * It's useful for debug, but in 131 * other case, such view is used with search in index. 132 * Operator ! always return TRUE 133 */ 134 static NODE * 135 clean_NOT_intree(NODE *node) 136 { 137 /* since this function recurses, it could be driven to stack overflow. */ 138 check_stack_depth(); 139 140 if (node->valnode->type == QI_VAL) 141 return node; 142 143 if (node->valnode->qoperator.oper == OP_NOT) 144 { 145 freetree(node); 146 return NULL; 147 } 148 149 /* operator & or | */ 150 if (node->valnode->qoperator.oper == OP_OR) 151 { 152 if ((node->left = clean_NOT_intree(node->left)) == NULL || 153 (node->right = clean_NOT_intree(node->right)) == NULL) 154 { 155 freetree(node); 156 return NULL; 157 } 158 } 159 else 160 { 161 NODE *res = node; 162 163 Assert(node->valnode->qoperator.oper == OP_AND || 164 node->valnode->qoperator.oper == OP_PHRASE); 165 166 node->left = clean_NOT_intree(node->left); 167 node->right = clean_NOT_intree(node->right); 168 if (node->left == NULL && node->right == NULL) 169 { 170 pfree(node); 171 res = NULL; 172 } 173 else if (node->left == NULL) 174 { 175 res = node->right; 176 pfree(node); 177 } 178 else if (node->right == NULL) 179 { 180 res = node->left; 181 pfree(node); 182 } 183 return res; 184 } 185 return node; 186 } 187 188 QueryItem * 189 clean_NOT(QueryItem *ptr, int *len) 190 { 191 NODE *root = maketree(ptr); 192 193 return plaintree(clean_NOT_intree(root), len); 194 } 195 196 197 /* 198 * Remove QI_VALSTOP (stopword) nodes from query tree. 199 * 200 * Returns NULL if the query degenerates to nothing. Input must not be NULL. 201 * 202 * When we remove a phrase operator due to removing one or both of its 203 * arguments, we might need to adjust the distance of a parent phrase 204 * operator. For example, 'a' is a stopword, so: 205 * (b <-> a) <-> c should become b <2> c 206 * b <-> (a <-> c) should become b <2> c 207 * (b <-> (a <-> a)) <-> c should become b <3> c 208 * b <-> ((a <-> a) <-> c) should become b <3> c 209 * To handle that, we define two output parameters: 210 * ladd: amount to add to a phrase distance to the left of this node 211 * radd: amount to add to a phrase distance to the right of this node 212 * We need two outputs because we could need to bubble up adjustments to two 213 * different parent phrase operators. Consider 214 * w <-> (((a <-> x) <2> (y <3> a)) <-> z) 215 * After we've removed the two a's and are considering the <2> node (which is 216 * now just x <2> y), we have an ladd distance of 1 that needs to propagate 217 * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to 218 * propagate to the rightmost <->, so that we'll end up with 219 * w <2> ((x <2> y) <4> z) 220 * Near the bottom of the tree, we may have subtrees consisting only of 221 * stopwords. The distances of any phrase operators within such a subtree are 222 * summed and propagated to both ladd and radd, since we don't know which side 223 * of the lowest surviving phrase operator we are in. The rule is that any 224 * subtree that degenerates to NULL must return equal values of ladd and radd, 225 * and the parent node dealing with it should incorporate only one of those. 226 * 227 * Currently, we only implement this adjustment for adjacent phrase operators. 228 * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which 229 * isn't ideal, but there is no way to represent the really desired semantics 230 * without some redesign of the tsquery structure. Certainly it would not be 231 * any better to convert that to 'x <2> (y | z)'. Since this is such a weird 232 * corner case, let it go for now. But we can fix it in cases where the 233 * intervening non-phrase operator also gets removed, for example 234 * '((x <-> a) | a) <-> y' will become 'x <2> y'. 235 */ 236 static NODE * 237 clean_stopword_intree(NODE *node, int *ladd, int *radd) 238 { 239 /* since this function recurses, it could be driven to stack overflow. */ 240 check_stack_depth(); 241 242 /* default output parameters indicate no change in parent distance */ 243 *ladd = *radd = 0; 244 245 if (node->valnode->type == QI_VAL) 246 return node; 247 else if (node->valnode->type == QI_VALSTOP) 248 { 249 pfree(node); 250 return NULL; 251 } 252 253 Assert(node->valnode->type == QI_OPR); 254 255 if (node->valnode->qoperator.oper == OP_NOT) 256 { 257 /* NOT doesn't change pattern width, so just report child distances */ 258 node->right = clean_stopword_intree(node->right, ladd, radd); 259 if (!node->right) 260 { 261 freetree(node); 262 return NULL; 263 } 264 } 265 else 266 { 267 NODE *res = node; 268 bool isphrase; 269 int ndistance, 270 lladd, 271 lradd, 272 rladd, 273 rradd; 274 275 /* First, recurse */ 276 node->left = clean_stopword_intree(node->left, &lladd, &lradd); 277 node->right = clean_stopword_intree(node->right, &rladd, &rradd); 278 279 /* Check if current node is OP_PHRASE, get its distance */ 280 isphrase = (node->valnode->qoperator.oper == OP_PHRASE); 281 ndistance = isphrase ? node->valnode->qoperator.distance : 0; 282 283 if (node->left == NULL && node->right == NULL) 284 { 285 /* 286 * When we collapse out a phrase node entirely, propagate its own 287 * distance into both *ladd and *radd; it is the responsibility of 288 * the parent node to count it only once. Also, for a phrase 289 * node, distances coming from children are summed and propagated 290 * up to parent (we assume lladd == lradd and rladd == rradd, else 291 * rule was broken at a lower level). But if this isn't a phrase 292 * node, take the larger of the two child distances; that 293 * corresponds to what TS_execute will do in non-stopword cases. 294 */ 295 if (isphrase) 296 *ladd = *radd = lladd + ndistance + rladd; 297 else 298 *ladd = *radd = Max(lladd, rladd); 299 freetree(node); 300 return NULL; 301 } 302 else if (node->left == NULL) 303 { 304 /* Removing this operator and left subnode */ 305 /* lladd and lradd are equal/redundant, don't count both */ 306 if (isphrase) 307 { 308 /* operator's own distance must propagate to left */ 309 *ladd = lladd + ndistance + rladd; 310 *radd = rradd; 311 } 312 else 313 { 314 /* at non-phrase op, just forget the left subnode entirely */ 315 *ladd = rladd; 316 *radd = rradd; 317 } 318 res = node->right; 319 pfree(node); 320 } 321 else if (node->right == NULL) 322 { 323 /* Removing this operator and right subnode */ 324 /* rladd and rradd are equal/redundant, don't count both */ 325 if (isphrase) 326 { 327 /* operator's own distance must propagate to right */ 328 *ladd = lladd; 329 *radd = lradd + ndistance + rradd; 330 } 331 else 332 { 333 /* at non-phrase op, just forget the right subnode entirely */ 334 *ladd = lladd; 335 *radd = lradd; 336 } 337 res = node->left; 338 pfree(node); 339 } 340 else if (isphrase) 341 { 342 /* Absorb appropriate corrections at this level */ 343 node->valnode->qoperator.distance += lradd + rladd; 344 /* Propagate up any unaccounted-for corrections */ 345 *ladd = lladd; 346 *radd = rradd; 347 } 348 else 349 { 350 /* We're keeping a non-phrase operator, so ladd/radd remain 0 */ 351 } 352 353 return res; 354 } 355 return node; 356 } 357 358 /* 359 * Number of elements in query tree 360 */ 361 static int32 362 calcstrlen(NODE *node) 363 { 364 int32 size = 0; 365 366 if (node->valnode->type == QI_VAL) 367 { 368 size = node->valnode->qoperand.length + 1; 369 } 370 else 371 { 372 Assert(node->valnode->type == QI_OPR); 373 374 size = calcstrlen(node->right); 375 if (node->valnode->qoperator.oper != OP_NOT) 376 size += calcstrlen(node->left); 377 } 378 379 return size; 380 } 381 382 /* 383 * Remove QI_VALSTOP (stopword) nodes from TSQuery. 384 */ 385 TSQuery 386 cleanup_tsquery_stopwords(TSQuery in) 387 { 388 int32 len, 389 lenstr, 390 commonlen, 391 i; 392 NODE *root; 393 int ladd, 394 radd; 395 TSQuery out; 396 QueryItem *items; 397 char *operands; 398 399 if (in->size == 0) 400 return in; 401 402 /* eliminate stop words */ 403 root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd); 404 if (root == NULL) 405 { 406 ereport(NOTICE, 407 (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored"))); 408 out = palloc(HDRSIZETQ); 409 out->size = 0; 410 SET_VARSIZE(out, HDRSIZETQ); 411 return out; 412 } 413 414 /* 415 * Build TSQuery from plain view 416 */ 417 418 lenstr = calcstrlen(root); 419 items = plaintree(root, &len); 420 commonlen = COMPUTESIZE(len, lenstr); 421 422 out = palloc(commonlen); 423 SET_VARSIZE(out, commonlen); 424 out->size = len; 425 426 memcpy(GETQUERY(out), items, len * sizeof(QueryItem)); 427 428 items = GETQUERY(out); 429 operands = GETOPERAND(out); 430 for (i = 0; i < out->size; i++) 431 { 432 QueryOperand *op = (QueryOperand *) &items[i]; 433 434 if (op->type != QI_VAL) 435 continue; 436 437 memcpy(operands, GETOPERAND(in) + op->distance, op->length); 438 operands[op->length] = '\0'; 439 op->distance = operands - GETOPERAND(out); 440 operands += op->length + 1; 441 } 442 443 return out; 444 } 445