1 /*------------------------------------------------------------------------- 2 * 3 * tsvector_parser.c 4 * Parser for tsvector 5 * 6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 7 * 8 * 9 * IDENTIFICATION 10 * src/backend/utils/adt/tsvector_parser.c 11 * 12 *------------------------------------------------------------------------- 13 */ 14 15 #include "postgres.h" 16 17 #include "tsearch/ts_locale.h" 18 #include "tsearch/ts_utils.h" 19 20 21 /* 22 * Private state of tsvector parser. Note that tsquery also uses this code to 23 * parse its input, hence the boolean flags. The two flags are both true or 24 * both false in current usage, but we keep them separate for clarity. 25 * is_tsquery affects *only* the content of error messages. 26 */ 27 struct TSVectorParseStateData 28 { 29 char *prsbuf; /* next input character */ 30 char *bufstart; /* whole string (used only for errors) */ 31 char *word; /* buffer to hold the current word */ 32 int len; /* size in bytes allocated for 'word' */ 33 int eml; /* max bytes per character */ 34 bool oprisdelim; /* treat ! | * ( ) as delimiters? */ 35 bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */ 36 bool is_web; /* we're in websearch_to_tsquery() */ 37 }; 38 39 40 /* 41 * Initializes parser for the input string. If oprisdelim is set, the 42 * following characters are treated as delimiters in addition to whitespace: 43 * ! | & ( ) 44 */ 45 TSVectorParseState 46 init_tsvector_parser(char *input, int flags) 47 { 48 TSVectorParseState state; 49 50 state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData)); 51 state->prsbuf = input; 52 state->bufstart = input; 53 state->len = 32; 54 state->word = (char *) palloc(state->len); 55 state->eml = pg_database_encoding_max_length(); 56 state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0; 57 state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0; 58 state->is_web = (flags & P_TSV_IS_WEB) != 0; 59 60 return state; 61 } 62 63 /* 64 * Reinitializes parser to parse 'input', instead of previous input. 65 */ 66 void 67 reset_tsvector_parser(TSVectorParseState state, char *input) 68 { 69 state->prsbuf = input; 70 } 71 72 /* 73 * Shuts down a tsvector parser. 74 */ 75 void 76 close_tsvector_parser(TSVectorParseState state) 77 { 78 pfree(state->word); 79 pfree(state); 80 } 81 82 /* increase the size of 'word' if needed to hold one more character */ 83 #define RESIZEPRSBUF \ 84 do { \ 85 int clen = curpos - state->word; \ 86 if ( clen + state->eml >= state->len ) \ 87 { \ 88 state->len *= 2; \ 89 state->word = (char *) repalloc(state->word, state->len); \ 90 curpos = state->word + clen; \ 91 } \ 92 } while (0) 93 94 /* Fills gettoken_tsvector's output parameters, and returns true */ 95 #define RETURN_TOKEN \ 96 do { \ 97 if (pos_ptr != NULL) \ 98 { \ 99 *pos_ptr = pos; \ 100 *poslen = npos; \ 101 } \ 102 else if (pos != NULL) \ 103 pfree(pos); \ 104 \ 105 if (strval != NULL) \ 106 *strval = state->word; \ 107 if (lenval != NULL) \ 108 *lenval = curpos - state->word; \ 109 if (endptr != NULL) \ 110 *endptr = state->prsbuf; \ 111 return true; \ 112 } while(0) 113 114 115 /* State codes used in gettoken_tsvector */ 116 #define WAITWORD 1 117 #define WAITENDWORD 2 118 #define WAITNEXTCHAR 3 119 #define WAITENDCMPLX 4 120 #define WAITPOSINFO 5 121 #define INPOSINFO 6 122 #define WAITPOSDELIM 7 123 #define WAITCHARCMPLX 8 124 125 #define PRSSYNTAXERROR prssyntaxerror(state) 126 127 static void 128 prssyntaxerror(TSVectorParseState state) 129 { 130 ereport(ERROR, 131 (errcode(ERRCODE_SYNTAX_ERROR), 132 state->is_tsquery ? 133 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) : 134 errmsg("syntax error in tsvector: \"%s\"", state->bufstart))); 135 } 136 137 138 /* 139 * Get next token from string being parsed. Returns true if successful, 140 * false if end of input string is reached. On success, these output 141 * parameters are filled in: 142 * 143 * *strval pointer to token 144 * *lenval length of *strval 145 * *pos_ptr pointer to a palloc'd array of positions and weights 146 * associated with the token. If the caller is not interested 147 * in the information, NULL can be supplied. Otherwise 148 * the caller is responsible for pfreeing the array. 149 * *poslen number of elements in *pos_ptr 150 * *endptr scan resumption point 151 * 152 * Pass NULL for unwanted output parameters. 153 */ 154 bool 155 gettoken_tsvector(TSVectorParseState state, 156 char **strval, int *lenval, 157 WordEntryPos **pos_ptr, int *poslen, 158 char **endptr) 159 { 160 int oldstate = 0; 161 char *curpos = state->word; 162 int statecode = WAITWORD; 163 164 /* 165 * pos is for collecting the comma delimited list of positions followed by 166 * the actual token. 167 */ 168 WordEntryPos *pos = NULL; 169 int npos = 0; /* elements of pos used */ 170 int posalen = 0; /* allocated size of pos */ 171 172 while (1) 173 { 174 if (statecode == WAITWORD) 175 { 176 if (*(state->prsbuf) == '\0') 177 return false; 178 else if (!state->is_web && t_iseq(state->prsbuf, '\'')) 179 statecode = WAITENDCMPLX; 180 else if (!state->is_web && t_iseq(state->prsbuf, '\\')) 181 { 182 statecode = WAITNEXTCHAR; 183 oldstate = WAITENDWORD; 184 } 185 else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || 186 (state->is_web && t_iseq(state->prsbuf, '"'))) 187 PRSSYNTAXERROR; 188 else if (!t_isspace(state->prsbuf)) 189 { 190 COPYCHAR(curpos, state->prsbuf); 191 curpos += pg_mblen(state->prsbuf); 192 statecode = WAITENDWORD; 193 } 194 } 195 else if (statecode == WAITNEXTCHAR) 196 { 197 if (*(state->prsbuf) == '\0') 198 ereport(ERROR, 199 (errcode(ERRCODE_SYNTAX_ERROR), 200 errmsg("there is no escaped character: \"%s\"", 201 state->bufstart))); 202 else 203 { 204 RESIZEPRSBUF; 205 COPYCHAR(curpos, state->prsbuf); 206 curpos += pg_mblen(state->prsbuf); 207 Assert(oldstate != 0); 208 statecode = oldstate; 209 } 210 } 211 else if (statecode == WAITENDWORD) 212 { 213 if (!state->is_web && t_iseq(state->prsbuf, '\\')) 214 { 215 statecode = WAITNEXTCHAR; 216 oldstate = WAITENDWORD; 217 } 218 else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || 219 (state->oprisdelim && ISOPERATOR(state->prsbuf)) || 220 (state->is_web && t_iseq(state->prsbuf, '"'))) 221 { 222 RESIZEPRSBUF; 223 if (curpos == state->word) 224 PRSSYNTAXERROR; 225 *(curpos) = '\0'; 226 RETURN_TOKEN; 227 } 228 else if (t_iseq(state->prsbuf, ':')) 229 { 230 if (curpos == state->word) 231 PRSSYNTAXERROR; 232 *(curpos) = '\0'; 233 if (state->oprisdelim) 234 RETURN_TOKEN; 235 else 236 statecode = INPOSINFO; 237 } 238 else 239 { 240 RESIZEPRSBUF; 241 COPYCHAR(curpos, state->prsbuf); 242 curpos += pg_mblen(state->prsbuf); 243 } 244 } 245 else if (statecode == WAITENDCMPLX) 246 { 247 if (!state->is_web && t_iseq(state->prsbuf, '\'')) 248 { 249 statecode = WAITCHARCMPLX; 250 } 251 else if (!state->is_web && t_iseq(state->prsbuf, '\\')) 252 { 253 statecode = WAITNEXTCHAR; 254 oldstate = WAITENDCMPLX; 255 } 256 else if (*(state->prsbuf) == '\0') 257 PRSSYNTAXERROR; 258 else 259 { 260 RESIZEPRSBUF; 261 COPYCHAR(curpos, state->prsbuf); 262 curpos += pg_mblen(state->prsbuf); 263 } 264 } 265 else if (statecode == WAITCHARCMPLX) 266 { 267 if (!state->is_web && t_iseq(state->prsbuf, '\'')) 268 { 269 RESIZEPRSBUF; 270 COPYCHAR(curpos, state->prsbuf); 271 curpos += pg_mblen(state->prsbuf); 272 statecode = WAITENDCMPLX; 273 } 274 else 275 { 276 RESIZEPRSBUF; 277 *(curpos) = '\0'; 278 if (curpos == state->word) 279 PRSSYNTAXERROR; 280 if (state->oprisdelim) 281 { 282 /* state->prsbuf+=pg_mblen(state->prsbuf); */ 283 RETURN_TOKEN; 284 } 285 else 286 statecode = WAITPOSINFO; 287 continue; /* recheck current character */ 288 } 289 } 290 else if (statecode == WAITPOSINFO) 291 { 292 if (t_iseq(state->prsbuf, ':')) 293 statecode = INPOSINFO; 294 else 295 RETURN_TOKEN; 296 } 297 else if (statecode == INPOSINFO) 298 { 299 if (t_isdigit(state->prsbuf)) 300 { 301 if (posalen == 0) 302 { 303 posalen = 4; 304 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen); 305 npos = 0; 306 } 307 else if (npos + 1 >= posalen) 308 { 309 posalen *= 2; 310 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen); 311 } 312 npos++; 313 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); 314 /* we cannot get here in tsquery, so no need for 2 errmsgs */ 315 if (WEP_GETPOS(pos[npos - 1]) == 0) 316 ereport(ERROR, 317 (errcode(ERRCODE_SYNTAX_ERROR), 318 errmsg("wrong position info in tsvector: \"%s\"", 319 state->bufstart))); 320 WEP_SETWEIGHT(pos[npos - 1], 0); 321 statecode = WAITPOSDELIM; 322 } 323 else 324 PRSSYNTAXERROR; 325 } 326 else if (statecode == WAITPOSDELIM) 327 { 328 if (t_iseq(state->prsbuf, ',')) 329 statecode = INPOSINFO; 330 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) 331 { 332 if (WEP_GETWEIGHT(pos[npos - 1])) 333 PRSSYNTAXERROR; 334 WEP_SETWEIGHT(pos[npos - 1], 3); 335 } 336 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) 337 { 338 if (WEP_GETWEIGHT(pos[npos - 1])) 339 PRSSYNTAXERROR; 340 WEP_SETWEIGHT(pos[npos - 1], 2); 341 } 342 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) 343 { 344 if (WEP_GETWEIGHT(pos[npos - 1])) 345 PRSSYNTAXERROR; 346 WEP_SETWEIGHT(pos[npos - 1], 1); 347 } 348 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) 349 { 350 if (WEP_GETWEIGHT(pos[npos - 1])) 351 PRSSYNTAXERROR; 352 WEP_SETWEIGHT(pos[npos - 1], 0); 353 } 354 else if (t_isspace(state->prsbuf) || 355 *(state->prsbuf) == '\0') 356 RETURN_TOKEN; 357 else if (!t_isdigit(state->prsbuf)) 358 PRSSYNTAXERROR; 359 } 360 else /* internal error */ 361 elog(ERROR, "unrecognized state in gettoken_tsvector: %d", 362 statecode); 363 364 /* get next char */ 365 state->prsbuf += pg_mblen(state->prsbuf); 366 } 367 } 368