1 /*-------------------------------------------------------------------------
2 *
3 * to_tsany.c
4 * to_ts* function definitions
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/to_tsany.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "tsearch/ts_cache.h"
17 #include "tsearch/ts_utils.h"
18 #include "utils/builtins.h"
19
20
21 typedef struct MorphOpaque
22 {
23 Oid cfg_id;
24 int qoperator; /* query operator */
25 } MorphOpaque;
26
27
28 Datum
get_current_ts_config(PG_FUNCTION_ARGS)29 get_current_ts_config(PG_FUNCTION_ARGS)
30 {
31 PG_RETURN_OID(getTSCurrentConfig(true));
32 }
33
34 /*
35 * to_tsvector
36 */
37 static int
compareWORD(const void * a,const void * b)38 compareWORD(const void *a, const void *b)
39 {
40 int res;
41
42 res = tsCompareString(
43 ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
44 ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
45 false);
46
47 if (res == 0)
48 {
49 if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
50 return 0;
51
52 res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
53 }
54
55 return res;
56 }
57
58 static int
uniqueWORD(ParsedWord * a,int32 l)59 uniqueWORD(ParsedWord *a, int32 l)
60 {
61 ParsedWord *ptr,
62 *res;
63 int tmppos;
64
65 if (l == 1)
66 {
67 tmppos = LIMITPOS(a->pos.pos);
68 a->alen = 2;
69 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
70 a->pos.apos[0] = 1;
71 a->pos.apos[1] = tmppos;
72 return l;
73 }
74
75 res = a;
76 ptr = a + 1;
77
78 /*
79 * Sort words with its positions
80 */
81 qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
82
83 /*
84 * Initialize first word and its first position
85 */
86 tmppos = LIMITPOS(a->pos.pos);
87 a->alen = 2;
88 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
89 a->pos.apos[0] = 1;
90 a->pos.apos[1] = tmppos;
91
92 /*
93 * Summarize position information for each word
94 */
95 while (ptr - a < l)
96 {
97 if (!(ptr->len == res->len &&
98 strncmp(ptr->word, res->word, res->len) == 0))
99 {
100 /*
101 * Got a new word, so put it in result
102 */
103 res++;
104 res->len = ptr->len;
105 res->word = ptr->word;
106 tmppos = LIMITPOS(ptr->pos.pos);
107 res->alen = 2;
108 res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
109 res->pos.apos[0] = 1;
110 res->pos.apos[1] = tmppos;
111 }
112 else
113 {
114 /*
115 * The word already exists, so adjust position information. But
116 * before we should check size of position's array, max allowed
117 * value for position and uniqueness of position
118 */
119 pfree(ptr->word);
120 if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
121 res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
122 {
123 if (res->pos.apos[0] + 1 >= res->alen)
124 {
125 res->alen *= 2;
126 res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
127 }
128 if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
129 {
130 res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
131 res->pos.apos[0]++;
132 }
133 }
134 }
135 ptr++;
136 }
137
138 return res + 1 - a;
139 }
140
141 /*
142 * make value of tsvector, given parsed text
143 */
144 TSVector
make_tsvector(ParsedText * prs)145 make_tsvector(ParsedText *prs)
146 {
147 int i,
148 j,
149 lenstr = 0,
150 totallen;
151 TSVector in;
152 WordEntry *ptr;
153 char *str;
154 int stroff;
155
156 prs->curwords = uniqueWORD(prs->words, prs->curwords);
157 for (i = 0; i < prs->curwords; i++)
158 {
159 lenstr += prs->words[i].len;
160 if (prs->words[i].alen)
161 {
162 lenstr = SHORTALIGN(lenstr);
163 lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
164 }
165 }
166
167 if (lenstr > MAXSTRPOS)
168 ereport(ERROR,
169 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
170 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
171
172 totallen = CALCDATASIZE(prs->curwords, lenstr);
173 in = (TSVector) palloc0(totallen);
174 SET_VARSIZE(in, totallen);
175 in->size = prs->curwords;
176
177 ptr = ARRPTR(in);
178 str = STRPTR(in);
179 stroff = 0;
180 for (i = 0; i < prs->curwords; i++)
181 {
182 ptr->len = prs->words[i].len;
183 ptr->pos = stroff;
184 memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
185 stroff += prs->words[i].len;
186 pfree(prs->words[i].word);
187 if (prs->words[i].alen)
188 {
189 int k = prs->words[i].pos.apos[0];
190 WordEntryPos *wptr;
191
192 if (k > 0xFFFF)
193 elog(ERROR, "positions array too long");
194
195 ptr->haspos = 1;
196 stroff = SHORTALIGN(stroff);
197 *(uint16 *) (str + stroff) = (uint16) k;
198 wptr = POSDATAPTR(in, ptr);
199 for (j = 0; j < k; j++)
200 {
201 WEP_SETWEIGHT(wptr[j], 0);
202 WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
203 }
204 stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
205 pfree(prs->words[i].pos.apos);
206 }
207 else
208 ptr->haspos = 0;
209 ptr++;
210 }
211 pfree(prs->words);
212 return in;
213 }
214
215 Datum
to_tsvector_byid(PG_FUNCTION_ARGS)216 to_tsvector_byid(PG_FUNCTION_ARGS)
217 {
218 Oid cfgId = PG_GETARG_OID(0);
219 text *in = PG_GETARG_TEXT_P(1);
220 ParsedText prs;
221 TSVector out;
222
223 prs.lenwords = (VARSIZE(in) - VARHDRSZ) / 6; /* just estimation of
224 * word's number */
225 if (prs.lenwords == 0)
226 prs.lenwords = 2;
227 prs.curwords = 0;
228 prs.pos = 0;
229 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
230
231 parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
232 PG_FREE_IF_COPY(in, 1);
233
234 if (prs.curwords)
235 out = make_tsvector(&prs);
236 else
237 {
238 pfree(prs.words);
239 out = palloc(CALCDATASIZE(0, 0));
240 SET_VARSIZE(out, CALCDATASIZE(0, 0));
241 out->size = 0;
242 }
243
244 PG_RETURN_POINTER(out);
245 }
246
247 Datum
to_tsvector(PG_FUNCTION_ARGS)248 to_tsvector(PG_FUNCTION_ARGS)
249 {
250 text *in = PG_GETARG_TEXT_P(0);
251 Oid cfgId;
252
253 cfgId = getTSCurrentConfig(true);
254 PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
255 ObjectIdGetDatum(cfgId),
256 PointerGetDatum(in)));
257 }
258
259 /*
260 * to_tsquery
261 */
262
263
264 /*
265 * This function is used for morph parsing.
266 *
267 * The value is passed to parsetext which will call the right dictionary to
268 * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
269 * to the stack.
270 *
271 * All words belonging to the same variant are pushed as an ANDed list,
272 * and different variants are ORed together.
273 */
274 static void
pushval_morph(Datum opaque,TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)275 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
276 {
277 int32 count = 0;
278 ParsedText prs;
279 uint32 variant,
280 pos = 0,
281 cntvar = 0,
282 cntpos = 0,
283 cnt = 0;
284 MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
285
286 prs.lenwords = 4;
287 prs.curwords = 0;
288 prs.pos = 0;
289 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
290
291 parsetext(data->cfg_id, &prs, strval, lenval);
292
293 if (prs.curwords > 0)
294 {
295 while (count < prs.curwords)
296 {
297 /*
298 * Were any stop words removed? If so, fill empty positions with
299 * placeholders linked by an appropriate operator.
300 */
301 if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
302 {
303 while (pos + 1 < prs.words[count].pos.pos)
304 {
305 /* put placeholders for each missing stop word */
306 pushStop(state);
307 if (cntpos)
308 pushOperator(state, data->qoperator, 1);
309 cntpos++;
310 pos++;
311 }
312 }
313
314 /* save current word's position */
315 pos = prs.words[count].pos.pos;
316
317 /* Go through all variants obtained from this token */
318 cntvar = 0;
319 while (count < prs.curwords && pos == prs.words[count].pos.pos)
320 {
321 variant = prs.words[count].nvariant;
322
323 /* Push all words belonging to the same variant */
324 cnt = 0;
325 while (count < prs.curwords &&
326 pos == prs.words[count].pos.pos &&
327 variant == prs.words[count].nvariant)
328 {
329 pushValue(state,
330 prs.words[count].word,
331 prs.words[count].len,
332 weight,
333 ((prs.words[count].flags & TSL_PREFIX) || prefix));
334 pfree(prs.words[count].word);
335 if (cnt)
336 pushOperator(state, OP_AND, 0);
337 cnt++;
338 count++;
339 }
340
341 if (cntvar)
342 pushOperator(state, OP_OR, 0);
343 cntvar++;
344 }
345
346 if (cntpos)
347 {
348 /* distance may be useful */
349 pushOperator(state, data->qoperator, 1);
350 }
351
352 cntpos++;
353 }
354
355 pfree(prs.words);
356
357 }
358 else
359 pushStop(state);
360 }
361
362 Datum
to_tsquery_byid(PG_FUNCTION_ARGS)363 to_tsquery_byid(PG_FUNCTION_ARGS)
364 {
365 text *in = PG_GETARG_TEXT_P(1);
366 TSQuery query;
367 MorphOpaque data;
368
369 data.cfg_id = PG_GETARG_OID(0);
370 data.qoperator = OP_AND;
371
372 query = parse_tsquery(text_to_cstring(in),
373 pushval_morph,
374 PointerGetDatum(&data),
375 false);
376
377 PG_RETURN_TSQUERY(query);
378 }
379
380 Datum
to_tsquery(PG_FUNCTION_ARGS)381 to_tsquery(PG_FUNCTION_ARGS)
382 {
383 text *in = PG_GETARG_TEXT_P(0);
384 Oid cfgId;
385
386 cfgId = getTSCurrentConfig(true);
387 PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
388 ObjectIdGetDatum(cfgId),
389 PointerGetDatum(in)));
390 }
391
392 Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)393 plainto_tsquery_byid(PG_FUNCTION_ARGS)
394 {
395 text *in = PG_GETARG_TEXT_P(1);
396 TSQuery query;
397 MorphOpaque data;
398
399 data.cfg_id = PG_GETARG_OID(0);
400 data.qoperator = OP_AND;
401
402 query = parse_tsquery(text_to_cstring(in),
403 pushval_morph,
404 PointerGetDatum(&data),
405 true);
406
407 PG_RETURN_POINTER(query);
408 }
409
410 Datum
plainto_tsquery(PG_FUNCTION_ARGS)411 plainto_tsquery(PG_FUNCTION_ARGS)
412 {
413 text *in = PG_GETARG_TEXT_P(0);
414 Oid cfgId;
415
416 cfgId = getTSCurrentConfig(true);
417 PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
418 ObjectIdGetDatum(cfgId),
419 PointerGetDatum(in)));
420 }
421
422
423 Datum
phraseto_tsquery_byid(PG_FUNCTION_ARGS)424 phraseto_tsquery_byid(PG_FUNCTION_ARGS)
425 {
426 text *in = PG_GETARG_TEXT_P(1);
427 TSQuery query;
428 MorphOpaque data;
429
430 data.cfg_id = PG_GETARG_OID(0);
431 data.qoperator = OP_PHRASE;
432
433 query = parse_tsquery(text_to_cstring(in),
434 pushval_morph,
435 PointerGetDatum(&data),
436 true);
437
438 PG_RETURN_TSQUERY(query);
439 }
440
441 Datum
phraseto_tsquery(PG_FUNCTION_ARGS)442 phraseto_tsquery(PG_FUNCTION_ARGS)
443 {
444 text *in = PG_GETARG_TEXT_P(0);
445 Oid cfgId;
446
447 cfgId = getTSCurrentConfig(true);
448 PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
449 ObjectIdGetDatum(cfgId),
450 PointerGetDatum(in)));
451 }
452