1 /*-------------------------------------------------------------------------
2  *
3  * ts_utils.h
4  *	  helper utilities for tsearch
5  *
6  * Copyright (c) 1998-2016, PostgreSQL Global Development Group
7  *
8  * src/include/tsearch/ts_utils.h
9  *
10  *-------------------------------------------------------------------------
11  */
12 #ifndef _PG_TS_UTILS_H_
13 #define _PG_TS_UTILS_H_
14 
15 #include "nodes/pg_list.h"
16 #include "tsearch/ts_public.h"
17 #include "tsearch/ts_type.h"
18 
19 /*
20  * Common parse definitions for tsvector and tsquery
21  */
22 
23 /* tsvector parser support. */
24 
/*
 * Opaque handle for the tsvector parser.  Only the pointer typedef is usable
 * from this header; the struct body is not declared here.
 */
25 struct TSVectorParseStateData;	/* opaque struct in tsvector_parser.c */
26 typedef struct TSVectorParseStateData *TSVectorParseState;
27 
/*
 * Parser entry points operating on a TSVectorParseState.
 *
 * Judging by names and signatures only, the intended sequence is
 * init -> gettoken (repeatedly, optionally reset) -> close; the
 * implementations are not visible here, so confirm against
 * tsvector_parser.c.
 *
 * gettoken_tsvector() returns a bool and takes pointer arguments (token,
 * len, pos, poslen, endptr) that are presumably filled in as outputs.
 * NOTE(review): which of them may be NULL, and the meaning of the
 * oprisdelim / is_tsquery switches, cannot be determined from this header.
 */
28 extern TSVectorParseState init_tsvector_parser(char *input,
29 					 bool oprisdelim,
30 					 bool is_tsquery);
31 extern void reset_tsvector_parser(TSVectorParseState state, char *input);
32 extern bool gettoken_tsvector(TSVectorParseState state,
33 				  char **token, int *len,
34 				  WordEntryPos **pos, int *poslen,
35 				  char **endptr);
36 extern void close_tsvector_parser(TSVectorParseState state);
37 
38 /* parse_tsquery */
39 
/*
 * Opaque handle for the tsquery parser; the struct body is not declared in
 * this header, so callers can only pass the pointer around.
 */
40 struct TSQueryParserStateData;	/* private in backend/utils/adt/tsquery.c */
41 typedef struct TSQueryParserStateData *TSQueryParserState;
42 
/*
 * Callback type supplied to parse_tsquery() as "pushval".
 *
 * It receives a Datum "opaque" (presumably the same value that was handed to
 * parse_tsquery -- confirm in tsquery.c), the parser state, one token given
 * as pointer plus length, a weight bitmap, and a prefix flag.
 * NOTE(review): when and how often the parser invokes the callback is not
 * visible from this header.
 */
43 typedef void (*PushFunction) (Datum opaque, TSQueryParserState state,
44 										  char *token, int tokenlen,
45 										  int16 tokenweights,	/* bitmap as described
46 																 * in QueryOperand
47 																 * struct */
48 										  bool prefix);
49 
/*
 * Parse the text in "buf" and return a TSQuery.
 *
 * buf:     input text
 * pushval: PushFunction callback (see typedef above)
 * opaque:  caller-defined Datum accompanying the callback
 * isplain: presumably selects plain-text rather than operator-syntax
 *          parsing -- TODO confirm against the implementation
 */
50 extern TSQuery parse_tsquery(char *buf,
51 			  PushFunction pushval,
52 			  Datum opaque, bool isplain);
53 
54 /* Functions for use by PushFunction implementations */
/*
 * Helpers that take the TSQueryParserState a PushFunction is given.
 *
 * pushValue:    takes a string (pointer + length), a weight and a prefix
 *               flag; presumably appends an operand to the query being built.
 * pushStop:     takes only the state; presumably records a stop-word
 *               placeholder -- TODO confirm.
 * pushOperator: takes an operator code and a distance; presumably the
 *               distance is only meaningful for phrase operators -- confirm.
 */
55 extern void pushValue(TSQueryParserState state,
56 		  char *strval, int lenval, int16 weight, bool prefix);
57 extern void pushStop(TSQueryParserState state);
58 extern void pushOperator(TSQueryParserState state, int8 oper, int16 distance);
59 
60 /*
61  * parse plain text and lexize words
62  */
/*
 * ParsedWord - one word entry in a ParsedText.words array.
 *
 * The "pos" member is a union: it holds either a single uint16 position or a
 * pointer to an array of positions.  How a reader tells which arm is active
 * is not shown in this header (presumably alen != 0 means the array form --
 * TODO confirm in the code that fills these structs).
 */
63 typedef struct
64 {
65 	uint16		len;			/* presumably byte length of "word" -- confirm */
66 	uint16		nvariant;		/* NOTE(review): meaning not visible here */
67 	union
68 	{
69 		uint16		pos;		/* single-position form */
70 
71 		/*
72 		 * When apos array is used, apos[0] is the number of elements in the
73 		 * array (excluding apos[0]), and alen is the allocated size of the
74 		 * array.
75 		 */
76 		uint16	   *apos;
77 	}			pos;
78 	uint16		flags;			/* currently, only TSL_PREFIX */
79 	char	   *word;			/* the word text; ownership not visible here */
80 	uint32		alen;			/* allocated size of pos.apos (see above) */
81 } ParsedWord;
82 
/*
 * ParsedText - a collection of ParsedWord entries.
 *
 * This is the structure passed to parsetext() and to make_tsvector(), both
 * declared in this header.
 * NOTE(review): the field roles below are inferred from their names only.
 */
83 typedef struct
84 {
85 	ParsedWord *words;			/* array of word entries */
86 	int32		lenwords;		/* presumably allocated length of words[] */
87 	int32		curwords;		/* presumably number of entries in use */
88 	int32		pos;			/* presumably running word position counter */
89 } ParsedText;
90 
/*
 * Takes a text search configuration OID, a ParsedText, and an input buffer
 * given as pointer plus length.  Per the section comment it parses plain
 * text and lexizes the words; results are presumably accumulated into *prs
 * (the function returns void) -- confirm in the implementation.
 */
91 extern void parsetext(Oid cfgId, ParsedText *prs, char *buf, int32 buflen);
92 
93 /*
94  * Headline framework; the common flow for generating a headline is:
95  *	1 parse text with hlparsetext
96  *	2 parser-specific function to find part
97  *	3 generateHeadline to generate result text
98  */
99 
/*
 * hlparsetext: takes a configuration OID, a HeadlineParsedText, the TSQuery
 * and the input buffer (pointer + length); returns void, so results are
 * presumably stored into *prs.
 *
 * generateHeadline: takes the HeadlineParsedText and returns a text pointer.
 * NOTE(review): allocation/ownership of the returned text is not visible
 * from this header.
 */
100 extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
101 			char *buf, int32 buflen);
102 extern text *generateHeadline(HeadlineParsedText *prs);
103 
104 /*
105  * TSQuery execution support
106  *
107  * TS_execute() executes a tsquery against data that can be represented in
108  * various forms.  The TSExecuteCallback callback function is called to check
109  * whether a given primitive tsquery value is matched in the data.
110  */
111 
112 /*
113  * struct ExecPhraseData is passed to a TSExecuteCallback function if we need
114  * lexeme position data (because of a phrase-match operator in the tsquery).
115  * The callback should fill in position data when it returns true (success).
116  * If it cannot return position data, it may leave "data" unchanged, but
117  * then the caller of TS_execute() must pass the TS_EXEC_PHRASE_NO_POS flag
118  * and must arrange for a later recheck with position data available.
119  *
120  * The reported lexeme positions must be sorted and unique.  Callers must only
121  * consult the position bits of the pos array, ie, WEP_GETPOS(data->pos[i]).
122  * This allows the returned "pos" to point directly to the WordEntryPos
123  * portion of a tsvector value.  If "allocated" is true then the pos array
124  * is palloc'd workspace and caller may free it when done.
125  *
126  * "negate" means that the pos array contains positions where the query does
127  * not match, rather than positions where it does.  "width" is positive when
128  * the match is wider than one lexeme.  Neither of these fields normally needs
129  * to be touched by TSExecuteCallback functions; they are used for
130  * phrase-search processing within TS_execute.
131  *
132  * All fields of the ExecPhraseData struct are initially zeroed by caller.
133  */
/*
 * ExecPhraseData - lexeme position data passed to a TSExecuteCallback
 * (it is the type of that callback's "data" parameter).
 */
134 typedef struct ExecPhraseData
135 {
136 	int			npos;			/* number of positions reported */
137 	bool		allocated;		/* pos points to palloc'd data? */
138 	bool		negate;			/* positions are where query is NOT matched */
139 	WordEntryPos *pos;			/* ordered, non-duplicate lexeme positions */
140 	int			width;			/* width of match in lexemes, less 1 */
141 } ExecPhraseData;
142 
143 /*
144  * Signature for TSQuery lexeme check functions
145  *
146  * arg: opaque value passed through from caller of TS_execute
147  * val: lexeme to test for presence of
148  * data: to be filled with lexeme positions; NULL if position data not needed
149  *
150  * Return TRUE if lexeme is present in data, else FALSE.  If data is not
151  * NULL, it should be filled with lexeme positions, but function can leave
152  * it as zeroes if position data is not available.
153  */
/*
 * Function-pointer type; a value of this type is what TS_execute() accepts
 * as its "chkcond" argument.
 */
154 typedef bool (*TSExecuteCallback) (void *arg, QueryOperand *val,
155 											   ExecPhraseData *data);
156 
157 /*
158  * Flag bits for TS_execute
159  */
/*
 * The values below are individual bits (0x01, 0x02), so they can be OR-ed
 * together; TS_execute() takes them in its uint32 "flags" argument.
 */
160 #define TS_EXEC_EMPTY			(0x00)	/* no flag bits set */
161 /*
162  * If TS_EXEC_CALC_NOT is not set, then NOT expressions are automatically
163  * evaluated to be true.  Useful in cases where NOT cannot be accurately
164  * computed (GiST) or it isn't important (ranking).  From TS_execute's
165  * perspective, !CALC_NOT means that the TSExecuteCallback function might
166  * return false-positive indications of a lexeme's presence.
167  */
168 #define TS_EXEC_CALC_NOT		(0x01)
169 /*
170  * If TS_EXEC_PHRASE_NO_POS is set, allow OP_PHRASE to be executed lossily
171  * in the absence of position information: a TRUE result indicates that the
172  * phrase might be present.  Without this flag, OP_PHRASE always returns
173  * false if lexeme position information is not available.
174  */
175 #define TS_EXEC_PHRASE_NO_POS	(0x02)
176 /* Obsolete spelling of TS_EXEC_PHRASE_NO_POS: */
177 #define TS_EXEC_PHRASE_AS_AND	TS_EXEC_PHRASE_NO_POS	/* same value (0x02) */
178 
/*
 * TS_execute: evaluate the query tree starting at "curitem".
 *   arg:     opaque pointer, matching the callback's first parameter
 *   flags:   OR of TS_EXEC_* bits
 *   chkcond: TSExecuteCallback used to test individual lexemes
 * Returns a bool match result.
 *
 * tsquery_requires_match: takes a query item and returns bool.
 * NOTE(review): its exact meaning is not documented in this header; the name
 * suggests it reports whether the query can only be satisfied by at least
 * one positive lexeme match -- confirm in the implementation.
 */
179 extern bool TS_execute(QueryItem *curitem, void *arg, uint32 flags,
180 		   TSExecuteCallback chkcond);
181 extern bool tsquery_requires_match(QueryItem *curitem);
182 
183 /*
184  * to_ts* - text transformation to tsvector, tsquery
185  */
/*
 * make_tsvector: takes a ParsedText and returns a TSVector.
 *
 * tsCompareString: compares two strings given as pointer + length; "prefix"
 * presumably requests prefix-match comparison -- TODO confirm.  Returns
 * int32 (sign convention not visible here).
 */
186 extern TSVector make_tsvector(ParsedText *prs);
187 extern int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix);
188 
/*
 * The remaining entries use the PG_FUNCTION_ARGS signature and return Datum.
 * The "_byid" variants presumably take an explicit configuration OID, while
 * the others use a default -- confirm against the definitions.
 */
189 extern Datum to_tsvector_byid(PG_FUNCTION_ARGS);
190 extern Datum to_tsvector(PG_FUNCTION_ARGS);
191 extern Datum to_tsquery_byid(PG_FUNCTION_ARGS);
192 extern Datum to_tsquery(PG_FUNCTION_ARGS);
193 extern Datum plainto_tsquery_byid(PG_FUNCTION_ARGS);
194 extern Datum plainto_tsquery(PG_FUNCTION_ARGS);
195 extern Datum phraseto_tsquery_byid(PG_FUNCTION_ARGS);
196 extern Datum phraseto_tsquery(PG_FUNCTION_ARGS);
197 
198 /*
199  * GiST support functions
200  */
201 
/*
 * gtsvector_* entry points, all declared with PG_FUNCTION_ARGS and returning
 * Datum.  The suffixes (compress, decompress, consistent, union, same,
 * penalty, picksplit) match the names of the GiST support-method roles.
 * NOTE(review): gtsvector_consistent_oldsig is presumably a variant kept for
 * an older SQL-level signature -- confirm.
 */
202 extern Datum gtsvector_compress(PG_FUNCTION_ARGS);
203 extern Datum gtsvector_decompress(PG_FUNCTION_ARGS);
204 extern Datum gtsvector_consistent(PG_FUNCTION_ARGS);
205 extern Datum gtsvector_union(PG_FUNCTION_ARGS);
206 extern Datum gtsvector_same(PG_FUNCTION_ARGS);
207 extern Datum gtsvector_penalty(PG_FUNCTION_ARGS);
208 extern Datum gtsvector_picksplit(PG_FUNCTION_ARGS);
209 extern Datum gtsvector_consistent_oldsig(PG_FUNCTION_ARGS);
210 
211 /*
212  * IO functions for pseudotype gtsvector
213  * used internally in tsvector GiST opclass
214  */
/* Input ("in") and output ("out") routines for gtsvector; PG_FUNCTION_ARGS signature. */
215 extern Datum gtsvectorin(PG_FUNCTION_ARGS);
216 extern Datum gtsvectorout(PG_FUNCTION_ARGS);
217 
218 /*
219  * GIN support functions
220  */
221 
/*
 * GIN-related entry points, all declared with PG_FUNCTION_ARGS and returning
 * Datum.
 * NOTE(review): the "_2args", "_5args", "_6args" and "_oldsig" names
 * presumably denote variants kept for alternative/older SQL-level argument
 * lists; this header does not say so explicitly -- confirm.
 */
222 extern Datum gin_extract_tsvector(PG_FUNCTION_ARGS);
223 extern Datum gin_cmp_tslexeme(PG_FUNCTION_ARGS);
224 extern Datum gin_cmp_prefix(PG_FUNCTION_ARGS);
225 extern Datum gin_extract_tsquery(PG_FUNCTION_ARGS);
226 extern Datum gin_tsquery_consistent(PG_FUNCTION_ARGS);
227 extern Datum gin_tsquery_triconsistent(PG_FUNCTION_ARGS);
228 extern Datum gin_extract_tsvector_2args(PG_FUNCTION_ARGS);
229 extern Datum gin_extract_tsquery_5args(PG_FUNCTION_ARGS);
230 extern Datum gin_tsquery_consistent_6args(PG_FUNCTION_ARGS);
231 extern Datum gin_extract_tsquery_oldsig(PG_FUNCTION_ARGS);
232 extern Datum gin_tsquery_consistent_oldsig(PG_FUNCTION_ARGS);
233 
234 /*
235  * Possible strategy numbers for indexes
236  *	  TSearchStrategyNumber  - (tsvector|text) @@ tsquery
237  *	  TSearchWithClassStrategyNumber  - tsvector @@@ tsquery
238  */
239 #define TSearchStrategyNumber			1	/* (tsvector|text) @@ tsquery */
240 #define TSearchWithClassStrategyNumber	2	/* tsvector @@@ tsquery */
241 
242 /*
243  * TSQuery Utilities
244  */
/*
 * clean_NOT: takes a QueryItem pointer and an int32 pointer "len"; returns a
 * QueryItem pointer.  Presumably "len" receives the length of the returned
 * item array -- TODO confirm.
 *
 * cleanup_tsquery_stopwords: takes a TSQuery and returns a TSQuery.
 * NOTE(review): whether the input is modified in place or a new query is
 * returned is not visible from this header.
 */
245 extern QueryItem *clean_NOT(QueryItem *ptr, int32 *len);
246 extern TSQuery cleanup_tsquery_stopwords(TSQuery in);
247 
/*
 * QTNode - node of a tree form of a tsquery.
 *
 * Each node points to an array of child-node pointers ("child", with
 * "nchild" giving a count) and to a QueryItem ("valnode").  This is the type
 * consumed and produced by the QT2QTN()/QTN2QT()/QTN*() functions declared
 * in this header.
 * NOTE(review): field roles other than those stated are inferred from names.
 */
248 typedef struct QTNode
249 {
250 	QueryItem  *valnode;		/* the query item this node represents */
251 	uint32		flags;			/* QTN_* bits */
252 	int32		nchild;			/* presumably number of entries in child[] */
253 	char	   *word;			/* presumably operand text, if any -- confirm */
254 	uint32		sign;			/* NOTE(review): meaning not visible here */
255 	struct QTNode **child;		/* array of pointers to child nodes */
256 } QTNode;
257 
258 /* bits in QTNode.flags */
/*
 * Distinct single-bit values, so they can be combined with bitwise OR.
 * NOTE(review): individual meanings are inferred from the names only --
 * NEEDFREE / WORDFREE presumably mark what QTNFree() must release, and
 * NOCHANGE presumably protects a node from rewriting; confirm in the
 * implementation.
 */
259 #define QTN_NEEDFREE	0x01
260 #define QTN_NOCHANGE	0x02
261 #define QTN_WORDFREE	0x04
262 
/* TSQuerySign is a 64-bit unsigned value; makeTSQuerySign() returns one. */
263 typedef uint64 TSQuerySign;
264 
/* Number of bits in a TSQuerySign (byte size times BITS_PER_BYTE). */
265 #define TSQS_SIGLEN  (sizeof(TSQuerySign)*BITS_PER_BYTE)
266 
/*
 * Datum conversion helpers.  The value is carried through the int64 Datum
 * macros, with casts between the unsigned TSQuerySign and int64.  Note that
 * PG_RETURN_TSQUERYSIGN expands to a "return" statement.
 */
267 #define TSQuerySignGetDatum(X)		Int64GetDatum((int64) (X))
268 #define DatumGetTSQuerySign(X)		((TSQuerySign) DatumGetInt64(X))
269 #define PG_RETURN_TSQUERYSIGN(X)	return TSQuerySignGetDatum(X)
270 #define PG_GETARG_TSQUERYSIGN(n)	DatumGetTSQuerySign(PG_GETARG_DATUM(n))
271 
272 
/*
 * Operations on QTNode trees.  Only the signatures are visible here; the
 * descriptions below are inferred from names and types and should be
 * confirmed against the implementation.
 *
 * QT2QTN / QTN2QT:       convert between QueryItem form (plus an operand
 *                        string) and QTNode form / back to a TSQuery.
 * QTNFree:               presumably releases a tree.
 * QTNSort:               presumably puts children into a canonical order.
 * QTNTernary / QTNBinary: presumably convert between n-ary and binary
 *                        operator node shapes.
 * QTNodeCompare:         returns int; sign convention not visible here.
 * QTNCopy:               returns a QTNode pointer, presumably a copy.
 * QTNClearFlags:         takes a uint32 mask, presumably of QTN_* bits.
 * QTNEq:                 returns bool equality of two trees.
 * makeTSQuerySign:       computes a TSQuerySign from a TSQuery.
 * findsubquery:          takes root, "ex" and "subs" trees and a bool
 *                        pointer "isfind"; presumably replaces occurrences
 *                        of ex with subs and reports via *isfind whether
 *                        any were found.
 */
273 extern QTNode *QT2QTN(QueryItem *in, char *operand);
274 extern TSQuery QTN2QT(QTNode *in);
275 extern void QTNFree(QTNode *in);
276 extern void QTNSort(QTNode *in);
277 extern void QTNTernary(QTNode *in);
278 extern void QTNBinary(QTNode *in);
279 extern int	QTNodeCompare(QTNode *an, QTNode *bn);
280 extern QTNode *QTNCopy(QTNode *in);
281 extern void QTNClearFlags(QTNode *in, uint32 flags);
282 extern bool QTNEq(QTNode *a, QTNode *b);
283 extern TSQuerySign makeTSQuerySign(TSQuery a);
284 extern QTNode *findsubquery(QTNode *root, QTNode *ex, QTNode *subs,
285 			 bool *isfind);
286 
287 /*
288  * TSQuery GiST support
289  */
/*
 * gtsquery_* entry points, all declared with PG_FUNCTION_ARGS and returning
 * Datum.  The suffixes mirror those of the gtsvector_* set declared in this
 * header.
 * NOTE(review): gtsquery_consistent_oldsig is presumably a variant kept for
 * an older SQL-level signature -- confirm.
 */
290 extern Datum gtsquery_compress(PG_FUNCTION_ARGS);
291 extern Datum gtsquery_decompress(PG_FUNCTION_ARGS);
292 extern Datum gtsquery_consistent(PG_FUNCTION_ARGS);
293 extern Datum gtsquery_union(PG_FUNCTION_ARGS);
294 extern Datum gtsquery_same(PG_FUNCTION_ARGS);
295 extern Datum gtsquery_penalty(PG_FUNCTION_ARGS);
296 extern Datum gtsquery_picksplit(PG_FUNCTION_ARGS);
297 extern Datum gtsquery_consistent_oldsig(PG_FUNCTION_ARGS);
298 
299 /*
300  * Parser interface to SQL
301  */
/*
 * Each operation comes in two variants, "_byid" and "_byname"; presumably
 * they identify the parser by OID and by name respectively -- confirm.
 * All use the PG_FUNCTION_ARGS signature and return Datum.
 */
302 extern Datum ts_token_type_byid(PG_FUNCTION_ARGS);
303 extern Datum ts_token_type_byname(PG_FUNCTION_ARGS);
304 extern Datum ts_parse_byid(PG_FUNCTION_ARGS);
305 extern Datum ts_parse_byname(PG_FUNCTION_ARGS);
306 
307 /*
308  * Default word parser
309  */
310 
/*
 * Entry points of the default parser ("prsd_" prefix): start, nexttoken,
 * end, headline and lextype.  All use the PG_FUNCTION_ARGS signature and
 * return Datum; their argument lists are not visible from this header.
 */
311 extern Datum prsd_start(PG_FUNCTION_ARGS);
312 extern Datum prsd_nexttoken(PG_FUNCTION_ARGS);
313 extern Datum prsd_end(PG_FUNCTION_ARGS);
314 extern Datum prsd_headline(PG_FUNCTION_ARGS);
315 extern Datum prsd_lextype(PG_FUNCTION_ARGS);
316 
317 /*
318  * Dictionary interface to SQL
319  */
/* PG_FUNCTION_ARGS entry point; arguments and result type are not visible here. */
320 extern Datum ts_lexize(PG_FUNCTION_ARGS);
321 
322 /*
323  * Simple built-in dictionary
324  */
/*
 * Each built-in dictionary below (simple, synonym, ispell, thesaurus)
 * exposes the same pair of PG_FUNCTION_ARGS entry points: <name>_init and
 * <name>_lexize.
 */
325 extern Datum dsimple_init(PG_FUNCTION_ARGS);
326 extern Datum dsimple_lexize(PG_FUNCTION_ARGS);
327 
328 /*
329  * Synonym built-in dictionary
330  */
331 extern Datum dsynonym_init(PG_FUNCTION_ARGS);
332 extern Datum dsynonym_lexize(PG_FUNCTION_ARGS);
333 
334 /*
335  * ISpell dictionary
336  */
337 extern Datum dispell_init(PG_FUNCTION_ARGS);
338 extern Datum dispell_lexize(PG_FUNCTION_ARGS);
339 
340 /*
341  * Thesaurus
342  */
343 extern Datum thesaurus_init(PG_FUNCTION_ARGS);
344 extern Datum thesaurus_lexize(PG_FUNCTION_ARGS);
345 
346 /*
347  * headline
348  */
/*
 * Four PG_FUNCTION_ARGS entry points for headline generation.
 * NOTE(review): by naming, "_byid" variants presumably take an explicit
 * configuration OID and "_opt" variants an additional options argument --
 * confirm against the definitions.
 */
349 extern Datum ts_headline_byid_opt(PG_FUNCTION_ARGS);
350 extern Datum ts_headline_byid(PG_FUNCTION_ARGS);
351 extern Datum ts_headline(PG_FUNCTION_ARGS);
352 extern Datum ts_headline_opt(PG_FUNCTION_ARGS);
353 
354 /*
355  * current cfg
356  */
/* PG_FUNCTION_ARGS entry point; presumably returns the current text search configuration -- confirm. */
357 extern Datum get_current_ts_config(PG_FUNCTION_ARGS);
358 
359 #endif   /* _PG_TS_UTILS_H_ */
360