1 /*-------------------------------------------------------------------------
2  *
3  * dict_synonym.c
4  *		Synonym dictionary: replace word by its synonym
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/dict_synonym.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "commands/defrem.h"
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
19 #include "utils/builtins.h"
20 
21 typedef struct
22 {
23 	char	   *in;
24 	char	   *out;
25 	int			outlen;
26 	uint16		flags;
27 } Syn;
28 
29 typedef struct
30 {
31 	int			len;			/* length of syn array */
32 	Syn		   *syn;
33 	bool		case_sensitive;
34 } DictSyn;
35 
36 /*
37  * Finds the next whitespace-delimited word within the 'in' string.
38  * Returns a pointer to the first character of the word, and a pointer
39  * to the next byte after the last character in the word (in *end).
40  * Character '*' at the end of word will not be treated as word
41  * character if flags is not null.
42  */
43 static char *
findwrd(char * in,char ** end,uint16 * flags)44 findwrd(char *in, char **end, uint16 *flags)
45 {
46 	char	   *start;
47 	char	   *lastchar;
48 
49 	/* Skip leading spaces */
50 	while (*in && t_isspace(in))
51 		in += pg_mblen(in);
52 
53 	/* Return NULL on empty lines */
54 	if (*in == '\0')
55 	{
56 		*end = NULL;
57 		return NULL;
58 	}
59 
60 	lastchar = start = in;
61 
62 	/* Find end of word */
63 	while (*in && !t_isspace(in))
64 	{
65 		lastchar = in;
66 		in += pg_mblen(in);
67 	}
68 
69 	if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
70 	{
71 		*flags = TSL_PREFIX;
72 		*end = lastchar;
73 	}
74 	else
75 	{
76 		if (flags)
77 			*flags = 0;
78 		*end = in;
79 	}
80 
81 	return start;
82 }
83 
84 static int
compareSyn(const void * a,const void * b)85 compareSyn(const void *a, const void *b)
86 {
87 	return strcmp(((const Syn *) a)->in, ((const Syn *) b)->in);
88 }
89 
90 
91 Datum
dsynonym_init(PG_FUNCTION_ARGS)92 dsynonym_init(PG_FUNCTION_ARGS)
93 {
94 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
95 	DictSyn    *d;
96 	ListCell   *l;
97 	char	   *filename = NULL;
98 	bool		case_sensitive = false;
99 	tsearch_readline_state trst;
100 	char	   *starti,
101 			   *starto,
102 			   *end = NULL;
103 	int			cur = 0;
104 	char	   *line = NULL;
105 	uint16		flags = 0;
106 
107 	foreach(l, dictoptions)
108 	{
109 		DefElem    *defel = (DefElem *) lfirst(l);
110 
111 		if (strcmp(defel->defname, "synonyms") == 0)
112 			filename = defGetString(defel);
113 		else if (strcmp(defel->defname, "casesensitive") == 0)
114 			case_sensitive = defGetBoolean(defel);
115 		else
116 			ereport(ERROR,
117 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
118 					 errmsg("unrecognized synonym parameter: \"%s\"",
119 							defel->defname)));
120 	}
121 
122 	if (!filename)
123 		ereport(ERROR,
124 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
125 				 errmsg("missing Synonyms parameter")));
126 
127 	filename = get_tsearch_config_filename(filename, "syn");
128 
129 	if (!tsearch_readline_begin(&trst, filename))
130 		ereport(ERROR,
131 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
132 				 errmsg("could not open synonym file \"%s\": %m",
133 						filename)));
134 
135 	d = (DictSyn *) palloc0(sizeof(DictSyn));
136 
137 	while ((line = tsearch_readline(&trst)) != NULL)
138 	{
139 		starti = findwrd(line, &end, NULL);
140 		if (!starti)
141 		{
142 			/* Empty line */
143 			goto skipline;
144 		}
145 		if (*end == '\0')
146 		{
147 			/* A line with only one word. Ignore silently. */
148 			goto skipline;
149 		}
150 		*end = '\0';
151 
152 		starto = findwrd(end + 1, &end, &flags);
153 		if (!starto)
154 		{
155 			/* A line with only one word (+whitespace). Ignore silently. */
156 			goto skipline;
157 		}
158 		*end = '\0';
159 
160 		/*
161 		 * starti now points to the first word, and starto to the second word
162 		 * on the line, with a \0 terminator at the end of both words.
163 		 */
164 
165 		if (cur >= d->len)
166 		{
167 			if (d->len == 0)
168 			{
169 				d->len = 64;
170 				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
171 			}
172 			else
173 			{
174 				d->len *= 2;
175 				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
176 			}
177 		}
178 
179 		if (case_sensitive)
180 		{
181 			d->syn[cur].in = pstrdup(starti);
182 			d->syn[cur].out = pstrdup(starto);
183 		}
184 		else
185 		{
186 			d->syn[cur].in = lowerstr(starti);
187 			d->syn[cur].out = lowerstr(starto);
188 		}
189 
190 		d->syn[cur].outlen = strlen(starto);
191 		d->syn[cur].flags = flags;
192 
193 		cur++;
194 
195 skipline:
196 		pfree(line);
197 	}
198 
199 	tsearch_readline_end(&trst);
200 
201 	d->len = cur;
202 	qsort(d->syn, d->len, sizeof(Syn), compareSyn);
203 
204 	d->case_sensitive = case_sensitive;
205 
206 	PG_RETURN_POINTER(d);
207 }
208 
209 Datum
dsynonym_lexize(PG_FUNCTION_ARGS)210 dsynonym_lexize(PG_FUNCTION_ARGS)
211 {
212 	DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
213 	char	   *in = (char *) PG_GETARG_POINTER(1);
214 	int32		len = PG_GETARG_INT32(2);
215 	Syn			key,
216 			   *found;
217 	TSLexeme   *res;
218 
219 	/* note: d->len test protects against Solaris bsearch-of-no-items bug */
220 	if (len <= 0 || d->len <= 0)
221 		PG_RETURN_POINTER(NULL);
222 
223 	if (d->case_sensitive)
224 		key.in = pnstrdup(in, len);
225 	else
226 		key.in = lowerstr_with_len(in, len);
227 
228 	key.out = NULL;
229 
230 	found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
231 	pfree(key.in);
232 
233 	if (!found)
234 		PG_RETURN_POINTER(NULL);
235 
236 	res = palloc0(sizeof(TSLexeme) * 2);
237 	res[0].lexeme = pnstrdup(found->out, found->outlen);
238 	res[0].flags = found->flags;
239 
240 	PG_RETURN_POINTER(res);
241 }
242