1 /*-------------------------------------------------------------------------
2  *
3  * dict_xsyn.c
4  *	  Extended synonym dictionary
5  *
6  * Copyright (c) 2007-2018, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *	  contrib/dict_xsyn/dict_xsyn.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include <ctype.h>
16 
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 
21 PG_MODULE_MAGIC;
22 
/*
 * One dictionary entry.
 *
 * Every stored word of a rule line gets its own entry; all entries built
 * from the same line carry their own copy of that whole line in "value".
 */
typedef struct
{
	char	   *key;			/* Word that lookups are matched against */
	char	   *value;			/* Unparsed list of synonyms, including the
								 * word itself */
} Syn;
29 
30 typedef struct
31 {
32 	int			len;
33 	Syn		   *syn;
34 
35 	bool		matchorig;
36 	bool		keeporig;
37 	bool		matchsynonyms;
38 	bool		keepsynonyms;
39 } DictSyn;
40 
41 
42 PG_FUNCTION_INFO_V1(dxsyn_init);
43 PG_FUNCTION_INFO_V1(dxsyn_lexize);
44 
45 static char *
46 find_word(char *in, char **end)
47 {
48 	char	   *start;
49 
50 	*end = NULL;
51 	while (*in && t_isspace(in))
52 		in += pg_mblen(in);
53 
54 	if (!*in || *in == '#')
start()55 		return NULL;
size()56 	start = in;
empty()57 
58 	while (*in && !t_isspace(in))
59 		in += pg_mblen(in);
60 
61 	*end = in;
Contains(const Interval & that)62 
63 	return start;
64 }
IsDisjointWith(const Interval & that)65 
66 static int
67 compare_syn(const void *a, const void *b)
68 {
69 	return strcmp(((const Syn *) a)->key, ((const Syn *) b)->key);
70 }
Annex(const Interval & that)71 
72 static void
73 read_dictionary(DictSyn *d, const char *filename)
74 {
75 	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
76 	tsearch_readline_state trst;
77 	char	   *line;
78 	int			cur = 0;
79 
80 	if (!tsearch_readline_begin(&trst, real_filename))
81 		ereport(ERROR,
82 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
83 				 errmsg("could not open synonym file \"%s\": %m",
84 						real_filename)));
85 
86 	while ((line = tsearch_readline(&trst)) != NULL)
87 	{
88 		char	   *value;
89 		char	   *key;
90 		char	   *pos;
91 		char	   *end;
92 
93 		if (*line == '\0')
94 			continue;
95 
96 		value = lowerstr(line);
97 		pfree(line);
98 
99 		pos = value;
100 		while ((key = find_word(pos, &end)) != NULL)
101 		{
102 			/* Enlarge syn structure if full */
103 			if (cur == d->len)
104 			{
105 				d->len = (d->len > 0) ? 2 * d->len : 16;
106 				if (d->syn)
107 					d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
108 				else
109 					d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
110 			}
111 
112 			/* Save first word only if we will match it */
113 			if (pos != value || d->matchorig)
114 			{
115 				d->syn[cur].key = pnstrdup(key, end - key);
116 				d->syn[cur].value = pstrdup(value);
117 
118 				cur++;
119 			}
120 
121 			pos = end;
122 
123 			/* Don't bother scanning synonyms if we will not match them */
124 			if (!d->matchsynonyms)
125 				break;
126 		}
127 
128 		pfree(value);
129 	}
130 
131 	tsearch_readline_end(&trst);
132 
133 	d->len = cur;
134 	if (cur > 1)
135 		qsort(d->syn, d->len, sizeof(Syn), compare_syn);
136 
137 	pfree(real_filename);
138 }
139 
140 Datum
141 dxsyn_init(PG_FUNCTION_ARGS)
142 {
143 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
144 	DictSyn    *d;
145 	ListCell   *l;
146 	char	   *filename = NULL;
147 
148 	d = (DictSyn *) palloc0(sizeof(DictSyn));
149 	d->len = 0;
150 	d->syn = NULL;
151 	d->matchorig = true;
152 	d->keeporig = true;
153 	d->matchsynonyms = false;
154 	d->keepsynonyms = true;
155 
156 	foreach(l, dictoptions)
157 	{
158 		DefElem    *defel = (DefElem *) lfirst(l);
159 
160 		if (strcmp(defel->defname, "matchorig") == 0)
161 		{
162 			d->matchorig = defGetBoolean(defel);
163 		}
164 		else if (strcmp(defel->defname, "keeporig") == 0)
165 		{
166 			d->keeporig = defGetBoolean(defel);
167 		}
168 		else if (strcmp(defel->defname, "matchsynonyms") == 0)
169 		{
170 			d->matchsynonyms = defGetBoolean(defel);
171 		}
172 		else if (strcmp(defel->defname, "keepsynonyms") == 0)
173 		{
174 			d->keepsynonyms = defGetBoolean(defel);
175 		}
176 		else if (strcmp(defel->defname, "rules") == 0)
177 		{
178 			/* we can't read the rules before parsing all options! */
179 			filename = defGetString(defel);
180 		}
181 		else
182 		{
183 			ereport(ERROR,
184 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
185 					 errmsg("unrecognized xsyn parameter: \"%s\"",
186 							defel->defname)));
187 		}
188 	}
189 
190 	if (filename)
191 		read_dictionary(d, filename);
192 
193 	PG_RETURN_POINTER(d);
194 }
195 
196 Datum
197 dxsyn_lexize(PG_FUNCTION_ARGS)
198 {
199 	DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
200 	char	   *in = (char *) PG_GETARG_POINTER(1);
201 	int			length = PG_GETARG_INT32(2);
202 	Syn			word;
203 	Syn		   *found;
204 	TSLexeme   *res = NULL;
205 
206 	if (!length || d->len == 0)
207 		PG_RETURN_POINTER(NULL);
208 
209 	/* Create search pattern */
210 	{
211 		char	   *temp = pnstrdup(in, length);
212 
213 		word.key = lowerstr(temp);
214 		pfree(temp);
215 		word.value = NULL;
216 	}
217 
218 	/* Look for matching syn */
219 	found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
220 	pfree(word.key);
221 
222 	if (!found)
223 		PG_RETURN_POINTER(NULL);
224 
225 	/* Parse string of synonyms and return array of words */
226 	{
227 		char	   *value = found->value;
228 		char	   *syn;
229 		char	   *pos;
230 		char	   *end;
231 		int			nsyns = 0;
232 
233 		res = palloc(sizeof(TSLexeme));
234 
235 		pos = value;
236 		while ((syn = find_word(pos, &end)) != NULL)
237 		{
238 			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
239 
240 			/* The first word is output only if keeporig=true */
241 			if (pos != value || d->keeporig)
242 			{
243 				res[nsyns].lexeme = pnstrdup(syn, end - syn);
244 				res[nsyns].nvariant = 0;
245 				res[nsyns].flags = 0;
246 				nsyns++;
247 			}
248 
249 			pos = end;
250 
251 			/* Stop if we are not to output the synonyms */
252 			if (!d->keepsynonyms)
253 				break;
254 		}
255 		res[nsyns].lexeme = NULL;
256 	}
257 
258 	PG_RETURN_POINTER(res);
259 }
260