1 /*-------------------------------------------------------------------------
2  *
3  * dict_snowball.c
4  *		Snowball dictionary
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *	  src/backend/snowball/dict_snowball.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_utils.h"
18 
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26 
27 /* Now we can include the original Snowball header.h */
28 #include "snowball/libstemmer/header.h"
29 #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
30 #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
31 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
32 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
33 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
34 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
35 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
36 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
37 #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
38 #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
39 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
40 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
41 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
42 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
43 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
44 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
45 #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
46 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
47 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
48 #include "snowball/libstemmer/stem_UTF_8_arabic.h"
49 #include "snowball/libstemmer/stem_UTF_8_armenian.h"
50 #include "snowball/libstemmer/stem_UTF_8_basque.h"
51 #include "snowball/libstemmer/stem_UTF_8_catalan.h"
52 #include "snowball/libstemmer/stem_UTF_8_danish.h"
53 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
54 #include "snowball/libstemmer/stem_UTF_8_english.h"
55 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
56 #include "snowball/libstemmer/stem_UTF_8_french.h"
57 #include "snowball/libstemmer/stem_UTF_8_german.h"
58 #include "snowball/libstemmer/stem_UTF_8_greek.h"
59 #include "snowball/libstemmer/stem_UTF_8_hindi.h"
60 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
61 #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
62 #include "snowball/libstemmer/stem_UTF_8_irish.h"
63 #include "snowball/libstemmer/stem_UTF_8_italian.h"
64 #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
65 #include "snowball/libstemmer/stem_UTF_8_nepali.h"
66 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
67 #include "snowball/libstemmer/stem_UTF_8_porter.h"
68 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
69 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
70 #include "snowball/libstemmer/stem_UTF_8_russian.h"
71 #include "snowball/libstemmer/stem_UTF_8_serbian.h"
72 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
73 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
74 #include "snowball/libstemmer/stem_UTF_8_tamil.h"
75 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
76 #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
77 
78 PG_MODULE_MAGIC;
79 
80 PG_FUNCTION_INFO_V1(dsnowball_init);
81 
82 PG_FUNCTION_INFO_V1(dsnowball_lexize);
83 
84 /* List of supported modules */
85 typedef struct stemmer_module
86 {
87 	const char *name;
88 	pg_enc		enc;
89 	struct SN_env *(*create) (void);
90 	void		(*close) (struct SN_env *);
91 	int			(*stem) (struct SN_env *);
92 } stemmer_module;
93 
/*
 * Build one stemmer_module initializer.
 *
 * lang   - stemmer (language) name; also stringified into the "name" field
 * pgenc  - PostgreSQL encoding code stored in the "enc" field
 * senc   - Snowball's spelling of the encoding, pasted into the symbol names
 *
 * The three callbacks are the generated symbols <lang>_<senc>_create_env,
 * <lang>_<senc>_close_env and <lang>_<senc>_stem.
 */
#define STEMMER_MODULE(lang, pgenc, senc) \
	{ \
		.name = #lang, \
		.enc = pgenc, \
		.create = lang##_##senc##_create_env, \
		.close = lang##_##senc##_close_env, \
		.stem = lang##_##senc##_stem \
	}
97 
98 static const stemmer_module stemmer_modules[] =
99 {
100 	/*
101 	 * Stemmers list from Snowball distribution
102 	 */
103 	STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
104 	STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
105 	STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
106 	STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
107 	STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
108 	STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
109 	STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
110 	STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
111 	STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
112 	STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
113 	STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
114 	STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
115 	STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
116 	STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
117 	STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
118 	STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
119 	STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
120 	STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
121 	STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
122 	STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
123 	STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
124 	STEMMER_MODULE(basque, PG_UTF8, UTF_8),
125 	STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
126 	STEMMER_MODULE(danish, PG_UTF8, UTF_8),
127 	STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
128 	STEMMER_MODULE(english, PG_UTF8, UTF_8),
129 	STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
130 	STEMMER_MODULE(french, PG_UTF8, UTF_8),
131 	STEMMER_MODULE(german, PG_UTF8, UTF_8),
132 	STEMMER_MODULE(greek, PG_UTF8, UTF_8),
133 	STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
134 	STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
135 	STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
136 	STEMMER_MODULE(irish, PG_UTF8, UTF_8),
137 	STEMMER_MODULE(italian, PG_UTF8, UTF_8),
138 	STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
139 	STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
140 	STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
141 	STEMMER_MODULE(porter, PG_UTF8, UTF_8),
142 	STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
143 	STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
144 	STEMMER_MODULE(russian, PG_UTF8, UTF_8),
145 	STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
146 	STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
147 	STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
148 	STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
149 	STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
150 	STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
151 
152 	/*
153 	 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
154 	 * encoding
155 	 */
156 	STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
157 
158 	{NULL, 0, NULL, NULL, NULL} /* list end marker */
159 };
160 
161 
162 typedef struct DictSnowball
163 {
164 	struct SN_env *z;
165 	StopList	stoplist;
166 	bool		needrecode;		/* needs recoding before/after call stem */
167 	int			(*stem) (struct SN_env *z);
168 
169 	/*
170 	 * snowball saves alloced memory between calls, so we should run it in our
171 	 * private memory context. Note, init function is executed in long lived
172 	 * context, so we just remember CurrentMemoryContext
173 	 */
174 	MemoryContext dictCtx;
175 } DictSnowball;
176 
177 
178 static void
locate_stem_module(DictSnowball * d,const char * lang)179 locate_stem_module(DictSnowball *d, const char *lang)
180 {
181 	const stemmer_module *m;
182 
183 	/*
184 	 * First, try to find exact match of stemmer module. Stemmer with
185 	 * PG_SQL_ASCII encoding is treated as working with any server encoding
186 	 */
187 	for (m = stemmer_modules; m->name; m++)
188 	{
189 		if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
190 			pg_strcasecmp(m->name, lang) == 0)
191 		{
192 			d->stem = m->stem;
193 			d->z = m->create();
194 			d->needrecode = false;
195 			return;
196 		}
197 	}
198 
199 	/*
200 	 * Second, try to find stemmer for needed language for UTF8 encoding.
201 	 */
202 	for (m = stemmer_modules; m->name; m++)
203 	{
204 		if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
205 		{
206 			d->stem = m->stem;
207 			d->z = m->create();
208 			d->needrecode = true;
209 			return;
210 		}
211 	}
212 
213 	ereport(ERROR,
214 			(errcode(ERRCODE_UNDEFINED_OBJECT),
215 			 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
216 					lang, GetDatabaseEncodingName())));
217 }
218 
219 Datum
dsnowball_init(PG_FUNCTION_ARGS)220 dsnowball_init(PG_FUNCTION_ARGS)
221 {
222 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
223 	DictSnowball *d;
224 	bool		stoploaded = false;
225 	ListCell   *l;
226 
227 	d = (DictSnowball *) palloc0(sizeof(DictSnowball));
228 
229 	foreach(l, dictoptions)
230 	{
231 		DefElem    *defel = (DefElem *) lfirst(l);
232 
233 		if (strcmp(defel->defname, "stopwords") == 0)
234 		{
235 			if (stoploaded)
236 				ereport(ERROR,
237 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
238 						 errmsg("multiple StopWords parameters")));
239 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
240 			stoploaded = true;
241 		}
242 		else if (strcmp(defel->defname, "language") == 0)
243 		{
244 			if (d->stem)
245 				ereport(ERROR,
246 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
247 						 errmsg("multiple Language parameters")));
248 			locate_stem_module(d, defGetString(defel));
249 		}
250 		else
251 		{
252 			ereport(ERROR,
253 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
254 					 errmsg("unrecognized Snowball parameter: \"%s\"",
255 							defel->defname)));
256 		}
257 	}
258 
259 	if (!d->stem)
260 		ereport(ERROR,
261 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
262 				 errmsg("missing Language parameter")));
263 
264 	d->dictCtx = CurrentMemoryContext;
265 
266 	PG_RETURN_POINTER(d);
267 }
268 
269 Datum
dsnowball_lexize(PG_FUNCTION_ARGS)270 dsnowball_lexize(PG_FUNCTION_ARGS)
271 {
272 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
273 	char	   *in = (char *) PG_GETARG_POINTER(1);
274 	int32		len = PG_GETARG_INT32(2);
275 	char	   *txt = lowerstr_with_len(in, len);
276 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
277 
278 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
279 	{
280 		pfree(txt);
281 	}
282 	else
283 	{
284 		MemoryContext saveCtx;
285 
286 		/*
287 		 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
288 		 */
289 		if (d->needrecode)
290 		{
291 			char	   *recoded;
292 
293 			recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
294 			if (recoded != txt)
295 			{
296 				pfree(txt);
297 				txt = recoded;
298 			}
299 		}
300 
301 		/* see comment about d->dictCtx */
302 		saveCtx = MemoryContextSwitchTo(d->dictCtx);
303 		SN_set_current(d->z, strlen(txt), (symbol *) txt);
304 		d->stem(d->z);
305 		MemoryContextSwitchTo(saveCtx);
306 
307 		if (d->z->p && d->z->l)
308 		{
309 			txt = repalloc(txt, d->z->l + 1);
310 			memcpy(txt, d->z->p, d->z->l);
311 			txt[d->z->l] = '\0';
312 		}
313 
314 		/* back recode if needed */
315 		if (d->needrecode)
316 		{
317 			char	   *recoded;
318 
319 			recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
320 			if (recoded != txt)
321 			{
322 				pfree(txt);
323 				txt = recoded;
324 			}
325 		}
326 
327 		res->lexeme = txt;
328 	}
329 
330 	PG_RETURN_POINTER(res);
331 }
332