1 /*-------------------------------------------------------------------------
2 *
3 * dict_snowball.c
4 * Snowball dictionary
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/backend/snowball/dict_snowball.c
10 *
11 *-------------------------------------------------------------------------
12 */
13 #include "postgres.h"
14
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_utils.h"
18
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26
27 /* Now we can include the original Snowball header.h */
28 #include "snowball/libstemmer/header.h"
29 #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
30 #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
31 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
32 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
33 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
34 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
35 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
36 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
37 #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
38 #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
39 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
40 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
41 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
42 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
43 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
44 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
45 #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
46 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
47 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
48 #include "snowball/libstemmer/stem_UTF_8_arabic.h"
49 #include "snowball/libstemmer/stem_UTF_8_armenian.h"
50 #include "snowball/libstemmer/stem_UTF_8_basque.h"
51 #include "snowball/libstemmer/stem_UTF_8_catalan.h"
52 #include "snowball/libstemmer/stem_UTF_8_danish.h"
53 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
54 #include "snowball/libstemmer/stem_UTF_8_english.h"
55 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
56 #include "snowball/libstemmer/stem_UTF_8_french.h"
57 #include "snowball/libstemmer/stem_UTF_8_german.h"
58 #include "snowball/libstemmer/stem_UTF_8_greek.h"
59 #include "snowball/libstemmer/stem_UTF_8_hindi.h"
60 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
61 #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
62 #include "snowball/libstemmer/stem_UTF_8_irish.h"
63 #include "snowball/libstemmer/stem_UTF_8_italian.h"
64 #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
65 #include "snowball/libstemmer/stem_UTF_8_nepali.h"
66 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
67 #include "snowball/libstemmer/stem_UTF_8_porter.h"
68 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
69 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
70 #include "snowball/libstemmer/stem_UTF_8_russian.h"
71 #include "snowball/libstemmer/stem_UTF_8_serbian.h"
72 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
73 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
74 #include "snowball/libstemmer/stem_UTF_8_tamil.h"
75 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
76 #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
77
/* Magic block identifying this library as a PostgreSQL loadable module */
PG_MODULE_MAGIC;

/* Function-info record for the dictionary's init entry point */
PG_FUNCTION_INFO_V1(dsnowball_init);

/* Function-info record for the dictionary's lexize entry point */
PG_FUNCTION_INFO_V1(dsnowball_lexize);
83
/*
 * One entry in the list of supported stemmer modules.
 *
 * Fields are filled positionally by the STEMMER_MODULE() macro, so their
 * order here must match the order of that macro's initializer.
 */
typedef struct stemmer_module
{
	/* stemmer (language) name; compared with pg_strcasecmp during lookup */
	const char *name;
	/* PostgreSQL encoding this variant is registered for */
	pg_enc		enc;
	/* Snowball's <name>_<encoding>_create_env function */
	struct SN_env *(*create) (void);
	/* Snowball's <name>_<encoding>_close_env function */
	void		(*close) (struct SN_env *);
	/* Snowball's <name>_<encoding>_stem function */
	int			(*stem) (struct SN_env *);
} stemmer_module;
93
/*
 * Expand to a positional initializer for one stemmer_module entry.
 *
 *	lang	stemmer name; stringified for the name field and pasted into
 *			the three Snowball function names
 *	pgenc	PG code for the encoding, stored as-is in the enc field
 *	snenc	Snowball's name for the encoding, pasted into the function names
 */
#define STEMMER_MODULE(lang, pgenc, snenc) \
	{ \
		#lang, \
		pgenc, \
		lang##_##snenc##_create_env, \
		lang##_##snenc##_close_env, \
		lang##_##snenc##_stem \
	}
97
/*
 * Table of all stemmer variants known to this dictionary, ended by an entry
 * whose name is NULL.  locate_stem_module() scans it from the top and takes
 * the first entry that satisfies its criteria.
 */
static const stemmer_module stemmer_modules[] =
{
	/*
	 * Stemmers list from Snowball distribution
	 */
	STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
	STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
	STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
	STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
	STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
	STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
	STEMMER_MODULE(basque, PG_UTF8, UTF_8),
	STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
	STEMMER_MODULE(danish, PG_UTF8, UTF_8),
	STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
	STEMMER_MODULE(english, PG_UTF8, UTF_8),
	STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
	STEMMER_MODULE(french, PG_UTF8, UTF_8),
	STEMMER_MODULE(german, PG_UTF8, UTF_8),
	STEMMER_MODULE(greek, PG_UTF8, UTF_8),
	STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
	STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
	STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
	STEMMER_MODULE(irish, PG_UTF8, UTF_8),
	STEMMER_MODULE(italian, PG_UTF8, UTF_8),
	STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
	STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
	STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
	STEMMER_MODULE(porter, PG_UTF8, UTF_8),
	STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
	STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
	STEMMER_MODULE(russian, PG_UTF8, UTF_8),
	STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
	STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
	STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
	STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
	STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
	STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),

	/*
	 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
	 * encoding: locate_stem_module() accepts a PG_SQL_ASCII entry as an exact
	 * match whatever the database encoding is.
	 */
	STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),

	{NULL, 0, NULL, NULL, NULL} /* list end marker */
};
160
161
/*
 * Per-dictionary state: allocated and filled by dsnowball_init(), then
 * passed back as the first argument of dsnowball_lexize().
 */
typedef struct DictSnowball
{
	/* stemmer environment returned by the chosen module's create() */
	struct SN_env *z;
	/* stop words loaded from the "stopwords" option, if one was given */
	StopList	stoplist;
	bool		needrecode;		/* needs recoding before/after call stem */
	/* stem function of the chosen module */
	int			(*stem) (struct SN_env *z);

	/*
	 * snowball saves alloced memory between calls, so we should run it in our
	 * private memory context. Note, init function is executed in long lived
	 * context, so we just remember CurrentMemoryContext
	 */
	MemoryContext dictCtx;
} DictSnowball;
176
177
178 static void
locate_stem_module(DictSnowball * d,const char * lang)179 locate_stem_module(DictSnowball *d, const char *lang)
180 {
181 const stemmer_module *m;
182
183 /*
184 * First, try to find exact match of stemmer module. Stemmer with
185 * PG_SQL_ASCII encoding is treated as working with any server encoding
186 */
187 for (m = stemmer_modules; m->name; m++)
188 {
189 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
190 pg_strcasecmp(m->name, lang) == 0)
191 {
192 d->stem = m->stem;
193 d->z = m->create();
194 d->needrecode = false;
195 return;
196 }
197 }
198
199 /*
200 * Second, try to find stemmer for needed language for UTF8 encoding.
201 */
202 for (m = stemmer_modules; m->name; m++)
203 {
204 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
205 {
206 d->stem = m->stem;
207 d->z = m->create();
208 d->needrecode = true;
209 return;
210 }
211 }
212
213 ereport(ERROR,
214 (errcode(ERRCODE_UNDEFINED_OBJECT),
215 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
216 lang, GetDatabaseEncodingName())));
217 }
218
219 Datum
dsnowball_init(PG_FUNCTION_ARGS)220 dsnowball_init(PG_FUNCTION_ARGS)
221 {
222 List *dictoptions = (List *) PG_GETARG_POINTER(0);
223 DictSnowball *d;
224 bool stoploaded = false;
225 ListCell *l;
226
227 d = (DictSnowball *) palloc0(sizeof(DictSnowball));
228
229 foreach(l, dictoptions)
230 {
231 DefElem *defel = (DefElem *) lfirst(l);
232
233 if (strcmp(defel->defname, "stopwords") == 0)
234 {
235 if (stoploaded)
236 ereport(ERROR,
237 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
238 errmsg("multiple StopWords parameters")));
239 readstoplist(defGetString(defel), &d->stoplist, lowerstr);
240 stoploaded = true;
241 }
242 else if (strcmp(defel->defname, "language") == 0)
243 {
244 if (d->stem)
245 ereport(ERROR,
246 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
247 errmsg("multiple Language parameters")));
248 locate_stem_module(d, defGetString(defel));
249 }
250 else
251 {
252 ereport(ERROR,
253 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
254 errmsg("unrecognized Snowball parameter: \"%s\"",
255 defel->defname)));
256 }
257 }
258
259 if (!d->stem)
260 ereport(ERROR,
261 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
262 errmsg("missing Language parameter")));
263
264 d->dictCtx = CurrentMemoryContext;
265
266 PG_RETURN_POINTER(d);
267 }
268
269 Datum
dsnowball_lexize(PG_FUNCTION_ARGS)270 dsnowball_lexize(PG_FUNCTION_ARGS)
271 {
272 DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
273 char *in = (char *) PG_GETARG_POINTER(1);
274 int32 len = PG_GETARG_INT32(2);
275 char *txt = lowerstr_with_len(in, len);
276 TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
277
278 if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
279 {
280 pfree(txt);
281 }
282 else
283 {
284 MemoryContext saveCtx;
285
286 /*
287 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
288 */
289 if (d->needrecode)
290 {
291 char *recoded;
292
293 recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
294 if (recoded != txt)
295 {
296 pfree(txt);
297 txt = recoded;
298 }
299 }
300
301 /* see comment about d->dictCtx */
302 saveCtx = MemoryContextSwitchTo(d->dictCtx);
303 SN_set_current(d->z, strlen(txt), (symbol *) txt);
304 d->stem(d->z);
305 MemoryContextSwitchTo(saveCtx);
306
307 if (d->z->p && d->z->l)
308 {
309 txt = repalloc(txt, d->z->l + 1);
310 memcpy(txt, d->z->p, d->z->l);
311 txt[d->z->l] = '\0';
312 }
313
314 /* back recode if needed */
315 if (d->needrecode)
316 {
317 char *recoded;
318
319 recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
320 if (recoded != txt)
321 {
322 pfree(txt);
323 txt = recoded;
324 }
325 }
326
327 res->lexeme = txt;
328 }
329
330 PG_RETURN_POINTER(res);
331 }
332