1 /*-------------------------------------------------------------------------
2  *
3  * dict_snowball.c
4  *		Snowball dictionary
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *	  src/backend/snowball/dict_snowball.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_utils.h"
18 
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26 
27 /* Now we can include the original Snowball header.h */
28 #include "snowball/libstemmer/header.h"
29 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
30 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
31 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
32 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
33 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
34 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
35 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
36 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
37 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
38 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
39 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
40 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
41 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
42 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
43 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
44 #include "snowball/libstemmer/stem_UTF_8_danish.h"
45 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
46 #include "snowball/libstemmer/stem_UTF_8_english.h"
47 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
48 #include "snowball/libstemmer/stem_UTF_8_french.h"
49 #include "snowball/libstemmer/stem_UTF_8_german.h"
50 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
51 #include "snowball/libstemmer/stem_UTF_8_italian.h"
52 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
53 #include "snowball/libstemmer/stem_UTF_8_porter.h"
54 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
55 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
56 #include "snowball/libstemmer/stem_UTF_8_russian.h"
57 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
58 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
59 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
60 
/*
 * Loadable-module boilerplate: the module magic block, followed by the
 * version-1 calling-convention declarations for the two entry points that
 * are defined later in this file (dsnowball_init and dsnowball_lexize).
 */
PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(dsnowball_init);

PG_FUNCTION_INFO_V1(dsnowball_lexize);
66 
/*
 * One entry in the list of supported stemmer modules: a language name, the
 * encoding that this variant of the stemmer works in, and the Snowball
 * entry points for it.
 *
 * The table below initializes these fields positionally, so their order
 * must not be changed without updating every table row.
 */
typedef struct stemmer_module
{
	/* language name; compared case-insensitively with the Language option */
	const char *name;
	/* encoding of this stemmer; PG_SQL_ASCII is accepted for any encoding */
	pg_enc		enc;
	/* returns a new stemmer environment */
	struct SN_env *(*create) (void);
	/* presumably releases an environment; not called in the code shown here */
	void		(*close) (struct SN_env *);
	/* runs the stemmer on the word currently loaded into the environment */
	int			(*stem) (struct SN_env *);
} stemmer_module;
76 
/*
 * Table of all stemmers compiled into this module.
 *
 * locate_stem_module() scans the table front to back and takes the first
 * row that fits, so row order is significant: a row whose encoding equals
 * the database encoding (or is PG_SQL_ASCII) is preferred, and only if no
 * such row exists is the first PG_UTF8 row for the language used, with
 * recoding around each stemmer call.
 */
static const stemmer_module stemmer_modules[] =
{
	/*
	 * Stemmers list from Snowball distribution
	 */
	{"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
	{"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
	{"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
	{"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
	{"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
	{"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
	{"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
	{"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
	{"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
	{"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
	{"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
	{"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
	{"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
	{"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
	{"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
	{"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
	{"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
	{"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
	{"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
	{"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
	{"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
	{"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
	{"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
	{"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
	{"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
	{"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
	{"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
	{"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
	{"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
	{"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
	{"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},

	/*
	 * Stemmer with PG_SQL_ASCII encoding should be valid for any server
	 * encoding.  Because the table is scanned in order, this row is reached
	 * only when no earlier "english" row matches the database encoding.
	 */
	{"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},

	/* A NULL name terminates the scans over this table. */
	{NULL, 0, NULL, NULL, NULL} /* list end marker */
};
122 
123 
/*
 * Per-dictionary state: built by dsnowball_init() and passed back as the
 * first argument of every dsnowball_lexize() call.
 */
typedef struct DictSnowball
{
	/* stemmer environment obtained from the selected module's create() */
	struct SN_env *z;
	/* stop words read for the StopWords option; left zeroed if none given */
	StopList	stoplist;
	bool		needrecode;		/* needs recoding before/after call stem */
	/* stem function of the selected module; NULL until Language is seen */
	int			(*stem) (struct SN_env * z);

	/*
	 * Snowball keeps the memory it allocates from one call to the next, so
	 * the stemmer has to run in a memory context of ours that lives long
	 * enough.  The init function is executed in a long-lived context, so we
	 * just remember the CurrentMemoryContext seen there and switch back to
	 * it around each stemmer call.
	 */
	MemoryContext dictCtx;
} DictSnowball;
138 
139 
140 static void
locate_stem_module(DictSnowball * d,char * lang)141 locate_stem_module(DictSnowball *d, char *lang)
142 {
143 	const stemmer_module *m;
144 
145 	/*
146 	 * First, try to find exact match of stemmer module. Stemmer with
147 	 * PG_SQL_ASCII encoding is treated as working with any server encoding
148 	 */
149 	for (m = stemmer_modules; m->name; m++)
150 	{
151 		if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
152 			pg_strcasecmp(m->name, lang) == 0)
153 		{
154 			d->stem = m->stem;
155 			d->z = m->create();
156 			d->needrecode = false;
157 			return;
158 		}
159 	}
160 
161 	/*
162 	 * Second, try to find stemmer for needed language for UTF8 encoding.
163 	 */
164 	for (m = stemmer_modules; m->name; m++)
165 	{
166 		if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
167 		{
168 			d->stem = m->stem;
169 			d->z = m->create();
170 			d->needrecode = true;
171 			return;
172 		}
173 	}
174 
175 	ereport(ERROR,
176 			(errcode(ERRCODE_UNDEFINED_OBJECT),
177 			 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
178 					lang, GetDatabaseEncodingName())));
179 }
180 
181 Datum
dsnowball_init(PG_FUNCTION_ARGS)182 dsnowball_init(PG_FUNCTION_ARGS)
183 {
184 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
185 	DictSnowball *d;
186 	bool		stoploaded = false;
187 	ListCell   *l;
188 
189 	d = (DictSnowball *) palloc0(sizeof(DictSnowball));
190 
191 	foreach(l, dictoptions)
192 	{
193 		DefElem    *defel = (DefElem *) lfirst(l);
194 
195 		if (pg_strcasecmp("StopWords", defel->defname) == 0)
196 		{
197 			if (stoploaded)
198 				ereport(ERROR,
199 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
200 						 errmsg("multiple StopWords parameters")));
201 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
202 			stoploaded = true;
203 		}
204 		else if (pg_strcasecmp("Language", defel->defname) == 0)
205 		{
206 			if (d->stem)
207 				ereport(ERROR,
208 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
209 						 errmsg("multiple Language parameters")));
210 			locate_stem_module(d, defGetString(defel));
211 		}
212 		else
213 		{
214 			ereport(ERROR,
215 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
216 					 errmsg("unrecognized Snowball parameter: \"%s\"",
217 							defel->defname)));
218 		}
219 	}
220 
221 	if (!d->stem)
222 		ereport(ERROR,
223 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
224 				 errmsg("missing Language parameter")));
225 
226 	d->dictCtx = CurrentMemoryContext;
227 
228 	PG_RETURN_POINTER(d);
229 }
230 
231 Datum
dsnowball_lexize(PG_FUNCTION_ARGS)232 dsnowball_lexize(PG_FUNCTION_ARGS)
233 {
234 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
235 	char	   *in = (char *) PG_GETARG_POINTER(1);
236 	int32		len = PG_GETARG_INT32(2);
237 	char	   *txt = lowerstr_with_len(in, len);
238 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
239 
240 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
241 	{
242 		pfree(txt);
243 	}
244 	else
245 	{
246 		MemoryContext saveCtx;
247 
248 		/*
249 		 * recode to utf8 if stemmer is utf8 and doesn't match server encoding
250 		 */
251 		if (d->needrecode)
252 		{
253 			char	   *recoded;
254 
255 			recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
256 			if (recoded != txt)
257 			{
258 				pfree(txt);
259 				txt = recoded;
260 			}
261 		}
262 
263 		/* see comment about d->dictCtx */
264 		saveCtx = MemoryContextSwitchTo(d->dictCtx);
265 		SN_set_current(d->z, strlen(txt), (symbol *) txt);
266 		d->stem(d->z);
267 		MemoryContextSwitchTo(saveCtx);
268 
269 		if (d->z->p && d->z->l)
270 		{
271 			txt = repalloc(txt, d->z->l + 1);
272 			memcpy(txt, d->z->p, d->z->l);
273 			txt[d->z->l] = '\0';
274 		}
275 
276 		/* back recode if needed */
277 		if (d->needrecode)
278 		{
279 			char	   *recoded;
280 
281 			recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
282 			if (recoded != txt)
283 			{
284 				pfree(txt);
285 				txt = recoded;
286 			}
287 		}
288 
289 		res->lexeme = txt;
290 	}
291 
292 	PG_RETURN_POINTER(res);
293 }
294