1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  *		locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 
21 static void tsearch_readline_callback(void *arg);
22 
23 
24 #ifdef USE_WIDE_UPPER_LOWER
25 
26 /*
27  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
28  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
29  * getting from char2wchar() is UTF16 not UTF32.  A single input character
30  * may therefore produce a surrogate pair rather than just one wchar_t;
31  * we also need room for a trailing null.  When we do get a surrogate pair,
32  * we pass just the first code to iswdigit() etc, so that these functions will
33  * always return false for characters outside the Basic Multilingual Plane.
34  */
35 #define WC_BUF_LEN  3
36 
37 int
t_isdigit(const char * ptr)38 t_isdigit(const char *ptr)
39 {
40 	int			clen = pg_mblen(ptr);
41 	wchar_t		character[WC_BUF_LEN];
42 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
43 	pg_locale_t mylocale = 0;	/* TODO */
44 
45 	if (clen == 1 || lc_ctype_is_c(collation))
46 		return isdigit(TOUCHAR(ptr));
47 
48 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
49 
50 	return iswdigit((wint_t) character[0]);
51 }
52 
53 int
t_isspace(const char * ptr)54 t_isspace(const char *ptr)
55 {
56 	int			clen = pg_mblen(ptr);
57 	wchar_t		character[WC_BUF_LEN];
58 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
59 	pg_locale_t mylocale = 0;	/* TODO */
60 
61 	if (clen == 1 || lc_ctype_is_c(collation))
62 		return isspace(TOUCHAR(ptr));
63 
64 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
65 
66 	return iswspace((wint_t) character[0]);
67 }
68 
69 int
t_isalpha(const char * ptr)70 t_isalpha(const char *ptr)
71 {
72 	int			clen = pg_mblen(ptr);
73 	wchar_t		character[WC_BUF_LEN];
74 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
75 	pg_locale_t mylocale = 0;	/* TODO */
76 
77 	if (clen == 1 || lc_ctype_is_c(collation))
78 		return isalpha(TOUCHAR(ptr));
79 
80 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
81 
82 	return iswalpha((wint_t) character[0]);
83 }
84 
85 int
t_isprint(const char * ptr)86 t_isprint(const char *ptr)
87 {
88 	int			clen = pg_mblen(ptr);
89 	wchar_t		character[WC_BUF_LEN];
90 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
91 	pg_locale_t mylocale = 0;	/* TODO */
92 
93 	if (clen == 1 || lc_ctype_is_c(collation))
94 		return isprint(TOUCHAR(ptr));
95 
96 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
97 
98 	return iswprint((wint_t) character[0]);
99 }
100 #endif   /* USE_WIDE_UPPER_LOWER */
101 
102 
103 /*
104  * Set up to read a file using tsearch_readline().  This facility is
105  * better than just reading the file directly because it provides error
106  * context pointing to the specific line where a problem is detected.
107  *
108  * Expected usage is:
109  *
110  *		tsearch_readline_state trst;
111  *
112  *		if (!tsearch_readline_begin(&trst, filename))
113  *			ereport(ERROR,
114  *					(errcode(ERRCODE_CONFIG_FILE_ERROR),
115  *					 errmsg("could not open stop-word file \"%s\": %m",
116  *							filename)));
117  *		while ((line = tsearch_readline(&trst)) != NULL)
118  *			process line;
119  *		tsearch_readline_end(&trst);
120  *
121  * Note that the caller supplies the ereport() for file open failure;
122  * this is so that a custom message can be provided.  The filename string
123  * passed to tsearch_readline_begin() must remain valid through
124  * tsearch_readline_end().
125  */
126 bool
tsearch_readline_begin(tsearch_readline_state * stp,const char * filename)127 tsearch_readline_begin(tsearch_readline_state *stp,
128 					   const char *filename)
129 {
130 	if ((stp->fp = AllocateFile(filename, "r")) == NULL)
131 		return false;
132 	stp->filename = filename;
133 	stp->lineno = 0;
134 	stp->curline = NULL;
135 	/* Setup error traceback support for ereport() */
136 	stp->cb.callback = tsearch_readline_callback;
137 	stp->cb.arg = (void *) stp;
138 	stp->cb.previous = error_context_stack;
139 	error_context_stack = &stp->cb;
140 	return true;
141 }
142 
143 /*
144  * Read the next line from a tsearch data file (expected to be in UTF-8), and
145  * convert it to database encoding if needed. The returned string is palloc'd.
146  * NULL return means EOF.
147  */
148 char *
tsearch_readline(tsearch_readline_state * stp)149 tsearch_readline(tsearch_readline_state *stp)
150 {
151 	char	   *result;
152 
153 	/* Advance line number to use in error reports */
154 	stp->lineno++;
155 
156 	/* Clear curline, it's no longer relevant */
157 	if (stp->curline)
158 	{
159 		pfree(stp->curline);
160 		stp->curline = NULL;
161 	}
162 
163 	/* Collect next line, if there is one */
164 	result = t_readline(stp->fp);
165 	if (!result)
166 		return NULL;
167 
168 	/*
169 	 * Save a copy of the line for possible use in error reports.  (We cannot
170 	 * just save "result", since it's likely to get pfree'd at some point by
171 	 * the caller; an error after that would try to access freed data.)
172 	 */
173 	stp->curline = pstrdup(result);
174 
175 	return result;
176 }
177 
178 /*
179  * Close down after reading a file with tsearch_readline()
180  */
181 void
tsearch_readline_end(tsearch_readline_state * stp)182 tsearch_readline_end(tsearch_readline_state *stp)
183 {
184 	/* Suppress use of curline in any error reported below */
185 	if (stp->curline)
186 	{
187 		pfree(stp->curline);
188 		stp->curline = NULL;
189 	}
190 
191 	/* Release other resources */
192 	FreeFile(stp->fp);
193 
194 	/* Pop the error context stack */
195 	error_context_stack = stp->cb.previous;
196 }
197 
198 /*
199  * Error context callback for errors occurring while reading a tsearch
200  * configuration file.
201  */
202 static void
tsearch_readline_callback(void * arg)203 tsearch_readline_callback(void *arg)
204 {
205 	tsearch_readline_state *stp = (tsearch_readline_state *) arg;
206 
207 	/*
208 	 * We can't include the text of the config line for errors that occur
209 	 * during t_readline() itself.  This is only partly a consequence of our
210 	 * arms-length use of that routine: the major cause of such errors is
211 	 * encoding violations, and we daren't try to print error messages
212 	 * containing badly-encoded data.
213 	 */
214 	if (stp->curline)
215 		errcontext("line %d of configuration file \"%s\": \"%s\"",
216 				   stp->lineno,
217 				   stp->filename,
218 				   stp->curline);
219 	else
220 		errcontext("line %d of configuration file \"%s\"",
221 				   stp->lineno,
222 				   stp->filename);
223 }
224 
225 
226 /*
227  * Read the next line from a tsearch data file (expected to be in UTF-8), and
228  * convert it to database encoding if needed. The returned string is palloc'd.
229  * NULL return means EOF.
230  *
231  * Note: direct use of this function is now deprecated.  Go through
232  * tsearch_readline() to provide better error reporting.
233  */
234 char *
t_readline(FILE * fp)235 t_readline(FILE *fp)
236 {
237 	int			len;
238 	char	   *recoded;
239 	char		buf[4096];		/* lines must not be longer than this */
240 
241 	if (fgets(buf, sizeof(buf), fp) == NULL)
242 		return NULL;
243 
244 	len = strlen(buf);
245 
246 	/* Make sure the input is valid UTF-8 */
247 	(void) pg_verify_mbstr(PG_UTF8, buf, len, false);
248 
249 	/* And convert */
250 	recoded = pg_any_to_server(buf, len, PG_UTF8);
251 	if (recoded == buf)
252 	{
253 		/*
254 		 * conversion didn't pstrdup, so we must. We can use the length of the
255 		 * original string, because no conversion was done.
256 		 */
257 		recoded = pnstrdup(recoded, len);
258 	}
259 
260 	return recoded;
261 }
262 
/*
 * lowerstr --- produce a lower-cased copy of a null-terminated string
 *
 * Measures the string with strlen() and delegates to lowerstr_with_len().
 * The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
273 
/*
 * lowerstr_with_len --- produce a lower-cased copy of a counted string
 *
 * "str" need not be null-terminated; at most "len" bytes of it are used.
 *
 * The result is a palloc'd, null-terminated string.
 */
char *
lowerstr_with_len(const char *str, int len)
{
	char	   *out;

#ifdef USE_WIDE_UPPER_LOWER
	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
#endif

	/* Empty input: hand back a fresh empty string */
	if (len == 0)
		return pstrdup("");

#ifdef USE_WIDE_UPPER_LOWER

	/*
	 * Take the wide-character route only for a multibyte database encoding
	 * with a non-C ctype.  Some operating systems misbehave when a
	 * multibyte encoding is combined with the C locale, and the C locale
	 * does not need multibyte processing anyway.  (Same approach as
	 * backend/utils/adt/oracle_compat.c.)
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
	{
		wchar_t    *wide;
		int			nwide;
		int			outsize;
		int			nbytes;
		int			i;

		/*
		 * Worst case is one wchar_t per input byte (bytes >= characters),
		 * plus one for the terminating zero that wchar2char() requires.
		 */
		wide = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		nwide = char2wchar(wide, len + 1, str, len, mylocale);
		Assert(nwide <= len);

		/* Fold every wide character up to the terminator */
		for (i = 0; wide[i] != 0; i++)
			wide[i] = towlower((wint_t) wide[i]);

		/*
		 * Size the result for the worst case: maximum encoded length per
		 * character, plus the trailing '\0'.
		 */
		outsize = pg_database_encoding_max_length() * nwide + 1;
		out = (char *) palloc(outsize);

		nbytes = wchar2char(out, wide, outsize, mylocale);

		pfree(wide);

		if (nbytes < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
			errmsg("conversion from wchar_t to server encoding failed: %m")));
		Assert(nbytes < outsize);
	}
	else
#endif   /* USE_WIDE_UPPER_LOWER */
	{
		int			i;

		/* Byte-at-a-time folding; stop at "len" bytes or an embedded null */
		out = (char *) palloc(sizeof(char) * (len + 1));
		for (i = 0; i < len && str[i]; i++)
			out[i] = tolower(TOUCHAR(str + i));
		out[i] = '\0';
	}

	return out;
}
357