1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  *		locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 
/* Error-context callback installed by tsearch_readline_begin(); defined below */
static void tsearch_readline_callback(void *arg);
22 
23 
/*
 * Size, in wchar_t units, of the scratch buffer the t_isXXX() functions
 * hand to char2wchar().
 *
 * The buffer holds 3 wchar_t's, not 2 as you might expect, because on
 * Windows "wchar_t" is 16 bits and what we'll be getting from char2wchar()
 * is UTF16 not UTF32.  A single input character may therefore produce a
 * surrogate pair rather than just one wchar_t, and we also need room for a
 * trailing null.  When we do get a surrogate pair, only the first code is
 * passed to iswdigit() etc, so those functions will always return false for
 * characters outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN  3
34 
35 int
36 t_isdigit(const char *ptr)
37 {
38 	int			clen = pg_mblen(ptr);
39 	wchar_t		character[WC_BUF_LEN];
40 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
41 	pg_locale_t mylocale = 0;	/* TODO */
42 
43 	if (clen == 1 || lc_ctype_is_c(collation))
44 		return isdigit(TOUCHAR(ptr));
45 
46 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47 
48 	return iswdigit((wint_t) character[0]);
49 }
50 
51 int
52 t_isspace(const char *ptr)
53 {
54 	int			clen = pg_mblen(ptr);
55 	wchar_t		character[WC_BUF_LEN];
56 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
57 	pg_locale_t mylocale = 0;	/* TODO */
58 
59 	if (clen == 1 || lc_ctype_is_c(collation))
60 		return isspace(TOUCHAR(ptr));
61 
62 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
63 
64 	return iswspace((wint_t) character[0]);
65 }
66 
67 int
68 t_isalpha(const char *ptr)
69 {
70 	int			clen = pg_mblen(ptr);
71 	wchar_t		character[WC_BUF_LEN];
72 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
73 	pg_locale_t mylocale = 0;	/* TODO */
74 
75 	if (clen == 1 || lc_ctype_is_c(collation))
76 		return isalpha(TOUCHAR(ptr));
77 
78 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
79 
80 	return iswalpha((wint_t) character[0]);
81 }
82 
83 int
84 t_isprint(const char *ptr)
85 {
86 	int			clen = pg_mblen(ptr);
87 	wchar_t		character[WC_BUF_LEN];
88 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
89 	pg_locale_t mylocale = 0;	/* TODO */
90 
91 	if (clen == 1 || lc_ctype_is_c(collation))
92 		return isprint(TOUCHAR(ptr));
93 
94 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
95 
96 	return iswprint((wint_t) character[0]);
97 }
98 
99 
100 /*
101  * Set up to read a file using tsearch_readline().  This facility is
102  * better than just reading the file directly because it provides error
103  * context pointing to the specific line where a problem is detected.
104  *
105  * Expected usage is:
106  *
107  *		tsearch_readline_state trst;
108  *
109  *		if (!tsearch_readline_begin(&trst, filename))
110  *			ereport(ERROR,
111  *					(errcode(ERRCODE_CONFIG_FILE_ERROR),
112  *					 errmsg("could not open stop-word file \"%s\": %m",
113  *							filename)));
114  *		while ((line = tsearch_readline(&trst)) != NULL)
115  *			process line;
116  *		tsearch_readline_end(&trst);
117  *
118  * Note that the caller supplies the ereport() for file open failure;
119  * this is so that a custom message can be provided.  The filename string
120  * passed to tsearch_readline_begin() must remain valid through
121  * tsearch_readline_end().
122  */
123 bool
124 tsearch_readline_begin(tsearch_readline_state *stp,
125 					   const char *filename)
126 {
127 	if ((stp->fp = AllocateFile(filename, "r")) == NULL)
128 		return false;
129 	stp->filename = filename;
130 	stp->lineno = 0;
131 	stp->curline = NULL;
132 	/* Setup error traceback support for ereport() */
133 	stp->cb.callback = tsearch_readline_callback;
134 	stp->cb.arg = (void *) stp;
135 	stp->cb.previous = error_context_stack;
136 	error_context_stack = &stp->cb;
137 	return true;
138 }
139 
140 /*
141  * Read the next line from a tsearch data file (expected to be in UTF-8), and
142  * convert it to database encoding if needed. The returned string is palloc'd.
143  * NULL return means EOF.
144  */
145 char *
146 tsearch_readline(tsearch_readline_state *stp)
147 {
148 	char	   *result;
149 
150 	/* Advance line number to use in error reports */
151 	stp->lineno++;
152 
153 	/* Clear curline, it's no longer relevant */
154 	if (stp->curline)
155 	{
156 		pfree(stp->curline);
157 		stp->curline = NULL;
158 	}
159 
160 	/* Collect next line, if there is one */
161 	result = t_readline(stp->fp);
162 	if (!result)
163 		return NULL;
164 
165 	/*
166 	 * Save a copy of the line for possible use in error reports.  (We cannot
167 	 * just save "result", since it's likely to get pfree'd at some point by
168 	 * the caller; an error after that would try to access freed data.)
169 	 */
170 	stp->curline = pstrdup(result);
171 
172 	return result;
173 }
174 
175 /*
176  * Close down after reading a file with tsearch_readline()
177  */
178 void
179 tsearch_readline_end(tsearch_readline_state *stp)
180 {
181 	/* Suppress use of curline in any error reported below */
182 	if (stp->curline)
183 	{
184 		pfree(stp->curline);
185 		stp->curline = NULL;
186 	}
187 
188 	/* Release other resources */
189 	FreeFile(stp->fp);
190 
191 	/* Pop the error context stack */
192 	error_context_stack = stp->cb.previous;
193 }
194 
195 /*
196  * Error context callback for errors occurring while reading a tsearch
197  * configuration file.
198  */
199 static void
200 tsearch_readline_callback(void *arg)
201 {
202 	tsearch_readline_state *stp = (tsearch_readline_state *) arg;
203 
204 	/*
205 	 * We can't include the text of the config line for errors that occur
206 	 * during t_readline() itself.  This is only partly a consequence of our
207 	 * arms-length use of that routine: the major cause of such errors is
208 	 * encoding violations, and we daren't try to print error messages
209 	 * containing badly-encoded data.
210 	 */
211 	if (stp->curline)
212 		errcontext("line %d of configuration file \"%s\": \"%s\"",
213 				   stp->lineno,
214 				   stp->filename,
215 				   stp->curline);
216 	else
217 		errcontext("line %d of configuration file \"%s\"",
218 				   stp->lineno,
219 				   stp->filename);
220 }
221 
222 
223 /*
224  * Read the next line from a tsearch data file (expected to be in UTF-8), and
225  * convert it to database encoding if needed. The returned string is palloc'd.
226  * NULL return means EOF.
227  *
228  * Note: direct use of this function is now deprecated.  Go through
229  * tsearch_readline() to provide better error reporting.
230  */
231 char *
232 t_readline(FILE *fp)
233 {
234 	int			len;
235 	char	   *recoded;
236 	char		buf[4096];		/* lines must not be longer than this */
237 
238 	if (fgets(buf, sizeof(buf), fp) == NULL)
239 		return NULL;
240 
241 	len = strlen(buf);
242 
243 	/* Make sure the input is valid UTF-8 */
244 	(void) pg_verify_mbstr(PG_UTF8, buf, len, false);
245 
246 	/* And convert */
247 	recoded = pg_any_to_server(buf, len, PG_UTF8);
248 	if (recoded == buf)
249 	{
250 		/*
251 		 * conversion didn't pstrdup, so we must. We can use the length of the
252 		 * original string, because no conversion was done.
253 		 */
254 		recoded = pnstrdup(recoded, len);
255 	}
256 
257 	return recoded;
258 }
259 
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Measures the string with strlen() and hands it to lowerstr_with_len().
 * The result is a palloc'd string.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
270 
271 /*
272  * lowerstr_with_len --- fold string to lower case
273  *
274  * Input string need not be null-terminated.
275  *
276  * Returned string is palloc'd
277  */
278 char *
279 lowerstr_with_len(const char *str, int len)
280 {
281 	char	   *out;
282 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
283 	pg_locale_t mylocale = 0;	/* TODO */
284 
285 	if (len == 0)
286 		return pstrdup("");
287 
288 	/*
289 	 * Use wide char code only when max encoding length > 1 and ctype != C.
290 	 * Some operating systems fail with multi-byte encodings and a C locale.
291 	 * Also, for a C locale there is no need to process as multibyte. From
292 	 * backend/utils/adt/oracle_compat.c Teodor
293 	 */
294 	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
295 	{
296 		wchar_t    *wstr,
297 				   *wptr;
298 		int			wlen;
299 
300 		/*
301 		 * alloc number of wchar_t for worst case, len contains number of
302 		 * bytes >= number of characters and alloc 1 wchar_t for 0, because
303 		 * wchar2char wants zero-terminated string
304 		 */
305 		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
306 
307 		wlen = char2wchar(wstr, len + 1, str, len, mylocale);
308 		Assert(wlen <= len);
309 
310 		while (*wptr)
311 		{
312 			*wptr = towlower((wint_t) *wptr);
313 			wptr++;
314 		}
315 
316 		/*
317 		 * Alloc result string for worst case + '\0'
318 		 */
319 		len = pg_database_encoding_max_length() * wlen + 1;
320 		out = (char *) palloc(len);
321 
322 		wlen = wchar2char(out, wstr, len, mylocale);
323 
324 		pfree(wstr);
325 
326 		if (wlen < 0)
327 			ereport(ERROR,
328 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
329 					 errmsg("conversion from wchar_t to server encoding failed: %m")));
330 		Assert(wlen < len);
331 	}
332 	else
333 	{
334 		const char *ptr = str;
335 		char	   *outptr;
336 
337 		outptr = out = (char *) palloc(sizeof(char) * (len + 1));
338 		while ((ptr - str) < len && *ptr)
339 		{
340 			*outptr++ = tolower(TOUCHAR(ptr));
341 			ptr++;
342 		}
343 		*outptr = '\0';
344 	}
345 
346 	return out;
347 }
348