1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  *		locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/ts_locale.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "catalog/pg_collation.h"
17 #include "common/string.h"
18 #include "storage/fd.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 
22 static void tsearch_readline_callback(void *arg);
23 
24 
25 /*
26  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
27  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
28  * getting from char2wchar() is UTF16 not UTF32.  A single input character
29  * may therefore produce a surrogate pair rather than just one wchar_t;
30  * we also need room for a trailing null.  When we do get a surrogate pair,
31  * we pass just the first code to iswdigit() etc, so that these functions will
32  * always return false for characters outside the Basic Multilingual Plane.
33  */
34 #define WC_BUF_LEN  3
35 
36 int
t_isdigit(const char * ptr)37 t_isdigit(const char *ptr)
38 {
39 	int			clen = pg_mblen(ptr);
40 	wchar_t		character[WC_BUF_LEN];
41 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
42 	pg_locale_t mylocale = 0;	/* TODO */
43 
44 	if (clen == 1 || lc_ctype_is_c(collation))
45 		return isdigit(TOUCHAR(ptr));
46 
47 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
48 
49 	return iswdigit((wint_t) character[0]);
50 }
51 
52 int
t_isspace(const char * ptr)53 t_isspace(const char *ptr)
54 {
55 	int			clen = pg_mblen(ptr);
56 	wchar_t		character[WC_BUF_LEN];
57 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
58 	pg_locale_t mylocale = 0;	/* TODO */
59 
60 	if (clen == 1 || lc_ctype_is_c(collation))
61 		return isspace(TOUCHAR(ptr));
62 
63 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
64 
65 	return iswspace((wint_t) character[0]);
66 }
67 
68 int
t_isalpha(const char * ptr)69 t_isalpha(const char *ptr)
70 {
71 	int			clen = pg_mblen(ptr);
72 	wchar_t		character[WC_BUF_LEN];
73 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
74 	pg_locale_t mylocale = 0;	/* TODO */
75 
76 	if (clen == 1 || lc_ctype_is_c(collation))
77 		return isalpha(TOUCHAR(ptr));
78 
79 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
80 
81 	return iswalpha((wint_t) character[0]);
82 }
83 
84 int
t_isprint(const char * ptr)85 t_isprint(const char *ptr)
86 {
87 	int			clen = pg_mblen(ptr);
88 	wchar_t		character[WC_BUF_LEN];
89 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
90 	pg_locale_t mylocale = 0;	/* TODO */
91 
92 	if (clen == 1 || lc_ctype_is_c(collation))
93 		return isprint(TOUCHAR(ptr));
94 
95 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
96 
97 	return iswprint((wint_t) character[0]);
98 }
99 
100 
101 /*
102  * Set up to read a file using tsearch_readline().  This facility is
103  * better than just reading the file directly because it provides error
104  * context pointing to the specific line where a problem is detected.
105  *
106  * Expected usage is:
107  *
108  *		tsearch_readline_state trst;
109  *
110  *		if (!tsearch_readline_begin(&trst, filename))
111  *			ereport(ERROR,
112  *					(errcode(ERRCODE_CONFIG_FILE_ERROR),
113  *					 errmsg("could not open stop-word file \"%s\": %m",
114  *							filename)));
115  *		while ((line = tsearch_readline(&trst)) != NULL)
116  *			process line;
117  *		tsearch_readline_end(&trst);
118  *
119  * Note that the caller supplies the ereport() for file open failure;
120  * this is so that a custom message can be provided.  The filename string
121  * passed to tsearch_readline_begin() must remain valid through
122  * tsearch_readline_end().
123  */
124 bool
tsearch_readline_begin(tsearch_readline_state * stp,const char * filename)125 tsearch_readline_begin(tsearch_readline_state *stp,
126 					   const char *filename)
127 {
128 	if ((stp->fp = AllocateFile(filename, "r")) == NULL)
129 		return false;
130 	stp->filename = filename;
131 	stp->lineno = 0;
132 	initStringInfo(&stp->buf);
133 	stp->curline = NULL;
134 	/* Setup error traceback support for ereport() */
135 	stp->cb.callback = tsearch_readline_callback;
136 	stp->cb.arg = (void *) stp;
137 	stp->cb.previous = error_context_stack;
138 	error_context_stack = &stp->cb;
139 	return true;
140 }
141 
142 /*
143  * Read the next line from a tsearch data file (expected to be in UTF-8), and
144  * convert it to database encoding if needed. The returned string is palloc'd.
145  * NULL return means EOF.
146  */
147 char *
tsearch_readline(tsearch_readline_state * stp)148 tsearch_readline(tsearch_readline_state *stp)
149 {
150 	char	   *recoded;
151 
152 	/* Advance line number to use in error reports */
153 	stp->lineno++;
154 
155 	/* Clear curline, it's no longer relevant */
156 	if (stp->curline)
157 	{
158 		if (stp->curline != stp->buf.data)
159 			pfree(stp->curline);
160 		stp->curline = NULL;
161 	}
162 
163 	/* Collect next line, if there is one */
164 	if (!pg_get_line_buf(stp->fp, &stp->buf))
165 		return NULL;
166 
167 	/* Validate the input as UTF-8, then convert to DB encoding if needed */
168 	recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
169 
170 	/* Save the correctly-encoded string for possible error reports */
171 	stp->curline = recoded;		/* might be equal to buf.data */
172 
173 	/*
174 	 * We always return a freshly pstrdup'd string.  This is clearly necessary
175 	 * if pg_any_to_server() returned buf.data, and we need a second copy even
176 	 * if encoding conversion did occur.  The caller is entitled to pfree the
177 	 * returned string at any time, which would leave curline pointing to
178 	 * recycled storage, causing problems if an error occurs after that point.
179 	 * (It's preferable to return the result of pstrdup instead of the output
180 	 * of pg_any_to_server, because the conversion result tends to be
181 	 * over-allocated.  Since callers might save the result string directly
182 	 * into a long-lived dictionary structure, we don't want it to be a larger
183 	 * palloc chunk than necessary.  We'll reclaim the conversion result on
184 	 * the next call.)
185 	 */
186 	return pstrdup(recoded);
187 }
188 
189 /*
190  * Close down after reading a file with tsearch_readline()
191  */
192 void
tsearch_readline_end(tsearch_readline_state * stp)193 tsearch_readline_end(tsearch_readline_state *stp)
194 {
195 	/* Suppress use of curline in any error reported below */
196 	if (stp->curline)
197 	{
198 		if (stp->curline != stp->buf.data)
199 			pfree(stp->curline);
200 		stp->curline = NULL;
201 	}
202 
203 	/* Release other resources */
204 	pfree(stp->buf.data);
205 	FreeFile(stp->fp);
206 
207 	/* Pop the error context stack */
208 	error_context_stack = stp->cb.previous;
209 }
210 
211 /*
212  * Error context callback for errors occurring while reading a tsearch
213  * configuration file.
214  */
215 static void
tsearch_readline_callback(void * arg)216 tsearch_readline_callback(void *arg)
217 {
218 	tsearch_readline_state *stp = (tsearch_readline_state *) arg;
219 
220 	/*
221 	 * We can't include the text of the config line for errors that occur
222 	 * during tsearch_readline() itself.  The major cause of such errors is
223 	 * encoding violations, and we daren't try to print error messages
224 	 * containing badly-encoded data.
225 	 */
226 	if (stp->curline)
227 		errcontext("line %d of configuration file \"%s\": \"%s\"",
228 				   stp->lineno,
229 				   stp->filename,
230 				   stp->curline);
231 	else
232 		errcontext("line %d of configuration file \"%s\"",
233 				   stp->lineno,
234 				   stp->filename);
235 }
236 
237 
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Convenience wrapper that measures the string and defers to
 * lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
248 
249 /*
250  * lowerstr_with_len --- fold string to lower case
251  *
252  * Input string need not be null-terminated.
253  *
254  * Returned string is palloc'd
255  */
256 char *
lowerstr_with_len(const char * str,int len)257 lowerstr_with_len(const char *str, int len)
258 {
259 	char	   *out;
260 	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
261 	pg_locale_t mylocale = 0;	/* TODO */
262 
263 	if (len == 0)
264 		return pstrdup("");
265 
266 	/*
267 	 * Use wide char code only when max encoding length > 1 and ctype != C.
268 	 * Some operating systems fail with multi-byte encodings and a C locale.
269 	 * Also, for a C locale there is no need to process as multibyte. From
270 	 * backend/utils/adt/oracle_compat.c Teodor
271 	 */
272 	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
273 	{
274 		wchar_t    *wstr,
275 				   *wptr;
276 		int			wlen;
277 
278 		/*
279 		 * alloc number of wchar_t for worst case, len contains number of
280 		 * bytes >= number of characters and alloc 1 wchar_t for 0, because
281 		 * wchar2char wants zero-terminated string
282 		 */
283 		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
284 
285 		wlen = char2wchar(wstr, len + 1, str, len, mylocale);
286 		Assert(wlen <= len);
287 
288 		while (*wptr)
289 		{
290 			*wptr = towlower((wint_t) *wptr);
291 			wptr++;
292 		}
293 
294 		/*
295 		 * Alloc result string for worst case + '\0'
296 		 */
297 		len = pg_database_encoding_max_length() * wlen + 1;
298 		out = (char *) palloc(len);
299 
300 		wlen = wchar2char(out, wstr, len, mylocale);
301 
302 		pfree(wstr);
303 
304 		if (wlen < 0)
305 			ereport(ERROR,
306 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
307 					 errmsg("conversion from wchar_t to server encoding failed: %m")));
308 		Assert(wlen < len);
309 	}
310 	else
311 	{
312 		const char *ptr = str;
313 		char	   *outptr;
314 
315 		outptr = out = (char *) palloc(sizeof(char) * (len + 1));
316 		while ((ptr - str) < len && *ptr)
317 		{
318 			*outptr++ = tolower(TOUCHAR(ptr));
319 			ptr++;
320 		}
321 		*outptr = '\0';
322 	}
323 
324 	return out;
325 }
326