1 /*-------------------------------------------------------------------------
2 *
3 * ts_locale.c
4 * locale compatibility layer for tsearch
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/ts_locale.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "catalog/pg_collation.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20
/* Error context callback installed by tsearch_readline_begin() */
static void tsearch_readline_callback(void *arg);
22
23
/*
 * Size, in wchar_t units, of the scratch buffer that the t_isXXX() functions
 * pass to char2wchar().
 *
 * Three slots are used rather than the two one might expect: on Windows
 * "wchar_t" is 16 bits and what char2wchar() produces is UTF16, not UTF32,
 * so a single input character may come back as a surrogate pair, and there
 * must still be room for a trailing null.  When a surrogate pair is produced,
 * only its first code is passed to iswdigit() etc, which means these
 * functions always return false for characters outside the Basic
 * Multilingual Plane.
 */
#define WC_BUF_LEN 3
34
35 int
t_isdigit(const char * ptr)36 t_isdigit(const char *ptr)
37 {
38 int clen = pg_mblen(ptr);
39 wchar_t character[WC_BUF_LEN];
40 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
41 pg_locale_t mylocale = 0; /* TODO */
42
43 if (clen == 1 || lc_ctype_is_c(collation))
44 return isdigit(TOUCHAR(ptr));
45
46 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47
48 return iswdigit((wint_t) character[0]);
49 }
50
51 int
t_isspace(const char * ptr)52 t_isspace(const char *ptr)
53 {
54 int clen = pg_mblen(ptr);
55 wchar_t character[WC_BUF_LEN];
56 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
57 pg_locale_t mylocale = 0; /* TODO */
58
59 if (clen == 1 || lc_ctype_is_c(collation))
60 return isspace(TOUCHAR(ptr));
61
62 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
63
64 return iswspace((wint_t) character[0]);
65 }
66
67 int
t_isalpha(const char * ptr)68 t_isalpha(const char *ptr)
69 {
70 int clen = pg_mblen(ptr);
71 wchar_t character[WC_BUF_LEN];
72 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
73 pg_locale_t mylocale = 0; /* TODO */
74
75 if (clen == 1 || lc_ctype_is_c(collation))
76 return isalpha(TOUCHAR(ptr));
77
78 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
79
80 return iswalpha((wint_t) character[0]);
81 }
82
83 int
t_isprint(const char * ptr)84 t_isprint(const char *ptr)
85 {
86 int clen = pg_mblen(ptr);
87 wchar_t character[WC_BUF_LEN];
88 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
89 pg_locale_t mylocale = 0; /* TODO */
90
91 if (clen == 1 || lc_ctype_is_c(collation))
92 return isprint(TOUCHAR(ptr));
93
94 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
95
96 return iswprint((wint_t) character[0]);
97 }
98
99
100 /*
101 * Set up to read a file using tsearch_readline(). This facility is
102 * better than just reading the file directly because it provides error
103 * context pointing to the specific line where a problem is detected.
104 *
105 * Expected usage is:
106 *
107 * tsearch_readline_state trst;
108 *
109 * if (!tsearch_readline_begin(&trst, filename))
110 * ereport(ERROR,
111 * (errcode(ERRCODE_CONFIG_FILE_ERROR),
112 * errmsg("could not open stop-word file \"%s\": %m",
113 * filename)));
114 * while ((line = tsearch_readline(&trst)) != NULL)
115 * process line;
116 * tsearch_readline_end(&trst);
117 *
118 * Note that the caller supplies the ereport() for file open failure;
119 * this is so that a custom message can be provided. The filename string
120 * passed to tsearch_readline_begin() must remain valid through
121 * tsearch_readline_end().
122 */
123 bool
tsearch_readline_begin(tsearch_readline_state * stp,const char * filename)124 tsearch_readline_begin(tsearch_readline_state *stp,
125 const char *filename)
126 {
127 if ((stp->fp = AllocateFile(filename, "r")) == NULL)
128 return false;
129 stp->filename = filename;
130 stp->lineno = 0;
131 stp->curline = NULL;
132 /* Setup error traceback support for ereport() */
133 stp->cb.callback = tsearch_readline_callback;
134 stp->cb.arg = (void *) stp;
135 stp->cb.previous = error_context_stack;
136 error_context_stack = &stp->cb;
137 return true;
138 }
139
140 /*
141 * Read the next line from a tsearch data file (expected to be in UTF-8), and
142 * convert it to database encoding if needed. The returned string is palloc'd.
143 * NULL return means EOF.
144 */
145 char *
tsearch_readline(tsearch_readline_state * stp)146 tsearch_readline(tsearch_readline_state *stp)
147 {
148 char *result;
149
150 /* Advance line number to use in error reports */
151 stp->lineno++;
152
153 /* Clear curline, it's no longer relevant */
154 if (stp->curline)
155 {
156 pfree(stp->curline);
157 stp->curline = NULL;
158 }
159
160 /* Collect next line, if there is one */
161 result = t_readline(stp->fp);
162 if (!result)
163 return NULL;
164
165 /*
166 * Save a copy of the line for possible use in error reports. (We cannot
167 * just save "result", since it's likely to get pfree'd at some point by
168 * the caller; an error after that would try to access freed data.)
169 */
170 stp->curline = pstrdup(result);
171
172 return result;
173 }
174
175 /*
176 * Close down after reading a file with tsearch_readline()
177 */
178 void
tsearch_readline_end(tsearch_readline_state * stp)179 tsearch_readline_end(tsearch_readline_state *stp)
180 {
181 /* Suppress use of curline in any error reported below */
182 if (stp->curline)
183 {
184 pfree(stp->curline);
185 stp->curline = NULL;
186 }
187
188 /* Release other resources */
189 FreeFile(stp->fp);
190
191 /* Pop the error context stack */
192 error_context_stack = stp->cb.previous;
193 }
194
195 /*
196 * Error context callback for errors occurring while reading a tsearch
197 * configuration file.
198 */
199 static void
tsearch_readline_callback(void * arg)200 tsearch_readline_callback(void *arg)
201 {
202 tsearch_readline_state *stp = (tsearch_readline_state *) arg;
203
204 /*
205 * We can't include the text of the config line for errors that occur
206 * during t_readline() itself. This is only partly a consequence of our
207 * arms-length use of that routine: the major cause of such errors is
208 * encoding violations, and we daren't try to print error messages
209 * containing badly-encoded data.
210 */
211 if (stp->curline)
212 errcontext("line %d of configuration file \"%s\": \"%s\"",
213 stp->lineno,
214 stp->filename,
215 stp->curline);
216 else
217 errcontext("line %d of configuration file \"%s\"",
218 stp->lineno,
219 stp->filename);
220 }
221
222
223 /*
224 * Read the next line from a tsearch data file (expected to be in UTF-8), and
225 * convert it to database encoding if needed. The returned string is palloc'd.
226 * NULL return means EOF.
227 *
228 * Note: direct use of this function is now deprecated. Go through
229 * tsearch_readline() to provide better error reporting.
230 */
231 char *
t_readline(FILE * fp)232 t_readline(FILE *fp)
233 {
234 int len;
235 char *recoded;
236 char buf[4096]; /* lines must not be longer than this */
237
238 if (fgets(buf, sizeof(buf), fp) == NULL)
239 return NULL;
240
241 len = strlen(buf);
242
243 /* Make sure the input is valid UTF-8 */
244 (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
245
246 /* And convert */
247 recoded = pg_any_to_server(buf, len, PG_UTF8);
248 if (recoded == buf)
249 {
250 /*
251 * conversion didn't pstrdup, so we must. We can use the length of the
252 * original string, because no conversion was done.
253 */
254 recoded = pnstrdup(recoded, len);
255 }
256
257 return recoded;
258 }
259
/*
 * lowerstr --- produce a lower-case copy of a null-terminated string
 *
 * Measures the string with strlen() and defers to lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
270
271 /*
272 * lowerstr_with_len --- fold string to lower case
273 *
274 * Input string need not be null-terminated.
275 *
276 * Returned string is palloc'd
277 */
278 char *
lowerstr_with_len(const char * str,int len)279 lowerstr_with_len(const char *str, int len)
280 {
281 char *out;
282 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
283 pg_locale_t mylocale = 0; /* TODO */
284
285 if (len == 0)
286 return pstrdup("");
287
288 /*
289 * Use wide char code only when max encoding length > 1 and ctype != C.
290 * Some operating systems fail with multi-byte encodings and a C locale.
291 * Also, for a C locale there is no need to process as multibyte. From
292 * backend/utils/adt/oracle_compat.c Teodor
293 */
294 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
295 {
296 wchar_t *wstr,
297 *wptr;
298 int wlen;
299
300 /*
301 * alloc number of wchar_t for worst case, len contains number of
302 * bytes >= number of characters and alloc 1 wchar_t for 0, because
303 * wchar2char wants zero-terminated string
304 */
305 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
306
307 wlen = char2wchar(wstr, len + 1, str, len, mylocale);
308 Assert(wlen <= len);
309
310 while (*wptr)
311 {
312 *wptr = towlower((wint_t) *wptr);
313 wptr++;
314 }
315
316 /*
317 * Alloc result string for worst case + '\0'
318 */
319 len = pg_database_encoding_max_length() * wlen + 1;
320 out = (char *) palloc(len);
321
322 wlen = wchar2char(out, wstr, len, mylocale);
323
324 pfree(wstr);
325
326 if (wlen < 0)
327 ereport(ERROR,
328 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
329 errmsg("conversion from wchar_t to server encoding failed: %m")));
330 Assert(wlen < len);
331 }
332 else
333 {
334 const char *ptr = str;
335 char *outptr;
336
337 outptr = out = (char *) palloc(sizeof(char) * (len + 1));
338 while ((ptr - str) < len && *ptr)
339 {
340 *outptr++ = tolower(TOUCHAR(ptr));
341 ptr++;
342 }
343 *outptr = '\0';
344 }
345
346 return out;
347 }
348