1 /*-------------------------------------------------------------------------
2 *
3 * ts_locale.c
4 * locale compatibility layer for tsearch
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/ts_locale.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "catalog/pg_collation.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20
21 static void tsearch_readline_callback(void *arg);
22
23
24 #ifdef USE_WIDE_UPPER_LOWER
25
26 /*
27 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
28 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
29 * getting from char2wchar() is UTF16 not UTF32. A single input character
30 * may therefore produce a surrogate pair rather than just one wchar_t;
31 * we also need room for a trailing null. When we do get a surrogate pair,
32 * we pass just the first code to iswdigit() etc, so that these functions will
33 * always return false for characters outside the Basic Multilingual Plane.
34 */
35 #define WC_BUF_LEN 3
36
37 int
t_isdigit(const char * ptr)38 t_isdigit(const char *ptr)
39 {
40 int clen = pg_mblen(ptr);
41 wchar_t character[WC_BUF_LEN];
42 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
43 pg_locale_t mylocale = 0; /* TODO */
44
45 if (clen == 1 || lc_ctype_is_c(collation))
46 return isdigit(TOUCHAR(ptr));
47
48 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
49
50 return iswdigit((wint_t) character[0]);
51 }
52
53 int
t_isspace(const char * ptr)54 t_isspace(const char *ptr)
55 {
56 int clen = pg_mblen(ptr);
57 wchar_t character[WC_BUF_LEN];
58 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
59 pg_locale_t mylocale = 0; /* TODO */
60
61 if (clen == 1 || lc_ctype_is_c(collation))
62 return isspace(TOUCHAR(ptr));
63
64 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
65
66 return iswspace((wint_t) character[0]);
67 }
68
69 int
t_isalpha(const char * ptr)70 t_isalpha(const char *ptr)
71 {
72 int clen = pg_mblen(ptr);
73 wchar_t character[WC_BUF_LEN];
74 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
75 pg_locale_t mylocale = 0; /* TODO */
76
77 if (clen == 1 || lc_ctype_is_c(collation))
78 return isalpha(TOUCHAR(ptr));
79
80 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
81
82 return iswalpha((wint_t) character[0]);
83 }
84
85 int
t_isprint(const char * ptr)86 t_isprint(const char *ptr)
87 {
88 int clen = pg_mblen(ptr);
89 wchar_t character[WC_BUF_LEN];
90 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
91 pg_locale_t mylocale = 0; /* TODO */
92
93 if (clen == 1 || lc_ctype_is_c(collation))
94 return isprint(TOUCHAR(ptr));
95
96 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
97
98 return iswprint((wint_t) character[0]);
99 }
100 #endif /* USE_WIDE_UPPER_LOWER */
101
102
103 /*
104 * Set up to read a file using tsearch_readline(). This facility is
105 * better than just reading the file directly because it provides error
106 * context pointing to the specific line where a problem is detected.
107 *
108 * Expected usage is:
109 *
110 * tsearch_readline_state trst;
111 *
112 * if (!tsearch_readline_begin(&trst, filename))
113 * ereport(ERROR,
114 * (errcode(ERRCODE_CONFIG_FILE_ERROR),
115 * errmsg("could not open stop-word file \"%s\": %m",
116 * filename)));
117 * while ((line = tsearch_readline(&trst)) != NULL)
118 * process line;
119 * tsearch_readline_end(&trst);
120 *
121 * Note that the caller supplies the ereport() for file open failure;
122 * this is so that a custom message can be provided. The filename string
123 * passed to tsearch_readline_begin() must remain valid through
124 * tsearch_readline_end().
125 */
126 bool
tsearch_readline_begin(tsearch_readline_state * stp,const char * filename)127 tsearch_readline_begin(tsearch_readline_state *stp,
128 const char *filename)
129 {
130 if ((stp->fp = AllocateFile(filename, "r")) == NULL)
131 return false;
132 stp->filename = filename;
133 stp->lineno = 0;
134 stp->curline = NULL;
135 /* Setup error traceback support for ereport() */
136 stp->cb.callback = tsearch_readline_callback;
137 stp->cb.arg = (void *) stp;
138 stp->cb.previous = error_context_stack;
139 error_context_stack = &stp->cb;
140 return true;
141 }
142
143 /*
144 * Read the next line from a tsearch data file (expected to be in UTF-8), and
145 * convert it to database encoding if needed. The returned string is palloc'd.
146 * NULL return means EOF.
147 */
148 char *
tsearch_readline(tsearch_readline_state * stp)149 tsearch_readline(tsearch_readline_state *stp)
150 {
151 char *result;
152
153 /* Advance line number to use in error reports */
154 stp->lineno++;
155
156 /* Clear curline, it's no longer relevant */
157 if (stp->curline)
158 {
159 pfree(stp->curline);
160 stp->curline = NULL;
161 }
162
163 /* Collect next line, if there is one */
164 result = t_readline(stp->fp);
165 if (!result)
166 return NULL;
167
168 /*
169 * Save a copy of the line for possible use in error reports. (We cannot
170 * just save "result", since it's likely to get pfree'd at some point by
171 * the caller; an error after that would try to access freed data.)
172 */
173 stp->curline = pstrdup(result);
174
175 return result;
176 }
177
178 /*
179 * Close down after reading a file with tsearch_readline()
180 */
181 void
tsearch_readline_end(tsearch_readline_state * stp)182 tsearch_readline_end(tsearch_readline_state *stp)
183 {
184 /* Suppress use of curline in any error reported below */
185 if (stp->curline)
186 {
187 pfree(stp->curline);
188 stp->curline = NULL;
189 }
190
191 /* Release other resources */
192 FreeFile(stp->fp);
193
194 /* Pop the error context stack */
195 error_context_stack = stp->cb.previous;
196 }
197
198 /*
199 * Error context callback for errors occurring while reading a tsearch
200 * configuration file.
201 */
202 static void
tsearch_readline_callback(void * arg)203 tsearch_readline_callback(void *arg)
204 {
205 tsearch_readline_state *stp = (tsearch_readline_state *) arg;
206
207 /*
208 * We can't include the text of the config line for errors that occur
209 * during t_readline() itself. This is only partly a consequence of our
210 * arms-length use of that routine: the major cause of such errors is
211 * encoding violations, and we daren't try to print error messages
212 * containing badly-encoded data.
213 */
214 if (stp->curline)
215 errcontext("line %d of configuration file \"%s\": \"%s\"",
216 stp->lineno,
217 stp->filename,
218 stp->curline);
219 else
220 errcontext("line %d of configuration file \"%s\"",
221 stp->lineno,
222 stp->filename);
223 }
224
225
226 /*
227 * Read the next line from a tsearch data file (expected to be in UTF-8), and
228 * convert it to database encoding if needed. The returned string is palloc'd.
229 * NULL return means EOF.
230 *
231 * Note: direct use of this function is now deprecated. Go through
232 * tsearch_readline() to provide better error reporting.
233 */
234 char *
t_readline(FILE * fp)235 t_readline(FILE *fp)
236 {
237 int len;
238 char *recoded;
239 char buf[4096]; /* lines must not be longer than this */
240
241 if (fgets(buf, sizeof(buf), fp) == NULL)
242 return NULL;
243
244 len = strlen(buf);
245
246 /* Make sure the input is valid UTF-8 */
247 (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
248
249 /* And convert */
250 recoded = pg_any_to_server(buf, len, PG_UTF8);
251 if (recoded == buf)
252 {
253 /*
254 * conversion didn't pstrdup, so we must. We can use the length of the
255 * original string, because no conversion was done.
256 */
257 recoded = pnstrdup(recoded, len);
258 }
259
260 return recoded;
261 }
262
/*
 * lowerstr --- fold a null-terminated string to lower case
 *
 * Convenience wrapper that measures str with strlen() and hands the work to
 * lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
273
/*
 * lowerstr_with_len --- fold string to lower case
 *
 * str need not be null-terminated; at most len bytes of it are examined.
 * A zero len yields an empty string.
 *
 * Returned string is palloc'd and null-terminated
 */
char *
lowerstr_with_len(const char *str, int len)
{
	char	   *folded;
	int			i;

#ifdef USE_WIDE_UPPER_LOWER
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
#endif

	if (len == 0)
		return pstrdup("");

#ifdef USE_WIDE_UPPER_LOWER

	/*
	 * Take the wide-character route only when the encoding's maximum
	 * character length exceeds 1 and ctype is not C.  Some operating systems
	 * fail with multi-byte encodings and a C locale, and for a C locale
	 * there is no need to process as multibyte anyway.  (Approach taken from
	 * backend/utils/adt/oracle_compat.c.)
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
	{
		wchar_t    *wide;
		int			nwide;
		int			outsize;
		int			nbytes;

		/*
		 * Worst case is one wchar_t per input byte, since the byte count is
		 * >= the character count; add one for the terminating zero that
		 * wchar2char wants.
		 */
		wide = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
		nwide = char2wchar(wide, len + 1, str, len, mylocale);
		Assert(nwide <= len);

		/* Fold in place, up to the terminating zero */
		for (i = 0; wide[i] != 0; i++)
			wide[i] = towlower((wint_t) wide[i]);

		/* Result buffer sized for the worst case, plus '\0' */
		outsize = pg_database_encoding_max_length() * nwide + 1;
		folded = (char *) palloc(outsize);
		nbytes = wchar2char(folded, wide, outsize, mylocale);

		pfree(wide);

		if (nbytes < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("conversion from wchar_t to server encoding failed: %m")));
		Assert(nbytes < outsize);

		return folded;
	}
#endif   /* USE_WIDE_UPPER_LOWER */

	/*
	 * Byte-at-a-time folding with tolower(), stopping early if a null byte
	 * turns up before len bytes have been consumed.
	 */
	folded = (char *) palloc(sizeof(char) * (len + 1));
	for (i = 0; i < len && str[i] != '\0'; i++)
		folded[i] = tolower(TOUCHAR(str + i));
	folded[i] = '\0';

	return folded;
}
357