1 /*-------------------------------------------------------------------------
2 *
3 * ts_locale.c
4 * locale compatibility layer for tsearch
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/ts_locale.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "catalog/pg_collation.h"
17 #include "common/string.h"
18 #include "storage/fd.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21
/*
 * Forward declaration: error context callback that
 * tsearch_readline_begin() installs on error_context_stack.
 */
static void tsearch_readline_callback(void *arg);
23
24
/*
 * WC_BUF_LEN is the size, in wchar_t units, of the output buffer that the
 * t_isXXX() functions below hand to char2wchar().
 *
 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
 * getting from char2wchar() is UTF16 not UTF32.  A single input character
 * may therefore produce a surrogate pair rather than just one wchar_t;
 * we also need room for a trailing null.  When we do get a surrogate pair,
 * we pass just the first code to iswdigit() etc, so that these functions will
 * always return false for characters outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN  3
35
36 int
t_isdigit(const char * ptr)37 t_isdigit(const char *ptr)
38 {
39 int clen = pg_mblen(ptr);
40 wchar_t character[WC_BUF_LEN];
41 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
42 pg_locale_t mylocale = 0; /* TODO */
43
44 if (clen == 1 || lc_ctype_is_c(collation))
45 return isdigit(TOUCHAR(ptr));
46
47 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
48
49 return iswdigit((wint_t) character[0]);
50 }
51
52 int
t_isspace(const char * ptr)53 t_isspace(const char *ptr)
54 {
55 int clen = pg_mblen(ptr);
56 wchar_t character[WC_BUF_LEN];
57 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
58 pg_locale_t mylocale = 0; /* TODO */
59
60 if (clen == 1 || lc_ctype_is_c(collation))
61 return isspace(TOUCHAR(ptr));
62
63 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
64
65 return iswspace((wint_t) character[0]);
66 }
67
68 int
t_isalpha(const char * ptr)69 t_isalpha(const char *ptr)
70 {
71 int clen = pg_mblen(ptr);
72 wchar_t character[WC_BUF_LEN];
73 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
74 pg_locale_t mylocale = 0; /* TODO */
75
76 if (clen == 1 || lc_ctype_is_c(collation))
77 return isalpha(TOUCHAR(ptr));
78
79 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
80
81 return iswalpha((wint_t) character[0]);
82 }
83
84 int
t_isprint(const char * ptr)85 t_isprint(const char *ptr)
86 {
87 int clen = pg_mblen(ptr);
88 wchar_t character[WC_BUF_LEN];
89 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
90 pg_locale_t mylocale = 0; /* TODO */
91
92 if (clen == 1 || lc_ctype_is_c(collation))
93 return isprint(TOUCHAR(ptr));
94
95 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
96
97 return iswprint((wint_t) character[0]);
98 }
99
100
101 /*
102 * Set up to read a file using tsearch_readline(). This facility is
103 * better than just reading the file directly because it provides error
104 * context pointing to the specific line where a problem is detected.
105 *
106 * Expected usage is:
107 *
108 * tsearch_readline_state trst;
109 *
110 * if (!tsearch_readline_begin(&trst, filename))
111 * ereport(ERROR,
112 * (errcode(ERRCODE_CONFIG_FILE_ERROR),
113 * errmsg("could not open stop-word file \"%s\": %m",
114 * filename)));
115 * while ((line = tsearch_readline(&trst)) != NULL)
116 * process line;
117 * tsearch_readline_end(&trst);
118 *
119 * Note that the caller supplies the ereport() for file open failure;
120 * this is so that a custom message can be provided. The filename string
121 * passed to tsearch_readline_begin() must remain valid through
122 * tsearch_readline_end().
123 */
124 bool
tsearch_readline_begin(tsearch_readline_state * stp,const char * filename)125 tsearch_readline_begin(tsearch_readline_state *stp,
126 const char *filename)
127 {
128 if ((stp->fp = AllocateFile(filename, "r")) == NULL)
129 return false;
130 stp->filename = filename;
131 stp->lineno = 0;
132 initStringInfo(&stp->buf);
133 stp->curline = NULL;
134 /* Setup error traceback support for ereport() */
135 stp->cb.callback = tsearch_readline_callback;
136 stp->cb.arg = (void *) stp;
137 stp->cb.previous = error_context_stack;
138 error_context_stack = &stp->cb;
139 return true;
140 }
141
142 /*
143 * Read the next line from a tsearch data file (expected to be in UTF-8), and
144 * convert it to database encoding if needed. The returned string is palloc'd.
145 * NULL return means EOF.
146 */
147 char *
tsearch_readline(tsearch_readline_state * stp)148 tsearch_readline(tsearch_readline_state *stp)
149 {
150 char *recoded;
151
152 /* Advance line number to use in error reports */
153 stp->lineno++;
154
155 /* Clear curline, it's no longer relevant */
156 if (stp->curline)
157 {
158 if (stp->curline != stp->buf.data)
159 pfree(stp->curline);
160 stp->curline = NULL;
161 }
162
163 /* Collect next line, if there is one */
164 if (!pg_get_line_buf(stp->fp, &stp->buf))
165 return NULL;
166
167 /* Validate the input as UTF-8, then convert to DB encoding if needed */
168 recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
169
170 /* Save the correctly-encoded string for possible error reports */
171 stp->curline = recoded; /* might be equal to buf.data */
172
173 /*
174 * We always return a freshly pstrdup'd string. This is clearly necessary
175 * if pg_any_to_server() returned buf.data, and we need a second copy even
176 * if encoding conversion did occur. The caller is entitled to pfree the
177 * returned string at any time, which would leave curline pointing to
178 * recycled storage, causing problems if an error occurs after that point.
179 * (It's preferable to return the result of pstrdup instead of the output
180 * of pg_any_to_server, because the conversion result tends to be
181 * over-allocated. Since callers might save the result string directly
182 * into a long-lived dictionary structure, we don't want it to be a larger
183 * palloc chunk than necessary. We'll reclaim the conversion result on
184 * the next call.)
185 */
186 return pstrdup(recoded);
187 }
188
189 /*
190 * Close down after reading a file with tsearch_readline()
191 */
192 void
tsearch_readline_end(tsearch_readline_state * stp)193 tsearch_readline_end(tsearch_readline_state *stp)
194 {
195 /* Suppress use of curline in any error reported below */
196 if (stp->curline)
197 {
198 if (stp->curline != stp->buf.data)
199 pfree(stp->curline);
200 stp->curline = NULL;
201 }
202
203 /* Release other resources */
204 pfree(stp->buf.data);
205 FreeFile(stp->fp);
206
207 /* Pop the error context stack */
208 error_context_stack = stp->cb.previous;
209 }
210
211 /*
212 * Error context callback for errors occurring while reading a tsearch
213 * configuration file.
214 */
215 static void
tsearch_readline_callback(void * arg)216 tsearch_readline_callback(void *arg)
217 {
218 tsearch_readline_state *stp = (tsearch_readline_state *) arg;
219
220 /*
221 * We can't include the text of the config line for errors that occur
222 * during tsearch_readline() itself. The major cause of such errors is
223 * encoding violations, and we daren't try to print error messages
224 * containing badly-encoded data.
225 */
226 if (stp->curline)
227 errcontext("line %d of configuration file \"%s\": \"%s\"",
228 stp->lineno,
229 stp->filename,
230 stp->curline);
231 else
232 errcontext("line %d of configuration file \"%s\"",
233 stp->lineno,
234 stp->filename);
235 }
236
237
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Convenience wrapper that measures the string with strlen() and defers to
 * lowerstr_with_len().
 *
 * The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
248
249 /*
250 * lowerstr_with_len --- fold string to lower case
251 *
252 * Input string need not be null-terminated.
253 *
254 * Returned string is palloc'd
255 */
256 char *
lowerstr_with_len(const char * str,int len)257 lowerstr_with_len(const char *str, int len)
258 {
259 char *out;
260 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
261 pg_locale_t mylocale = 0; /* TODO */
262
263 if (len == 0)
264 return pstrdup("");
265
266 /*
267 * Use wide char code only when max encoding length > 1 and ctype != C.
268 * Some operating systems fail with multi-byte encodings and a C locale.
269 * Also, for a C locale there is no need to process as multibyte. From
270 * backend/utils/adt/oracle_compat.c Teodor
271 */
272 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
273 {
274 wchar_t *wstr,
275 *wptr;
276 int wlen;
277
278 /*
279 * alloc number of wchar_t for worst case, len contains number of
280 * bytes >= number of characters and alloc 1 wchar_t for 0, because
281 * wchar2char wants zero-terminated string
282 */
283 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
284
285 wlen = char2wchar(wstr, len + 1, str, len, mylocale);
286 Assert(wlen <= len);
287
288 while (*wptr)
289 {
290 *wptr = towlower((wint_t) *wptr);
291 wptr++;
292 }
293
294 /*
295 * Alloc result string for worst case + '\0'
296 */
297 len = pg_database_encoding_max_length() * wlen + 1;
298 out = (char *) palloc(len);
299
300 wlen = wchar2char(out, wstr, len, mylocale);
301
302 pfree(wstr);
303
304 if (wlen < 0)
305 ereport(ERROR,
306 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
307 errmsg("conversion from wchar_t to server encoding failed: %m")));
308 Assert(wlen < len);
309 }
310 else
311 {
312 const char *ptr = str;
313 char *outptr;
314
315 outptr = out = (char *) palloc(sizeof(char) * (len + 1));
316 while ((ptr - str) < len && *ptr)
317 {
318 *outptr++ = tolower(TOUCHAR(ptr));
319 ptr++;
320 }
321 *outptr = '\0';
322 }
323
324 return out;
325 }
326