1 /*------------------------------------------------------------------------- 2 * 3 * ts_locale.c 4 * locale compatibility layer for tsearch 5 * 6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group 7 * 8 * 9 * IDENTIFICATION 10 * src/backend/tsearch/ts_locale.c 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #include "postgres.h" 15 16 #include "catalog/pg_collation.h" 17 #include "storage/fd.h" 18 #include "tsearch/ts_locale.h" 19 #include "tsearch/ts_public.h" 20 21 static void tsearch_readline_callback(void *arg); 22 23 24 /* 25 * The reason these functions use a 3-wchar_t output buffer, not 2 as you 26 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be 27 * getting from char2wchar() is UTF16 not UTF32. A single input character 28 * may therefore produce a surrogate pair rather than just one wchar_t; 29 * we also need room for a trailing null. When we do get a surrogate pair, 30 * we pass just the first code to iswdigit() etc, so that these functions will 31 * always return false for characters outside the Basic Multilingual Plane. 32 */ 33 #define WC_BUF_LEN 3 34 35 int 36 t_isdigit(const char *ptr) 37 { 38 int clen = pg_mblen(ptr); 39 wchar_t character[WC_BUF_LEN]; 40 Oid collation = DEFAULT_COLLATION_OID; /* TODO */ 41 pg_locale_t mylocale = 0; /* TODO */ 42 43 if (clen == 1 || lc_ctype_is_c(collation)) 44 return isdigit(TOUCHAR(ptr)); 45 46 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); 47 48 return iswdigit((wint_t) character[0]); 49 } 50 51 int 52 t_isspace(const char *ptr) 53 { 54 int clen = pg_mblen(ptr); 55 wchar_t character[WC_BUF_LEN]; 56 Oid collation = DEFAULT_COLLATION_OID; /* TODO */ 57 pg_locale_t mylocale = 0; /* TODO */ 58 59 if (clen == 1 || lc_ctype_is_c(collation)) 60 return isspace(TOUCHAR(ptr)); 61 62 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); 63 64 return iswspace((wint_t) character[0]); 65 } 66 67 int 68 t_isalpha(const char *ptr) 69 { 70 int clen = pg_mblen(ptr); 71 wchar_t character[WC_BUF_LEN]; 72 Oid collation = DEFAULT_COLLATION_OID; /* TODO */ 73 pg_locale_t mylocale = 0; /* TODO */ 74 75 if (clen == 1 || lc_ctype_is_c(collation)) 76 return isalpha(TOUCHAR(ptr)); 77 78 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); 79 80 return iswalpha((wint_t) character[0]); 81 } 82 83 int 84 t_isprint(const char *ptr) 85 { 86 int clen = pg_mblen(ptr); 87 wchar_t character[WC_BUF_LEN]; 88 Oid collation = DEFAULT_COLLATION_OID; /* TODO */ 89 pg_locale_t mylocale = 0; /* TODO */ 90 91 if (clen == 1 || lc_ctype_is_c(collation)) 92 return isprint(TOUCHAR(ptr)); 93 94 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); 95 96 return iswprint((wint_t) character[0]); 97 } 98 99 100 /* 101 * Set up to read a file using tsearch_readline(). This facility is 102 * better than just reading the file directly because it provides error 103 * context pointing to the specific line where a problem is detected. 104 * 105 * Expected usage is: 106 * 107 * tsearch_readline_state trst; 108 * 109 * if (!tsearch_readline_begin(&trst, filename)) 110 * ereport(ERROR, 111 * (errcode(ERRCODE_CONFIG_FILE_ERROR), 112 * errmsg("could not open stop-word file \"%s\": %m", 113 * filename))); 114 * while ((line = tsearch_readline(&trst)) != NULL) 115 * process line; 116 * tsearch_readline_end(&trst); 117 * 118 * Note that the caller supplies the ereport() for file open failure; 119 * this is so that a custom message can be provided. The filename string 120 * passed to tsearch_readline_begin() must remain valid through 121 * tsearch_readline_end(). 122 */ 123 bool 124 tsearch_readline_begin(tsearch_readline_state *stp, 125 const char *filename) 126 { 127 if ((stp->fp = AllocateFile(filename, "r")) == NULL) 128 return false; 129 stp->filename = filename; 130 stp->lineno = 0; 131 stp->curline = NULL; 132 /* Setup error traceback support for ereport() */ 133 stp->cb.callback = tsearch_readline_callback; 134 stp->cb.arg = (void *) stp; 135 stp->cb.previous = error_context_stack; 136 error_context_stack = &stp->cb; 137 return true; 138 } 139 140 /* 141 * Read the next line from a tsearch data file (expected to be in UTF-8), and 142 * convert it to database encoding if needed. The returned string is palloc'd. 143 * NULL return means EOF. 144 */ 145 char * 146 tsearch_readline(tsearch_readline_state *stp) 147 { 148 char *result; 149 150 /* Advance line number to use in error reports */ 151 stp->lineno++; 152 153 /* Clear curline, it's no longer relevant */ 154 if (stp->curline) 155 { 156 pfree(stp->curline); 157 stp->curline = NULL; 158 } 159 160 /* Collect next line, if there is one */ 161 result = t_readline(stp->fp); 162 if (!result) 163 return NULL; 164 165 /* 166 * Save a copy of the line for possible use in error reports. (We cannot 167 * just save "result", since it's likely to get pfree'd at some point by 168 * the caller; an error after that would try to access freed data.) 169 */ 170 stp->curline = pstrdup(result); 171 172 return result; 173 } 174 175 /* 176 * Close down after reading a file with tsearch_readline() 177 */ 178 void 179 tsearch_readline_end(tsearch_readline_state *stp) 180 { 181 /* Suppress use of curline in any error reported below */ 182 if (stp->curline) 183 { 184 pfree(stp->curline); 185 stp->curline = NULL; 186 } 187 188 /* Release other resources */ 189 FreeFile(stp->fp); 190 191 /* Pop the error context stack */ 192 error_context_stack = stp->cb.previous; 193 } 194 195 /* 196 * Error context callback for errors occurring while reading a tsearch 197 * configuration file. 198 */ 199 static void 200 tsearch_readline_callback(void *arg) 201 { 202 tsearch_readline_state *stp = (tsearch_readline_state *) arg; 203 204 /* 205 * We can't include the text of the config line for errors that occur 206 * during t_readline() itself. This is only partly a consequence of our 207 * arms-length use of that routine: the major cause of such errors is 208 * encoding violations, and we daren't try to print error messages 209 * containing badly-encoded data. 210 */ 211 if (stp->curline) 212 errcontext("line %d of configuration file \"%s\": \"%s\"", 213 stp->lineno, 214 stp->filename, 215 stp->curline); 216 else 217 errcontext("line %d of configuration file \"%s\"", 218 stp->lineno, 219 stp->filename); 220 } 221 222 223 /* 224 * Read the next line from a tsearch data file (expected to be in UTF-8), and 225 * convert it to database encoding if needed. The returned string is palloc'd. 226 * NULL return means EOF. 227 * 228 * Note: direct use of this function is now deprecated. Go through 229 * tsearch_readline() to provide better error reporting. 230 */ 231 char * 232 t_readline(FILE *fp) 233 { 234 int len; 235 char *recoded; 236 char buf[4096]; /* lines must not be longer than this */ 237 238 if (fgets(buf, sizeof(buf), fp) == NULL) 239 return NULL; 240 241 len = strlen(buf); 242 243 /* Make sure the input is valid UTF-8 */ 244 (void) pg_verify_mbstr(PG_UTF8, buf, len, false); 245 246 /* And convert */ 247 recoded = pg_any_to_server(buf, len, PG_UTF8); 248 if (recoded == buf) 249 { 250 /* 251 * conversion didn't pstrdup, so we must. We can use the length of the 252 * original string, because no conversion was done. 253 */ 254 recoded = pnstrdup(recoded, len); 255 } 256 257 return recoded; 258 } 259 260 /* 261 * lowerstr --- fold null-terminated string to lower case 262 * 263 * Returned string is palloc'd 264 */ 265 char * 266 lowerstr(const char *str) 267 { 268 return lowerstr_with_len(str, strlen(str)); 269 } 270 271 /* 272 * lowerstr_with_len --- fold string to lower case 273 * 274 * Input string need not be null-terminated. 275 * 276 * Returned string is palloc'd 277 */ 278 char * 279 lowerstr_with_len(const char *str, int len) 280 { 281 char *out; 282 Oid collation = DEFAULT_COLLATION_OID; /* TODO */ 283 pg_locale_t mylocale = 0; /* TODO */ 284 285 if (len == 0) 286 return pstrdup(""); 287 288 /* 289 * Use wide char code only when max encoding length > 1 and ctype != C. 290 * Some operating systems fail with multi-byte encodings and a C locale. 291 * Also, for a C locale there is no need to process as multibyte. From 292 * backend/utils/adt/oracle_compat.c Teodor 293 */ 294 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation)) 295 { 296 wchar_t *wstr, 297 *wptr; 298 int wlen; 299 300 /* 301 * alloc number of wchar_t for worst case, len contains number of 302 * bytes >= number of characters and alloc 1 wchar_t for 0, because 303 * wchar2char wants zero-terminated string 304 */ 305 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); 306 307 wlen = char2wchar(wstr, len + 1, str, len, mylocale); 308 Assert(wlen <= len); 309 310 while (*wptr) 311 { 312 *wptr = towlower((wint_t) *wptr); 313 wptr++; 314 } 315 316 /* 317 * Alloc result string for worst case + '\0' 318 */ 319 len = pg_database_encoding_max_length() * wlen + 1; 320 out = (char *) palloc(len); 321 322 wlen = wchar2char(out, wstr, len, mylocale); 323 324 pfree(wstr); 325 326 if (wlen < 0) 327 ereport(ERROR, 328 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), 329 errmsg("conversion from wchar_t to server encoding failed: %m"))); 330 Assert(wlen < len); 331 } 332 else 333 { 334 const char *ptr = str; 335 char *outptr; 336 337 outptr = out = (char *) palloc(sizeof(char) * (len + 1)); 338 while ((ptr - str) < len && *ptr) 339 { 340 *outptr++ = tolower(TOUCHAR(ptr)); 341 ptr++; 342 } 343 *outptr = '\0'; 344 } 345 346 return out; 347 } 348