1 /*--------
2 * Module : multibyte.c
3 *
4 * Description: New Multibyte related additional function.
5 *
6 * Create 2001-03-03 Eiji Tokuya
7 * New Create 2001-09-16 Eiji Tokuya
8 *--------
9 */
10
11 #include "multibyte.h"
12 #include "misc.h"
13 #include "connection.h"
14 #include "pgapifunc.h"
15 #include <string.h>
16 #include <ctype.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #ifndef WIN32
20 #include <locale.h>
21 #endif
22 #ifndef TRUE
23 #define TRUE 1
24 #endif
25
/*
 * One row of a character-set lookup table: an encoding name paired with
 * the integer code that identifies that encoding.
 */
struct pg_CS
{
	char	*name;		/* encoding name */
	int	code;		/* encoding code that belongs to the name */
};
typedef struct pg_CS pg_CS;
31
32 static pg_CS CS_Table[] =
33 {
34 { "SQL_ASCII", SQL_ASCII },
35 { "EUC_JP", EUC_JP },
36 { "EUC_CN", EUC_CN },
37 { "EUC_KR", EUC_KR },
38 { "EUC_TW", EUC_TW },
39 { "JOHAB", JOHAB }, /* since 7.3 */
40 { "UTF8", UTF8 }, /* since 7.2 */
41 { "MULE_INTERNAL",MULE_INTERNAL },
42 { "LATIN1", LATIN1 },
43 { "LATIN2", LATIN2 },
44 { "LATIN3", LATIN3 },
45 { "LATIN4", LATIN4 },
46 { "LATIN5", LATIN5 },
47 { "LATIN6", LATIN6 },
48 { "LATIN7", LATIN7 },
49 { "LATIN8", LATIN8 },
50 { "LATIN9", LATIN9 },
51 { "LATIN10", LATIN10 },
52 { "WIN1256", WIN1256 }, /* Arabic since 7.3 */
53 { "WIN1258", WIN1258 }, /* Vietnamese since 8.1 */
54 { "WIN866", WIN866 }, /* since 8.1 */
55 { "WIN874", WIN874 }, /* Thai since 7.3 */
56 { "KOI8", KOI8R },
57 { "WIN1251", WIN1251 }, /* Cyrillic */
58 { "WIN1252", WIN1252 }, /* Western Europe since 8.1 */
59 { "ISO_8859_5", ISO_8859_5 },
60 { "ISO_8859_6", ISO_8859_6 },
61 { "ISO_8859_7", ISO_8859_7 },
62 { "ISO_8859_8", ISO_8859_8 },
63 { "WIN1250", WIN1250 }, /* Central Europe */
64 { "WIN1253", WIN1253 }, /* Greek since 8.2 */
65 { "WIN1254", WIN1254 }, /* Turkish since 8.2 */
66 { "WIN1255", WIN1255 }, /* Hebrew since 8.2 */
67 { "WIN1257", WIN1257 }, /* Baltic(North Europe) since 8.2 */
68
69 { "EUC_JIS_2004", EUC_JIS_2004}, /* EUC for SHIFT-JIS-2004 Japanese, since 8.3 */
70 { "SJIS", SJIS },
71 { "BIG5", BIG5 },
72 { "GBK", GBK }, /* since 7.3 */
73 { "UHC", UHC }, /* since 7.3 */
74 { "GB18030", GB18030 }, /* since 7.3 */
75 { "SHIFT_JIS_2004", SHIFT_JIS_2004 }, /* SHIFT-JIS-2004 Japanese, standard JIS X 0213, since 8.3 */
76 { "OTHER", OTHER }
77 };
78
79 static pg_CS CS_Alias[] =
80 {
81 { "UNICODE", UTF8 },
82 { "TCVN", WIN1258 },
83 { "ALT", WIN866 },
84 { "WIN", WIN1251 },
85 { "KOI8R", KOI8R },
86 { "OTHER", OTHER }
87 };
88
89 int
pg_CS_code(const char * characterset_string)90 pg_CS_code(const char *characterset_string)
91 {
92 int i, c = -1;
93
94 for(i = 0; CS_Table[i].code != OTHER; i++)
95 {
96 if (0 == stricmp(characterset_string, CS_Table[i].name))
97 {
98 c = CS_Table[i].code;
99 break;
100 }
101 }
102 if (c < 0)
103 {
104 for(i = 0; CS_Alias[i].code != OTHER; i++)
105 {
106 if (0 == stricmp(characterset_string, CS_Alias[i].name))
107 {
108 c = CS_Alias[i].code;
109 break;
110 }
111 }
112 }
113 if (c < 0)
114 c = OTHER;
115 return (c);
116 }
117
118 char *
check_client_encoding(const pgNAME conn_settings)119 check_client_encoding(const pgNAME conn_settings)
120 {
121 const char *cptr, *sptr = NULL;
122 char *rptr;
123 BOOL allowed_cmd = TRUE, in_quote = FALSE;
124 int step = 0;
125 size_t len = 0;
126
127 if (NAME_IS_NULL(conn_settings))
128 return NULL;
129 for (cptr = SAFE_NAME(conn_settings); *cptr; cptr++)
130 {
131 if (in_quote)
132 {
133 if (LITERAL_QUOTE == *cptr)
134 {
135 in_quote = FALSE;
136 continue;
137 }
138 }
139 if (';' == *cptr)
140 {
141 allowed_cmd = TRUE;
142 step = 0;
143 continue;
144 }
145 if (!allowed_cmd)
146 continue;
147 if (isspace((unsigned char) *cptr))
148 continue;
149 switch (step)
150 {
151 case 0:
152 if (0 != strnicmp(cptr, "set", 3))
153 {
154 allowed_cmd = FALSE;
155 continue;
156 }
157 step++;
158 cptr += 3;
159 break;
160 case 1:
161 if (0 != strnicmp(cptr, "client_encoding", 15))
162 {
163 allowed_cmd = FALSE;
164 continue;
165 }
166 step++;
167 cptr += 15;
168 if ('=' == *cptr)
169 cptr--;
170 break;
171 case 2:
172 if (0 == strnicmp(cptr, "to", 2))
173 cptr += 2;
174 else if (0 == strnicmp(cptr, "=", 1))
175 ;
176 else
177 {
178 allowed_cmd = FALSE;
179 continue;
180 }
181 step++;
182 break;
183 case 3:
184 if (LITERAL_QUOTE == *cptr)
185 {
186 cptr++;
187 for (sptr = cptr; *cptr && *cptr != LITERAL_QUOTE; cptr++) ;
188 }
189 else
190 {
191 for (sptr = cptr; ';' != *cptr && IS_NOT_SPACE(*cptr); cptr++) ;
192 }
193 len = cptr - sptr;
194 if (';' == *cptr)
195 cptr--;
196 step++;
197 break;
198 }
199 }
200 if (!sptr)
201 return NULL;
202 rptr = malloc(len + 1);
203 if (!rptr)
204 return NULL;
205 memcpy(rptr, sptr, len);
206 rptr[len] = '\0';
207 MYLOG(0, "extracted a client_encoding '%s' from conn_settings\n", rptr);
208 return rptr;
209 }
210
211 int
pg_mb_maxlen(int characterset_code)212 pg_mb_maxlen(int characterset_code)
213 {
214 switch (characterset_code)
215 {
216 case UTF8:
217 return 4;
218 case EUC_TW:
219 return 4;
220 case EUC_JIS_2004:
221 case EUC_JP:
222 case GB18030:
223 return 3;
224 case SHIFT_JIS_2004:
225 case SJIS:
226 case BIG5:
227 case GBK:
228 case UHC:
229 case EUC_CN:
230 case EUC_KR:
231 case JOHAB:
232 return 2;
233 default:
234 return 1;
235 }
236 }
237
238 static int
pg_CS_stat(int stat,unsigned int character,int characterset_code)239 pg_CS_stat(int stat,unsigned int character,int characterset_code)
240 {
241 if (character == 0)
242 stat = 0;
243 switch (characterset_code)
244 {
245 case UTF8:
246 {
247 if (stat < 2 &&
248 character >= 0x80)
249 {
250 if (character >= 0xfc)
251 stat = 6;
252 else if (character >= 0xf8)
253 stat = 5;
254 else if (character >= 0xf0)
255 stat = 4;
256 else if (character >= 0xe0)
257 stat = 3;
258 else if (character >= 0xc0)
259 stat = 2;
260 }
261 else if (stat >= 2 &&
262 character > 0x7f)
263 stat--;
264 else
265 stat=0;
266 }
267 break;
268 /* SHIFT_JIS_2004 Support. */
269 case SHIFT_JIS_2004:
270 {
271 if (stat < 2 &&
272 character >= 0x81 && character <= 0x9f)
273 stat = 2;
274 else if (stat < 2 &&
275 character >= 0xe0 && character <= 0xef)
276 stat = 2;
277 else if (stat < 2 &&
278 character >= 0xf0 && character <= 0xfc)
279 stat = 2;
280 else if (stat == 2)
281 stat = 1;
282 else
283 stat = 0;
284 }
285 break;
286 /* Shift-JIS Support. */
287 case SJIS:
288 {
289 if (stat < 2 &&
290 character > 0x80 &&
291 !(character > 0x9f &&
292 character < 0xe0))
293 stat = 2;
294 else if (stat == 2)
295 stat = 1;
296 else
297 stat = 0;
298 }
299 break;
300 /* Chinese Big5 Support. */
301 case BIG5:
302 {
303 if (stat < 2 &&
304 character > 0xA0)
305 stat = 2;
306 else if (stat == 2)
307 stat = 1;
308 else
309 stat = 0;
310 }
311 break;
312 /* Chinese GBK Support. */
313 case GBK:
314 {
315 if (stat < 2 &&
316 character > 0x7F)
317 stat = 2;
318 else if (stat == 2)
319 stat = 1;
320 else
321 stat = 0;
322 }
323 break;
324
325 /* Korian UHC Support. */
326 case UHC:
327 {
328 if (stat < 2 &&
329 character > 0x7F)
330 stat = 2;
331 else if (stat == 2)
332 stat = 1;
333 else
334 stat = 0;
335 }
336 break;
337
338 case EUC_JIS_2004:
339 /* 0x8f is JIS X 0212 + JIS X 0213(2) 3 byte */
340 /* 0x8e is JIS X 0201 2 byte */
341 /* 0xa0-0xff is JIS X 0213(1) 2 byte */
342 case EUC_JP:
343 /* 0x8f is JIS X 0212 3 byte */
344 /* 0x8e is JIS X 0201 2 byte */
345 /* 0xa0-0xff is JIS X 0208 2 byte */
346 {
347 if (stat < 3 &&
348 character == 0x8f) /* JIS X 0212 */
349 stat = 3;
350 else
351 if (stat != 2 &&
352 (character == 0x8e ||
353 character > 0xa0)) /* Half Katakana HighByte & Kanji HighByte */
354 stat = 2;
355 else if (stat == 2)
356 stat = 1;
357 else
358 stat = 0;
359 }
360 break;
361
362 /* EUC_CN, EUC_KR, JOHAB Support */
363 case EUC_CN:
364 case EUC_KR:
365 case JOHAB:
366 {
367 if (stat < 2 &&
368 character > 0xa0)
369 stat = 2;
370 else if (stat == 2)
371 stat = 1;
372 else
373 stat = 0;
374 }
375 break;
376 case EUC_TW:
377 {
378 if (stat < 4 &&
379 character == 0x8e)
380 stat = 4;
381 else if (stat == 4 &&
382 character > 0xa0)
383 stat = 3;
384 else if ((stat == 3 ||
385 stat < 2) &&
386 character > 0xa0)
387 stat = 2;
388 else if (stat == 2)
389 stat = 1;
390 else
391 stat = 0;
392 }
393 break;
394 /*Chinese GB18030 support.Added by Bill Huang <bhuang@redhat.com> <bill_huanghb@ybb.ne.jp>*/
395 case GB18030:
396 {
397 if (stat < 2 && character > 0x80)
398 stat = 2;
399 else if (stat == 2)
400 {
401 if (character >= 0x30 && character <= 0x39)
402 stat = 3;
403 else
404 stat = 1;
405 }
406 else if (stat == 3)
407 {
408 if (character >= 0x30 && character <= 0x39)
409 stat = 1;
410 else
411 stat = 3;
412 }
413 else
414 stat = 0;
415 }
416 break;
417 default:
418 {
419 stat = 0;
420 }
421 break;
422 }
423 return stat;
424 }
425
/*
 * Determine the encoding name that corresponds to the environment the
 * process is running in.
 *
 * The PGCLIENTENCODING environment variable takes precedence and is
 * returned as is when it is set.
 *
 * On Windows the name is chosen from the ANSI code page reported by
 * GetACP().  Nothing is chosen when the code page is in 1251-1258 and
 * dbencoding is "SQL_ASCII", nor for code page 1252 when dbencoding
 * starts with "LATIN".
 *
 * Elsewhere setlocale(LC_CTYPE, "") is called, which also sets the
 * program's LC_CTYPE locale, and the part of the returned locale name
 * after the first '.' is looked up with pg_char_to_encoding().
 * dbencoding is not used on this path.
 *
 * Returns the encoding name, or NULL when none could be determined.
 */
const char *
derive_locale_encoding(const char *dbencoding)
{
	const char *wenc;
#ifdef WIN32
	int	acp;
#else
	const char *loc, *ptr;
#endif /* WIN32 */

	/* The environment variable wins over everything else. */
	wenc = getenv("PGCLIENTENCODING");
	if (NULL != wenc)
		return wenc;
#ifdef WIN32
	acp = GetACP();
	if (acp >= 1251 && acp <= 1258 &&
	    0 == stricmp(dbencoding, "SQL_ASCII"))
		return wenc;
	switch (acp)
	{
		case 932:
			wenc = "SJIS";
			break;
		case 936:
			wenc = "GBK";
			break;
		case 949:
			wenc = "UHC";
			break;
		case 950:
			wenc = "BIG5";
			break;
		case 1250:
			wenc = "WIN1250";
			break;
		case 1251:
			wenc = "WIN1251";
			break;
		case 1252:
			if (0 != strnicmp(dbencoding, "LATIN", 5))
				wenc = "WIN1252";
			break;
		case 1253:
			wenc = "WIN1253";
			break;
		case 1254:
			wenc = "WIN1254";
			break;
		case 1255:
			wenc = "WIN1255";
			break;
		case 1256:
			wenc = "WIN1256";
			break;
		case 1257:
			wenc = "WIN1257";
			break;
		case 1258:
			wenc = "WIN1258";
			break;
		default:
			break;
	}
#else
	/*
	 * Derive the encoding from the codeset part of the current locale.
	 */
	loc = setlocale(LC_CTYPE, "");
	ptr = (NULL != loc) ? strchr(loc, '.') : NULL;
	if (NULL != ptr)
	{
		int	enc_no = pg_char_to_encoding(ptr + 1);

		if (enc_no >= 0)
			wenc = pg_encoding_to_char(enc_no);
		MYLOG(0, "locale=%s enc=%s\n", loc, wenc ? wenc : "(null)");
	}
#endif /* WIN32 */
	return wenc;
}
510
/*
 * Set up *encstr for scanning the byte string str in the encoding
 * identified by ccsc: the position starts at -1 (before the first byte)
 * and the multibyte state starts at 0.  The string is referenced, not
 * copied.
 */
void
encoded_str_constr(encoded_str *encstr, int ccsc, const char *str)
{
	encstr->encstr = (const UCHAR *) str;
	encstr->ccsc = ccsc;
	encstr->ccst = 0;
	encstr->pos = -1;
}
/*
 * Move to the next byte of the string and update the multibyte state
 * for it.
 *
 * Returns the byte now at the current position.  Once the current
 * position holds a zero byte, 0 is returned and neither the position
 * nor the state changes any more.
 */
int
encoded_nextchar(encoded_str *encstr)
{
	int	next;

	if (encstr->pos >= 0 && 0 == encstr->encstr[encstr->pos])
		return 0;
	encstr->pos++;
	next = encstr->encstr[encstr->pos];
	encstr->ccst = pg_CS_stat(encstr->ccst, (unsigned int) next, encstr->ccsc);
	return next;
}
/*
 * Add shift to the current position and return the resulting position.
 * The multibyte state is left untouched.
 */
ssize_t
encoded_position_shift(encoded_str *encstr, size_t shift)
{
	return encstr->pos += shift;
}
/*
 * Move the current position to abspos and update the multibyte state
 * for the byte found there, starting from the state held in *encstr.
 *
 * Returns the byte at abspos.
 */
int
encoded_byte_check(encoded_str *encstr, size_t abspos)
{
	int	byte;

	encstr->pos = abspos;
	byte = encstr->encstr[encstr->pos];
	encstr->ccst = pg_CS_stat(encstr->ccst, (unsigned int) byte, encstr->ccsc);
	return byte;
}
541