1 /*--------
2  * Module :			multibyte.c
3  *
4  * Description:		New Multibyte related additional function.
5  *
6  *					Create 2001-03-03 Eiji Tokuya
7  *					New Create 2001-09-16 Eiji Tokuya
8  *--------
9  */
10 
11 #include "multibyte.h"
12 #include "misc.h"
13 #include "connection.h"
14 #include "pgapifunc.h"
15 #include <string.h>
16 #include <ctype.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #ifndef	WIN32
20 #include <locale.h>
21 #endif
22 #ifndef	TRUE
23 #define	TRUE	1
24 #endif
25 
/*
 *	One entry of a character set lookup table: an encoding name
 *	paired with its numeric encoding code.
 */
typedef struct pg_CS
{
	char   *name;	/* encoding name, matched case-insensitively by pg_CS_code() */
	int		code;	/* encoding code associated with the name */
} pg_CS;
31 
/*
 *	Primary table of encoding names and their codes.
 *
 *	pg_CS_code() scans this table from the top and stops at the entry
 *	whose code is OTHER, so that entry must stay last.
 */
static pg_CS CS_Table[] =
{
	{ "SQL_ASCII",	SQL_ASCII },
	{ "EUC_JP",	EUC_JP },
	{ "EUC_CN",	EUC_CN },
	{ "EUC_KR",	EUC_KR },
	{ "EUC_TW",	EUC_TW },
	{ "JOHAB",	JOHAB },	/* since 7.3 */
	{ "UTF8",	UTF8 },		/* since 7.2 */
	{ "MULE_INTERNAL",MULE_INTERNAL },
	{ "LATIN1",	LATIN1 },
	{ "LATIN2",	LATIN2 },
	{ "LATIN3",	LATIN3 },
	{ "LATIN4",	LATIN4 },
	{ "LATIN5",	LATIN5 },
	{ "LATIN6",	LATIN6 },
	{ "LATIN7",	LATIN7 },
	{ "LATIN8",	LATIN8 },
	{ "LATIN9",	LATIN9 },
	{ "LATIN10",	LATIN10 },
	{ "WIN1256",	WIN1256 },	/* Arabic since 7.3 */
	{ "WIN1258",	WIN1258 },	/* Vietnamese since 8.1 */
	{ "WIN866",	WIN866 },	/* since 8.1 */
	{ "WIN874",	WIN874 },	/* Thai since 7.3 */
	{ "KOI8",	KOI8R },	/* the name "KOI8" maps to the KOI8R code */
	{ "WIN1251",	WIN1251 },	/* Cyrillic */
	{ "WIN1252",	WIN1252 },	/* Western Europe since 8.1 */
	{ "ISO_8859_5", ISO_8859_5 },
	{ "ISO_8859_6", ISO_8859_6 },
	{ "ISO_8859_7", ISO_8859_7 },
	{ "ISO_8859_8", ISO_8859_8 },
	{ "WIN1250",	WIN1250 },	/* Central Europe */
	{ "WIN1253",	WIN1253 },	/* Greek since 8.2 */
	{ "WIN1254",	WIN1254 },	/* Turkish since 8.2 */
	{ "WIN1255",	WIN1255 },	/* Hebrew since 8.2 */
	{ "WIN1257",	WIN1257 },	/* Baltic (North Europe) since 8.2 */

	{ "EUC_JIS_2004", EUC_JIS_2004},	/* EUC for SHIFT-JIS-2004 Japanese, since 8.3 */
	{ "SJIS",	SJIS },
	{ "BIG5",	BIG5 },
	{ "GBK",	GBK },		/* since 7.3 */
	{ "UHC",	UHC },		/* since 7.3 */
	{ "GB18030",	GB18030 },	/* since 7.3 */
	{ "SHIFT_JIS_2004", SHIFT_JIS_2004 },	/* SHIFT-JIS-2004 Japanese, standard JIS X 0213, since 8.3 */
	{ "OTHER",	OTHER }		/* terminator: the lookup loop ends here */
};
78 
/*
 *	Alternative encoding names.  pg_CS_code() consults this table only
 *	when a name was not found in CS_Table.  As in CS_Table, the entry
 *	whose code is OTHER ends the scan and must stay last.
 */
static pg_CS CS_Alias[] =
{
	{ "UNICODE",	UTF8 },
	{ "TCVN",	WIN1258 },
	{ "ALT",	WIN866 },
	{ "WIN",	WIN1251 },
	{ "KOI8R",	KOI8R },
	{ "OTHER",	OTHER }		/* terminator: the lookup loop ends here */
};
88 
89 int
pg_CS_code(const char * characterset_string)90 pg_CS_code(const char *characterset_string)
91 {
92 	int i, c = -1;
93 
94 	for(i = 0; CS_Table[i].code != OTHER; i++)
95 	{
96 		if (0 == stricmp(characterset_string, CS_Table[i].name))
97 		{
98 			c = CS_Table[i].code;
99 			break;
100 		}
101 	}
102 	if (c < 0)
103 	{
104 		for(i = 0; CS_Alias[i].code != OTHER; i++)
105 		{
106 			if (0 == stricmp(characterset_string, CS_Alias[i].name))
107 			{
108 				c = CS_Alias[i].code;
109 				break;
110 			}
111 		}
112 	}
113 	if (c < 0)
114 		c = OTHER;
115 	return (c);
116 }
117 
118 char *
check_client_encoding(const pgNAME conn_settings)119 check_client_encoding(const pgNAME conn_settings)
120 {
121 	const char *cptr, *sptr = NULL;
122 	char   *rptr;
123 	BOOL	allowed_cmd = TRUE, in_quote = FALSE;
124 	int	step = 0;
125 	size_t	len = 0;
126 
127 	if (NAME_IS_NULL(conn_settings))
128 		return NULL;
129 	for (cptr = SAFE_NAME(conn_settings); *cptr; cptr++)
130 	{
131 		if (in_quote)
132 		{
133 			if (LITERAL_QUOTE == *cptr)
134 			{
135 				in_quote = FALSE;
136 				continue;
137 			}
138 		}
139 		if (';' == *cptr)
140 		{
141 			allowed_cmd = TRUE;
142 			step = 0;
143 			continue;
144 		}
145 		if (!allowed_cmd)
146 			continue;
147 		if (isspace((unsigned char) *cptr))
148 			continue;
149 		switch (step)
150 		{
151 			case 0:
152 				if (0 != strnicmp(cptr, "set", 3))
153 				{
154 					allowed_cmd = FALSE;
155 					continue;
156 				}
157 				step++;
158 				cptr += 3;
159 				break;
160 			case 1:
161 				if (0 != strnicmp(cptr, "client_encoding", 15))
162 				{
163 					allowed_cmd = FALSE;
164 					continue;
165 				}
166 				step++;
167 				cptr += 15;
168 				if ('=' == *cptr)
169 					cptr--;
170 				break;
171 			case 2:
172 				if (0 == strnicmp(cptr, "to", 2))
173 					cptr += 2;
174 				else if (0 == strnicmp(cptr, "=", 1))
175 					;
176 				else
177 				{
178 					allowed_cmd = FALSE;
179 					continue;
180 				}
181 				step++;
182 				break;
183 			case 3:
184 				if (LITERAL_QUOTE == *cptr)
185 				{
186 					cptr++;
187 					for (sptr = cptr; *cptr && *cptr != LITERAL_QUOTE; cptr++) ;
188 				}
189 				else
190 				{
191 					for (sptr = cptr; ';' != *cptr && IS_NOT_SPACE(*cptr); cptr++) ;
192 				}
193 				len = cptr - sptr;
194 				if (';' == *cptr)
195 					cptr--;
196 				step++;
197 				break;
198 		}
199 	}
200 	if (!sptr)
201 		return NULL;
202 	rptr = malloc(len + 1);
203 	if (!rptr)
204 		return NULL;
205 	memcpy(rptr, sptr, len);
206 	rptr[len] = '\0';
207 	MYLOG(0, "extracted a client_encoding '%s' from conn_settings\n", rptr);
208 	return rptr;
209 }
210 
211 int
pg_mb_maxlen(int characterset_code)212 pg_mb_maxlen(int characterset_code)
213 {
214 	switch (characterset_code)
215 	{
216 		case UTF8:
217 			return 4;
218 		case EUC_TW:
219 			return 4;
220 		case EUC_JIS_2004:
221 		case EUC_JP:
222 		case GB18030:
223 			return 3;
224 		case SHIFT_JIS_2004:
225 		case SJIS:
226 		case BIG5:
227 		case GBK:
228 		case UHC:
229 		case EUC_CN:
230 		case EUC_KR:
231 		case JOHAB:
232 			return 2;
233 		default:
234 			return 1;
235 	}
236 }
237 
/*
 *	Advance the multibyte scanning state by one byte.
 *
 *	stat				state returned for the previous byte (0 at the start)
 *	character			the byte being examined
 *	characterset_code	encoding code selecting the rules to apply
 *
 *	Returns the new state:
 *		0	the byte is not part of a multibyte sequence being tracked
 *		1	the byte ends a multibyte character
 *		>=2	the byte belongs to a multibyte character and more bytes follow
 *	A zero byte always resets the incoming state before the rules are applied.
 *	Encodings without a case below always yield 0.
 */
static int
pg_CS_stat(int stat,unsigned int character,int characterset_code)
{
	if (character == 0)
		stat = 0;
	switch (characterset_code)
	{
		case UTF8:
			{
				/*
				 * Outside a sequence the lead byte determines the state.
				 * Bytes 0x80-0xbf leave the state unchanged in this branch.
				 */
				if (stat < 2 &&
					character >= 0x80)
				{
					if (character >= 0xfc)
						stat = 6;
					else if (character >= 0xf8)
						stat = 5;
					else if (character >= 0xf0)
						stat = 4;
					else if (character >= 0xe0)
						stat = 3;
					else if (character >= 0xc0)
						stat = 2;
				}
				/* inside a sequence every non-ASCII byte counts down by one */
				else if (stat >= 2 &&
					character > 0x7f)
					stat--;
				else
					stat=0;
			}
			break;
/* SHIFT_JIS_2004 support: lead bytes are 0x81-0x9f, 0xe0-0xef and 0xf0-0xfc. */
			case SHIFT_JIS_2004:
			{
				if (stat < 2 &&
					character >= 0x81 && character <= 0x9f)
					stat = 2;
				else if (stat < 2 &&
					character >= 0xe0 && character <= 0xef)
					stat = 2;
				else if (stat < 2 &&
					character >= 0xf0 && character <= 0xfc)
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;
/* Shift-JIS support: any byte above 0x80 except 0xa0-0xdf starts a 2 byte character. */
			case SJIS:
			{
				if (stat < 2 &&
					character > 0x80 &&
					!(character > 0x9f &&
					character < 0xe0))
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;
/* Chinese Big5 support: a byte above 0xa0 starts a 2 byte character. */
		case BIG5:
			{
				if (stat < 2 &&
					character > 0xA0)
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;
/* Chinese GBK support: a byte above 0x7f starts a 2 byte character. */
		case GBK:
			{
				if (stat < 2 &&
					character > 0x7F)
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;

/* Korean UHC support: a byte above 0x7f starts a 2 byte character. */
		case UHC:
			{
				if (stat < 2 &&
					character > 0x7F)
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;

		case EUC_JIS_2004:
			/* 0x8f is JIS X 0212 + JIS X 0213(2) 3 byte */
			/* 0x8e is JIS X 0201 2 byte */
			/* 0xa0-0xff is JIS X 0213(1) 2 byte */
		case EUC_JP:
			/* 0x8f is JIS X 0212 3 byte */
			/* 0x8e is JIS X 0201 2 byte */
			/* 0xa0-0xff is JIS X 0208 2 byte */
			{
				if (stat < 3 &&
					character == 0x8f)	/* JIS X 0212 */
					stat = 3;
				else
				if (stat != 2 &&
					(character == 0x8e ||
					character > 0xa0))	/* half-width Katakana high byte & Kanji high byte */
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;

/* EUC_CN, EUC_KR, JOHAB support: a byte above 0xa0 starts a 2 byte character. */
		case EUC_CN:
		case EUC_KR:
		case JOHAB:
			{
				if (stat < 2 &&
					character > 0xa0)
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;
		case EUC_TW:
			{
				/* 0x8e opens a 4 byte sequence which counts down 4, 3, 2, 1 */
				if (stat < 4 &&
					character == 0x8e)
					stat = 4;
				else if (stat == 4 &&
					character > 0xa0)
					stat = 3;
				else if ((stat == 3 ||
					stat < 2) &&
					character > 0xa0)
					stat = 2;
				else if (stat == 2)
					stat = 1;
				else
					stat = 0;
			}
			break;
			/* Chinese GB18030 support. Added by Bill Huang (bhuang@redhat.com, bill_huanghb@ybb.ne.jp) */
		case GB18030:
			{
				if (stat < 2 && character > 0x80)
					stat = 2;
				else if (stat == 2)
				{
					/* a digit (0x30-0x39) after the lead byte continues the sequence */
					if (character >= 0x30 && character <= 0x39)
						stat = 3;
					else
						stat = 1;
				}
				else if (stat == 3)
				{
					/* state 3 is kept until the next digit, which ends the character */
					if (character >= 0x30 && character <= 0x39)
						stat = 1;
					else
						stat = 3;
				}
				else
					stat = 0;
			}
			break;
		default:
			{
				stat = 0;
			}
			break;
	}
	return stat;
}
425 
/*
 *	Determine the client encoding name that corresponds to the
 *	environment the driver is running in.
 *
 *	The PGCLIENTENCODING environment variable, when set, is returned
 *	as is.  Otherwise:
 *	- on Windows the name is chosen from the ANSI code page reported by
 *	  GetACP().  Nothing is derived for code pages 1251-1258 when
 *	  dbencoding is "SQL_ASCII", nor for code page 1252 when dbencoding
 *	  starts with "LATIN".
 *	- elsewhere setlocale(LC_CTYPE, "") is called and the codeset part
 *	  of the locale name (the text after '.') is converted with
 *	  pg_char_to_encoding() / pg_encoding_to_char().
 *
 *	Returns the encoding name, or NULL when none could be derived.
 *	The returned string is not allocated by this function.
 */
const char *
derive_locale_encoding(const char *dbencoding)
{
	const char *encoding;
#ifdef	WIN32
	int	codepage;
#else
	const char *locale_name, *codeset;
	int	enc_no;
#endif /* WIN32 */

	/* the environment variable takes precedence over everything else */
	encoding = getenv("PGCLIENTENCODING");
	if (NULL != encoding)
		return encoding;

#ifdef	WIN32
	codepage = GetACP();
	if (1251 <= codepage && codepage <= 1258 &&
		0 == stricmp(dbencoding, "SQL_ASCII"))
		return NULL;
	switch (codepage)
	{
		case 932:
			return "SJIS";
		case 936:
			return "GBK";
		case 949:
			return "UHC";
		case 950:
			return "BIG5";
		case 1250:
			return "WIN1250";
		case 1251:
			return "WIN1251";
		case 1252:
			/* nothing is derived for LATINx databases */
			if (0 == strnicmp(dbencoding, "LATIN", 5))
				return NULL;
			return "WIN1252";
		case 1253:
			return "WIN1253";
		case 1254:
			return "WIN1254";
		case 1255:
			return "WIN1255";
		case 1256:
			return "WIN1256";
		case 1257:
			return "WIN1257";
		case 1258:
			return "WIN1258";
		default:
			break;
	}
#else
	/*
	 *	Derive the encoding from the codeset part of the current locale.
	 */
	locale_name = setlocale(LC_CTYPE, "");
	codeset = (NULL != locale_name) ? strchr(locale_name, '.') : NULL;
	if (NULL != codeset)
	{
		enc_no = pg_char_to_encoding(codeset + 1);	/* skip the '.' */
		if (enc_no >= 0)
			encoding = pg_encoding_to_char(enc_no);
		MYLOG(0, "locale=%s enc=%s\n", locale_name, encoding ? encoding : "(null)");
	}
#endif /* WIN32 */
	return encoding;
}
510 
/*
 *	Prepare encstr for scanning the string str, which is encoded in the
 *	character set identified by ccsc.  The position is set to -1 so that
 *	the first encoded_nextchar() call delivers the byte at index 0.
 */
void
encoded_str_constr(encoded_str *encstr, int ccsc, const char *str)
{
	/* scanning state: no byte read yet, no multibyte sequence in progress */
	encstr->pos = -1;
	encstr->ccst = 0;

	/* the string to scan and its encoding */
	encstr->encstr = (const UCHAR *) str;
	encstr->ccsc = ccsc;
}
/*
 *	Advance encstr by one byte and return that byte.
 *
 *	The multibyte state (ccst) is updated for the returned byte.  When
 *	the current position already holds the terminating zero byte, the
 *	position is left alone and 0 is returned.
 */
int
encoded_nextchar(encoded_str *encstr)
{
	int	chr;

	/* already sitting on the terminator: do not move any further */
	if (encstr->pos >= 0 && 0 == encstr->encstr[encstr->pos])
		return 0;

	encstr->pos++;
	chr = encstr->encstr[encstr->pos];
	encstr->ccst = pg_CS_stat(encstr->ccst, (unsigned int) chr, encstr->ccsc);
	return chr;
}
encoded_position_shift(encoded_str * encstr,size_t shift)528 ssize_t encoded_position_shift(encoded_str *encstr, size_t shift)
529 {
530 	encstr->pos += shift;
531 	return encstr->pos;
532 }
/*
 *	Set the position of encstr to abspos and return the byte found there.
 *
 *	The multibyte state (ccst) is advanced with that byte, starting from
 *	the state left behind by the previous call.
 */
int
encoded_byte_check(encoded_str *encstr, size_t abspos)
{
	int	chr;

	encstr->pos = abspos;
	chr = encstr->encstr[encstr->pos];
	encstr->ccst = pg_CS_stat(encstr->ccst, (unsigned int) chr, encstr->ccsc);
	return chr;
}
541