1 /* FreeTDS - Library of routines accessing Sybase and Microsoft databases
2  * Copyright (C) 2003, 2004  James K. Lowden, based on original work by Brian Bruns
3  * Copyright (C) 2011 Frediano Ziglio
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with this library; if not, write to the
17  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18  * Boston, MA 02111-1307, USA.
19  */
20 
21 /**
22  * \file
23  * This file implements a very simple iconv.
24  * Its purpose is to allow ASCII clients to communicate with Microsoft servers
25  * that encode their metadata in Unicode (UTF-16).
26  *
27  * It supports ISO-8859-1, ASCII, CP1252, UTF-16, UCS-4 and UTF-8
28  */
29 
30 #include <config.h>
31 
32 #if ! HAVE_ICONV
33 
34 #if HAVE_STRING_H
35 #include <string.h>
36 #endif /* HAVE_STRING_H */
37 #if HAVE_ERRNO_H
38 #include <errno.h>
39 #endif
40 
41 #include <assert.h>
42 #include <ctype.h>
43 
44 #include <freetds/tds.h>
45 #include <freetds/bytes.h>
46 #include <freetds/iconv.h>
47 #include <freetds/utils/bjoern-utf8.h>
48 
49 #include "iconv_charsets.h"
50 
51 /**
52  * \addtogroup conv
53  * @{
54  */
55 
/**
 * Conversion descriptor values that do not encode a from/to charset pair.
 */
enum ICONV_CD_VALUE
{
	/* descriptor used when both charsets are the same encoding */
	Like_to_Like = 0x100
};

/** One decoded character, as handed from a get_* routine to a put_* routine. */
typedef uint32_t ICONV_CHAR;
62 
/*
 * Return values for get_*:
 * - >0 number of bytes read
 * - -EINVAL not enough data to read
 * - -EILSEQ invalid encoding detected
 * Return values for put_*:
 * - >0 number of bytes written
 * - -E2BIG no space left on output
 * - -EILSEQ character can't be encoded in output charset
 */
73 
74 static int
get_utf8(const unsigned char * p,size_t len,ICONV_CHAR * out)75 get_utf8(const unsigned char *p, size_t len, ICONV_CHAR *out)
76 {
77 	uint32_t uc, state = UTF8_ACCEPT;
78 	size_t l = 1;
79 
80 	do {
81 		switch (decode_utf8(&state, &uc, *p++)) {
82 		case UTF8_ACCEPT:
83 			*out = uc;
84 			return l;
85 		case UTF8_REJECT:
86 			return -EILSEQ;
87 		}
88 	} while (l++ < len);
89 	return -EINVAL;
90 }
91 
/**
 * Encode one character as UTF-8.
 *
 * Values up to 0x7fffffff are accepted; anything above U+1FFFFF is
 * written in the 5 or 6 byte form.
 *
 * \param buf      output buffer
 * \param buf_len  bytes available in \a buf
 * \param c        character to encode
 * \return number of bytes written (1..6),
 *         -E2BIG if \a buf is too small,
 *         -EILSEQ if \a c has bit 31 set
 */
static int
put_utf8(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
{
	size_t o_len, i;

	/* plain ASCII, single byte */
	if (c < 0x80u) {
		if (buf_len < 1)
			return -E2BIG;
		*buf = (unsigned char) c;
		return 1;
	}

	/*
	 * Pick the sequence length from the number of significant bits
	 * (11, 16, 21, 26, 31). Every range terminates here: the previous
	 * open-ended loop never left for values needing 6 bytes.
	 */
	if (c < 0x800u)
		o_len = 2;
	else if (c < 0x10000u)
		o_len = 3;
	else if (c < 0x200000u)
		o_len = 4;
	else if (c < 0x4000000u)
		o_len = 5;
	else if (c < 0x80000000u)
		o_len = 6;
	else
		return -EILSEQ;

	if (buf_len < o_len)
		return -E2BIG;

	/* continuation bytes, last to first, 6 bits each */
	for (i = o_len - 1; i > 0; --i) {
		buf[i] = (unsigned char) (0x80u | (c & 0x3fu));
		c >>= 6;
	}
	/* lead byte: o_len high bits set, then the remaining bits of c */
	buf[0] = (unsigned char) ((0xff00u >> o_len) | c);
	return (int) o_len;
}
134 
135 static int
get_ucs4le(const unsigned char * p,size_t len,ICONV_CHAR * out)136 get_ucs4le(const unsigned char *p, size_t len, ICONV_CHAR *out)
137 {
138 	TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 3) == 0));
139 
140 	if (len < 4)
141 		return -EINVAL;
142 	*out = TDS_GET_A4LE(p);
143 	return 4;
144 }
145 
146 static int
put_ucs4le(unsigned char * buf,size_t buf_len,ICONV_CHAR c)147 put_ucs4le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
148 {
149 	TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 3) == 0));
150 
151 	if (buf_len < 4)
152 		return -E2BIG;
153 	TDS_PUT_A4LE(buf, c);
154 	return 4;
155 }
156 
157 static int
get_ucs4be(const unsigned char * p,size_t len,ICONV_CHAR * out)158 get_ucs4be(const unsigned char *p, size_t len, ICONV_CHAR *out)
159 {
160 	TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 3) == 0));
161 
162 	if (len < 4)
163 		return -EINVAL;
164 	*out = TDS_GET_A4BE(p);
165 	return 4;
166 }
167 
168 static int
put_ucs4be(unsigned char * buf,size_t buf_len,ICONV_CHAR c)169 put_ucs4be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
170 {
171 	TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 3) == 0));
172 
173 	if (buf_len < 4)
174 		return -E2BIG;
175 	TDS_PUT_A4BE(buf, c);
176 	return 4;
177 }
178 
179 static int
get_utf16le(const unsigned char * p,size_t len,ICONV_CHAR * out)180 get_utf16le(const unsigned char *p, size_t len, ICONV_CHAR *out)
181 {
182 	ICONV_CHAR c, c2;
183 
184 	TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 1) == 0));
185 
186 	if (len < 2)
187 		return -EINVAL;
188 	c = TDS_GET_A2LE(p);
189 	if ((c & 0xfc00) == 0xd800) {
190 		if (len < 4)
191 			return -EINVAL;
192 		c2 = TDS_GET_A2LE(p+2);
193 		if ((c2 & 0xfc00) == 0xdc00) {
194 			*out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
195 			return 4;
196 		}
197 	}
198 	*out = c;
199 	return 2;
200 }
201 
202 static int
put_utf16le(unsigned char * buf,size_t buf_len,ICONV_CHAR c)203 put_utf16le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
204 {
205 	TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 1) == 0));
206 
207 	if (c < 0x10000u) {
208 		if (buf_len < 2)
209 			return -E2BIG;
210 		TDS_PUT_A2LE(buf, c);
211 		return 2;
212 	}
213 	if (TDS_UNLIKELY(c >= 0x110000u))
214 		return -EILSEQ;
215 	if (buf_len < 4)
216 		return -E2BIG;
217 	TDS_PUT_A2LE(buf,   0xd7c0 + (c >> 10));
218 	TDS_PUT_A2LE(buf+2, 0xdc00 + (c & 0x3ffu));
219 	return 4;
220 }
221 
222 static int
get_utf16be(const unsigned char * p,size_t len,ICONV_CHAR * out)223 get_utf16be(const unsigned char *p, size_t len, ICONV_CHAR *out)
224 {
225 	ICONV_CHAR c, c2;
226 
227 	TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 1) == 0));
228 
229 	if (len < 2)
230 		return -EINVAL;
231 	c = TDS_GET_A2BE(p);
232 	if ((c & 0xfc00) == 0xd800) {
233 		if (len < 4)
234 			return -EINVAL;
235 		c2 = TDS_GET_A2BE(p+2);
236 		if ((c2 & 0xfc00) == 0xdc00) {
237 			*out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
238 			return 4;
239 		}
240 	}
241 	*out = c;
242 	return 2;
243 }
244 
245 static int
put_utf16be(unsigned char * buf,size_t buf_len,ICONV_CHAR c)246 put_utf16be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
247 {
248 	TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 1) == 0));
249 
250 	if (c < 0x10000u) {
251 		if (buf_len < 2)
252 			return -E2BIG;
253 		TDS_PUT_A2BE(buf, c);
254 		return 2;
255 	}
256 	if (TDS_UNLIKELY(c >= 0x110000u))
257 		return -EILSEQ;
258 	if (buf_len < 4)
259 		return -E2BIG;
260 	TDS_PUT_A2BE(buf,   0xd7c0 + (c >> 10));
261 	TDS_PUT_A2BE(buf+2, 0xdc00 + (c & 0x3ffu));
262 	return 4;
263 }
264 
/**
 * Read one ISO-8859-1 byte; the byte value is the character value.
 * \a len is not inspected.
 *
 * \return always 1
 */
static int
get_iso1(const unsigned char *p, size_t len, ICONV_CHAR *out)
{
	const ICONV_CHAR ch = *p;

	*out = ch;
	return 1;
}
271 
/**
 * Write one character as a single ISO-8859-1 byte.
 *
 * \return 1 on success, -EILSEQ if \a c is above 0xff,
 *         -E2BIG if no output space is left
 */
static int
put_iso1(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
{
	/* range is checked before space, so EILSEQ wins over E2BIG */
	if (c > 0xffu)
		return -EILSEQ;
	if (buf_len == 0)
		return -E2BIG;

	*buf = (unsigned char) c;
	return 1;
}
282 
/**
 * Read one US-ASCII byte. \a len is not inspected.
 *
 * \return 1 on success, -EILSEQ if the byte is 0x80 or above
 */
static int
get_ascii(const unsigned char *p, size_t len, ICONV_CHAR *out)
{
	const unsigned char byte = *p;

	if (byte < 0x80) {
		*out = byte;
		return 1;
	}
	return -EILSEQ;
}
291 
/**
 * Write one character as a single US-ASCII byte.
 *
 * \return 1 on success, -EILSEQ if \a c is 0x80 or above,
 *         -E2BIG if no output space is left
 */
static int
put_ascii(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
{
	/* range is checked before space, so EILSEQ wins over E2BIG */
	if (c > 0x7fu)
		return -EILSEQ;
	if (buf_len == 0)
		return -E2BIG;

	*buf = (unsigned char) c;
	return 1;
}
302 
303 static int
get_cp1252(const unsigned char * p,size_t len,ICONV_CHAR * out)304 get_cp1252(const unsigned char *p, size_t len, ICONV_CHAR *out)
305 {
306 	if (*p >= 0x80 && *p < 0xa0)
307 		*out = cp1252_0080_00a0[*p - 0x80];
308 	else
309 		*out = *p;
310 	return 1;
311 }
312 
313 static int
put_cp1252(unsigned char * buf,size_t buf_len,ICONV_CHAR c)314 put_cp1252(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
315 {
316 	if (buf_len < 1)
317 		return -E2BIG;
318 
319 	if (c >= 0x100 || ((c&~0x1fu) == 0x80 && cp1252_0080_00a0[c - 0x80] != c - 0x80)) {
320 		switch (c) {
321 #define CP1252(i,o) case o: c = i; break;
322 		CP1252_ALL
323 #undef CP1252
324 		default:
325 			return -EILSEQ;
326 		}
327 	}
328 	*buf = c;
329 	return 1;
330 }
331 
/**
 * Reader for unused slots of the get table: rejects every input.
 *
 * \return always -EILSEQ
 */
static int
get_err(const unsigned char *p, size_t len, ICONV_CHAR *out)
{
	(void) p;
	(void) len;
	(void) out;

	return -EILSEQ;
}
337 
/**
 * Writer for unused slots of the put table: rejects every character.
 *
 * \return always -EILSEQ
 */
static int
put_err(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
{
	(void) buf;
	(void) buf_len;
	(void) c;

	return -EILSEQ;
}
343 
344 typedef int (*iconv_get_t)(const unsigned char *p, size_t len,     ICONV_CHAR *out);
345 typedef int (*iconv_put_t)(unsigned char *buf,     size_t buf_len, ICONV_CHAR c);
346 
347 static const iconv_get_t iconv_gets[16] = {
348 	get_iso1, get_ascii, get_utf16le, get_utf16be, get_ucs4le, get_ucs4be, get_utf8, get_cp1252,
349 	get_err, get_err, get_err, get_err, get_err, get_err, get_err, get_err,
350 };
351 static const iconv_put_t iconv_puts[16] = {
352 	put_iso1, put_ascii, put_utf16le, put_utf16be, put_ucs4le, put_ucs4be, put_utf8, put_cp1252,
353 	put_err, put_err, put_err, put_err, put_err, put_err, put_err, put_err,
354 };
355 
356 /**
357  * Inputs are FreeTDS canonical names, no other. No alias list is consulted.
358  */
359 iconv_t
tds_sys_iconv_open(const char * tocode,const char * fromcode)360 tds_sys_iconv_open (const char* tocode, const char* fromcode)
361 {
362 	int i;
363 	unsigned int fromto;
364 	const char *enc_name;
365 	unsigned char encodings[2];
366 
367 	static char first_time = 1;
368 
369 	if (TDS_UNLIKELY(first_time)) {
370 		first_time = 0;
371 		tdsdump_log(TDS_DBG_INFO1, "Using trivial iconv\n");
372 	}
373 
374 	/* match both inputs to our canonical names */
375 	enc_name = fromcode;
376 	for (i=0; i < 2; ++i) {
377 		unsigned char encoding;
378 
379 		if (strcmp(enc_name, "ISO-8859-1") == 0)
380 			encoding = 0;
381 		else if (strcmp(enc_name, "US-ASCII") == 0)
382 			encoding = 1;
383 		else if (strcmp(enc_name, "UCS-2LE") == 0 || strcmp(enc_name, "UTF-16LE") == 0)
384 			encoding = 2;
385 		else if (strcmp(enc_name, "UCS-2BE") == 0 || strcmp(enc_name, "UTF-16BE") == 0)
386 			encoding = 3;
387 		else if (strcmp(enc_name, "UCS-4LE") == 0)
388 			encoding = 4;
389 		else if (strcmp(enc_name, "UCS-4BE") == 0)
390 			encoding = 5;
391 		else if (strcmp(enc_name, "UTF-8") == 0)
392 			encoding = 6;
393 		else if (strcmp(enc_name, "CP1252") == 0)
394 			encoding = 7;
395 		else {
396 			errno = EINVAL;
397 			return (iconv_t)(-1);
398 		}
399 		encodings[i] = encoding;
400 
401 		enc_name = tocode;
402 	}
403 
404 	fromto = (encodings[0] << 4) | (encodings[1] & 0x0F);
405 
406 	/* like to like */
407 	if (encodings[0] == encodings[1]) {
408 		fromto = Like_to_Like;
409 	}
410 
411 	return (iconv_t) (intptr_t) fromto;
412 }
413 
414 int
tds_sys_iconv_close(iconv_t cd)415 tds_sys_iconv_close (iconv_t cd)
416 {
417 	return 0;
418 }
419 
420 size_t
tds_sys_iconv(iconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)421 tds_sys_iconv (iconv_t cd, const char* * inbuf, size_t *inbytesleft, char* * outbuf, size_t *outbytesleft)
422 {
423 	const unsigned char *ib;
424 	unsigned char *ob;
425 	size_t il, ol;
426 	int local_errno;
427 
428 #undef CD
429 #define CD ((int) (intptr_t) cd)
430 
431 	/* iconv defines valid semantics for NULL inputs, but we don't support them. */
432 	if (!inbuf || !*inbuf || !inbytesleft || !outbuf || !*outbuf || !outbytesleft)
433 		return 0;
434 
435 	/*
436 	 * some optimizations
437 	 * - do not use errno directly only assign a time
438 	 *   (some platform define errno as a complex macro)
439 	 * - some processors have few registers, deference and copy input variable
440 	 *   (this make also compiler optimize more due to removed aliasing)
441 	 *   also we use unsigned to remove required unsigned casts
442 	 */
443 	local_errno = 0;
444 	il = *inbytesleft;
445 	ol = *outbytesleft;
446 	ib = (const unsigned char*) *inbuf;
447 	ob = (unsigned char*) *outbuf;
448 
449 	if (CD == Like_to_Like) {
450 		size_t copybytes = (il < ol)? il : ol;
451 
452 		memcpy(ob, ib, copybytes);
453 		ob += copybytes;
454 		ol -= copybytes;
455 		ib += copybytes;
456 		il -= copybytes;
457 	} else if (CD & ~0xff) {
458 		local_errno = EINVAL;
459 	} else {
460 		iconv_get_t get_func = iconv_gets[(CD>>4) & 15];
461 		iconv_put_t put_func = iconv_puts[ CD     & 15];
462 
463 		while (il) {
464 			ICONV_CHAR out_c;
465 			int readed = get_func(ib, il, &out_c), written;
466 
467 			TDS_EXTRA_CHECK(assert(readed > 0 || readed == -EINVAL || readed == -EILSEQ));
468 			if (TDS_UNLIKELY(readed < 0)) {
469 				local_errno = -readed;
470 				break;
471 			}
472 
473 			written = put_func(ob, ol, out_c);
474 			TDS_EXTRA_CHECK(assert(written > 0 || written == -E2BIG || written == -EILSEQ));
475 			if (TDS_UNLIKELY(written < 0)) {
476 				local_errno = -written;
477 				break;
478 			}
479 			il -= readed;
480 			ib += readed;
481 			ol -= written;
482 			ob += written;
483 		}
484 	}
485 
486 	/* back to source */
487 	*inbytesleft = il;
488 	*outbytesleft = ol;
489 	*inbuf = (const char*) ib;
490 	*outbuf = (char*) ob;
491 
492 	if (il && !local_errno)
493 		local_errno = E2BIG;
494 
495 	if (local_errno) {
496 		errno = local_errno;
497 		return (size_t)(-1);
498 	}
499 
500 	return 0;
501 }
502 
503 
504 /** @} */
505 
506 #endif
507