1 /* FreeTDS - Library of routines accessing Sybase and Microsoft databases
2 * Copyright (C) 2003, 2004 James K. Lowden, based on original work by Brian Bruns
3 * Copyright (C) 2011 Frediano Ziglio
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
19 */
20
21 /**
22 * \file
23 * This file implements a very simple iconv.
24 * Its purpose is to allow ASCII clients to communicate with Microsoft servers
25 * that encode their metadata in Unicode (UTF-16).
26 *
27 * It supports ISO-8859-1, ASCII, CP1252, UTF-16, UCS-4 and UTF-8
28 */
29
30 #include <config.h>
31
32 #if ! HAVE_ICONV
33
34 #if HAVE_STRING_H
35 #include <string.h>
36 #endif /* HAVE_STRING_H */
37 #if HAVE_ERRNO_H
38 #include <errno.h>
39 #endif
40
41 #include <assert.h>
42 #include <ctype.h>
43
44 #include <freetds/tds.h>
45 #include <freetds/bytes.h>
46 #include <freetds/iconv.h>
47 #include <freetds/utils/bjoern-utf8.h>
48
49 #include "iconv_charsets.h"
50
51 /**
52 * \addtogroup conv
53 * @{
54 */
55
/**
 * Special conversion descriptor values.
 * Regular descriptors pack two 4-bit encoding indexes into the low byte,
 * so special values live above 0xff.
 */
enum ICONV_CD_VALUE {
	/** source and destination use the same encoding */
	Like_to_Like = 0x100
};

/** One character as exchanged between the get_* and put_* routines. */
typedef uint32_t ICONV_CHAR;
62
/*
 * Return values for get_*:
 * - >0 number of bytes read
 * - -EINVAL not enough data to read a complete character
 * - -EILSEQ invalid encoding detected
 * Return values for put_*:
 * - >0 number of bytes written
 * - -E2BIG no space left in the output buffer
 * - -EILSEQ character cannot be encoded in the output charset
 */
73
74 static int
get_utf8(const unsigned char * p,size_t len,ICONV_CHAR * out)75 get_utf8(const unsigned char *p, size_t len, ICONV_CHAR *out)
76 {
77 uint32_t uc, state = UTF8_ACCEPT;
78 size_t l = 1;
79
80 do {
81 switch (decode_utf8(&state, &uc, *p++)) {
82 case UTF8_ACCEPT:
83 *out = uc;
84 return l;
85 case UTF8_REJECT:
86 return -EILSEQ;
87 }
88 } while (l++ < len);
89 return -EINVAL;
90 }
91
92 static int
put_utf8(unsigned char * buf,size_t buf_len,ICONV_CHAR c)93 put_utf8(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
94 {
95 #define MASK(n) ((0xffffffffu << (n)) & 0xffffffffu)
96 size_t o_len;
97
98 if ((c & MASK(7)) == 0) {
99 if (buf_len < 1)
100 return -E2BIG;
101 *buf = (unsigned char) c;
102 return 1;
103 }
104
105 o_len = 2;
106 for (;;) {
107 if ((c & MASK(11)) == 0)
108 break;
109 ++o_len;
110 if ((c & MASK(16)) == 0)
111 break;
112 ++o_len;
113 if ((c & MASK(21)) == 0)
114 break;
115 ++o_len;
116 if ((c & MASK(26)) == 0)
117 break;
118 ++o_len;
119 if ((c & MASK(31)) != 0)
120 return -EILSEQ;
121 }
122
123 if (buf_len < o_len)
124 return -E2BIG;
125 buf += o_len;
126 buf_len = o_len - 1;
127 do {
128 *--buf = 0x80 | (c & 0x3f);
129 c >>= 6;
130 } while (--buf_len);
131 *--buf = (0xff00u >> o_len) | c;
132 return o_len;
133 }
134
135 static int
get_ucs4le(const unsigned char * p,size_t len,ICONV_CHAR * out)136 get_ucs4le(const unsigned char *p, size_t len, ICONV_CHAR *out)
137 {
138 TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 3) == 0));
139
140 if (len < 4)
141 return -EINVAL;
142 *out = TDS_GET_A4LE(p);
143 return 4;
144 }
145
146 static int
put_ucs4le(unsigned char * buf,size_t buf_len,ICONV_CHAR c)147 put_ucs4le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
148 {
149 TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 3) == 0));
150
151 if (buf_len < 4)
152 return -E2BIG;
153 TDS_PUT_A4LE(buf, c);
154 return 4;
155 }
156
157 static int
get_ucs4be(const unsigned char * p,size_t len,ICONV_CHAR * out)158 get_ucs4be(const unsigned char *p, size_t len, ICONV_CHAR *out)
159 {
160 TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 3) == 0));
161
162 if (len < 4)
163 return -EINVAL;
164 *out = TDS_GET_A4BE(p);
165 return 4;
166 }
167
168 static int
put_ucs4be(unsigned char * buf,size_t buf_len,ICONV_CHAR c)169 put_ucs4be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
170 {
171 TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 3) == 0));
172
173 if (buf_len < 4)
174 return -E2BIG;
175 TDS_PUT_A4BE(buf, c);
176 return 4;
177 }
178
179 static int
get_utf16le(const unsigned char * p,size_t len,ICONV_CHAR * out)180 get_utf16le(const unsigned char *p, size_t len, ICONV_CHAR *out)
181 {
182 ICONV_CHAR c, c2;
183
184 TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 1) == 0));
185
186 if (len < 2)
187 return -EINVAL;
188 c = TDS_GET_A2LE(p);
189 if ((c & 0xfc00) == 0xd800) {
190 if (len < 4)
191 return -EINVAL;
192 c2 = TDS_GET_A2LE(p+2);
193 if ((c2 & 0xfc00) == 0xdc00) {
194 *out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
195 return 4;
196 }
197 }
198 *out = c;
199 return 2;
200 }
201
202 static int
put_utf16le(unsigned char * buf,size_t buf_len,ICONV_CHAR c)203 put_utf16le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
204 {
205 TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 1) == 0));
206
207 if (c < 0x10000u) {
208 if (buf_len < 2)
209 return -E2BIG;
210 TDS_PUT_A2LE(buf, c);
211 return 2;
212 }
213 if (TDS_UNLIKELY(c >= 0x110000u))
214 return -EILSEQ;
215 if (buf_len < 4)
216 return -E2BIG;
217 TDS_PUT_A2LE(buf, 0xd7c0 + (c >> 10));
218 TDS_PUT_A2LE(buf+2, 0xdc00 + (c & 0x3ffu));
219 return 4;
220 }
221
222 static int
get_utf16be(const unsigned char * p,size_t len,ICONV_CHAR * out)223 get_utf16be(const unsigned char *p, size_t len, ICONV_CHAR *out)
224 {
225 ICONV_CHAR c, c2;
226
227 TDS_EXTRA_CHECK(assert((((uintptr_t) p) & 1) == 0));
228
229 if (len < 2)
230 return -EINVAL;
231 c = TDS_GET_A2BE(p);
232 if ((c & 0xfc00) == 0xd800) {
233 if (len < 4)
234 return -EINVAL;
235 c2 = TDS_GET_A2BE(p+2);
236 if ((c2 & 0xfc00) == 0xdc00) {
237 *out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
238 return 4;
239 }
240 }
241 *out = c;
242 return 2;
243 }
244
245 static int
put_utf16be(unsigned char * buf,size_t buf_len,ICONV_CHAR c)246 put_utf16be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
247 {
248 TDS_EXTRA_CHECK(assert((((uintptr_t) buf) & 1) == 0));
249
250 if (c < 0x10000u) {
251 if (buf_len < 2)
252 return -E2BIG;
253 TDS_PUT_A2BE(buf, c);
254 return 2;
255 }
256 if (TDS_UNLIKELY(c >= 0x110000u))
257 return -EILSEQ;
258 if (buf_len < 4)
259 return -E2BIG;
260 TDS_PUT_A2BE(buf, 0xd7c0 + (c >> 10));
261 TDS_PUT_A2BE(buf+2, 0xdc00 + (c & 0x3ffu));
262 return 4;
263 }
264
265 static int
get_iso1(const unsigned char * p,size_t len,ICONV_CHAR * out)266 get_iso1(const unsigned char *p, size_t len, ICONV_CHAR *out)
267 {
268 *out = p[0];
269 return 1;
270 }
271
272 static int
put_iso1(unsigned char * buf,size_t buf_len,ICONV_CHAR c)273 put_iso1(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
274 {
275 if (c >= 0x100u)
276 return -EILSEQ;
277 if (buf_len < 1)
278 return -E2BIG;
279 buf[0] = (unsigned char) c;
280 return 1;
281 }
282
283 static int
get_ascii(const unsigned char * p,size_t len,ICONV_CHAR * out)284 get_ascii(const unsigned char *p, size_t len, ICONV_CHAR *out)
285 {
286 if (p[0] >= 0x80)
287 return -EILSEQ;
288 *out = p[0];
289 return 1;
290 }
291
292 static int
put_ascii(unsigned char * buf,size_t buf_len,ICONV_CHAR c)293 put_ascii(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
294 {
295 if (c >= 0x80u)
296 return -EILSEQ;
297 if (buf_len < 1)
298 return -E2BIG;
299 buf[0] = (unsigned char) c;
300 return 1;
301 }
302
303 static int
get_cp1252(const unsigned char * p,size_t len,ICONV_CHAR * out)304 get_cp1252(const unsigned char *p, size_t len, ICONV_CHAR *out)
305 {
306 if (*p >= 0x80 && *p < 0xa0)
307 *out = cp1252_0080_00a0[*p - 0x80];
308 else
309 *out = *p;
310 return 1;
311 }
312
313 static int
put_cp1252(unsigned char * buf,size_t buf_len,ICONV_CHAR c)314 put_cp1252(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
315 {
316 if (buf_len < 1)
317 return -E2BIG;
318
319 if (c >= 0x100 || ((c&~0x1fu) == 0x80 && cp1252_0080_00a0[c - 0x80] != c - 0x80)) {
320 switch (c) {
321 #define CP1252(i,o) case o: c = i; break;
322 CP1252_ALL
323 #undef CP1252
324 default:
325 return -EILSEQ;
326 }
327 }
328 *buf = c;
329 return 1;
330 }
331
332 static int
get_err(const unsigned char * p,size_t len,ICONV_CHAR * out)333 get_err(const unsigned char *p, size_t len, ICONV_CHAR *out)
334 {
335 return -EILSEQ;
336 }
337
338 static int
put_err(unsigned char * buf,size_t buf_len,ICONV_CHAR c)339 put_err(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
340 {
341 return -EILSEQ;
342 }
343
344 typedef int (*iconv_get_t)(const unsigned char *p, size_t len, ICONV_CHAR *out);
345 typedef int (*iconv_put_t)(unsigned char *buf, size_t buf_len, ICONV_CHAR c);
346
347 static const iconv_get_t iconv_gets[16] = {
348 get_iso1, get_ascii, get_utf16le, get_utf16be, get_ucs4le, get_ucs4be, get_utf8, get_cp1252,
349 get_err, get_err, get_err, get_err, get_err, get_err, get_err, get_err,
350 };
351 static const iconv_put_t iconv_puts[16] = {
352 put_iso1, put_ascii, put_utf16le, put_utf16be, put_ucs4le, put_ucs4be, put_utf8, put_cp1252,
353 put_err, put_err, put_err, put_err, put_err, put_err, put_err, put_err,
354 };
355
356 /**
357 * Inputs are FreeTDS canonical names, no other. No alias list is consulted.
358 */
359 iconv_t
tds_sys_iconv_open(const char * tocode,const char * fromcode)360 tds_sys_iconv_open (const char* tocode, const char* fromcode)
361 {
362 int i;
363 unsigned int fromto;
364 const char *enc_name;
365 unsigned char encodings[2];
366
367 static char first_time = 1;
368
369 if (TDS_UNLIKELY(first_time)) {
370 first_time = 0;
371 tdsdump_log(TDS_DBG_INFO1, "Using trivial iconv\n");
372 }
373
374 /* match both inputs to our canonical names */
375 enc_name = fromcode;
376 for (i=0; i < 2; ++i) {
377 unsigned char encoding;
378
379 if (strcmp(enc_name, "ISO-8859-1") == 0)
380 encoding = 0;
381 else if (strcmp(enc_name, "US-ASCII") == 0)
382 encoding = 1;
383 else if (strcmp(enc_name, "UCS-2LE") == 0 || strcmp(enc_name, "UTF-16LE") == 0)
384 encoding = 2;
385 else if (strcmp(enc_name, "UCS-2BE") == 0 || strcmp(enc_name, "UTF-16BE") == 0)
386 encoding = 3;
387 else if (strcmp(enc_name, "UCS-4LE") == 0)
388 encoding = 4;
389 else if (strcmp(enc_name, "UCS-4BE") == 0)
390 encoding = 5;
391 else if (strcmp(enc_name, "UTF-8") == 0)
392 encoding = 6;
393 else if (strcmp(enc_name, "CP1252") == 0)
394 encoding = 7;
395 else {
396 errno = EINVAL;
397 return (iconv_t)(-1);
398 }
399 encodings[i] = encoding;
400
401 enc_name = tocode;
402 }
403
404 fromto = (encodings[0] << 4) | (encodings[1] & 0x0F);
405
406 /* like to like */
407 if (encodings[0] == encodings[1]) {
408 fromto = Like_to_Like;
409 }
410
411 return (iconv_t) (intptr_t) fromto;
412 }
413
414 int
tds_sys_iconv_close(iconv_t cd)415 tds_sys_iconv_close (iconv_t cd)
416 {
417 return 0;
418 }
419
420 size_t
tds_sys_iconv(iconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)421 tds_sys_iconv (iconv_t cd, const char* * inbuf, size_t *inbytesleft, char* * outbuf, size_t *outbytesleft)
422 {
423 const unsigned char *ib;
424 unsigned char *ob;
425 size_t il, ol;
426 int local_errno;
427
428 #undef CD
429 #define CD ((int) (intptr_t) cd)
430
431 /* iconv defines valid semantics for NULL inputs, but we don't support them. */
432 if (!inbuf || !*inbuf || !inbytesleft || !outbuf || !*outbuf || !outbytesleft)
433 return 0;
434
435 /*
436 * some optimizations
437 * - do not use errno directly only assign a time
438 * (some platform define errno as a complex macro)
439 * - some processors have few registers, deference and copy input variable
440 * (this make also compiler optimize more due to removed aliasing)
441 * also we use unsigned to remove required unsigned casts
442 */
443 local_errno = 0;
444 il = *inbytesleft;
445 ol = *outbytesleft;
446 ib = (const unsigned char*) *inbuf;
447 ob = (unsigned char*) *outbuf;
448
449 if (CD == Like_to_Like) {
450 size_t copybytes = (il < ol)? il : ol;
451
452 memcpy(ob, ib, copybytes);
453 ob += copybytes;
454 ol -= copybytes;
455 ib += copybytes;
456 il -= copybytes;
457 } else if (CD & ~0xff) {
458 local_errno = EINVAL;
459 } else {
460 iconv_get_t get_func = iconv_gets[(CD>>4) & 15];
461 iconv_put_t put_func = iconv_puts[ CD & 15];
462
463 while (il) {
464 ICONV_CHAR out_c;
465 int readed = get_func(ib, il, &out_c), written;
466
467 TDS_EXTRA_CHECK(assert(readed > 0 || readed == -EINVAL || readed == -EILSEQ));
468 if (TDS_UNLIKELY(readed < 0)) {
469 local_errno = -readed;
470 break;
471 }
472
473 written = put_func(ob, ol, out_c);
474 TDS_EXTRA_CHECK(assert(written > 0 || written == -E2BIG || written == -EILSEQ));
475 if (TDS_UNLIKELY(written < 0)) {
476 local_errno = -written;
477 break;
478 }
479 il -= readed;
480 ib += readed;
481 ol -= written;
482 ob += written;
483 }
484 }
485
486 /* back to source */
487 *inbytesleft = il;
488 *outbytesleft = ol;
489 *inbuf = (const char*) ib;
490 *outbuf = (char*) ob;
491
492 if (il && !local_errno)
493 local_errno = E2BIG;
494
495 if (local_errno) {
496 errno = local_errno;
497 return (size_t)(-1);
498 }
499
500 return 0;
501 }
502
503
504 /** @} */
505
506 #endif
507