1 /* @(#)unicode.c 1.23 20/07/03 Copyright 2001-2020 J. Schilling */
2 #include <schily/mconfig.h>
3 #ifndef lint
4 static UConst char sccsid[] =
5 "@(#)unicode.c 1.23 20/07/03 Copyright 2001-2020 J. Schilling";
6 #endif
7 /*
8 * Routines to convert from/to UNICODE
9 *
10 * This is currently a very simple implementation that only
11 * handles ISO-8859-1 coding using intrinsic code and using
12 * iconv() in case of other encodings.
13 *
14 * Copyright (c) 2001-2020 J. Schilling
15 */
16 /*
17 * The contents of this file are subject to the terms of the
18 * Common Development and Distribution License, Version 1.0 only
19 * (the "License"). You may not use this file except in compliance
20 * with the License.
21 *
22 * See the file CDDL.Schily.txt in this distribution for details.
23 * A copy of the CDDL is also available via the Internet at
24 * http://www.opensource.org/licenses/cddl1.txt
25 *
26 * When distributing Covered Code, include this CDDL HEADER in each
27 * file and include the License file CDDL.Schily.txt from this distribution.
28 */
29
30 #include <schily/stdio.h>
31 #include <schily/types.h>
32 #include <schily/utypes.h>
33 #include <schily/iconv.h>
34 #include <schily/standard.h>
35 #include <schily/errno.h>
36 #define GT_COMERR /* #define comerr gtcomerr */
37 #define GT_ERROR /* #define error gterror */
38 #include <schily/schily.h>
39 #ifdef __STAR__
40 #include "star.h"
41 #include "starsubs.h"
42 #include "checkerr.h"
43 #else
44 #include "header.h"
45 #endif
46
47 EXPORT void utf8_init __PR((int type));
48 EXPORT void utf8_fini __PR((void));
49 EXPORT size_t to_utf8 __PR((Uchar *to, size_t tolen,
50 Uchar *from, size_t len));
51 LOCAL size_t _to_utf8 __PR((Uchar *to, size_t tolen,
52 Uchar *from, size_t len));
53 #ifdef USE_ICONV
54 LOCAL size_t _to_iconv __PR((Uchar *to, size_t tolen,
55 Uchar *from, size_t len));
56 #endif
57 LOCAL size_t _to_none __PR((Uchar *to, size_t tolen,
58 Uchar *from, size_t len));
59 EXPORT BOOL from_utf8 __PR((Uchar *to, size_t tolen,
60 Uchar *from, size_t *len));
61 LOCAL BOOL _from_utf8 __PR((Uchar *to, size_t tolen,
62 Uchar *from, size_t *len));
63 #ifdef USE_ICONV
64 LOCAL BOOL _from_iconv __PR((Uchar *to, size_t tolen,
65 Uchar *from, size_t *len));
66 #endif
67 LOCAL BOOL _from_none __PR((Uchar *to, size_t tolen,
68 Uchar *from, size_t *len));
69
70 LOCAL size_t (*p_to_utf8) __PR((Uchar *to, size_t tolen,
71 Uchar *from, size_t len)) = _to_utf8;
72 LOCAL BOOL (*p_from_utf8) __PR((Uchar *to, size_t tolen,
73 Uchar *from, size_t *len)) = _from_utf8;
74
75 LOCAL iconv_t ic_from = (iconv_t)-1;
76 LOCAL iconv_t ic_to = (iconv_t)-1;
77
78 #ifdef __STAR__
79 extern char *codeset;
80 #else
81 LOCAL const char *codeset;
82
83 EXPORT void
utf8_codeset(code_set)84 utf8_codeset(code_set)
85 const char *code_set;
86 {
87 codeset = code_set;
88 }
89 #endif
90
91
92 EXPORT void
utf8_init(type)93 utf8_init(type)
94 int type;
95 {
96 if (codeset == NULL)
97 codeset = "ISO8859-1";
98 #ifndef ICONV_DEBUG
99 if (streql(codeset, "ISO8859-1") ||
100 streql(codeset, "ISO-8859-1") ||
101 streql(codeset, "ISO8859_1") ||
102 streql(codeset, "ISO_8859_1") ||
103 streql(codeset, "8859-1") ||
104 streql(codeset, "8859_1")) {
105 p_to_utf8 = _to_utf8;
106 p_from_utf8 = _from_utf8;
107 return;
108 }
109 if (streql(codeset, "UTF-8") ||
110 streql(codeset, "UTF8") ||
111 streql(codeset, "UTF_8")) {
112 p_to_utf8 = _to_none;
113 p_from_utf8 = _from_none;
114 return;
115 }
116 #endif
117 if (type & S_CREATE) {
118 #ifdef USE_ICONV
119 if (ic_to != (iconv_t)-1) {
120 iconv_close(ic_to);
121 }
122 ic_to = iconv_open("UTF-8", codeset);
123 #ifdef ICONV_DEBUG
124 fprintf(stderr, "ic_to %p\n", ic_to);
125 #endif
126 if (ic_to != (iconv_t)-1)
127 p_to_utf8 = _to_iconv;
128 else
129 #endif
130 p_to_utf8 = _to_utf8;
131 }
132 if (type & S_EXTRACT) {
133 #ifdef USE_ICONV
134 if (ic_from != (iconv_t)-1) {
135 iconv_close(ic_from);
136 }
137 ic_from = iconv_open(codeset, "UTF-8");
138 #ifdef ICONV_DEBUG
139 fprintf(stderr, "ic_from %p\n", ic_from);
140 #endif
141 if (ic_from != (iconv_t)-1)
142 p_from_utf8 = _from_iconv;
143 else
144 #endif
145 p_from_utf8 = _from_utf8;
146 }
147 }
148
149 EXPORT void
utf8_fini()150 utf8_fini()
151 {
152 #ifdef USE_ICONV
153 if (ic_to != (iconv_t)-1) {
154 iconv_close(ic_to);
155 ic_to = (iconv_t)-1;
156 }
157 if (ic_from != (iconv_t)-1) {
158 iconv_close(ic_from);
159 ic_from = (iconv_t)-1;
160 }
161 #endif
162 }
163
164 EXPORT size_t
to_utf8(to,tolen,from,len)165 to_utf8(to, tolen, from, len)
166 register Uchar *to;
167 size_t tolen;
168 register Uchar *from;
169 register size_t len;
170 {
171 return (p_to_utf8(to, tolen, from, len));
172 }
173
174 /*
175 * First copy len bytes from the source, convert it to UTF-8 assuming that it
176 * is in ISO-8859-1 encoding. Then add a final null byte. Return the number of
177 * characters written to the destination excluding the final null byte
178 * (strlen(to)).
179 */
180 LOCAL size_t
_to_utf8(to,tolen,from,len)181 _to_utf8(to, tolen, from, len)
182 register Uchar *to;
183 size_t tolen;
184 register Uchar *from;
185 register size_t len;
186 {
187 register Uchar *oto = to;
188 register Uchar c;
189
190 if (len == 0)
191 goto out;
192
193 do {
194 c = *from++;
195 if (c <= 0x7F) {
196 *to++ = c;
197 } else if (c <= 0xBF) {
198 *to++ = 0xC2;
199 *to++ = c;
200 } else { /* c <= 0xFF */
201 *to++ = 0xC3;
202 *to++ = c & 0xBF;
203 }
204 /*
205 * XXX We have plenty of space in "to" when we are called.
206 * XXX Should we check wether we did hit "tolen"?
207 */
208 } while (--len > 0);
209 out:
210 *to = '\0';
211 return (to - oto);
212 }
213
#ifdef	USE_ICONV
/*
 * Convert len bytes from the source to UTF-8 using the iconv descriptor
 * ic_to. A final null byte is added if iconv() left room for it in the
 * destination.
 *
 * A nonzero iconv() result, i.e. an error (-1) or a number of nonidentical
 * translations (>0), is reported via errmsg(). With __STAR__ the report
 * is made under control of the E_ICONV error checks.
 *
 * Returns the number of bytes that iconv() stored in the destination.
 */
LOCAL size_t
_to_iconv(to, tolen, from, len)
	Uchar	*to;
	size_t	tolen;
	Uchar	*from;
	size_t	len;
{
#ifdef	HAVE_ICONV_CONST
	const char	*src = (char *)from;
#else
	char		*src = (char *)from;
#endif
	char		*dst = (char *)to;
	size_t		srcleft = len;
	size_t		dstleft = tolen;
	size_t		rv;

	seterrno(0);
	rv = iconv(ic_to, &src, &srcleft, &dst, &dstleft);
	if (dstleft != 0)
		*dst = '\0';

	if (rv != 0) {
#ifdef	__STAR__
		if (!errhidden(E_ICONV, (char *)from)) {
			if (!errwarnonly(E_ICONV, (char *)from))
				xstats.s_iconv++;
			errmsg("Cannot convert '%s' to UTF-8.\n", from);
			(void) errabort(E_ICONV, (char *)from, TRUE);
		}
#else
		errmsg("Cannot convert '%s' to UTF-8.\n", from);
#endif
	}

	/*
	 * Bring the descriptor back to its initial shift state.
	 */
	(void) iconv(ic_to, NULL, NULL, NULL, NULL);

	return (tolen - dstleft);
}
#endif
255
256 LOCAL size_t
_to_none(to,tolen,from,len)257 _to_none(to, tolen, from, len)
258 Uchar *to;
259 size_t tolen;
260 Uchar *from;
261 size_t len;
262 {
263 if (tolen < len) {
264 movebytes(from, to, tolen);
265 return (tolen);
266 }
267 *movebytes(from, to, len) = '\0';
268 return (len);
269 }
270
271 EXPORT BOOL
from_utf8(to,tolen,from,lenp)272 from_utf8(to, tolen, from, lenp)
273 Uchar *to;
274 size_t tolen;
275 Uchar *from;
276 size_t *lenp;
277 {
278 return (p_from_utf8(to, tolen, from, lenp));
279 }
280
281 /*
282 * First copy len bytes from the source and convert it from UTF-8 assuming
283 * ISO-8859-1 encoding. Then add a final null byte. Set *lenp to the number of
284 * bytes written to the destination excluding the final null byte (strlen(to)).
285 * Return FALSE in case that an illegal ISO-8859-1 character was seen in the
286 * UTF-8 stream.
287 */
288 LOCAL BOOL
_from_utf8(to,tolen,from,lenp)289 _from_utf8(to, tolen, from, lenp)
290 register Uchar *to;
291 size_t tolen;
292 register Uchar *from;
293 size_t *lenp;
294 {
295 register Uchar *oto = to;
296 register Uchar c;
297 register BOOL ret = TRUE;
298 register size_t len = *lenp;
299 Uchar *endp = to + tolen;
300
301 if (len == 0)
302 goto out;
303
304 do {
305 c = *from++;
306 if (c <= 0x7F) {
307 *to++ = c;
308 } else if (c == 0xC0) {
309 *to++ = *from++ & 0x7F;
310 if (--len == 0)
311 break;
312 } else if (c == 0xC1) {
313 *to++ = (*from++ | 0x40) & 0x7F;
314 if (--len == 0)
315 break;
316 } else if (c == 0xC2) {
317 *to++ = *from++;
318 if (--len == 0)
319 break;
320 } else if (c == 0xC3) {
321 *to++ = *from++ | 0x40;
322 if (--len == 0)
323 break;
324 } else {
325 ret = FALSE; /* unknown/illegal UTF-8 char */
326 *to++ = '_'; /* use default character */
327 if (c < 0xE0) {
328 from++; /* 2 bytes in total */
329 if (--len == 0)
330 break;
331 } else if (c < 0xF0) {
332 from += 2; /* 3 bytes in total */
333 if (len <= 2)
334 break;
335 len -= 2;
336 } else if (c < 0xF8) {
337 from += 3; /* 4 bytes in total */
338 if (len <= 3)
339 break;
340 len -= 3;
341 } else if (c < 0xFC) {
342 from += 4; /* 5 bytes in total */
343 if (len <= 4)
344 break;
345 len -= 4;
346 } else if (c < 0xFE) {
347 from += 5; /* 6 bytes in total */
348 if (len <= 5)
349 break;
350 len -= 5;
351 } else {
352 while (len > 0) {
353 c = *from;
354 /*
355 * Test for 7 bit ASCII + non prefix
356 */
357 if (c <= 0xBF)
358 break;
359 from++;
360 if (--len == 0)
361 break;
362 }
363 if (len == 0)
364 break;
365 }
366 }
367 /*
368 * It is easy to check, since the result is always only one
369 * character. We need to stop here since the new path handling
370 * may need to grow the result in case of an overflow.
371 */
372 if (to >= endp)
373 break;
374 } while (--len > 0);
375 out:
376 if (to < endp)
377 *to = '\0';
378 *lenp = (to - oto);
379 return (ret);
380 }
381
#ifdef	USE_ICONV
/*
 * Convert *len bytes of UTF-8 text from the source to the local character
 * set using the iconv descriptor ic_from. A final null byte is added if
 * iconv() left room for it in the destination.
 *
 * *len is set to the number of bytes that iconv() stored in the
 * destination. If iconv() failed with E2BIG, *len is set to tolen in order
 * to signal the overflow to the caller.
 *
 * Returns TRUE if iconv() returned 0, otherwise FALSE. Except for the
 * overflow case, a failure is reported via errmsg(). With __STAR__ the
 * report is made under control of the E_ICONV error checks.
 */
LOCAL BOOL
_from_iconv(to, tolen, from, len)
	Uchar	*to;
	size_t	tolen;
	Uchar	*from;
	size_t	*len;
{
#ifdef	HAVE_ICONV_CONST
	const char	*src = (char *)from;
#else
	char		*src = (char *)from;
#endif
	char		*dst = (char *)to;
	size_t		srcleft = *len;
	size_t		dstleft = tolen;
	size_t		rv;
	BOOL		ok = TRUE;

	seterrno(0);
	rv = iconv(ic_from, &src, &srcleft, &dst, &dstleft);
	if (dstleft != 0)
		*dst = '\0';
	*len = tolen - dstleft;

	if (rv == (size_t)-1 && geterrno() == E2BIG) {
		/*
		 * Signal an overflow via *len. This is done explicitly as
		 * the remaining space reported by iconv() in this case
		 * cannot be relied on (see Linux).
		 */
		*len = tolen;
		ok = FALSE;
	} else if (rv != 0) {
		/*
		 * Error (-1) or number of nonidentical translations (>0)
		 */
#ifdef	__STAR__
		if (!errhidden(E_ICONV, (char *)from)) {
			if (!errwarnonly(E_ICONV, (char *)from))
				xstats.s_iconv++;
			errmsg("Cannot convert '%s' to local charset.\n", from);
			(void) errabort(E_ICONV, (char *)from, TRUE);
		}
#else
		errmsg("Cannot convert '%s' to local charset.\n", from);
#endif
		ok = FALSE;
	}

	/*
	 * Bring the descriptor back to its initial shift state.
	 */
	(void) iconv(ic_from, NULL, NULL, NULL, NULL);

	return (ok);
}
#endif
433
434 LOCAL BOOL
_from_none(to,tolen,from,len)435 _from_none(to, tolen, from, len)
436 Uchar *to;
437 size_t tolen;
438 Uchar *from;
439 size_t *len;
440 {
441 size_t clen = *len;
442
443 if (tolen < clen) {
444 movebytes(from, to, tolen);
445 *len = tolen;
446 return (TRUE);
447 }
448 *movebytes(from, to, clen) = '\0';
449 return (TRUE);
450 }
451