1 /* @(#)unicode.c	1.23 20/07/03 Copyright 2001-2020 J. Schilling */
2 #include <schily/mconfig.h>
3 #ifndef lint
4 static	UConst char sccsid[] =
5 	"@(#)unicode.c	1.23 20/07/03 Copyright 2001-2020 J. Schilling";
6 #endif
7 /*
8  *	Routines to convert from/to UNICODE
9  *
 *	This is currently a very simple implementation: ISO-8859-1
 *	(and UTF-8, which is passed through unchanged) is handled by
 *	intrinsic code; iconv() is used, where available, for all other encodings.
13  *
14  *	Copyright (c) 2001-2020 J. Schilling
15  */
16 /*
17  * The contents of this file are subject to the terms of the
18  * Common Development and Distribution License, Version 1.0 only
19  * (the "License").  You may not use this file except in compliance
20  * with the License.
21  *
22  * See the file CDDL.Schily.txt in this distribution for details.
23  * A copy of the CDDL is also available via the Internet at
24  * http://www.opensource.org/licenses/cddl1.txt
25  *
26  * When distributing Covered Code, include this CDDL HEADER in each
27  * file and include the License file CDDL.Schily.txt from this distribution.
28  */
29 
30 #include <schily/stdio.h>
31 #include <schily/types.h>
32 #include <schily/utypes.h>
33 #include <schily/iconv.h>
34 #include <schily/standard.h>
35 #include <schily/errno.h>
36 #define	GT_COMERR		/* #define comerr gtcomerr */
37 #define	GT_ERROR		/* #define error gterror   */
38 #include <schily/schily.h>
39 #ifdef	__STAR__
40 #include "star.h"
41 #include "starsubs.h"
42 #include "checkerr.h"
43 #else
44 #include "header.h"
45 #endif
46 
47 EXPORT	void	utf8_init	__PR((int type));
48 EXPORT	void	utf8_fini	__PR((void));
49 EXPORT	size_t	to_utf8		__PR((Uchar *to, size_t tolen,
50 					Uchar *from, size_t len));
51 LOCAL	size_t	_to_utf8	__PR((Uchar *to, size_t tolen,
52 					Uchar *from, size_t len));
53 #ifdef	USE_ICONV
54 LOCAL	size_t	_to_iconv	__PR((Uchar *to, size_t tolen,
55 					Uchar *from, size_t len));
56 #endif
57 LOCAL	size_t	_to_none	__PR((Uchar *to, size_t tolen,
58 					Uchar *from, size_t len));
59 EXPORT	BOOL	from_utf8	__PR((Uchar *to, size_t tolen,
60 					Uchar *from, size_t *len));
61 LOCAL	BOOL	_from_utf8	__PR((Uchar *to, size_t tolen,
62 					Uchar *from, size_t *len));
63 #ifdef	USE_ICONV
64 LOCAL	BOOL	_from_iconv	__PR((Uchar *to, size_t tolen,
65 					Uchar *from, size_t *len));
66 #endif
67 LOCAL	BOOL	_from_none	__PR((Uchar *to, size_t tolen,
68 					Uchar *from, size_t *len));
69 
70 LOCAL	size_t	(*p_to_utf8)	__PR((Uchar *to, size_t tolen,
71 					Uchar *from, size_t len)) = _to_utf8;
72 LOCAL	BOOL	(*p_from_utf8)	__PR((Uchar *to, size_t tolen,
73 					Uchar *from, size_t *len)) = _from_utf8;
74 
75 LOCAL	iconv_t	ic_from	= (iconv_t)-1;
76 LOCAL	iconv_t	ic_to	= (iconv_t)-1;
77 
78 #ifdef	__STAR__
79 extern	char		*codeset;
80 #else
81 LOCAL	const char	*codeset;
82 
83 EXPORT void
utf8_codeset(code_set)84 utf8_codeset(code_set)
85 	const	char	*code_set;
86 {
87 	codeset = code_set;
88 }
89 #endif
90 
91 
92 EXPORT void
utf8_init(type)93 utf8_init(type)
94 	int	type;
95 {
96 	if (codeset == NULL)
97 		codeset = "ISO8859-1";
98 #ifndef	ICONV_DEBUG
99 	if (streql(codeset, "ISO8859-1") ||
100 	    streql(codeset, "ISO-8859-1") ||
101 	    streql(codeset, "ISO8859_1") ||
102 	    streql(codeset, "ISO_8859_1") ||
103 	    streql(codeset, "8859-1") ||
104 	    streql(codeset, "8859_1")) {
105 		p_to_utf8 = _to_utf8;
106 		p_from_utf8 = _from_utf8;
107 		return;
108 	}
109 	if (streql(codeset, "UTF-8") ||
110 	    streql(codeset, "UTF8") ||
111 	    streql(codeset, "UTF_8")) {
112 		p_to_utf8 = _to_none;
113 		p_from_utf8 = _from_none;
114 		return;
115 	}
116 #endif
117 	if (type & S_CREATE) {
118 #ifdef	USE_ICONV
119 		if (ic_to != (iconv_t)-1) {
120 			iconv_close(ic_to);
121 		}
122 		ic_to = iconv_open("UTF-8", codeset);
123 #ifdef	ICONV_DEBUG
124 		fprintf(stderr, "ic_to %p\n", ic_to);
125 #endif
126 		if (ic_to != (iconv_t)-1)
127 			p_to_utf8 = _to_iconv;
128 		else
129 #endif
130 			p_to_utf8 = _to_utf8;
131 	}
132 	if (type & S_EXTRACT) {
133 #ifdef	USE_ICONV
134 		if (ic_from != (iconv_t)-1) {
135 			iconv_close(ic_from);
136 		}
137 		ic_from = iconv_open(codeset, "UTF-8");
138 #ifdef	ICONV_DEBUG
139 		fprintf(stderr, "ic_from %p\n", ic_from);
140 #endif
141 		if (ic_from != (iconv_t)-1)
142 			p_from_utf8 = _from_iconv;
143 		else
144 #endif
145 			p_from_utf8 = _from_utf8;
146 	}
147 }
148 
149 EXPORT void
utf8_fini()150 utf8_fini()
151 {
152 #ifdef	USE_ICONV
153 	if (ic_to != (iconv_t)-1) {
154 		iconv_close(ic_to);
155 		ic_to = (iconv_t)-1;
156 	}
157 	if (ic_from != (iconv_t)-1) {
158 		iconv_close(ic_from);
159 		ic_from = (iconv_t)-1;
160 	}
161 #endif
162 }
163 
164 EXPORT size_t
to_utf8(to,tolen,from,len)165 to_utf8(to, tolen, from, len)
166 	register Uchar	*to;
167 		size_t	tolen;
168 	register Uchar	*from;
169 	register size_t	len;
170 {
171 	return (p_to_utf8(to, tolen, from, len));
172 }
173 
174 /*
175  * First copy len bytes from the source, convert it to UTF-8 assuming that it
176  * is in ISO-8859-1 encoding. Then add a final null byte. Return the number of
177  * characters written to the destination excluding the final null byte
178  * (strlen(to)).
179  */
180 LOCAL size_t
_to_utf8(to,tolen,from,len)181 _to_utf8(to, tolen, from, len)
182 	register Uchar	*to;
183 		size_t	tolen;
184 	register Uchar	*from;
185 	register size_t	len;
186 {
187 	register Uchar	*oto = to;
188 	register Uchar	c;
189 
190 	if (len == 0)
191 		goto out;
192 
193 	do {
194 		c = *from++;
195 		if (c <= 0x7F) {
196 			*to++ = c;
197 		} else if (c <= 0xBF) {
198 			*to++ = 0xC2;
199 			*to++ = c;
200 		} else { /* c <= 0xFF */
201 			*to++ = 0xC3;
202 			*to++ = c & 0xBF;
203 		}
204 		/*
205 		 * XXX We have plenty of space in "to" when we are called.
206 		 * XXX Should we check wether we did hit "tolen"?
207 		 */
208 	} while (--len > 0);
209 out:
210 	*to = '\0';
211 	return (to - oto);
212 }
213 
#ifdef	USE_ICONV
/*
 * Convert "len" bytes from "from" to UTF-8 via the iconv descriptor
 * "ic_to" and store at most "tolen" bytes in "to". A final null byte is
 * added if there is room left in "to".
 *
 * Any iconv() result other than 0 (an error or a number of nonidentical
 * translations) is reported with errmsg(); in star this is subject to the
 * E_ICONV error control. The shift state of the descriptor is reset before
 * returning.
 *
 * Returns the number of bytes stored in "to" excluding the null byte.
 */
LOCAL size_t
_to_iconv(to, tolen, from, len)
	Uchar	*to;
	size_t	tolen;
	Uchar	*from;
	size_t	len;
{
#ifdef	HAVE_ICONV_CONST
	const char	*src = (char *)from;
#else
	char		*src = (char *)from;
#endif
	char		*dst = (char *)to;
	size_t		srcleft = len;
	size_t		dstleft = tolen;
	size_t		nconv;

	seterrno(0);
	nconv = iconv(ic_to, &src, &srcleft, &dst, &dstleft);
	if (dstleft != 0)
		*dst = '\0';

	/*
	 * Error (-1) or nonidentical translations (>0)
	 */
#ifdef	__STAR__
	if (nconv != 0 && !errhidden(E_ICONV, (char *)from)) {
		if (!errwarnonly(E_ICONV, (char *)from))
			xstats.s_iconv++;
		errmsg("Cannot convert '%s' to UTF-8.\n", from);
		(void) errabort(E_ICONV, (char *)from, TRUE);
	}
#else
	if (nconv != 0)
		errmsg("Cannot convert '%s' to UTF-8.\n", from);
#endif

	/*
	 * Bring the descriptor back to the initial shift state.
	 */
	(void) iconv(ic_to, NULL, NULL, NULL, NULL);
	return (tolen - dstleft);
}
#endif
255 
256 LOCAL size_t
_to_none(to,tolen,from,len)257 _to_none(to, tolen, from, len)
258 	Uchar	*to;
259 	size_t	tolen;
260 	Uchar	*from;
261 	size_t	len;
262 {
263 	if (tolen < len) {
264 		movebytes(from, to, tolen);
265 		return (tolen);
266 	}
267 	*movebytes(from, to, len) = '\0';
268 	return (len);
269 }
270 
271 EXPORT BOOL
from_utf8(to,tolen,from,lenp)272 from_utf8(to, tolen, from, lenp)
273 	Uchar	*to;
274 	size_t	tolen;
275 	Uchar	*from;
276 	size_t	*lenp;
277 {
278 	return (p_from_utf8(to, tolen, from, lenp));
279 }
280 
281 /*
282  * First copy len bytes from the source and convert it from UTF-8 assuming
283  * ISO-8859-1 encoding. Then add a final null byte. Set *lenp to the number of
284  * bytes written to the destination excluding the final null byte (strlen(to)).
285  * Return FALSE in case that an illegal ISO-8859-1 character was seen in the
286  * UTF-8 stream.
287  */
288 LOCAL BOOL
_from_utf8(to,tolen,from,lenp)289 _from_utf8(to, tolen, from, lenp)
290 	register Uchar	*to;
291 		size_t	tolen;
292 	register Uchar	*from;
293 		size_t	*lenp;
294 {
295 	register Uchar	*oto = to;
296 	register Uchar	c;
297 	register BOOL	ret = TRUE;
298 	register size_t	len = *lenp;
299 		Uchar	*endp = to + tolen;
300 
301 	if (len == 0)
302 		goto out;
303 
304 	do {
305 		c = *from++;
306 		if (c <= 0x7F) {
307 			*to++ = c;
308 		} else if (c == 0xC0) {
309 			*to++ = *from++ & 0x7F;
310 			if (--len == 0)
311 				break;
312 		} else if (c == 0xC1) {
313 			*to++ = (*from++ | 0x40) & 0x7F;
314 			if (--len == 0)
315 				break;
316 		} else if (c == 0xC2) {
317 			*to++ = *from++;
318 			if (--len == 0)
319 				break;
320 		} else if (c == 0xC3) {
321 			*to++ = *from++ | 0x40;
322 			if (--len == 0)
323 				break;
324 		} else {
325 			ret = FALSE;		/* unknown/illegal UTF-8 char */
326 			*to++ = '_';		/* use default character    */
327 			if (c < 0xE0) {
328 				from++;		/* 2 bytes in total */
329 				if (--len == 0)
330 					break;
331 			} else if (c < 0xF0) {
332 				from += 2;	/* 3 bytes in total */
333 				if (len <= 2)
334 					break;
335 				len -= 2;
336 			} else if (c < 0xF8) {
337 				from += 3;	/* 4 bytes in total */
338 				if (len <= 3)
339 					break;
340 				len -= 3;
341 			} else if (c < 0xFC) {
342 				from += 4;	/* 5 bytes in total */
343 				if (len <= 4)
344 					break;
345 				len -= 4;
346 			} else if (c < 0xFE) {
347 				from += 5;	/* 6 bytes in total */
348 				if (len <= 5)
349 					break;
350 				len -= 5;
351 			} else {
352 				while (len > 0) {
353 					c = *from;
354 					/*
355 					 * Test for 7 bit ASCII + non prefix
356 					 */
357 					if (c <= 0xBF)
358 						break;
359 					from++;
360 					if (--len == 0)
361 						break;
362 				}
363 				if (len == 0)
364 					break;
365 			}
366 		}
367 		/*
368 		 * It is easy to check, since the result is always only one
369 		 * character. We need to stop here since the new path handling
370 		 * may need to grow the result in case of an overflow.
371 		 */
372 		if (to >= endp)
373 			break;
374 	} while (--len > 0);
375 out:
376 	if (to < endp)
377 		*to = '\0';
378 	*lenp = (to - oto);
379 	return (ret);
380 }
381 
#ifdef	USE_ICONV
/*
 * Convert *len bytes of UTF-8 from "from" to the local charset via the
 * iconv descriptor "ic_from" and store at most "tolen" bytes in "to".
 * A final null byte is added if there is room left in "to".
 *
 * On return, *len holds the number of bytes stored in "to". If iconv()
 * fails with E2BIG, *len is set to "tolen" to signal the overflow and no
 * message is printed. Any other iconv() result different from 0 is
 * reported with errmsg(); in star this is subject to the E_ICONV error
 * control. The shift state of the descriptor is reset before returning.
 *
 * Returns TRUE only if iconv() returned 0.
 */
LOCAL BOOL
_from_iconv(to, tolen, from, len)
	Uchar	*to;
	size_t	tolen;
	Uchar	*from;
	size_t	*len;
{
#ifdef	HAVE_ICONV_CONST
	const char	*src = (char *)from;
#else
	char		*src = (char *)from;
#endif
	char		*dst = (char *)to;
	size_t		srcleft = *len;
	size_t		dstleft = tolen;
	size_t		nconv;
	BOOL		ok = TRUE;

	seterrno(0);
	nconv = iconv(ic_from, &src, &srcleft, &dst, &dstleft);
	if (dstleft != 0)
		*dst = '\0';
	*len = tolen - dstleft;

	if (nconv == (size_t)-1 && geterrno() == E2BIG) {
		/*
		 * in case of an overflow signal this via *len,
		 * even if on Linux where "dstleft" is 0 in such a case.
		 */
		*len = tolen;
		ok = FALSE;
	} else if (nconv != 0) {
		/*
		 * -1 or # of nonidentical translations (>0)
		 */
		ok = FALSE;
#ifdef	__STAR__
		if (!errhidden(E_ICONV, (char *)from)) {
			if (!errwarnonly(E_ICONV, (char *)from))
				xstats.s_iconv++;
			errmsg("Cannot convert '%s' to local charset.\n", from);
			(void) errabort(E_ICONV, (char *)from, TRUE);
		}
#else
		errmsg("Cannot convert '%s' to local charset.\n", from);
#endif
	}

	/*
	 * Bring the descriptor back to the initial shift state.
	 */
	(void) iconv(ic_from, NULL, NULL, NULL, NULL);
	return (ok);
}
#endif
433 
434 LOCAL BOOL
_from_none(to,tolen,from,len)435 _from_none(to, tolen, from, len)
436 	Uchar	*to;
437 	size_t	tolen;
438 	Uchar	*from;
439 	size_t	*len;
440 {
441 	size_t	clen = *len;
442 
443 	if (tolen < clen) {
444 		movebytes(from, to, tolen);
445 		*len = tolen;
446 		return (TRUE);
447 	}
448 	*movebytes(from, to, clen) = '\0';
449 	return (TRUE);
450 }
451