1 /* @(#)unicode.c	1.22 19/12/03 Copyright 2001-2019 J. Schilling */
2 #include <schily/mconfig.h>
3 #ifndef lint
4 static	UConst char sccsid[] =
5 	"@(#)unicode.c	1.22 19/12/03 Copyright 2001-2019 J. Schilling";
6 #endif
7 /*
8  *	Routines to convert from/to UNICODE
9  *
10  *	This is currently a very simple implementation that only
11  *	handles ISO-8859-1 coding using intrinsic code and using
12  *	iconv() in case of other encodings.
13  *
14  *	Copyright (c) 2001-2019 J. Schilling
15  */
16 /*
17  * The contents of this file are subject to the terms of the
18  * Common Development and Distribution License, Version 1.0 only
19  * (the "License").  You may not use this file except in compliance
20  * with the License.
21  *
22  * See the file CDDL.Schily.txt in this distribution for details.
23  * A copy of the CDDL is also available via the Internet at
24  * http://www.opensource.org/licenses/cddl1.txt
25  *
26  * When distributing Covered Code, include this CDDL HEADER in each
27  * file and include the License file CDDL.Schily.txt from this distribution.
28  */
29 
30 #include <schily/stdio.h>
31 #include <schily/types.h>
32 #include <schily/utypes.h>
33 #include <schily/iconv.h>
34 #include <schily/standard.h>
35 #include <schily/errno.h>
36 #define	GT_COMERR		/* #define comerr gtcomerr */
37 #define	GT_ERROR		/* #define error gterror   */
38 #include <schily/schily.h>
39 #ifdef	__STAR__
40 #include "star.h"
41 #include "starsubs.h"
42 #include "checkerr.h"
43 #else
44 #include "header.h"
45 #endif
46 
47 EXPORT	void	utf8_init	__PR((int type));
48 EXPORT	void	utf8_fini	__PR((void));
49 EXPORT	size_t	to_utf8		__PR((Uchar *to, size_t tolen,
50 					Uchar *from, size_t len));
51 LOCAL	size_t	_to_utf8	__PR((Uchar *to, size_t tolen,
52 					Uchar *from, size_t len));
53 #ifdef	USE_ICONV
54 LOCAL	size_t	_to_iconv	__PR((Uchar *to, size_t tolen,
55 					Uchar *from, size_t len));
56 #endif
57 LOCAL	size_t	_to_none	__PR((Uchar *to, size_t tolen,
58 					Uchar *from, size_t len));
59 EXPORT	BOOL	from_utf8	__PR((Uchar *to, size_t tolen,
60 					Uchar *from, size_t *len));
61 LOCAL	BOOL	_from_utf8	__PR((Uchar *to, size_t tolen,
62 					Uchar *from, size_t *len));
63 #ifdef	USE_ICONV
64 LOCAL	BOOL	_from_iconv	__PR((Uchar *to, size_t tolen,
65 					Uchar *from, size_t *len));
66 #endif
67 LOCAL	BOOL	_from_none	__PR((Uchar *to, size_t tolen,
68 					Uchar *from, size_t *len));
69 
70 LOCAL	size_t	(*p_to_utf8)	__PR((Uchar *to, size_t tolen,
71 					Uchar *from, size_t len)) = _to_utf8;
72 LOCAL	BOOL	(*p_from_utf8)	__PR((Uchar *to, size_t tolen,
73 					Uchar *from, size_t *len)) = _from_utf8;
74 
75 LOCAL	iconv_t	ic_from	= (iconv_t)-1;
76 LOCAL	iconv_t	ic_to	= (iconv_t)-1;
77 
78 #ifdef	__STAR__
79 extern	char		*codeset;
80 #else
81 LOCAL	const char	*codeset;
82 
83 EXPORT void
utf8_codeset(code_set)84 utf8_codeset(code_set)
85 	const	char	*code_set;
86 {
87 	codeset = code_set;
88 }
89 #endif
90 
91 
92 EXPORT void
utf8_init(type)93 utf8_init(type)
94 	int	type;
95 {
96 	if (codeset == NULL)
97 		codeset = "ISO8859-1";
98 #ifndef	ICONV_DEBUG
99 	if (streql(codeset, "ISO8859-1") ||
100 	    streql(codeset, "ISO-8859-1") ||
101 	    streql(codeset, "ISO8859_1") ||
102 	    streql(codeset, "ISO_8859_1") ||
103 	    streql(codeset, "8859-1") ||
104 	    streql(codeset, "8859_1")) {
105 		p_to_utf8 = _to_utf8;
106 		p_from_utf8 = _from_utf8;
107 		return;
108 	}
109 	if (streql(codeset, "UTF-8") ||
110 	    streql(codeset, "UTF8") ||
111 	    streql(codeset, "UTF_8")) {
112 		p_to_utf8 = _to_none;
113 		p_from_utf8 = _from_none;
114 		return;
115 	}
116 #endif
117 	if (type & S_CREATE) {
118 #ifdef	USE_ICONV
119 		if (ic_to != (iconv_t)-1) {
120 			iconv_close(ic_to);
121 		}
122 		ic_to = iconv_open("UTF-8", codeset);
123 #ifdef	ICONV_DEBUG
124 		fprintf(stderr, "ic_to %p\n", ic_to);
125 #endif
126 		if (ic_to != (iconv_t)-1)
127 			p_to_utf8 = _to_iconv;
128 		else
129 #endif
130 			p_to_utf8 = _to_utf8;
131 	}
132 	if (type & S_EXTRACT) {
133 #ifdef	USE_ICONV
134 		if (ic_from != (iconv_t)-1) {
135 			iconv_close(ic_from);
136 		}
137 		ic_from = iconv_open(codeset, "UTF-8");
138 #ifdef	ICONV_DEBUG
139 		fprintf(stderr, "ic_from %p\n", ic_from);
140 #endif
141 		if (ic_from != (iconv_t)-1)
142 			p_from_utf8 = _from_iconv;
143 		else
144 #endif
145 			p_from_utf8 = _from_utf8;
146 	}
147 }
148 
149 EXPORT void
utf8_fini()150 utf8_fini()
151 {
152 #ifdef	USE_ICONV
153 	if (ic_to != (iconv_t)-1) {
154 		iconv_close(ic_to);
155 		ic_to = (iconv_t)-1;
156 	}
157 	if (ic_from != (iconv_t)-1) {
158 		iconv_close(ic_from);
159 		ic_from = (iconv_t)-1;
160 	}
161 #endif
162 }
163 
164 EXPORT size_t
to_utf8(to,tolen,from,len)165 to_utf8(to, tolen, from, len)
166 	register Uchar	*to;
167 		size_t	tolen;
168 	register Uchar	*from;
169 	register size_t	len;
170 {
171 	return (p_to_utf8(to, tolen, from, len));
172 }
173 
174 /*
175  * First copy len bytes from the source, convert it to UTF-8 assuming that it
176  * is in ISO-8859-1 encoding. Then add a final null byte. Return the number of
177  * characters written to the destination excluding the final null byte
178  * (strlen(to)).
179  */
180 LOCAL size_t
_to_utf8(to,tolen,from,len)181 _to_utf8(to, tolen, from, len)
182 	register Uchar	*to;
183 		size_t	tolen;
184 	register Uchar	*from;
185 	register size_t	len;
186 {
187 	register Uchar	*oto = to;
188 	register Uchar	c;
189 
190 	if (len == 0)
191 		goto out;
192 
193 	do {
194 		c = *from++;
195 		if (c <= 0x7F) {
196 			*to++ = c;
197 		} else if (c <= 0xBF) {
198 			*to++ = 0xC2;
199 			*to++ = c;
200 		} else { /* c <= 0xFF */
201 			*to++ = 0xC3;
202 			*to++ = c & 0xBF;
203 		}
204 		/*
205 		 * XXX We have plenty of space in "to" when we are called.
206 		 * XXX Should we check wether we did hit "tolen"?
207 		 */
208 	} while (--len > 0);
209 out:
210 	*to = '\0';
211 	return (to - oto);
212 }
213 
#ifdef	USE_ICONV
/*
 * Convert len bytes from the source to UTF-8 by calling iconv() with the
 * "ic_to" descriptor. A final null byte is added if iconv() left room for
 * it in the destination.
 *
 * Any iconv() result other than 0 is reported as a conversion problem.
 * The conversion state of the descriptor is reset before returning.
 *
 * Returns the number of bytes that iconv() has put into the destination.
 */
LOCAL size_t
_to_iconv(to, tolen, from, len)
	Uchar	*to;
	size_t	tolen;
	Uchar	*from;
	size_t	len;
{
#ifdef	HAVE_ICONV_CONST
	const char	*src = (char *)from;
#else
	char		*src = (char *)from;
#endif
	char		*dst = (char *)to;
	size_t		srcleft = len;		/* input bytes not yet read */
	size_t		dstleft = tolen;	/* free bytes in the output */
	size_t		rv;

	seterrno(0);
	rv = iconv(ic_to, &src, &srcleft, &dst, &dstleft);
	if (dstleft > 0)
		*dst = '\0';

	/*
	 * (size_t)-1 is an error, a positive value is the number of
	 * characters that could not be converted in a reversible way.
	 */
	if (rv != 0) {
#ifdef	__STAR__
		if (!errhidden(E_ICONV, (char *)from)) {
			if (!errwarnonly(E_ICONV, (char *)from))
				xstats.s_iconv++;
#endif
			errmsg("Cannot convert '%s' to UTF-8.\n", from);
#ifdef	__STAR__
			(void) errabort(E_ICONV, (char *)from, TRUE);
		}
#endif
	}

	/*
	 * Bring the descriptor back into the initial shift state,
	 * so the next call starts from scratch.
	 */
	(void) iconv(ic_to, NULL, NULL, NULL, NULL);
	return (tolen - dstleft);
}
#endif
255 
256 LOCAL size_t
_to_none(to,tolen,from,len)257 _to_none(to, tolen, from, len)
258 	Uchar	*to;
259 	size_t	tolen;
260 	Uchar	*from;
261 	size_t	len;
262 {
263 	*movebytes(from, to, len) = '\0';
264 	return (len);
265 }
266 
267 EXPORT BOOL
from_utf8(to,tolen,from,lenp)268 from_utf8(to, tolen, from, lenp)
269 	Uchar	*to;
270 	size_t	tolen;
271 	Uchar	*from;
272 	size_t	*lenp;
273 {
274 	return (p_from_utf8(to, tolen, from, lenp));
275 }
276 
277 /*
278  * First copy len bytes from the source and convert it from UTF-8 assuming
279  * ISO-8859-1 encoding. Then add a final null byte. Set *lenp to the number of
280  * bytes written to the destination excluding the final null byte (strlen(to)).
281  * Return FALSE in case that an illegal ISO-8859-1 character was seen in the
282  * UTF-8 stream.
283  */
284 LOCAL BOOL
_from_utf8(to,tolen,from,lenp)285 _from_utf8(to, tolen, from, lenp)
286 	register Uchar	*to;
287 		size_t	tolen;
288 	register Uchar	*from;
289 		size_t	*lenp;
290 {
291 	register Uchar	*oto = to;
292 	register Uchar	c;
293 	register BOOL	ret = TRUE;
294 	register size_t	len = *lenp;
295 		Uchar	*endp = to + tolen;
296 
297 	if (len == 0)
298 		goto out;
299 
300 	do {
301 		c = *from++;
302 		if (c <= 0x7F) {
303 			*to++ = c;
304 		} else if (c == 0xC0) {
305 			*to++ = *from++ & 0x7F;
306 			if (--len == 0)
307 				break;
308 		} else if (c == 0xC1) {
309 			*to++ = (*from++ | 0x40) & 0x7F;
310 			if (--len == 0)
311 				break;
312 		} else if (c == 0xC2) {
313 			*to++ = *from++;
314 			if (--len == 0)
315 				break;
316 		} else if (c == 0xC3) {
317 			*to++ = *from++ | 0x40;
318 			if (--len == 0)
319 				break;
320 		} else {
321 			ret = FALSE;		/* unknown/illegal UTF-8 char */
322 			*to++ = '_';		/* use default character    */
323 			if (c < 0xE0) {
324 				from++;		/* 2 bytes in total */
325 				if (--len == 0)
326 					break;
327 			} else if (c < 0xF0) {
328 				from += 2;	/* 3 bytes in total */
329 				if (len <= 2)
330 					break;
331 				len -= 2;
332 			} else if (c < 0xF8) {
333 				from += 3;	/* 4 bytes in total */
334 				if (len <= 3)
335 					break;
336 				len -= 3;
337 			} else if (c < 0xFC) {
338 				from += 4;	/* 5 bytes in total */
339 				if (len <= 4)
340 					break;
341 				len -= 4;
342 			} else if (c < 0xFE) {
343 				from += 5;	/* 6 bytes in total */
344 				if (len <= 5)
345 					break;
346 				len -= 5;
347 			} else {
348 				while (len > 0) {
349 					c = *from;
350 					/*
351 					 * Test for 7 bit ASCII + non prefix
352 					 */
353 					if (c <= 0xBF)
354 						break;
355 					from++;
356 					if (--len == 0)
357 						break;
358 				}
359 				if (len == 0)
360 					break;
361 			}
362 		}
363 		/*
364 		 * It is easy to check, since the result is always only one
365 		 * character. We need to stop here since the new path handling
366 		 * may need to grow the result in case of an overflow.
367 		 */
368 		if (to >= endp)
369 			break;
370 	} while (--len > 0);
371 out:
372 	if (to < endp)
373 		*to = '\0';
374 	*lenp = (to - oto);
375 	return (ret);
376 }
377 
#ifdef	USE_ICONV
/*
 * Convert *len bytes of UTF-8 from the source to the local codeset by
 * calling iconv() with the "ic_from" descriptor. A final null byte is
 * added if iconv() left room for it in the destination.
 *
 * On return, *len holds the number of bytes that iconv() has put into the
 * destination. If the destination was too small (E2BIG), *len is set to
 * tolen instead and no message is printed. Any other iconv() result that
 * differs from 0 is reported as a conversion problem.
 * The conversion state of the descriptor is reset before returning.
 *
 * Returns TRUE only if iconv() returned 0.
 */
LOCAL BOOL
_from_iconv(to, tolen, from, len)
	Uchar	*to;
	size_t	tolen;
	Uchar	*from;
	size_t	*len;
{
#ifdef	HAVE_ICONV_CONST
	const char	*src = (char *)from;
#else
	char		*src = (char *)from;
#endif
	char		*dst = (char *)to;
	size_t		srcleft = *len;		/* input bytes not yet read */
	size_t		dstleft = tolen;	/* free bytes in the output */
	size_t		rv;
	BOOL		ok = TRUE;

	seterrno(0);
	rv = iconv(ic_from, &src, &srcleft, &dst, &dstleft);
	if (dstleft > 0)
		*dst = '\0';
	*len = tolen - dstleft;

	if (rv == (size_t)-1 && geterrno() == E2BIG) {
		/*
		 * The destination is too small. Tell this to the caller
		 * via *len; this is done explicitly as not every platform
		 * reports the same amount of free space in this case.
		 */
		*len = tolen;
		ok = FALSE;
	} else if (rv != 0) {
		/*
		 * (size_t)-1 is an error, a positive value is the number
		 * of characters that could not be converted in a
		 * reversible way.
		 */
#ifdef	__STAR__
		if (!errhidden(E_ICONV, (char *)from)) {
			if (!errwarnonly(E_ICONV, (char *)from))
				xstats.s_iconv++;
#endif
			errmsg("Cannot convert '%s' to local charset.\n", from);
#ifdef	__STAR__
			(void) errabort(E_ICONV, (char *)from, TRUE);
		}
#endif
		ok = FALSE;
	}

	/*
	 * Bring the descriptor back into the initial shift state,
	 * so the next call starts from scratch.
	 */
	(void) iconv(ic_from, NULL, NULL, NULL, NULL);
	return (ok);
}
#endif
429 
430 LOCAL BOOL
_from_none(to,tolen,from,len)431 _from_none(to, tolen, from, len)
432 	Uchar	*to;
433 	size_t	tolen;
434 	Uchar	*from;
435 	size_t	*len;
436 {
437 	*movebytes(from, to, *len) = '\0';
438 	return (TRUE);
439 }
440