1 /************************************************************************/
2 /*									*/
3 /*  Convert text from/to UTF-8 using iconv.				*/
4 /*									*/
5 /************************************************************************/
6 
7 #   include	"textEncodingConfig.h"
8 
9 #   include	<stdlib.h>
10 #   include	<string.h>
11 #   include	<errno.h>
12 #   include	<iconv.h>
13 
14 #   include	"uniUtf8.h"
15 #   include	<appDebugon.h>
16 
17 #   include	"textConverter.h"
18 #   include	"textConverterImpl.h"
19 #   include	<utilIndexMapping.h>
20 #   include	"uniLegacyEncoding.h"
21 #   include	"uniLegacyMapping.h"
22 
/************************************************************************/
/*									*/
/*  Set the name of the native encoding of a text converter.		*/
/*  The name is copied. A null name is reported and ignored.		*/
/*									*/
/*  If the name changes, the iconv descriptors for the old		*/
/*  encoding are closed. They are reopened on demand by the		*/
/*  conversion functions below.						*/
/*									*/
/*  If the copy of the name cannot be allocated, that is		*/
/*  reported and the converter is left unchanged.			*/
/*									*/
/************************************************************************/

void textConverterSetNativeEncodingName(	TextConverter *	tc,
						const char *	encodingName )
    {
    char *	newName;

    if  ( ! encodingName )
	{ XDEB(encodingName); return;	}

    /*  Same name (or the very same string): Nothing changes. */
    if  ( tc->tcNativeEncodingName				&&
	  ! strcmp( tc->tcNativeEncodingName, encodingName )	)
	{ return;	}

    /*  Copy before anything is torn down: On failure the	*/
    /*  converter keeps its old, consistent, state.		*/
    newName= strdup( encodingName );
    if  ( ! newName )
	{ XDEB(newName); return;	}

    /*  The open descriptors belong to the old encoding. */
    if  ( (iconv_t)tc->tcIconvToUtf8 != (iconv_t)-1 )
	{
	iconv_close( (iconv_t)tc->tcIconvToUtf8 );
	tc->tcIconvToUtf8= (struct TextConverterImpl *)(iconv_t)-1;
	}

    if  ( (iconv_t)tc->tcIconvFrUtf8 != (iconv_t)-1 )
	{
	iconv_close( (iconv_t)tc->tcIconvFrUtf8 );
	tc->tcIconvFrUtf8= (struct TextConverterImpl *)(iconv_t)-1;
	}

    /*  free() accepts a null pointer: No old name is fine. */
    free( tc->tcNativeEncodingName );
    tc->tcNativeEncodingName= newName;

    return;
    }
61 
/************************************************************************/
/*									*/
/*  Initialize a text converter: No encoding name, no producer		*/
/*  callback and both iconv descriptors marked as not open.		*/
/*									*/
/************************************************************************/

void textInitTextConverter(	TextConverter *		tc )
    {
    /*  (iconv_t)-1 is the value that the rest of this file	*/
    /*  tests for to see that a descriptor is not open.		*/
    struct TextConverterImpl * const	notOpen=
				(struct TextConverterImpl *)(iconv_t)-1;

    tc->tcProduce= (TextConverterProduce)0;
    tc->tcNativeEncodingName= (char *)0;

    tc->tcIconvFrUtf8= notOpen;
    tc->tcIconvToUtf8= notOpen;
    }
70 
/************************************************************************/
/*									*/
/*  Release the resources held by a text converter: The copy		*/
/*  of the encoding name and the iconv descriptors.			*/
/*									*/
/*  All members that are released are reset, so the converter		*/
/*  can be cleaned again or reused without a double free.		*/
/*									*/
/************************************************************************/

void textCleanTextConverter(	TextConverter *		tc )
    {
    /*  free() accepts a null pointer. Reset the member: It	*/
    /*  must not keep pointing to the released memory.		*/
    free( tc->tcNativeEncodingName );
    tc->tcNativeEncodingName= (char *)0;

    if  ( (iconv_t)tc->tcIconvToUtf8 != (iconv_t)-1 )
	{
	iconv_close( (iconv_t)tc->tcIconvToUtf8 );
	tc->tcIconvToUtf8= (struct TextConverterImpl *)(iconv_t)-1;
	}

    if  ( (iconv_t)tc->tcIconvFrUtf8 != (iconv_t)-1 )
	{
	iconv_close( (iconv_t)tc->tcIconvFrUtf8 );
	tc->tcIconvFrUtf8= (struct TextConverterImpl *)(iconv_t)-1;
	}
    }
88 
89 /************************************************************************/
90 
textConverterProduce(const TextConverter * tc,void * through,int produced,const char * text,int len)91 static int textConverterProduce(	const TextConverter *	tc,
92 					void *			through,
93 					int			produced,
94 					const char *		text,
95 					int			len )
96     {
97     int		step;
98 
99     step= (*tc->tcProduce)( through, produced, text, len );
100     if  ( step < 0 )
101 	{ LLLDEB(produced,len,step); return -1;	}
102 
103     return step;
104     }
105 
106 /************************************************************************/
107 /*									*/
108 /*  Hack because iconv does not support the symbol encoding.		*/
109 /*  Convert legacy bytes to UTF-8.					*/
110 /*									*/
111 /************************************************************************/
112 
/************************************************************************/
/*									*/
/*  Every input byte is mapped through unicodes[]. A byte with		*/
/*  a negative mapping is emitted as its own value. Output is		*/
/*  collected in a local buffer that is flushed to the			*/
/*  producer of the converter.						*/
/*									*/
/*  Returns the updated value of produced, or -1 on failure.		*/
/*  On success, the number of input bytes is added to			*/
/*  *pConsumed.								*/
/*									*/
/************************************************************************/

static int textConverterConvertBytesToUtf8(
					const TextConverter *	tc,
					const int		unicodes[256],
					void *			through,
					int *			pConsumed,
					int			produced,
					const char *		text,
					int			len )
    {
    char		utf8[750];
    /*  Flush early enough to keep room for one more symbol. */
    const int		flushAt= sizeof(utf8)- 7;
    int			buffered= 0;
    int			step;
    int			pos;

    for ( pos= 0; pos < len; pos++ )
	{
	const int	byte= text[pos] & 0xff;
	int		symbol;

	if  ( buffered >= flushAt )
	    {
	    step= textConverterProduce( tc, through, produced, utf8, buffered );
	    if  ( step < 0 )
		{ LLLDEB(produced,buffered,step); return -1;	}

	    produced += step;
	    buffered= 0;
	    }

	/*  No mapping for this byte: Use the byte value itself. */
	symbol= unicodes[byte];
	if  ( symbol < 0 )
	    { symbol= byte;	}

	step= uniPutUtf8( utf8+ buffered, symbol );
	if  ( step < 1 )
	    { LDEB(step); return -1;	}

	buffered += step;
	}

    /*  Whatever is left in the buffer. */
    if  ( buffered > 0 )
	{
	step= textConverterProduce( tc, through, produced, utf8, buffered );
	if  ( step < 0 )
	    { LLLDEB(produced,buffered,step); return -1;	}

	produced += step;
	}

    /*  pos counts the bytes that the loop has handled. */
    *pConsumed += pos;
    return produced;
    }
162 
163 /************************************************************************/
164 /*									*/
165 /*  Hack because iconv does not support the symbol encoding.		*/
166 /*  Convert UTF-8 to legacy.						*/
167 /*									*/
168 /************************************************************************/
169 
/************************************************************************/
/*									*/
/*  If the index mapping is still empty, it is built from		*/
/*  unicodes[].								*/
/*									*/
/*  Conversion stops at the first symbol that has no legacy		*/
/*  code: What was converted so far is produced and counted		*/
/*  in *pConsumed.							*/
/*									*/
/*  Returns the updated value of produced, or -1 on failure.		*/
/*									*/
/************************************************************************/

static int textConverterConvertBytesFromUtf8(
					IndexMapping *		im,
					const int		unicodes[256],
					void *			through,
					TextConverterProduce	produce,
					int *			pConsumed,
					int			produced,
					const char *		text,
					int			len )
    {
    unsigned char	legacy[750];
    const int		capacity= sizeof(legacy);
    int			done= 0;
    int			step;
    int			consumed= 0;

    if  ( utilIndexMappingIsEmpty( im ) && unicodes )
	{
	if  ( utilIndexMappingBuildBackward( im, unicodes, 256 ) )
	    { XDEB(unicodes); return -1;	}
	}

    /*  consumed is the offset of the next symbol in text. */
    while( consumed < len )
	{
	unsigned short	symbol;
	int		code;

	if  ( done >= capacity )
	    {
	    step= (*produce)( through, produced, (char *)legacy, done );
	    if  ( step < 0 )
		{ LLLDEB(produced,done,step); return -1;	}

	    produced += step;
	    done= 0;
	    }

	step= uniGetUtf8( &symbol, text+ consumed );
	if  ( step < 1 )
	    { LDEB(step); return -1;	}

	/*  No legacy code for this symbol: Stop here. */
	code= utilIndexMappingGetU( im, symbol );
	if  ( code < 0 )
	    { break;	}

	legacy[done]= code;
	done++;
	consumed += step;
	}

    /*  Whatever is left in the buffer. */
    if  ( done > 0 )
	{
	step= (*produce)( through, produced, (char *)legacy, done );
	if  ( step < 0 )
	    { LLLDEB(produced,done,step); return -1;	}

	produced += step;
	}

    *pConsumed += consumed;
    return produced;
    }
227 
228 /************************************************************************/
229 /*									*/
230 /*  Convert bytes using iconv. This is used in both directions.		*/
231 /*									*/
232 /*  Note that GNU iconv() expects a 'char **' as its second argument	*/
233 /*  rather than a 'const char **' as documented in the single UNIX spec.*/
234 /*  See: http://www.opengroup.org/pubs/online/7908799/xsh/iconv.html.	*/
235 /*									*/
236 /************************************************************************/
237 
/************************************************************************/
/*									*/
/*  The input is converted in pieces that fit a local buffer.		*/
/*  Every piece is passed to the producer of the converter.		*/
/*									*/
/*  Conversion stops without an error at input that iconv		*/
/*  rejects with EILSEQ. Other failures, except E2BIG, are		*/
/*  reported and make the function return -1.				*/
/*									*/
/*  The errno value of iconv() is saved right after the call:		*/
/*  The producer runs before the value is tested and could		*/
/*  change errno.							*/
/*									*/
/*  On success, the number of bytes consumed is added to		*/
/*  *pConsumed and the updated value of produced is returned.		*/
/*									*/
/************************************************************************/

static int textConverterConvertIconv(	TextConverter *		tc,
					struct TextConverterImpl *	tci,
					void *			through,
					int *			pConsumed,
					int			produced,
					const char *		arg_ibuf,
					size_t			ileft )
    {
    char		scratch[750];
    char *		obuf= scratch;
    size_t		oleft= sizeof(scratch);
    int			consumed= 0;

    /*  errno of the most recent iconv() call, 0 if it succeeded. */
    int			iconvErrno= 0;

    iconv_t		ico= (iconv_t)tci;

#   if defined(__GNUC__) && ! defined(iconv)
    char *		ibuf= (char *)arg_ibuf;
#   else
    const char *	ibuf= arg_ibuf;
#   endif

    while( ileft > 0 )
	{
	const char *	iibuf= ibuf;

	while( oleft > 0 && ileft > 0 )
	    {
	    /*  Only remember why the call stopped. Whether it	*/
	    /*  was complete is derived from ileft below.	*/

	    if  ( iconv( ico, &ibuf, &ileft, &obuf, &oleft ) == (size_t)-1 )
		{ iconvErrno= errno;	}
	    else{ iconvErrno= 0;	}

	    /*  No progress: Stop trying with this buffer. */
	    if  ( ibuf == iibuf )
		{ /*XXDEB(ibuf,iibuf);*/ break;	}

	    consumed += ibuf- iibuf;
	    iibuf= ibuf;
	    }

	if  ( obuf > scratch )
	    {
	    int	step;

	    step= textConverterProduce( tc, through, produced, scratch, obuf- scratch );
	    if  ( step < 0 )
		{ LLLDEB(produced,obuf- scratch,step); return -1;	}

	    produced += step;
	    obuf= scratch;
	    oleft= sizeof(scratch);
	    }

	/*  Input that cannot be converted: Stop here and let	*/
	/*  the number of bytes consumed tell the story.	*/
	if  ( ileft > 0 && iconvErrno == EILSEQ )
	    { /*LSCDEB(errno,strerror(errno),*ibuf);*/ break;	}

	/*  E2BIG: The buffer was full. It has been flushed	*/
	/*  above: continue. Anything else is an error.		*/
	if  ( ileft > 0 && iconvErrno != E2BIG )
	    { LSDEB(iconvErrno,strerror(iconvErrno)); return -1;	}
	}

    *pConsumed += consumed;
    return produced;
    }
299 
300 /************************************************************************/
301 /*									*/
302 /*  Convert the input to UTF-8 bytes.					*/
303 /*									*/
304 /*  Note that GNU iconv() expects a 'char **' as its second argument	*/
305 /*  rather than a 'const char **' as documented in the single UNIX spec.*/
306 /*  See: http://www.opengroup.org/pubs/online/7908799/xsh/iconv.html.	*/
307 /*									*/
308 /*  Also note that iconv_open() returns an (iconv_t)-1 rather than	*/
309 /*  (iconv_t)0 on failure as is to be expected for a call that returns	*/
310 /*  a (void *) pointer.							*/
311 /*									*/
312 /************************************************************************/
313 
/************************************************************************/
/*									*/
/*  If no native encoding name is set, the input is passed		*/
/*  to the producer as it is.						*/
/*									*/
/*  Otherwise the iconv descriptor is opened on first use.		*/
/*  If iconv_open() fails, the SYMBOL and DINGBATS encodings		*/
/*  are converted with the built in tables.				*/
/*									*/
/*  Returns the updated value of produced, or -1 on failure.		*/
/*									*/
/************************************************************************/

int textConverterConvertToUtf8(	TextConverter *		tc,
				void *			through,
				int *			pConsumed,
				int			produced,
				const char *		text,
				int			len )
    {
    /*  Nothing to convert from: Pass the bytes on. */
    if  ( ! tc				||
	  ! tc->tcNativeEncodingName	||
	  ! tc->tcNativeEncodingName[0]	)
	{
	int	step;

	step= textConverterProduce( tc, through, produced, text, len );
	if  ( step < 0 )
	    { LLLDEB(produced,len,step); return -1;	}

	*pConsumed += len;
	return produced+ step;
	}

    if  ( (iconv_t)tc->tcIconvToUtf8 == (iconv_t)-1 )
	{
	tc->tcIconvToUtf8= (struct TextConverterImpl *)
			iconv_open( "UTF-8", tc->tcNativeEncodingName );

	if  ( (iconv_t)tc->tcIconvToUtf8 == (iconv_t)-1 )
	    {
	    /*  Fall back to the tables for the two encodings	*/
	    /*  that are handled without iconv.			*/
	    if  ( ! strcmp( tc->tcNativeEncodingName, "SYMBOL" ) )
		{
		return textConverterConvertBytesToUtf8( tc,
					    uniSymbolGlyphUnicodes,
					    through, pConsumed,
					    produced, text, len );
		}

	    if  ( ! strcmp( tc->tcNativeEncodingName, "DINGBATS" ) )
		{
		return textConverterConvertBytesToUtf8( tc,
					    uniDingbatsGlyphUnicodes,
					    through, pConsumed,
					    produced, text, len );
		}

	    SXDEB(tc->tcNativeEncodingName,tc->tcIconvToUtf8);
	    return -1;
	    }
	}

    produced= textConverterConvertIconv( tc, tc->tcIconvToUtf8,
					    through, pConsumed,
					    produced, text, len );
    if  ( produced < 0 )
	{ LDEB(produced); return -1;	}

    return produced;
    }
374 
375 /************************************************************************/
376 /*									*/
377 /*  Convert the UTF-8 input to legacy bytes.				*/
378 /*									*/
379 /*  Note that GNU iconv() expects a 'char **' as its second argument	*/
380 /*  rather than a 'const char **' as documented in the single UNIX spec.*/
381 /*  See: http://www.opengroup.org/pubs/online/7908799/xsh/iconv.html.	*/
382 /*									*/
383 /*  Also note that iconv_open() returns an (iconv_t)-1 rather than	*/
384 /*  (iconv_t)0 on failure as is to be expected for a call that returns	*/
385 /*  a (void *) pointer.							*/
386 /*									*/
387 /************************************************************************/
388 
/************************************************************************/
/*									*/
/*  If no native encoding name is set, the input is passed		*/
/*  to the producer as it is.						*/
/*									*/
/*  Otherwise the iconv descriptor is opened on first use.		*/
/*  If iconv_open() fails for the SYMBOL encoding, the built		*/
/*  in table is used. For DINGBATS, nothing is consumed or		*/
/*  produced: produced is returned as it was passed in, like		*/
/*  all other paths that return the running total.			*/
/*									*/
/*  Returns the updated value of produced, or -1 on failure.		*/
/*									*/
/************************************************************************/

int textConverterConvertFromUtf8(	TextConverter *		tc,
					void *			through,
					int *			pConsumed,
					int			produced,
					const char *		text,
					int			len )
    {
    if  ( tc				&&
	  tc->tcNativeEncodingName	&&
	  tc->tcNativeEncodingName[0]	)
	{
	if  ( (iconv_t)tc->tcIconvFrUtf8 == (iconv_t)-1 )
	    {
	    tc->tcIconvFrUtf8= (struct TextConverterImpl *)
			    iconv_open( tc->tcNativeEncodingName, "UTF-8" );

	    if  ( (iconv_t)tc->tcIconvFrUtf8 == (iconv_t)-1		&&
		  ! strcmp( tc->tcNativeEncodingName, "SYMBOL" )	)
		{
		return textConverterConvertBytesFromUtf8(
					    &UNI_SymbolToGlyphMapping,
					    uniSymbolGlyphUnicodes,
					    through, tc->tcProduce, pConsumed,
					    produced, text, len );
		}

	    if  ( (iconv_t)tc->tcIconvFrUtf8 == (iconv_t)-1		&&
		  ! strcmp( tc->tcNativeEncodingName, "DINGBATS" )	)
		{
#		if 1
		/*  Emit the character as a \u12345 unicode:	*/
		/*  Consume and produce nothing here. Return	*/
		/*  the running total, not 0: Nothing that was	*/
		/*  produced before is undone.			*/
		return produced;
#		else
		return textConverterConvertBytesFromUtf8(
					    &UNI_DingbatsToGlyphMapping,
					    uniDingbatsGlyphUnicodes,
					    through, tc->tcProduce, pConsumed,
					    produced, text, len );
#		endif
		}

	    if  ( (iconv_t)tc->tcIconvFrUtf8 == (iconv_t)-1 )
		{
		SXDEB(tc->tcNativeEncodingName,tc->tcIconvFrUtf8);
		return -1;
		}
	    }

	produced= textConverterConvertIconv( tc, tc->tcIconvFrUtf8,
					    through, pConsumed,
					    produced, text, len );
	if  ( produced < 0 )
	    { LDEB(produced); return -1;	}
	}
    else{
	int	step;

	/*  No encoding to convert to: Pass the bytes on. */
	step= textConverterProduce( tc, through, produced, text, len );
	if  ( step < 0 )
	    { LLLDEB(produced,len,step); return -1;	}

	produced += step;
	*pConsumed += len;
	}

    return produced;
    }
456 
/************************************************************************/
/*									*/
/*  Set the callback that receives the converted text.			*/
/*									*/
/************************************************************************/

void textConverterSetProduce(	struct TextConverter *	tc,
				TextConverterProduce	produce )
    { tc->tcProduce= produce; return;	}
462