1 /************************************************************************/
2 /* */
3 /* Convert text from/to UTF-8 using iconv. */
4 /* */
5 /************************************************************************/
6
7 # include "textEncodingConfig.h"
8
9 # include <stdlib.h>
10 # include <string.h>
11 # include <errno.h>
12 # include <iconv.h>
13
14 # include "uniUtf8.h"
15 # include <appDebugon.h>
16
17 # include "textConverter.h"
18 # include "textConverterImpl.h"
19 # include <utilIndexMapping.h>
20 # include "uniLegacyEncoding.h"
21 # include "uniLegacyMapping.h"
22
/************************************************************************/
/*                                                                      */
/*  Set the name of the native (non UTF-8) encoding of a converter.     */
/*                                                                      */
/*  tc:           The converter to change.                              */
/*  encodingName: The new name. It is copied. A null pointer is         */
/*                logged and ignored.                                   */
/*                                                                      */
/*  When the name really changes, both iconv descriptors are closed:    */
/*  the conversion calls reopen them on demand for the new name.        */
/*  The copy is made before anything is torn down, so a failing         */
/*  strdup() leaves the converter as it was.                            */
/*                                                                      */
/************************************************************************/

void textConverterSetNativeEncodingName(    TextConverter *     tc,
                                            const char *        encodingName )
    {
    char *      oldName;
    char *      newName;

    if  ( ! encodingName )
        { XDEB(encodingName); return;   }

    oldName= tc->tcNativeEncodingName;

    /*  Same pointer or same text: There is nothing to change.  */
    if  ( oldName == encodingName )
        { return;       }
    if  ( oldName && ! strcmp( oldName, encodingName ) )
        { return;       }

    /*  Copy first: On failure the converter stays consistent.  */
    newName= strdup( encodingName );
    if  ( ! newName )
        { XDEB(newName); return;        }

    if  ( (iconv_t)tc->tcIconvToUtf8 != (iconv_t)-1 )
        {
        iconv_close( (iconv_t)tc->tcIconvToUtf8 );
        tc->tcIconvToUtf8= (struct TextConverterImpl *)(iconv_t)-1;
        }

    if  ( (iconv_t)tc->tcIconvFrUtf8 != (iconv_t)-1 )
        {
        iconv_close( (iconv_t)tc->tcIconvFrUtf8 );
        tc->tcIconvFrUtf8= (struct TextConverterImpl *)(iconv_t)-1;
        }

    tc->tcNativeEncodingName= newName;

    /*  free() accepts a null pointer: No need to test oldName.  */
    free( oldName );

    return;
    }
61
/************************************************************************/
/*                                                                      */
/*  Initialize a converter: No native encoding name, both iconv         */
/*  descriptors marked as not open with (iconv_t)-1 and no produce      */
/*  callback.                                                           */
/*                                                                      */
/************************************************************************/

void textInitTextConverter(     TextConverter *     tc )
    {
    struct TextConverterImpl * const    notOpen=
                            (struct TextConverterImpl *)(iconv_t)-1;

    tc->tcProduce= (TextConverterProduce)0;

    tc->tcIconvFrUtf8= notOpen;
    tc->tcIconvToUtf8= notOpen;

    tc->tcNativeEncodingName= (char *)0;

    return;
    }
70
/************************************************************************/
/*                                                                      */
/*  Release the resources held by a converter: The copy of the          */
/*  encoding name is freed and open iconv descriptors are closed.       */
/*  All three members are reset, so that cleaning twice, or setting     */
/*  a new encoding name afterwards, does not touch freed memory.        */
/*  tcProduce is left alone.                                            */
/*                                                                      */
/************************************************************************/

void textCleanTextConverter(    TextConverter *     tc )
    {
    /*  free() accepts a null pointer. Forget the freed name.  */
    free( tc->tcNativeEncodingName );
    tc->tcNativeEncodingName= (char *)0;

    if  ( (iconv_t)tc->tcIconvToUtf8 != (iconv_t)-1 )
        {
        iconv_close( (iconv_t)tc->tcIconvToUtf8 );
        tc->tcIconvToUtf8= (struct TextConverterImpl *)(iconv_t)-1;
        }

    if  ( (iconv_t)tc->tcIconvFrUtf8 != (iconv_t)-1 )
        {
        iconv_close( (iconv_t)tc->tcIconvFrUtf8 );
        tc->tcIconvFrUtf8= (struct TextConverterImpl *)(iconv_t)-1;
        }
    }
88
89 /************************************************************************/
90
textConverterProduce(const TextConverter * tc,void * through,int produced,const char * text,int len)91 static int textConverterProduce( const TextConverter * tc,
92 void * through,
93 int produced,
94 const char * text,
95 int len )
96 {
97 int step;
98
99 step= (*tc->tcProduce)( through, produced, text, len );
100 if ( step < 0 )
101 { LLLDEB(produced,len,step); return -1; }
102
103 return step;
104 }
105
/************************************************************************/
/*                                                                      */
/*  Hack because iconv does not support the Symbol and Dingbats         */
/*  encodings: Convert legacy bytes to UTF-8.                           */
/*                                                                      */
/************************************************************************/
112
/************************************************************************/
/*                                                                      */
/*  Translate single byte legacy text to UTF-8 with a 256 entry table.  */
/*                                                                      */
/*  unicodes:  Unicode value per byte value. Bytes with a negative      */
/*             entry are emitted with their own value as unicode.       */
/*  pConsumed: Incremented with the number of bytes of input, but       */
/*             only if the call succeeds.                               */
/*  produced:  The running total of output. The updated total is        */
/*             returned. -1 is returned on failure.                     */
/*                                                                      */
/*  The output is collected in a local buffer that is flushed through   */
/*  textConverterProduce(). 7 bytes are kept free for the next          */
/*  symbol: Presumably the most that uniPutUtf8() writes. TODO confirm  */
/*                                                                      */
/************************************************************************/

static int textConverterConvertBytesToUtf8(
                                const TextConverter *   tc,
                                const int               unicodes[256],
                                void *                  through,
                                int *                   pConsumed,
                                int                     produced,
                                const char *            text,
                                int                     len )
    {
    char                scratch[750];
    const int           flushAt= sizeof(scratch)- 7;

    int                 buffered= 0;
    int                 step;
    int                 pos;

    for ( pos= 0; pos < len; pos++ )
        {
        const int       byte= text[pos] & 0xff;
        int             unicode;

        /*  Make room before the next symbol is written.  */
        if  ( buffered >= flushAt )
            {
            step= textConverterProduce( tc, through, produced,
                                                    scratch, buffered );
            if  ( step < 0 )
                { LLLDEB(produced,buffered,step); return -1;    }

            produced += step; buffered= 0;
            }

        /*  Bytes without a table entry stand for themselves.  */
        unicode= unicodes[byte];
        if  ( unicode < 0 )
            { unicode= byte;    }

        step= uniPutUtf8( scratch+ buffered, unicode );
        if  ( step < 1 )
            { LDEB(step); return -1;    }

        buffered += step;
        }

    /*  Whatever is left in the buffer.  */
    if  ( buffered > 0 )
        {
        step= textConverterProduce( tc, through, produced,
                                                    scratch, buffered );
        if  ( step < 0 )
            { LLLDEB(produced,buffered,step); return -1;        }

        produced += step; buffered= 0;
        }

    /*  pos counts the bytes that were translated.  */
    *pConsumed += pos;
    return produced;
    }
162
163 /************************************************************************/
164 /* */
165 /* Hack because iconv does not support the symbol encoding. */
166 /* Convert UTF-8 to legacy. */
167 /* */
168 /************************************************************************/
169
/************************************************************************/
/*                                                                      */
/*  Translate UTF-8 text to single byte legacy text.                    */
/*                                                                      */
/*  im:        Maps unicodes to bytes. If it is empty, it is built      */
/*             from unicodes[] first.                                   */
/*  produce:   Receives the bytes.                                      */
/*  pConsumed: Incremented with the number of bytes of UTF-8 that       */
/*             was translated, but only if the call succeeds.           */
/*  produced:  The running total of output. The updated total is        */
/*             returned. -1 is returned on failure.                     */
/*                                                                      */
/*  Translation stops, without failing, at the first symbol that has    */
/*  no byte in the mapping: The caller sees that from *pConsumed.       */
/*                                                                      */
/*  NOTE(review): uniGetUtf8() is not told how much input is left.      */
/*  Presumably the input must end on a complete sequence. Confirm.      */
/*                                                                      */
/************************************************************************/

static int textConverterConvertBytesFromUtf8(
                                IndexMapping *          im,
                                const int               unicodes[256],
                                void *                  through,
                                TextConverterProduce    produce,
                                int *                   pConsumed,
                                int                     produced,
                                const char *            text,
                                int                     len )
    {
    unsigned char       scratch[750];
    const int           capacity= sizeof(scratch);

    int                 done= 0;
    int                 step;
    int                 at= 0;

    /*  Build the backward mapping the first time it is needed.  */
    if  ( utilIndexMappingIsEmpty( im ) && unicodes )
        {
        if  ( utilIndexMappingBuildBackward( im, unicodes, 256 ) )
            { XDEB(unicodes); return -1;        }
        }

    while( at < len )
        {
        unsigned short  symbol;
        int             code;

        /*  Buffer full: Flush before a byte is added.  */
        if  ( done >= capacity )
            {
            step= (*produce)( through, produced, (char *)scratch, done );
            if  ( step < 0 )
                { LLLDEB(produced,done,step); return -1;        }

            produced += step; done= 0;
            }

        step= uniGetUtf8( &symbol, text+ at );
        if  ( step < 1 )
            { LDEB(step); return -1;    }

        /*  No byte for this symbol: Leave it to the caller.  */
        code= utilIndexMappingGetU( im, symbol );
        if  ( code < 0 )
            { break;    }

        scratch[done]= code;
        done++;

        at += step;
        }

    /*  Whatever is left in the buffer.  */
    if  ( done > 0 )
        {
        step= (*produce)( through, produced, (char *)scratch, done );
        if  ( step < 0 )
            { LLLDEB(produced,done,step); return -1;    }

        produced += step; done= 0;
        }

    /*  at is the number of bytes of UTF-8 that was translated.  */
    *pConsumed += at;
    return produced;
    }
227
228 /************************************************************************/
229 /* */
230 /* Convert bytes using iconv. This is used in both directions. */
231 /* */
232 /* Note that GNU iconv() expects a 'char **' as its second argument */
233 /* rather than a 'const char **' as documented in the single UNIX spec.*/
234 /* See: http://www.opengroup.org/pubs/online/7908799/xsh/iconv.html. */
235 /* */
236 /************************************************************************/
237
/************************************************************************/
/*                                                                      */
/*  tci:       The iconv descriptor, stored as a pointer.               */
/*  pConsumed: Incremented with the number of input bytes that          */
/*             iconv() accepted, but only if the call succeeds.         */
/*  produced:  The running total of output. The updated total is        */
/*             returned. -1 is returned on failure.                     */
/*                                                                      */
/*  Conversion stops, without failing, at input that iconv() rejects    */
/*  with EILSEQ: The caller sees that from *pConsumed.                  */
/*                                                                      */
/*  errno is saved directly after iconv(): Producing the output         */
/*  may change it before the value is inspected.                        */
/*                                                                      */
/************************************************************************/

static int textConverterConvertIconv(   TextConverter *             tc,
                                        struct TextConverterImpl *  tci,
                                        void *                      through,
                                        int *                       pConsumed,
                                        int                         produced,
                                        const char *                arg_ibuf,
                                        size_t                      ileft )
    {
    char                scratch[750];
    char *              obuf= scratch;
    size_t              oleft= sizeof(scratch);
    int                 consumed= 0;

    iconv_t             ico= (iconv_t)tci;

#   if defined(__GNUC__) && ! defined(iconv)
    char *              ibuf= (char *)arg_ibuf;
#   else
    const char *        ibuf= arg_ibuf;
#   endif

    while( ileft > 0 )
        {
        const char *    iibuf= ibuf;
        int             iconvErrno= 0;

        while( oleft > 0 && ileft > 0 )
            {
            /*  Remember why the last call to iconv() stopped.  */
            /*  A successful call does not reset errno itself.  */
            if  ( iconv( ico, &ibuf, &ileft, &obuf, &oleft ) == (size_t)-1 )
                { iconvErrno= errno;    }
            else{ iconvErrno= 0;        }

            /*  No progress: Calling again will not help.  */
            if  ( ibuf == iibuf )
                { break;        }

            consumed += ibuf- iibuf;
            iibuf= ibuf;
            }

        if  ( obuf > scratch )
            {
            int         step;

            step= textConverterProduce( tc, through, produced,
                                                scratch, obuf- scratch );
            if  ( step < 0 )
                { LLLDEB(produced,obuf- scratch,step); return -1;       }

            produced += step;
            obuf= scratch;
            oleft= sizeof(scratch);
            }

        /*  Input that cannot be converted: Stop without failing.  */
        if  ( ileft > 0 && iconvErrno == EILSEQ )
            { break;    }

        /*  Only a full output buffer is a reason to continue.  */
        if  ( ileft > 0 && iconvErrno != E2BIG )
            { LSDEB(iconvErrno,strerror(iconvErrno)); return -1;        }
        }

    *pConsumed += consumed;
    return produced;
    }
299
/************************************************************************/
/*                                                                      */
/*  Convert the input to UTF-8 bytes.                                   */
/*                                                                      */
/*  Note that GNU iconv() expects a 'char **' as its second argument    */
/*  rather than a 'const char **' as documented in the single UNIX spec.*/
/*  See: http://www.opengroup.org/pubs/online/7908799/xsh/iconv.html.   */
/*                                                                      */
/*  Also note that iconv_open() reports failure by returning            */
/*  (iconv_t)-1, not the (iconv_t)0 that one would expect from a call   */
/*  that returns a (void *) pointer.                                    */
/*                                                                      */
/************************************************************************/
313
/************************************************************************/
/*                                                                      */
/*  Without a converter or a native encoding name, the text is          */
/*  produced as it is. Otherwise it is converted with iconv. The        */
/*  descriptor is opened at the first call that needs it. If that       */
/*  fails, the SYMBOL and DINGBATS encodings are translated with a      */
/*  table. Any other encoding is a failure.                             */
/*                                                                      */
/*  pConsumed: Incremented with the number of input bytes that was      */
/*             converted.                                               */
/*  produced:  The running total of output. The updated total is        */
/*             returned. -1 is returned on failure.                     */
/*                                                                      */
/*  NOTE(review): A null tc takes the pass through branch and is        */
/*  handed to textConverterProduce() as it is.                          */
/*                                                                      */
/************************************************************************/

int textConverterConvertToUtf8(     TextConverter *     tc,
                                    void *              through,
                                    int *               pConsumed,
                                    int                 produced,
                                    const char *        text,
                                    int                 len )
    {
    /*  No native encoding: Pass the text through unchanged.  */
    if  ( ! tc                              ||
          ! tc->tcNativeEncodingName        ||
          ! tc->tcNativeEncodingName[0]     )
        {
        int     step;

        step= textConverterProduce( tc, through, produced, text, len );
        if  ( step < 0 )
            { LLLDEB(produced,len,step); return -1;     }

        *pConsumed += len;
        return produced+ step;
        }

    /*  Open the descriptor the first time that it is needed.  */
    if  ( (iconv_t)tc->tcIconvToUtf8 == (iconv_t)-1 )
        {
        tc->tcIconvToUtf8= (struct TextConverterImpl *)
                        iconv_open( "UTF-8", tc->tcNativeEncodingName );

        if  ( (iconv_t)tc->tcIconvToUtf8 == (iconv_t)-1 )
            {
            /*  Encodings that are translated with a table.  */
            if  ( ! strcmp( tc->tcNativeEncodingName, "SYMBOL" ) )
                {
                return textConverterConvertBytesToUtf8( tc,
                                            uniSymbolGlyphUnicodes,
                                            through, pConsumed,
                                            produced, text, len );
                }

            if  ( ! strcmp( tc->tcNativeEncodingName, "DINGBATS" ) )
                {
                return textConverterConvertBytesToUtf8( tc,
                                            uniDingbatsGlyphUnicodes,
                                            through, pConsumed,
                                            produced, text, len );
                }

            SXDEB(tc->tcNativeEncodingName,tc->tcIconvToUtf8);
            return -1;
            }
        }

    produced= textConverterConvertIconv( tc, tc->tcIconvToUtf8,
                                            through, pConsumed,
                                            produced, text, len );
    if  ( produced < 0 )
        { LDEB(produced); return -1;    }

    return produced;
    }
374
/************************************************************************/
/*                                                                      */
/*  Convert the UTF-8 input to legacy bytes.                            */
/*                                                                      */
/*  Note that GNU iconv() expects a 'char **' as its second argument    */
/*  rather than a 'const char **' as documented in the single UNIX spec.*/
/*  See: http://www.opengroup.org/pubs/online/7908799/xsh/iconv.html.   */
/*                                                                      */
/*  Also note that iconv_open() reports failure by returning            */
/*  (iconv_t)-1, not the (iconv_t)0 that one would expect from a call   */
/*  that returns a (void *) pointer.                                    */
/*                                                                      */
/************************************************************************/
388
/************************************************************************/
/*                                                                      */
/*  Without a converter or a native encoding name, the text is          */
/*  produced as it is. Otherwise it is converted with iconv. The        */
/*  descriptor is opened at the first call that needs it. If that       */
/*  fails, the SYMBOL encoding is translated with a table and for       */
/*  DINGBATS nothing is converted and 0 is returned. Any other          */
/*  encoding is a failure.                                              */
/*                                                                      */
/*  pConsumed: Incremented with the number of input bytes that was      */
/*             converted.                                               */
/*  produced:  The running total of output. The updated total is        */
/*             returned. -1 is returned on failure.                     */
/*                                                                      */
/*  NOTE(review): A null tc takes the pass through branch and is        */
/*  handed to textConverterProduce() as it is.                          */
/*  NOTE(review): The DINGBATS case returns 0, not produced. Confirm    */
/*  that callers expect that.                                           */
/*                                                                      */
/************************************************************************/

int textConverterConvertFromUtf8(   TextConverter *     tc,
                                    void *              through,
                                    int *               pConsumed,
                                    int                 produced,
                                    const char *        text,
                                    int                 len )
    {
    /*  No native encoding: Pass the text through unchanged.  */
    if  ( ! tc                              ||
          ! tc->tcNativeEncodingName        ||
          ! tc->tcNativeEncodingName[0]     )
        {
        int     step;

        step= textConverterProduce( tc, through, produced, text, len );
        if  ( step < 0 )
            { LLLDEB(produced,len,step); return -1;     }

        *pConsumed += len;
        return produced+ step;
        }

    /*  Open the descriptor the first time that it is needed.  */
    if  ( (iconv_t)tc->tcIconvFrUtf8 == (iconv_t)-1 )
        {
        tc->tcIconvFrUtf8= (struct TextConverterImpl *)
                        iconv_open( tc->tcNativeEncodingName, "UTF-8" );

        if  ( (iconv_t)tc->tcIconvFrUtf8 == (iconv_t)-1 )
            {
            if  ( ! strcmp( tc->tcNativeEncodingName, "SYMBOL" ) )
                {
                return textConverterConvertBytesFromUtf8(
                                    &UNI_SymbolToGlyphMapping,
                                    uniSymbolGlyphUnicodes,
                                    through, tc->tcProduce, pConsumed,
                                    produced, text, len );
                }

            if  ( ! strcmp( tc->tcNativeEncodingName, "DINGBATS" ) )
                {
#               if 1
                /*  Convert nothing, so that the character is   */
                /*  emitted as a \u12345 unicode.               */
                return 0;
#               else
                return textConverterConvertBytesFromUtf8(
                                    &UNI_DingbatsToGlyphMapping,
                                    uniDingbatsGlyphUnicodes,
                                    through, tc->tcProduce, pConsumed,
                                    produced, text, len );
#               endif
                }

            SXDEB(tc->tcNativeEncodingName,tc->tcIconvFrUtf8);
            return -1;
            }
        }

    produced= textConverterConvertIconv( tc, tc->tcIconvFrUtf8,
                                            through, pConsumed,
                                            produced, text, len );
    if  ( produced < 0 )
        { LDEB(produced); return -1;    }

    return produced;
    }
456
/************************************************************************/
/*                                                                      */
/*  Set the callback that receives the output of the conversions.       */
/*                                                                      */
/************************************************************************/

void textConverterSetProduce(   struct TextConverter *  tc,
                                TextConverterProduce    produce )
    {
    tc->tcProduce= produce;

    return;
    }
462