1 /*
2   conversion to other encodings
3 
4   Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
5 
6   This program is free software; you can redistribute it and/or modify it
7   under the terms of version 2 of the GNU General Public License as published
8   by the Free Software Foundation.
9 
10   This program is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13   more details.
14 
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, write to the Free Software Foundation, Inc.,
17   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #include "common.h"
20 
21 #ifdef HAVE_SYS_WAIT_H
22 # include <sys/wait.h>
23 #else
24 pid_t waitpid(pid_t pid, int *status, int options);
25 #endif
26 
27 /* We can't go on w/o this, defining struct stat manually is braindamaged. */
28 #include <sys/types.h>
29 #include <sys/stat.h>
30 
/* converter flags (stored in ConverterData.flags) */
/* the converter is the external one; convert() prints an extra warning when
   such a converter fails, and external_converter_listed() looks for it */
#define CONV_EXTERN   0x0001

/* converter function type
   takes the file to convert and its current (source) encoding; the target
   encoding is not passed in
   returns an error code; convert() compares it with ERR_OK and ERR_CANNOT */
typedef int (* ConverterFunc)(File*, EncaEncoding);

/* struct converter data */
typedef struct _ConverterData ConverterData;

/* static description of one converter; instances are referenced from the
   data member of the CONVERTERS abbreviation table entries */
struct _ConverterData {
  unsigned long int flags; /* bitwise or of CONV_* flags */
  ConverterFunc convfunc; /* pointer to converter function */
};
44 
/* struct converter list */
typedef struct _Converter Converter;

/* one node of the singly linked list of converters selected by the user;
   nodes are appended by add_converter() and tried in list order by convert() */
struct _Converter {
  const Abbreviation *conv; /* the converter (an abbreviation table entry) */
  Converter *next; /* next in the list, NULL for the last node */
};

/* converter list (head pointer), NULL when no converter has been added */
static Converter *converters = NULL;
55 
/* data for xtable
   everything here is filled in by xtable() and released by xdata_free() */
static struct {
  size_t ncharsets; /* number of charsets */
  int *charsets; /* charset id's for active language [ncharsets] */
  byte *tables; /* tables from charsets to target_charset [ncharsets * 0x100] */
  int *have_table; /* whether particular table is already cached [ncharsets] */
  unsigned int *ucs2_map; /* temporary space for map computation [0x10000] */
  /* map obtained from enca_charset_ucs2_map() for options.target_enc.charset,
     indexed by byte value [0x100] */
  unsigned int target_map[0x100];
}
xdata = { 0, NULL, NULL, NULL, NULL, { 0 } };
66 
/* Local prototypes. */
static int   convert_builtin (File *file,
                              EncaEncoding from_enc);
static const byte* xtable    (int from_charset);
static void  xdata_free      (void);

/* Converter descriptions: { flags, converter function }.
   Which of them exist depends on the build configuration macros below;
   only the external converter carries the CONV_EXTERN flag. */
static const ConverterData cdata_builtin = { 0, &convert_builtin };
#ifdef HAVE_LIBRECODE
static const ConverterData cdata_librecode = { 0, &convert_recode };
#endif /* HAVE_LIBRECODE */
#ifdef HAVE_GOOD_ICONV
static const ConverterData cdata_iconv = { 0, &convert_iconv };
#endif /* HAVE_GOOD_ICONV */
#ifdef ENABLE_EXTERNAL
static const ConverterData cdata_extern = { CONV_EXTERN, &convert_external };
#endif /* ENABLE_EXTERNAL */

/* Table of converter names and their descriptions.
   add_converter() looks names up here via expand_abbreviation() and
   print_converter_list() prints the names in this order. */
static const Abbreviation CONVERTERS[] = {
  { "built-in", &cdata_builtin },
#ifdef HAVE_LIBRECODE
  { "librecode", &cdata_librecode },
#endif /* HAVE_LIBRECODE */
#ifdef HAVE_GOOD_ICONV
  { "iconv", &cdata_iconv },
#endif /* HAVE_GOOD_ICONV */
#ifdef ENABLE_EXTERNAL
  { "extern", &cdata_extern }
#endif /* ENABLE_EXTERNAL */
};
96 
97 /* decide which converter should be run and do common checks
98    from_enc, to_enc are current and requested encoding
99    returns error code
100 
101    it doesn't open the file (guess() did it) and doesn't close it (caller does
102    it) */
103 int
convert(File * file,EncaEncoding from_enc)104 convert(File *file,
105         EncaEncoding from_enc)
106 {
107   Converter *conv;
108   int err;
109 
110   if (options.verbosity_level) {
111     fprintf(stderr, "%s: converting `%s': %s\n",
112                     program_name, ffname_r(file->name),
113                     format_request_string(from_enc, options.target_enc, 0));
114   }
115 
116   /* do nothing when requested encoding is current encoding
117      (`nothing' may include copying stdin to stdout) */
118   if (from_enc.charset == options.target_enc.charset
119       && from_enc.surface == options.target_enc.surface) {
120     if (file->name != NULL)
121       return ERR_OK;
122     else
123       return copy_and_convert(file, file, NULL);
124   }
125 
126   /* try sequentially all allowed converters until we find some that can
127      perform the conversion or exahust the list */
128   conv = converters;
129   while (conv != NULL) {
130     if (options.verbosity_level > 1) {
131       fprintf(stderr, "    trying to convert `%s' using %s\n",
132                       ffname_r(file->name), conv->conv->name);
133     }
134     err = ((ConverterData *)conv->conv->data)->convfunc(file, from_enc);
135     if (err == ERR_OK)
136       return ERR_OK;
137 
138     if ((((ConverterData *)conv->conv->data)->flags & CONV_EXTERN) != 0) {
139       fprintf(stderr, "%s: external converter failed on `%s', "
140                       "probably destroying it\n",
141                       program_name, ffname_w(file->name));
142     }
143     /* don't tempt fate in case of i/o or other serious problems */
144     if (err != ERR_CANNOT)
145       return ERR_IOFAIL;
146 
147     conv = conv->next;
148   }
149 
150   /* no converter able/allowed to perform given conversion, that's bad */
151   fprintf(stderr, "%s: no converter is able/allowed to perform "
152                   "conversion %s on file `%s'\n",
153                   program_name,
154                   format_request_string(from_enc, options.target_enc, 0),
155                   ffname_r(file->name));
156 
157   /* nevertheless stdin should be copied to stdout anyway it cannot make
158      more mess */
159   if (file->name == NULL)
160     copy_and_convert(file, file, NULL);
161 
162   return ERR_CANNOT;
163 }
164 
165 /* built-in converter
166    performs conversion by in place modification of file named fname
167    or by calling copy_and_convert() for stdin -> stdout conversion
168    returns zero on success, error code otherwise */
169 static int
convert_builtin(File * file,EncaEncoding from_enc)170 convert_builtin(File *file,
171                 EncaEncoding from_enc)
172 {
173   static int ascii = ENCA_CS_UNKNOWN;
174 
175   Buffer *buf; /* file->buffer alias */
176   const byte *xlat; /* conversion table */
177 
178   if (!enca_charset_is_known(ascii)) {
179     ascii = enca_name_to_charset("ascii");
180     assert(enca_charset_is_known(ascii));
181   }
182 
183   /* surfaces can cause fail iff user specificaly requested some
184    * or when they are other type than EOLs */
185   {
186     EncaSurface srf = options.target_enc.surface ^ from_enc.surface;
187 
188     if ((options.target_enc.surface
189          && from_enc.surface != options.target_enc.surface)
190          || srf != (srf & ENCA_SURFACE_MASK_EOL)) {
191       if (options.verbosity_level > 2)
192         fprintf(stderr, "%s: built-in: cannot convert between "
193                 "different surfaces\n",
194                 program_name);
195       return ERR_CANNOT;
196     }
197   }
198 
199   /* catch trivial conversions */
200   {
201     int identity = 0;
202 
203     if (from_enc.charset == options.target_enc.charset)
204       identity = 1;
205 
206     if (from_enc.charset == ascii
207         && enca_charset_is_8bit(options.target_enc.charset)
208         && !enca_charset_is_binary(options.target_enc.charset))
209       identity = 1;
210 
211     if (identity) {
212       if (file->name == NULL)
213         return copy_and_convert(file, file, NULL);
214       else
215         return ERR_OK;
216     }
217   }
218 
219   xlat = xtable(from_enc.charset);
220   if (xlat == NULL)
221     return ERR_CANNOT;
222 
223   if (file->name == NULL)
224     return copy_and_convert(file, file, xlat);
225 
226   /* read buffer_size bytes, convert, write back, etc. to death (or eof,
227      whichever come first) */
228   buf = file->buffer;
229   buf->pos = 0;
230   if (file_seek(file, 0, SEEK_SET) == -1)
231     return ERR_IOFAIL;
232 
233   do {
234     if (file_read(file) == -1)
235       return ERR_IOFAIL;
236 
237     if (buf->pos == 0)
238       break;
239 
240     {
241       size_t len = buf->pos;
242       byte *p = buf->data;
243       do {
244         *p = xlat[*p];
245         p++;
246       } while (--len);
247     }
248 
249     if (file_seek(file, -(buf->pos), SEEK_CUR) == -1)
250       return ERR_IOFAIL;
251 
252     if (file_write(file) == -1)
253       return ERR_IOFAIL;
254 
255     /* XXX: apparent no-op
256        but ISO C requires fseek() or ftell() between subsequent fwrite() and
257        fread(), or else the latter _may_ read nonsense -- and it actually does
258        read nonsense with glibc-2.2 (at least); see fopen(3) */
259     if (file_seek(file, 0, SEEK_CUR) == -1)
260       return ERR_IOFAIL;
261 
262   } while (1);
263 
264   return ERR_OK;
265 }
266 
267 /* copy file file_from to file file_to, optionally performing xlat conversion
268    (if not NULL)
269    file_from has to be already opened for reading,
270    file_to has to be already opened for writing
271    they have to share common buffer
272    returns 0 on success, nonzero on failure */
273 int
copy_and_convert(File * file_from,File * file_to,const byte * xlat)274 copy_and_convert(File *file_from, File *file_to, const byte *xlat)
275 {
276   Buffer *buf; /* file_from->buffer alias */
277 
278   if (xlat == NULL && options.verbosity_level > 3)
279     fprintf(stderr, "    copying `%s' to `%s'\n",
280                     ffname_r(file_from->name),
281                     ffname_w(file_to->name));
282 
283   assert(file_from->buffer == file_to->buffer);
284   buf = file_from->buffer;
285   /* If there's something in the buffer, process it first. */
286   if (file_from->buffer->pos != 0) {
287     if (xlat != NULL) {
288       size_t len = buf->pos;
289       byte *p = buf->data;
290       do {
291         *p = xlat[*p];
292         p++;
293       } while (--len);
294     }
295     if (file_write(file_to) == -1)
296       return ERR_IOFAIL;
297   }
298   /* Then copy the rest. */
299   do {
300     if (file_read(file_from) == -1)
301       return ERR_IOFAIL;
302 
303     if (buf->pos == 0)
304       break;
305 
306     if (xlat != NULL) {
307       size_t len = buf->pos;
308       byte *p = buf->data;
309       do {
310         *p = xlat[*p];
311         p++;
312       } while (--len);
313     }
314 
315     if (file_write(file_to) == -1)
316       return ERR_IOFAIL;
317   } while (1);
318   fflush(file_to->stream);
319 
320   return ERR_OK;
321 }
322 
323 /* add converter to list of converters
324    (note `none' adds nothing and causes removing of all converters instead)
325    returns zero if everything went ok, nonzero otherwise */
326 int
add_converter(const char * cname)327 add_converter(const char *cname)
328 {
329   /* no converters symbolic name */
330   static const char *CONVERTER_NAME_NONE = "none";
331 
332   const Abbreviation *data;
333   Converter *conv = NULL, *conv1;
334 
335   /* remove everything when we got `none' */
336   if (strcmp(CONVERTER_NAME_NONE, cname) == 0) {
337     if (options.verbosity_level > 3)
338       fprintf(stderr, "Removing all converters\n");
339     while (converters != NULL) {
340       conv = converters->next;
341       enca_free(converters);
342       converters = conv;
343     }
344     return 0;
345   }
346 
347   /* find converter data */
348   data = expand_abbreviation(cname, CONVERTERS, ELEMENTS(CONVERTERS),
349                              "converter");
350   if (data == NULL)
351     return 1;
352 
353   /* add it to the end of converter list */
354   if (options.verbosity_level > 3)
355     fprintf(stderr, "Adding converter `%s'\n", data->name);
356   if (converters == NULL)
357     converters = conv = NEW(Converter, 1);
358   else {
359     for (conv1 = converters; conv1 != NULL; conv1 = conv1->next) {
360       /* reject duplicities */
361       if (data == conv1->conv->data) {
362         fprintf(stderr, "%s: converter %s specified more than once\n",
363                        program_name,
364                        conv1->conv->name);
365         return 1;
366       }
367       conv = conv1;
368     }
369 
370     conv->next = NEW(Converter, 1);
371     conv = conv->next;
372   }
373   conv->next = NULL;
374   conv->conv = data;
375 
376   return 0;
377 }
378 
379 /* return nonzero if the list contains external converter */
380 int
external_converter_listed(void)381 external_converter_listed(void)
382 {
383   Converter *conv;
384 
385   for (conv = converters; conv; conv = conv->next) {
386     if (((ConverterData*)conv->conv->data)->flags & CONV_EXTERN)
387       return 1;
388   }
389 
390   return 0;
391 }
392 
393 /* print white separated list of all valid converter names */
394 void
print_converter_list(void)395 print_converter_list(void)
396 {
397   size_t i;
398 
399   for (i = 0; i < sizeof(CONVERTERS)/sizeof(Abbreviation); i++)
400     printf("%s\n", CONVERTERS[i].name);
401 }
402 
403 /* create and return request string for conversion from e1 to e2
404    filters out natrual surfaces || mask
405    is NOT thread-safe
406    returned string must NOT be freed and must be cosidered volatile */
407 const char*
format_request_string(EncaEncoding e1,EncaEncoding e2,EncaSurface mask)408 format_request_string(EncaEncoding e1,
409                       EncaEncoding e2,
410                       EncaSurface mask)
411 {
412   static char *s = NULL;
413   char *p, *q;
414   const char *e2_name, *e1_name;
415 
416   enca_free(s);
417   /* build s sequentially since value returned by surface_name() is lost
418      by the second call */
419   e1_name = enca_charset_name(e1.charset, ENCA_NAME_STYLE_ENCA);
420   p = enca_get_surface_name(e1.surface
421                             & ~(enca_charset_natural_surface(e1.charset)
422                                 | mask),
423                             ENCA_NAME_STYLE_ENCA);
424   if (!enca_charset_is_known(e2.charset)) {
425     q = enca_strdup("");
426     e2_name = options.target_enc_str;
427   }
428   else {
429     q = enca_get_surface_name(e2.surface
430                               & ~(enca_charset_natural_surface(e2.charset)
431                                   | mask),
432                               ENCA_NAME_STYLE_ENCA);
433     e2_name = enca_charset_name(e2.charset, ENCA_NAME_STYLE_ENCA);
434   }
435 
436   s = enca_strconcat(e1_name, p, "..", e2_name, q, NULL);
437 
438   enca_free(p);
439   enca_free(q);
440 
441   return s;
442 }
443 
444 /**
445  * xtable:
446  * @from_charset: Charset id for which the conversion table should be returned.
447  *
448  * Returns translation table from charset @from to (global) target charset.
449  *
450  * The returned table must be considered constant and must NOT be freed.
451  *
452  * Only conversion between charsets of one language is supported.  We assume
453  * a language contains all known charsets usable for represenation of texts,
454  * so other charsets are taken as incompatible.
455  *
456  * Globals used: options.target_enc.charset, options.language.
457  *
458  * Returns: The conversion table [0x100]; #NULL on failure.
459  **/
460 static const byte*
xtable(int from_charset)461 xtable(int from_charset)
462 {
463   static int xtable_initialized = 0;
464 
465   unsigned int from_map[0x100];
466   size_t i;
467   ssize_t fidx;
468 
469   if (!enca_charset_has_ucs2_map(options.target_enc.charset)
470       || !enca_charset_has_ucs2_map(from_charset))
471     return NULL;
472 
473   /* Initialize when we are called the first time. */
474   if (!xtable_initialized) {
475     /* Allocate various tables.  Never freed. */
476     xdata.charsets = enca_get_language_charsets(options.language,
477                                                 &xdata.ncharsets);
478     assert(xdata.ncharsets > 1);
479     xdata.have_table = NEW(int, xdata.ncharsets);
480     xdata.tables = NEW(byte, 0x100*xdata.ncharsets);
481     xdata.ucs2_map = NEW(unsigned int, 0x10000);
482 
483     for (i = 0; i < xdata.ncharsets; i++)
484       xdata.have_table[i] = 0;
485 
486     /* Initialize tables to identity */
487     for (i = 0; i < 0x100; i++)
488       xdata.tables[i] = (byte)i;
489     for (i = 1; i < xdata.ncharsets; i++)
490       memcpy(xdata.tables + 0x100*i, xdata.tables, 0x100);
491 
492     /* Check whether target_charset belongs to given language */
493     fidx = -1;
494     for (i = 0; i < xdata.ncharsets; i++) {
495       if (xdata.charsets[i] == options.target_enc.charset) {
496         fidx = i;
497         break;
498       }
499     }
500     if (fidx < 0)
501       return NULL;
502 
503     {
504       int map_created;
505       map_created = enca_charset_ucs2_map(options.target_enc.charset,
506                                           xdata.target_map);
507       assert(map_created);
508     }
509     atexit(xdata_free);
510   }
511 
512   /* Check whether from_charset belongs to given language */
513   fidx = -1;
514   for (i = 0; i < xdata.ncharsets; i++) {
515     if (xdata.charsets[i] == from_charset) {
516       fidx = i;
517       break;
518     }
519   }
520   if (fidx < 0)
521     return NULL;
522 
523   /* Return table if cached. */
524   if (xdata.have_table[fidx])
525     return xdata.tables + 0x100*fidx;
526 
527   /* Otherwise it must be generated */
528   {
529     int map_created;
530     map_created = enca_charset_ucs2_map(from_charset, from_map);
531     assert(map_created);
532   }
533 
534   for (i = 0; i < 0x10000; i++)
535     xdata.ucs2_map[i] = ENCA_NOT_A_CHAR;
536 
537   for (i = 0; i < 0x100; i++) {
538     size_t j = 0xff - i;
539 
540     if (xdata.target_map[j] != ENCA_NOT_A_CHAR)
541       xdata.ucs2_map[xdata.target_map[j]] = (unsigned int)j;
542   }
543 
544   /* XXX XXX XXX XXX XXX Warning: Extreme brain damage! XXX XXX XXX XXX XXX
545    * When converting to ibm866 we have to replace Belarusian/Ukrainian i/I
546    * with Latin versions.  I've been told everybody expect this. */
547   if (options.target_enc.charset == enca_name_to_charset("ibm866")) {
548     xdata.ucs2_map[0x0406] = (byte)'I';
549     xdata.ucs2_map[0x0456] = (byte)'i';
550   }
551 
552   for (i = 0; i < 0x100; i++) {
553     size_t j = 0xff - i;
554 
555     if (from_map[j] != ENCA_NOT_A_CHAR
556         && xdata.ucs2_map[from_map[j]] != ENCA_NOT_A_CHAR)
557       xdata.tables[0x100*fidx + j] = (byte)xdata.ucs2_map[from_map[j]];
558   }
559 
560   return xdata.tables + 0x100*fidx;
561 }
562 
563 static void
xdata_free(void)564 xdata_free(void)
565 {
566   enca_free(xdata.charsets);
567   enca_free(xdata.tables);
568   enca_free(xdata.have_table);
569   enca_free(xdata.ucs2_map);
570 }
571 
572 /* vim: ts=2
573  */
574