1 /*
2 conversion to other encodings
3
4 Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of version 2 of the GNU General Public License as published
8 by the Free Software Foundation.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 more details.
14
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation, Inc.,
17 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #include "common.h"
20
21 #ifdef HAVE_SYS_WAIT_H
22 # include <sys/wait.h>
23 #else
24 pid_t waitpid(pid_t pid, int *status, int options);
25 #endif
26
27 /* We can't go on w/o this, defining struct stat manually is braindamaged. */
28 #include <sys/types.h>
29 #include <sys/stat.h>
30
31 /* converter flags */
32 #define CONV_EXTERN 0x0001
33
34 /* converter-type (filename, input encoding, output encoding) */
35 typedef int (* ConverterFunc)(File*, EncaEncoding);
36
37 /* struct converter data */
38 typedef struct _ConverterData ConverterData;
39
40 struct _ConverterData {
41 unsigned long int flags; /* flags */
42 ConverterFunc convfunc; /* pointer to converter function */
43 };
44
45 /* struct converter list */
46 typedef struct _Converter Converter;
47
48 struct _Converter {
49 const Abbreviation *conv; /* the converter (an abbreviation table entry) */
50 Converter *next; /* next in the list */
51 };
52
53 /* converter list */
54 static Converter *converters = NULL;
55
56 /* data for xtable */
57 static struct {
58 size_t ncharsets; /* number of charsets */
59 int *charsets; /* charset id's for active language [ncharsets] */
60 byte *tables; /* tables from charsets to target_charset [ncharsets * 0x100] */
61 int *have_table; /* whether particular table is already cached [ncharsets] */
62 unsigned int *ucs2_map; /* temporary space for map computation [0x10000] */
63 unsigned int target_map[0x100];
64 }
65 xdata = { 0, NULL, NULL, NULL, NULL, { 0 } };
66
67 /* Local prototypes. */
68 static int convert_builtin (File *file,
69 EncaEncoding from_enc);
70 static const byte* xtable (int from_charset);
71 static void xdata_free (void);
72
73 static const ConverterData cdata_builtin = { 0, &convert_builtin };
74 #ifdef HAVE_LIBRECODE
75 static const ConverterData cdata_librecode = { 0, &convert_recode };
76 #endif /* HAVE_LIBRECODE */
77 #ifdef HAVE_GOOD_ICONV
78 static const ConverterData cdata_iconv = { 0, &convert_iconv };
79 #endif /* HAVE_GOOD_ICONV */
80 #ifdef ENABLE_EXTERNAL
81 static const ConverterData cdata_extern = { CONV_EXTERN, &convert_external };
82 #endif /* ENABLE_EXTERNAL */
83
84 static const Abbreviation CONVERTERS[] = {
85 { "built-in", &cdata_builtin },
86 #ifdef HAVE_LIBRECODE
87 { "librecode", &cdata_librecode },
88 #endif /* HAVE_LIBRECODE */
89 #ifdef HAVE_GOOD_ICONV
90 { "iconv", &cdata_iconv },
91 #endif /* HAVE_GOOD_ICONV */
92 #ifdef ENABLE_EXTERNAL
93 { "extern", &cdata_extern }
94 #endif /* ENABLE_EXTERNAL */
95 };
96
97 /* decide which converter should be run and do common checks
98 from_enc, to_enc are current and requested encoding
99 returns error code
100
101 it doesn't open the file (guess() did it) and doesn't close it (caller does
102 it) */
103 int
convert(File * file,EncaEncoding from_enc)104 convert(File *file,
105 EncaEncoding from_enc)
106 {
107 Converter *conv;
108 int err;
109
110 if (options.verbosity_level) {
111 fprintf(stderr, "%s: converting `%s': %s\n",
112 program_name, ffname_r(file->name),
113 format_request_string(from_enc, options.target_enc, 0));
114 }
115
116 /* do nothing when requested encoding is current encoding
117 (`nothing' may include copying stdin to stdout) */
118 if (from_enc.charset == options.target_enc.charset
119 && from_enc.surface == options.target_enc.surface) {
120 if (file->name != NULL)
121 return ERR_OK;
122 else
123 return copy_and_convert(file, file, NULL);
124 }
125
126 /* try sequentially all allowed converters until we find some that can
127 perform the conversion or exahust the list */
128 conv = converters;
129 while (conv != NULL) {
130 if (options.verbosity_level > 1) {
131 fprintf(stderr, " trying to convert `%s' using %s\n",
132 ffname_r(file->name), conv->conv->name);
133 }
134 err = ((ConverterData *)conv->conv->data)->convfunc(file, from_enc);
135 if (err == ERR_OK)
136 return ERR_OK;
137
138 if ((((ConverterData *)conv->conv->data)->flags & CONV_EXTERN) != 0) {
139 fprintf(stderr, "%s: external converter failed on `%s', "
140 "probably destroying it\n",
141 program_name, ffname_w(file->name));
142 }
143 /* don't tempt fate in case of i/o or other serious problems */
144 if (err != ERR_CANNOT)
145 return ERR_IOFAIL;
146
147 conv = conv->next;
148 }
149
150 /* no converter able/allowed to perform given conversion, that's bad */
151 fprintf(stderr, "%s: no converter is able/allowed to perform "
152 "conversion %s on file `%s'\n",
153 program_name,
154 format_request_string(from_enc, options.target_enc, 0),
155 ffname_r(file->name));
156
157 /* nevertheless stdin should be copied to stdout anyway it cannot make
158 more mess */
159 if (file->name == NULL)
160 copy_and_convert(file, file, NULL);
161
162 return ERR_CANNOT;
163 }
164
165 /* built-in converter
166 performs conversion by in place modification of file named fname
167 or by calling copy_and_convert() for stdin -> stdout conversion
168 returns zero on success, error code otherwise */
169 static int
convert_builtin(File * file,EncaEncoding from_enc)170 convert_builtin(File *file,
171 EncaEncoding from_enc)
172 {
173 static int ascii = ENCA_CS_UNKNOWN;
174
175 Buffer *buf; /* file->buffer alias */
176 const byte *xlat; /* conversion table */
177
178 if (!enca_charset_is_known(ascii)) {
179 ascii = enca_name_to_charset("ascii");
180 assert(enca_charset_is_known(ascii));
181 }
182
183 /* surfaces can cause fail iff user specificaly requested some
184 * or when they are other type than EOLs */
185 {
186 EncaSurface srf = options.target_enc.surface ^ from_enc.surface;
187
188 if ((options.target_enc.surface
189 && from_enc.surface != options.target_enc.surface)
190 || srf != (srf & ENCA_SURFACE_MASK_EOL)) {
191 if (options.verbosity_level > 2)
192 fprintf(stderr, "%s: built-in: cannot convert between "
193 "different surfaces\n",
194 program_name);
195 return ERR_CANNOT;
196 }
197 }
198
199 /* catch trivial conversions */
200 {
201 int identity = 0;
202
203 if (from_enc.charset == options.target_enc.charset)
204 identity = 1;
205
206 if (from_enc.charset == ascii
207 && enca_charset_is_8bit(options.target_enc.charset)
208 && !enca_charset_is_binary(options.target_enc.charset))
209 identity = 1;
210
211 if (identity) {
212 if (file->name == NULL)
213 return copy_and_convert(file, file, NULL);
214 else
215 return ERR_OK;
216 }
217 }
218
219 xlat = xtable(from_enc.charset);
220 if (xlat == NULL)
221 return ERR_CANNOT;
222
223 if (file->name == NULL)
224 return copy_and_convert(file, file, xlat);
225
226 /* read buffer_size bytes, convert, write back, etc. to death (or eof,
227 whichever come first) */
228 buf = file->buffer;
229 buf->pos = 0;
230 if (file_seek(file, 0, SEEK_SET) == -1)
231 return ERR_IOFAIL;
232
233 do {
234 if (file_read(file) == -1)
235 return ERR_IOFAIL;
236
237 if (buf->pos == 0)
238 break;
239
240 {
241 size_t len = buf->pos;
242 byte *p = buf->data;
243 do {
244 *p = xlat[*p];
245 p++;
246 } while (--len);
247 }
248
249 if (file_seek(file, -(buf->pos), SEEK_CUR) == -1)
250 return ERR_IOFAIL;
251
252 if (file_write(file) == -1)
253 return ERR_IOFAIL;
254
255 /* XXX: apparent no-op
256 but ISO C requires fseek() or ftell() between subsequent fwrite() and
257 fread(), or else the latter _may_ read nonsense -- and it actually does
258 read nonsense with glibc-2.2 (at least); see fopen(3) */
259 if (file_seek(file, 0, SEEK_CUR) == -1)
260 return ERR_IOFAIL;
261
262 } while (1);
263
264 return ERR_OK;
265 }
266
267 /* copy file file_from to file file_to, optionally performing xlat conversion
268 (if not NULL)
269 file_from has to be already opened for reading,
270 file_to has to be already opened for writing
271 they have to share common buffer
272 returns 0 on success, nonzero on failure */
273 int
copy_and_convert(File * file_from,File * file_to,const byte * xlat)274 copy_and_convert(File *file_from, File *file_to, const byte *xlat)
275 {
276 Buffer *buf; /* file_from->buffer alias */
277
278 if (xlat == NULL && options.verbosity_level > 3)
279 fprintf(stderr, " copying `%s' to `%s'\n",
280 ffname_r(file_from->name),
281 ffname_w(file_to->name));
282
283 assert(file_from->buffer == file_to->buffer);
284 buf = file_from->buffer;
285 /* If there's something in the buffer, process it first. */
286 if (file_from->buffer->pos != 0) {
287 if (xlat != NULL) {
288 size_t len = buf->pos;
289 byte *p = buf->data;
290 do {
291 *p = xlat[*p];
292 p++;
293 } while (--len);
294 }
295 if (file_write(file_to) == -1)
296 return ERR_IOFAIL;
297 }
298 /* Then copy the rest. */
299 do {
300 if (file_read(file_from) == -1)
301 return ERR_IOFAIL;
302
303 if (buf->pos == 0)
304 break;
305
306 if (xlat != NULL) {
307 size_t len = buf->pos;
308 byte *p = buf->data;
309 do {
310 *p = xlat[*p];
311 p++;
312 } while (--len);
313 }
314
315 if (file_write(file_to) == -1)
316 return ERR_IOFAIL;
317 } while (1);
318 fflush(file_to->stream);
319
320 return ERR_OK;
321 }
322
323 /* add converter to list of converters
324 (note `none' adds nothing and causes removing of all converters instead)
325 returns zero if everything went ok, nonzero otherwise */
326 int
add_converter(const char * cname)327 add_converter(const char *cname)
328 {
329 /* no converters symbolic name */
330 static const char *CONVERTER_NAME_NONE = "none";
331
332 const Abbreviation *data;
333 Converter *conv = NULL, *conv1;
334
335 /* remove everything when we got `none' */
336 if (strcmp(CONVERTER_NAME_NONE, cname) == 0) {
337 if (options.verbosity_level > 3)
338 fprintf(stderr, "Removing all converters\n");
339 while (converters != NULL) {
340 conv = converters->next;
341 enca_free(converters);
342 converters = conv;
343 }
344 return 0;
345 }
346
347 /* find converter data */
348 data = expand_abbreviation(cname, CONVERTERS, ELEMENTS(CONVERTERS),
349 "converter");
350 if (data == NULL)
351 return 1;
352
353 /* add it to the end of converter list */
354 if (options.verbosity_level > 3)
355 fprintf(stderr, "Adding converter `%s'\n", data->name);
356 if (converters == NULL)
357 converters = conv = NEW(Converter, 1);
358 else {
359 for (conv1 = converters; conv1 != NULL; conv1 = conv1->next) {
360 /* reject duplicities */
361 if (data == conv1->conv->data) {
362 fprintf(stderr, "%s: converter %s specified more than once\n",
363 program_name,
364 conv1->conv->name);
365 return 1;
366 }
367 conv = conv1;
368 }
369
370 conv->next = NEW(Converter, 1);
371 conv = conv->next;
372 }
373 conv->next = NULL;
374 conv->conv = data;
375
376 return 0;
377 }
378
379 /* return nonzero if the list contains external converter */
380 int
external_converter_listed(void)381 external_converter_listed(void)
382 {
383 Converter *conv;
384
385 for (conv = converters; conv; conv = conv->next) {
386 if (((ConverterData*)conv->conv->data)->flags & CONV_EXTERN)
387 return 1;
388 }
389
390 return 0;
391 }
392
393 /* print white separated list of all valid converter names */
394 void
print_converter_list(void)395 print_converter_list(void)
396 {
397 size_t i;
398
399 for (i = 0; i < sizeof(CONVERTERS)/sizeof(Abbreviation); i++)
400 printf("%s\n", CONVERTERS[i].name);
401 }
402
403 /* create and return request string for conversion from e1 to e2
404 filters out natrual surfaces || mask
405 is NOT thread-safe
406 returned string must NOT be freed and must be cosidered volatile */
407 const char*
format_request_string(EncaEncoding e1,EncaEncoding e2,EncaSurface mask)408 format_request_string(EncaEncoding e1,
409 EncaEncoding e2,
410 EncaSurface mask)
411 {
412 static char *s = NULL;
413 char *p, *q;
414 const char *e2_name, *e1_name;
415
416 enca_free(s);
417 /* build s sequentially since value returned by surface_name() is lost
418 by the second call */
419 e1_name = enca_charset_name(e1.charset, ENCA_NAME_STYLE_ENCA);
420 p = enca_get_surface_name(e1.surface
421 & ~(enca_charset_natural_surface(e1.charset)
422 | mask),
423 ENCA_NAME_STYLE_ENCA);
424 if (!enca_charset_is_known(e2.charset)) {
425 q = enca_strdup("");
426 e2_name = options.target_enc_str;
427 }
428 else {
429 q = enca_get_surface_name(e2.surface
430 & ~(enca_charset_natural_surface(e2.charset)
431 | mask),
432 ENCA_NAME_STYLE_ENCA);
433 e2_name = enca_charset_name(e2.charset, ENCA_NAME_STYLE_ENCA);
434 }
435
436 s = enca_strconcat(e1_name, p, "..", e2_name, q, NULL);
437
438 enca_free(p);
439 enca_free(q);
440
441 return s;
442 }
443
444 /**
445 * xtable:
446 * @from_charset: Charset id for which the conversion table should be returned.
447 *
448 * Returns translation table from charset @from to (global) target charset.
449 *
450 * The returned table must be considered constant and must NOT be freed.
451 *
452 * Only conversion between charsets of one language is supported. We assume
453 * a language contains all known charsets usable for represenation of texts,
454 * so other charsets are taken as incompatible.
455 *
456 * Globals used: options.target_enc.charset, options.language.
457 *
458 * Returns: The conversion table [0x100]; #NULL on failure.
459 **/
460 static const byte*
xtable(int from_charset)461 xtable(int from_charset)
462 {
463 static int xtable_initialized = 0;
464
465 unsigned int from_map[0x100];
466 size_t i;
467 ssize_t fidx;
468
469 if (!enca_charset_has_ucs2_map(options.target_enc.charset)
470 || !enca_charset_has_ucs2_map(from_charset))
471 return NULL;
472
473 /* Initialize when we are called the first time. */
474 if (!xtable_initialized) {
475 /* Allocate various tables. Never freed. */
476 xdata.charsets = enca_get_language_charsets(options.language,
477 &xdata.ncharsets);
478 assert(xdata.ncharsets > 1);
479 xdata.have_table = NEW(int, xdata.ncharsets);
480 xdata.tables = NEW(byte, 0x100*xdata.ncharsets);
481 xdata.ucs2_map = NEW(unsigned int, 0x10000);
482
483 for (i = 0; i < xdata.ncharsets; i++)
484 xdata.have_table[i] = 0;
485
486 /* Initialize tables to identity */
487 for (i = 0; i < 0x100; i++)
488 xdata.tables[i] = (byte)i;
489 for (i = 1; i < xdata.ncharsets; i++)
490 memcpy(xdata.tables + 0x100*i, xdata.tables, 0x100);
491
492 /* Check whether target_charset belongs to given language */
493 fidx = -1;
494 for (i = 0; i < xdata.ncharsets; i++) {
495 if (xdata.charsets[i] == options.target_enc.charset) {
496 fidx = i;
497 break;
498 }
499 }
500 if (fidx < 0)
501 return NULL;
502
503 {
504 int map_created;
505 map_created = enca_charset_ucs2_map(options.target_enc.charset,
506 xdata.target_map);
507 assert(map_created);
508 }
509 atexit(xdata_free);
510 }
511
512 /* Check whether from_charset belongs to given language */
513 fidx = -1;
514 for (i = 0; i < xdata.ncharsets; i++) {
515 if (xdata.charsets[i] == from_charset) {
516 fidx = i;
517 break;
518 }
519 }
520 if (fidx < 0)
521 return NULL;
522
523 /* Return table if cached. */
524 if (xdata.have_table[fidx])
525 return xdata.tables + 0x100*fidx;
526
527 /* Otherwise it must be generated */
528 {
529 int map_created;
530 map_created = enca_charset_ucs2_map(from_charset, from_map);
531 assert(map_created);
532 }
533
534 for (i = 0; i < 0x10000; i++)
535 xdata.ucs2_map[i] = ENCA_NOT_A_CHAR;
536
537 for (i = 0; i < 0x100; i++) {
538 size_t j = 0xff - i;
539
540 if (xdata.target_map[j] != ENCA_NOT_A_CHAR)
541 xdata.ucs2_map[xdata.target_map[j]] = (unsigned int)j;
542 }
543
544 /* XXX XXX XXX XXX XXX Warning: Extreme brain damage! XXX XXX XXX XXX XXX
545 * When converting to ibm866 we have to replace Belarusian/Ukrainian i/I
546 * with Latin versions. I've been told everybody expect this. */
547 if (options.target_enc.charset == enca_name_to_charset("ibm866")) {
548 xdata.ucs2_map[0x0406] = (byte)'I';
549 xdata.ucs2_map[0x0456] = (byte)'i';
550 }
551
552 for (i = 0; i < 0x100; i++) {
553 size_t j = 0xff - i;
554
555 if (from_map[j] != ENCA_NOT_A_CHAR
556 && xdata.ucs2_map[from_map[j]] != ENCA_NOT_A_CHAR)
557 xdata.tables[0x100*fidx + j] = (byte)xdata.ucs2_map[from_map[j]];
558 }
559
560 return xdata.tables + 0x100*fidx;
561 }
562
563 static void
xdata_free(void)564 xdata_free(void)
565 {
566 enca_free(xdata.charsets);
567 enca_free(xdata.tables);
568 enca_free(xdata.have_table);
569 enca_free(xdata.ucs2_map);
570 }
571
572 /* vim: ts=2
573 */
574