1 /* mconv.c -- Code converter. -*- coding: euc-jp; -*-
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
5
6 This file is part of the m17n library.
7
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
12
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301 USA. */
22
23 /***en
24 @enpage m17n-conv convert file code
25
26 @section m17n-conv-synopsis SYNOPSIS
27
28 m17n-conv [ OPTION ... ] [ INFILE [ OUTFILE ] ]
29
30 @section m17n-conv-description DESCRIPTION
31
32 Convert encoding of given files from one to another.
33
34 If INFILE is omitted, the input is taken from standard input. If
35 OUTFILE is omitted, the output is written to standard output.
36
37 The following OPTIONs are available.
38
39 <ul>
40
41 <li> -f FROMCODE
42
43 FROMCODE is the encoding of INFILE (defaults to UTF-8).
44
45 <li> -t TOCODE
46
47 TOCODE is the encoding of OUTFILE (defaults to UTF-8).
48
49 <li> -k
50
51 Do not stop conversion on error.
52
53 <li> -s
54
55 Suppress warnings.
56
57 <li> -v
58
59 Print progress information.
60
61 <li> -l
62
63 List available encodings.
64
65 <li> --version
66
67 Print version number.
68
69 <li> -h, --help
70
71 Print this message.
72
73 </ul>
74 */
75 /***ja
76 @japage m17n-conv �ե�����Υ����ɤ��Ѵ�����
77
78 @section m17n-conv-synopsis SYNOPSIS
79
80 m17n-conv [ OPTION ... ] [ INFILE [ OUTFILE ] ]
81
82 @section m17n-conv-description ����
83
84 Ϳ����줿�ե�����Υ����ɤ��̤Τ�Τ��Ѵ����롣
85
86 INFILE ����ά���줿���ϡ�ɸ�����Ϥ���Ȥ롣OUTFILE ����ά���줿
87 ���ϡ�ɸ����ϤؽФ���
88
89 �ʲ��Υ��ץ�������ѤǤ��롣
90
91 <ul>
92
93 <li> -f FROMCODE
94
95 FROMCODE �� INFILE �Υ����ɷϤǤ��롣(�ǥե���Ȥ� UTF-8)
96
97 <li> -t TOCODE
98
99 TOCODE �� OUTFILE �Υ����ɷϤǤ��롣(�ǥե���Ȥ� UTF-8)
100
101 <li> -k
102
103 ���顼���Ѵ�����ߤ��ʤ���
104
105 <li> -s
106
107 �ٹ��ɽ�����ʤ���
108
109 <li> -v
110
111 �ʹԾ�����ɽ�����롣
112
113 <li> -l
114
115 ���Ѳ�ǽ�ʥ����ɷϤ���롣
116
117 <li> --version
118
119 �С�������ֹ��ɽ�����롣
120
121 <li> -h, --help
122
123 ���Υ�å�������ɽ�����롣
124
125 </ul>
126 */
127
128 #ifndef FOR_DOXYGEN
129
130 #include <stdio.h>
131 #include <stdlib.h>
132 #include <string.h>
133
134 #include <m17n.h>
135 #include <m17n-misc.h>
136
137 /* Print all coding system names. */
138
139 int
compare_coding_name(const void * elt1,const void * elt2)140 compare_coding_name (const void *elt1, const void *elt2)
141 {
142 const MSymbol *n1 = elt1;
143 const MSymbol *n2 = elt2;
144
145 return strcmp (msymbol_name (*n1), msymbol_name (*n2));
146 }
147
148 void
list_coding()149 list_coding ()
150 {
151 MSymbol *codings;
152 int i, n;
153 char *name;
154 int len, clm;
155
156 n = mconv_list_codings (&codings);
157 qsort (codings, n, sizeof (MSymbol), compare_coding_name);
158 clm = 0;
159 for (i = 0; i < n; i++)
160 {
161 name = msymbol_name (codings[i]);
162 len = strlen (name) + 1;
163 if (clm + len >= 80)
164 {
165 printf ("\n");
166 clm = 0;
167 }
168 printf (" %s", name);
169 clm += len;
170 }
171 printf ("\n");
172 free (codings);
173 }
174
175
/* Print the usage message of this program to stdout and exit with
   EXIT_CODE.  PROG is the name the program was invoked with; only the
   part after its last '/' (if any) is shown.  */

void
help_exit (char *prog, int exit_code)
{
  /* Each row is { option syntax, description }.  */
  static const char *const option_help[][2] = {
    { "-f FROMCODE",
      "FROMCODE is the encoding of INFILE (defaults to UTF-8).\n" },
    { "-t TOCODE",
      "TOCODE is the encoding of OUTFILE (defaults to UTF-8).\n" },
    { "-k", "Do not stop conversion on error.\n" },
    { "-s", "Suppress warnings.\n" },
    { "-v", "Print progress information.\n" },
    { "-l", "List available encodings.\n" },
    { "--version", "Print version number.\n" },
    { "-h, --help", "Print this message.\n" }
  };
  char *last_slash = strrchr (prog, '/');
  size_t row;

  /* Strip any leading directory components.  */
  if (last_slash)
    prog = last_slash + 1;

  printf ("Usage: %s [ OPTION ... ] [ INFILE [ OUTFILE ] ]\n", prog);
  printf ("Convert encoding of given files from one to another.\n");
  printf (" If INFILE is omitted, the input is taken from standard input.\n");
  printf (" If OUTFILE is omitted, the output is written to standard output.\n");
  printf ("The following OPTIONs are available.\n");
  for (row = 0; row < sizeof option_help / sizeof option_help[0]; row++)
    printf (" %-13s %s", option_help[row][0], option_help[row][1]);
  exit (exit_code);
}
205
/* Global flags to control the behaviour.  Both start out as 0.  */

/* Nonzero means diagnostics are not printed to stderr.  */
int suppress_warning = 0;

/* Nonzero means invalid bytes or unencoded characters do not make
   the program exit.  */
int continue_on_error = 0;
209
210 /* Check invalid bytes found in the last decoding. Text property
211 Mcharset of such a byte is Mcharset_binary. */
212
213 void
check_invalid_bytes(MText * mt)214 check_invalid_bytes (MText *mt)
215 {
216 int from = 0, to = 0;
217 int len = mtext_len (mt);
218 int first = 1;
219
220 while (to < len)
221 {
222 int n = mtext_prop_range (mt, Mcharset, from, NULL, &to, 1);
223 MSymbol charset
224 = n > 0 ? (MSymbol) mtext_get_prop (mt, from, Mcharset) : Mnil;
225
226 if (charset == Mcharset_binary)
227 {
228 if (! suppress_warning)
229 {
230 if (first)
231 {
232 fprintf (stderr,
233 "Invalid bytes (at each character position);\n");
234 first = 0;
235 }
236 for (; from < to; from++)
237 fprintf (stderr, " 0x%02X(%d)",
238 mtext_ref_char (mt, from), from);
239 }
240 if (! continue_on_error)
241 {
242 if (! first)
243 fprintf (stderr, "\n");
244 exit (1);
245 }
246 }
247 else
248 from = to;
249 }
250 if (! first)
251 fprintf (stderr, "\n");
252 }
253
254
255 /* Check unencoded characters in the last encoding. Text property
256 Mcoding of such a character is Mnil. */
257
258 void
check_unencoded_chars(MText * mt,int len)259 check_unencoded_chars (MText *mt, int len)
260 {
261 int from = 0, to = 0;
262 int first = 1;
263
264 while (to < len)
265 {
266 int n = mtext_prop_range (mt, Mcoding, from, NULL, &to, 1);
267 MSymbol coding
268 = n > 0 ? (MSymbol) mtext_get_prop (mt, from, Mcoding) : Mnil;
269
270 if (coding == Mnil)
271 {
272 if (! suppress_warning)
273 {
274 if (first)
275 {
276 fprintf (stderr,
277 "Unencoded chars (at each character position):\n");
278 first = 0;
279 }
280 for (; from < to; from++)
281 fprintf (stderr, " 0x%02X(%d)",
282 mtext_ref_char (mt, from), from);
283 }
284 if (! continue_on_error)
285 {
286 if (! first)
287 fprintf (stderr, "\n");
288 exit (1);
289 }
290 }
291 else
292 from = to;
293 }
294 if (! first)
295 fprintf (stderr, "\n");
296 }
297
298
299 void
unknown_encoding(char * name)300 unknown_encoding (char *name)
301 {
302 if (! suppress_warning)
303 {
304 fprintf (stderr, "Unknown encoding: \"%s\"\n", name);
305 if (mconv_resolve_coding (msymbol ("iso-2022-jp")) == Mnil)
306 fprintf (stderr, "Perhaps the library \"m17n-db\" is missing.\n");
307 }
308 exit (1);
309 }
310
/* Unless suppress_warning is set, print VALUE to stderr formatted by
   FORMAT (a printf-style format that takes exactly one argument).
   Then exit with status 1 in either case.  */

#define FATAL_ERROR(format, value)              \
  do                                            \
    {                                           \
      if (! suppress_warning)                   \
        fprintf (stderr, format, value);        \
      exit (1);                                 \
    }                                           \
  while (0)
319
320
321 int
main(int argc,char ** argv)322 main (int argc, char **argv)
323 {
324 int verbose;
325 MSymbol incode, outcode;
326 FILE *in, *out;
327 MText *mt;
328 MConverter *converter;
329 int i;
330
331 /* Initialize the m17n library. */
332 M17N_INIT ();
333 if (merror_code != MERROR_NONE)
334 FATAL_ERROR ("%s\n", "Fail to initialize the m17n library.");
335
336 /* Default encodings are both UTF-8. */
337 incode = outcode = Mcoding_utf_8;
338 /* By default, read from standard input and write to standard output. */
339 in = stdin, out = stdout;
340 /* By default, all these flags are 0. */
341 suppress_warning = verbose = continue_on_error = 0;
342 /* Parse the command line arguments. */
343 for (i = 1; i < argc; i++)
344 {
345 if (! strcmp (argv[i], "--help")
346 || ! strcmp (argv[i], "-h")
347 || ! strcmp (argv[i], "-?"))
348 help_exit (argv[0], 0);
349 else if (! strcmp (argv[i], "--version"))
350 {
351 printf ("m17n-conv (m17n library) %s\n", M17NLIB_VERSION_NAME);
352 printf ("Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 AIST, JAPAN\n");
353 exit (0);
354 }
355 else if (! strcmp (argv[i], "-l"))
356 {
357 list_coding ();
358 M17N_FINI ();
359 exit (0);
360 }
361 else if (! strcmp (argv[i], "-f") && i + 1 < argc)
362 {
363 incode = mconv_resolve_coding (msymbol (argv[++i]));
364 if (incode == Mnil)
365 unknown_encoding (argv[i]);
366 }
367 else if (! strcmp (argv[i], "-t") && i + 1 < argc)
368 {
369 outcode = mconv_resolve_coding (msymbol (argv[++i]));
370 if (outcode == Mnil)
371 unknown_encoding (argv[i]);
372 }
373 else if (! strcmp (argv[i], "-k"))
374 continue_on_error = 1;
375 else if (! strcmp (argv[i], "-s"))
376 suppress_warning = 1;
377 else if (! strcmp (argv[i], "-v"))
378 verbose = 1;
379 else if (argv[i][0] != '-')
380 {
381 if (in == stdin)
382 {
383 in = fopen (argv[i], "r");
384 if (! in)
385 FATAL_ERROR ("Can't read the file %s\n", argv[i]);
386 }
387 else if (out == stdout)
388 {
389 out = fopen (argv[i], "w");
390 if (! out)
391 FATAL_ERROR ("Can't write the file %s\n", argv[i]);
392 }
393 else
394 help_exit (argv[0], 1);
395 }
396 else
397 help_exit (argv[0], 1);
398 }
399 if (verbose)
400 suppress_warning = 0;
401
402 /* Create an M-text to store the decoded characters. */
403 mt = mtext ();
404
405 /* Create a converter for decoding. */
406 converter = mconv_stream_converter (incode, in);
407 if (! converter)
408 FATAL_ERROR ("Encoding \"%s\" requires the missing library \"m17n-db\".\n",
409 msymbol_name (incode));
410 /* Instead of doing strict decoding, we decode all input bytes at
411 once, and check invalid bytes later by the fuction
412 check_invalid_bytes. */
413 converter->lenient = 1;
414
415 mconv_decode (converter, mt);
416
417 check_invalid_bytes (mt);
418 if (verbose)
419 fprintf (stderr, "%d bytes (%s) decoded into %d characters,\n",
420 converter->nbytes, msymbol_name (incode), mtext_len (mt));
421
422 mconv_free_converter (converter);
423
424 /* Create a converter for encoding. */
425 converter = mconv_stream_converter (outcode, out);
426 if (! converter)
427 FATAL_ERROR ("Encoding \"%s\" requires the missing library \"m17n-db\".\n",
428 msymbol_name (outcode));
429 /* Instead of doing strict encoding, we encode all characters at
430 once, and check unencoded characters later by the fuction
431 check_unencoded_chars. */
432 converter->lenient = 1;
433 converter->last_block = 1;
434 if (mconv_encode (converter, mt) < 0
435 && ! suppress_warning)
436 fprintf (stderr, "I/O error on writing\n");
437 check_unencoded_chars (mt, converter->nchars);
438 if (verbose)
439 fprintf (stderr, "%d characters encoded into %d bytes (%s).\n",
440 converter->nchars, converter->nbytes, msymbol_name (outcode));
441
442 /* Clear away. */
443 mconv_free_converter (converter);
444 fclose (in);
445 fclose (out);
446 m17n_object_unref (mt);
447 M17N_FINI ();
448 exit (0);
449 }
450 #endif /* not FOR_DOXYGEN */
451