1 /* mconv.c -- Code converter.				-*- coding: euc-jp; -*-
2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3      National Institute of Advanced Industrial Science and Technology (AIST)
4      Registration Number H15PRO112
5 
6    This file is part of the m17n library.
7 
8    The m17n library is free software; you can redistribute it and/or
9    modify it under the terms of the GNU Lesser General Public License
10    as published by the Free Software Foundation; either version 2.1 of
11    the License, or (at your option) any later version.
12 
13    The m17n library is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    Lesser General Public License for more details.
17 
18    You should have received a copy of the GNU Lesser General Public
19    License along with the m17n library; if not, write to the Free
20    Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21    Boston, MA 02110-1301 USA.  */
22 
23 /***en
24     @enpage m17n-conv convert file code
25 
26     @section m17n-conv-synopsis SYNOPSIS
27 
28     m17n-conv [ OPTION ... ] [ INFILE [ OUTFILE ] ]
29 
30     @section m17n-conv-description DESCRIPTION
31 
32     Convert encoding of given files from one to another.
33 
34     If INFILE is omitted, the input is taken from standard input.  If
    OUTFILE is omitted, the output is written to standard output.
36 
37     The following OPTIONs are available.
38 
39     <ul>
40 
41     <li> -f FROMCODE
42 
43     FROMCODE is the encoding of INFILE (defaults to UTF-8).
44 
45     <li> -t TOCODE
46 
47     TOCODE is the encoding of OUTFILE (defaults to UTF-8).
48 
49     <li> -k
50 
51     Do not stop conversion on error.
52 
53     <li> -s
54 
55     Suppress warnings.
56 
57     <li> -v
58 
59     Print progress information.
60 
61     <li> -l
62 
63     List available encodings.
64 
65     <li> --version
66 
67     Print version number.
68 
69     <li> -h, --help
70 
71     Print this message.
72 
73     </ul>
74 */
75 /***ja
76     @japage m17n-conv �ե�����Υ����ɤ��Ѵ�����
77 
78     @section m17n-conv-synopsis SYNOPSIS
79 
80     m17n-conv [ OPTION ... ] [ INFILE [ OUTFILE ] ]
81 
82     @section m17n-conv-description ����
83 
84     Ϳ����줿�ե�����Υ����ɤ��̤Τ�Τ��Ѵ����롣
85 
86     INFILE ����ά���줿���ϡ�ɸ�����Ϥ���Ȥ롣OUTFILE ����ά���줿
87     ���ϡ�ɸ����Ϥؽ��Ф���
88 
89     �ʲ��Υ��ץ�������ѤǤ��롣
90 
91     <ul>
92 
93     <li> -f FROMCODE
94 
95     FROMCODE �� INFILE �Υ����ɷϤǤ��롣(�ǥե���Ȥ� UTF-8)
96 
97     <li> -t TOCODE
98 
99     TOCODE �� OUTFILE �Υ����ɷϤǤ��롣(�ǥե���Ȥ� UTF-8)
100 
101     <li> -k
102 
103     ���顼���Ѵ�����ߤ��ʤ���
104 
105     <li> -s
106 
107     �ٹ��ɽ�����ʤ���
108 
109     <li> -v
110 
111     �ʹԾ�����ɽ�����롣
112 
113     <li> -l
114 
115     ���Ѳ�ǽ�ʥ����ɷϤ�����롣
116 
117     <li> --version
118 
119     �С�������ֹ��ɽ�����롣
120 
121     <li> -h, --help
122 
123     ���Υ�å�������ɽ�����롣
124 
125     </ul>
126 */
127 
128 #ifndef FOR_DOXYGEN
129 
130 #include <stdio.h>
131 #include <stdlib.h>
132 #include <string.h>
133 
134 #include <m17n.h>
135 #include <m17n-misc.h>
136 
137 /* Print all coding system names.  */
138 
139 int
compare_coding_name(const void * elt1,const void * elt2)140 compare_coding_name (const void *elt1, const void *elt2)
141 {
142   const MSymbol *n1 = elt1;
143   const MSymbol *n2 = elt2;
144 
145   return strcmp (msymbol_name (*n1), msymbol_name (*n2));
146 }
147 
148 void
list_coding()149 list_coding ()
150 {
151   MSymbol *codings;
152   int i, n;
153   char *name;
154   int len, clm;
155 
156   n = mconv_list_codings (&codings);
157   qsort (codings, n, sizeof (MSymbol), compare_coding_name);
158   clm = 0;
159   for (i = 0; i < n; i++)
160     {
161       name = msymbol_name (codings[i]);
162       len = strlen (name) + 1;
163       if (clm + len >= 80)
164 	{
165 	  printf ("\n");
166 	  clm = 0;
167 	}
168       printf (" %s", name);
169       clm += len;
170     }
171   printf ("\n");
172   free (codings);
173 }
174 
175 
/* Print the usage message of this program to standard output and exit
   with EXIT_CODE.  PROG is the name the program was invoked with;
   only the part that follows its last '/' (if any) is shown.  */

void
help_exit (char *prog, int exit_code)
{
  /* Each row is an option and its description (newline included).  */
  static const char *const option_help[][2] = {
    { "-f FROMCODE",
      "FROMCODE is the encoding of INFILE (defaults to UTF-8).\n" },
    { "-t TOCODE",
      "TOCODE is the encoding of OUTFILE (defaults to UTF-8).\n" },
    { "-k", "Do not stop conversion on error.\n" },
    { "-s", "Suppress warnings.\n" },
    { "-v", "Print progress information.\n" },
    { "-l", "List available encodings.\n" },
    { "--version", "Print version number.\n" },
    { "-h, --help", "Print this message.\n" }
  };
  char *slash = strrchr (prog, '/');
  size_t row;

  /* Strip any leading directory components from the program name.  */
  if (slash)
    prog = slash + 1;

  printf ("Usage: %s [ OPTION ... ] [ INFILE [ OUTFILE ] ]\n", prog);
  puts ("Convert encoding of given files from one to another.");
  puts ("  If INFILE is omitted, the input is taken from standard input.");
  puts ("  If OUTFILE is omitted, the output is written to standard output.");
  puts ("The following OPTIONs are available.");
  for (row = 0; row < sizeof option_help / sizeof option_help[0]; row++)
    printf ("  %-13s %s", option_help[row][0], option_help[row][1]);
  exit (exit_code);
}
205 
/* Program-wide flags that control the behaviour.  Both are 0 until
   main sets them while parsing the command line.  */

/* Nonzero means diagnostics are not printed to stderr.  */
int suppress_warning = 0;

/* Nonzero means invalid bytes and unencoded characters do not make
   the program exit.  */
int continue_on_error = 0;
209 
210 /* Check invalid bytes found in the last decoding.  Text property
211    Mcharset of such a byte is Mcharset_binary.  */
212 
213 void
check_invalid_bytes(MText * mt)214 check_invalid_bytes (MText *mt)
215 {
216   int from = 0, to = 0;
217   int len = mtext_len (mt);
218   int first = 1;
219 
220   while (to < len)
221     {
222       int n = mtext_prop_range (mt, Mcharset, from, NULL, &to, 1);
223       MSymbol charset
224 	= n > 0 ? (MSymbol) mtext_get_prop (mt, from, Mcharset) : Mnil;
225 
226       if (charset == Mcharset_binary)
227 	{
228 	  if (! suppress_warning)
229 	    {
230 	      if (first)
231 		{
232 		  fprintf (stderr,
233 			   "Invalid bytes (at each character position);\n");
234 		  first = 0;
235 		}
236 	      for (; from < to; from++)
237 		fprintf (stderr, " 0x%02X(%d)",
238 			 mtext_ref_char (mt, from), from);
239 	    }
240 	  if (! continue_on_error)
241 	    {
242 	      if (! first)
243 		fprintf (stderr, "\n");
244 	      exit (1);
245 	    }
246 	}
247       else
248 	from = to;
249     }
250   if (! first)
251     fprintf (stderr, "\n");
252 }
253 
254 
255 /* Check unencoded characters in the last encoding.  Text property
256    Mcoding of such a character is Mnil.  */
257 
258 void
check_unencoded_chars(MText * mt,int len)259 check_unencoded_chars (MText *mt, int len)
260 {
261   int from = 0, to = 0;
262   int first = 1;
263 
264   while (to < len)
265     {
266       int n = mtext_prop_range (mt, Mcoding, from, NULL, &to, 1);
267       MSymbol coding
268 	= n > 0 ? (MSymbol) mtext_get_prop (mt, from, Mcoding) : Mnil;
269 
270       if (coding == Mnil)
271 	{
272 	  if (! suppress_warning)
273 	    {
274 	      if (first)
275 		{
276 		  fprintf (stderr,
277 			   "Unencoded chars (at each character position):\n");
278 		  first = 0;
279 		}
280 	      for (; from < to; from++)
281 		fprintf (stderr, " 0x%02X(%d)",
282 			 mtext_ref_char (mt, from), from);
283 	    }
284 	  if (! continue_on_error)
285 	    {
286 	      if (! first)
287 		fprintf (stderr, "\n");
288 	      exit (1);
289 	    }
290 	}
291       else
292 	from = to;
293     }
294   if (! first)
295     fprintf (stderr, "\n");
296 }
297 
298 
299 void
unknown_encoding(char * name)300 unknown_encoding (char *name)
301 {
302   if (! suppress_warning)
303     {
304       fprintf (stderr, "Unknown encoding: \"%s\"\n", name);
305       if (mconv_resolve_coding (msymbol ("iso-2022-jp")) == Mnil)
306 	fprintf (stderr, "Perhaps the library \"m17n-db\" is missing.\n");
307     }
308   exit (1);
309 }
310 
/* Print ARG formatted by FMT to stderr, unless suppress_warning is
   set, and then exit with status 1.  */

#define FATAL_ERROR(fmt, arg)                   \
  do                                            \
    {                                           \
      if (suppress_warning == 0)                \
        fprintf (stderr, fmt, arg);             \
      exit (1);                                 \
    }                                           \
  while (0)
319 
320 
321 int
main(int argc,char ** argv)322 main (int argc, char **argv)
323 {
324   int verbose;
325   MSymbol incode, outcode;
326   FILE *in, *out;
327   MText *mt;
328   MConverter *converter;
329   int i;
330 
331   /* Initialize the m17n library.  */
332   M17N_INIT ();
333   if (merror_code != MERROR_NONE)
334     FATAL_ERROR ("%s\n", "Fail to initialize the m17n library.");
335 
336   /* Default encodings are both UTF-8.  */
337   incode = outcode = Mcoding_utf_8;
338   /* By default, read from standard input and write to standard output. */
339   in = stdin, out = stdout;
340   /* By default, all these flags are 0.  */
341   suppress_warning = verbose = continue_on_error = 0;
342   /* Parse the command line arguments.  */
343   for (i = 1; i < argc; i++)
344     {
345       if (! strcmp (argv[i], "--help")
346 	       || ! strcmp (argv[i], "-h")
347 	       || ! strcmp (argv[i], "-?"))
348 	help_exit (argv[0], 0);
349       else if (! strcmp (argv[i], "--version"))
350 	{
351 	  printf ("m17n-conv (m17n library) %s\n", M17NLIB_VERSION_NAME);
352 	  printf ("Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 AIST, JAPAN\n");
353 	  exit (0);
354 	}
355       else if (! strcmp (argv[i], "-l"))
356 	{
357 	  list_coding ();
358 	  M17N_FINI ();
359 	  exit (0);
360 	}
361       else if (! strcmp (argv[i], "-f") && i + 1 < argc)
362 	{
363 	  incode = mconv_resolve_coding (msymbol (argv[++i]));
364 	  if (incode == Mnil)
365 	    unknown_encoding (argv[i]);
366 	}
367       else if (! strcmp (argv[i], "-t") && i + 1 < argc)
368 	{
369 	  outcode = mconv_resolve_coding (msymbol (argv[++i]));
370 	  if (outcode == Mnil)
371 	    unknown_encoding (argv[i]);
372 	}
373       else if (! strcmp (argv[i], "-k"))
374 	continue_on_error = 1;
375       else if (! strcmp (argv[i], "-s"))
376 	suppress_warning = 1;
377       else if (! strcmp (argv[i], "-v"))
378 	verbose = 1;
379       else if (argv[i][0] != '-')
380 	{
381 	  if (in == stdin)
382 	    {
383 	      in = fopen (argv[i], "r");
384 	      if (! in)
385 		FATAL_ERROR ("Can't read the file %s\n", argv[i]);
386 	    }
387 	  else if (out == stdout)
388 	    {
389 	      out = fopen (argv[i], "w");
390 	      if (! out)
391 		FATAL_ERROR ("Can't write the file %s\n", argv[i]);
392 	    }
393 	  else
394 	    help_exit (argv[0], 1);
395 	}
396       else
397 	help_exit (argv[0], 1);
398     }
399   if (verbose)
400     suppress_warning = 0;
401 
402   /* Create an M-text to store the decoded characters.  */
403   mt = mtext ();
404 
405   /* Create a converter for decoding.  */
406   converter = mconv_stream_converter (incode, in);
407   if (! converter)
408     FATAL_ERROR ("Encoding \"%s\" requires the missing library \"m17n-db\".\n",
409 		 msymbol_name (incode));
410   /* Instead of doing strict decoding, we decode all input bytes at
411      once, and check invalid bytes later by the fuction
412      check_invalid_bytes.  */
413   converter->lenient = 1;
414 
415   mconv_decode (converter, mt);
416 
417   check_invalid_bytes (mt);
418   if (verbose)
419     fprintf (stderr, "%d bytes (%s) decoded into %d characters,\n",
420 	     converter->nbytes, msymbol_name (incode), mtext_len (mt));
421 
422   mconv_free_converter (converter);
423 
424   /* Create a converter for encoding.  */
425   converter = mconv_stream_converter (outcode, out);
426   if (! converter)
427     FATAL_ERROR ("Encoding \"%s\" requires the missing library \"m17n-db\".\n",
428 		 msymbol_name (outcode));
429   /* Instead of doing strict encoding, we encode all characters at
430      once, and check unencoded characters later by the fuction
431      check_unencoded_chars.  */
432   converter->lenient = 1;
433   converter->last_block = 1;
434   if (mconv_encode (converter, mt) < 0
435       && ! suppress_warning)
436     fprintf (stderr, "I/O error on writing\n");
437   check_unencoded_chars (mt, converter->nchars);
438   if (verbose)
439     fprintf (stderr, "%d characters encoded into %d bytes (%s).\n",
440 	     converter->nchars, converter->nbytes, msymbol_name (outcode));
441 
442   /* Clear away.  */
443   mconv_free_converter (converter);
444   fclose (in);
445   fclose (out);
446   m17n_object_unref (mt);
447   M17N_FINI ();
448   exit (0);
449 }
450 #endif /* not FOR_DOXYGEN */
451