1
2(********************************************************************)
3(*                                                                  *)
4(*  toutf8.sd7    Convert a file to UTF-8                           *)
5(*  Copyright (C) 2006, 2010, 2015  Thomas Mertes                   *)
6(*                                                                  *)
7(*  This program is free software; you can redistribute it and/or   *)
8(*  modify it under the terms of the GNU General Public License as  *)
9(*  published by the Free Software Foundation; either version 2 of  *)
10(*  the License, or (at your option) any later version.             *)
11(*                                                                  *)
12(*  This program is distributed in the hope that it will be useful, *)
13(*  but WITHOUT ANY WARRANTY; without even the implied warranty of  *)
14(*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   *)
15(*  GNU General Public License for more details.                    *)
16(*                                                                  *)
17(*  You should have received a copy of the GNU General Public       *)
18(*  License along with this program; if not, write to the           *)
19(*  Free Software Foundation, Inc., 51 Franklin Street,             *)
20(*  Fifth Floor, Boston, MA  02110-1301, USA.                       *)
21(*                                                                  *)
22(********************************************************************)
23
24
25$ include "seed7_05.s7i";
26  include "stdio.s7i";
27  include "osfiles.s7i";
28  include "charsets.s7i";
29  include "utf8.s7i";
30  include "console.s7i";
31
32
(**
 *  Write the version banner, the usage line and the list of
 *  supported codepage / charset names to the standard output.
 *)
const proc: writeHelp is func
  begin
    writeln("Toutf8 Version 1.0 - Convert a file to UTF-8");
    writeln("Copyright (C) 2006, 2010, 2015 Thomas Mertes");
    writeln("This is free software; see the source for copying conditions.  There is NO");
    writeln("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.");
    writeln("Toutf8 is written in the Seed7 programming language");
    writeln("Homepage: http://seed7.sourceforge.net");
    writeln;
    writeln("usage: toutf8 -codepage infile [outfile]");
    writeln;
    writeln("Converts a file encoded with a codepage to UTF-8.");
    writeln("The following codepages are supported:");
    writeln("  437, 708, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863,");
    writeln("  864, 865, 866, 869, 874, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256,");
    writeln("  1257, 1258, 8859-1, 8859-2, 8859-3, 8859-4, 8859-5, 8859-6, 8859-7,");
    writeln("  8859-8, 8859-9, 8859-10, 8859-11, 8859-13, 8859-14, 8859-15, 8859-16,");
    writeln("  latin-1, latin-2, latin-3, latin-4, latin-5, latin-6, latin-7, latin-8,");
    writeln("  latin-9, 037, 273, 277, 280, 285, 297, 500, 1047");
    writeln("The following IANA/MIME charset names are also accepted:");
    writeln("  ANSI_X3.4-1968, ARMSCII-8, ASCII, CP437, CP850, GEOSTD8, IBM437, IBM850,");
    writeln("  ISO_8859-1, ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5,");
    writeln("  ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, ISO-8859-11,");
    writeln("  ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16, KOI8-R, KOI8-U,");
    writeln("  MACINTOSH, NS_4551-1, TIS-620, US-ASCII, UTF-16BE, UTF-16LE, UTF-7, UTF-8,");
    writeln("  UTF8, VISCII, WINDOWS-1250, WINDOWS-1251, WINDOWS-1252, WINDOWS-1253,");
    writeln("  WINDOWS-1254, WINDOWS-1255, WINDOWS-1256, WINDOWS-1257, WINDOWS-1258");
  end func;


(**
 *  Convert ''stri'' in place from the encoding ''conv_name'' to Unicode.
 *  Names that are not handled explicitly are passed on to
 *  conv2unicodeByName. If that raises RANGE_ERROR a message is
 *  written and FALSE is returned.
 *  @param stri Bytes read from the input file; replaced by the
 *         converted text.
 *  @param conv_name Codepage or charset name, without a leading "-".
 *  @return TRUE if the conversion succeeded,
 *          FALSE if the codepage is not supported.
 *)
const func boolean: decodeToUnicode (inout string: stri, in string: conv_name) is func
  result
    var boolean: okay is TRUE;
  begin
    if conv_name = "437" then
      conv2unicode(stri, cp_437);
    elsif conv_name = "708" then
      conv2unicode(stri, cp_708);
    elsif conv_name = "720" then
      conv2unicode(stri, cp_720);
    elsif conv_name = "737" then
      conv2unicode(stri, cp_737);
    elsif conv_name = "775" then
      conv2unicode(stri, cp_775);
    elsif conv_name = "850" then
      conv2unicode(stri, cp_850);
    elsif conv_name = "852" then
      conv2unicode(stri, cp_852);
    elsif conv_name = "855" then
      conv2unicode(stri, cp_855);
    elsif conv_name = "857" then
      conv2unicode(stri, cp_857);
    elsif conv_name = "858" then
      conv2unicode(stri, cp_858);
    elsif conv_name = "860" then
      conv2unicode(stri, cp_860);
    elsif conv_name = "861" then
      conv2unicode(stri, cp_861);
    elsif conv_name = "862" then
      conv2unicode(stri, cp_862);
    elsif conv_name = "863" then
      conv2unicode(stri, cp_863);
    elsif conv_name = "864" then
      conv2unicode(stri, cp_864);
    elsif conv_name = "865" then
      conv2unicode(stri, cp_865);
    elsif conv_name = "866" then
      conv2unicode(stri, cp_866);
    elsif conv_name = "869" then
      conv2unicode(stri, cp_869);
    elsif conv_name = "874" then
      conv2unicode(stri, cp_874);
    elsif conv_name = "1125" then
      conv2unicode(stri, cp_1125);
    elsif conv_name = "1250" then
      conv2unicode(stri, cp_1250);
    elsif conv_name = "1251" then
      conv2unicode(stri, cp_1251);
    elsif conv_name = "1252" then
      conv2unicode(stri, cp_1252);
    elsif conv_name = "1253" then
      conv2unicode(stri, cp_1253);
    elsif conv_name = "1254" then
      conv2unicode(stri, cp_1254);
    elsif conv_name = "1255" then
      conv2unicode(stri, cp_1255);
    elsif conv_name = "1256" then
      conv2unicode(stri, cp_1256);
    elsif conv_name = "1257" then
      conv2unicode(stri, cp_1257);
    elsif conv_name = "1258" then
      conv2unicode(stri, cp_1258);
    elsif conv_name = "8859-1" or
          conv_name = "latin-1" then
      # No table is applied for ISO 8859-1 / latin-1: the string is used as read.
      noop;
    elsif conv_name = "8859-2" or
          conv_name = "latin-2" then
      conv2unicode(stri, iso_8859_2);
    elsif conv_name = "8859-3" or
          conv_name = "latin-3" then
      conv2unicode(stri, iso_8859_3);
    elsif conv_name = "8859-4" or
          conv_name = "latin-4" then
      conv2unicode(stri, iso_8859_4);
    elsif conv_name = "8859-5" then
      conv2unicode(stri, iso_8859_5);
    elsif conv_name = "8859-6" then
      conv2unicode(stri, iso_8859_6);
    elsif conv_name = "8859-7" then
      conv2unicode(stri, iso_8859_7);
    elsif conv_name = "8859-8" then
      conv2unicode(stri, iso_8859_8);
    elsif conv_name = "8859-9" or
          conv_name = "latin-5" then
      conv2unicode(stri, iso_8859_9);
    elsif conv_name = "8859-10" or
          conv_name = "latin-6" then
      conv2unicode(stri, iso_8859_10);
    elsif conv_name = "8859-11" then
      conv2unicode(stri, iso_8859_11);
    elsif conv_name = "8859-13" or
          conv_name = "latin-7" then
      conv2unicode(stri, iso_8859_13);
    elsif conv_name = "8859-14" or
          conv_name = "latin-8" then
      conv2unicode(stri, iso_8859_14);
    elsif conv_name = "8859-15" or
          conv_name = "latin-9" then
      conv2unicode(stri, iso_8859_15);
    elsif conv_name = "8859-16" then
      conv2unicode(stri, iso_8859_16);
    elsif conv_name = "037" then
      conv2unicode(stri, cp_037);
    elsif conv_name = "273" then
      conv2unicode(stri, cp_273);
    elsif conv_name = "277" then
      conv2unicode(stri, cp_277);
    elsif conv_name = "280" then
      conv2unicode(stri, cp_280);
    elsif conv_name = "285" then
      conv2unicode(stri, cp_285);
    elsif conv_name = "297" then
      conv2unicode(stri, cp_297);
    elsif conv_name = "500" then
      conv2unicode(stri, cp_500);
    elsif conv_name = "1047" then
      conv2unicode(stri, cp_1047);
    elsif conv_name = "UTF-16BE" then
      # A leading byte order mark (bytes 254 255 = big endian,
      # 255 254 = little endian) is removed and takes precedence
      # over the byte order requested on the command line.
      if startsWith(stri, "\254;\255;") then
        stri := utf16beToStri(stri[3 ..]);
      elsif startsWith(stri, "\255;\254;") then
        stri := utf16leToStri(stri[3 ..]);
      else
        stri := utf16beToStri(stri);
      end if;
    elsif conv_name = "UTF-16LE" then
      # Same byte order mark handling as for UTF-16BE.
      if startsWith(stri, "\255;\254;") then
        stri := utf16leToStri(stri[3 ..]);
      elsif startsWith(stri, "\254;\255;") then
        stri := utf16beToStri(stri[3 ..]);
      else
        stri := utf16leToStri(stri);
      end if;
    else
      # Everything else is looked up by its charset name.
      block
        conv2unicodeByName(stri, conv_name);
      exception
        catch RANGE_ERROR:
          writeln(" *** Unsupported codepage: " <& conv_name);
          writeln("Use the option -? for a list of codepages.");
          okay := FALSE;
      end block;
    end if;
  end func;


(**
 *  Program entry: toutf8 -codepage infile [outfile]
 *  Reads ''infile'' completely, converts it from the given codepage
 *  to Unicode and writes the result as UTF-8 to ''outfile''. Without
 *  ''outfile'' the result is written to STD_CONSOLE. With fewer than
 *  two arguments, or with the option -?, the help text is written.
 *)
const proc: main is func
  local
    var string: conv_name is "";
    var string: in_name is "";
    var string: out_name is "";
    var file: in_file is STD_NULL;
    var file: out_file is STD_NULL;
    var string: stri is "";
  begin
    if length(argv(PROGRAM)) >= 1 then
      conv_name := argv(PROGRAM)[1];
      # The codepage is given as an option: strip the leading "-".
      if startsWith(conv_name, "-") then
        conv_name := conv_name[2 ..];
      end if;
    end if;
    if length(argv(PROGRAM)) < 2 or conv_name = "?" then
      writeHelp;
    else
      in_name := convDosPath(argv(PROGRAM)[2]);
      if length(argv(PROGRAM)) >= 3 then
        # Normalize the output path the same way as the input path,
        # so DOS style paths work for both.
        out_name := convDosPath(argv(PROGRAM)[3]);
      end if;
      in_file := open(in_name, "r");
      if in_file = STD_NULL then
        writeln(" *** Cannot open input file: " <& in_name);
      else
        stri := gets(in_file, length(in_file));
        close(in_file);
        # Success is tracked by the function result and not by an
        # empty string, so an empty input file still gets converted.
        if decodeToUnicode(stri, conv_name) then
          if out_name <> "" then
            out_file := openUtf8(out_name, "w");
          else
            out_file := STD_CONSOLE;
          end if;
          if out_file <> STD_NULL then
            write(out_file, stri);
            close(out_file);
          else
            writeln(" *** Cannot open output file: " <& out_name);
          end if;
        end if;
      end if;
    end if;
  end func;
237