
(********************************************************************)
(*                                                                  *)
(*  toutf8.sd7    Convert a file to UTF-8                           *)
(*  Copyright (C) 2006, 2010, 2015  Thomas Mertes                   *)
(*                                                                  *)
(*  This program is free software; you can redistribute it and/or  *)
(*  modify it under the terms of the GNU General Public License as  *)
(*  published by the Free Software Foundation; either version 2 of  *)
(*  the License, or (at your option) any later version.             *)
(*                                                                  *)
(*  This program is distributed in the hope that it will be useful, *)
(*  but WITHOUT ANY WARRANTY; without even the implied warranty of  *)
(*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   *)
(*  GNU General Public License for more details.                    *)
(*                                                                  *)
(*  You should have received a copy of the GNU General Public       *)
(*  License along with this program; if not, write to the           *)
(*  Free Software Foundation, Inc., 51 Franklin Street,             *)
(*  Fifth Floor, Boston, MA  02110-1301, USA.                       *)
(*                                                                  *)
(********************************************************************)


$ include "seed7_05.s7i";
  include "stdio.s7i";
  include "osfiles.s7i";
  include "charsets.s7i";
  include "utf8.s7i";
  include "console.s7i";


(**
 *  Write version, license and usage information to the standard output.
 *  The text lists the codepage names that can be given on the command line.
 *)
const proc: writeHelp is func
  begin
    writeln("Toutf8 Version 1.0 - Convert a file to UTF-8");
    writeln("Copyright (C) 2006, 2010, 2015 Thomas Mertes");
    writeln("This is free software; see the source for copying conditions. There is NO");
    writeln("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.");
    writeln("Toutf8 is written in the Seed7 programming language");
    writeln("Homepage: http://seed7.sourceforge.net");
    writeln;
    writeln("usage: toutf8 -codepage infile [outfile]");
    writeln;
    writeln("Converts a file encoded with a codepage to UTF-8.");
    writeln("The following codepages are supported:");
    writeln(" 437, 708, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863,");
    writeln(" 864, 865, 866, 869, 874, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256,");
    writeln(" 1257, 1258, 8859-1, 8859-2, 8859-3, 8859-4, 8859-5, 8859-6, 8859-7,");
    writeln(" 8859-8, 8859-9, 8859-10, 8859-11, 8859-13, 8859-14, 8859-15, 8859-16,");
    writeln(" latin-1, latin-2, latin-3, latin-4, latin-5, latin-6, latin-7, latin-8,");
    writeln(" latin-9, 037, 273, 277, 280, 285, 297, 500, 1047");
    writeln("The following IANA/MIME charset names are also accepted:");
    writeln(" ANSI_X3.4-1968, ARMSCII-8, ASCII, CP437, CP850, GEOSTD8, IBM437, IBM850,");
    writeln(" ISO_8859-1, ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5,");
    writeln(" ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, ISO-8859-11,");
    writeln(" ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16, KOI8-R, KOI8-U,");
    writeln(" MACINTOSH, NS_4551-1, TIS-620, US-ASCII, UTF-16BE, UTF-16LE, UTF-7, UTF-8,");
    writeln(" UTF8, VISCII, WINDOWS-1250, WINDOWS-1251, WINDOWS-1252, WINDOWS-1253,");
    writeln(" WINDOWS-1254, WINDOWS-1255, WINDOWS-1256, WINDOWS-1257, WINDOWS-1258");
  end func;


(**
 *  Convert the raw file content in 'stri' to Unicode, in place.
 *  The conversion is selected by 'conv_name' (a codepage number or a
 *  charset name, without a leading "-"). Names that are not handled
 *  explicitly are passed on to conv2unicodeByName.
 *  @param stri The bytes read from the input file; replaced by the
 *         converted text when the conversion succeeds.
 *  @param conv_name Name of the source encoding.
 *  @return TRUE if 'stri' has been converted, and
 *          FALSE if conv2unicodeByName rejected 'conv_name' with
 *          RANGE_ERROR (an error message has been written in that case).
 *)
const func boolean: decodeToUnicode (inout string: stri, in string: conv_name) is func
  result
    var boolean: okay is TRUE;
  begin
    if conv_name = "437" then
      conv2unicode(stri, cp_437);
    elsif conv_name = "708" then
      conv2unicode(stri, cp_708);
    elsif conv_name = "720" then
      conv2unicode(stri, cp_720);
    elsif conv_name = "737" then
      conv2unicode(stri, cp_737);
    elsif conv_name = "775" then
      conv2unicode(stri, cp_775);
    elsif conv_name = "850" then
      conv2unicode(stri, cp_850);
    elsif conv_name = "852" then
      conv2unicode(stri, cp_852);
    elsif conv_name = "855" then
      conv2unicode(stri, cp_855);
    elsif conv_name = "857" then
      conv2unicode(stri, cp_857);
    elsif conv_name = "858" then
      conv2unicode(stri, cp_858);
    elsif conv_name = "860" then
      conv2unicode(stri, cp_860);
    elsif conv_name = "861" then
      conv2unicode(stri, cp_861);
    elsif conv_name = "862" then
      conv2unicode(stri, cp_862);
    elsif conv_name = "863" then
      conv2unicode(stri, cp_863);
    elsif conv_name = "864" then
      conv2unicode(stri, cp_864);
    elsif conv_name = "865" then
      conv2unicode(stri, cp_865);
    elsif conv_name = "866" then
      conv2unicode(stri, cp_866);
    elsif conv_name = "869" then
      conv2unicode(stri, cp_869);
    elsif conv_name = "874" then
      conv2unicode(stri, cp_874);
    elsif conv_name = "1125" then
      conv2unicode(stri, cp_1125);
    elsif conv_name = "1250" then
      conv2unicode(stri, cp_1250);
    elsif conv_name = "1251" then
      conv2unicode(stri, cp_1251);
    elsif conv_name = "1252" then
      conv2unicode(stri, cp_1252);
    elsif conv_name = "1253" then
      conv2unicode(stri, cp_1253);
    elsif conv_name = "1254" then
      conv2unicode(stri, cp_1254);
    elsif conv_name = "1255" then
      conv2unicode(stri, cp_1255);
    elsif conv_name = "1256" then
      conv2unicode(stri, cp_1256);
    elsif conv_name = "1257" then
      conv2unicode(stri, cp_1257);
    elsif conv_name = "1258" then
      conv2unicode(stri, cp_1258);
    elsif conv_name = "8859-1" or
          conv_name = "latin-1" then
      # No table is applied for ISO-8859-1: the string is used as it was read.
      noop;
    elsif conv_name = "8859-2" or
          conv_name = "latin-2" then
      conv2unicode(stri, iso_8859_2);
    elsif conv_name = "8859-3" or
          conv_name = "latin-3" then
      conv2unicode(stri, iso_8859_3);
    elsif conv_name = "8859-4" or
          conv_name = "latin-4" then
      conv2unicode(stri, iso_8859_4);
    elsif conv_name = "8859-5" then
      conv2unicode(stri, iso_8859_5);
    elsif conv_name = "8859-6" then
      conv2unicode(stri, iso_8859_6);
    elsif conv_name = "8859-7" then
      conv2unicode(stri, iso_8859_7);
    elsif conv_name = "8859-8" then
      conv2unicode(stri, iso_8859_8);
    elsif conv_name = "8859-9" or
          conv_name = "latin-5" then
      conv2unicode(stri, iso_8859_9);
    elsif conv_name = "8859-10" or
          conv_name = "latin-6" then
      conv2unicode(stri, iso_8859_10);
    elsif conv_name = "8859-11" then
      conv2unicode(stri, iso_8859_11);
    elsif conv_name = "8859-13" or
          conv_name = "latin-7" then
      conv2unicode(stri, iso_8859_13);
    elsif conv_name = "8859-14" or
          conv_name = "latin-8" then
      conv2unicode(stri, iso_8859_14);
    elsif conv_name = "8859-15" or
          conv_name = "latin-9" then
      conv2unicode(stri, iso_8859_15);
    elsif conv_name = "8859-16" then
      conv2unicode(stri, iso_8859_16);
    elsif conv_name = "037" then
      conv2unicode(stri, cp_037);
    elsif conv_name = "273" then
      conv2unicode(stri, cp_273);
    elsif conv_name = "277" then
      conv2unicode(stri, cp_277);
    elsif conv_name = "280" then
      conv2unicode(stri, cp_280);
    elsif conv_name = "285" then
      conv2unicode(stri, cp_285);
    elsif conv_name = "297" then
      conv2unicode(stri, cp_297);
    elsif conv_name = "500" then
      conv2unicode(stri, cp_500);
    elsif conv_name = "1047" then
      conv2unicode(stri, cp_1047);
    elsif conv_name = "UTF-16BE" then
      # A leading two-byte mark (254,255 or 255,254) is stripped and
      # decides the byte order, even if it contradicts the requested one.
      if startsWith(stri, "\254;\255;") then
        stri := utf16beToStri(stri[3 ..]);
      elsif startsWith(stri, "\255;\254;") then
        stri := utf16leToStri(stri[3 ..]);
      else
        stri := utf16beToStri(stri);
      end if;
    elsif conv_name = "UTF-16LE" then
      # Same mark handling as for UTF-16BE, with little endian as default.
      if startsWith(stri, "\255;\254;") then
        stri := utf16leToStri(stri[3 ..]);
      elsif startsWith(stri, "\254;\255;") then
        stri := utf16beToStri(stri[3 ..]);
      else
        stri := utf16leToStri(stri);
      end if;
    else
      block
        conv2unicodeByName(stri, conv_name);
      exception
        catch RANGE_ERROR:
          writeln(" *** Unsupported codepage: " <& conv_name);
          writeln("Use the option -? for a list of codepages.");
          # Report the failure explicitly instead of encoding it as an
          # empty string, so that empty input files stay convertible.
          okay := FALSE;
      end block;
    end if;
  end func;


(**
 *  Program entry point: toutf8 -codepage infile [outfile]
 *  Reads the whole input file, converts it from the given codepage to
 *  Unicode and writes it as UTF-8 to the output file. Without an output
 *  file the result is written to STD_CONSOLE. With fewer than two
 *  arguments, or with "?" / "-?" as codepage, the help text is written.
 *)
const proc: main is func
  local
    var string: conv_name is "";
    var string: in_name is "";
    var string: out_name is "";
    var file: in_file is STD_NULL;
    var file: out_file is STD_NULL;
    var string: stri is "";
  begin
    if length(argv(PROGRAM)) >= 1 then
      conv_name := argv(PROGRAM)[1];
      # The codepage may be given with or without a leading "-".
      if startsWith(conv_name, "-") then
        conv_name := conv_name[2 ..];
      end if;
    end if;
    if length(argv(PROGRAM)) < 2 or conv_name = "?" then
      writeHelp;
    else
      # Both paths are normalized with convDosPath so that input and
      # output file names are treated the same way.
      in_name := convDosPath(argv(PROGRAM)[2]);
      if length(argv(PROGRAM)) >= 3 then
        out_name := convDosPath(argv(PROGRAM)[3]);
      end if;
      in_file := open(in_name, "r");
      if in_file = STD_NULL then
        writeln(" *** Cannot open input file: " <& in_name);
      else
        stri := gets(in_file, length(in_file));
        close(in_file);
        if decodeToUnicode(stri, conv_name) then
          if out_name <> "" then
            out_file := openUtf8(out_name, "w");
          else
            out_file := STD_CONSOLE;
          end if;
          if out_file = STD_NULL then
            # Only reachable when openUtf8 failed for the named file.
            writeln(" *** Cannot open output file: " <& out_name);
          else
            write(out_file, stri);
            close(out_file);
          end if;
        end if;
      end if;
    end if;
  end func;