1 /*
2 Copyright (c) 2001, 2013, Oracle and/or its affiliates.
3 Copyright (c) 2010, 2019, MariaDB
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; version 2 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 /*
20 Character set conversion utility
21 */
22
23 #include "mariadb.h"
24 #include "client_priv.h"
25 #include "sql_string.h"
26 #include "my_dir.h"
27
28 #define CONV_VERSION "1.0"
29
30
31 class CmdOpt
32 {
33 public:
34 const char *m_charset_from;
35 const char *m_charset_to;
36 const char *m_delimiter;
37 my_bool m_continue;
CmdOpt()38 CmdOpt()
39 :m_charset_from("latin1"),
40 m_charset_to("latin1"),
41 m_delimiter(NULL),
42 m_continue(FALSE)
43 { }
csinfo_by_name(const char * csname)44 static CHARSET_INFO *csinfo_by_name(const char *csname)
45 {
46 return get_charset_by_csname(csname, MY_CS_PRIMARY, MYF(0));
47 }
csinfo_from() const48 CHARSET_INFO *csinfo_from() const
49 {
50 return m_charset_from ? csinfo_by_name(m_charset_from) : NULL;
51 }
csinfo_to() const52 CHARSET_INFO *csinfo_to() const
53 {
54 return m_charset_to ? csinfo_by_name(m_charset_to) : NULL;
55 }
56 };
57
58
59 static CmdOpt opt;
60
61
62 static struct my_option long_options[] =
63 {
64 {"from", 'f', "Specifies the encoding of the input.", &opt.m_charset_from,
65 &opt.m_charset_from, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
66 {"to", 't', "Specifies the encoding of the output.", &opt.m_charset_to,
67 &opt.m_charset_to, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
68 {"continue", 'c', "Silently ignore conversion errors.",
69 &opt.m_continue, &opt.m_continue, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
70 {"delimiter", 0, "Treat the specified characters as delimiters.",
71 &opt.m_delimiter, &opt.m_delimiter, 0, GET_STR, REQUIRED_ARG,
72 0, 0, 0, 0, 0, 0},
73 {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
74 };
75
76
77 my_bool
get_one_option(const struct my_option * opt,const char * value,const char * filename)78 get_one_option(const struct my_option *opt,
79 const char *value, const char *filename)
80 {
81 return 0;
82 }
83
84
85 class File_buffer: public Binary_string
86 {
87 public:
88 bool load_binary_stream(FILE *file);
89 bool load_binary_file_by_name(const char *file);
90 };
91
92
93 /*
94 Load data from a binary stream whose length is not known in advance,
95 e.g. from stdin.
96 */
load_binary_stream(FILE * file)97 bool File_buffer::load_binary_stream(FILE *file)
98 {
99 for ( ; ; )
100 {
101 char buf[1024];
102 if (length() + sizeof(buf) > UINT_MAX32 || reserve(sizeof(buf)))
103 {
104 fprintf(stderr, "Input data is too large\n");
105 return true;
106 }
107 size_t nbytes= my_fread(file, (uchar *) end(), sizeof(buf), MYF(0));
108 if (!nbytes || nbytes == (size_t) -1)
109 return false;
110 str_length+= (uint32) nbytes;
111 }
112 return false;
113 }
114
115
116 /*
117 Load data from a file by name.
118 The file size is know.
119 */
load_binary_file_by_name(const char * filename)120 bool File_buffer::load_binary_file_by_name(const char *filename)
121 {
122 MY_STAT sbuf;
123 File fd;
124
125 if (!my_stat(filename, &sbuf, MYF(0)))
126 {
127 fprintf(stderr, "my_stat failed for '%s'\n", filename);
128 return true;
129 }
130
131 if (!MY_S_ISREG(sbuf.st_mode))
132 {
133 fprintf(stderr, "'%s' is not a regular file\n", filename);
134 return true;
135 }
136
137 if ((size_t) sbuf.st_size > UINT_MAX32)
138 {
139 fprintf(stderr, "File '%s' is too large\n", filename);
140 return true;
141 }
142
143 if (alloc((uint32) sbuf.st_size))
144 {
145 fprintf(stderr, "Failed to allocate read buffer\n");
146 return true;
147 }
148
149 if ((fd= my_open(filename, O_RDONLY, MYF(0))) == -1)
150 {
151 fprintf(stderr, "Could not open '%s'\n", filename);
152 return true;
153 }
154
155 size_t nbytes= my_read(fd, (uchar*) Ptr, (size_t)sbuf.st_size, MYF(0));
156 my_close(fd, MYF(0));
157 length((uint32) nbytes);
158
159 return false;
160 }
161
162
/*
  A set of delimiter characters.
  Only 7-bit characters (codes 0x00..0x7F) can be delimiters.
  The class also knows how to measure a run of delimiter characters
  and a run of non-delimiter (data) characters in a byte range.
*/
class Delimiter
{
protected:
  /*
    One flag per 7-bit character code.
    128 entries are needed to cover 0x00..0x7F inclusive: with a
    smaller table the DEL character (0x7F) would be accepted as
    "non-negative" and then index past the end of the array.
  */
  bool m_delimiter[128];
  /* True if at least one flag in m_delimiter is set */
  bool m_has_delimiter_cached;

  /* Scan the entire table for a set flag */
  bool has_delimiter_slow() const
  {
    for (size_t i= 0; i < sizeof(m_delimiter) / sizeof(m_delimiter[0]); i++)
    {
      if (m_delimiter[i])
        return true;
    }
    return false;
  }

  /*
    Translate the character following a backslash.
    @param [OUT] to    The resulting character ('\0' on error)
    @param       from  The character found after the backslash
    @retval false      A known escape: \\ \r \n \t \0
    @retval true       Unknown escape
  */
  bool unescape(char *to, char from) const
  {
    switch (from) {
    case '\\': *to= '\\'; return false;
    case 'r':  *to= '\r'; return false;
    case 'n':  *to= '\n'; return false;
    case 't':  *to= '\t'; return false;
    case '0':  *to= '\0'; return false;
    }
    *to= '\0';
    return true;
  }

  /* Bytes with codes outside of the table are never delimiters */
  bool is_delimiter(char ch) const
  {
    const unsigned char code= (unsigned char) ch;
    return code < sizeof(m_delimiter) / sizeof(m_delimiter[0]) &&
           m_delimiter[code];
  }

public:
  /* Start with an empty delimiter set */
  Delimiter()
   :m_has_delimiter_cached(false)
  {
    for (size_t i= 0; i < sizeof(m_delimiter) / sizeof(m_delimiter[0]); i++)
      m_delimiter[i]= false;
  }

  /* True if at least one delimiter character was successfully set */
  bool has_delimiter() const
  {
    return m_has_delimiter_cached;
  }

  /*
    Add the characters listed in "str" to the delimiter set.
    A backslash introduces one of the escapes handled by unescape().

    @param str     0-terminated list of delimiter characters
    @retval false  Success
    @retval true   "str" has a byte with the high bit set, an unknown
                   escape or a trailing backslash. Characters met before
                   the bad one stay in the table, but has_delimiter()
                   returns false.
  */
  bool set_delimiter_unescape(const char *str)
  {
    m_has_delimiter_cached= false;
    for ( ; *str; str++)
    {
      if ((signed char) *str < 0)
        return true;
      if (*str == '\\')
      {
        char unescaped;
        str++;
        if (!*str || unescape(&unescaped, *str))
          return true;
        m_delimiter[(unsigned char) unescaped]= true;
      }
      else
        m_delimiter[(unsigned char) *str]= true;
    }
    m_has_delimiter_cached= has_delimiter_slow();
    return false;
  }

  /* Length of the leading run of delimiter characters in [str, end) */
  size_t get_delimiter_length(const char *str, const char *end) const
  {
    const char *str0= str;
    for ( ; str < end; str++)
    {
      if (!is_delimiter(*str))
        break;
    }
    return str - str0;
  }

  /* Length of the leading run of non-delimiter characters in [str, end) */
  size_t get_data_length(const char *str, const char *end) const
  {
    const char *str0= str;
    for ( ; str < end; str++)
    {
      if (is_delimiter(*str))
        break;
    }
    return str - str0;
  }
};
245
246
247 class Conv_inbuf
248 {
249 const char *m_ptr;
250 const char *m_end;
251 public:
Conv_inbuf(const char * from,size_t length)252 Conv_inbuf(const char *from, size_t length)
253 :m_ptr(from), m_end(from + length)
254 { }
ptr() const255 const char *ptr() const { return m_ptr; }
end() const256 const char *end() const { return m_end; }
length() const257 size_t length() const
258 {
259 return m_end - m_ptr;
260 }
261 private:
get_prefix(size_t len)262 LEX_CSTRING get_prefix(size_t len)
263 {
264 LEX_CSTRING res;
265 res.str= ptr();
266 res.length= len;
267 m_ptr+= len;
268 return res;
269 }
get_empty_string() const270 LEX_CSTRING get_empty_string() const
271 {
272 static LEX_CSTRING str= {NULL, 0};
273 return str;
274 }
275 public:
get_delimiter_chunk(const Delimiter & delimiter)276 LEX_CSTRING get_delimiter_chunk(const Delimiter &delimiter)
277 {
278 if (!delimiter.has_delimiter())
279 return get_empty_string();
280 size_t len= delimiter.get_delimiter_length(ptr(), end());
281 return get_prefix(len);
282 }
get_data_chunk(const Delimiter & delimiter)283 LEX_CSTRING get_data_chunk(const Delimiter &delimiter)
284 {
285 if (!delimiter.has_delimiter())
286 return get_prefix(length());
287 size_t len= delimiter.get_data_length(ptr(), end());
288 return get_prefix(len);
289 }
290 };
291
292
293 class Conv_outbuf: public Binary_string
294 {
295 public:
alloc(size_t out_max_length)296 bool alloc(size_t out_max_length)
297 {
298 if (out_max_length >= UINT_MAX32)
299 {
300 fprintf(stderr, "The data needs a too large output buffer\n");
301 return true;
302 }
303 if (Binary_string::alloc((uint32) out_max_length))
304 {
305 fprintf(stderr, "Failed to allocate the output buffer\n");
306 return true;
307 }
308 return false;
309 }
310 };
311
312
313 class Conv: public String_copier, public Delimiter
314 {
315 CHARSET_INFO *m_tocs;
316 CHARSET_INFO *m_fromcs;
317 bool m_continue;
318 public:
Conv(CHARSET_INFO * tocs,CHARSET_INFO * fromcs,bool opt_continue)319 Conv(CHARSET_INFO *tocs, CHARSET_INFO *fromcs, bool opt_continue)
320 :m_tocs(tocs), m_fromcs(fromcs), m_continue(opt_continue)
321 { }
out_buffer_max_length(size_t from_length) const322 size_t out_buffer_max_length(size_t from_length) const
323 {
324 return from_length / m_fromcs->mbminlen * m_tocs->mbmaxlen;
325 }
326 bool convert_data(const char *from, size_t length);
convert_binary_stream(FILE * file)327 bool convert_binary_stream(FILE *file)
328 {
329 File_buffer buf;
330 return buf.load_binary_stream(file) ||
331 convert_data(buf.ptr(), buf.length());
332 }
convert_binary_file_by_name(const char * filename)333 bool convert_binary_file_by_name(const char *filename)
334 {
335 File_buffer buf;
336 return buf.load_binary_file_by_name(filename)||
337 convert_data(buf.ptr(), buf.length());
338 }
339 private:
report_error(const char * from) const340 void report_error(const char *from) const
341 {
342 if (well_formed_error_pos())
343 {
344 fflush(stdout);
345 fprintf(stderr,
346 "Illegal %s byte sequence at position %d\n",
347 m_fromcs->csname,
348 (uint) (well_formed_error_pos() - from));
349 }
350 else if (cannot_convert_error_pos())
351 {
352 fflush(stdout);
353 fprintf(stderr,
354 "Conversion from %s to %s failed at position %d\n",
355 m_fromcs->csname, m_tocs->csname,
356 (uint) (cannot_convert_error_pos() - from));
357 }
358 }
write(const char * str,size_t length) const359 size_t write(const char *str, size_t length) const
360 {
361 return my_fwrite(stdout, (uchar *) str, length, MY_WME);
362 }
363 };
364
365
convert_data(const char * from,size_t from_length)366 bool Conv::convert_data(const char *from, size_t from_length)
367 {
368 Conv_inbuf inbuf(from, from_length);
369 Conv_outbuf outbuf;
370
371 if (outbuf.alloc(out_buffer_max_length(from_length)))
372 return true;
373
374 for ( ; ; )
375 {
376 LEX_CSTRING delim, data;
377
378 delim= inbuf.get_delimiter_chunk(*this);
379 if (delim.length)
380 write(delim.str, delim.length);
381
382 data= inbuf.get_data_chunk(*this);
383 if (!data.length)
384 break;
385 size_t length= well_formed_copy(m_tocs,
386 (char *) outbuf.ptr(),
387 outbuf.alloced_length(),
388 m_fromcs, data.str, data.length);
389 outbuf.length((uint32) length);
390
391 if (most_important_error_pos() && !m_continue)
392 {
393 report_error(from);
394 return true;
395 }
396 write(outbuf.ptr(), outbuf.length());
397 }
398 return false;
399 }
400
401
402 class Session
403 {
404 public:
Session(const char * prog)405 Session(const char *prog)
406 {
407 MY_INIT(prog);
408 }
~Session()409 ~Session()
410 {
411 my_end(0);
412 }
usage(void)413 void usage(void)
414 {
415 printf("%s Ver %s Distrib %s for %s on %s\n", my_progname, CONV_VERSION,
416 MYSQL_SERVER_VERSION, SYSTEM_TYPE, MACHINE_TYPE);
417 puts("Character set conversion utility for MariaDB");
418 puts("Usage:");
419 printf("%s [OPTION...] [FILE...]\n", my_progname);
420 my_print_help(long_options);
421 }
422 };
423
424
main(int argc,char * argv[])425 int main(int argc, char *argv[])
426 {
427 Session session(argv[0]);
428 CHARSET_INFO *charset_info_from= NULL;
429 CHARSET_INFO *charset_info_to= NULL;
430
431 if (handle_options(&argc, &argv, long_options, get_one_option))
432 {
433 session.usage();
434 return 1;
435 }
436
437 if (!(charset_info_from= opt.csinfo_from()))
438 {
439 fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_from);
440 return 1;
441 }
442
443 if (!(charset_info_to= opt.csinfo_to()))
444 {
445 fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_to);
446 return 1;
447 }
448
449 Conv conv(charset_info_to, charset_info_from, opt.m_continue);
450 if (opt.m_delimiter)
451 {
452 if (charset_info_from->mbminlen > 1 ||
453 charset_info_to->mbminlen > 1)
454 {
455 fprintf(stderr, "--delimiter cannot be used with %s to %s conversion\n",
456 charset_info_from->csname, charset_info_to->csname);
457 return 1;
458 }
459 if (conv.set_delimiter_unescape(opt.m_delimiter))
460 {
461 fprintf(stderr, "Bad --delimiter value\n");
462 return 1;
463 }
464 }
465
466 if (argc == 0)
467 {
468 if (conv.convert_binary_stream(stdin))
469 return 1;
470 }
471 else
472 {
473 for (int i= 0; i < argc; i++)
474 {
475 if (conv.convert_binary_file_by_name(argv[i]))
476 return 1;
477 }
478 }
479
480 return 0;
481 } /* main */
482