1 /*
2    Copyright (c) 2001, 2013, Oracle and/or its affiliates.
3    Copyright (c) 2010, 2019, MariaDB
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; version 2 of the License.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
17 */
18 
19 /*
20    Character set conversion utility
21 */
22 
23 #include "mariadb.h"
24 #include "client_priv.h"
25 #include "sql_string.h"
26 #include "my_dir.h"
27 
28 #define CONV_VERSION "1.0"
29 
30 
31 class CmdOpt
32 {
33 public:
34   const char *m_charset_from;
35   const char *m_charset_to;
36   const char *m_delimiter;
37   my_bool m_continue;
CmdOpt()38   CmdOpt()
39    :m_charset_from("latin1"),
40     m_charset_to("latin1"),
41     m_delimiter(NULL),
42     m_continue(FALSE)
43   { }
csinfo_by_name(const char * csname)44   static CHARSET_INFO *csinfo_by_name(const char *csname)
45   {
46     return get_charset_by_csname(csname, MY_CS_PRIMARY, MYF(0));
47   }
csinfo_from() const48   CHARSET_INFO *csinfo_from() const
49   {
50     return m_charset_from ? csinfo_by_name(m_charset_from) : NULL;
51   }
csinfo_to() const52   CHARSET_INFO *csinfo_to() const
53   {
54     return m_charset_to ? csinfo_by_name(m_charset_to) : NULL;
55   }
56 };
57 
58 
59 static CmdOpt opt;
60 
61 
62 static struct my_option long_options[] =
63 {
64   {"from", 'f', "Specifies the encoding of the input.", &opt.m_charset_from,
65    &opt.m_charset_from, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
66   {"to", 't', "Specifies the encoding of the output.", &opt.m_charset_to,
67    &opt.m_charset_to, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
68   {"continue", 'c', "Silently ignore conversion errors.",
69    &opt.m_continue, &opt.m_continue, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
70   {"delimiter", 0, "Treat the specified characters as delimiters.",
71     &opt.m_delimiter, &opt.m_delimiter, 0, GET_STR, REQUIRED_ARG,
72     0, 0, 0, 0, 0, 0},
73   {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
74 };
75 
76 
77 my_bool
get_one_option(const struct my_option * opt,const char * value,const char * filename)78 get_one_option(const struct my_option *opt,
79                const char *value, const char *filename)
80 {
81   return 0;
82 }
83 
84 
85 class File_buffer: public Binary_string
86 {
87 public:
88   bool load_binary_stream(FILE *file);
89   bool load_binary_file_by_name(const char *file);
90 };
91 
92 
93 /*
94   Load data from a binary stream whose length is not known in advance,
95   e.g. from stdin.
96 */
load_binary_stream(FILE * file)97 bool File_buffer::load_binary_stream(FILE *file)
98 {
99   for ( ; ; )
100   {
101     char buf[1024];
102     if (length() + sizeof(buf) > UINT_MAX32 || reserve(sizeof(buf)))
103     {
104       fprintf(stderr, "Input data is too large\n");
105       return true;
106     }
107     size_t nbytes= my_fread(file, (uchar *) end(), sizeof(buf), MYF(0));
108     if (!nbytes || nbytes == (size_t) -1)
109       return false;
110     str_length+= (uint32) nbytes;
111   }
112   return false;
113 }
114 
115 
116 /*
117   Load data from a file by name.
118   The file size is know.
119 */
load_binary_file_by_name(const char * filename)120 bool File_buffer::load_binary_file_by_name(const char *filename)
121 {
122   MY_STAT sbuf;
123   File fd;
124 
125   if (!my_stat(filename, &sbuf, MYF(0)))
126   {
127     fprintf(stderr, "my_stat failed for '%s'\n", filename);
128     return true;
129   }
130 
131   if (!MY_S_ISREG(sbuf.st_mode))
132   {
133     fprintf(stderr, "'%s' is not a regular file\n", filename);
134     return true;
135   }
136 
137   if ((size_t) sbuf.st_size > UINT_MAX32)
138   {
139     fprintf(stderr, "File '%s' is too large\n", filename);
140     return true;
141   }
142 
143   if (alloc((uint32) sbuf.st_size))
144   {
145     fprintf(stderr, "Failed to allocate read buffer\n");
146     return true;
147   }
148 
149   if ((fd= my_open(filename, O_RDONLY, MYF(0))) == -1)
150   {
151     fprintf(stderr, "Could not open '%s'\n", filename);
152     return true;
153   }
154 
155   size_t nbytes= my_read(fd, (uchar*) Ptr, (size_t)sbuf.st_size, MYF(0));
156   my_close(fd, MYF(0));
157   length((uint32) nbytes);
158 
159   return false;
160 }
161 
162 
/*
  A set of single-byte delimiter characters.

  Only 7-bit bytes (0x00..0x7F) can be delimiters. The set is stored as
  a lookup table indexed by the byte value, so the table needs one
  element for every value in that range, 0x7F included.
*/
class Delimiter
{
protected:
  enum { DELIMITER_TABLE_SIZE= 128 };
  bool m_delimiter[DELIMITER_TABLE_SIZE];
  bool m_has_delimiter_cached;

  /* Scan the whole table to find out if at least one delimiter is set */
  bool has_delimiter_slow() const
  {
    const size_t count= sizeof(m_delimiter) / sizeof(m_delimiter[0]);
    for (size_t i= 0; i < count; i++)
    {
      if (m_delimiter[i])
        return true;
    }
    return false;
  }

  /*
    Translate the character that follows a backslash.

    @param [OUT] to    The unescaped character, or '\0' on error
    @param       from  The character after the backslash
    @return            false on success, true for an unknown escape
  */
  bool unescape(char *to, char from) const
  {
    switch (from) {
    case '\\': *to= '\\'; return false;
    case 'r':  *to= '\r'; return false;
    case 'n':  *to= '\n'; return false;
    case 't':  *to= '\t'; return false;
    case '0':  *to= '\0'; return false;
    default:   break;
    }
    *to= '\0';
    return true;
  }

  /* Bytes outside of the 7-bit range are never delimiters */
  bool is_delimiter(char ch) const
  {
    const unsigned char uch= (unsigned char) ch;
    return uch < DELIMITER_TABLE_SIZE && m_delimiter[uch];
  }

public:
  Delimiter()
   :m_has_delimiter_cached(false)
  {
    bzero(m_delimiter, sizeof(m_delimiter));
  }

  /* True if the last successful set_delimiter_unescape() set a delimiter */
  bool has_delimiter() const
  {
    return m_has_delimiter_cached;
  }

  /*
    Add the characters of a string to the delimiter set.
    The escapes \\, \r, \n, \t and \0 are recognized.

    @param str  A 0-terminated list of delimiter characters
    @return     false on success,
                true on error: a byte outside of the 7-bit range,
                an unknown escape, or a backslash at the end of the string.
                Delimiters found before the error stay in the table,
                but has_delimiter() returns false.
  */
  bool set_delimiter_unescape(const char *str)
  {
    m_has_delimiter_cached= false;
    for ( ; *str; str++)
    {
      if ((unsigned char) *str >= DELIMITER_TABLE_SIZE)
        return true;
      char ch= *str;
      if (ch == '\\')
      {
        str++;
        if (!*str || unescape(&ch, *str))
          return true;
      }
      m_delimiter[(unsigned char) ch]= true;
    }
    m_has_delimiter_cached= has_delimiter_slow();
    return false;
  }

  /* Length of the leading run of delimiter bytes in [str, end) */
  size_t get_delimiter_length(const char *str, const char *end) const
  {
    const char *pos= str;
    while (pos < end && is_delimiter(*pos))
      pos++;
    return pos - str;
  }

  /* Length of the leading run of non-delimiter bytes in [str, end) */
  size_t get_data_length(const char *str, const char *end) const
  {
    const char *pos= str;
    while (pos < end && !is_delimiter(*pos))
      pos++;
    return pos - str;
  }
};
245 
246 
247 class Conv_inbuf
248 {
249   const char *m_ptr;
250   const char *m_end;
251 public:
Conv_inbuf(const char * from,size_t length)252   Conv_inbuf(const char *from, size_t length)
253    :m_ptr(from), m_end(from + length)
254   { }
ptr() const255   const char *ptr() const { return m_ptr; }
end() const256   const char *end() const { return m_end; }
length() const257   size_t length() const
258   {
259     return m_end - m_ptr;
260   }
261 private:
get_prefix(size_t len)262   LEX_CSTRING get_prefix(size_t len)
263   {
264     LEX_CSTRING res;
265     res.str= ptr();
266     res.length= len;
267     m_ptr+= len;
268     return res;
269   }
get_empty_string() const270   LEX_CSTRING get_empty_string() const
271   {
272     static LEX_CSTRING str= {NULL, 0};
273     return str;
274   }
275 public:
get_delimiter_chunk(const Delimiter & delimiter)276   LEX_CSTRING get_delimiter_chunk(const Delimiter &delimiter)
277   {
278     if (!delimiter.has_delimiter())
279       return get_empty_string();
280     size_t len= delimiter.get_delimiter_length(ptr(), end());
281     return get_prefix(len);
282   }
get_data_chunk(const Delimiter & delimiter)283   LEX_CSTRING get_data_chunk(const Delimiter &delimiter)
284   {
285     if (!delimiter.has_delimiter())
286       return get_prefix(length());
287     size_t len= delimiter.get_data_length(ptr(), end());
288     return get_prefix(len);
289   }
290 };
291 
292 
293 class Conv_outbuf: public Binary_string
294 {
295 public:
alloc(size_t out_max_length)296   bool alloc(size_t out_max_length)
297   {
298     if (out_max_length >= UINT_MAX32)
299     {
300       fprintf(stderr, "The data needs a too large output buffer\n");
301       return true;
302     }
303     if (Binary_string::alloc((uint32) out_max_length))
304     {
305       fprintf(stderr, "Failed to allocate the output buffer\n");
306       return true;
307     }
308     return false;
309   }
310 };
311 
312 
313 class Conv: public String_copier, public Delimiter
314 {
315   CHARSET_INFO *m_tocs;
316   CHARSET_INFO *m_fromcs;
317   bool m_continue;
318 public:
Conv(CHARSET_INFO * tocs,CHARSET_INFO * fromcs,bool opt_continue)319   Conv(CHARSET_INFO *tocs, CHARSET_INFO *fromcs, bool opt_continue)
320    :m_tocs(tocs), m_fromcs(fromcs), m_continue(opt_continue)
321   { }
out_buffer_max_length(size_t from_length) const322   size_t out_buffer_max_length(size_t from_length) const
323   {
324     return from_length / m_fromcs->mbminlen * m_tocs->mbmaxlen;
325   }
326   bool convert_data(const char *from, size_t length);
convert_binary_stream(FILE * file)327   bool convert_binary_stream(FILE *file)
328   {
329     File_buffer buf;
330     return buf.load_binary_stream(file) ||
331            convert_data(buf.ptr(), buf.length());
332   }
convert_binary_file_by_name(const char * filename)333   bool convert_binary_file_by_name(const char *filename)
334   {
335     File_buffer buf;
336     return buf.load_binary_file_by_name(filename)||
337            convert_data(buf.ptr(), buf.length());
338   }
339 private:
report_error(const char * from) const340   void report_error(const char *from) const
341   {
342     if (well_formed_error_pos())
343     {
344       fflush(stdout);
345       fprintf(stderr,
346               "Illegal %s byte sequence at position %d\n",
347               m_fromcs->csname,
348               (uint) (well_formed_error_pos() - from));
349     }
350     else if (cannot_convert_error_pos())
351     {
352       fflush(stdout);
353       fprintf(stderr,
354               "Conversion from %s to %s failed at position %d\n",
355               m_fromcs->csname, m_tocs->csname,
356               (uint) (cannot_convert_error_pos() - from));
357     }
358   }
write(const char * str,size_t length) const359   size_t write(const char *str, size_t length) const
360   {
361     return my_fwrite(stdout, (uchar *) str, length, MY_WME);
362   }
363 };
364 
365 
convert_data(const char * from,size_t from_length)366 bool Conv::convert_data(const char *from, size_t from_length)
367 {
368   Conv_inbuf inbuf(from, from_length);
369   Conv_outbuf outbuf;
370 
371   if (outbuf.alloc(out_buffer_max_length(from_length)))
372     return true;
373 
374   for ( ; ; )
375   {
376     LEX_CSTRING delim, data;
377 
378     delim= inbuf.get_delimiter_chunk(*this);
379     if (delim.length)
380       write(delim.str, delim.length);
381 
382     data= inbuf.get_data_chunk(*this);
383     if (!data.length)
384       break;
385     size_t length= well_formed_copy(m_tocs,
386                                     (char *) outbuf.ptr(),
387                                     outbuf.alloced_length(),
388                                     m_fromcs, data.str, data.length);
389     outbuf.length((uint32) length);
390 
391     if (most_important_error_pos() && !m_continue)
392     {
393       report_error(from);
394       return true;
395     }
396     write(outbuf.ptr(), outbuf.length());
397   }
398   return false;
399 }
400 
401 
402 class Session
403 {
404 public:
Session(const char * prog)405   Session(const char *prog)
406   {
407     MY_INIT(prog);
408   }
~Session()409   ~Session()
410   {
411     my_end(0);
412   }
usage(void)413   void usage(void)
414   {
415     printf("%s Ver %s Distrib %s for %s on %s\n", my_progname, CONV_VERSION,
416       MYSQL_SERVER_VERSION, SYSTEM_TYPE, MACHINE_TYPE);
417     puts("Character set conversion utility for MariaDB");
418     puts("Usage:");
419     printf("%s [OPTION...] [FILE...]\n", my_progname);
420     my_print_help(long_options);
421   }
422 };
423 
424 
main(int argc,char * argv[])425 int main(int argc, char *argv[])
426 {
427   Session session(argv[0]);
428   CHARSET_INFO *charset_info_from= NULL;
429   CHARSET_INFO *charset_info_to= NULL;
430 
431   if (handle_options(&argc, &argv, long_options, get_one_option))
432   {
433     session.usage();
434     return 1;
435   }
436 
437   if (!(charset_info_from= opt.csinfo_from()))
438   {
439     fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_from);
440     return 1;
441   }
442 
443   if (!(charset_info_to= opt.csinfo_to()))
444   {
445     fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_to);
446     return 1;
447   }
448 
449   Conv conv(charset_info_to, charset_info_from, opt.m_continue);
450   if (opt.m_delimiter)
451   {
452     if (charset_info_from->mbminlen > 1 ||
453         charset_info_to->mbminlen > 1)
454     {
455       fprintf(stderr, "--delimiter cannot be used with %s to %s conversion\n",
456               charset_info_from->csname, charset_info_to->csname);
457       return 1;
458     }
459     if (conv.set_delimiter_unescape(opt.m_delimiter))
460     {
461       fprintf(stderr, "Bad --delimiter value\n");
462       return 1;
463     }
464   }
465 
466   if (argc == 0)
467   {
468     if (conv.convert_binary_stream(stdin))
469       return 1;
470   }
471   else
472   {
473     for (int i= 0; i < argc; i++)
474     {
475       if (conv.convert_binary_file_by_name(argv[i]))
476        return 1;
477     }
478   }
479 
480   return 0;
481 } /* main */
482