1 /* Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    Without limiting anything contained in the foregoing, this file,
15    which is part of C Driver for MySQL (Connector/C), is also subject to the
16    Universal FOSS Exception, version 1.0, a copy of which can be found at
17    http://oss.oracle.com/licenses/universal-foss-exception.
18 
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License, version 2.0, for more details.
23 
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, write to the Free Software
26    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
27 
28 #include <my_global.h>
29 #include <m_ctype.h>
30 #include <my_xml.h>
31 #ifndef SCO
32 #include <m_string.h>
33 #endif
34 
35 
36 /*
37 
  This file implements routines that parse XML-based
  character set and collation description files.
40 
41   Unicode collations are encoded according to
42 
43     Unicode Technical Standard #35
44     Locale Data Markup Language (LDML)
45     http://www.unicode.org/reports/tr35/
46 
47   and converted into ICU string according to
48 
49     Collation Customization
50     http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
51 
52 */
53 
54 
/*
  Poison my_snprintf for the rest of this file.

  We cannot use my_snprintf() here, because ctype.o is used to build
  conf_to_src, which must keep its dependencies to a minimum.

  Any macro definition of the name is dropped first; after that every
  use of my_snprintf expands to a string literal, so an accidental
  call fails to compile instead of silently adding a dependency.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"
64 
65 
/*
  Optional hook taking an int and returning an int; unset (NULL) by
  default. Nothing in the visible part of this file calls it.
  NOTE(review): the name suggests a stack-overrun check installed by
  the server -- confirm against the code that sets and calls it.
*/
int (*my_string_stack_guard)(int)= NULL;
67 
/*
  Copy the first min(l1, l2) bytes of "src" into "str" and terminate
  the copy with '\0'.

  @param str  Destination; needs room for min(l1, l2) + 1 bytes
  @param src  Source bytes (does not have to be '\0'-terminated)
  @param l1   Number of bytes available in "src"
  @param l2   Maximum number of bytes to copy

  @return "str"
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;

  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
75 
/*
  One row of the tag lookup table: associates an XML path with the
  state code the handlers switch on.
*/
struct my_cs_file_section_st
{
  int        state;  /* One of the _CS_xxx codes */
  const char *str;   /* Slash-separated path, e.g. "charsets/charset/name" */
};
81 
/*
  State codes stored in sec[].state and switched on by the
  cs_enter(), cs_value() and cs_leave() handlers.
  Code 0 is not defined here: the handlers use it for "path not found".

  The values inside the DIFF1..IDENTICAL, EXP_DIFF1..EXP_IDENTICAL and
  A_DIFF1..A_IDENTICAL groups must stay consecutive and in this order:
  cs_value() uses (state - first code of the group) as an index into
  diff_fmt[] and context_diff_fmt[].
*/

/* Charset and collation description */
#define _CS_MISC	1
#define _CS_ID		2
#define _CS_CSNAME	3
#define _CS_FAMILY	4
#define _CS_ORDER	5
#define _CS_COLNAME	6
#define _CS_FLAG	7
#define _CS_CHARSET	8
#define _CS_COLLATION	9
#define _CS_UPPERMAP	10
#define _CS_LOWERMAP	11
#define _CS_UNIMAP	12
#define _CS_COLLMAP	13
#define _CS_CTYPEMAP	14
#define _CS_PRIMARY_ID	15
#define _CS_BINARY_ID	16
#define _CS_CSDESCRIPT	17


/* Special purpose commands */
#define _CS_UCA_VERSION                 100
#define _CS_CL_SUPPRESS_CONTRACTIONS    101
#define _CS_CL_OPTIMIZE                 102
#define _CS_CL_SHIFT_AFTER_METHOD       103


/* Collation Settings */
#define _CS_ST_SETTINGS                 200
#define _CS_ST_STRENGTH                 201
#define _CS_ST_ALTERNATE                202
#define _CS_ST_BACKWARDS                203
#define _CS_ST_NORMALIZATION            204
#define _CS_ST_CASE_LEVEL               205
#define _CS_ST_CASE_FIRST               206
#define _CS_ST_HIRAGANA_QUATERNARY      207
#define _CS_ST_NUMERIC                  208
#define _CS_ST_VARIABLE_TOP             209
#define _CS_ST_MATCH_BOUNDARIES         210
#define _CS_ST_MATCH_STYLE              211


/* Rules (DIFF1..IDENTICAL index diff_fmt[] in this order) */
#define _CS_RULES                       300
#define _CS_RESET                       301
#define _CS_DIFF1                       302
#define _CS_DIFF2                       303
#define _CS_DIFF3                       304
#define _CS_DIFF4                       305
#define _CS_IDENTICAL                   306

/* Rules: Expansions (EXP_DIFF1..EXP_IDENTICAL index the format tables) */
#define _CS_EXP_X                       320
#define _CS_EXP_EXTEND                  321
#define _CS_EXP_DIFF1                   322
#define _CS_EXP_DIFF2                   323
#define _CS_EXP_DIFF3                   324
#define _CS_EXP_DIFF4                   325
#define _CS_EXP_IDENTICAL               326

/* Rules: Abbreviating Ordering Specifications (index diff_fmt[]) */
#define _CS_A_DIFF1                     351
#define _CS_A_DIFF2                     352
#define _CS_A_DIFF3                     353
#define _CS_A_DIFF4                     354
#define _CS_A_IDENTICAL                 355

/* Rules: previous context */
#define _CS_CONTEXT                     370

/* Rules: Placing Characters Before Others */
#define _CS_RESET_BEFORE 380

/* Rules: Logical Reset Positions */
#define _CS_RESET_FIRST_PRIMARY_IGNORABLE     401
#define _CS_RESET_LAST_PRIMARY_IGNORABLE      402
#define _CS_RESET_FIRST_SECONDARY_IGNORABLE   403
#define _CS_RESET_LAST_SECONDARY_IGNORABLE    404
#define _CS_RESET_FIRST_TERTIARY_IGNORABLE    405
#define _CS_RESET_LAST_TERTIARY_IGNORABLE     406
#define _CS_RESET_FIRST_TRAILING              407
#define _CS_RESET_LAST_TRAILING               408
#define _CS_RESET_FIRST_VARIABLE              409
#define _CS_RESET_LAST_VARIABLE               410
#define _CS_RESET_FIRST_NON_IGNORABLE         411
#define _CS_RESET_LAST_NON_IGNORABLE          412
167 
168 
169 
/*
  Every XML path the loader recognizes, with the state code for each.
  cs_file_sec() searches the table linearly; the entry whose "str" is
  NULL terminates it. Paths that are not listed give state 0 in the
  handlers (cs_enter() reports those as unknown LDML tags).
*/
static struct my_cs_file_section_st sec[] =
{
  {_CS_MISC,		"xml"},
  {_CS_MISC,		"xml/version"},
  {_CS_MISC,		"xml/encoding"},
  {_CS_MISC,		"charsets"},
  {_CS_MISC,		"charsets/max-id"},
  {_CS_MISC,		"charsets/copyright"},
  {_CS_MISC,		"charsets/description"},
  {_CS_CHARSET,		"charsets/charset"},
  {_CS_PRIMARY_ID,	"charsets/charset/primary-id"},
  {_CS_BINARY_ID,	"charsets/charset/binary-id"},
  {_CS_CSNAME,		"charsets/charset/name"},
  {_CS_FAMILY,		"charsets/charset/family"},
  {_CS_CSDESCRIPT,	"charsets/charset/description"},
  {_CS_MISC,		"charsets/charset/alias"},
  {_CS_MISC,		"charsets/charset/ctype"},
  {_CS_CTYPEMAP,	"charsets/charset/ctype/map"},
  {_CS_MISC,		"charsets/charset/upper"},
  {_CS_UPPERMAP,	"charsets/charset/upper/map"},
  {_CS_MISC,		"charsets/charset/lower"},
  {_CS_LOWERMAP,	"charsets/charset/lower/map"},
  {_CS_MISC,		"charsets/charset/unicode"},
  {_CS_UNIMAP,		"charsets/charset/unicode/map"},
  {_CS_COLLATION,	"charsets/charset/collation"},
  {_CS_COLNAME,		"charsets/charset/collation/name"},
  {_CS_ID,		"charsets/charset/collation/id"},
  {_CS_ORDER,		"charsets/charset/collation/order"},
  {_CS_FLAG,		"charsets/charset/collation/flag"},
  {_CS_COLLMAP,		"charsets/charset/collation/map"},

  /* Special purpose commands */
  {_CS_UCA_VERSION,              "charsets/charset/collation/version"},
  {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
  {_CS_CL_OPTIMIZE,              "charsets/charset/collation/optimize"},
  {_CS_CL_SHIFT_AFTER_METHOD,    "charsets/charset/collation/shift-after-method"},

  /* Collation Settings */
  {_CS_ST_SETTINGS,              "charsets/charset/collation/settings"},
  {_CS_ST_STRENGTH,              "charsets/charset/collation/settings/strength"},
  {_CS_ST_ALTERNATE,             "charsets/charset/collation/settings/alternate"},
  {_CS_ST_BACKWARDS,             "charsets/charset/collation/settings/backwards"},
  {_CS_ST_NORMALIZATION,         "charsets/charset/collation/settings/normalization"},
  {_CS_ST_CASE_LEVEL,            "charsets/charset/collation/settings/caseLevel"},
  {_CS_ST_CASE_FIRST,            "charsets/charset/collation/settings/caseFirst"},
  {_CS_ST_HIRAGANA_QUATERNARY,   "charsets/charset/collation/settings/hiraganaQuaternary"},
  {_CS_ST_NUMERIC,               "charsets/charset/collation/settings/numeric"},
  {_CS_ST_VARIABLE_TOP,          "charsets/charset/collation/settings/variableTop"},
  {_CS_ST_MATCH_BOUNDARIES,      "charsets/charset/collation/settings/match-boundaries"},
  {_CS_ST_MATCH_STYLE,           "charsets/charset/collation/settings/match-style"},

  /* Rules */
  {_CS_RULES,           "charsets/charset/collation/rules"},
  {_CS_RESET,           "charsets/charset/collation/rules/reset"},
  {_CS_DIFF1,           "charsets/charset/collation/rules/p"},
  {_CS_DIFF2,           "charsets/charset/collation/rules/s"},
  {_CS_DIFF3,           "charsets/charset/collation/rules/t"},
  {_CS_DIFF4,           "charsets/charset/collation/rules/q"},
  {_CS_IDENTICAL,       "charsets/charset/collation/rules/i"},

  /* Rules: expansions */
  {_CS_EXP_X,           "charsets/charset/collation/rules/x"},
  {_CS_EXP_EXTEND,      "charsets/charset/collation/rules/x/extend"},
  {_CS_EXP_DIFF1,       "charsets/charset/collation/rules/x/p"},
  {_CS_EXP_DIFF2,       "charsets/charset/collation/rules/x/s"},
  {_CS_EXP_DIFF3,       "charsets/charset/collation/rules/x/t"},
  {_CS_EXP_DIFF4,       "charsets/charset/collation/rules/x/q"},
  {_CS_EXP_IDENTICAL,   "charsets/charset/collation/rules/x/i"},

  /* Rules: previous context */
  {_CS_CONTEXT,         "charsets/charset/collation/rules/x/context"},

  /* Rules: Abbreviating Ordering Specifications */
  {_CS_A_DIFF1,         "charsets/charset/collation/rules/pc"},
  {_CS_A_DIFF2,         "charsets/charset/collation/rules/sc"},
  {_CS_A_DIFF3,         "charsets/charset/collation/rules/tc"},
  {_CS_A_DIFF4,         "charsets/charset/collation/rules/qc"},
  {_CS_A_IDENTICAL,     "charsets/charset/collation/rules/ic"},

  /* Rules: Placing Characters Before Others */
  {_CS_RESET_BEFORE,    "charsets/charset/collation/rules/reset/before"},

  /* Rules: Logical Reset Positions */
  {_CS_RESET_FIRST_NON_IGNORABLE,       "charsets/charset/collation/rules/reset/first_non_ignorable"},
  {_CS_RESET_LAST_NON_IGNORABLE,        "charsets/charset/collation/rules/reset/last_non_ignorable"},
  {_CS_RESET_FIRST_PRIMARY_IGNORABLE,   "charsets/charset/collation/rules/reset/first_primary_ignorable"},
  {_CS_RESET_LAST_PRIMARY_IGNORABLE,    "charsets/charset/collation/rules/reset/last_primary_ignorable"},
  {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"},
  {_CS_RESET_LAST_SECONDARY_IGNORABLE,  "charsets/charset/collation/rules/reset/last_secondary_ignorable"},
  {_CS_RESET_FIRST_TERTIARY_IGNORABLE,  "charsets/charset/collation/rules/reset/first_tertiary_ignorable"},
  {_CS_RESET_LAST_TERTIARY_IGNORABLE,   "charsets/charset/collation/rules/reset/last_tertiary_ignorable"},
  {_CS_RESET_FIRST_TRAILING,            "charsets/charset/collation/rules/reset/first_trailing"},
  {_CS_RESET_LAST_TRAILING,             "charsets/charset/collation/rules/reset/last_trailing"},
  {_CS_RESET_FIRST_VARIABLE,            "charsets/charset/collation/rules/reset/first_variable"},
  {_CS_RESET_LAST_VARIABLE,             "charsets/charset/collation/rules/reset/last_variable"},

  /* End-of-table marker */
  {0,	NULL}
};
268 
cs_file_sec(const char * attr,size_t len)269 static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
270 {
271   struct my_cs_file_section_st *s;
272   for (s= sec; s->str; s++)
273   {
274     if (!strncmp(attr, s->str, len) && s->str[len] == 0)
275       return s;
276   }
277   return NULL;
278 }
279 
/*
  Buffer sizes used by the charset file loader.
  MY_CS_CSDESCR_SIZE and MY_CS_CONTEXT_SIZE give the sizes of the
  "comment" and "context" members of MY_CHARSET_FILE.
*/
#define MY_CS_CSDESCR_SIZE	64
/* Parenthesized so the product stays intact inside larger expressions */
#define MY_CS_TAILORING_SIZE	(32*1024)
#define MY_CS_UCA_VERSION_SIZE  64
#define MY_CS_CONTEXT_SIZE      64
284 
/*
  Working state of one charset XML parse. The XML handlers reach it
  through the parser's user_data pointer.
*/
typedef struct my_cs_file_info
{
  char   csname[MY_CS_NAME_SIZE];            /* Storage behind cs.csname */
  char   name[MY_CS_NAME_SIZE];              /* Storage behind cs.name */
  uchar  ctype[MY_CS_CTYPE_TABLE_SIZE];      /* Storage behind cs.ctype */
  uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];/* Storage behind cs.to_lower */
  uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];/* Storage behind cs.to_upper */
  uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; /* Storage behind cs.sort_order */
  uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];/* Storage behind cs.tab_to_uni */
  char   comment[MY_CS_CSDESCR_SIZE];        /* Storage behind cs.comment */
  char  *tailoring;                /* Growable tailoring text, NULL until first use */
  size_t tailoring_length;         /* Bytes of tailoring text written so far */
  size_t tailoring_alloced_length; /* Size requested for "tailoring" */
  char   context[MY_CS_CONTEXT_SIZE]; /* Context saved for the next expansion rule */
  CHARSET_INFO cs;                 /* Record being assembled for add_collation() */
  MY_CHARSET_LOADER *loader;       /* Callbacks: realloc, free, reporter, ... */
} MY_CHARSET_FILE;
302 
303 
304 static void
my_charset_file_reset_charset(MY_CHARSET_FILE * i)305 my_charset_file_reset_charset(MY_CHARSET_FILE *i)
306 {
307   memset(&i->cs, 0, sizeof(i->cs));
308 }
309 
310 
311 static void
my_charset_file_reset_collation(MY_CHARSET_FILE * i)312 my_charset_file_reset_collation(MY_CHARSET_FILE *i)
313 {
314   i->tailoring_length= 0;
315   i->context[0]= '\0';
316 }
317 
318 
319 static void
my_charset_file_init(MY_CHARSET_FILE * i)320 my_charset_file_init(MY_CHARSET_FILE *i)
321 {
322   my_charset_file_reset_charset(i);
323   my_charset_file_reset_collation(i);
324   i->tailoring= NULL;
325   i->tailoring_alloced_length= 0;
326 }
327 
328 
329 static void
my_charset_file_free(MY_CHARSET_FILE * i)330 my_charset_file_free(MY_CHARSET_FILE *i)
331 {
332   i->loader->free(i->tailoring);
333 }
334 
335 
336 static int
my_charset_file_tailoring_realloc(MY_CHARSET_FILE * i,size_t newlen)337 my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen)
338 {
339   if (i->tailoring_alloced_length > newlen ||
340      (i->tailoring= i->loader->realloc(i->tailoring,
341                                        (i->tailoring_alloced_length=
342                                         (newlen + 32*1024)))))
343   {
344     return MY_XML_OK;
345   }
346   return MY_XML_ERROR;
347 }
348 
349 
/*
  Parse whitespace-separated hexadecimal numbers into a byte table.

  Tokens are separated by space, tab, CR or LF (a '\0' byte is also
  treated as a separator, because strchr() matches the terminator of
  its search string). Parsing stops at the end of the input or as soon
  as "size" values have been stored; extra tokens are ignored.

  NOTE(review): strtoul() is not bounded by "len"; this relies on the
  last token being followed by a non-hex character in the caller's
  buffer.

  @param a     Destination table with room for "size" elements
  @param size  Number of elements in "a"
  @param str   Input text, "len" bytes
  @param len   Length of "str"

  @return Always 0
*/
static int fill_uchar(uchar *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e=str+len;

  for (s=str ; s < e ; i++)
  {
    /* Skip separators, then remember where the token starts */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /* Stop on an empty token or when the table is full (valid: 0..size-1) */
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b,NULL,16);
  }
  return 0;
}
366 
/*
  Parse whitespace-separated hexadecimal numbers into a uint16 table.

  Tokens are separated by space, tab, CR or LF (a '\0' byte is also
  treated as a separator, because strchr() matches the terminator of
  its search string). Parsing stops at the end of the input or as soon
  as "size" values have been stored; extra tokens are ignored.

  NOTE(review): strtol() is not bounded by "len"; this relies on the
  last token being followed by a non-hex character in the caller's
  buffer.

  @param a     Destination table with room for "size" elements
  @param size  Number of elements in "a"
  @param str   Input text, "len" bytes
  @param len   Length of "str"

  @return Always 0
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;

  const char *s, *b, *e=str+len;
  for (s=str ; s < e ; i++)
  {
    /* Skip separators, then remember where the token starts */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /* Stop on an empty token or when the table is full (valid: 0..size-1) */
    if (s == b || i >= size)
      break;
    a[i]= (uint16) strtol(b,NULL,16);
  }
  return 0;
}
383 
384 
385 
386 
387 static int
tailoring_append(MY_XML_PARSER * st,const char * fmt,size_t len,const char * attr)388 tailoring_append(MY_XML_PARSER *st,
389                  const char *fmt, size_t len, const char *attr)
390 {
391   struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
392   size_t newlen= i->tailoring_length + len + 64; /* 64 for format */
393   if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
394   {
395     char *dst= i->tailoring + i->tailoring_length;
396     sprintf(dst, fmt, (int) len, attr);
397     i->tailoring_length+= strlen(dst);
398     return MY_XML_OK;
399   }
400   return MY_XML_ERROR;
401 }
402 
403 
404 static int
tailoring_append2(MY_XML_PARSER * st,const char * fmt,size_t len1,const char * attr1,size_t len2,const char * attr2)405 tailoring_append2(MY_XML_PARSER *st,
406                   const char *fmt,
407                   size_t len1, const char *attr1,
408                   size_t len2, const char *attr2)
409 {
410   struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
411   size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */
412   if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
413   {
414     char *dst= i->tailoring + i->tailoring_length;
415     sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2);
416     i->tailoring_length+= strlen(dst);
417     return MY_XML_OK;
418   }
419   return MY_XML_ERROR;
420 }
421 
422 
423 static size_t
scan_one_character(const char * s,const char * e,my_wc_t * wc)424 scan_one_character(const char *s, const char *e, my_wc_t *wc)
425 {
426   CHARSET_INFO *cs= &my_charset_utf8_general_ci;
427   if (s >= e)
428     return 0;
429 
430   /* Escape sequence: \uXXXX */
431   if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2]))
432   {
433     size_t len= 3; /* We have at least one digit */
434     for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++)
435     {
436     }
437     wc[0]= 0;
438     return len;
439   }
440   else if ((s[0] & 0x80) == 0) /* 7-bit character */
441   {
442     wc[0]= 0;
443     return 1;
444   }
445   else /* Non-escaped character */
446   {
447     int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e);
448     if (rc > 0)
449       return (size_t) rc;
450   }
451   return 0;
452 }
453 
454 
455 static int
tailoring_append_abbreviation(MY_XML_PARSER * st,const char * fmt,size_t len,const char * attr)456 tailoring_append_abbreviation(MY_XML_PARSER *st,
457                               const char *fmt, size_t len, const char *attr)
458 {
459   size_t clen;
460   const char *attrend= attr + len;
461   my_wc_t wc;
462 
463   for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen)
464   {
465     DBUG_ASSERT(attr < attrend);
466     if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK)
467       return MY_XML_ERROR;
468   }
469   return MY_XML_OK;
470 }
471 
472 
/*
  XML "enter" handler: called with the path of the element or
  attribute that is being opened.

  - unknown path: a warning goes to the loader's reporter;
  - <charset>:    the CHARSET_INFO record is zeroed;
  - <collation>:  per-collation state is reset;
  - <reset>:      " &" is appended to the tailoring.

  @param st    XML parser; its user_data is the MY_CHARSET_FILE
  @param attr  Path, "len" bytes (not necessarily '\0'-terminated)
  @param len   Length of "attr"

  @return MY_XML_OK, or the result of tailoring_append() for <reset>
*/
static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
  struct my_cs_file_section_st *s= cs_file_sec(attr,len);
  int state= s ? s->state : 0;

  switch (state) {
  case 0:
    /*
      The precision of "%.*s" must be an int, so the size_t length is
      cast, as the tailoring_append() family does.
      NOTE(review): assumes the reporter follows printf conventions.
    */
    i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'",
                        (int) len, attr);
    break;

  case _CS_CHARSET:
    my_charset_file_reset_charset(i);
    break;

  case _CS_COLLATION:
    my_charset_file_reset_collation(i);
    break;

  case _CS_RESET:
    return tailoring_append(st, " &", 0, NULL);

  default:
    break;
  }
  return MY_XML_OK;
}
500 
501 
cs_leave(MY_XML_PARSER * st,const char * attr,size_t len)502 static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
503 {
504   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
505   struct my_cs_file_section_st *s= cs_file_sec(attr,len);
506   int    state= s ? s->state : 0;
507   int    rc;
508 
509   switch(state){
510   case _CS_COLLATION:
511     if (i->tailoring_length)
512       i->cs.tailoring= i->tailoring;
513     rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK;
514     break;
515 
516   /* Rules: Logical Reset Positions */
517   case _CS_RESET_FIRST_NON_IGNORABLE:
518     rc= tailoring_append(st, "[first non-ignorable]", 0, NULL);
519     break;
520 
521   case _CS_RESET_LAST_NON_IGNORABLE:
522     rc= tailoring_append(st, "[last non-ignorable]", 0, NULL);
523     break;
524 
525   case _CS_RESET_FIRST_PRIMARY_IGNORABLE:
526     rc= tailoring_append(st, "[first primary ignorable]", 0, NULL);
527     break;
528 
529   case _CS_RESET_LAST_PRIMARY_IGNORABLE:
530     rc= tailoring_append(st, "[last primary ignorable]", 0, NULL);
531     break;
532 
533   case _CS_RESET_FIRST_SECONDARY_IGNORABLE:
534     rc= tailoring_append(st, "[first secondary ignorable]", 0, NULL);
535     break;
536 
537   case _CS_RESET_LAST_SECONDARY_IGNORABLE:
538     rc= tailoring_append(st, "[last secondary ignorable]", 0, NULL);
539     break;
540 
541   case _CS_RESET_FIRST_TERTIARY_IGNORABLE:
542     rc= tailoring_append(st, "[first tertiary ignorable]", 0, NULL);
543     break;
544 
545   case _CS_RESET_LAST_TERTIARY_IGNORABLE:
546     rc= tailoring_append(st, "[last tertiary ignorable]", 0, NULL);
547     break;
548 
549   case _CS_RESET_FIRST_TRAILING:
550     rc= tailoring_append(st, "[first trailing]", 0, NULL);
551     break;
552 
553   case _CS_RESET_LAST_TRAILING:
554     rc= tailoring_append(st, "[last trailing]", 0, NULL);
555     break;
556 
557   case _CS_RESET_FIRST_VARIABLE:
558     rc= tailoring_append(st, "[first variable]", 0, NULL);
559     break;
560 
561   case _CS_RESET_LAST_VARIABLE:
562     rc= tailoring_append(st, "[last variable]", 0, NULL);
563     break;
564 
565   default:
566     rc=MY_XML_OK;
567   }
568   return rc;
569 }
570 
571 
/*
  Formats for the five difference levels without a context.
  cs_value() indexes the table with (state - first code of the group),
  so the order is: primary, secondary, tertiary, quaternary, identical.
  Each format consumes one (int length, const char *text) pair.
*/
static const char *diff_fmt[5]=
{
  "<%.*s",
  "<<%.*s",
  "<<<%.*s",
  "<<<<%.*s",
  "=%.*s"
};
580 
581 
/*
  Formats for the five difference levels with a preceding context.
  cs_value() indexes the table with (state - _CS_EXP_DIFF1).
  Each format consumes two (int length, const char *text) pairs:
  first the saved context, then the value, separated by '|'.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",
  "<<%.*s|%.*s",
  "<<<%.*s|%.*s",
  "<<<<%.*s|%.*s",
  "=%.*s|%.*s"
};
590 
591 
/*
  Return non-zero when the "len"-byte value "attr" is exactly the
  keyword "name". A plain strncmp(name, attr, len) would also accept
  any prefix of the keyword, including an empty value.
*/
static int
cs_flag_equals(const char *attr, size_t len, const char *name)
{
  return len == strlen(name) && !memcmp(attr, name, len);
}


/*
  XML "value" handler: called with the text of the element or
  attribute whose path is currently held in st->attr.

  Charset and collation properties are stored into the CHARSET_INFO
  record under construction; collation settings and rules are turned
  into tailoring text and appended to the tailoring buffer.

  NOTE(review): the strtol() calls are not bounded by "len"; they rely
  on the value being followed by a non-digit in the caller's buffer.

  @param st    XML parser; its user_data is the MY_CHARSET_FILE
  @param attr  Value text, "len" bytes (not '\0'-terminated)
  @param len   Length of "attr"

  @return MY_XML_OK, or MY_XML_ERROR when the tailoring buffer could
          not be enlarged
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
  struct my_cs_file_section_st *s;
  int    state= (int)((s= cs_file_sec(st->attr.start,
                                      st->attr.end - st->attr.start)) ?
                      s->state : 0);
  int rc= MY_XML_OK;

  switch (state) {
  case _CS_MISC:
  case _CS_FAMILY:
  case _CS_ORDER:
    break;
  case _CS_ID:
    i->cs.number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_BINARY_ID:
    i->cs.binary_number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_PRIMARY_ID:
    i->cs.primary_number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_COLNAME:
    i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1);
    break;
  case _CS_FLAG:
    /* Only the complete keywords are accepted; other values are ignored */
    if (cs_flag_equals(attr, len, "primary"))
      i->cs.state|= MY_CS_PRIMARY;
    else if (cs_flag_equals(attr, len, "binary"))
      i->cs.state|= MY_CS_BINSORT;
    else if (cs_flag_equals(attr, len, "compiled"))
      i->cs.state|= MY_CS_COMPILED;
    break;
  case _CS_UPPERMAP:
    fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
    i->cs.to_upper=i->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
    i->cs.to_lower=i->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
    i->cs.tab_to_uni=i->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
    i->cs.sort_order=i->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
    i->cs.ctype=i->ctype;
    break;

  /* Special purpose commands */
  case _CS_UCA_VERSION:
    rc= tailoring_append(st, "[version %.*s]", len, attr);
    break;

  case _CS_CL_SUPPRESS_CONTRACTIONS:
    rc= tailoring_append(st, "[suppress contractions %.*s]", len, attr);
    break;

  case _CS_CL_OPTIMIZE:
    rc= tailoring_append(st, "[optimize %.*s]", len, attr);
    break;

  case _CS_CL_SHIFT_AFTER_METHOD:
    rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
    break;

  /* Collation Settings */
  case _CS_ST_STRENGTH:
    /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
    rc= tailoring_append(st, "[strength %.*s]", len, attr);
    break;

  case _CS_ST_ALTERNATE:
    /* non-ignorable, shifted */
    rc= tailoring_append(st, "[alternate %.*s]", len, attr);
    break;

  case _CS_ST_BACKWARDS:
    /* on, off, 2 */
    rc= tailoring_append(st, "[backwards %.*s]", len, attr);
    break;

  case _CS_ST_NORMALIZATION:
    /*
      TODO for WL#896: check collations for normalization: vi.xml
      We want precomposed characters work well at this point.
    */
    /* on, off */
    rc= tailoring_append(st, "[normalization %.*s]", len, attr);
    break;

  case _CS_ST_CASE_LEVEL:
    /* on, off */
    rc= tailoring_append(st, "[caseLevel %.*s]", len, attr);
    break;

  case _CS_ST_CASE_FIRST:
    /* upper, lower, off */
    rc= tailoring_append(st, "[caseFirst %.*s]", len, attr);
    break;

  case _CS_ST_HIRAGANA_QUATERNARY:
    /* on, off */
    rc= tailoring_append(st, "[hiraganaQ %.*s]", len, attr);
    break;

  case _CS_ST_NUMERIC:
    /* on, off */
    rc= tailoring_append(st, "[numeric %.*s]", len, attr);
    break;

  case _CS_ST_VARIABLE_TOP:
    /* TODO for WL#896: check value format */
    rc= tailoring_append(st, "[variableTop %.*s]", len, attr);
    break;

  case _CS_ST_MATCH_BOUNDARIES:
    /* none, whole-character, whole-word */
    rc= tailoring_append(st, "[match-boundaries %.*s]", len, attr);
    break;

  case _CS_ST_MATCH_STYLE:
    /* minimal, medial, maximal */
    rc= tailoring_append(st, "[match-style %.*s]", len, attr);
    break;


  /* Rules */
  case _CS_RESET:
    rc= tailoring_append(st, "%.*s", len, attr);
    break;

  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
  case _CS_DIFF4:
  case _CS_IDENTICAL:
    /* The state codes are consecutive, so the offset selects the level */
    rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr);
    break;


  /* Rules: Expansion */
  case _CS_EXP_EXTEND:
    rc= tailoring_append(st, " / %.*s", len, attr);
    break;

  case _CS_EXP_DIFF1:
  case _CS_EXP_DIFF2:
  case _CS_EXP_DIFF3:
  case _CS_EXP_DIFF4:
  case _CS_EXP_IDENTICAL:
    if (i->context[0])
    {
      /* A context was saved earlier: emit "context|value" and use it up */
      rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
                            strlen(i->context), i->context, len, attr);
      i->context[0]= 0;
    }
    else
      rc= tailoring_append(st, diff_fmt[state  - _CS_EXP_DIFF1], len, attr);
    break;

  /* Rules: Context */
  case _CS_CONTEXT:
    /* Saved for the next expansion rule; too long values are dropped */
    if (len < sizeof(i->context))
    {
      memcpy(i->context, attr, len);
      i->context[len]= '\0';
    }
    break;

  /* Rules: Abbreviating Ordering Specifications */
  case _CS_A_DIFF1:
  case _CS_A_DIFF2:
  case _CS_A_DIFF3:
  case _CS_A_DIFF4:
  case _CS_A_IDENTICAL:
    rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr);
    break;

  /* Rules: Placing Characters Before Others */
  case _CS_RESET_BEFORE:
    /*
      TODO for WL#896: Add this check into text customization parser:
      It is an error if the strength of the before relation is not identical
      to the relation after the reset. We'll need this for WL#896.
    */
    rc= tailoring_append(st, "[before %.*s]", len, attr);
    break;


  default:
    break;
  }

  return rc;
}
800 
801 
802 my_bool
my_parse_charset_xml(MY_CHARSET_LOADER * loader,const char * buf,size_t len)803 my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len)
804 {
805   MY_XML_PARSER p;
806   struct my_cs_file_info info;
807   my_bool rc;
808 
809   my_charset_file_init(&info);
810   my_xml_parser_create(&p);
811   my_xml_set_enter_handler(&p,cs_enter);
812   my_xml_set_value_handler(&p,cs_value);
813   my_xml_set_leave_handler(&p,cs_leave);
814   info.loader= loader;
815   my_xml_set_user_data(&p, (void *) &info);
816   rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
817   my_xml_parser_free(&p);
818   my_charset_file_free(&info);
819   if (rc != MY_XML_OK)
820   {
821     const char *errstr= my_xml_error_string(&p);
822     if (sizeof(loader->error) > 32 + strlen(errstr))
823     {
824       /* We cannot use my_snprintf() here. See previous comment. */
825       sprintf(loader->error, "at line %d pos %d: %s",
826                 my_xml_error_lineno(&p)+1,
827                 (int) my_xml_error_pos(&p),
828                 my_xml_error_string(&p));
829     }
830   }
831   return rc;
832 }
833 
834 
835 /*
836   Check repertoire: detect pure ascii strings
837 */
838 uint
my_string_repertoire(const CHARSET_INFO * cs,const char * str,ulong length)839 my_string_repertoire(const CHARSET_INFO *cs, const char *str, ulong length)
840 {
841   const char *strend= str + length;
842   if (cs->mbminlen == 1)
843   {
844     for ( ; str < strend; str++)
845     {
846       if (((uchar) *str) > 0x7F)
847         return MY_REPERTOIRE_UNICODE30;
848     }
849   }
850   else
851   {
852     my_wc_t wc;
853     int chlen;
854     for (;
855          (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0;
856          str+= chlen)
857     {
858       if (wc > 0x7F)
859         return MY_REPERTOIRE_UNICODE30;
860     }
861   }
862   return MY_REPERTOIRE_ASCII;
863 }
864 
865 
866 /*
867   Returns repertoire for charset
868 */
my_charset_repertoire(const CHARSET_INFO * cs)869 uint my_charset_repertoire(const CHARSET_INFO *cs)
870 {
871   return cs->state & MY_CS_PUREASCII ?
872     MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
873 }
874 
875 
876 /*
877   Detect whether a character set is ASCII compatible.
878 
879   Returns TRUE for:
880 
881   - all 8bit character sets whose Unicode mapping of 0x7B is '{'
882     (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
883 
884   - all multi-byte character sets having mbminlen == 1
885     (ignores ucs2 whose mbminlen is 2)
886 
887   TODO:
888 
889   When merging to 5.2, this function should be changed
890   to check a new flag MY_CS_NONASCII,
891 
892      return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
893 
894   This flag was previously added into 5.2 under terms
895   of WL#3759 "Optimize identifier conversion in client-server protocol"
896   especially to mark character sets not compatible with ASCII.
897 
898   We won't backport this flag to 5.0 or 5.1.
899   This function is Ok for 5.0 and 5.1, because we're not going
900   to introduce new tricky character sets between 5.0 and 5.2.
901 */
902 my_bool
my_charset_is_ascii_based(const CHARSET_INFO * cs)903 my_charset_is_ascii_based(const CHARSET_INFO *cs)
904 {
905   return
906     (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
907     (cs->mbminlen == 1 && cs->mbmaxlen > 1);
908 }
909 
910 
911 /*
912   Detect if a character set is 8bit,
913   and it is pure ascii, i.e. doesn't have
914   characters outside U+0000..U+007F
915   This functions is shared between "conf_to_src"
916   and dynamic charsets loader in "mysqld".
917 */
918 my_bool
my_charset_is_8bit_pure_ascii(const CHARSET_INFO * cs)919 my_charset_is_8bit_pure_ascii(const CHARSET_INFO *cs)
920 {
921   size_t code;
922   if (!cs->tab_to_uni)
923     return 0;
924   for (code= 0; code < 256; code++)
925   {
926     if (cs->tab_to_uni[code] > 0x7F)
927       return 0;
928   }
929   return 1;
930 }
931 
932 
933 /*
934   Shared function between conf_to_src and mysys.
935   Check if a 8bit character set is compatible with
936   ascii on the range 0x00..0x7F.
937 */
938 my_bool
my_charset_is_ascii_compatible(const CHARSET_INFO * cs)939 my_charset_is_ascii_compatible(const CHARSET_INFO *cs)
940 {
941   uint i;
942   if (!cs->tab_to_uni)
943     return 1;
944   for (i= 0; i < 128; i++)
945   {
946     if (cs->tab_to_uni[i] != i)
947       return 0;
948   }
949   return 1;
950 }
951 
952 
953 /*
954   Convert a string between two character sets.
955   'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
956 
957   @param  to[OUT]       Store result here
958   @param  to_length     Size of "to" buffer
959   @param  to_cs         Character set of result string
960   @param  from          Copy from here
961   @param  from_length   Length of the "from" string
962   @param  from_cs       Character set of the "from" string
963   @param  errors[OUT]   Number of conversion errors
964 
965   @return Number of bytes copied to 'to' string
966 */
967 
968 static uint32
my_convert_internal(char * to,uint32 to_length,const CHARSET_INFO * to_cs,const char * from,uint32 from_length,const CHARSET_INFO * from_cs,uint * errors)969 my_convert_internal(char *to, uint32 to_length,
970                     const CHARSET_INFO *to_cs,
971                     const char *from, uint32 from_length,
972                     const CHARSET_INFO *from_cs, uint *errors)
973 {
974   int         cnvres;
975   my_wc_t     wc;
976   const uchar *from_end= (const uchar*) from + from_length;
977   char *to_start= to;
978   uchar *to_end= (uchar*) to + to_length;
979   my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
980   my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
981   uint error_count= 0;
982 
983   while (1)
984   {
985     if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
986       from+= cnvres;
987     else if (cnvres == MY_CS_ILSEQ)
988     {
989       error_count++;
990       from++;
991       wc= '?';
992     }
993     else if (cnvres > MY_CS_TOOSMALL)
994     {
995       /*
996         A correct multibyte sequence detected
997         But it doesn't have Unicode mapping.
998       */
999       error_count++;
1000       from+= (-cnvres);
1001       wc= '?';
1002     }
1003     else
1004       break;  // Not enough characters
1005 
1006 outp:
1007     if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
1008       to+= cnvres;
1009     else if (cnvres == MY_CS_ILUNI && wc != '?')
1010     {
1011       error_count++;
1012       wc= '?';
1013       goto outp;
1014     }
1015     else
1016       break;
1017   }
1018   *errors= error_count;
1019   return (uint32) (to - to_start);
1020 }
1021 
1022 
1023 /*
1024   Convert a string between two character sets.
1025    Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
1026   'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
1027 
1028   @param  to[OUT]       Store result here
1029   @param  to_length     Size of "to" buffer
1030   @param  to_cs         Character set of result string
1031   @param  from          Copy from here
1032   @param  from_length   Length of the "from" string
1033   @param  from_cs       Character set of the "from" string
1034   @param  errors[OUT]   Number of conversion errors
1035 
1036   @return Number of bytes copied to 'to' string
1037 */
1038 
1039 uint32
my_convert(char * to,uint32 to_length,const CHARSET_INFO * to_cs,const char * from,uint32 from_length,const CHARSET_INFO * from_cs,uint * errors)1040 my_convert(char *to, uint32 to_length, const CHARSET_INFO *to_cs,
1041            const char *from, uint32 from_length,
1042            const CHARSET_INFO *from_cs, uint *errors)
1043 {
1044   uint32 length, length2;
1045   /*
1046     If any of the character sets is not ASCII compatible,
1047     immediately switch to slow mb_wc->wc_mb method.
1048   */
1049   if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
1050     return my_convert_internal(to, to_length, to_cs,
1051                                from, from_length, from_cs, errors);
1052 
1053   length= length2= MY_MIN(to_length, from_length);
1054 
1055 #if defined(__i386__)
1056   /*
1057     Special loop for i386, it allows to refer to a
1058     non-aligned memory block as UINT32, which makes
1059     it possible to copy four bytes at once. This
1060     gives about 10% performance improvement comparing
1061     to byte-by-byte loop.
1062   */
1063   for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
1064   {
1065     if ((*(uint32*)from) & 0x80808080)
1066       break;
1067     *((uint32*) to)= *((const uint32*) from);
1068   }
1069 #endif /* __i386__ */
1070 
1071   for (; ; *to++= *from++, length--)
1072   {
1073     if (!length)
1074     {
1075       *errors= 0;
1076       return length2;
1077     }
1078     if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
1079     {
1080       uint32 copied_length= length2 - length;
1081       to_length-= copied_length;
1082       from_length-= copied_length;
1083       return copied_length + my_convert_internal(to, to_length, to_cs,
1084                                                  from, from_length, from_cs,
1085                                                  errors);
1086     }
1087   }
1088 
1089   DBUG_ASSERT(FALSE); // Should never get to here
1090   return 0;           // Make compiler happy
1091 }
1092