1 /* Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 Without limiting anything contained in the foregoing, this file,
15 which is part of C Driver for MySQL (Connector/C), is also subject to the
16 Universal FOSS Exception, version 1.0, a copy of which can be found at
17 http://oss.oracle.com/licenses/universal-foss-exception.
18
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License, version 2.0, for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
27
28 #include <my_global.h>
29 #include <m_ctype.h>
30 #include <my_xml.h>
31 #ifndef SCO
32 #include <m_string.h>
33 #endif
34
35
36 /*
37
38 This file implements routines which parse XML-based
39 character set and collation description files.
40
41 Unicode collations are encoded according to
42
43 Unicode Technical Standard #35
44 Locale Data Markup Language (LDML)
45 http://www.unicode.org/reports/tr35/
46
47 and converted into ICU string according to
48
49 Collation Customization
50 http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
51
52 */
53
54
/*
  Avoid using my_snprintf.
  We cannot use my_snprintf() here, because ctype.o is
  used to build conf_to_src, which must have minimal
  dependencies.

  The name is redefined as a string literal so that any accidental
  call "my_snprintf(...)" in this file fails to compile.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"


/*
  Optional callback pointer, NULL by default.
  It is not called anywhere in the visible part of this file.
*/
int (*my_string_stack_guard)(int)= NULL;
67
/*
  Copy the first min(l1, l2) bytes of src into str and NUL-terminate.

  @param str  Destination; must have room for min(l1, l2) + 1 bytes
  @param src  Source bytes (need not be NUL-terminated)
  @param l1   Number of bytes available in src
  @param l2   Maximum number of bytes to copy

  @return str
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
75
/*
  One row of the tag lookup table: a '/'-separated tag path
  and the state code associated with it.
  Field order matters: the table uses positional initializers.
*/
struct my_cs_file_section_st
{
  int state;            /* one of the _CS_* codes */
  const char *str;      /* tag path; NULL marks the end of the table */
};
81
/*
  State codes attached to the tag paths in sec[].

  They are kept as enumerators with explicit values so that they are
  visible to a debugger and cannot be affected by macro expansion.
  The values are the same as before.

  Within each DIFF1..IDENTICAL group the values must stay consecutive
  and in this order: cs_value() indexes the format tables with
  (state - _CS_DIFF1), (state - _CS_EXP_DIFF1) and (state - _CS_A_DIFF1).
*/
enum
{
  _CS_MISC=       1,
  _CS_ID=         2,
  _CS_CSNAME=     3,
  _CS_FAMILY=     4,
  _CS_ORDER=      5,
  _CS_COLNAME=    6,
  _CS_FLAG=       7,
  _CS_CHARSET=    8,
  _CS_COLLATION=  9,
  _CS_UPPERMAP=   10,
  _CS_LOWERMAP=   11,
  _CS_UNIMAP=     12,
  _CS_COLLMAP=    13,
  _CS_CTYPEMAP=   14,
  _CS_PRIMARY_ID= 15,
  _CS_BINARY_ID=  16,
  _CS_CSDESCRIPT= 17,

  /* Special purpose commands */
  _CS_UCA_VERSION=              100,
  _CS_CL_SUPPRESS_CONTRACTIONS= 101,
  _CS_CL_OPTIMIZE=              102,
  _CS_CL_SHIFT_AFTER_METHOD=    103,

  /* Collation Settings */
  _CS_ST_SETTINGS=            200,
  _CS_ST_STRENGTH=            201,
  _CS_ST_ALTERNATE=           202,
  _CS_ST_BACKWARDS=           203,
  _CS_ST_NORMALIZATION=       204,
  _CS_ST_CASE_LEVEL=          205,
  _CS_ST_CASE_FIRST=          206,
  _CS_ST_HIRAGANA_QUATERNARY= 207,
  _CS_ST_NUMERIC=             208,
  _CS_ST_VARIABLE_TOP=        209,
  _CS_ST_MATCH_BOUNDARIES=    210,
  _CS_ST_MATCH_STYLE=         211,

  /* Rules */
  _CS_RULES=     300,
  _CS_RESET=     301,
  _CS_DIFF1=     302,
  _CS_DIFF2=     303,
  _CS_DIFF3=     304,
  _CS_DIFF4=     305,
  _CS_IDENTICAL= 306,

  /* Rules: Expansions */
  _CS_EXP_X=         320,
  _CS_EXP_EXTEND=    321,
  _CS_EXP_DIFF1=     322,
  _CS_EXP_DIFF2=     323,
  _CS_EXP_DIFF3=     324,
  _CS_EXP_DIFF4=     325,
  _CS_EXP_IDENTICAL= 326,

  /* Rules: Abbreviating Ordering Specifications */
  _CS_A_DIFF1=     351,
  _CS_A_DIFF2=     352,
  _CS_A_DIFF3=     353,
  _CS_A_DIFF4=     354,
  _CS_A_IDENTICAL= 355,

  /* Rules: previous context */
  _CS_CONTEXT= 370,

  /* Rules: Placing Characters Before Others */
  _CS_RESET_BEFORE= 380,

  /* Rules: Logical Reset Positions */
  _CS_RESET_FIRST_PRIMARY_IGNORABLE=   401,
  _CS_RESET_LAST_PRIMARY_IGNORABLE=    402,
  _CS_RESET_FIRST_SECONDARY_IGNORABLE= 403,
  _CS_RESET_LAST_SECONDARY_IGNORABLE=  404,
  _CS_RESET_FIRST_TERTIARY_IGNORABLE=  405,
  _CS_RESET_LAST_TERTIARY_IGNORABLE=   406,
  _CS_RESET_FIRST_TRAILING=            407,
  _CS_RESET_LAST_TRAILING=             408,
  _CS_RESET_FIRST_VARIABLE=            409,
  _CS_RESET_LAST_VARIABLE=             410,
  _CS_RESET_FIRST_NON_IGNORABLE=       411,
  _CS_RESET_LAST_NON_IGNORABLE=        412
};
167
168
169
170 static struct my_cs_file_section_st sec[] =
171 {
172 {_CS_MISC, "xml"},
173 {_CS_MISC, "xml/version"},
174 {_CS_MISC, "xml/encoding"},
175 {_CS_MISC, "charsets"},
176 {_CS_MISC, "charsets/max-id"},
177 {_CS_MISC, "charsets/copyright"},
178 {_CS_MISC, "charsets/description"},
179 {_CS_CHARSET, "charsets/charset"},
180 {_CS_PRIMARY_ID, "charsets/charset/primary-id"},
181 {_CS_BINARY_ID, "charsets/charset/binary-id"},
182 {_CS_CSNAME, "charsets/charset/name"},
183 {_CS_FAMILY, "charsets/charset/family"},
184 {_CS_CSDESCRIPT, "charsets/charset/description"},
185 {_CS_MISC, "charsets/charset/alias"},
186 {_CS_MISC, "charsets/charset/ctype"},
187 {_CS_CTYPEMAP, "charsets/charset/ctype/map"},
188 {_CS_MISC, "charsets/charset/upper"},
189 {_CS_UPPERMAP, "charsets/charset/upper/map"},
190 {_CS_MISC, "charsets/charset/lower"},
191 {_CS_LOWERMAP, "charsets/charset/lower/map"},
192 {_CS_MISC, "charsets/charset/unicode"},
193 {_CS_UNIMAP, "charsets/charset/unicode/map"},
194 {_CS_COLLATION, "charsets/charset/collation"},
195 {_CS_COLNAME, "charsets/charset/collation/name"},
196 {_CS_ID, "charsets/charset/collation/id"},
197 {_CS_ORDER, "charsets/charset/collation/order"},
198 {_CS_FLAG, "charsets/charset/collation/flag"},
199 {_CS_COLLMAP, "charsets/charset/collation/map"},
200
201 /* Special purpose commands */
202 {_CS_UCA_VERSION, "charsets/charset/collation/version"},
203 {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
204 {_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"},
205 {_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"},
206
207 /* Collation Settings */
208 {_CS_ST_SETTINGS, "charsets/charset/collation/settings"},
209 {_CS_ST_STRENGTH, "charsets/charset/collation/settings/strength"},
210 {_CS_ST_ALTERNATE, "charsets/charset/collation/settings/alternate"},
211 {_CS_ST_BACKWARDS, "charsets/charset/collation/settings/backwards"},
212 {_CS_ST_NORMALIZATION, "charsets/charset/collation/settings/normalization"},
213 {_CS_ST_CASE_LEVEL, "charsets/charset/collation/settings/caseLevel"},
214 {_CS_ST_CASE_FIRST, "charsets/charset/collation/settings/caseFirst"},
215 {_CS_ST_HIRAGANA_QUATERNARY, "charsets/charset/collation/settings/hiraganaQuaternary"},
216 {_CS_ST_NUMERIC, "charsets/charset/collation/settings/numeric"},
217 {_CS_ST_VARIABLE_TOP, "charsets/charset/collation/settings/variableTop"},
218 {_CS_ST_MATCH_BOUNDARIES, "charsets/charset/collation/settings/match-boundaries"},
219 {_CS_ST_MATCH_STYLE, "charsets/charset/collation/settings/match-style"},
220
221 /* Rules */
222 {_CS_RULES, "charsets/charset/collation/rules"},
223 {_CS_RESET, "charsets/charset/collation/rules/reset"},
224 {_CS_DIFF1, "charsets/charset/collation/rules/p"},
225 {_CS_DIFF2, "charsets/charset/collation/rules/s"},
226 {_CS_DIFF3, "charsets/charset/collation/rules/t"},
227 {_CS_DIFF4, "charsets/charset/collation/rules/q"},
228 {_CS_IDENTICAL, "charsets/charset/collation/rules/i"},
229
230 /* Rules: expansions */
231 {_CS_EXP_X, "charsets/charset/collation/rules/x"},
232 {_CS_EXP_EXTEND, "charsets/charset/collation/rules/x/extend"},
233 {_CS_EXP_DIFF1, "charsets/charset/collation/rules/x/p"},
234 {_CS_EXP_DIFF2, "charsets/charset/collation/rules/x/s"},
235 {_CS_EXP_DIFF3, "charsets/charset/collation/rules/x/t"},
236 {_CS_EXP_DIFF4, "charsets/charset/collation/rules/x/q"},
237 {_CS_EXP_IDENTICAL, "charsets/charset/collation/rules/x/i"},
238
239 /* Rules: previous context */
240 {_CS_CONTEXT, "charsets/charset/collation/rules/x/context"},
241
242 /* Rules: Abbreviating Ordering Specifications */
243 {_CS_A_DIFF1, "charsets/charset/collation/rules/pc"},
244 {_CS_A_DIFF2, "charsets/charset/collation/rules/sc"},
245 {_CS_A_DIFF3, "charsets/charset/collation/rules/tc"},
246 {_CS_A_DIFF4, "charsets/charset/collation/rules/qc"},
247 {_CS_A_IDENTICAL, "charsets/charset/collation/rules/ic"},
248
249 /* Rules: Placing Characters Before Others*/
250 {_CS_RESET_BEFORE, "charsets/charset/collation/rules/reset/before"},
251
252 /* Rules: Logical Reset Positions */
253 {_CS_RESET_FIRST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/first_non_ignorable"},
254 {_CS_RESET_LAST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/last_non_ignorable"},
255 {_CS_RESET_FIRST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_primary_ignorable"},
256 {_CS_RESET_LAST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_primary_ignorable"},
257 {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"},
258 {_CS_RESET_LAST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_secondary_ignorable"},
259 {_CS_RESET_FIRST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_tertiary_ignorable"},
260 {_CS_RESET_LAST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_tertiary_ignorable"},
261 {_CS_RESET_FIRST_TRAILING, "charsets/charset/collation/rules/reset/first_trailing"},
262 {_CS_RESET_LAST_TRAILING, "charsets/charset/collation/rules/reset/last_trailing"},
263 {_CS_RESET_FIRST_VARIABLE, "charsets/charset/collation/rules/reset/first_variable"},
264 {_CS_RESET_LAST_VARIABLE, "charsets/charset/collation/rules/reset/last_variable"},
265
266 {0, NULL}
267 };
268
cs_file_sec(const char * attr,size_t len)269 static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
270 {
271 struct my_cs_file_section_st *s;
272 for (s= sec; s->str; s++)
273 {
274 if (!strncmp(attr, s->str, len) && s->str[len] == 0)
275 return s;
276 }
277 return NULL;
278 }
279
/*
  Buffer sizes used by the charset file parser.
  The tailoring size is parenthesized so that the macro expands
  correctly inside any expression (e.g. x / MY_CS_TAILORING_SIZE).
*/
#define MY_CS_CSDESCR_SIZE      64
#define MY_CS_TAILORING_SIZE    (32*1024)
#define MY_CS_UCA_VERSION_SIZE  64
#define MY_CS_CONTEXT_SIZE      64
284
285 typedef struct my_cs_file_info
286 {
287 char csname[MY_CS_NAME_SIZE];
288 char name[MY_CS_NAME_SIZE];
289 uchar ctype[MY_CS_CTYPE_TABLE_SIZE];
290 uchar to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
291 uchar to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
292 uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
293 uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
294 char comment[MY_CS_CSDESCR_SIZE];
295 char *tailoring;
296 size_t tailoring_length;
297 size_t tailoring_alloced_length;
298 char context[MY_CS_CONTEXT_SIZE];
299 CHARSET_INFO cs;
300 MY_CHARSET_LOADER *loader;
301 } MY_CHARSET_FILE;
302
303
304 static void
my_charset_file_reset_charset(MY_CHARSET_FILE * i)305 my_charset_file_reset_charset(MY_CHARSET_FILE *i)
306 {
307 memset(&i->cs, 0, sizeof(i->cs));
308 }
309
310
311 static void
my_charset_file_reset_collation(MY_CHARSET_FILE * i)312 my_charset_file_reset_collation(MY_CHARSET_FILE *i)
313 {
314 i->tailoring_length= 0;
315 i->context[0]= '\0';
316 }
317
318
319 static void
my_charset_file_init(MY_CHARSET_FILE * i)320 my_charset_file_init(MY_CHARSET_FILE *i)
321 {
322 my_charset_file_reset_charset(i);
323 my_charset_file_reset_collation(i);
324 i->tailoring= NULL;
325 i->tailoring_alloced_length= 0;
326 }
327
328
329 static void
my_charset_file_free(MY_CHARSET_FILE * i)330 my_charset_file_free(MY_CHARSET_FILE *i)
331 {
332 i->loader->free(i->tailoring);
333 }
334
335
336 static int
my_charset_file_tailoring_realloc(MY_CHARSET_FILE * i,size_t newlen)337 my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen)
338 {
339 if (i->tailoring_alloced_length > newlen ||
340 (i->tailoring= i->loader->realloc(i->tailoring,
341 (i->tailoring_alloced_length=
342 (newlen + 32*1024)))))
343 {
344 return MY_XML_OK;
345 }
346 return MY_XML_ERROR;
347 }
348
349
/*
  Parse whitespace-separated hexadecimal numbers into a byte array.

  @param a     Destination array with room for "size" elements
  @param size  Number of elements in "a"
  @param str   Text to parse; need not be NUL-terminated
  @param len   Length of str in bytes

  @return Always 0

  At most "size" values are stored; further tokens are ignored.
  (The previous test "i > size" let one extra value be written
  to a[size], one past the end of the array.)
*/
static int fill_uchar(uchar *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e=str+len;

  for (s=str ; s < e ; i++)
  {
    /*
      Skip separators.  strchr() also matches the terminator of the
      separator string, so a NUL byte in the input is a separator too.
    */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    /* Find the end of the token */
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    if (s == b || i >= size)
      break;
    /*
      NOTE(review): strtoul() stops at the first non-hex character, not
      at "e"; presumably the text is always followed by one -- confirm.
    */
    a[i]= (uchar) strtoul(b,NULL,16);
  }
  return 0;
}
366
/*
  Parse whitespace-separated hexadecimal numbers into a uint16 array.

  @param a     Destination array with room for "size" elements
  @param size  Number of elements in "a"
  @param str   Text to parse; need not be NUL-terminated
  @param len   Length of str in bytes

  @return Always 0

  At most "size" values are stored; further tokens are ignored.
  (The previous test "i > size" let one extra value be written
  to a[size], one past the end of the array.)
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;

  const char *s, *b, *e=str+len;
  for (s=str ; s < e ; i++)
  {
    /*
      Skip separators.  strchr() also matches the terminator of the
      separator string, so a NUL byte in the input is a separator too.
    */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    /* Find the end of the token */
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    if (s == b || i >= size)
      break;
    /*
      NOTE(review): strtol() stops at the first non-hex character, not
      at "e"; presumably the text is always followed by one -- confirm.
    */
    a[i]= (uint16) strtol(b,NULL,16);
  }
  return 0;
}
383
384
385
386
387 static int
tailoring_append(MY_XML_PARSER * st,const char * fmt,size_t len,const char * attr)388 tailoring_append(MY_XML_PARSER *st,
389 const char *fmt, size_t len, const char *attr)
390 {
391 struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
392 size_t newlen= i->tailoring_length + len + 64; /* 64 for format */
393 if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
394 {
395 char *dst= i->tailoring + i->tailoring_length;
396 sprintf(dst, fmt, (int) len, attr);
397 i->tailoring_length+= strlen(dst);
398 return MY_XML_OK;
399 }
400 return MY_XML_ERROR;
401 }
402
403
404 static int
tailoring_append2(MY_XML_PARSER * st,const char * fmt,size_t len1,const char * attr1,size_t len2,const char * attr2)405 tailoring_append2(MY_XML_PARSER *st,
406 const char *fmt,
407 size_t len1, const char *attr1,
408 size_t len2, const char *attr2)
409 {
410 struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
411 size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */
412 if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
413 {
414 char *dst= i->tailoring + i->tailoring_length;
415 sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2);
416 i->tailoring_length+= strlen(dst);
417 return MY_XML_OK;
418 }
419 return MY_XML_ERROR;
420 }
421
422
423 static size_t
scan_one_character(const char * s,const char * e,my_wc_t * wc)424 scan_one_character(const char *s, const char *e, my_wc_t *wc)
425 {
426 CHARSET_INFO *cs= &my_charset_utf8_general_ci;
427 if (s >= e)
428 return 0;
429
430 /* Escape sequence: \uXXXX */
431 if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2]))
432 {
433 size_t len= 3; /* We have at least one digit */
434 for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++)
435 {
436 }
437 wc[0]= 0;
438 return len;
439 }
440 else if ((s[0] & 0x80) == 0) /* 7-bit character */
441 {
442 wc[0]= 0;
443 return 1;
444 }
445 else /* Non-escaped character */
446 {
447 int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e);
448 if (rc > 0)
449 return (size_t) rc;
450 }
451 return 0;
452 }
453
454
455 static int
tailoring_append_abbreviation(MY_XML_PARSER * st,const char * fmt,size_t len,const char * attr)456 tailoring_append_abbreviation(MY_XML_PARSER *st,
457 const char *fmt, size_t len, const char *attr)
458 {
459 size_t clen;
460 const char *attrend= attr + len;
461 my_wc_t wc;
462
463 for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen)
464 {
465 DBUG_ASSERT(attr < attrend);
466 if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK)
467 return MY_XML_ERROR;
468 }
469 return MY_XML_OK;
470 }
471
472
/*
  Handler registered as the XML parser's "enter" callback.

  @param st    Parser whose user_data is the parse state
  @param attr  Tag path, looked up in sec[]
  @param len   Length of attr in bytes

  @return MY_XML_OK, or the result of tailoring_append() for a reset tag

  Unknown paths are only reported as a warning; parsing continues.
*/
static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
  struct my_cs_file_section_st *s= cs_file_sec(attr,len);
  int state= s ? s->state : 0;

  switch (state) {
  case 0:
    /* The '*' precision of "%.*s" takes an int, not a size_t */
    i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'",
                        (int) len, attr);
    break;

  case _CS_CHARSET:
    my_charset_file_reset_charset(i);
    break;

  case _CS_COLLATION:
    my_charset_file_reset_collation(i);
    break;

  case _CS_RESET:
    /* Every reset starts a new " &" clause in the tailoring text */
    return tailoring_append(st, " &", 0, NULL);

  default:
    break;
  }
  return MY_XML_OK;
}
500
501
/*
  Handler registered as the XML parser's "leave" callback.

  When a collation ends, the collected tailoring (if any) is attached to
  the CHARSET_INFO and the loader's add_collation callback, if set, is
  invoked.  When a logical reset position ends, its bracketed keyword is
  appended to the tailoring text.  Everything else is ignored.

  @param st    Parser whose user_data is the parse state
  @param attr  Tag path, looked up in sec[]
  @param len   Length of attr in bytes

  @return MY_XML_OK, or the result of add_collation / tailoring_append()
*/
static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *)st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr,len);
  const char *position;

  if (!section)
    return MY_XML_OK;

  switch (section->state) {
  case _CS_COLLATION:
    if (info->tailoring_length)
      info->cs.tailoring= info->tailoring;
    if (!info->loader->add_collation)
      return MY_XML_OK;
    return info->loader->add_collation(&info->cs);

  /* Rules: Logical Reset Positions */
  case _CS_RESET_FIRST_NON_IGNORABLE:
    position= "[first non-ignorable]";
    break;
  case _CS_RESET_LAST_NON_IGNORABLE:
    position= "[last non-ignorable]";
    break;
  case _CS_RESET_FIRST_PRIMARY_IGNORABLE:
    position= "[first primary ignorable]";
    break;
  case _CS_RESET_LAST_PRIMARY_IGNORABLE:
    position= "[last primary ignorable]";
    break;
  case _CS_RESET_FIRST_SECONDARY_IGNORABLE:
    position= "[first secondary ignorable]";
    break;
  case _CS_RESET_LAST_SECONDARY_IGNORABLE:
    position= "[last secondary ignorable]";
    break;
  case _CS_RESET_FIRST_TERTIARY_IGNORABLE:
    position= "[first tertiary ignorable]";
    break;
  case _CS_RESET_LAST_TERTIARY_IGNORABLE:
    position= "[last tertiary ignorable]";
    break;
  case _CS_RESET_FIRST_TRAILING:
    position= "[first trailing]";
    break;
  case _CS_RESET_LAST_TRAILING:
    position= "[last trailing]";
    break;
  case _CS_RESET_FIRST_VARIABLE:
    position= "[first variable]";
    break;
  case _CS_RESET_LAST_VARIABLE:
    position= "[last variable]";
    break;

  default:
    return MY_XML_OK;
  }

  return tailoring_append(st, position, 0, NULL);
}
570
571
/*
  Formats for one ordering rule, indexed by (state - first DIFF state)
  of the group the state belongs to.
*/
static const char *diff_fmt[5]=
{
  "<%.*s",      /* DIFF1 */
  "<<%.*s",     /* DIFF2 */
  "<<<%.*s",    /* DIFF3 */
  "<<<<%.*s",   /* DIFF4 */
  "=%.*s"       /* IDENTICAL */
};
580
581
/*
  Formats for one expansion rule that has a pending context:
  the first "%.*s" is the context, the second the rule text.
  Indexed by (state - _CS_EXP_DIFF1).
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",      /* DIFF1 */
  "<<%.*s|%.*s",     /* DIFF2 */
  "<<<%.*s|%.*s",    /* DIFF3 */
  "<<<<%.*s|%.*s",   /* DIFF4 */
  "=%.*s|%.*s"       /* IDENTICAL */
};
590
591
/*
  Handler registered as the XML parser's "value" callback.

  The element the value belongs to is taken from the parser's current
  path (st->attr), not from the arguments.  Depending on the element the
  value is stored into the CHARSET_INFO being built, parsed into one of
  the mapping tables, remembered as context, or appended to the
  tailoring text.

  @param st    Parser whose user_data is the parse state
  @param attr  The value text; need not be NUL-terminated
  @param len   Length of attr in bytes

  @return MY_XML_OK, or MY_XML_ERROR if appending to the tailoring failed
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *)st->user_data;
  struct my_cs_file_section_st *section=
    cs_file_sec(st->attr.start, st->attr.end - st->attr.start);
  int state= section ? section->state : 0;
  /* Set by the cases that simply append the value with one format */
  const char *fmt= NULL;
  int rc= MY_XML_OK;

  switch (state) {
  case _CS_MISC:
  case _CS_FAMILY:
  case _CS_ORDER:
    break;

  /* Numeric identifiers */
  case _CS_ID:
    info->cs.number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_BINARY_ID:
    info->cs.binary_number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_PRIMARY_ID:
    info->cs.primary_number= strtol(attr,(char**)NULL,10);
    break;

  /* Names and description, copied (and truncated) into local storage */
  case _CS_COLNAME:
    info->cs.name= mstr(info->name, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    info->cs.csname= mstr(info->csname, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    info->cs.comment= mstr(info->comment, attr, len, MY_CS_CSDESCR_SIZE-1);
    break;

  case _CS_FLAG:
    /* Only the first len bytes are compared, so a prefix matches too */
    if (!strncmp("primary",attr,len))
      info->cs.state|= MY_CS_PRIMARY;
    else if (!strncmp("binary",attr,len))
      info->cs.state|= MY_CS_BINSORT;
    else if (!strncmp("compiled",attr,len))
      info->cs.state|= MY_CS_COMPILED;
    break;

  /* Mapping tables */
  case _CS_UPPERMAP:
    fill_uchar(info->to_upper, MY_CS_TO_UPPER_TABLE_SIZE, attr, len);
    info->cs.to_upper= info->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(info->to_lower, MY_CS_TO_LOWER_TABLE_SIZE, attr, len);
    info->cs.to_lower= info->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(info->tab_to_uni, MY_CS_TO_UNI_TABLE_SIZE, attr, len);
    info->cs.tab_to_uni= info->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(info->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE, attr, len);
    info->cs.sort_order= info->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(info->ctype, MY_CS_CTYPE_TABLE_SIZE, attr, len);
    info->cs.ctype= info->ctype;
    break;

  /* Special purpose commands */
  case _CS_UCA_VERSION:
    fmt= "[version %.*s]";
    break;
  case _CS_CL_SUPPRESS_CONTRACTIONS:
    fmt= "[suppress contractions %.*s]";
    break;
  case _CS_CL_OPTIMIZE:
    fmt= "[optimize %.*s]";
    break;
  case _CS_CL_SHIFT_AFTER_METHOD:
    fmt= "[shift-after-method %.*s]";
    break;

  /* Collation Settings */
  case _CS_ST_STRENGTH:
    /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
    fmt= "[strength %.*s]";
    break;
  case _CS_ST_ALTERNATE:
    /* non-ignorable, shifted */
    fmt= "[alternate %.*s]";
    break;
  case _CS_ST_BACKWARDS:
    /* on, off, 2 */
    fmt= "[backwards %.*s]";
    break;
  case _CS_ST_NORMALIZATION:
    /*
      TODO for WL#896: check collations for normalization: vi.xml
      We want precomposed characters work well at this point.
    */
    /* on, off */
    fmt= "[normalization %.*s]";
    break;
  case _CS_ST_CASE_LEVEL:
    /* on, off */
    fmt= "[caseLevel %.*s]";
    break;
  case _CS_ST_CASE_FIRST:
    /* upper, lower, off */
    fmt= "[caseFirst %.*s]";
    break;
  case _CS_ST_HIRAGANA_QUATERNARY:
    /* on, off */
    fmt= "[hiraganaQ %.*s]";
    break;
  case _CS_ST_NUMERIC:
    /* on, off */
    fmt= "[numeric %.*s]";
    break;
  case _CS_ST_VARIABLE_TOP:
    /* TODO for WL#896: check value format */
    fmt= "[variableTop %.*s]";
    break;
  case _CS_ST_MATCH_BOUNDARIES:
    /* none, whole-character, whole-word */
    fmt= "[match-boundaries %.*s]";
    break;
  case _CS_ST_MATCH_STYLE:
    /* minimal, medial, maximal */
    fmt= "[match-style %.*s]";
    break;

  /* Rules */
  case _CS_RESET:
    fmt= "%.*s";
    break;

  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
  case _CS_DIFF4:
  case _CS_IDENTICAL:
    fmt= diff_fmt[state - _CS_DIFF1];
    break;

  /* Rules: Expansion */
  case _CS_EXP_EXTEND:
    fmt= " / %.*s";
    break;

  case _CS_EXP_DIFF1:
  case _CS_EXP_DIFF2:
  case _CS_EXP_DIFF3:
  case _CS_EXP_DIFF4:
  case _CS_EXP_IDENTICAL:
    if (info->context[0])
    {
      /* A context was seen before this rule: emit "context|text" once */
      rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
                            strlen(info->context), info->context, len, attr);
      info->context[0]= 0;
    }
    else
      fmt= diff_fmt[state - _CS_EXP_DIFF1];
    break;

  /* Rules: Context */
  case _CS_CONTEXT:
    /* A context that does not fit the buffer is silently dropped */
    if (len < sizeof(info->context))
    {
      memcpy(info->context, attr, len);
      info->context[len]= '\0';
    }
    break;

  /* Rules: Abbreviating Ordering Specifications */
  case _CS_A_DIFF1:
  case _CS_A_DIFF2:
  case _CS_A_DIFF3:
  case _CS_A_DIFF4:
  case _CS_A_IDENTICAL:
    rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1],
                                      len, attr);
    break;

  /* Rules: Placing Characters Before Others */
  case _CS_RESET_BEFORE:
    /*
      TODO for WL#896: Add this check into text customization parser:
      It is an error if the strength of the before relation is not identical
      to the relation after the reset. We'll need this for WL#896.
    */
    fmt= "[before %.*s]";
    break;

  default:
    break;
  }

  if (fmt)
    return tailoring_append(st, fmt, len, attr);
  return rc;
}
800
801
802 my_bool
my_parse_charset_xml(MY_CHARSET_LOADER * loader,const char * buf,size_t len)803 my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len)
804 {
805 MY_XML_PARSER p;
806 struct my_cs_file_info info;
807 my_bool rc;
808
809 my_charset_file_init(&info);
810 my_xml_parser_create(&p);
811 my_xml_set_enter_handler(&p,cs_enter);
812 my_xml_set_value_handler(&p,cs_value);
813 my_xml_set_leave_handler(&p,cs_leave);
814 info.loader= loader;
815 my_xml_set_user_data(&p, (void *) &info);
816 rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
817 my_xml_parser_free(&p);
818 my_charset_file_free(&info);
819 if (rc != MY_XML_OK)
820 {
821 const char *errstr= my_xml_error_string(&p);
822 if (sizeof(loader->error) > 32 + strlen(errstr))
823 {
824 /* We cannot use my_snprintf() here. See previous comment. */
825 sprintf(loader->error, "at line %d pos %d: %s",
826 my_xml_error_lineno(&p)+1,
827 (int) my_xml_error_pos(&p),
828 my_xml_error_string(&p));
829 }
830 }
831 return rc;
832 }
833
834
835 /*
836 Check repertoire: detect pure ascii strings
837 */
838 uint
my_string_repertoire(const CHARSET_INFO * cs,const char * str,ulong length)839 my_string_repertoire(const CHARSET_INFO *cs, const char *str, ulong length)
840 {
841 const char *strend= str + length;
842 if (cs->mbminlen == 1)
843 {
844 for ( ; str < strend; str++)
845 {
846 if (((uchar) *str) > 0x7F)
847 return MY_REPERTOIRE_UNICODE30;
848 }
849 }
850 else
851 {
852 my_wc_t wc;
853 int chlen;
854 for (;
855 (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0;
856 str+= chlen)
857 {
858 if (wc > 0x7F)
859 return MY_REPERTOIRE_UNICODE30;
860 }
861 }
862 return MY_REPERTOIRE_ASCII;
863 }
864
865
866 /*
867 Returns repertoire for charset
868 */
my_charset_repertoire(const CHARSET_INFO * cs)869 uint my_charset_repertoire(const CHARSET_INFO *cs)
870 {
871 return cs->state & MY_CS_PUREASCII ?
872 MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
873 }
874
875
876 /*
877 Detect whether a character set is ASCII compatible.
878
879 Returns TRUE for:
880
881 - all 8bit character sets whose Unicode mapping of 0x7B is '{'
882 (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
883
884 - all multi-byte character sets having mbminlen == 1
885 (ignores ucs2 whose mbminlen is 2)
886
887 TODO:
888
889 When merging to 5.2, this function should be changed
890 to check a new flag MY_CS_NONASCII,
891
892 return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
893
894 This flag was previously added into 5.2 under terms
895 of WL#3759 "Optimize identifier conversion in client-server protocol"
896 especially to mark character sets not compatible with ASCII.
897
898 We won't backport this flag to 5.0 or 5.1.
899 This function is Ok for 5.0 and 5.1, because we're not going
900 to introduce new tricky character sets between 5.0 and 5.2.
901 */
902 my_bool
my_charset_is_ascii_based(const CHARSET_INFO * cs)903 my_charset_is_ascii_based(const CHARSET_INFO *cs)
904 {
905 return
906 (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
907 (cs->mbminlen == 1 && cs->mbmaxlen > 1);
908 }
909
910
911 /*
912 Detect if a character set is 8bit,
913 and it is pure ascii, i.e. doesn't have
914 characters outside U+0000..U+007F
915 This functions is shared between "conf_to_src"
916 and dynamic charsets loader in "mysqld".
917 */
918 my_bool
my_charset_is_8bit_pure_ascii(const CHARSET_INFO * cs)919 my_charset_is_8bit_pure_ascii(const CHARSET_INFO *cs)
920 {
921 size_t code;
922 if (!cs->tab_to_uni)
923 return 0;
924 for (code= 0; code < 256; code++)
925 {
926 if (cs->tab_to_uni[code] > 0x7F)
927 return 0;
928 }
929 return 1;
930 }
931
932
933 /*
934 Shared function between conf_to_src and mysys.
935 Check if a 8bit character set is compatible with
936 ascii on the range 0x00..0x7F.
937 */
938 my_bool
my_charset_is_ascii_compatible(const CHARSET_INFO * cs)939 my_charset_is_ascii_compatible(const CHARSET_INFO *cs)
940 {
941 uint i;
942 if (!cs->tab_to_uni)
943 return 1;
944 for (i= 0; i < 128; i++)
945 {
946 if (cs->tab_to_uni[i] != i)
947 return 0;
948 }
949 return 1;
950 }
951
952
953 /*
954 Convert a string between two character sets.
955 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
956
957 @param to[OUT] Store result here
958 @param to_length Size of "to" buffer
959 @param to_cs Character set of result string
960 @param from Copy from here
961 @param from_length Length of the "from" string
962 @param from_cs Character set of the "from" string
963 @param errors[OUT] Number of conversion errors
964
965 @return Number of bytes copied to 'to' string
966 */
967
968 static uint32
my_convert_internal(char * to,uint32 to_length,const CHARSET_INFO * to_cs,const char * from,uint32 from_length,const CHARSET_INFO * from_cs,uint * errors)969 my_convert_internal(char *to, uint32 to_length,
970 const CHARSET_INFO *to_cs,
971 const char *from, uint32 from_length,
972 const CHARSET_INFO *from_cs, uint *errors)
973 {
974 int cnvres;
975 my_wc_t wc;
976 const uchar *from_end= (const uchar*) from + from_length;
977 char *to_start= to;
978 uchar *to_end= (uchar*) to + to_length;
979 my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
980 my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
981 uint error_count= 0;
982
983 while (1)
984 {
985 if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
986 from+= cnvres;
987 else if (cnvres == MY_CS_ILSEQ)
988 {
989 error_count++;
990 from++;
991 wc= '?';
992 }
993 else if (cnvres > MY_CS_TOOSMALL)
994 {
995 /*
996 A correct multibyte sequence detected
997 But it doesn't have Unicode mapping.
998 */
999 error_count++;
1000 from+= (-cnvres);
1001 wc= '?';
1002 }
1003 else
1004 break; // Not enough characters
1005
1006 outp:
1007 if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
1008 to+= cnvres;
1009 else if (cnvres == MY_CS_ILUNI && wc != '?')
1010 {
1011 error_count++;
1012 wc= '?';
1013 goto outp;
1014 }
1015 else
1016 break;
1017 }
1018 *errors= error_count;
1019 return (uint32) (to - to_start);
1020 }
1021
1022
1023 /*
1024 Convert a string between two character sets.
1025 Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
1026 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
1027
1028 @param to[OUT] Store result here
1029 @param to_length Size of "to" buffer
1030 @param to_cs Character set of result string
1031 @param from Copy from here
1032 @param from_length Length of the "from" string
1033 @param from_cs Character set of the "from" string
1034 @param errors[OUT] Number of conversion errors
1035
1036 @return Number of bytes copied to 'to' string
1037 */
1038
1039 uint32
my_convert(char * to,uint32 to_length,const CHARSET_INFO * to_cs,const char * from,uint32 from_length,const CHARSET_INFO * from_cs,uint * errors)1040 my_convert(char *to, uint32 to_length, const CHARSET_INFO *to_cs,
1041 const char *from, uint32 from_length,
1042 const CHARSET_INFO *from_cs, uint *errors)
1043 {
1044 uint32 length, length2;
1045 /*
1046 If any of the character sets is not ASCII compatible,
1047 immediately switch to slow mb_wc->wc_mb method.
1048 */
1049 if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
1050 return my_convert_internal(to, to_length, to_cs,
1051 from, from_length, from_cs, errors);
1052
1053 length= length2= MY_MIN(to_length, from_length);
1054
1055 #if defined(__i386__)
1056 /*
1057 Special loop for i386, it allows to refer to a
1058 non-aligned memory block as UINT32, which makes
1059 it possible to copy four bytes at once. This
1060 gives about 10% performance improvement comparing
1061 to byte-by-byte loop.
1062 */
1063 for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
1064 {
1065 if ((*(uint32*)from) & 0x80808080)
1066 break;
1067 *((uint32*) to)= *((const uint32*) from);
1068 }
1069 #endif /* __i386__ */
1070
1071 for (; ; *to++= *from++, length--)
1072 {
1073 if (!length)
1074 {
1075 *errors= 0;
1076 return length2;
1077 }
1078 if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
1079 {
1080 uint32 copied_length= length2 - length;
1081 to_length-= copied_length;
1082 from_length-= copied_length;
1083 return copied_length + my_convert_internal(to, to_length, to_cs,
1084 from, from_length, from_cs,
1085 errors);
1086 }
1087 }
1088
1089 DBUG_ASSERT(FALSE); // Should never get to here
1090 return 0; // Make compiler happy
1091 }
1092