1 /* Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 #ifndef M_CTYPE_INCLUDED
24 #define M_CTYPE_INCLUDED
25 
26 /**
27   @file include/m_ctype.h
28   A better implementation of the UNIX ctype(3) library.
29 */
30 
31 #ifndef __cplusplus
32 #include <stdbool.h>
33 #endif
34 #include <stddef.h>
35 #include <sys/types.h>
36 
37 #include "my_compiler.h"
38 #include "my_inttypes.h"
39 #include "my_loglevel.h"
40 #include "my_macros.h"
41 #include "my_sharedlib.h"
42 #include "template_utils.h"
43 
/* Size limit for character set names. NOTE(review): presumably in bytes. */
#define MY_CS_NAME_SIZE 32

/*
  Number of entries in the per-charset lookup tables.
  The ctype table has one entry more than the others: the my_is*() macros in
  this header index it as (ctype + 1)[(uchar)c], i.e. there is one extra
  leading slot in front of the 256 per-byte entries.
*/
#define MY_CS_CTYPE_TABLE_SIZE 257
#define MY_CS_TO_LOWER_TABLE_SIZE 256
#define MY_CS_TO_UPPER_TABLE_SIZE 256
#define MY_CS_SORT_ORDER_TABLE_SIZE 256
#define MY_CS_TO_UNI_TABLE_SIZE 256

/* NOTE(review): presumably the sub-directory holding charset definitions. */
#define CHARSET_DIR "charsets/"
52 
53 /**
54   Our own version of wchar_t, ie., a type that holds a single Unicode code point
55   ("wide character"). ulong is always big enough to hold any character
56   in the BMP.
57 */
58 typedef ulong my_wc_t;
59 
60 #define MY_CS_REPLACEMENT_CHARACTER 0xFFFD
61 
/**
  Store a 16-bit value as two bytes, most significant byte first.

  @param[out] s    Destination; s[0] and s[1] are written.
  @param      code Value to store.
*/
static inline void MY_PUT_MB2(unsigned char *s, uint16 code) {
  /* The operands are promoted to int; narrow back explicitly. */
  s[0] = (unsigned char)(code >> 8);
  s[1] = (unsigned char)(code & 0xFF);
}
66 
67 typedef struct MY_UNICASE_CHARACTER {
68   uint32 toupper;
69   uint32 tolower;
70   uint32 sort;
71 } MY_UNICASE_CHARACTER;
72 
73 typedef struct MY_UNICASE_INFO {
74   my_wc_t maxchar;
75   const MY_UNICASE_CHARACTER **page;
76 } MY_UNICASE_INFO;
77 
78 extern MY_UNICASE_INFO my_unicase_default;
79 extern MY_UNICASE_INFO my_unicase_turkish;
80 extern MY_UNICASE_INFO my_unicase_mysql500;
81 extern MY_UNICASE_INFO my_unicase_unicode520;
82 
83 struct MY_UCA_INFO;
84 
85 typedef struct MY_UNI_CTYPE {
86   uchar pctype;
87   uchar *ctype;
88 } MY_UNI_CTYPE;
89 
90 extern MY_UNI_CTYPE my_uni_ctype[256];
91 
/* mb_wc and wc_mb return codes */
#define MY_CS_ILSEQ 0        /* Wrong byte sequence: mb_wc                 */
#define MY_CS_ILUNI 0        /* Cannot encode Unicode to charset: wc_mb    */
#define MY_CS_TOOSMALL -101  /* Need at least one byte:    wc_mb and mb_wc */
#define MY_CS_TOOSMALL2 -102 /* Need at least two bytes:   wc_mb and mb_wc */
#define MY_CS_TOOSMALL3 -103 /* Need at least three bytes: wc_mb and mb_wc */
/* These following three are currently not really used */
#define MY_CS_TOOSMALL4 -104 /* Need at least 4 bytes: wc_mb and mb_wc */
#define MY_CS_TOOSMALL5 -105 /* Need at least 5 bytes: wc_mb and mb_wc */
#define MY_CS_TOOSMALL6 -106 /* Need at least 6 bytes: wc_mb and mb_wc */
/*
  A helper macro for "need at least n bytes": MY_CS_TOOSMALLN(1) equals
  MY_CS_TOOSMALL, MY_CS_TOOSMALLN(2) equals MY_CS_TOOSMALL2, and so on.
*/
#define MY_CS_TOOSMALLN(n) (-100 - (n))
104 
/*
  Sequence kinds.
  NOTE(review): presumably the values accepted by the "sq" argument of
  MY_CHARSET_HANDLER::scan -- confirm against the scan implementations.
*/
#define MY_SEQ_INTTAIL 1
#define MY_SEQ_SPACES 2
107 
/*
  My charsets_list flags.
  Each flag is a distinct single bit, so flags can be OR-ed together; macros
  later in this header test them against CHARSET_INFO::state.
*/
#define MY_CS_COMPILED 1 /* compiled-in sets               */
#define MY_CS_CONFIG 2   /* sets that have a *.conf file   */
#define MY_CS_INDEX 4    /* sets listed in the Index file  */
#define MY_CS_LOADED 8   /* sets that are currently loaded */
#define MY_CS_BINSORT 16 /* if binary sort order           */
#define MY_CS_PRIMARY 32 /* if primary collation           */
/*
  If _not_ set, sort_order will give same result as strnxfrm -- all new
  collations should have this flag set, do not check it in new code.
*/
#define MY_CS_STRNXFRM 64
#define MY_CS_UNICODE 128              /* if a charset is BMP Unicode    */
#define MY_CS_READY 256                /* if a charset is initialized    */
#define MY_CS_AVAILABLE 512            /* If either compiled-in or loaded*/
#define MY_CS_CSSORT 1024              /* if case sensitive sort order   */
#define MY_CS_HIDDEN 2048              /* don't display in SHOW          */
#define MY_CS_PUREASCII 4096           /* if a charset is pure ascii     */
#define MY_CS_NONASCII 8192            /* if not ASCII-compatible        */
#define MY_CS_UNICODE_SUPPLEMENT 16384 /* Non-BMP Unicode characters */
#define MY_CS_LOWER_SORT 32768         /* If use lower case as weight   */
#define MY_CHARSET_UNDEFINED 0
132 
/*
  Character repertoire flags.
  MY_REPERTOIRE_UNICODE30 is the bitwise OR of the other two values.
*/
#define MY_REPERTOIRE_ASCII 1     /* Pure ASCII            U+0000..U+007F */
#define MY_REPERTOIRE_EXTENDED 2  /* Extended characters:  U+0080..U+FFFF */
#define MY_REPERTOIRE_UNICODE30 3 /* ASCII | EXTENDED:     U+0000..U+FFFF */

/* Flags for strxfrm */
#define MY_STRXFRM_PAD_TO_MAXLEN 0x00000080 /* if pad tail(for filesort) */
140 
141 typedef struct MY_UNI_IDX {
142   uint16 from;
143   uint16 to;
144   const uchar *tab;
145 } MY_UNI_IDX;
146 
147 typedef struct {
148   uint beg;
149   uint end;
150   uint mb_len;
151 } my_match_t;
152 
153 struct CHARSET_INFO;
154 #ifndef __cplusplus
155 typedef struct CHARSET_INFO CHARSET_INFO;
156 #endif
157 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO *system_charset_info;
158 
159 typedef struct MY_CHARSET_LOADER {
160   uint errcode;
161   char errarg[192];
162   void *(*once_alloc)(size_t);
163   void *(*mem_malloc)(size_t);
164   void *(*mem_realloc)(void *, size_t);
165   void (*mem_free)(void *);
166   void (*reporter)(enum loglevel, uint errcode, ...);
167   int (*add_collation)(CHARSET_INFO *cs);
168 } MY_CHARSET_LOADER;
169 
170 extern int (*my_string_stack_guard)(int);
171 
/**
  Pad behavior of a collation (see CHARSET_INFO::pad_attribute).
  PAD_SPACE collations compare strings as if they were extended with spaces
  at the end; NO_PAD collations compare the strings unextended.
*/
enum Pad_attribute { PAD_SPACE, NO_PAD };
173 
174 /* See strings/CHARSET_INFO.txt for information about this structure  */
175 typedef struct MY_COLLATION_HANDLER {
176   bool (*init)(CHARSET_INFO *, MY_CHARSET_LOADER *);
177   void (*uninit)(CHARSET_INFO *);
178   /* Collation routines */
179   int (*strnncoll)(const CHARSET_INFO *, const uchar *, size_t, const uchar *,
180                    size_t, bool);
181   /**
182     Compare the two strings under the pad rules given by the collation.
183 
184     Thus, for NO PAD collations, this is identical to strnncoll with is_prefix
185     set to false. For PAD SPACE collations, the two strings are conceptually
186     extended infinitely at the end using space characters (0x20) and then
187     compared under the collation's normal comparison rules, so that e.g 'a' is
188     equal to 'a '.
189   */
190   int (*strnncollsp)(const CHARSET_INFO *, const uchar *, size_t, const uchar *,
191                      size_t);
192   /**
193     Transform the string into a form such that memcmp() between transformed
194     strings yields the correct collation order.
195 
196     @param [out] dst Buffer for the transformed string.
197     @param [out] dstlen Number of bytes available in dstlen.
198       Must be even.
199     @param num_codepoints Treat the string as if it were of type
200       CHAR(num_codepoints). In particular, this means that if the
201       collation is a pad collation (pad_attribute is PAD_SPACE) and
202       string has fewer than "num_codepoints" codepoints, the string
203       will be transformed as if it ended in (num_codepoints-n) extra spaces.
204       If the string has more than "num_codepoints" codepoints,
205       behavior is undefined; may truncate, may crash, or do something
206       else entirely. Note that MY_STRXFRM_PAD_TO_MAXLEN overrides this;
207       if it is given for a PAD SPACE collation, this value is taken to be
208       effectively infinity.
209     @param src The source string, in the required character set
210       for the collation.
211     @param srclen Number of bytes in src.
212     @param flags ORed bitmask of MY_STRXFRM_* flags.
213 
214     @return Number of bytes written to dst.
215   */
216   size_t (*strnxfrm)(const CHARSET_INFO *, uchar *dst, size_t dstlen,
217                      uint num_codepoints, const uchar *src, size_t srclen,
218                      uint flags);
219 
220   /**
221     Return the maximum number of output bytes needed for strnxfrm()
222     to output all weights for any string of the given input length.
223     You can use this to e.g. size buffers for sort keys.
224 
225     @param num_bytes Number of bytes in the input string. Note that for
226       multibyte character sets, this _must_ be a pessimistic estimate,
227       ie., one that's cs->mbmaxlen * max_num_codepoints. So for e.g.
228       the utf8mb4 string "foo", you will need to give in 12, not 3.
229   */
230   size_t (*strnxfrmlen)(const CHARSET_INFO *, size_t num_bytes);
231   bool (*like_range)(const CHARSET_INFO *, const char *s, size_t s_length,
232                      char w_prefix, char w_one, char w_many, size_t res_length,
233                      char *min_str, char *max_str, size_t *min_len,
234                      size_t *max_len);
235   int (*wildcmp)(const CHARSET_INFO *, const char *str, const char *str_end,
236                  const char *wildstr, const char *wildend, int escape,
237                  int w_one, int w_many);
238 
239   int (*strcasecmp)(const CHARSET_INFO *, const char *, const char *);
240 
241   uint (*strstr)(const CHARSET_INFO *, const char *b, size_t b_length,
242                  const char *s, size_t s_length, my_match_t *match,
243                  uint nmatch);
244 
245   /**
246     Compute a sort hash for the given key. This hash must preserve equality
247     under the given collation, so that a=b => H(a)=H(b). Note that this hash
248     is used for hash-based partitioning (PARTITION KEY), so you cannot change
249     it except when writing a new collation; it needs to be unchanged across
250     releases, so that the on-disk format does not change. (It is also used
251     for testing equality in the MEMORY storage engine.)
252 
253     nr1 and nr2 are both in/out parameters. nr1 is the actual hash value;
254     nr2 holds extra state between invocations.
255   */
256   void (*hash_sort)(const CHARSET_INFO *cs, const uchar *key, size_t len,
257                     uint64 *nr1, uint64 *nr2);
258   bool (*propagate)(const CHARSET_INFO *cs, const uchar *str, size_t len);
259 } MY_COLLATION_HANDLER;
260 
261 extern MY_COLLATION_HANDLER my_collation_mb_bin_handler;
262 extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
263 extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
264 extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;
265 
266 /* Some typedef to make it easy for C++ to make function pointers */
267 typedef int (*my_charset_conv_mb_wc)(const CHARSET_INFO *, my_wc_t *,
268                                      const uchar *, const uchar *);
269 typedef int (*my_charset_conv_wc_mb)(const CHARSET_INFO *, my_wc_t, uchar *,
270                                      uchar *);
271 typedef size_t (*my_charset_conv_case)(const CHARSET_INFO *, char *, size_t,
272                                        char *, size_t);
273 
274 /* See strings/CHARSET_INFO.txt about information on this structure  */
275 typedef struct MY_CHARSET_HANDLER {
276   bool (*init)(CHARSET_INFO *, MY_CHARSET_LOADER *loader);
277   /* Multibyte routines */
278   uint (*ismbchar)(const CHARSET_INFO *, const char *, const char *);
279   uint (*mbcharlen)(const CHARSET_INFO *, uint c);
280   size_t (*numchars)(const CHARSET_INFO *, const char *b, const char *e);
281 
282   /**
283     Return at which byte codepoint number "pos" begins, relative to
284     the start of the string. If the string is shorter than or is
285     exactly "pos" codepoints long, returns a value equal or greater to
286     (e-b).
287   */
288   size_t (*charpos)(const CHARSET_INFO *, const char *b, const char *e,
289                     size_t pos);
290   size_t (*well_formed_len)(const CHARSET_INFO *, const char *b, const char *e,
291                             size_t nchars, int *error);
292   /**
293     Given a pointer and a length in bytes, returns a new length in bytes where
294     all trailing space characters are stripped. This holds even for NO PAD
295     collations.
296 
297     Exception: The "binary" collation, which is used behind-the-scenes to
298     implement the BINARY type (by mapping it to CHAR(n) COLLATE "binary"),
299     returns just the length back with no stripping. It's done that way so that
300     Field_string (implementing CHAR(n)) returns the full padded width on read
301     (as opposed to a normal CHAR, where we usually strip the spaces on read),
302     but it's suboptimal, since lengthsp() is also used in a number of other
303     places, e.g. stripping trailing spaces from enum values given in by the
304     user. If you call this function, be aware of this special exception and
305     consider the implications.
306   */
307   size_t (*lengthsp)(const CHARSET_INFO *, const char *ptr, size_t length);
308   size_t (*numcells)(const CHARSET_INFO *, const char *b, const char *e);
309 
310   /* Unicode conversion */
311   my_charset_conv_mb_wc mb_wc;
312   my_charset_conv_wc_mb wc_mb;
313 
314   /* CTYPE scanner */
315   int (*ctype)(const CHARSET_INFO *cs, int *ctype, const uchar *s,
316                const uchar *e);
317 
318   /* Functions for case and sort conversion */
319   size_t (*caseup_str)(const CHARSET_INFO *, char *);
320   size_t (*casedn_str)(const CHARSET_INFO *, char *);
321 
322   my_charset_conv_case caseup;
323   my_charset_conv_case casedn;
324 
325   /* Charset dependant snprintf() */
326   size_t (*snprintf)(const CHARSET_INFO *, char *to, size_t n, const char *fmt,
327                      ...) MY_ATTRIBUTE((format(printf, 4, 5)));
328   size_t (*long10_to_str)(const CHARSET_INFO *, char *to, size_t n, int radix,
329                           long int val);
330   size_t (*longlong10_to_str)(const CHARSET_INFO *, char *to, size_t n,
331                               int radix, longlong val);
332 
333   void (*fill)(const CHARSET_INFO *, char *to, size_t len, int fill);
334 
335   /* String-to-number conversion routines */
336   long (*strntol)(const CHARSET_INFO *, const char *s, size_t l, int base,
337                   const char **e, int *err);
338   ulong (*strntoul)(const CHARSET_INFO *, const char *s, size_t l, int base,
339                     const char **e, int *err);
340   longlong (*strntoll)(const CHARSET_INFO *, const char *s, size_t l, int base,
341                        const char **e, int *err);
342   ulonglong (*strntoull)(const CHARSET_INFO *, const char *s, size_t l,
343                          int base, const char **e, int *err);
344   double (*strntod)(const CHARSET_INFO *, const char *s, size_t l,
345                     const char **e, int *err);
346   longlong (*strtoll10)(const CHARSET_INFO *cs, const char *nptr,
347                         const char **endptr, int *error);
348   ulonglong (*strntoull10rnd)(const CHARSET_INFO *cs, const char *str,
349                               size_t length, int unsigned_fl,
350                               const char **endptr, int *error);
351   size_t (*scan)(const CHARSET_INFO *, const char *b, const char *e, int sq);
352 } MY_CHARSET_HANDLER;
353 
354 extern MY_CHARSET_HANDLER my_charset_8bit_handler;
355 extern MY_CHARSET_HANDLER my_charset_ascii_handler;
356 extern MY_CHARSET_HANDLER my_charset_ucs2_handler;
357 
358 /* See strings/CHARSET_INFO.txt about information on this structure  */
359 struct CHARSET_INFO {
360   uint number;
361   uint primary_number;
362   uint binary_number;
363   uint state;
364   const char *csname;
365   const char *name;
366   const char *comment;
367   const char *tailoring;
368   struct Coll_param *coll_param;
369   const uchar *ctype;
370   const uchar *to_lower;
371   const uchar *to_upper;
372   const uchar *sort_order;
373   struct MY_UCA_INFO *uca; /* This can be changed in apply_one_rule() */
374   const uint16 *tab_to_uni;
375   const MY_UNI_IDX *tab_from_uni;
376   const MY_UNICASE_INFO *caseinfo;
377   const struct lex_state_maps_st *state_maps; /* parser internal data */
378   const uchar *ident_map;                     /* parser internal data */
379   uint strxfrm_multiply;
380   uchar caseup_multiply;
381   uchar casedn_multiply;
382   uint mbminlen;
383   uint mbmaxlen;
384   uint mbmaxlenlen;
385   my_wc_t min_sort_char;
386   my_wc_t max_sort_char; /* For LIKE optimization */
387   uchar pad_char;
388   bool escape_with_backslash_is_dangerous;
389   uchar levels_for_compare;
390 
391   MY_CHARSET_HANDLER *cset;
392   MY_COLLATION_HANDLER *coll;
393 
394   /**
395     If this collation is PAD_SPACE, it collates as if all inputs were
396     padded with a given number of spaces at the end (see the "num_codepoints"
397     flag to strnxfrm). NO_PAD simply compares unextended strings.
398 
399     Note that this is fundamentally about the behavior of coll->strnxfrm.
400   */
401   enum Pad_attribute pad_attribute;
402 };
403 #define ILLEGAL_CHARSET_INFO_NUMBER (~0U)
404 
405 /*
406   NOTE: You cannot use a CHARSET_INFO without it having been initialized first.
407   In particular, they are not initialized when a unit test starts; do not use
408   these globals indiscriminately from there, and do not add more. Instead,
409   load them through a MY_CHARSET_LOADER, using my_collation_get_by_name().
410 */
411 
412 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_bin;
413 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_latin1;
414 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_filename;
415 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_utf8mb4_0900_ai_ci;
416 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_utf8mb4_0900_bin;
417 
418 extern CHARSET_INFO my_charset_latin1_bin;
419 extern CHARSET_INFO my_charset_utf32_unicode_ci;
420 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_utf8_general_ci;
421 extern CHARSET_INFO my_charset_utf8_tolower_ci;
422 extern CHARSET_INFO my_charset_utf8_unicode_ci;
423 extern CHARSET_INFO my_charset_utf8_bin;
424 extern CHARSET_INFO my_charset_utf8mb4_bin;
425 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_utf8mb4_general_ci;
426 
427 #define MY_UTF8MB3 "utf8"
428 #define MY_UTF8MB4 "utf8mb4"
429 
430 /* declarations for simple charsets */
431 extern size_t my_strnxfrm_simple(const CHARSET_INFO *, uchar *dst,
432                                  size_t dstlen, uint nweights, const uchar *src,
433                                  size_t srclen, uint flags);
434 size_t my_strnxfrmlen_simple(const CHARSET_INFO *, size_t);
435 extern int my_strnncoll_simple(const CHARSET_INFO *, const uchar *, size_t,
436                                const uchar *, size_t, bool);
437 
438 extern int my_strnncollsp_simple(const CHARSET_INFO *, const uchar *, size_t,
439                                  const uchar *, size_t);
440 
441 extern void my_hash_sort_simple(const CHARSET_INFO *cs, const uchar *key,
442                                 size_t len, uint64 *nr1, uint64 *nr2);
443 
444 extern size_t my_lengthsp_8bit(const CHARSET_INFO *cs, const char *ptr,
445                                size_t length);
446 
447 extern uint my_instr_simple(const CHARSET_INFO *, const char *b,
448                             size_t b_length, const char *s, size_t s_length,
449                             my_match_t *match, uint nmatch);
450 
451 /* Functions for 8bit */
452 extern size_t my_caseup_str_8bit(const CHARSET_INFO *, char *);
453 extern size_t my_casedn_str_8bit(const CHARSET_INFO *, char *);
454 extern size_t my_caseup_8bit(const CHARSET_INFO *, char *src, size_t srclen,
455                              char *dst, size_t dstlen);
456 extern size_t my_casedn_8bit(const CHARSET_INFO *, char *src, size_t srclen,
457                              char *dst, size_t dstlen);
458 
459 extern int my_strcasecmp_8bit(const CHARSET_INFO *cs, const char *,
460                               const char *);
461 
462 int my_mb_wc_8bit(const CHARSET_INFO *cs, my_wc_t *wc, const uchar *s,
463                   const uchar *e);
464 int my_wc_mb_8bit(const CHARSET_INFO *cs, my_wc_t wc, uchar *s, uchar *e);
465 
466 int my_mb_ctype_8bit(const CHARSET_INFO *, int *, const uchar *, const uchar *);
467 int my_mb_ctype_mb(const CHARSET_INFO *, int *, const uchar *, const uchar *);
468 
469 size_t my_scan_8bit(const CHARSET_INFO *cs, const char *b, const char *e,
470                     int sq);
471 
472 size_t my_snprintf_8bit(const CHARSET_INFO *, char *to, size_t n,
473                         const char *fmt, ...)
474     MY_ATTRIBUTE((format(printf, 4, 5)));
475 
476 long my_strntol_8bit(const CHARSET_INFO *, const char *s, size_t l, int base,
477                      const char **e, int *err);
478 ulong my_strntoul_8bit(const CHARSET_INFO *, const char *s, size_t l, int base,
479                        const char **e, int *err);
480 longlong my_strntoll_8bit(const CHARSET_INFO *, const char *s, size_t l,
481                           int base, const char **e, int *err);
482 ulonglong my_strntoull_8bit(const CHARSET_INFO *, const char *s, size_t l,
483                             int base, const char **e, int *err);
484 double my_strntod_8bit(const CHARSET_INFO *, const char *s, size_t l,
485                        const char **e, int *err);
486 size_t my_long10_to_str_8bit(const CHARSET_INFO *, char *to, size_t l,
487                              int radix, long int val);
488 size_t my_longlong10_to_str_8bit(const CHARSET_INFO *, char *to, size_t l,
489                                  int radix, longlong val);
490 
491 longlong my_strtoll10_8bit(const CHARSET_INFO *cs, const char *nptr,
492                            const char **endptr, int *error);
493 longlong my_strtoll10_ucs2(const CHARSET_INFO *cs, const char *nptr,
494                            char **endptr, int *error);
495 
496 ulonglong my_strntoull10rnd_8bit(const CHARSET_INFO *cs, const char *str,
497                                  size_t length, int unsigned_fl,
498                                  const char **endptr, int *error);
499 ulonglong my_strntoull10rnd_ucs2(const CHARSET_INFO *cs, const char *str,
500                                  size_t length, int unsigned_fl, char **endptr,
501                                  int *error);
502 
503 void my_fill_8bit(const CHARSET_INFO *cs, char *to, size_t l, int fill);
504 
505 /* For 8-bit character set */
506 bool my_like_range_simple(const CHARSET_INFO *cs, const char *ptr,
507                           size_t ptr_length, char escape, char w_one,
508                           char w_many, size_t res_length, char *min_str,
509                           char *max_str, size_t *min_length,
510                           size_t *max_length);
511 
512 /* For ASCII-based multi-byte character sets with mbminlen=1 */
513 bool my_like_range_mb(const CHARSET_INFO *cs, const char *ptr,
514                       size_t ptr_length, char escape, char w_one, char w_many,
515                       size_t res_length, char *min_str, char *max_str,
516                       size_t *min_length, size_t *max_length);
517 
518 /* For other character sets, with arbitrary mbminlen and mbmaxlen numbers */
519 bool my_like_range_generic(const CHARSET_INFO *cs, const char *ptr,
520                            size_t ptr_length, char escape, char w_one,
521                            char w_many, size_t res_length, char *min_str,
522                            char *max_str, size_t *min_length,
523                            size_t *max_length);
524 
525 int my_wildcmp_8bit(const CHARSET_INFO *, const char *str, const char *str_end,
526                     const char *wildstr, const char *wildend, int escape,
527                     int w_one, int w_many);
528 
529 int my_wildcmp_bin(const CHARSET_INFO *, const char *str, const char *str_end,
530                    const char *wildstr, const char *wildend, int escape,
531                    int w_one, int w_many);
532 
533 size_t my_numchars_8bit(const CHARSET_INFO *, const char *b, const char *e);
534 size_t my_numcells_8bit(const CHARSET_INFO *, const char *b, const char *e);
535 size_t my_charpos_8bit(const CHARSET_INFO *, const char *b, const char *e,
536                        size_t pos);
537 size_t my_well_formed_len_8bit(const CHARSET_INFO *, const char *b,
538                                const char *e, size_t pos, int *error);
539 uint my_mbcharlen_8bit(const CHARSET_INFO *, uint c);
540 
541 /* Functions for multibyte charsets */
542 extern size_t my_caseup_str_mb(const CHARSET_INFO *, char *);
543 extern size_t my_casedn_str_mb(const CHARSET_INFO *, char *);
544 extern size_t my_caseup_mb(const CHARSET_INFO *, char *src, size_t srclen,
545                            char *dst, size_t dstlen);
546 extern size_t my_casedn_mb(const CHARSET_INFO *, char *src, size_t srclen,
547                            char *dst, size_t dstlen);
548 extern size_t my_caseup_mb_varlen(const CHARSET_INFO *, char *src,
549                                   size_t srclen, char *dst, size_t dstlen);
550 extern size_t my_casedn_mb_varlen(const CHARSET_INFO *, char *src,
551                                   size_t srclen, char *dst, size_t dstlen);
552 extern size_t my_caseup_ujis(const CHARSET_INFO *, char *src, size_t srclen,
553                              char *dst, size_t dstlen);
554 extern size_t my_casedn_ujis(const CHARSET_INFO *, char *src, size_t srclen,
555                              char *dst, size_t dstlen);
556 extern int my_strcasecmp_mb(const CHARSET_INFO *cs, const char *, const char *);
557 
558 int my_wildcmp_mb(const CHARSET_INFO *, const char *str, const char *str_end,
559                   const char *wildstr, const char *wildend, int escape,
560                   int w_one, int w_many);
561 size_t my_numchars_mb(const CHARSET_INFO *, const char *b, const char *e);
562 size_t my_numcells_mb(const CHARSET_INFO *, const char *b, const char *e);
563 size_t my_charpos_mb(const CHARSET_INFO *, const char *b, const char *e,
564                      size_t pos);
565 size_t my_well_formed_len_mb(const CHARSET_INFO *, const char *b, const char *e,
566                              size_t pos, int *error);
567 uint my_instr_mb(const CHARSET_INFO *, const char *b, size_t b_length,
568                  const char *s, size_t s_length, my_match_t *match,
569                  uint nmatch);
570 
571 int my_strnncoll_mb_bin(const CHARSET_INFO *cs, const uchar *s, size_t slen,
572                         const uchar *t, size_t tlen, bool t_is_prefix);
573 
574 int my_strnncollsp_mb_bin(const CHARSET_INFO *cs, const uchar *a,
575                           size_t a_length, const uchar *b, size_t b_length);
576 
577 int my_wildcmp_mb_bin(const CHARSET_INFO *cs, const char *str,
578                       const char *str_end, const char *wildstr,
579                       const char *wildend, int escape, int w_one, int w_many);
580 
581 int my_strcasecmp_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
582                          const char *s, const char *t);
583 
584 void my_hash_sort_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
585                          const uchar *key, size_t len, uint64 *nr1,
586                          uint64 *nr2);
587 
588 size_t my_strnxfrm_mb(const CHARSET_INFO *, uchar *dst, size_t dstlen,
589                       uint nweights, const uchar *src, size_t srclen,
590                       uint flags);
591 
592 size_t my_strnxfrm_unicode(const CHARSET_INFO *, uchar *dst, size_t dstlen,
593                            uint nweights, const uchar *src, size_t srclen,
594                            uint flags);
595 
596 size_t my_strnxfrm_unicode_full_bin(const CHARSET_INFO *, uchar *dst,
597                                     size_t dstlen, uint nweights,
598                                     const uchar *src, size_t srclen,
599                                     uint flags);
600 size_t my_strnxfrmlen_unicode_full_bin(const CHARSET_INFO *, size_t);
601 
602 int my_wildcmp_unicode(const CHARSET_INFO *cs, const char *str,
603                        const char *str_end, const char *wildstr,
604                        const char *wildend, int escape, int w_one, int w_many,
605                        const MY_UNICASE_INFO *weights);
606 
607 extern bool my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf,
608                                  size_t buflen);
609 extern size_t my_strcspn(const CHARSET_INFO *cs, const char *str,
610                          const char *end, const char *reject,
611                          size_t reject_length);
612 
613 bool my_propagate_simple(const CHARSET_INFO *cs, const uchar *str, size_t len);
614 bool my_propagate_complex(const CHARSET_INFO *cs, const uchar *str, size_t len);
615 
616 uint my_string_repertoire(const CHARSET_INFO *cs, const char *str, size_t len);
617 bool my_charset_is_ascii_based(const CHARSET_INFO *cs);
618 bool my_charset_is_8bit_pure_ascii(const CHARSET_INFO *cs);
619 uint my_charset_repertoire(const CHARSET_INFO *cs);
620 
621 uint my_strxfrm_flag_normalize(uint flags);
622 size_t my_strxfrm_pad(const CHARSET_INFO *cs, uchar *str, uchar *frmend,
623                       uchar *strend, uint nweights, uint flags);
624 
625 bool my_charset_is_ascii_compatible(const CHARSET_INFO *cs);
626 
627 size_t my_convert(char *to, size_t to_length, const CHARSET_INFO *to_cs,
628                   const char *from, size_t from_length,
629                   const CHARSET_INFO *from_cs, uint *errors);
630 
631 uint my_mbcharlen_ptr(const CHARSET_INFO *cs, const char *s, const char *e);
632 
633 bool my_is_prefixidx_cand(const CHARSET_INFO *cs, const char *wildstr,
634                           const char *wildend, int escape, int w_many,
635                           size_t *prefix_len);
636 
637 #define _MY_U 01    /* Upper case */
638 #define _MY_L 02    /* Lower case */
639 #define _MY_NMR 04  /* Numeral (digit) */
640 #define _MY_SPC 010 /* Spacing character */
641 #define _MY_PNT 020 /* Punctuation */
642 #define _MY_CTR 040 /* Control character */
643 #define _MY_B 0100  /* Blank */
644 #define _MY_X 0200  /* heXadecimal digit */
645 
646 /* The following macros makes sense only for one-byte character sets.
647 They will not fail for multibyte character sets, but will not produce
648 the expected results. They may have som limited usability like
649 e.g. for utf8mb3/utf8mb4, meaningful results will be produced for
650 values < 0x7F. */
651 #define my_isascii(c) (!((c) & ~0177))
652 #define my_toupper(s, c) (char)((s)->to_upper[(uchar)(c)])
653 #define my_tolower(s, c) (char)((s)->to_lower[(uchar)(c)])
/*
  Character-class predicates. Each one reads the class bits kept for byte c
  in slot (uchar)c + 1 of the charset's ctype[] table and masks out the
  classes of interest. The result is non-zero on a match (the matching bits
  themselves, not necessarily 1) and zero otherwise.
*/
#define my_isalpha(s, c) ((s)->ctype[(uchar)(c) + 1] & (_MY_U | _MY_L))
#define my_isupper(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_U)
#define my_islower(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_L)
#define my_isdigit(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_NMR)
#define my_isxdigit(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_X)
#define my_isalnum(s, c) \
  ((s)->ctype[(uchar)(c) + 1] & (_MY_U | _MY_L | _MY_NMR))
#define my_isspace(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_SPC)
#define my_ispunct(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_PNT)
#define my_isprint(s, c) \
  ((s)->ctype[(uchar)(c) + 1] & (_MY_PNT | _MY_U | _MY_L | _MY_NMR | _MY_B))
#define my_isgraph(s, c) \
  ((s)->ctype[(uchar)(c) + 1] & (_MY_PNT | _MY_U | _MY_L | _MY_NMR))
#define my_iscntrl(s, c) ((s)->ctype[(uchar)(c) + 1] & _MY_CTR)
668 
/*
  Identifier-character tests built on my_isalnum() / my_isalpha():
  my_isvar() also accepts '_', and so does my_isvar_start().
  c occurs twice in each expansion, so do not pass an argument with side
  effects. (As the original note says, these could use a little cleanup.)
*/
#define my_isvar(s, c) (my_isalnum((s), (c)) || '_' == (c))
#define my_isvar_start(s, c) (my_isalpha((s), (c)) || '_' == (c))
672 
/*
  State-flag tests on a charset: each yields the corresponding bit of
  cs->state (non-zero when set, zero when clear).
*/
#define my_binary_compare(cs) ((cs)->state & MY_CS_BINSORT)
#define use_strnxfrm(cs) ((cs)->state & MY_CS_STRNXFRM)
/*
  Thin wrappers that forward to the charset's collation handler (cs->coll).
  Arguments are passed through positionally and unchanged, except that:
    - my_strnxfrm() supplies dst_len for both the third and the fourth
      handler argument and 0 for the last one;
    - my_strnncoll() supplies 0 for the last handler argument.
*/
#define my_strnxfrm(cs, dst, dst_len, src, src_len)               \
  ((cs)->coll->strnxfrm((cs), (dst), (dst_len), (dst_len), (src), \
                        (src_len), 0))
#define my_strnncoll(cs, a, b, c, d) \
  ((cs)->coll->strnncoll((cs), (a), (b), (c), (d), 0))
#define my_like_range(cs, a, b, c, d, e, f, g, h, i, j)                     \
  ((cs)->coll->like_range((cs), (a), (b), (c), (d), (e), (f), (g), (h), (i), \
                          (j)))
#define my_wildcmp(cs, str, str_end, wild, wild_end, escape, w_one, w_many) \
  ((cs)->coll->wildcmp((cs), (str), (str_end), (wild), (wild_end),          \
                       (escape), (w_one), (w_many)))
#define my_strcasecmp(cs, a, b) ((cs)->coll->strcasecmp((cs), (a), (b)))
/*
  Forward to the charset handler's charpos() routine. The two range
  arguments are cast to const char * before the call, so callers may pass
  other pointer types. The expansion is fully parenthesized so it is always
  treated as one expression.
*/
#define my_charpos(cs, begin, end, num)                         \
  ((cs)->cset->charpos((cs), (const char *)(begin),             \
                       (const char *)(end), (num)))
686 
/* True when the charset's handler provides an ismbchar() routine. */
#define use_mb(cs) (NULL != (cs)->cset->ismbchar)
my_ismbchar(const CHARSET_INFO * cs,const char * str,const char * strend)688 static inline uint my_ismbchar(const CHARSET_INFO *cs, const char *str,
689                                const char *strend) {
690   return cs->cset->ismbchar(cs, str, strend);
691 }
692 
my_ismbchar(const CHARSET_INFO * cs,const uchar * str,const uchar * strend)693 static inline uint my_ismbchar(const CHARSET_INFO *cs, const uchar *str,
694                                const uchar *strend) {
695   return cs->cset->ismbchar(cs, pointer_cast<const char *>(str),
696                             pointer_cast<const char *>(strend));
697 }
698 
/* Forward to the charset handler's mbcharlen() routine with the given code. */
#define my_mbcharlen(cs, code) ((cs)->cset->mbcharlen((cs), (code)))
/**
  Determine the length of a gb18030 code from its two leading bytes.

  The low 8 bits of each byte are packed into one 16-bit value
  (first byte in the high half) and handed to the charset's mbcharlen().

  @param[in] cs     charset_info
  @param[in] first  first byte of gb18030 code
  @param[in] second second byte of gb18030 code
  @return    the length of gb18030 code starting with given two bytes,
             the length would be 2 or 4 for valid gb18030 code,
             or 0 for invalid gb18030 code
*/
#define my_mbcharlen_2(cs, first, second) \
  ((cs)->cset->mbcharlen((cs), ((((first)&0xFF) << 8) | ((second)&0xFF))))
/**
  Number of leading bytes that must be inspected before the length of a
  multi-byte character can be determined.

  @param[in] cs charset_info
  @return    the charset's mbmaxlenlen value; this would be 2 for gb18030
             and 1 for all other charsets
*/
#define my_mbmaxlenlen(cs) ((cs)->mbmaxlenlen)
/**
  Tell whether a byte may be the leading byte of a multi-byte character.

  A byte qualifies when my_mbcharlen() reports a length above 1 for it.
  For a charset whose mbmaxlenlen is 2 (gb18030) the first byte alone does
  not fix the length, so a reported length of 0 also qualifies there.

  Both arguments appear more than once in the expansion; avoid passing
  expressions with side effects.

  @param[in] cs   charset_info
  @param[in] lead possible leading byte
  @return    true if it is, otherwise false
*/
#define my_ismb1st(cs, lead)          \
  (1 < my_mbcharlen((cs), (lead)) ||  \
   (2 == my_mbmaxlenlen((cs)) && 0 == my_mbcharlen((cs), (lead))))
733 
/*
  Thin wrappers that forward to the charset handler (cs->cset): in-place
  case conversion of a string and string-to-number conversions.
  The argument names are descriptive only; every argument is passed through
  positionally and unchanged (NOTE(review): confirm the roles against the
  handler's function-pointer declarations).
*/
#define my_caseup_str(cs, str) ((cs)->cset->caseup_str((cs), (str)))
#define my_casedn_str(cs, str) ((cs)->cset->casedn_str((cs), (str)))
#define my_strntol(cs, str, len, base, endptr, err) \
  ((cs)->cset->strntol((cs), (str), (len), (base), (endptr), (err)))
#define my_strntoul(cs, str, len, base, endptr, err) \
  ((cs)->cset->strntoul((cs), (str), (len), (base), (endptr), (err)))
#define my_strntoll(cs, str, len, base, endptr, err) \
  ((cs)->cset->strntoll((cs), (str), (len), (base), (endptr), (err)))
#define my_strntoull(cs, str, len, base, endptr, err) \
  ((cs)->cset->strntoull((cs), (str), (len), (base), (endptr), (err)))
#define my_strntod(cs, str, len, endptr, err) \
  ((cs)->cset->strntod((cs), (str), (len), (endptr), (err)))
745 
is_supported_parser_charset(const CHARSET_INFO * cs)746 static inline bool is_supported_parser_charset(const CHARSET_INFO *cs) {
747   return (cs->mbminlen == 1);
748 }
749 
750 #endif  // M_CTYPE_INCLUDED
751