1 /*
2 Copyright (c) 2018 MariaDB Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18 #ifndef _CTYPE_UTF8_H
19 #define _CTYPE_UTF8_H
20
/* Detect special bytes and sequences */

/*
  A continuation byte has the bit pattern 10xxxxxx, i.e. after conversion
  to uchar it lies in the range [0x80..0xBF].
*/
#define IS_CONTINUATION_BYTE(c)     (((uchar) (c) & 0xC0) == 0x80)
23
/*
  Check an MB2 character assuming that b0 is already known to be >= 0xC2.
  Use this macro if the caller already checked b0 for:
  - an MB1 character
  - an unused gap between MB1 and MB2HEAD

  Evaluates to non-zero when b0 is below 0xE0 and b1 is a continuation
  byte. Every argument is parenthesized before it is cast, so arbitrary
  expressions (e.g. "w >> 8") can be passed safely.
*/
#define IS_UTF8MB2_STEP2(b0,b1)     (((uchar) (b0) < 0xE0) && \
                                     IS_CONTINUATION_BYTE(b1))
32
/*
  Check an MB3 character assuming that b0 is already known to be
  in the valid MB3HEAD range [0xE0..0xEF].

  Both trailing bytes must be continuation bytes. The final condition
  rejects the sequences [E0][80..9F][xx], which would decode to a value
  below U+0800 (a non-shortest form).

  Every argument is parenthesized before it is cast, so expressions such
  as "x & 0xFF" are handled correctly. Note that b0 and b1 appear more
  than once in the expansion: do not pass arguments with side effects.
*/
#define IS_UTF8MB3_STEP2(b0,b1,b2)  (IS_CONTINUATION_BYTE(b1) && \
                                     IS_CONTINUATION_BYTE(b2) && \
                                     ((uchar) (b0) >= 0xe1 || \
                                      (uchar) (b1) >= 0xa0))
40
/*
  Check an MB3 character when b0 is only known to be >= 0xE0 and its
  upper bound has not been verified yet: the head byte must not exceed
  0xEF, and the rest of the sequence must pass IS_UTF8MB3_STEP2.
  Intended for callers that have already ruled out, for b0:
  - an MB1 character
  - an unused gap between MB1 and MB2HEAD
  - an MB2HEAD
*/
#define IS_UTF8MB3_STEP3(b0,b1,b2)  (((uchar) (b0) <= 0xEF) && \
                                     IS_UTF8MB3_STEP2(b0,b1,b2))
51
52 /*
53 UTF-8 quick four-byte mask:
54 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  This encoding can represent U+00010000..U+001FFFFF
56
57 The maximum character defined in the Unicode standard is U+0010FFFF.
58 Higher characters U+00110000..U+001FFFFF are not used.
59
  11110000.10010000.10000000.10000000 == F0.90.80.80 == U+00010000 (min)
61 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
62
63 Valid codes:
64 [F0][90..BF][80..BF][80..BF]
65 [F1][80..BF][80..BF][80..BF]
66 [F2][80..BF][80..BF][80..BF]
67 [F3][80..BF][80..BF][80..BF]
68 [F4][80..8F][80..BF][80..BF]
69 */
70
/*
  Check an MB4 character assuming that b0 is already
  known to be in the range [0xF0..0xF4].

  All three trailing bytes must be continuation bytes. In addition:
  - [F0][80..8F] is rejected (would decode to a value below U+10000)
  - [F4][90..BF] is rejected (would decode to a value above U+10FFFF)

  b0 and b1 are converted to uchar before the range comparisons, so the
  macro gives the same answer for signed and unsigned char arguments.
  Note that b0 and b1 appear more than once in the expansion: do not
  pass arguments with side effects.
*/
#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
                                       IS_CONTINUATION_BYTE(b2) && \
                                       IS_CONTINUATION_BYTE(b3) && \
                                       ((uchar) (b0) >= 0xf1 || \
                                        (uchar) (b1) >= 0x90) && \
                                       ((uchar) (b0) <= 0xf3 || \
                                        (uchar) (b1) <= 0x8F))
/*
  Check an MB4 character when the upper bound of b0 has not been verified
  yet: the head byte must not exceed 0xF4, and the rest of the sequence
  must pass IS_UTF8MB4_STEP2 (which expects b0 >= 0xF0 to be known).
*/
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) <= 0xF4) && \
                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
82
/* Convert individual bytes to Unicode code points */

/*
  Combine a validated 2-byte sequence into a code point: the low 5 bits
  of b0 followed by the low 6 bits of b1. "^ 0x80" strips the 10xxxxxx
  marker and therefore relies on b1 having been checked as a continuation
  byte. Each argument is parenthesized before the cast so that expression
  arguments are converted as a whole.
*/
#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) (b0) & 0x1f) << 6)  |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80)))
/*
  Combine a validated 3-byte sequence into a code point: the low 4 bits
  of b0, then the low 6 bits of b1 and of b2. "^ 0x80" strips the
  10xxxxxx marker and therefore relies on b1 and b2 having been checked
  as continuation bytes. Each argument is parenthesized before the cast
  so that expression arguments are converted as a whole.
*/
#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) (b0) & 0x0f) << 12) |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 6)  |\
                                   ((my_wc_t) ((uchar) (b2) ^ 0x80)))
/*
  Combine a validated 4-byte sequence into a code point: the low 3 bits
  of b0, then the low 6 bits of b1, b2 and b3. "^ 0x80" strips the
  10xxxxxx marker and therefore relies on b1..b3 having been checked as
  continuation bytes. Each argument is parenthesized before the cast so
  that expression arguments are converted as a whole.
*/
#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) (b0) & 0x07) << 18) |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 12) |\
                                   ((my_wc_t) ((uchar) (b2) ^ 0x80) << 6)  |\
                                   ((my_wc_t) ((uchar) (b3) ^ 0x80)))
93
94 static inline int
my_mb_wc_utf8mb3_quick(my_wc_t * pwc,const uchar * s,const uchar * e)95 my_mb_wc_utf8mb3_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
96 {
97 uchar c;
98
99 if (s >= e)
100 return MY_CS_TOOSMALL;
101
102 c= s[0];
103 if (c < 0x80)
104 {
105 *pwc = c;
106 return 1;
107 }
108 else if (c < 0xc2)
109 return MY_CS_ILSEQ;
110 else if (c < 0xe0)
111 {
112 if (s+2 > e) /* We need 2 characters */
113 return MY_CS_TOOSMALL2;
114
115 if (!(IS_CONTINUATION_BYTE(s[1])))
116 return MY_CS_ILSEQ;
117
118 *pwc= UTF8MB2_CODE(c, s[1]);
119 return 2;
120 }
121 else if (c < 0xf0)
122 {
123 if (s+3 > e) /* We need 3 characters */
124 return MY_CS_TOOSMALL3;
125
126 if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
127 return MY_CS_ILSEQ;
128
129 *pwc= UTF8MB3_CODE(c, s[1], s[2]);
130 return 3;
131 }
132 return MY_CS_ILSEQ;
133 }
134
135
136 #ifdef HAVE_CHARSET_utf8mb4
137 static inline int
my_mb_wc_utf8mb4_quick(my_wc_t * pwc,const uchar * s,const uchar * e)138 my_mb_wc_utf8mb4_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
139 {
140 uchar c;
141
142 if (s >= e)
143 return MY_CS_TOOSMALL;
144
145 c= s[0];
146 if (c < 0x80)
147 {
148 *pwc= c;
149 return 1;
150 }
151 else if (c < 0xc2)
152 return MY_CS_ILSEQ;
153 else if (c < 0xe0)
154 {
155 if (s + 2 > e) /* We need 2 characters */
156 return MY_CS_TOOSMALL2;
157
158 if (!(IS_CONTINUATION_BYTE(s[1])))
159 return MY_CS_ILSEQ;
160
161 *pwc= UTF8MB2_CODE(c, s[1]);
162 return 2;
163 }
164 else if (c < 0xf0)
165 {
166 if (s + 3 > e) /* We need 3 characters */
167 return MY_CS_TOOSMALL3;
168
169 if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
170 return MY_CS_ILSEQ;
171
172 *pwc= UTF8MB3_CODE(c, s[1], s[2]);
173 return 3;
174 }
175 else if (c < 0xf5)
176 {
177 if (s + 4 > e) /* We need 4 characters */
178 return MY_CS_TOOSMALL4;
179
180 if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
181 return MY_CS_ILSEQ;
182 *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
183 return 4;
184 }
185 return MY_CS_ILSEQ;
186 }
187 #endif /* HAVE_CHARSET_utf8mb4*/
188
189
190 #endif /* _CTYPE_UTF8_H */
191