1 #include <stdbool.h>
2 #include "utf8.h"
3 #include "ascii.h"
4
/*
 * Classify a byte by the role it can play at the start of a UTF-8 sequence.
 *
 * Returns:
 *    1  for an ASCII byte (below 0x80)
 *    0  for a continuation byte (0x80..0xbf)
 *    2  for 0xc0..0xdf
 *    3  for 0xe0..0xef
 *    4  for 0xf0..0xf4
 *   -1  for anything from 0xf5 upwards
 *
 * Lead bytes 0xf5..0xf7 would syntactically start a 4-byte sequence, but
 * RFC 3629 does not allow codepoints above 0x10ffff, so they are rejected.
 */
static int u_seq_len(unsigned int first_byte)
{
    if (first_byte >= 0xf5) {
        return -1;
    }
    if (first_byte >= 0xf0) {
        return 4;
    }
    if (first_byte >= 0xe0) {
        return 3;
    }
    if (first_byte >= 0xc0) {
        return 2;
    }
    if (first_byte >= 0x80) {
        return 0;
    }
    return 1;
}
26
/*
 * Return true if bits 7..6 of `u` are "10", i.e. the low byte has the
 * form 10xxxxxx used by UTF-8 continuation bytes.
 */
static bool u_is_continuation(CodePoint u)
{
    const CodePoint top_bits = u & 0xc0;
    return top_bits == 0x80;
}
31
/*
 * Return true if u_char_size() reports exactly `len` for the decoded
 * value `u`. The decoders in this file call it after assembling a
 * multi-byte sequence of `len` bytes.
 */
static bool u_seq_len_ok(CodePoint u, int len)
{
    if (u_char_size(u) != len) {
        return false;
    }
    return true;
}
36
/*
 * Mask selecting the payload bits of the lead byte of a `len`-byte
 * UTF-8 sequence.
 *
 * Len  Mask         Note
 * -------------------------------------------------
 * 1    0111 1111    Not supported by this function!
 * 2    0001 1111
 * 3    0000 1111
 * 4    0000 0111
 * 5    0000 0011    Forbidden by RFC 3629
 * 6    0000 0001    Forbidden by RFC 3629
 *
 * (For len == 1 the formula yields 0011 1111, not the mask in the table,
 * which is why that row is marked unsupported.)
 */
static unsigned int u_get_first_byte_mask(unsigned int len)
{
    // 0x80 >> len is the lowest marker bit; everything below it is payload
    const unsigned int lowest_marker_bit = 0x80U >> len;
    return lowest_marker_bit - 1U;
}
51
u_str_width(const unsigned char * str)52 size_t u_str_width(const unsigned char *str)
53 {
54 size_t i = 0, w = 0;
55 while (str[i]) {
56 w += u_char_width(u_str_get_char(str, &i));
57 }
58 return w;
59 }
60
u_prev_char(const unsigned char * buf,size_t * idx)61 CodePoint u_prev_char(const unsigned char *buf, size_t *idx)
62 {
63 size_t i = *idx;
64 unsigned int count, shift;
65 CodePoint u;
66
67 u = buf[--i];
68 if (u < 0x80) {
69 *idx = i;
70 return u;
71 }
72
73 if (!u_is_continuation(u)) {
74 goto invalid;
75 }
76
77 u &= 0x3f;
78 count = 1;
79 shift = 6;
80 while (i) {
81 unsigned int ch = buf[--i];
82 unsigned int len = u_seq_len(ch);
83
84 count++;
85 if (len == 0) {
86 if (count == 4) {
87 // Too long sequence
88 break;
89 }
90 u |= (ch & 0x3f) << shift;
91 shift += 6;
92 } else if (count != len) {
93 // Incorrect length
94 break;
95 } else {
96 u |= (ch & u_get_first_byte_mask(len)) << shift;
97 if (!u_seq_len_ok(u, len)) {
98 break;
99 }
100
101 *idx = i;
102 return u;
103 }
104 }
105 invalid:
106 *idx = *idx - 1;
107 u = buf[*idx];
108 return -u;
109 }
110
u_str_get_char(const unsigned char * str,size_t * idx)111 CodePoint u_str_get_char(const unsigned char *str, size_t *idx)
112 {
113 size_t i = *idx;
114 CodePoint u = str[i];
115 if (u < 0x80) {
116 *idx = i + 1;
117 return u;
118 }
119 return u_get_nonascii(str, i + 4, idx);
120 }
121
u_get_char(const unsigned char * buf,size_t size,size_t * idx)122 CodePoint u_get_char(const unsigned char *buf, size_t size, size_t *idx)
123 {
124 size_t i = *idx;
125 CodePoint u = buf[i];
126 if (u < 0x80) {
127 *idx = i + 1;
128 return u;
129 }
130 return u_get_nonascii(buf, size, idx);
131 }
132
u_get_nonascii(const unsigned char * buf,size_t size,size_t * idx)133 CodePoint u_get_nonascii(const unsigned char *buf, size_t size, size_t *idx)
134 {
135 size_t i = *idx;
136 int len, c;
137 unsigned int first, u;
138
139 first = buf[i++];
140 len = u_seq_len(first);
141 if (unlikely(len < 2 || len > size - i + 1)) {
142 goto invalid;
143 }
144
145 u = first & u_get_first_byte_mask(len);
146 c = len - 1;
147 do {
148 CodePoint ch = buf[i++];
149 if (!u_is_continuation(ch)) {
150 goto invalid;
151 }
152 u = (u << 6) | (ch & 0x3f);
153 } while (--c);
154
155 if (!u_seq_len_ok(u, len)) {
156 goto invalid;
157 }
158
159 *idx = i;
160 return u;
161 invalid:
162 *idx += 1;
163 return -first;
164 }
165
/*
 * Write `u` to str at *idx as UTF-8 (1 to 4 bytes, chosen by value) and
 * advance *idx by the number of bytes written.
 *
 * Values above 0x10ffff are not encoded; a single byte holding the low
 * 8 bits of `u` is written instead. No bounds checking is done on str.
 */
void u_set_char_raw(char *str, size_t *idx, CodePoint u)
{
    const size_t pos = *idx;
    size_t nbytes;

    if (u <= 0x7f) {
        str[pos] = u;
        nbytes = 1;
    } else if (u <= 0x7ff) {
        str[pos + 0] = (u >> 6) | 0xc0;
        str[pos + 1] = (u & 0x3f) | 0x80;
        nbytes = 2;
    } else if (u <= 0xffff) {
        str[pos + 0] = (u >> 12) | 0xe0;
        str[pos + 1] = ((u >> 6) & 0x3f) | 0x80;
        str[pos + 2] = (u & 0x3f) | 0x80;
        nbytes = 3;
    } else if (u <= 0x10ffff) {
        str[pos + 0] = (u >> 18) | 0xf0;
        str[pos + 1] = ((u >> 12) & 0x3f) | 0x80;
        str[pos + 2] = ((u >> 6) & 0x3f) | 0x80;
        str[pos + 3] = (u & 0x3f) | 0x80;
        nbytes = 4;
    } else {
        // Invalid byte value
        str[pos] = u & 0xff;
        nbytes = 1;
    }

    *idx = pos + nbytes;
}
192
/*
 * Write a displayable form of `u` to str at *idx and advance *idx.
 *
 *  - values below 0x80 for which ascii_iscntrl() is true go through
 *    u_set_ctrl()
 *  - other values below 0x80 are copied as a single byte
 *  - values for which u_is_unprintable() is true go through u_set_hex()
 *  - everything else up to 0x10ffff is encoded as 2..4 bytes of UTF-8
 *
 * A value above 0x10ffff that u_is_unprintable() does not flag writes
 * nothing and leaves *idx untouched. No bounds checking is done on str.
 */
void u_set_char(char *str, size_t *idx, CodePoint u)
{
    const size_t pos = *idx;

    if (u < 0x80) {
        if (ascii_iscntrl(u)) {
            u_set_ctrl(str, idx, u);
            return;
        }
        str[pos] = u;
        *idx = pos + 1;
        return;
    }

    if (u_is_unprintable(u)) {
        u_set_hex(str, idx, u);
        return;
    }

    if (u <= 0x7ff) {
        str[pos + 0] = (u >> 6) | 0xc0;
        str[pos + 1] = (u & 0x3f) | 0x80;
        *idx = pos + 2;
        return;
    }

    if (u <= 0xffff) {
        str[pos + 0] = (u >> 12) | 0xe0;
        str[pos + 1] = ((u >> 6) & 0x3f) | 0x80;
        str[pos + 2] = (u & 0x3f) | 0x80;
        *idx = pos + 3;
        return;
    }

    if (u <= 0x10ffff) {
        str[pos + 0] = (u >> 18) | 0xf0;
        str[pos + 1] = ((u >> 12) & 0x3f) | 0x80;
        str[pos + 2] = ((u >> 6) & 0x3f) | 0x80;
        str[pos + 3] = (u & 0x3f) | 0x80;
        *idx = pos + 4;
        return;
    }
}
225
/*
 * Write a 4-byte placeholder for `u` to str at *idx and advance *idx
 * by 4.
 *
 * If u_is_unicode(u) is false, `u` is treated as a negated invalid byte:
 * it is negated back and written as "<xx>" with two lowercase hex
 * digits. Otherwise "<??>" is written.
 */
void u_set_hex(char *str, size_t *idx, CodePoint u)
{
    static const char hex_tab[16] = "0123456789abcdef";
    char *out = str + *idx;
    char high = '?';
    char low = '?';

    if (!u_is_unicode(u)) {
        // Invalid byte (negated)
        const CodePoint byte = -u;
        high = hex_tab[(byte >> 4) & 0x0f];
        low = hex_tab[byte & 0x0f];
    }

    out[0] = '<';
    out[1] = high;
    out[2] = low;
    out[3] = '>';
    *idx += 4;
}
243
/*
 * Skip characters from the start of the NUL-terminated string `str`
 * until at least *width columns (as measured by u_char_width()) have
 * been consumed or the string ends.
 *
 * Returns the number of bytes skipped. On return *width has been
 * adjusted by the overshoot, if any (see the comment in the body).
 */
size_t u_skip_chars(const char *str, int *width)
{
    // u_str_get_char() takes unsigned bytes; convert explicitly instead
    // of relying on an implicit char* -> unsigned char* conversion
    const unsigned char *ustr = (const unsigned char *)str;
    int w = *width;
    size_t idx = 0;

    while (ustr[idx] && w > 0) {
        w -= u_char_width(u_str_get_char(ustr, &idx));
    }

    // Add 1..3 if skipped 'too much' (the last char was double
    // width or invalid (<xx>)); w is <= 0 here whenever the loop
    // stopped because the width ran out
    *width -= w;
    return idx;
}
258
has_prefix(const char * str,const char * prefix_lcase)259 static bool has_prefix(const char *str, const char *prefix_lcase)
260 {
261 size_t ni = 0;
262 size_t hi = 0;
263 CodePoint pc;
264 while ((pc = u_str_get_char(prefix_lcase, &ni))) {
265 CodePoint sc = u_str_get_char(str, &hi);
266 if (sc != pc && u_to_lower(sc) != pc) {
267 return false;
268 }
269 }
270 return true;
271 }
272
u_str_index(const char * haystack,const char * needle_lcase)273 ssize_t u_str_index(const char *haystack, const char *needle_lcase)
274 {
275 size_t hi = 0;
276 size_t ni = 0;
277 CodePoint nc = u_str_get_char(needle_lcase, &ni);
278
279 if (!nc) {
280 return 0;
281 }
282
283 while (haystack[hi]) {
284 size_t prev = hi;
285 CodePoint hc = u_str_get_char(haystack, &hi);
286 if (
287 (hc == nc || u_to_lower(hc) == nc)
288 && has_prefix(haystack + hi, needle_lcase + ni)
289 ) {
290 return prev;
291 }
292 }
293 return -1;
294 }
295