1 #include <stdbool.h>
2 #include "utf8.h"
3 #include "ascii.h"
4 
/*
 * Classify a byte by the role it can play at the start of a UTF-8 sequence.
 *
 * Returns:
 *    1        for an ASCII byte (below 0x80)
 *    0        for a continuation byte (0x80..0xbf), which cannot start
 *             a sequence
 *    2, 3, 4  for a lead byte (0xc0..0xdf, 0xe0..0xef, 0xf0..0xf4) of a
 *             sequence of that many bytes
 *   -1        for anything from 0xf5 upwards
 */
static int u_seq_len(unsigned int first_byte)
{
    // Ranges are tested from the top down, so each test only needs a
    // lower bound.
    if (first_byte >= 0xf5) {
        // The 4-byte bit pattern extends up to 0xf7, but RFC 3629 doesn't
        // allow codepoints above 0x10ffff
        return -1;
    }
    if (first_byte >= 0xf0) {
        return 4;
    }
    if (first_byte >= 0xe0) {
        return 3;
    }
    if (first_byte >= 0xc0) {
        return 2;
    }
    if (first_byte >= 0x80) {
        return 0;
    }
    return 1;
}
26 
/*
 * Return true if bits 7..6 of u are "10", i.e. u has the form of a UTF-8
 * continuation byte. Only those two bits are inspected.
 */
static bool u_is_continuation(CodePoint u)
{
    const bool bit7_set = (u & 0x80) != 0;
    const bool bit6_set = (u & 0x40) != 0;
    return bit7_set && !bit6_set;
}
31 
/*
 * Return true if u_char_size() reports exactly len for the decoded value u.
 *
 * The decoders call this after assembling a value from a len-byte sequence.
 * NOTE(review): presumably u_char_size() yields the shortest encoding
 * length, which would make this the overlong-encoding check -- confirm
 * against its definition.
 */
static bool u_seq_len_ok(CodePoint u, int len)
{
    if (u_char_size(u) != len) {
        return false;
    }
    return true;
}
36 
/*
 * Return the mask that extracts the payload bits from the first byte of
 * a UTF-8 sequence that is len bytes long.
 *
 * Len  Mask         Note
 * -------------------------------------------------
 * 1    0111 1111    Not supported by this function!
 * 2    0001 1111
 * 3    0000 1111
 * 4    0000 0111
 * 5    0000 0011    Forbidden by RFC 3629
 * 6    0000 0001    Forbidden by RFC 3629
 */
static unsigned int u_get_first_byte_mask(unsigned int len)
{
    // Lowest bit that is NOT part of the payload: 0x80 moved down by len
    const unsigned int first_non_payload_bit = 0x80U >> len;

    // Every bit below it belongs to the payload
    return first_non_payload_bit - 1;
}
51 
/*
 * Sum u_char_width() over every character of the NUL-terminated string
 * str, decoding it with u_str_get_char().
 */
size_t u_str_width(const unsigned char *str)
{
    size_t total = 0;
    size_t pos = 0;

    // u_str_get_char() advances pos past each character it decodes
    while (str[pos] != '\0') {
        total += u_char_width(u_str_get_char(str, &pos));
    }
    return total;
}
60 
u_prev_char(const unsigned char * buf,size_t * idx)61 CodePoint u_prev_char(const unsigned char *buf, size_t *idx)
62 {
63     size_t i = *idx;
64     unsigned int count, shift;
65     CodePoint u;
66 
67     u = buf[--i];
68     if (u < 0x80) {
69         *idx = i;
70         return u;
71     }
72 
73     if (!u_is_continuation(u)) {
74         goto invalid;
75     }
76 
77     u &= 0x3f;
78     count = 1;
79     shift = 6;
80     while (i) {
81         unsigned int ch = buf[--i];
82         unsigned int len = u_seq_len(ch);
83 
84         count++;
85         if (len == 0) {
86             if (count == 4) {
87                 // Too long sequence
88                 break;
89             }
90             u |= (ch & 0x3f) << shift;
91             shift += 6;
92         } else if (count != len) {
93             // Incorrect length
94             break;
95         } else {
96             u |= (ch & u_get_first_byte_mask(len)) << shift;
97             if (!u_seq_len_ok(u, len)) {
98                 break;
99             }
100 
101             *idx = i;
102             return u;
103         }
104     }
105 invalid:
106     *idx = *idx - 1;
107     u = buf[*idx];
108     return -u;
109 }
110 
u_str_get_char(const unsigned char * str,size_t * idx)111 CodePoint u_str_get_char(const unsigned char *str, size_t *idx)
112 {
113     size_t i = *idx;
114     CodePoint u = str[i];
115     if (u < 0x80) {
116         *idx = i + 1;
117         return u;
118     }
119     return u_get_nonascii(str, i + 4, idx);
120 }
121 
u_get_char(const unsigned char * buf,size_t size,size_t * idx)122 CodePoint u_get_char(const unsigned char *buf, size_t size, size_t *idx)
123 {
124     size_t i = *idx;
125     CodePoint u = buf[i];
126     if (u < 0x80) {
127         *idx = i + 1;
128         return u;
129     }
130     return u_get_nonascii(buf, size, idx);
131 }
132 
u_get_nonascii(const unsigned char * buf,size_t size,size_t * idx)133 CodePoint u_get_nonascii(const unsigned char *buf, size_t size, size_t *idx)
134 {
135     size_t i = *idx;
136     int len, c;
137     unsigned int first, u;
138 
139     first = buf[i++];
140     len = u_seq_len(first);
141     if (unlikely(len < 2 || len > size - i + 1)) {
142         goto invalid;
143     }
144 
145     u = first & u_get_first_byte_mask(len);
146     c = len - 1;
147     do {
148         CodePoint ch = buf[i++];
149         if (!u_is_continuation(ch)) {
150             goto invalid;
151         }
152         u = (u << 6) | (ch & 0x3f);
153     } while (--c);
154 
155     if (!u_seq_len_ok(u, len)) {
156         goto invalid;
157     }
158 
159     *idx = i;
160     return u;
161 invalid:
162     *idx += 1;
163     return -first;
164 }
165 
/*
 * Encode u as UTF-8 at str[*idx] and advance *idx by the number of bytes
 * written (1 to 4).
 *
 * Values above 0x10ffff are not encoded; their low 8 bits are written as
 * a single byte instead. No escaping of any kind is done and no
 * terminator is written. The caller must provide room for up to 4 bytes.
 */
void u_set_char_raw(char *str, size_t *idx, CodePoint u)
{
    const size_t start = *idx;
    char *out = str + start;
    size_t n;

    if (u <= 0x7f) {
        out[0] = u;
        n = 1;
    } else if (u <= 0x7ff) {
        out[0] = (u >> 6) | 0xc0;
        out[1] = (u & 0x3f) | 0x80;
        n = 2;
    } else if (u <= 0xffff) {
        out[0] = (u >> 12) | 0xe0;
        out[1] = ((u >> 6) & 0x3f) | 0x80;
        out[2] = (u & 0x3f) | 0x80;
        n = 3;
    } else if (u <= 0x10ffff) {
        out[0] = (u >> 18) | 0xf0;
        out[1] = ((u >> 12) & 0x3f) | 0x80;
        out[2] = ((u >> 6) & 0x3f) | 0x80;
        out[3] = (u & 0x3f) | 0x80;
        n = 4;
    } else {
        // Invalid byte value
        out[0] = u & 0xff;
        n = 1;
    }

    *idx = start + n;
}
192 
/*
 * Write a displayable representation of u at str[*idx] and advance *idx
 * past what was written:
 *
 *  - ASCII control characters are written by u_set_ctrl()
 *  - other ASCII characters are stored as a single byte
 *  - values for which u_is_unprintable() is true are written by
 *    u_set_hex() (4 bytes)
 *  - everything else up to 0x10ffff is stored as plain UTF-8
 *
 * A value above 0x10ffff that is not reported as unprintable writes
 * nothing and leaves *idx untouched.
 */
void u_set_char(char *str, size_t *idx, CodePoint u)
{
    if (u < 0x80) {
        if (ascii_iscntrl(u)) {
            u_set_ctrl(str, idx, u);
        } else {
            const size_t i = *idx;
            str[i] = u;
            *idx = i + 1;
        }
        return;
    }

    if (u_is_unprintable(u)) {
        u_set_hex(str, idx, u);
        return;
    }

    if (u <= 0x10ffff) {
        // u is in 0x80..0x10ffff here, which u_set_char_raw() encodes as
        // a 2, 3 or 4 byte sequence; no need to duplicate the encoder.
        u_set_char_raw(str, idx, u);
    }
}
225 
/*
 * Write a 4-byte placeholder for u at str[*idx] and advance *idx by 4.
 *
 * If u_is_unicode(u) is false, u is treated as a negated invalid byte (as
 * returned by the decoders in this file) and the placeholder is "<xx>",
 * where xx is the byte in lower-case hex. Otherwise it is "<??>".
 */
void u_set_hex(char *str, size_t *idx, CodePoint u)
{
    static const char hex_tab[16] = "0123456789abcdef";
    char *out = &str[*idx];
    char hi = '?';
    char lo = '?';

    if (!u_is_unicode(u)) {
        // Invalid byte (negated): negate again to recover the byte value
        const CodePoint byte = -u;
        hi = hex_tab[(byte >> 4) & 0x0f];
        lo = hex_tab[byte & 0x0f];
    }

    out[0] = '<';
    out[1] = hi;
    out[2] = lo;
    out[3] = '>';
    *idx += 4;
}
243 
/*
 * Skip characters from the start of the NUL-terminated string str until
 * at least *width columns (as measured by u_char_width()) have been
 * skipped or the string ends.
 *
 * Returns the number of bytes skipped. On return *width holds the number
 * of columns actually skipped. That can be less than requested if the
 * string ended early, or more if the last character skipped was wider
 * than the columns that remained.
 */
size_t u_skip_chars(const char *str, int *width)
{
    // u_str_get_char() takes unsigned bytes; convert explicitly instead of
    // relying on an implicit pointer-signedness conversion.
    const unsigned char *bytes = (const unsigned char *)str;
    int remaining = *width;
    size_t pos = 0;

    while (bytes[pos] != '\0' && remaining > 0) {
        remaining -= u_char_width(u_str_get_char(bytes, &pos));
    }

    // remaining is (requested - skipped), so this leaves the skipped
    // column count in *width. When remaining is negative we skipped
    // 'too much' (e.g. the last char was double width or invalid (<xx>)).
    *width -= remaining;
    return pos;
}
258 
has_prefix(const char * str,const char * prefix_lcase)259 static bool has_prefix(const char *str, const char *prefix_lcase)
260 {
261     size_t ni = 0;
262     size_t hi = 0;
263     CodePoint pc;
264     while ((pc = u_str_get_char(prefix_lcase, &ni))) {
265         CodePoint sc = u_str_get_char(str, &hi);
266         if (sc != pc && u_to_lower(sc) != pc) {
267             return false;
268         }
269     }
270     return true;
271 }
272 
u_str_index(const char * haystack,const char * needle_lcase)273 ssize_t u_str_index(const char *haystack, const char *needle_lcase)
274 {
275     size_t hi = 0;
276     size_t ni = 0;
277     CodePoint nc = u_str_get_char(needle_lcase, &ni);
278 
279     if (!nc) {
280         return 0;
281     }
282 
283     while (haystack[hi]) {
284         size_t prev = hi;
285         CodePoint hc = u_str_get_char(haystack, &hi);
286         if (
287             (hc == nc || u_to_lower(hc) == nc)
288             && has_prefix(haystack + hi, needle_lcase + ni)
289         ) {
290             return prev;
291         }
292     }
293     return -1;
294 }
295