1 // The latest version of this library is available on GitHub;
2 // https://github.com/sheredom/utf8.h
3
4 // This is free and unencumbered software released into the public domain.
5 //
6 // Anyone is free to copy, modify, publish, use, compile, sell, or
7 // distribute this software, either in source code form or as a compiled
8 // binary, for any purpose, commercial or non-commercial, and by any
9 // means.
10 //
11 // In jurisdictions that recognize copyright laws, the author or authors
12 // of this software dedicate any and all copyright interest in the
13 // software to the public domain. We make this dedication for the benefit
14 // of the public at large and to the detriment of our heirs and
15 // successors. We intend this dedication to be an overt act of
16 // relinquishment in perpetuity of all present and future rights to this
17 // software under copyright law.
18 //
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 // OTHER DEALINGS IN THE SOFTWARE.
26 //
27 // For more information, please refer to <http://unlicense.org/>
28
29 #ifndef SHEREDOM_UTF8_H_INCLUDED
30 #define SHEREDOM_UTF8_H_INCLUDED
31
32 #if defined(_MSC_VER)
33 #pragma warning(push)
34
35 // disable 'bytes padding added after construct' warning
36 #pragma warning(disable : 4820)
37 #endif
38
39 #include <stddef.h>
40 #include <stdlib.h>
41
42 #if defined(_MSC_VER)
43 #pragma warning(pop)
44 #endif
45
46 #if defined(_MSC_VER)
47 typedef __int32 utf8_int32_t;
48 #else
49 #include <stdint.h>
50 typedef int32_t utf8_int32_t;
51 #endif
52
53 #if defined(__clang__)
54 #pragma clang diagnostic push
55 #pragma clang diagnostic ignored "-Wold-style-cast"
56 #pragma clang diagnostic ignored "-Wcast-qual"
57 #endif
58
59 #ifdef __cplusplus
60 extern "C" {
61 #endif
62
63 #if defined(__clang__) || defined(__GNUC__)
64 #define utf8_nonnull __attribute__((nonnull))
65 #define utf8_pure __attribute__((pure))
66 #define utf8_restrict __restrict__
67 #define utf8_weak __attribute__((weak))
68 #elif defined(_MSC_VER)
69 #define utf8_nonnull
70 #define utf8_pure
71 #define utf8_restrict __restrict
72 #define utf8_weak __inline
73 #else
74 #error Non clang, non gcc, non MSVC compiler found!
75 #endif
76
77 #ifdef __cplusplus
78 #define utf8_null NULL
79 #else
80 #define utf8_null 0
81 #endif
82
83 // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
84 // src2 respectively, case insensitive.
85 utf8_nonnull utf8_pure utf8_weak int utf8casecmp(const void *src1,
86 const void *src2);
87
88 // Append the utf8 string src onto the utf8 string dst.
89 utf8_nonnull utf8_weak void *utf8cat(void *utf8_restrict dst,
90 const void *utf8_restrict src);
91
92 // Find the first match of the utf8 codepoint chr in the utf8 string src.
93 utf8_nonnull utf8_pure utf8_weak void *utf8chr(const void *src,
94 utf8_int32_t chr);
95
96 // Return less than 0, 0, greater than 0 if src1 < src2,
97 // src1 == src2, src1 > src2 respectively.
98 utf8_nonnull utf8_pure utf8_weak int utf8cmp(const void *src1,
99 const void *src2);
100
101 // Copy the utf8 string src onto the memory allocated in dst.
102 utf8_nonnull utf8_weak void *utf8cpy(void *utf8_restrict dst,
103 const void *utf8_restrict src);
104
105 // Number of utf8 codepoints in the utf8 string src that consists entirely
106 // of utf8 codepoints not from the utf8 string reject.
107 utf8_nonnull utf8_pure utf8_weak size_t utf8cspn(const void *src,
108 const void *reject);
109
// Duplicate the utf8 string src by getting its size, malloc'ing a new buffer,
// copying over the data, and returning that. Returns 0 if malloc failed.
112 utf8_nonnull utf8_weak void *utf8dup(const void *src);
113
114 // Number of utf8 codepoints in the utf8 string str,
115 // excluding the null terminating byte.
116 utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);
117
118 // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
119 // src2 respectively, case insensitive. Checking at most n bytes of each utf8
120 // string.
121 utf8_nonnull utf8_pure utf8_weak int utf8ncasecmp(const void *src1,
122 const void *src2, size_t n);
123
124 // Append the utf8 string src onto the utf8 string dst,
125 // writing at most n+1 bytes. Can produce an invalid utf8
126 // string if n falls partway through a utf8 codepoint.
127 utf8_nonnull utf8_weak void *utf8ncat(void *utf8_restrict dst,
128 const void *utf8_restrict src, size_t n);
129
130 // Return less than 0, 0, greater than 0 if src1 < src2,
131 // src1 == src2, src1 > src2 respectively. Checking at most n
132 // bytes of each utf8 string.
133 utf8_nonnull utf8_pure utf8_weak int utf8ncmp(const void *src1,
134 const void *src2, size_t n);
135
// Copy the utf8 string src onto the memory allocated in dst.
// Copies at most n bytes. If there is no terminating null byte in
// the first n bytes of src, the string placed into dst will not be
// null-terminated. If the size (in bytes) of src is less than n,
// extra null terminating bytes are appended to dst such that a
// total of n bytes are written. Can produce an invalid utf8
// string if n falls partway through a utf8 codepoint.
143 utf8_nonnull utf8_weak void *utf8ncpy(void *utf8_restrict dst,
144 const void *utf8_restrict src, size_t n);
145
146 // Similar to utf8dup, except that at most n bytes of src are copied. If src is
147 // longer than n, only n bytes are copied and a null byte is added.
148 //
149 // Returns a new string if successful, 0 otherwise
150 utf8_nonnull utf8_weak void *utf8ndup(const void *src, size_t n);
151
// Locates the first occurrence in the utf8 string str of any utf8 codepoint
// in the utf8 string accept, or 0 if no match was found.
154 utf8_nonnull utf8_pure utf8_weak void *utf8pbrk(const void *str,
155 const void *accept);
156
157 // Find the last match of the utf8 codepoint chr in the utf8 string src.
158 utf8_nonnull utf8_pure utf8_weak void *utf8rchr(const void *src, int chr);
159
160 // Number of bytes in the utf8 string str,
161 // including the null terminating byte.
162 utf8_nonnull utf8_pure utf8_weak size_t utf8size(const void *str);
163
164 // Number of utf8 codepoints in the utf8 string src that consists entirely
165 // of utf8 codepoints from the utf8 string accept.
166 utf8_nonnull utf8_pure utf8_weak size_t utf8spn(const void *src,
167 const void *accept);
168
169 // The position of the utf8 string needle in the utf8 string haystack.
170 utf8_nonnull utf8_pure utf8_weak void *utf8str(const void *haystack,
171 const void *needle);
172
173 // The position of the utf8 string needle in the utf8 string haystack, case
174 // insensitive.
175 utf8_nonnull utf8_pure utf8_weak void *utf8casestr(const void *haystack,
176 const void *needle);
177
178 // Return 0 on success, or the position of the invalid
179 // utf8 codepoint on failure.
180 utf8_nonnull utf8_pure utf8_weak void *utf8valid(const void *str);
181
182 // Sets out_codepoint to the next utf8 codepoint in str, and returns the address
183 // of the utf8 codepoint after the current one in str.
184 utf8_nonnull utf8_weak void *
185 utf8codepoint(const void *utf8_restrict str,
186 utf8_int32_t *utf8_restrict out_codepoint);
187
188 // Returns the size of the given codepoint in bytes.
189 utf8_weak size_t utf8codepointsize(utf8_int32_t chr);
190
191 // Write a codepoint to the given string, and return the address to the next
192 // place after the written codepoint. Pass how many bytes left in the buffer to
193 // n. If there is not enough space for the codepoint, this function returns
194 // null.
195 utf8_nonnull utf8_weak void *utf8catcodepoint(void *utf8_restrict str,
196 utf8_int32_t chr, size_t n);
197
198 // Returns 1 if the given character is lowercase, or 0 if it is not.
199 utf8_weak int utf8islower(utf8_int32_t chr);
200
201 // Returns 1 if the given character is uppercase, or 0 if it is not.
202 utf8_weak int utf8isupper(utf8_int32_t chr);
203
204 // Transform the given string into all lowercase codepoints.
205 utf8_nonnull utf8_weak void utf8lwr(void *utf8_restrict str);
206
207 // Transform the given string into all uppercase codepoints.
208 utf8_nonnull utf8_weak void utf8upr(void *utf8_restrict str);
209
210 // Make a codepoint lower case if possible.
211 utf8_weak utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
212
213 // Make a codepoint upper case if possible.
214 utf8_weak utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
215
216 #undef utf8_weak
217 #undef utf8_pure
218 #undef utf8_nonnull
219
utf8casecmp(const void * src1,const void * src2)220 int utf8casecmp(const void *src1, const void *src2) {
221 utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
222
223 for (;;) {
224 src1 = utf8codepoint(src1, &src1_cp);
225 src2 = utf8codepoint(src2, &src2_cp);
226
227 // Take a copy of src1 & src2
228 src1_orig_cp = src1_cp;
229 src2_orig_cp = src2_cp;
230
231 // Lower the srcs if required
232 src1_cp = utf8lwrcodepoint(src1_cp);
233 src2_cp = utf8lwrcodepoint(src2_cp);
234
235 // Check if the lowered codepoints match
236 if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
237 return 0;
238 } else if (src1_cp == src2_cp) {
239 continue;
240 }
241
242 // If they don't match, then we return which of the original's are less
243 if (src1_orig_cp < src2_orig_cp) {
244 return -1;
245 } else if (src1_orig_cp > src2_orig_cp) {
246 return 1;
247 }
248 }
249 }
250
// Append the bytes of the utf8 string src to the end of the utf8 string dst
// and return dst.
//
// The null terminating byte of dst is overwritten by the first byte of src
// and a new terminator is written after the last copied byte. No bounds
// checking is performed on dst.
void *utf8cat(void *utf8_restrict dst, const void *utf8_restrict src) {
  char *out = (char *)dst;
  const char *in = (const char *)src;

  // walk forward to the null terminating byte of dst
  for (; '\0' != *out; out++) {
  }

  // copy every byte of src up to, but not including, its terminator
  for (; '\0' != *in; in++, out++) {
    *out = *in;
  }

  // terminate the combined string
  *out = '\0';

  return dst;
}
270
// Find the first match of the utf8 codepoint chr in the utf8 string src.
//
// chr is encoded into a short null terminated utf8 string, which is then
// searched for with utf8str; the result of that search is returned. When
// chr is 0 the address of the null terminating byte of src is returned.
void *utf8chr(const void *src, utf8_int32_t chr) {
  // room for the widest (4-byte) encoding plus a null terminating byte
  char needle[5] = {'\0', '\0', '\0', '\0', '\0'};

  if (0 == chr) {
    // the terminator itself was asked for: run to the end of src and
    // hand back that position
    const char *end = (const char *)src;

    for (; '\0' != *end; end++) {
    }

    return (void *)end;
  }

  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    // fits in 7 bits: 0b0xxxxxxx
    needle[0] = (char)chr;
  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    // fits in 11 bits: 0b110xxxxx 0b10xxxxxx
    needle[0] = 0xc0 | (char)(chr >> 6);
    needle[1] = 0x80 | (char)(chr & 0x3f);
  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    // fits in 16 bits: 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
    needle[0] = 0xe0 | (char)(chr >> 12);
    needle[1] = 0x80 | (char)((chr >> 6) & 0x3f);
    needle[2] = 0x80 | (char)(chr & 0x3f);
  } else {
    // anything else is written in the 4-byte form:
    // 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
    needle[0] = 0xf0 | (char)(chr >> 18);
    needle[1] = 0x80 | (char)((chr >> 12) & 0x3f);
    needle[2] = 0x80 | (char)((chr >> 6) & 0x3f);
    needle[3] = 0x80 | (char)(chr & 0x3f);
  }

  // needle now holds the encoded codepoint followed by a terminator, so a
  // plain substring search locates it
  return utf8str(src, needle);
}
311
// Compare the utf8 strings src1 and src2 byte by byte, treating each byte
// as unsigned.
//
// Returns -1 if the first differing byte is smaller in src1, 1 if it is
// larger in src1, and 0 if both strings are identical up to and including
// their null terminating byte.
int utf8cmp(const void *src1, const void *src2) {
  const unsigned char *lhs = (const unsigned char *)src1;
  const unsigned char *rhs = (const unsigned char *)src2;

  for (;; lhs++, rhs++) {
    if (*lhs != *rhs) {
      // first difference decides the ordering
      return (*lhs < *rhs) ? -1 : 1;
    }

    if ('\0' == *lhs) {
      // the bytes are equal and both are the terminator
      return 0;
    }
  }
}
330
331 int utf8coll(const void *src1, const void *src2);
332
// Copy the utf8 string src, including a null terminating byte, into the
// memory at dst and return dst.
//
// Whatever dst previously held is overwritten. No bounds checking is
// performed on dst.
void *utf8cpy(void *utf8_restrict dst, const void *utf8_restrict src) {
  char *out = (char *)dst;
  const char *in = (const char *)src;
  size_t i;

  // copy every byte of src that precedes its terminator
  for (i = 0; '\0' != in[i]; i++) {
    out[i] = in[i];
  }

  // finish with the null terminating byte
  out[i] = '\0';

  return dst;
}
348
// Count the utf8 codepoints at the start of src that do not appear in the
// utf8 string reject.
//
// Returns the number of codepoints before the first codepoint of src that
// is also in reject, or the total number of codepoints in src if there is
// no such codepoint.
size_t utf8cspn(const void *src, const void *reject) {
  const char *s = (const char *)src;
  size_t chars = 0;

  while ('\0' != *s) {
    const char *r = (const char *)reject;
    // number of bytes of the current reject codepoint matched against s
    size_t offset = 0;

    while ('\0' != *r) {
      // r has reached the start of the next codepoint (not 0b10xxxxxx)
      // after matching every byte of the previous one - that is a match
      if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
        return chars;
      }

      if (*r == s[offset]) {
        // part of a codepoint matched, keep checking its next byte
        offset++;
        r++;
      } else {
        // mismatch: r may be partway through a codepoint, so march it on
        // to the start of the next one and begin matching afresh
        do {
          r++;
        } while (0x80 == (0xc0 & *r));

        offset = 0;
      }
    }

    // the match was against the final codepoint of reject, so the loop
    // above ran out of bytes before it had the chance to report it
    if (0 < offset) {
      return chars;
    }

    // the current codepoint of src is not in reject: step over all of its
    // bytes and count it
    do {
      s++;
    } while (0x80 == (0xc0 & *s));
    chars++;
  }

  return chars;
}
394
395 size_t utf8size(const void *str);
396
utf8dup(const void * src)397 void *utf8dup(const void *src) {
398 const char *s = (const char *)src;
399 char *n = utf8_null;
400
401 // figure out how many bytes (including the terminator) we need to copy first
402 size_t bytes = utf8size(src);
403
404 n = (char *)malloc(bytes);
405
406 if (utf8_null == n) {
407 // out of memory so we bail
408 return utf8_null;
409 } else {
410 bytes = 0;
411
412 // copy src byte-by-byte into our new utf8 string
413 while ('\0' != s[bytes]) {
414 n[bytes] = s[bytes];
415 bytes++;
416 }
417
418 // append null terminating byte
419 n[bytes] = '\0';
420 return n;
421 }
422 }
423
424 void *utf8fry(const void *str);
425
// Count the utf8 codepoints in the utf8 string str, excluding the null
// terminating byte.
//
// The length of each codepoint is taken from its leading byte. If the
// string ends partway through a multi-byte codepoint, the truncated
// codepoint is counted once and counting stops at the terminator rather
// than stepping over it.
size_t utf8len(const void *str) {
  const unsigned char *s = (const unsigned char *)str;
  size_t length = 0;

  while ('\0' != *s) {
    // number of bytes that should follow the leading byte
    size_t trailing;

    if (0xf0 == (0xf8 & *s)) {
      // 4-byte utf8 code point (began with 0b11110xxx)
      trailing = 3;
    } else if (0xe0 == (0xf0 & *s)) {
      // 3-byte utf8 code point (began with 0b1110xxxx)
      trailing = 2;
    } else if (0xc0 == (0xe0 & *s)) {
      // 2-byte utf8 code point (began with 0b110xxxxx)
      trailing = 1;
    } else {
      // anything else is stepped over as a single byte
      trailing = 0;
    }

    // step over the leading byte, then over the bytes it announced. The
    // terminator check stops a truncated codepoint from carrying s beyond
    // the end of the string.
    s++;
    while ((0 != trailing) && ('\0' != *s)) {
      s++;
      trailing--;
    }

    // however many bytes were stepped over, it was only 1 utf8 codepoint
    length++;
  }

  return length;
}
452
utf8ncasecmp(const void * src1,const void * src2,size_t n)453 int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
454 utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
455
456 do {
457 const unsigned char *const s1 = (const unsigned char *)src1;
458 const unsigned char *const s2 = (const unsigned char *)src2;
459
460 // first check that we have enough bytes left in n to contain an entire
461 // codepoint
462 if (0 == n) {
463 return 0;
464 }
465
466 if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
467 const utf8_int32_t c1 = (0xe0 & *s1);
468 const utf8_int32_t c2 = (0xe0 & *s2);
469
470 if (c1 < c2) {
471 return -1;
472 } else if (c1 > c2) {
473 return 1;
474 } else {
475 return 0;
476 }
477 }
478
479 if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
480 const utf8_int32_t c1 = (0xf0 & *s1);
481 const utf8_int32_t c2 = (0xf0 & *s2);
482
483 if (c1 < c2) {
484 return -1;
485 } else if (c1 > c2) {
486 return 1;
487 } else {
488 return 0;
489 }
490 }
491
492 if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
493 const utf8_int32_t c1 = (0xf8 & *s1);
494 const utf8_int32_t c2 = (0xf8 & *s2);
495
496 if (c1 < c2) {
497 return -1;
498 } else if (c1 > c2) {
499 return 1;
500 } else {
501 return 0;
502 }
503 }
504
505 src1 = utf8codepoint(src1, &src1_cp);
506 src2 = utf8codepoint(src2, &src2_cp);
507 n -= utf8codepointsize(src1_cp);
508
509 // Take a copy of src1 & src2
510 src1_orig_cp = src1_cp;
511 src2_orig_cp = src2_cp;
512
513 // Lower srcs if required
514 src1_cp = utf8lwrcodepoint(src1_cp);
515 src2_cp = utf8lwrcodepoint(src2_cp);
516
517 // Check if the lowered codepoints match
518 if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
519 return 0;
520 } else if (src1_cp == src2_cp) {
521 continue;
522 }
523
524 // If they don't match, then we return which of the original's are less
525 if (src1_orig_cp < src2_orig_cp) {
526 return -1;
527 } else if (src1_orig_cp > src2_orig_cp) {
528 return 1;
529 }
530 } while (0 < n);
531
532 // both utf8 strings matched
533 return 0;
534 }
535
// Append at most n bytes of the utf8 string src onto the end of the utf8
// string dst, then write a null terminating byte. Returns dst.
//
// Up to n+1 bytes are written starting at the old terminator of dst. If
// src is empty or n is 0, dst is left unchanged. The result can be an
// invalid utf8 string if n falls partway through a codepoint.
void *utf8ncat(void *utf8_restrict dst, const void *utf8_restrict src,
               size_t n) {
  char *d = (char *)dst;
  const char *s = (const char *)src;

  // find the null terminating byte in dst
  while ('\0' != *d) {
    d++;
  }

  // append src byte-by-byte. Both limits are tested before each copy, so
  // an empty src is never read past its terminator and an n of 0 copies
  // nothing.
  while ((0 != n) && ('\0' != *s)) {
    *d++ = *s++;
    n--;
  }

  // write out a new null terminating byte into dst
  *d = '\0';

  return dst;
}
557
// Compare at most n bytes of the utf8 strings src1 and src2, treating each
// byte as unsigned.
//
// Returns -1 if the first differing byte is smaller in src1, 1 if it is
// larger in src1, and 0 if no difference is found before n bytes have been
// compared or both strings end.
int utf8ncmp(const void *src1, const void *src2, size_t n) {
  const unsigned char *lhs = (const unsigned char *)src1;
  const unsigned char *rhs = (const unsigned char *)src2;
  size_t i;

  for (i = 0; i < n; i++) {
    const unsigned char a = lhs[i];
    const unsigned char b = rhs[i];

    if (a != b) {
      // first difference decides the ordering
      return (a < b) ? -1 : 1;
    }

    if ('\0' == a) {
      // the bytes are equal and both are the terminator
      break;
    }
  }

  // both utf8 strings matched
  return 0;
}
576
// Copy at most n bytes of the utf8 string src into the memory at dst and
// return dst.
//
// Exactly n bytes are written to dst. If src is shorter than n bytes, the
// remainder is filled with null bytes. If src has no null terminating byte
// in its first n bytes, dst is not null terminated. With an n of 0 nothing
// is read or written. The result can be an invalid utf8 string if n falls
// partway through a codepoint.
void *utf8ncpy(void *utf8_restrict dst, const void *utf8_restrict src,
               size_t n) {
  char *d = (char *)dst;
  const char *s = (const char *)src;
  size_t index = 0;

  // copy bytes from src until n bytes are written or its terminator is
  // reached. The bound is tested first so that no byte beyond the first n
  // bytes of src is ever read.
  while ((index < n) && ('\0' != s[index])) {
    d[index] = s[index];
    index++;
  }

  // pad what is left of the n bytes with null bytes
  while (index < n) {
    d[index] = '\0';
    index++;
  }

  return dst;
}
596
utf8ndup(const void * src,size_t n)597 void *utf8ndup(const void *src, size_t n) {
598 const char *s = (const char *)src;
599 char *c = utf8_null;
600 size_t bytes = 0;
601
602 // Find the end of the string or stop when n is reached
603 while ('\0' != s[bytes] && bytes < n) {
604 bytes++;
605 }
606
607 // In case bytes is actually less than n, we need to set it
608 // to be used later in the copy byte by byte.
609 n = bytes;
610
611 c = (char *)malloc(bytes + 1);
612 if (utf8_null == c) {
613 // out of memory so we bail
614 return utf8_null;
615 }
616
617 bytes = 0;
618
619 // copy src byte-by-byte into our new utf8 string
620 while ('\0' != s[bytes] && bytes < n) {
621 c[bytes] = s[bytes];
622 bytes++;
623 }
624
625 // append null terminating byte
626 c[bytes] = '\0';
627 return c;
628 }
629
utf8rchr(const void * src,int chr)630 void *utf8rchr(const void *src, int chr) {
631 const char *s = (const char *)src;
632 const char *match = utf8_null;
633 char c[5] = {'\0', '\0', '\0', '\0', '\0'};
634
635 if (0 == chr) {
636 // being asked to return position of null terminating byte, so
637 // just run s to the end, and return!
638 while ('\0' != *s) {
639 s++;
640 }
641 return (void *)s;
642 } else if (0 == ((int)0xffffff80 & chr)) {
643 // 1-byte/7-bit ascii
644 // (0b0xxxxxxx)
645 c[0] = (char)chr;
646 } else if (0 == ((int)0xfffff800 & chr)) {
647 // 2-byte/11-bit utf8 code point
648 // (0b110xxxxx 0b10xxxxxx)
649 c[0] = 0xc0 | (char)(chr >> 6);
650 c[1] = 0x80 | (char)(chr & 0x3f);
651 } else if (0 == ((int)0xffff0000 & chr)) {
652 // 3-byte/16-bit utf8 code point
653 // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
654 c[0] = 0xe0 | (char)(chr >> 12);
655 c[1] = 0x80 | (char)((chr >> 6) & 0x3f);
656 c[2] = 0x80 | (char)(chr & 0x3f);
657 } else { // if (0 == ((int)0xffe00000 & chr)) {
658 // 4-byte/21-bit utf8 code point
659 // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
660 c[0] = 0xf0 | (char)(chr >> 18);
661 c[1] = 0x80 | (char)((chr >> 12) & 0x3f);
662 c[2] = 0x80 | (char)((chr >> 6) & 0x3f);
663 c[3] = 0x80 | (char)(chr & 0x3f);
664 }
665
666 // we've created a 2 utf8 codepoint string in c that is
667 // the utf8 character asked for by chr, and a null
668 // terminating byte
669
670 while ('\0' != *s) {
671 size_t offset = 0;
672
673 while (s[offset] == c[offset]) {
674 offset++;
675 }
676
677 if ('\0' == c[offset]) {
678 // we found a matching utf8 code point
679 match = s;
680 s += offset;
681 } else {
682 s += offset;
683
684 // need to march s along to next utf8 codepoint start
685 // (the next byte that doesn't match 0b10xxxxxx)
686 if ('\0' != *s) {
687 do {
688 s++;
689 } while (0x80 == (0xc0 & *s));
690 }
691 }
692 }
693
694 // return the last match we found (or 0 if no match was found)
695 return (void *)match;
696 }
697
utf8pbrk(const void * str,const void * accept)698 void *utf8pbrk(const void *str, const void *accept) {
699 const char *s = (const char *)str;
700
701 while ('\0' != *s) {
702 const char *a = (const char *)accept;
703 size_t offset = 0;
704
705 while ('\0' != *a) {
706 // checking that if *a is the start of a utf8 codepoint
707 // (it is not 0b10xxxxxx) and we have successfully matched
708 // a previous character (0 < offset) - we found a match
709 if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
710 return (void *)s;
711 } else {
712 if (*a == s[offset]) {
713 // part of a utf8 codepoint matched, so move our checking
714 // onwards to the next byte
715 offset++;
716 a++;
717 } else {
718 // r could be in the middle of an unmatching utf8 code point,
719 // so we need to march it on to the next character beginning,
720
721 do {
722 a++;
723 } while (0x80 == (0xc0 & *a));
724
725 // reset offset too as we found a mismatch
726 offset = 0;
727 }
728 }
729 }
730
731 // we found a match on the last utf8 codepoint
732 if (0 < offset) {
733 return (void *)s;
734 }
735
736 // the current utf8 codepoint in src did not match accept, but src
737 // could have been partway through a utf8 codepoint, so we need to
738 // march it onto the next utf8 codepoint starting byte
739 do {
740 s++;
741 } while ((0x80 == (0xc0 & *s)));
742 }
743
744 return utf8_null;
745 }
746
// Number of bytes occupied by the utf8 string str, with the null
// terminating byte included in the count.
size_t utf8size(const void *str) {
  const char *const begin = (const char *)str;
  const char *end = begin;

  // walk to the null terminating byte
  for (; '\0' != *end; end++) {
  }

  // the + 1 accounts for the terminator itself
  return (size_t)(end - begin) + 1;
}
758
// Count the utf8 codepoints at the start of src that all appear in the
// utf8 string accept.
//
// Returns the number of codepoints before the first codepoint of src that
// is not in accept, or the total number of codepoints in src if every one
// of them is in accept.
size_t utf8spn(const void *src, const void *accept) {
  const char *s = (const char *)src;
  size_t chars = 0;

  while ('\0' != *s) {
    const char *a = (const char *)accept;
    // number of bytes of the current accept codepoint matched against s
    size_t offset = 0;

    while ('\0' != *a) {
      // a has reached the start of the next codepoint (not 0b10xxxxxx)
      // after matching every byte of the previous one - that is a match,
      // so stop looking through accept
      if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
        break;
      }

      if (*a == s[offset]) {
        // part of a codepoint matched, keep checking its next byte
        offset++;
        a++;
      } else {
        // mismatch: a may be partway through a codepoint, so march it on
        // to the start of the next one and begin matching afresh
        do {
          a++;
        } while (0x80 == (0xc0 & *a));

        offset = 0;
      }
    }

    // offset is only non-zero here when a whole codepoint matched, either
    // reported by the break above or found on the final codepoint of
    // accept, where the loop simply ran out of bytes. With no match the
    // run of accepted codepoints has ended.
    if (0 == offset) {
      return chars;
    }

    // count the matched codepoint and step over all of its bytes
    chars++;
    s += offset;
  }

  return chars;
}
804
utf8str(const void * haystack,const void * needle)805 void *utf8str(const void *haystack, const void *needle) {
806 const char *h = (const char *)haystack;
807
808 // if needle has no utf8 codepoints before the null terminating
809 // byte then return haystack
810 if ('\0' == *((const char *)needle)) {
811 return (void *)haystack;
812 }
813
814 while ('\0' != *h) {
815 const char *maybeMatch = h;
816 const char *n = (const char *)needle;
817
818 while (*h == *n && (*h != '\0' && *n != '\0')) {
819 n++;
820 h++;
821 }
822
823 if ('\0' == *n) {
824 // we found the whole utf8 string for needle in haystack at
825 // maybeMatch, so return it
826 return (void *)maybeMatch;
827 } else {
828 // h could be in the middle of an unmatching utf8 codepoint,
829 // so we need to march it on to the next character beginning,
830 if ('\0' != *h) {
831 do {
832 h++;
833 } while (0x80 == (0xc0 & *h));
834 }
835 }
836 }
837
838 // no match
839 return utf8_null;
840 }
841
utf8casestr(const void * haystack,const void * needle)842 void *utf8casestr(const void *haystack, const void *needle) {
843 const void *h = haystack;
844
845 // if needle has no utf8 codepoints before the null terminating
846 // byte then return haystack
847 if ('\0' == *((const char *)needle)) {
848 return (void *)haystack;
849 }
850
851 for (;;) {
852 const void *maybeMatch = h;
853 const void *n = needle;
854 utf8_int32_t h_cp, n_cp;
855
856 h = utf8codepoint(h, &h_cp);
857 n = utf8codepoint(n, &n_cp);
858
859 while ((0 != h_cp) && (0 != n_cp)) {
860 h_cp = utf8lwrcodepoint(h_cp);
861 n_cp = utf8lwrcodepoint(n_cp);
862
863 // if we find a mismatch, bail out!
864 if (h_cp != n_cp) {
865 break;
866 }
867
868 h = utf8codepoint(h, &h_cp);
869 n = utf8codepoint(n, &n_cp);
870 }
871
872 if (0 == n_cp) {
873 // we found the whole utf8 string for needle in haystack at
874 // maybeMatch, so return it
875 return (void *)maybeMatch;
876 }
877
878 if (0 == h_cp) {
879 // no match
880 return utf8_null;
881 }
882 }
883 }
884
utf8valid(const void * str)885 void *utf8valid(const void *str) {
886 const char *s = (const char *)str;
887
888 while ('\0' != *s) {
889 if (0xf0 == (0xf8 & *s)) {
890 // ensure each of the 3 following bytes in this 4-byte
891 // utf8 codepoint began with 0b10xxxxxx
892 if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
893 (0x80 != (0xc0 & s[3]))) {
894 return (void *)s;
895 }
896
897 // ensure that our utf8 codepoint ended after 4 bytes
898 if (0x80 == (0xc0 & s[4])) {
899 return (void *)s;
900 }
901
902 // ensure that the top 5 bits of this 4-byte utf8
903 // codepoint were not 0, as then we could have used
904 // one of the smaller encodings
905 if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
906 return (void *)s;
907 }
908
909 // 4-byte utf8 code point (began with 0b11110xxx)
910 s += 4;
911 } else if (0xe0 == (0xf0 & *s)) {
912 // ensure each of the 2 following bytes in this 3-byte
913 // utf8 codepoint began with 0b10xxxxxx
914 if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
915 return (void *)s;
916 }
917
918 // ensure that our utf8 codepoint ended after 3 bytes
919 if (0x80 == (0xc0 & s[3])) {
920 return (void *)s;
921 }
922
923 // ensure that the top 5 bits of this 3-byte utf8
924 // codepoint were not 0, as then we could have used
925 // one of the smaller encodings
926 if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
927 return (void *)s;
928 }
929
930 // 3-byte utf8 code point (began with 0b1110xxxx)
931 s += 3;
932 } else if (0xc0 == (0xe0 & *s)) {
933 // ensure the 1 following byte in this 2-byte
934 // utf8 codepoint began with 0b10xxxxxx
935 if (0x80 != (0xc0 & s[1])) {
936 return (void *)s;
937 }
938
939 // ensure that our utf8 codepoint ended after 2 bytes
940 if (0x80 == (0xc0 & s[2])) {
941 return (void *)s;
942 }
943
944 // ensure that the top 4 bits of this 2-byte utf8
945 // codepoint were not 0, as then we could have used
946 // one of the smaller encodings
947 if (0 == (0x1e & s[0])) {
948 return (void *)s;
949 }
950
951 // 2-byte utf8 code point (began with 0b110xxxxx)
952 s += 2;
953 } else if (0x00 == (0x80 & *s)) {
954 // 1-byte ascii (began with 0b0xxxxxxx)
955 s += 1;
956 } else {
957 // we have an invalid 0b1xxxxxxx utf8 code point entry
958 return (void *)s;
959 }
960 }
961
962 return utf8_null;
963 }
964
// Decode the utf8 codepoint that starts at str into out_codepoint.
//
// The number of bytes consumed is chosen from the leading byte alone; the
// bytes that follow are not checked. Returns the address of the byte
// directly after the decoded codepoint.
void *utf8codepoint(const void *utf8_restrict str,
                    utf8_int32_t *utf8_restrict out_codepoint) {
  const char *s = (const char *)str;

  if (0xf0 == (0xf8 & s[0])) {
    // 4 bytes: 3 payload bits from the leading byte, 6 from each follower
    *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
                     ((0x3f & s[2]) << 6) | (0x3f & s[3]);
    return (void *)(s + 4);
  }

  if (0xe0 == (0xf0 & s[0])) {
    // 3 bytes: 4 payload bits from the leading byte, 6 from each follower
    *out_codepoint =
        ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
    return (void *)(s + 3);
  }

  if (0xc0 == (0xe0 & s[0])) {
    // 2 bytes: 5 payload bits from the leading byte, 6 from the follower
    *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
    return (void *)(s + 2);
  }

  // anything else is taken as a single byte, stored as-is
  *out_codepoint = s[0];
  return (void *)(s + 1);
}
991
// Number of bytes used to write the codepoint chr as utf8: 1 for values
// below 0x80, 2 below 0x800, 3 below 0x10000, and 4 for everything else.
size_t utf8codepointsize(utf8_int32_t chr) {
  if (chr < 0) {
    // a negative value has its top bit set, which is above every
    // threshold, so it lands in the widest form
    return 4;
  }

  if (chr < 0x80) {
    return 1;
  }

  if (chr < 0x800) {
    return 2;
  }

  if (chr < 0x10000) {
    return 3;
  }

  return 4;
}
1003
// Write the codepoint chr as utf8 at str, where n is the number of bytes
// available there.
//
// Returns the address directly after the bytes written, or utf8_null
// (writing nothing) if the encoding needs more than n bytes. No null
// terminating byte is written.
void *utf8catcodepoint(void *utf8_restrict str, utf8_int32_t chr, size_t n) {
  char *out = (char *)str;
  // bytes the encoding of chr occupies
  size_t needed;

  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    needed = 1;
  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    needed = 2;
  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    needed = 3;
  } else {
    needed = 4;
  }

  // not enough room for the whole codepoint
  if (n < needed) {
    return utf8_null;
  }

  switch (needed) {
  case 1:
    // 7 bits: 0b0xxxxxxx
    out[0] = (char)chr;
    break;
  case 2:
    // 11 bits: 0b110xxxxx 0b10xxxxxx
    out[0] = 0xc0 | (char)(chr >> 6);
    out[1] = 0x80 | (char)(chr & 0x3f);
    break;
  case 3:
    // 16 bits: 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
    out[0] = 0xe0 | (char)(chr >> 12);
    out[1] = 0x80 | (char)((chr >> 6) & 0x3f);
    out[2] = 0x80 | (char)(chr & 0x3f);
    break;
  default:
    // 21 bits: 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
    out[0] = 0xf0 | (char)(chr >> 18);
    out[1] = 0x80 | (char)((chr >> 12) & 0x3f);
    out[2] = 0x80 | (char)((chr >> 6) & 0x3f);
    out[3] = 0x80 | (char)(chr & 0x3f);
    break;
  }

  return out + needed;
}
1049
// Returns 1 if utf8uprcodepoint maps chr to a different codepoint (so chr
// is treated as lowercase), or 0 if it leaves chr unchanged.
int utf8islower(utf8_int32_t chr) {
  const utf8_int32_t upper = utf8uprcodepoint(chr);

  return (upper != chr) ? 1 : 0;
}
1051
// Returns 1 if utf8lwrcodepoint maps chr to a different codepoint (so chr
// is treated as uppercase), or 0 if it leaves chr unchanged.
int utf8isupper(utf8_int32_t chr) {
  const utf8_int32_t lower = utf8lwrcodepoint(chr);

  return (lower != chr) ? 1 : 0;
}
1053
// Convert every codepoint of the utf8 string str to lower case, in place.
//
// Each codepoint is decoded with utf8codepoint and mapped with
// utf8lwrcodepoint; codepoints that the mapping changes are written back
// over the original bytes with utf8catcodepoint.
//
// NOTE(review): the write-back happens at the position of the original
// codepoint, which presumes the lowered codepoint encodes to the same
// number of bytes - confirm against the case mapping table.
void utf8lwr(void *utf8_restrict str) {
  void *current = (char *)str;
  utf8_int32_t cp;
  void *next = utf8codepoint(current, &cp);

  // current is the codepoint being handled, next is the one after it. A
  // decoded value of 0 is the null terminating byte.
  for (; 0 != cp; current = next, next = utf8codepoint(current, &cp)) {
    const utf8_int32_t lowered = utf8lwrcodepoint(cp);

    // only touch the string when the mapping changed something
    if (lowered != cp) {
      utf8catcodepoint(current, lowered, utf8codepointsize(lowered));
    }
  }
}
1073
// Convert the string at str to uppercase in place.
//
// The string is walked one code point at a time; utf8codepoint is relied on
// to yield the code point at the cursor together with the address of the one
// after it. The walk stops at the first code point equal to 0.
//
// A changed code point is re-encoded directly over the bytes it was decoded
// from, so the replacement must encode to the same number of bytes as the
// original. That holds for the pairs in utf8uprcodepoint, which never cross
// an encoded-length boundary.
void utf8upr(void *utf8_restrict str) {
  void *cursor = (char *)str;
  utf8_int32_t decoded = 0;
  void *next = utf8codepoint(cursor, &decoded);

  for (; 0 != decoded; cursor = next, next = utf8codepoint(cursor, &decoded)) {
    const utf8_int32_t raised = utf8uprcodepoint(decoded);

    // Nothing to rewrite when the code point is already its own uppercase.
    if (raised == decoded) {
      continue;
    }

    utf8catcodepoint(cursor, raised, utf8codepointsize(raised));
  }
}
1093
// Map a single code point to its lowercase counterpart.
//
// Only the ranges and individual code points listed below (Basic Latin,
// Latin-1 Supplement, Latin Extended-A/B and Greek) are handled; every other
// value, including code points that are already lowercase, is returned
// unchanged.
utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
  // Runs where the lowercase letter sits exactly 32 above the uppercase one.
  // The gaps at 0x00d7 and 0x03a2 are deliberately left out of the ranges.
  if (((cp >= 0x0041) && (cp <= 0x005a)) ||
      ((cp >= 0x00c0) && (cp <= 0x00d6)) ||
      ((cp >= 0x00d8) && (cp <= 0x00de)) ||
      ((cp >= 0x0391) && (cp <= 0x03a1)) ||
      ((cp >= 0x03a3) && (cp <= 0x03ab))) {
    return cp + 32;
  }

  // Runs of adjacent pairs laid out as (even = upper, odd = lower): setting
  // the low bit lowers an uppercase letter and leaves a lowercase one alone.
  if (((cp >= 0x0100) && (cp <= 0x012f)) ||
      ((cp >= 0x0132) && (cp <= 0x0137)) ||
      ((cp >= 0x014a) && (cp <= 0x0177)) ||
      ((cp >= 0x0182) && (cp <= 0x0185)) ||
      ((cp >= 0x01a0) && (cp <= 0x01a5)) ||
      ((cp >= 0x01de) && (cp <= 0x01ef)) ||
      ((cp >= 0x01f8) && (cp <= 0x021f)) ||
      ((cp >= 0x0222) && (cp <= 0x0233)) ||
      ((cp >= 0x0246) && (cp <= 0x024f)) ||
      ((cp >= 0x03d8) && (cp <= 0x03ef))) {
    return cp | 0x1;
  }

  // Runs of adjacent pairs laid out as (odd = upper, even = lower): rounding
  // up to the next even value lowers an uppercase letter and leaves a
  // lowercase one alone.
  if (((cp >= 0x0139) && (cp <= 0x0148)) ||
      ((cp >= 0x0179) && (cp <= 0x017e)) ||
      ((cp >= 0x01af) && (cp <= 0x01b0)) ||
      ((cp >= 0x01b3) && (cp <= 0x01b6)) ||
      ((cp >= 0x01cd) && (cp <= 0x01dc))) {
    return (cp + 1) & ~0x1;
  }

  // Irregular pairs that follow none of the patterns above. 0x01af is not
  // listed here because the odd/even range test above already handles it.
  switch (cp) {
  case 0x0178: return 0x00ff;
  case 0x0243: return 0x0180;
  case 0x018e: return 0x01dd;
  case 0x023d: return 0x019a;
  case 0x0220: return 0x019e;
  case 0x01b7: return 0x0292;
  case 0x01c4: return 0x01c6;
  case 0x01c7: return 0x01c9;
  case 0x01ca: return 0x01cc;
  case 0x01f1: return 0x01f3;
  case 0x01f7: return 0x01bf;
  case 0x0187: return 0x0188;
  case 0x018b: return 0x018c;
  case 0x0191: return 0x0192;
  case 0x0198: return 0x0199;
  case 0x01a7: return 0x01a8;
  case 0x01ac: return 0x01ad;
  case 0x01b8: return 0x01b9;
  case 0x01bc: return 0x01bd;
  case 0x01f4: return 0x01f5;
  case 0x023b: return 0x023c;
  case 0x0241: return 0x0242;
  case 0x03fd: return 0x037b;
  case 0x03fe: return 0x037c;
  case 0x03ff: return 0x037d;
  case 0x037f: return 0x03f3;
  case 0x0386: return 0x03ac;
  case 0x0388: return 0x03ad;
  case 0x0389: return 0x03ae;
  case 0x038a: return 0x03af;
  case 0x038c: return 0x03cc;
  case 0x038e: return 0x03cd;
  case 0x038f: return 0x03ce;
  case 0x0370: return 0x0371;
  case 0x0372: return 0x0373;
  case 0x0376: return 0x0377;
  // NOTE(review): UnicodeData.txt appears to list U+03B8 as the simple
  // lowercase of U+03F4; the U+03D1 pairing is kept unchanged here - confirm
  // it is intentional.
  case 0x03f4: return 0x03d1;
  case 0x03cf: return 0x03d7;
  case 0x03f9: return 0x03f2;
  case 0x03f7: return 0x03f8;
  case 0x03fa: return 0x03fb;
  default: break;
  }

  // No lowercase mapping known for this code point.
  return cp;
}
1169
// Map a single code point to its uppercase counterpart.
//
// Only the ranges and individual code points listed below (Basic Latin,
// Latin-1 Supplement, Latin Extended-A/B, IPA 0x0292 and Greek) are handled;
// every other value, including code points that are already uppercase, is
// returned unchanged.
utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
  // Runs where the uppercase letter sits exactly 32 below the lowercase one.
  // The gaps at 0x00f7 and 0x03c2 are deliberately left out of the ranges.
  if (((cp >= 0x0061) && (cp <= 0x007a)) ||
      ((cp >= 0x00e0) && (cp <= 0x00f6)) ||
      ((cp >= 0x00f8) && (cp <= 0x00fe)) ||
      ((cp >= 0x03b1) && (cp <= 0x03c1)) ||
      ((cp >= 0x03c3) && (cp <= 0x03cb))) {
    return cp - 32;
  }

  // Runs of adjacent pairs laid out as (even = upper, odd = lower): clearing
  // the low bit raises a lowercase letter and leaves an uppercase one alone.
  if (((cp >= 0x0100) && (cp <= 0x012f)) ||
      ((cp >= 0x0132) && (cp <= 0x0137)) ||
      ((cp >= 0x014a) && (cp <= 0x0177)) ||
      ((cp >= 0x0182) && (cp <= 0x0185)) ||
      ((cp >= 0x01a0) && (cp <= 0x01a5)) ||
      ((cp >= 0x01de) && (cp <= 0x01ef)) ||
      ((cp >= 0x01f8) && (cp <= 0x021f)) ||
      ((cp >= 0x0222) && (cp <= 0x0233)) ||
      ((cp >= 0x0246) && (cp <= 0x024f)) ||
      ((cp >= 0x03d8) && (cp <= 0x03ef))) {
    return cp & ~0x1;
  }

  // Runs of adjacent pairs laid out as (odd = upper, even = lower): rounding
  // down to the previous odd value raises a lowercase letter and leaves an
  // uppercase one alone.
  if (((cp >= 0x0139) && (cp <= 0x0148)) ||
      ((cp >= 0x0179) && (cp <= 0x017e)) ||
      ((cp >= 0x01af) && (cp <= 0x01b0)) ||
      ((cp >= 0x01b3) && (cp <= 0x01b6)) ||
      ((cp >= 0x01cd) && (cp <= 0x01dc))) {
    return (cp - 1) | 0x1;
  }

  // Irregular pairs that follow none of the patterns above. 0x01b0 is not
  // listed here because the odd/even range test above already handles it.
  switch (cp) {
  case 0x00ff: return 0x0178;
  case 0x0180: return 0x0243;
  case 0x01dd: return 0x018e;
  case 0x019a: return 0x023d;
  case 0x019e: return 0x0220;
  case 0x0292: return 0x01b7;
  case 0x01c6: return 0x01c4;
  case 0x01c9: return 0x01c7;
  case 0x01cc: return 0x01ca;
  case 0x01f3: return 0x01f1;
  case 0x01bf: return 0x01f7;
  case 0x0188: return 0x0187;
  case 0x018c: return 0x018b;
  case 0x0192: return 0x0191;
  case 0x0199: return 0x0198;
  case 0x01a8: return 0x01a7;
  case 0x01ad: return 0x01ac;
  case 0x01b9: return 0x01b8;
  case 0x01bd: return 0x01bc;
  case 0x01f5: return 0x01f4;
  case 0x023c: return 0x023b;
  case 0x0242: return 0x0241;
  case 0x037b: return 0x03fd;
  case 0x037c: return 0x03fe;
  case 0x037d: return 0x03ff;
  case 0x03f3: return 0x037f;
  case 0x03ac: return 0x0386;
  case 0x03ad: return 0x0388;
  case 0x03ae: return 0x0389;
  case 0x03af: return 0x038a;
  case 0x03cc: return 0x038c;
  case 0x03cd: return 0x038e;
  case 0x03ce: return 0x038f;
  case 0x0371: return 0x0370;
  case 0x0373: return 0x0372;
  case 0x0377: return 0x0376;
  // NOTE(review): UnicodeData.txt appears to list U+0398 as the simple
  // uppercase of U+03D1; the U+03F4 pairing is kept unchanged here - confirm
  // it is intentional.
  case 0x03d1: return 0x03f4;
  case 0x03d7: return 0x03cf;
  case 0x03f2: return 0x03f9;
  case 0x03f8: return 0x03f7;
  case 0x03fb: return 0x03fa;
  default: break;
  }

  // No uppercase mapping known for this code point.
  return cp;
}
1245
1246 #undef utf8_restrict
1247 #undef utf8_null
1248
1249 #ifdef __cplusplus
1250 } // extern "C"
1251 #endif
1252
1253 #if defined(__clang__)
1254 #pragma clang diagnostic pop
1255 #endif
1256
1257 #endif // SHEREDOM_UTF8_H_INCLUDED
1258