1 // The latest version of this library is available on GitHub;
2 // https://github.com/sheredom/utf8.h
3 
4 // This is free and unencumbered software released into the public domain.
5 //
6 // Anyone is free to copy, modify, publish, use, compile, sell, or
7 // distribute this software, either in source code form or as a compiled
8 // binary, for any purpose, commercial or non-commercial, and by any
9 // means.
10 //
11 // In jurisdictions that recognize copyright laws, the author or authors
12 // of this software dedicate any and all copyright interest in the
13 // software to the public domain. We make this dedication for the benefit
14 // of the public at large and to the detriment of our heirs and
15 // successors. We intend this dedication to be an overt act of
16 // relinquishment in perpetuity of all present and future rights to this
17 // software under copyright law.
18 //
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 // OTHER DEALINGS IN THE SOFTWARE.
26 //
27 // For more information, please refer to <http://unlicense.org/>
28 
29 #ifndef SHEREDOM_UTF8_H_INCLUDED
30 #define SHEREDOM_UTF8_H_INCLUDED
31 
32 #if defined(_MSC_VER)
33 #pragma warning(push)
34 
35 // disable 'bytes padding added after construct' warning
36 #pragma warning(disable : 4820)
37 #endif
38 
39 #include <stddef.h>
40 #include <stdlib.h>
41 
42 #if defined(_MSC_VER)
43 #pragma warning(pop)
44 #endif
45 
46 #if defined(_MSC_VER)
47 typedef __int32 utf8_int32_t;
48 #else
49 #include <stdint.h>
50 typedef int32_t utf8_int32_t;
51 #endif
52 
53 #if defined(__clang__)
54 #pragma clang diagnostic push
55 #pragma clang diagnostic ignored "-Wold-style-cast"
56 #pragma clang diagnostic ignored "-Wcast-qual"
57 #endif
58 
59 #ifdef __cplusplus
60 extern "C" {
61 #endif
62 
63 #if defined(__clang__) || defined(__GNUC__)
64 #define utf8_nonnull __attribute__((nonnull))
65 #define utf8_pure __attribute__((pure))
66 #define utf8_restrict __restrict__
67 #define utf8_weak __attribute__((weak))
68 #elif defined(_MSC_VER)
69 #define utf8_nonnull
70 #define utf8_pure
71 #define utf8_restrict __restrict
72 #define utf8_weak __inline
73 #else
74 #error Non clang, non gcc, non MSVC compiler found!
75 #endif
76 
77 #ifdef __cplusplus
78 #define utf8_null NULL
79 #else
80 #define utf8_null 0
81 #endif
82 
83 // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
84 // src2 respectively, case insensitive.
85 utf8_nonnull utf8_pure utf8_weak int utf8casecmp(const void *src1,
86                                                  const void *src2);
87 
88 // Append the utf8 string src onto the utf8 string dst.
89 utf8_nonnull utf8_weak void *utf8cat(void *utf8_restrict dst,
90                                      const void *utf8_restrict src);
91 
92 // Find the first match of the utf8 codepoint chr in the utf8 string src.
93 utf8_nonnull utf8_pure utf8_weak void *utf8chr(const void *src,
94                                                utf8_int32_t chr);
95 
96 // Return less than 0, 0, greater than 0 if src1 < src2,
97 // src1 == src2, src1 > src2 respectively.
98 utf8_nonnull utf8_pure utf8_weak int utf8cmp(const void *src1,
99                                              const void *src2);
100 
101 // Copy the utf8 string src onto the memory allocated in dst.
102 utf8_nonnull utf8_weak void *utf8cpy(void *utf8_restrict dst,
103                                      const void *utf8_restrict src);
104 
105 // Number of utf8 codepoints in the utf8 string src that consists entirely
106 // of utf8 codepoints not from the utf8 string reject.
107 utf8_nonnull utf8_pure utf8_weak size_t utf8cspn(const void *src,
108                                                  const void *reject);
109 
110 // Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
111 // copying over the data, and returning that. Or 0 if malloc failed.
112 utf8_nonnull utf8_weak void *utf8dup(const void *src);
113 
114 // Number of utf8 codepoints in the utf8 string str,
115 // excluding the null terminating byte.
116 utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);
117 
118 // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
119 // src2 respectively, case insensitive. Checking at most n bytes of each utf8
120 // string.
121 utf8_nonnull utf8_pure utf8_weak int utf8ncasecmp(const void *src1,
122                                                   const void *src2, size_t n);
123 
124 // Append the utf8 string src onto the utf8 string dst,
125 // writing at most n+1 bytes. Can produce an invalid utf8
126 // string if n falls partway through a utf8 codepoint.
127 utf8_nonnull utf8_weak void *utf8ncat(void *utf8_restrict dst,
128                                       const void *utf8_restrict src, size_t n);
129 
130 // Return less than 0, 0, greater than 0 if src1 < src2,
131 // src1 == src2, src1 > src2 respectively. Checking at most n
132 // bytes of each utf8 string.
133 utf8_nonnull utf8_pure utf8_weak int utf8ncmp(const void *src1,
134                                               const void *src2, size_t n);
135 
// Copy the utf8 string src onto the memory allocated in dst.
// Copies at most n bytes. If there is no terminating null byte in
// the first n bytes of src, the string placed into dst will not be
// null-terminated. If the size (in bytes) of src is less than n,
// extra null terminating bytes are appended to dst such that a
// total of n bytes are written. Can produce an invalid utf8
// string if n falls partway through a utf8 codepoint.
143 utf8_nonnull utf8_weak void *utf8ncpy(void *utf8_restrict dst,
144                                       const void *utf8_restrict src, size_t n);
145 
146 // Similar to utf8dup, except that at most n bytes of src are copied. If src is
147 // longer than n, only n bytes are copied and a null byte is added.
148 //
149 // Returns a new string if successful, 0 otherwise
150 utf8_nonnull utf8_weak void *utf8ndup(const void *src, size_t n);
151 
// Locates the first occurrence in the utf8 string str of any utf8 codepoint
// in the utf8 string accept, or 0 if no match was found.
154 utf8_nonnull utf8_pure utf8_weak void *utf8pbrk(const void *str,
155                                                 const void *accept);
156 
157 // Find the last match of the utf8 codepoint chr in the utf8 string src.
158 utf8_nonnull utf8_pure utf8_weak void *utf8rchr(const void *src, int chr);
159 
160 // Number of bytes in the utf8 string str,
161 // including the null terminating byte.
162 utf8_nonnull utf8_pure utf8_weak size_t utf8size(const void *str);
163 
164 // Number of utf8 codepoints in the utf8 string src that consists entirely
165 // of utf8 codepoints from the utf8 string accept.
166 utf8_nonnull utf8_pure utf8_weak size_t utf8spn(const void *src,
167                                                 const void *accept);
168 
169 // The position of the utf8 string needle in the utf8 string haystack.
170 utf8_nonnull utf8_pure utf8_weak void *utf8str(const void *haystack,
171                                                const void *needle);
172 
173 // The position of the utf8 string needle in the utf8 string haystack, case
174 // insensitive.
175 utf8_nonnull utf8_pure utf8_weak void *utf8casestr(const void *haystack,
176                                                    const void *needle);
177 
178 // Return 0 on success, or the position of the invalid
179 // utf8 codepoint on failure.
180 utf8_nonnull utf8_pure utf8_weak void *utf8valid(const void *str);
181 
182 // Sets out_codepoint to the next utf8 codepoint in str, and returns the address
183 // of the utf8 codepoint after the current one in str.
184 utf8_nonnull utf8_weak void *
185 utf8codepoint(const void *utf8_restrict str,
186               utf8_int32_t *utf8_restrict out_codepoint);
187 
188 // Returns the size of the given codepoint in bytes.
189 utf8_weak size_t utf8codepointsize(utf8_int32_t chr);
190 
191 // Write a codepoint to the given string, and return the address to the next
192 // place after the written codepoint. Pass how many bytes left in the buffer to
193 // n. If there is not enough space for the codepoint, this function returns
194 // null.
195 utf8_nonnull utf8_weak void *utf8catcodepoint(void *utf8_restrict str,
196                                               utf8_int32_t chr, size_t n);
197 
198 // Returns 1 if the given character is lowercase, or 0 if it is not.
199 utf8_weak int utf8islower(utf8_int32_t chr);
200 
201 // Returns 1 if the given character is uppercase, or 0 if it is not.
202 utf8_weak int utf8isupper(utf8_int32_t chr);
203 
204 // Transform the given string into all lowercase codepoints.
205 utf8_nonnull utf8_weak void utf8lwr(void *utf8_restrict str);
206 
207 // Transform the given string into all uppercase codepoints.
208 utf8_nonnull utf8_weak void utf8upr(void *utf8_restrict str);
209 
210 // Make a codepoint lower case if possible.
211 utf8_weak utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
212 
213 // Make a codepoint upper case if possible.
214 utf8_weak utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
215 
216 #undef utf8_weak
217 #undef utf8_pure
218 #undef utf8_nonnull
219 
utf8casecmp(const void * src1,const void * src2)220 int utf8casecmp(const void *src1, const void *src2) {
221   utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
222 
223   for (;;) {
224     src1 = utf8codepoint(src1, &src1_cp);
225     src2 = utf8codepoint(src2, &src2_cp);
226 
227     // Take a copy of src1 & src2
228     src1_orig_cp = src1_cp;
229     src2_orig_cp = src2_cp;
230 
231     // Lower the srcs if required
232     src1_cp = utf8lwrcodepoint(src1_cp);
233     src2_cp = utf8lwrcodepoint(src2_cp);
234 
235     // Check if the lowered codepoints match
236     if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
237       return 0;
238     } else if (src1_cp == src2_cp) {
239       continue;
240     }
241 
242     // If they don't match, then we return which of the original's are less
243     if (src1_orig_cp < src2_orig_cp) {
244       return -1;
245     } else if (src1_orig_cp > src2_orig_cp) {
246       return 1;
247     }
248   }
249 }
250 
// Append the utf8 string src to the end of the utf8 string dst and return dst.
// No bounds checking is performed: dst has to be large enough to hold the
// combined string and its null terminating byte.
void *utf8cat(void *utf8_restrict dst, const void *utf8_restrict src) {
  const char *from = (const char *)src;
  char *to = (char *)dst;

  // walk to the null terminating byte of dst
  for (; '\0' != *to; to++) {
  }

  // copy every byte of src (terminator excluded) over the end of dst
  for (; '\0' != *from; from++, to++) {
    *to = *from;
  }

  // terminate the combined string
  *to = '\0';

  return dst;
}
270 
// Find the first occurrence of the utf8 codepoint chr in the utf8 string src.
//
// chr is encoded into a short null terminated utf8 string which is then
// searched for with utf8str, so the result is whatever utf8str returns. When
// chr is 0 the address of src's null terminating byte is returned instead.
void *utf8chr(const void *src, utf8_int32_t chr) {
  // room for the longest (4-byte) encoding plus a null terminating byte
  char encoded[5] = {'\0', '\0', '\0', '\0', '\0'};

  if (0 == chr) {
    // asked for the terminator itself: run to the end of src and return that
    const char *end = (const char *)src;
    for (; '\0' != *end; end++) {
    }
    return (void *)end;
  }

  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    // 1-byte/7-bit ascii
    // (0b0xxxxxxx)
    encoded[0] = (char)chr;
  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    // 2-byte/11-bit utf8 code point
    // (0b110xxxxx 0b10xxxxxx)
    encoded[0] = 0xc0 | (char)(chr >> 6);
    encoded[1] = 0x80 | (char)(chr & 0x3f);
  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    // 3-byte/16-bit utf8 code point
    // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
    encoded[0] = 0xe0 | (char)(chr >> 12);
    encoded[1] = 0x80 | (char)((chr >> 6) & 0x3f);
    encoded[2] = 0x80 | (char)(chr & 0x3f);
  } else {
    // everything else is written as a 4-byte/21-bit utf8 code point
    // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
    encoded[0] = 0xf0 | (char)(chr >> 18);
    encoded[1] = 0x80 | (char)((chr >> 12) & 0x3f);
    encoded[2] = 0x80 | (char)((chr >> 6) & 0x3f);
    encoded[3] = 0x80 | (char)(chr & 0x3f);
  }

  // encoded is now a one-codepoint utf8 string; let utf8str do the search
  return utf8str(src, encoded);
}
311 
// Compare the utf8 strings src1 and src2 byte by byte, treating every byte as
// an unsigned value. Returns -1 if the first differing byte is smaller in
// src1, 1 if it is larger, and 0 if both strings are identical.
int utf8cmp(const void *src1, const void *src2) {
  const unsigned char *a = (const unsigned char *)src1;
  const unsigned char *b = (const unsigned char *)src2;

  for (;; a++, b++) {
    if (*a != *b) {
      return (*a < *b) ? -1 : 1;
    }

    // the bytes are equal, so a terminator here ends both strings at once
    if ('\0' == *a) {
      return 0;
    }
  }
}
330 
331 int utf8coll(const void *src1, const void *src2);
332 
// Copy the utf8 string src, including its null terminating byte, into dst and
// return dst. No bounds checking is performed on dst.
void *utf8cpy(void *utf8_restrict dst, const void *utf8_restrict src) {
  const char *from = (const char *)src;
  char *to = (char *)dst;
  size_t index = 0;

  // replace whatever dst held with the bytes of src
  for (index = 0; '\0' != from[index]; index++) {
    to[index] = from[index];
  }

  // finish with the null terminating byte
  to[index] = '\0';

  return dst;
}
348 
// Count the utf8 codepoints at the start of src that do not appear in the
// utf8 string reject. Counting stops at the first codepoint of src that is
// also a codepoint of reject, or at the end of src.
size_t utf8cspn(const void *src, const void *reject) {
  const char *s = (const char *)src;
  size_t chars = 0;

  while ('\0' != *s) {
    const char *r = (const char *)reject;
    // number of bytes of the current reject codepoint matched against s
    size_t offset = 0;

    while ('\0' != *r) {
      // *r starts a new utf8 codepoint (it is not 0b10xxxxxx) and the
      // previous one matched in full (0 < offset) - we found a match
      if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
        break;
      }

      if (*r == s[offset]) {
        // part of a utf8 codepoint matched, so move our checking
        // onwards to the next byte
        offset++;
        r++;
      } else {
        // r could be in the middle of an unmatching utf8 codepoint,
        // so we need to march it on to the next character beginning
        do {
          r++;
        } while (0x80 == (0xc0 & *r));

        // reset offset too as we found a mismatch
        offset = 0;
      }
    }

    // offset is non-zero both when we broke out above and when the match was
    // on the very last codepoint of reject (the loop ended on its terminator)
    if (0 < offset) {
      return chars;
    }

    // the current utf8 codepoint in src is not in reject, so step over it
    // (its starting byte and any 0b10xxxxxx bytes that follow)
    do {
      s++;
    } while (0x80 == (0xc0 & *s));
    chars++;
  }

  return chars;
}
394 
395 size_t utf8size(const void *str);
396 
utf8dup(const void * src)397 void *utf8dup(const void *src) {
398   const char *s = (const char *)src;
399   char *n = utf8_null;
400 
401   // figure out how many bytes (including the terminator) we need to copy first
402   size_t bytes = utf8size(src);
403 
404   n = (char *)malloc(bytes);
405 
406   if (utf8_null == n) {
407     // out of memory so we bail
408     return utf8_null;
409   } else {
410     bytes = 0;
411 
412     // copy src byte-by-byte into our new utf8 string
413     while ('\0' != s[bytes]) {
414       n[bytes] = s[bytes];
415       bytes++;
416     }
417 
418     // append null terminating byte
419     n[bytes] = '\0';
420     return n;
421   }
422 }
423 
424 void *utf8fry(const void *str);
425 
// Count the utf8 codepoints in the utf8 string str, excluding the null
// terminating byte.
//
// The length of each codepoint is taken from its first byte. A codepoint that
// is cut short by the null terminating byte is still counted once, and the
// scan stops at that terminator rather than stepping over it.
size_t utf8len(const void *str) {
  const unsigned char *s = (const unsigned char *)str;
  size_t length = 0;

  while ('\0' != *s) {
    // number of bytes expected to follow the first byte of this codepoint
    size_t trailing = 0;

    if (0xf0 == (0xf8 & *s)) {
      // 4-byte utf8 code point (began with 0b11110xxx)
      trailing = 3;
    } else if (0xe0 == (0xf0 & *s)) {
      // 3-byte utf8 code point (began with 0b1110xxxx)
      trailing = 2;
    } else if (0xc0 == (0xe0 & *s)) {
      // 2-byte utf8 code point (began with 0b110xxxxx)
      trailing = 1;
    }

    // step over the first byte, then over the bytes that follow it; never
    // walk past the terminator if the string ends part-way through
    s++;
    while ((0 < trailing) && ('\0' != *s)) {
      s++;
      trailing--;
    }

    // no matter the bytes we marched s forward by, it was
    // only 1 utf8 codepoint
    length++;
  }

  return length;
}
452 
utf8ncasecmp(const void * src1,const void * src2,size_t n)453 int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
454   utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
455 
456   do {
457     const unsigned char *const s1 = (const unsigned char *)src1;
458     const unsigned char *const s2 = (const unsigned char *)src2;
459 
460     // first check that we have enough bytes left in n to contain an entire
461     // codepoint
462     if (0 == n) {
463       return 0;
464     }
465 
466     if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
467       const utf8_int32_t c1 = (0xe0 & *s1);
468       const utf8_int32_t c2 = (0xe0 & *s2);
469 
470       if (c1 < c2) {
471         return -1;
472       } else if (c1 > c2) {
473         return 1;
474       } else {
475         return 0;
476       }
477     }
478 
479     if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
480       const utf8_int32_t c1 = (0xf0 & *s1);
481       const utf8_int32_t c2 = (0xf0 & *s2);
482 
483       if (c1 < c2) {
484         return -1;
485       } else if (c1 > c2) {
486         return 1;
487       } else {
488         return 0;
489       }
490     }
491 
492     if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
493       const utf8_int32_t c1 = (0xf8 & *s1);
494       const utf8_int32_t c2 = (0xf8 & *s2);
495 
496       if (c1 < c2) {
497         return -1;
498       } else if (c1 > c2) {
499         return 1;
500       } else {
501         return 0;
502       }
503     }
504 
505     src1 = utf8codepoint(src1, &src1_cp);
506     src2 = utf8codepoint(src2, &src2_cp);
507     n -= utf8codepointsize(src1_cp);
508 
509     // Take a copy of src1 & src2
510     src1_orig_cp = src1_cp;
511     src2_orig_cp = src2_cp;
512 
513     // Lower srcs if required
514     src1_cp = utf8lwrcodepoint(src1_cp);
515     src2_cp = utf8lwrcodepoint(src2_cp);
516 
517     // Check if the lowered codepoints match
518     if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
519       return 0;
520     } else if (src1_cp == src2_cp) {
521       continue;
522     }
523 
524     // If they don't match, then we return which of the original's are less
525     if (src1_orig_cp < src2_orig_cp) {
526       return -1;
527     } else if (src1_orig_cp > src2_orig_cp) {
528       return 1;
529     }
530   } while (0 < n);
531 
532   // both utf8 strings matched
533   return 0;
534 }
535 
// Append at most n bytes of the utf8 string src to the end of the utf8
// string dst, then write a null terminating byte, and return dst.
//
// Up to n+1 bytes are written after the old end of dst. Nothing but the
// terminator is written when n is 0 or src is empty. The result can be an
// invalid utf8 string if n falls partway through a utf8 codepoint.
void *utf8ncat(void *utf8_restrict dst, const void *utf8_restrict src,
               size_t n) {
  char *d = (char *)dst;
  const char *s = (const char *)src;

  // find the null terminating byte in dst
  while ('\0' != *d) {
    d++;
  }

  // overwriting the null terminating byte in dst, append src byte-by-byte.
  // Both conditions are tested before each copy so that an empty src or a
  // zero n copies nothing and src's terminator is never stepped over
  while (('\0' != *s) && (0 != n)) {
    *d++ = *s++;
    n--;
  }

  // write out a new null terminating byte into dst
  *d = '\0';

  return dst;
}
557 
// Compare at most n bytes of the utf8 strings src1 and src2, treating every
// byte as an unsigned value. Returns -1 if the first differing byte is
// smaller in src1, 1 if it is larger, and 0 if no difference was found
// before n bytes were compared or both strings ended.
int utf8ncmp(const void *src1, const void *src2, size_t n) {
  const unsigned char *a = (const unsigned char *)src1;
  const unsigned char *b = (const unsigned char *)src2;

  for (; 0 != n; n--, a++, b++) {
    if (*a != *b) {
      return (*a < *b) ? -1 : 1;
    }

    // the bytes are equal, so a terminator here ends both strings at once
    if ('\0' == *a) {
      break;
    }
  }

  // no difference within the compared bytes
  return 0;
}
576 
// Copy at most n bytes of the utf8 string src into dst and return dst.
//
// Exactly n bytes of dst are written: the bytes of src up to (not including)
// its null terminating byte, followed by null bytes for whatever is left of
// n. If src holds n or more bytes before its terminator, dst is not null
// terminated. Nothing is written when n is 0. The result can be an invalid
// utf8 string if n falls partway through a utf8 codepoint.
void *utf8ncpy(void *utf8_restrict dst, const void *utf8_restrict src,
               size_t n) {
  char *d = (char *)dst;
  const char *s = (const char *)src;
  size_t index = 0;

  // overwriting anything previously in dst, write byte-by-byte from src.
  // The bound is tested first so that no more than n bytes of src are read
  while ((index < n) && ('\0' != s[index])) {
    d[index] = s[index];
    index++;
  }

  // fill the rest of the n bytes with null terminating bytes
  while (index < n) {
    d[index] = '\0';
    index++;
  }

  return dst;
}
596 
utf8ndup(const void * src,size_t n)597 void *utf8ndup(const void *src, size_t n) {
598   const char *s = (const char *)src;
599   char *c = utf8_null;
600   size_t bytes = 0;
601 
602   // Find the end of the string or stop when n is reached
603   while ('\0' != s[bytes] && bytes < n) {
604     bytes++;
605   }
606 
607   // In case bytes is actually less than n, we need to set it
608   // to be used later in the copy byte by byte.
609   n = bytes;
610 
611   c = (char *)malloc(bytes + 1);
612   if (utf8_null == c) {
613     // out of memory so we bail
614     return utf8_null;
615   }
616 
617   bytes = 0;
618 
619   // copy src byte-by-byte into our new utf8 string
620   while ('\0' != s[bytes] && bytes < n) {
621     c[bytes] = s[bytes];
622     bytes++;
623   }
624 
625   // append null terminating byte
626   c[bytes] = '\0';
627   return c;
628 }
629 
utf8rchr(const void * src,int chr)630 void *utf8rchr(const void *src, int chr) {
631   const char *s = (const char *)src;
632   const char *match = utf8_null;
633   char c[5] = {'\0', '\0', '\0', '\0', '\0'};
634 
635   if (0 == chr) {
636     // being asked to return position of null terminating byte, so
637     // just run s to the end, and return!
638     while ('\0' != *s) {
639       s++;
640     }
641     return (void *)s;
642   } else if (0 == ((int)0xffffff80 & chr)) {
643     // 1-byte/7-bit ascii
644     // (0b0xxxxxxx)
645     c[0] = (char)chr;
646   } else if (0 == ((int)0xfffff800 & chr)) {
647     // 2-byte/11-bit utf8 code point
648     // (0b110xxxxx 0b10xxxxxx)
649     c[0] = 0xc0 | (char)(chr >> 6);
650     c[1] = 0x80 | (char)(chr & 0x3f);
651   } else if (0 == ((int)0xffff0000 & chr)) {
652     // 3-byte/16-bit utf8 code point
653     // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
654     c[0] = 0xe0 | (char)(chr >> 12);
655     c[1] = 0x80 | (char)((chr >> 6) & 0x3f);
656     c[2] = 0x80 | (char)(chr & 0x3f);
657   } else { // if (0 == ((int)0xffe00000 & chr)) {
658     // 4-byte/21-bit utf8 code point
659     // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
660     c[0] = 0xf0 | (char)(chr >> 18);
661     c[1] = 0x80 | (char)((chr >> 12) & 0x3f);
662     c[2] = 0x80 | (char)((chr >> 6) & 0x3f);
663     c[3] = 0x80 | (char)(chr & 0x3f);
664   }
665 
666   // we've created a 2 utf8 codepoint string in c that is
667   // the utf8 character asked for by chr, and a null
668   // terminating byte
669 
670   while ('\0' != *s) {
671     size_t offset = 0;
672 
673     while (s[offset] == c[offset]) {
674       offset++;
675     }
676 
677     if ('\0' == c[offset]) {
678       // we found a matching utf8 code point
679       match = s;
680       s += offset;
681     } else {
682       s += offset;
683 
684       // need to march s along to next utf8 codepoint start
685       // (the next byte that doesn't match 0b10xxxxxx)
686       if ('\0' != *s) {
687         do {
688           s++;
689         } while (0x80 == (0xc0 & *s));
690       }
691     }
692   }
693 
694   // return the last match we found (or 0 if no match was found)
695   return (void *)match;
696 }
697 
utf8pbrk(const void * str,const void * accept)698 void *utf8pbrk(const void *str, const void *accept) {
699   const char *s = (const char *)str;
700 
701   while ('\0' != *s) {
702     const char *a = (const char *)accept;
703     size_t offset = 0;
704 
705     while ('\0' != *a) {
706       // checking that if *a is the start of a utf8 codepoint
707       // (it is not 0b10xxxxxx) and we have successfully matched
708       // a previous character (0 < offset) - we found a match
709       if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
710         return (void *)s;
711       } else {
712         if (*a == s[offset]) {
713           // part of a utf8 codepoint matched, so move our checking
714           // onwards to the next byte
715           offset++;
716           a++;
717         } else {
718           // r could be in the middle of an unmatching utf8 code point,
719           // so we need to march it on to the next character beginning,
720 
721           do {
722             a++;
723           } while (0x80 == (0xc0 & *a));
724 
725           // reset offset too as we found a mismatch
726           offset = 0;
727         }
728       }
729     }
730 
731     // we found a match on the last utf8 codepoint
732     if (0 < offset) {
733       return (void *)s;
734     }
735 
736     // the current utf8 codepoint in src did not match accept, but src
737     // could have been partway through a utf8 codepoint, so we need to
738     // march it onto the next utf8 codepoint starting byte
739     do {
740       s++;
741     } while ((0x80 == (0xc0 & *s)));
742   }
743 
744   return utf8_null;
745 }
746 
// Number of bytes occupied by the utf8 string str, with the null terminating
// byte included in the count.
size_t utf8size(const void *str) {
  const char *const begin = (const char *)str;
  const char *end = begin;

  // run to the null terminating byte
  while ('\0' != *end) {
    end++;
  }

  // the extra 1 accounts for the terminator itself
  return (size_t)(end - begin) + 1;
}
758 
// Count the utf8 codepoints at the start of src that all appear in the utf8
// string accept. Counting stops at the first codepoint of src that is not a
// codepoint of accept, or at the end of src.
size_t utf8spn(const void *src, const void *accept) {
  const char *s = (const char *)src;
  size_t chars = 0;

  while ('\0' != *s) {
    const char *a = (const char *)accept;
    // number of bytes of the current accept codepoint matched against s
    size_t offset = 0;

    while ('\0' != *a) {
      // *a starts a new utf8 codepoint (it is not 0b10xxxxxx) and the
      // previous one matched in full (0 < offset) - we found a match
      if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
        break;
      }

      if (*a == s[offset]) {
        // part of a utf8 codepoint matched, so move our checking
        // onwards to the next byte
        offset++;
        a++;
      } else {
        // a could be in the middle of an unmatching utf8 codepoint,
        // so we need to march it on to the next character beginning
        do {
          a++;
        } while (0x80 == (0xc0 & *a));

        // reset offset too as we found a mismatch
        offset = 0;
      }
    }

    // nothing matched: the current codepoint of src is not in accept, so
    // the span ends here
    if (0 == offset) {
      return chars;
    }

    // offset is non-zero both after the break above and when the match was
    // on the very last codepoint of accept; count it and step over it
    chars++;
    s += offset;
  }

  return chars;
}
804 
utf8str(const void * haystack,const void * needle)805 void *utf8str(const void *haystack, const void *needle) {
806   const char *h = (const char *)haystack;
807 
808   // if needle has no utf8 codepoints before the null terminating
809   // byte then return haystack
810   if ('\0' == *((const char *)needle)) {
811     return (void *)haystack;
812   }
813 
814   while ('\0' != *h) {
815     const char *maybeMatch = h;
816     const char *n = (const char *)needle;
817 
818     while (*h == *n && (*h != '\0' && *n != '\0')) {
819       n++;
820       h++;
821     }
822 
823     if ('\0' == *n) {
824       // we found the whole utf8 string for needle in haystack at
825       // maybeMatch, so return it
826       return (void *)maybeMatch;
827     } else {
828       // h could be in the middle of an unmatching utf8 codepoint,
829       // so we need to march it on to the next character beginning,
830       if ('\0' != *h) {
831         do {
832           h++;
833         } while (0x80 == (0xc0 & *h));
834       }
835     }
836   }
837 
838   // no match
839   return utf8_null;
840 }
841 
utf8casestr(const void * haystack,const void * needle)842 void *utf8casestr(const void *haystack, const void *needle) {
843   const void *h = haystack;
844 
845   // if needle has no utf8 codepoints before the null terminating
846   // byte then return haystack
847   if ('\0' == *((const char *)needle)) {
848     return (void *)haystack;
849   }
850 
851   for (;;) {
852     const void *maybeMatch = h;
853     const void *n = needle;
854     utf8_int32_t h_cp, n_cp;
855 
856     h = utf8codepoint(h, &h_cp);
857     n = utf8codepoint(n, &n_cp);
858 
859     while ((0 != h_cp) && (0 != n_cp)) {
860       h_cp = utf8lwrcodepoint(h_cp);
861       n_cp = utf8lwrcodepoint(n_cp);
862 
863       // if we find a mismatch, bail out!
864       if (h_cp != n_cp) {
865         break;
866       }
867 
868       h = utf8codepoint(h, &h_cp);
869       n = utf8codepoint(n, &n_cp);
870     }
871 
872     if (0 == n_cp) {
873       // we found the whole utf8 string for needle in haystack at
874       // maybeMatch, so return it
875       return (void *)maybeMatch;
876     }
877 
878     if (0 == h_cp) {
879       // no match
880       return utf8_null;
881     }
882   }
883 }
884 
utf8valid(const void * str)885 void *utf8valid(const void *str) {
886   const char *s = (const char *)str;
887 
888   while ('\0' != *s) {
889     if (0xf0 == (0xf8 & *s)) {
890       // ensure each of the 3 following bytes in this 4-byte
891       // utf8 codepoint began with 0b10xxxxxx
892       if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
893           (0x80 != (0xc0 & s[3]))) {
894         return (void *)s;
895       }
896 
897       // ensure that our utf8 codepoint ended after 4 bytes
898       if (0x80 == (0xc0 & s[4])) {
899         return (void *)s;
900       }
901 
902       // ensure that the top 5 bits of this 4-byte utf8
903       // codepoint were not 0, as then we could have used
904       // one of the smaller encodings
905       if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
906         return (void *)s;
907       }
908 
909       // 4-byte utf8 code point (began with 0b11110xxx)
910       s += 4;
911     } else if (0xe0 == (0xf0 & *s)) {
912       // ensure each of the 2 following bytes in this 3-byte
913       // utf8 codepoint began with 0b10xxxxxx
914       if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
915         return (void *)s;
916       }
917 
918       // ensure that our utf8 codepoint ended after 3 bytes
919       if (0x80 == (0xc0 & s[3])) {
920         return (void *)s;
921       }
922 
923       // ensure that the top 5 bits of this 3-byte utf8
924       // codepoint were not 0, as then we could have used
925       // one of the smaller encodings
926       if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
927         return (void *)s;
928       }
929 
930       // 3-byte utf8 code point (began with 0b1110xxxx)
931       s += 3;
932     } else if (0xc0 == (0xe0 & *s)) {
933       // ensure the 1 following byte in this 2-byte
934       // utf8 codepoint began with 0b10xxxxxx
935       if (0x80 != (0xc0 & s[1])) {
936         return (void *)s;
937       }
938 
939       // ensure that our utf8 codepoint ended after 2 bytes
940       if (0x80 == (0xc0 & s[2])) {
941         return (void *)s;
942       }
943 
944       // ensure that the top 4 bits of this 2-byte utf8
945       // codepoint were not 0, as then we could have used
946       // one of the smaller encodings
947       if (0 == (0x1e & s[0])) {
948         return (void *)s;
949       }
950 
951       // 2-byte utf8 code point (began with 0b110xxxxx)
952       s += 2;
953     } else if (0x00 == (0x80 & *s)) {
954       // 1-byte ascii (began with 0b0xxxxxxx)
955       s += 1;
956     } else {
957       // we have an invalid 0b1xxxxxxx utf8 code point entry
958       return (void *)s;
959     }
960   }
961 
962   return utf8_null;
963 }
964 
// Decode the single utf8 codepoint starting at str, store it through
// out_codepoint and return a pointer to the byte that follows it. The
// sequence length is taken from the lead byte alone; the continuation
// bytes are not checked for the 0b10xxxxxx marker.
void *utf8codepoint(const void *utf8_restrict str,
                    utf8_int32_t *utf8_restrict out_codepoint) {
  const char *bytes = (const char *)str;
  const int lead = bytes[0];
  size_t length;

  if (0xf0 == (lead & 0xf8)) {
    // 0b11110xxx lead: 3 payload bits, then 6 bits from each of 3 more bytes
    *out_codepoint = ((lead & 0x07) << 18) | ((bytes[1] & 0x3f) << 12) |
                     ((bytes[2] & 0x3f) << 6) | (bytes[3] & 0x3f);
    length = 4;
  } else if (0xe0 == (lead & 0xf0)) {
    // 0b1110xxxx lead: 4 payload bits, then 6 bits from each of 2 more bytes
    *out_codepoint =
        ((lead & 0x0f) << 12) | ((bytes[1] & 0x3f) << 6) | (bytes[2] & 0x3f);
    length = 3;
  } else if (0xc0 == (lead & 0xe0)) {
    // 0b110xxxxx lead: 5 payload bits, then 6 bits from 1 more byte
    *out_codepoint = ((lead & 0x1f) << 6) | (bytes[1] & 0x3f);
    length = 2;
  } else {
    // every other lead byte is passed through unchanged as a 1 byte codepoint
    *out_codepoint = lead;
    length = 1;
  }

  return (void *)(bytes + length);
}
991 
// Number of bytes used to encode chr as utf8: 1 when it fits in 7 bits,
// 2 when it fits in 11 bits, 3 when it fits in 16 bits, and 4 for every
// other value (no upper bound is checked).
size_t utf8codepointsize(utf8_int32_t chr) {
  size_t size = 4;

  if (0 == (chr & (utf8_int32_t)0xffffff80)) {
    size = 1;
  } else if (0 == (chr & (utf8_int32_t)0xfffff800)) {
    size = 2;
  } else if (0 == (chr & (utf8_int32_t)0xffff0000)) {
    size = 3;
  }

  return size;
}
1003 
// Encode chr as utf8 into the buffer at str, which has room for n bytes.
// Returns a pointer just past the bytes written, or utf8_null (with nothing
// written) when the encoding needs more than n bytes.
void *utf8catcodepoint(void *utf8_restrict str, utf8_int32_t chr, size_t n) {
  char *out = (char *)str;
  size_t needed;

  // pick the shortest encoding whose payload bits can hold chr
  if (0 == (chr & (utf8_int32_t)0xffffff80)) {
    needed = 1; // 7 bits
  } else if (0 == (chr & (utf8_int32_t)0xfffff800)) {
    needed = 2; // 11 bits
  } else if (0 == (chr & (utf8_int32_t)0xffff0000)) {
    needed = 3; // 16 bits
  } else {
    needed = 4; // 21 bits; larger values are not rejected
  }

  if (n < needed) {
    return utf8_null;
  }

  switch (needed) {
  case 1:
    // 0b0xxxxxxx
    out[0] = (char)chr;
    break;
  case 2:
    // 0b110xxxxx 0b10xxxxxx
    out[0] = 0xc0 | (char)(chr >> 6);
    out[1] = 0x80 | (char)(chr & 0x3f);
    break;
  case 3:
    // 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
    out[0] = 0xe0 | (char)(chr >> 12);
    out[1] = 0x80 | (char)((chr >> 6) & 0x3f);
    out[2] = 0x80 | (char)(chr & 0x3f);
    break;
  default:
    // 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
    out[0] = 0xf0 | (char)(chr >> 18);
    out[1] = 0x80 | (char)((chr >> 12) & 0x3f);
    out[2] = 0x80 | (char)((chr >> 6) & 0x3f);
    out[3] = 0x80 | (char)(chr & 0x3f);
    break;
  }

  return out + needed;
}
1049 
// Nonzero when utf8uprcodepoint maps chr to a different codepoint, i.e.
// chr is treated as a lowercase letter; zero otherwise.
int utf8islower(utf8_int32_t chr) {
  const utf8_int32_t upper = utf8uprcodepoint(chr);

  return upper != chr;
}
1051 
// Nonzero when utf8lwrcodepoint maps chr to a different codepoint, i.e.
// chr is treated as an uppercase letter; zero otherwise.
int utf8isupper(utf8_int32_t chr) {
  const utf8_int32_t lower = utf8lwrcodepoint(chr);

  return lower != chr;
}
1053 
// Lowercase the null-terminated utf8 string at str in place, one codepoint
// at a time, using utf8lwrcodepoint for the mapping.
void utf8lwr(void *utf8_restrict str) {
  utf8_int32_t current;
  void *here = str;
  void *next = utf8codepoint(here, &current);

  // a decoded codepoint of 0 marks the end of the string
  for (; 0 != current; here = next, next = utf8codepoint(here, &current)) {
    const utf8_int32_t lowered = utf8lwrcodepoint(current);

    // Only write when the mapping changes the codepoint. The scan resumes
    // at the end of the original encoding, so the replacement is expected
    // to encode to the same number of bytes.
    // NOTE(review): confirm this holds for every pair in utf8lwrcodepoint.
    if (lowered != current) {
      utf8catcodepoint(here, lowered, utf8codepointsize(lowered));
    }
  }
}
1073 
// Uppercase the null-terminated utf8 string at str in place, one codepoint
// at a time, using utf8uprcodepoint for the mapping.
void utf8upr(void *utf8_restrict str) {
  utf8_int32_t current;
  void *here = str;
  void *next = utf8codepoint(here, &current);

  // a decoded codepoint of 0 marks the end of the string
  for (; 0 != current; here = next, next = utf8codepoint(here, &current)) {
    const utf8_int32_t raised = utf8uprcodepoint(current);

    // Only write when the mapping changes the codepoint. The scan resumes
    // at the end of the original encoding, so the replacement is expected
    // to encode to the same number of bytes.
    // NOTE(review): confirm this holds for every pair in utf8uprcodepoint.
    if (raised != current) {
      utf8catcodepoint(here, raised, utf8codepointsize(raised));
    }
  }
}
1093 
// Return the lowercase counterpart of cp for the ranges and individual
// codepoints listed below; every other value is returned unchanged.
utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
  // runs where the lowercase form sits exactly 32 above the uppercase form
  if ((cp >= 0x0041 && cp <= 0x005a) || (cp >= 0x00c0 && cp <= 0x00d6) ||
      (cp >= 0x00d8 && cp <= 0x00de) || (cp >= 0x0391 && cp <= 0x03a1) ||
      (cp >= 0x03a3 && cp <= 0x03ab)) {
    return cp + 32;
  }

  // runs of adjacent pairs stored as (even uppercase, odd lowercase):
  // setting the low bit lowers the uppercase form and leaves the lowercase
  // form untouched
  if ((cp >= 0x0100 && cp <= 0x012f) || (cp >= 0x0132 && cp <= 0x0137) ||
      (cp >= 0x014a && cp <= 0x0177) || (cp >= 0x0182 && cp <= 0x0185) ||
      (cp >= 0x01a0 && cp <= 0x01a5) || (cp >= 0x01de && cp <= 0x01ef) ||
      (cp >= 0x01f8 && cp <= 0x021f) || (cp >= 0x0222 && cp <= 0x0233) ||
      (cp >= 0x0246 && cp <= 0x024f) || (cp >= 0x03d8 && cp <= 0x03ef)) {
    return cp | 0x1;
  }

  // runs of adjacent pairs stored as (odd uppercase, even lowercase):
  // round up to the next even value
  if ((cp >= 0x0139 && cp <= 0x0148) || (cp >= 0x0179 && cp <= 0x017e) ||
      (cp >= 0x01af && cp <= 0x01b0) || (cp >= 0x01b3 && cp <= 0x01b6) ||
      (cp >= 0x01cd && cp <= 0x01dc)) {
    return (cp + 1) & ~0x1;
  }

  // irregular one-off mappings (0x01af is already covered by the range
  // test above, so it needs no entry here)
  switch (cp) {
  case 0x0178: return 0x00ff;
  case 0x0243: return 0x0180;
  case 0x018e: return 0x01dd;
  case 0x023d: return 0x019a;
  case 0x0220: return 0x019e;
  case 0x01b7: return 0x0292;
  case 0x01c4: return 0x01c6;
  case 0x01c7: return 0x01c9;
  case 0x01ca: return 0x01cc;
  case 0x01f1: return 0x01f3;
  case 0x01f7: return 0x01bf;
  case 0x0187: return 0x0188;
  case 0x018b: return 0x018c;
  case 0x0191: return 0x0192;
  case 0x0198: return 0x0199;
  case 0x01a7: return 0x01a8;
  case 0x01ac: return 0x01ad;
  case 0x01b8: return 0x01b9;
  case 0x01bc: return 0x01bd;
  case 0x01f4: return 0x01f5;
  case 0x023b: return 0x023c;
  case 0x0241: return 0x0242;
  case 0x03fd: return 0x037b;
  case 0x03fe: return 0x037c;
  case 0x03ff: return 0x037d;
  case 0x037f: return 0x03f3;
  case 0x0386: return 0x03ac;
  case 0x0388: return 0x03ad;
  case 0x0389: return 0x03ae;
  case 0x038a: return 0x03af;
  case 0x038c: return 0x03cc;
  case 0x038e: return 0x03cd;
  case 0x038f: return 0x03ce;
  case 0x0370: return 0x0371;
  case 0x0372: return 0x0373;
  case 0x0376: return 0x0377;
  case 0x03f4: return 0x03d1;
  case 0x03cf: return 0x03d7;
  case 0x03f9: return 0x03f2;
  case 0x03f7: return 0x03f8;
  case 0x03fa: return 0x03fb;
  default: break;
  }

  return cp;
}
1169 
// Return the uppercase counterpart of cp for the ranges and individual
// codepoints listed below; every other value is returned unchanged.
utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
  // runs where the uppercase form sits exactly 32 below the lowercase form
  if ((cp >= 0x0061 && cp <= 0x007a) || (cp >= 0x00e0 && cp <= 0x00f6) ||
      (cp >= 0x00f8 && cp <= 0x00fe) || (cp >= 0x03b1 && cp <= 0x03c1) ||
      (cp >= 0x03c3 && cp <= 0x03cb)) {
    return cp - 32;
  }

  // runs of adjacent pairs stored as (even uppercase, odd lowercase):
  // clearing the low bit raises the lowercase form and leaves the uppercase
  // form untouched
  if ((cp >= 0x0100 && cp <= 0x012f) || (cp >= 0x0132 && cp <= 0x0137) ||
      (cp >= 0x014a && cp <= 0x0177) || (cp >= 0x0182 && cp <= 0x0185) ||
      (cp >= 0x01a0 && cp <= 0x01a5) || (cp >= 0x01de && cp <= 0x01ef) ||
      (cp >= 0x01f8 && cp <= 0x021f) || (cp >= 0x0222 && cp <= 0x0233) ||
      (cp >= 0x0246 && cp <= 0x024f) || (cp >= 0x03d8 && cp <= 0x03ef)) {
    return cp & ~0x1;
  }

  // runs of adjacent pairs stored as (odd uppercase, even lowercase):
  // round down to the previous odd value
  if ((cp >= 0x0139 && cp <= 0x0148) || (cp >= 0x0179 && cp <= 0x017e) ||
      (cp >= 0x01af && cp <= 0x01b0) || (cp >= 0x01b3 && cp <= 0x01b6) ||
      (cp >= 0x01cd && cp <= 0x01dc)) {
    return (cp - 1) | 0x1;
  }

  // irregular one-off mappings (0x01b0 is already covered by the range
  // test above, so it needs no entry here)
  switch (cp) {
  case 0x00ff: return 0x0178;
  case 0x0180: return 0x0243;
  case 0x01dd: return 0x018e;
  case 0x019a: return 0x023d;
  case 0x019e: return 0x0220;
  case 0x0292: return 0x01b7;
  case 0x01c6: return 0x01c4;
  case 0x01c9: return 0x01c7;
  case 0x01cc: return 0x01ca;
  case 0x01f3: return 0x01f1;
  case 0x01bf: return 0x01f7;
  case 0x0188: return 0x0187;
  case 0x018c: return 0x018b;
  case 0x0192: return 0x0191;
  case 0x0199: return 0x0198;
  case 0x01a8: return 0x01a7;
  case 0x01ad: return 0x01ac;
  case 0x01b9: return 0x01b8;
  case 0x01bd: return 0x01bc;
  case 0x01f5: return 0x01f4;
  case 0x023c: return 0x023b;
  case 0x0242: return 0x0241;
  case 0x037b: return 0x03fd;
  case 0x037c: return 0x03fe;
  case 0x037d: return 0x03ff;
  case 0x03f3: return 0x037f;
  case 0x03ac: return 0x0386;
  case 0x03ad: return 0x0388;
  case 0x03ae: return 0x0389;
  case 0x03af: return 0x038a;
  case 0x03cc: return 0x038c;
  case 0x03cd: return 0x038e;
  case 0x03ce: return 0x038f;
  case 0x0371: return 0x0370;
  case 0x0373: return 0x0372;
  case 0x0377: return 0x0376;
  case 0x03d1: return 0x03f4;
  case 0x03d7: return 0x03cf;
  case 0x03f2: return 0x03f9;
  case 0x03f8: return 0x03f7;
  case 0x03fb: return 0x03fa;
  default: break;
  }

  return cp;
}
1245 
1246 #undef utf8_restrict
1247 #undef utf8_null
1248 
1249 #ifdef __cplusplus
1250 } // extern "C"
1251 #endif
1252 
1253 #if defined(__clang__)
1254 #pragma clang diagnostic pop
1255 #endif
1256 
1257 #endif // SHEREDOM_UTF8_H_INCLUDED
1258