1 /*
2  * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
3  * <https://github.com/chansen/c-utf8-valid>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  *    this list of conditions and the following disclaimer in the documentation
13  *    and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 #ifndef UTF8_VALID_H
27 #define UTF8_VALID_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
30 
31 #ifdef __cplusplus
32 extern "C" {
33 #endif
34 
/*
 * Returns how many bytes at the start of src belong together as one unit
 * when scanning possibly ill-formed UTF-8: the length of the longest prefix
 * of src that is a well-formed UTF-8 sequence or a valid beginning of one.
 *
 *   src  bytes to inspect (need not be NUL-terminated)
 *   len  number of readable bytes at src
 *
 * When len < 2 the function returns len without looking at the bytes.
 * Otherwise the result is in 1..4. A result of 1 covers ASCII, a stray
 * continuation byte, a lead byte not followed by a continuation byte, and
 * lead/second-byte pairs that can never start a well-formed sequence
 * (overlong forms, surrogates, values above U+10FFFF).
 *
 * NOTE(review): this is a non-static definition inside a header; including
 * the header from more than one translation unit would define the symbol
 * twice. Consider `static inline` if that is not intended.
 */
size_t
utf8_maximal_subpart(const char *src, size_t len) {
  const unsigned char *cur = (const unsigned char *)src;
  uint32_t v;

  if (len < 2)
    return len;

  /* Pack lead byte (high 8 bits) and second byte (low 8 bits) so that both
   * can be classified with a single mask. */
  v = ((uint32_t)cur[0] << 8) | cur[1];

  /* Require lead = 11xxxxxx and second = 10xxxxxx. */
  if ((v & 0xC0C0) != 0xC080)
    return 1;

  if ((v & 0x2000) == 0) {
    /* Lead 110xxxxx (two-byte form). C0 and C1 only encode overlongs. */
    if ((v & 0x1E00) == 0)
      return 1;
    return 2;
  }
  else if ((v & 0x1000) == 0) {
    /* Lead 1110xxxx (three-byte form).
     * E0 80..9F is overlong; ED A0..BF encodes a surrogate. */
    if ((v & 0x0F20) == 0 ||
        (v & 0x0F20) == 0x0D20)
      return 1;
    if (len < 3 || (cur[2] & 0xC0) != 0x80)
      return 2;
    return 3;
  }
  else {
    /* Lead 1111xxxx (four-byte form).
     * F0 80..8F is overlong; anything past F4 8F exceeds U+10FFFF or uses
     * a lead byte (F5..FF) that is never valid. */
    if ((v & 0x0730) == 0 ||
        (v > 0xF48F))
      return 1;
    if (len < 3 || (cur[2] & 0xC0) != 0x80)
      return 2;
    if (len < 4 || (cur[3] & 0xC0) != 0x80)
      return 3;
    return 4;
  }
}
71 
/*
 * Checks whether the len bytes at src are well-formed UTF-8.
 *
 *   src     bytes to validate (need not be NUL-terminated); it is not
 *           accessed when len is 0
 *   len     number of bytes at src
 *   cursor  optional (may be NULL); receives the byte offset at which
 *           scanning stopped: len on success, otherwise the offset of the
 *           first byte of the first ill-formed sequence
 *
 * Returns true when every byte was consumed as part of a well-formed
 * sequence (an empty input is valid), false otherwise. Overlong encodings,
 * surrogates (ED A0..BF ..) and values above U+10FFFF are rejected.
 */
bool
utf8_check(const char *src, size_t len, size_t *cursor) {
  const unsigned char *bytes = (const unsigned char *)src;
  unsigned char tail[4];
  size_t pos = 0;

  while (pos < len) {
    const size_t remaining = len - pos;
    const unsigned char *p = bytes + pos;
    uint32_t v;

    /* The decoder below may look at up to four bytes. Near the end of the
     * input, work on a zero-padded copy instead: a zero byte is never a
     * continuation byte, so a truncated sequence fails the checks without
     * reading past the buffer. Comparing lengths (rather than pointers
     * offset from the end) keeps all arithmetic inside the buffer. */
    if (remaining < 4) {
      memset(tail, 0, sizeof tail);
      memcpy(tail, p, remaining);
      p = tail;
    }

    /* One byte: 0xxxxxxx. */
    v = p[0];
    if ((v & 0x80) == 0) {
      pos += 1;
      continue;
    }

    /* Two bytes: 110xxxxx 10xxxxxx, excluding overlong leads C0/C1. */
    v = (v << 8) | p[1];
    if ((v & 0xE0C0) == 0xC080 &&
        (v & 0x1E00) != 0) {
      pos += 2;
      continue;
    }

    /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx, excluding overlong
     * E0 80..9F and surrogates ED A0..BF. */
    v = (v << 8) | p[2];
    if ((v & 0xF0C0C0) == 0xE08080 &&
        (v & 0x0F2000) != 0 &&
        (v & 0x0F2000) != 0x0D2000) {
      pos += 3;
      continue;
    }

    /* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, excluding overlong
     * F0 80..8F and anything from F4 90 upward (above U+10FFFF). */
    v = (v << 8) | p[3];
    if ((v & 0xF8C0C0C0) == 0xF0808080 &&
        (v & 0x07300000) != 0 &&
        (v < 0xF4908080)) {
      pos += 4;
      continue;
    }

    /* Ill-formed sequence starting at pos. */
    break;
  }

  if (cursor)
    *cursor = pos;

  return pos == len;
}
126 
/*
 * Convenience wrapper around utf8_check() for callers that only need a
 * yes/no answer: returns whatever utf8_check() returns for the len bytes
 * at src, without reporting the stop offset.
 */
bool
utf8_valid(const char *src, size_t len) {
  /* No cursor is requested, so utf8_check() skips writing the offset. */
  size_t *const no_cursor = NULL;

  return utf8_check(src, len, no_cursor);
}
131 
132 #ifdef __cplusplus
133 }
134 #endif
135 #endif
136 
137