1 /*
2 * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
3 * <https://github.com/chansen/c-utf8-valid>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26 #ifndef UTF8_VALID_H
27 #define UTF8_VALID_H
28 #include <stddef.h>
29 #include <string.h>
30
31 #ifdef __cplusplus
32 extern "C" {
33 #endif
34
/*
 * Measures the unit found at the start of src[0..len).
 *
 * Returns the number of leading bytes that belong together: the full length
 * (2..4) of a multi-byte sequence whose bytes all pass the checks below, or
 * the length of the longest prefix that could still start such a sequence.
 * The result is never larger than len, and is at least 1 whenever len > 0,
 * so a caller can always advance past the reported bytes.
 *
 *   src  bytes to inspect (not required to be NUL-terminated)
 *   len  number of readable bytes at src
 */
size_t
utf8_maximal_subpart(const char *src, size_t len) {
  const unsigned char *cur = (const unsigned char *)src;
  unsigned int v;

  /* Zero or one byte: nothing further to examine. */
  if (len < 2)
    return len;

  /*
   * Pack the first two bytes into a 16-bit value. The cast keeps the shift
   * unsigned; shifting the int-promoted byte would overflow a 16-bit int
   * for any lead byte >= 0x80.
   */
  v = ((unsigned int)cur[0] << 8) | cur[1];

  /* Needs a lead byte 11xxxxxx followed by a continuation byte 10xxxxxx. */
  if ((v & 0xC0C0) != 0xC080)
    return 1;

  if ((v & 0x2000) == 0) {
    /* 110xxxxx lead: two-byte form; C0 and C1 are overlong encodings. */
    if ((v & 0x1E00) == 0)
      return 1;
    return 2;
  }

  if ((v & 0x1000) == 0) {
    /*
     * 1110xxxx lead: three-byte form.
     * E0 followed by 80..9F is overlong, ED followed by A0..BF is a
     * surrogate.
     */
    if ((v & 0x0F20) == 0 ||
        (v & 0x0F20) == 0x0D20)
      return 1;
    if (len < 3 || (cur[2] & 0xC0) != 0x80)
      return 2;
    return 3;
  }

  /*
   * 1111xxxx lead: four-byte form.
   * F0 followed by 80..8F is overlong; anything above F4 8F is rejected
   * (that also covers the lead bytes F5..FF).
   */
  if ((v & 0x0730) == 0 ||
      (v > 0xF48F))
    return 1;
  if (len < 3 || (cur[2] & 0xC0) != 0x80)
    return 2;
  if (len < 4 || (cur[3] & 0xC0) != 0x80)
    return 3;
  return 4;
}
71
/*
 * Validates src[0..len) as UTF-8, scanning from the front.
 *
 * Accepts single bytes below 0x80 and two-, three- and four-byte sequences,
 * rejecting overlong forms, the surrogate range (ED A0..BF xx) and anything
 * above F4 8F BF BF.
 *
 *   src     bytes to validate (not required to be NUL-terminated)
 *   len     number of readable bytes at src
 *   cursor  optional; when non-NULL it receives the byte offset at which
 *           scanning stopped: len on success, otherwise the offset of the
 *           first byte of the sequence that was not accepted
 *
 * Returns true when every byte was accepted, false otherwise.
 */
bool
utf8_check(const char *src, size_t len, size_t *cursor) {
  const unsigned char *bytes = (const unsigned char *)src;
  unsigned char tail[4];
  size_t pos = 0;

  /*
   * Progress is tracked as an offset rather than by comparing against
   * "end - 3", so no pointer outside the buffer is ever formed, even for
   * inputs shorter than three bytes or for a NULL src with len == 0.
   */
  while (pos < len) {
    const unsigned char *p = bytes + pos;
    size_t rem = len - pos;
    unsigned long v; /* at least 32 bits: holds up to four packed bytes */

    if (rem < 4) {
      /*
       * Fewer than four bytes left: decode from a zero-padded copy so the
       * reads below never run past the input. A zero pad byte is not a
       * continuation byte, so a truncated sequence is rejected.
       */
      memset(tail, 0, sizeof tail);
      memcpy(tail, p, rem);
      p = tail;
    }

    /* One byte: 0xxxxxxx. */
    v = p[0];
    if ((v & 0x80) == 0) {
      pos += 1;
      continue;
    }

    /* Two bytes: 110xxxxx 10xxxxxx, excluding the overlong leads C0/C1. */
    v = (v << 8) | p[1];
    if ((v & 0xE0C0) == 0xC080 &&
        (v & 0x1E00) != 0) {
      pos += 2;
      continue;
    }

    /*
     * Three bytes: 1110xxxx 10xxxxxx 10xxxxxx, excluding overlong forms
     * (E0 80..9F) and surrogates (ED A0..BF).
     */
    v = (v << 8) | p[2];
    if ((v & 0xF0C0C0) == 0xE08080 &&
        (v & 0x0F2000) != 0 &&
        (v & 0x0F2000) != 0x0D2000) {
      pos += 3;
      continue;
    }

    /*
     * Four bytes: 11110xxx followed by three continuation bytes, excluding
     * overlong forms (F0 80..8F) and values from F4 90 80 80 upward.
     */
    v = (v << 8) | p[3];
    if ((v & 0xF8C0C0C0) == 0xF0808080 &&
        (v & 0x07300000) != 0 &&
        (v < 0xF4908080)) {
      pos += 4;
      continue;
    }

    /* No accepted form starts here. */
    break;
  }

  if (cursor)
    *cursor = pos;

  return pos == len;
}
126
/*
 * Convenience form of utf8_check() for callers that only need a yes/no
 * answer: forwards src and len and passes NULL for the cursor argument.
 *
 * Returns whatever utf8_check() returns for that call.
 */
bool
utf8_valid(const char *src, size_t len) {
  size_t *const no_cursor = NULL;

  return utf8_check(src, len, no_cursor);
}
131
132 #ifdef __cplusplus
133 }
134 #endif
135 #endif
136
137