1 /*
2 * Copyright 2017 WebAssembly Community Group participants
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/utf8.h"
18
19 #include <cstdint>
20
namespace wabt {

namespace {

// Total length in bytes of the UTF-8 sequence introduced by each possible
// lead byte. An entry of 0 marks a byte that can never begin a well-formed
// sequence:
//   * 0x80-0xbf are continuation bytes,
//   * 0xc0 and 0xc1 could only introduce overlong two-byte encodings,
//   * 0xf5-0xff would encode code points above U+10FFFF.
const int s_utf8_length[256] = {
    // 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xc0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xd0
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xe0
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0
};

// Returns true if |c| is a valid continuation byte, i.e. its two high bits
// are 10 (0x80-0xbf).
bool IsCont(uint8_t c) {
  return (c & 0xc0) == 0x80;
}

}  // end anonymous namespace

// Returns true if the |s_length| bytes starting at |s| are well-formed UTF-8.
//
// The input is rejected if it contains any of the following:
//   * a byte that cannot start a sequence (see s_utf8_length),
//   * a sequence cut short by the end of the buffer,
//   * a missing or malformed continuation byte,
//   * an overlong three- or four-byte encoding,
//   * an encoded UTF-16 surrogate half (U+D800-U+DFFF),
//   * a code point above U+10FFFF.
//
// An empty buffer (|s_length| == 0) is considered valid. Embedded NUL bytes
// are ordinary one-byte sequences; the buffer is not treated as a C string.
bool IsValidUtf8(const char* s, size_t s_length) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(s);
  const uint8_t* end = p + s_length;
  while (p < end) {
    uint8_t cu0 = *p;
    int length = s_utf8_length[cu0];
    // Compare against the number of bytes remaining instead of computing
    // |p + length|: forming a pointer more than one past the end of the
    // buffer is undefined behavior and may wrap around for a buffer located
    // at the very top of the address space. |end - p| is positive here
    // because of the loop condition.
    if (length > end - p) {
      return false;
    }

    switch (length) {
      case 0:
        // Not a valid lead byte.
        return false;

      case 1:
        p++;
        break;

      case 2:
        p++;
        if (!IsCont(*p++)) {
          return false;
        }
        break;

      case 3: {
        p++;
        uint8_t cu1 = *p++;
        uint8_t cu2 = *p++;
        if (!(IsCont(cu1) && IsCont(cu2)) ||
            (cu0 == 0xe0 && cu1 < 0xa0) ||  // Overlong encoding.
            (cu0 == 0xed && cu1 >= 0xa0)) {  // UTF-16 surrogate halves.
          return false;
        }
        break;
      }

      case 4: {
        p++;
        uint8_t cu1 = *p++;
        uint8_t cu2 = *p++;
        uint8_t cu3 = *p++;
        if (!(IsCont(cu1) && IsCont(cu2) && IsCont(cu3)) ||
            (cu0 == 0xf0 && cu1 < 0x90) ||  // Overlong encoding.
            (cu0 == 0xf4 && cu1 >= 0x90)) {  // Code point >= 0x110000.
          return false;
        }
        break;
      }
    }
  }
  return true;
}

}  // namespace wabt
105