1 /*
2  * Copyright 2017 WebAssembly Community Group participants
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/utf8.h"
18 
19 #include <cstdint>
20 
21 namespace wabt {
22 
23 namespace {
24 
// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
// the total number of bytes in the sequence that byte introduces, or 0 when
// the byte may not begin a sequence at all.
constexpr int s_utf8_length[256] = {
    // 0x00-0x7f: single-byte (ASCII) sequences.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00-0x0f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10-0x1f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20-0x2f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30-0x3f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40-0x4f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50-0x5f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60-0x6f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70-0x7f

    // 0x80-0xbf: continuation bytes; never valid in the lead position.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80-0x8f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90-0x9f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0-0xaf
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0-0xbf

    // 0xc0-0xdf: two-byte sequences, except 0xc0 and 0xc1 which are invalid.
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xc0-0xcf
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xd0-0xdf

    // 0xe0-0xef: three-byte sequences.
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xe0-0xef

    // 0xf0-0xf4: four-byte sequences; 0xf5-0xff are invalid.
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0-0xff
};
44 
// Tests whether |c| has the bit pattern 10xxxxxx, i.e. whether it is a UTF-8
// continuation (trailing) byte.
bool IsCont(uint8_t c) {
  const uint8_t top_two_bits = c >> 6;
  return top_two_bits == 0x2;
}
49 
50 }  // end anonymous namespace
51 
IsValidUtf8(const char * s,size_t s_length)52 bool IsValidUtf8(const char* s, size_t s_length) {
53   const uint8_t* p = reinterpret_cast<const uint8_t*>(s);
54   const uint8_t* end = p + s_length;
55   while (p < end) {
56     uint8_t cu0 = *p;
57     int length = s_utf8_length[cu0];
58     if (p + length > end) {
59       return false;
60     }
61 
62     switch (length) {
63       case 0:
64         return false;
65 
66       case 1:
67         p++;
68         break;
69 
70       case 2:
71         p++;
72         if (!IsCont(*p++)) {
73           return false;
74         }
75         break;
76 
77       case 3: {
78         p++;
79         uint8_t cu1 = *p++;
80         uint8_t cu2 = *p++;
81         if (!(IsCont(cu1) && IsCont(cu2)) ||
82             (cu0 == 0xe0 && cu1 < 0xa0) ||  // Overlong encoding.
83             (cu0 == 0xed && cu1 >= 0xa0))   // UTF-16 surrogate halves.
84           return false;
85         break;
86       }
87 
88       case 4: {
89         p++;
90         uint8_t cu1 = *p++;
91         uint8_t cu2 = *p++;
92         uint8_t cu3 = *p++;
93         if (!(IsCont(cu1) && IsCont(cu2) && IsCont(cu3)) ||
94             (cu0 == 0xf0 && cu1 < 0x90) ||  // Overlong encoding.
95             (cu0 == 0xf4 && cu1 >= 0x90))   // Code point >= 0x11000.
96           return false;
97         break;
98       }
99     }
100   }
101   return true;
102 }
103 
104 }  // namespace wabt
105