1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <folly/Unicode.h>
18
19 #include <folly/Conv.h>
20
21 namespace folly {
22
23 //////////////////////////////////////////////////////////////////////
24
/**
 * Encodes a single code point as a UTF-8 byte sequence.
 *
 * Based on the description at http://en.wikipedia.org/wiki/UTF-8.
 *
 * @param cp  The code point to encode.
 * @return    The 1- to 4-byte UTF-8 encoding of cp, or an empty string when
 *            cp is greater than 0x10FFFF (no branch below handles it).
 *
 * Note: values in the surrogate range 0xD800-0xDFFF are not rejected here;
 * they fall into the 3-byte branch and are encoded like any other value.
 */
std::string codePointToUtf8(char32_t cp) {
  std::string result;

  // In every branch the cast to char is applied to the fully assembled byte
  // value, so no int -> char narrowing happens implicitly on assignment.
  if (cp <= 0x7f) {
    // 0xxxxxxx
    result.resize(1);
    result[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    result.resize(2);
    result[1] = static_cast<char>(0x80 | (0x3f & cp));
    result[0] = static_cast<char>(0xC0 | (cp >> 6));
  } else if (cp <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    result.resize(3);
    result[2] = static_cast<char>(0x80 | (0x3f & cp));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[0] = static_cast<char>(0xE0 | (cp >> 12));
  } else if (cp <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result.resize(4);
    result[3] = static_cast<char>(0x80 | (0x3f & cp));
    result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
    result[0] = static_cast<char>(0xF0 | (cp >> 18));
  }

  return result;
}
52
utf8ToCodePoint(const unsigned char * & p,const unsigned char * const e,bool skipOnError)53 char32_t utf8ToCodePoint(
54 const unsigned char*& p, const unsigned char* const e, bool skipOnError) {
55 // clang-format off
56 /** UTF encodings
57 * | # of B | First CP | Last CP | Bit Pattern
58 * | 1 | 0x0000 | 0x007F | 0xxxxxxx
59 * | 2 | 0x0080 | 0x07FF | 110xxxxx 10xxxxxx
60 * | 3 | 0x0800 | 0xFFFF | 1110xxxx 10xxxxxx 10xxxxxx
61 * | 4 | 0x10000 | 0x10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
62 * | 5 | - | - | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
63 * | 6 | - | - | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
64 *
65 *
66 * NOTE:
67 * - the 4B encoding can encode values up to 0x1FFFFF,
68 * but Unicode defines 0x10FFFF to be the largest code point
69 * - the 5B & 6B encodings will all encode values larger than 0x1FFFFF
70 * (so larger than the largest code point value 0x10FFFF) so they form invalid
71 * unicode code points
72 *
73 * On invalid input (invalid encoding or code points larger than 0x10FFFF):
74 * - When skipOnError is true, this function will skip the first byte and return
75 * U'\ufffd'. Potential optimization: skip the whole invalid range.
76 * - When skipOnError is false, throws.
77 */
78 // clang-format on
79
80 const auto skip = [&] {
81 ++p;
82 return U'\ufffd';
83 };
84
85 if (p >= e) {
86 if (skipOnError) {
87 return skip();
88 }
89 throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
90 }
91
92 unsigned char fst = *p;
93 if (!(fst & 0x80)) {
94 // trivial case, 1 byte encoding
95 return *p++;
96 }
97
98 static const uint32_t bitMask[] = {
99 (1 << 7) - 1,
100 (1 << 11) - 1,
101 (1 << 16) - 1,
102 (1 << 21) - 1,
103 };
104
105 // upper control bits are masked out later
106 uint32_t d = fst;
107
108 // multi-byte encoded values must start with bits 0b11. 0xC0 is 0b11000000
109 if ((fst & 0xC0) != 0xC0) {
110 if (skipOnError) {
111 return skip();
112 }
113 throw std::runtime_error(
114 to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
115 }
116
117 fst <<= 1;
118
119 for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
120 const unsigned char tmp = p[i];
121
122 // from the second byte on, format should be 10xxxxxx
123 if ((tmp & 0xC0) != 0x80) {
124 if (skipOnError) {
125 return skip();
126 }
127 throw std::runtime_error(to<std::string>(
128 "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
129 }
130
131 // gradually fill a 32 bit integer d with non control bits in tmp
132 // 0x3F is 0b00111111 which clears out the first 2 control bits
133 d = (d << 6) | (tmp & 0x3F);
134 fst <<= 1;
135
136 if (!(fst & 0x80)) {
137 // We know the length of encoding now, since we encounter the first "0" in
138 // fst (the first byte). This branch processes the last byte of encoding.
139 d &= bitMask[i]; // d is now the code point
140
141 // overlong, could have been encoded with i bytes
142 if ((d & ~bitMask[i - 1]) == 0) {
143 if (skipOnError) {
144 return skip();
145 }
146 throw std::runtime_error(
147 to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
148 }
149
150 // check for surrogates only needed for 3 bytes
151 if (i == 2) {
152 if (d >= 0xD800 && d <= 0xDFFF) {
153 if (skipOnError) {
154 return skip();
155 }
156 throw std::runtime_error(
157 to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
158 }
159 }
160
161 // While UTF-8 encoding can encode arbitrary numbers, 0x10FFFF is the
162 // largest defined Unicode code point.
163 // Only >=4 bytes can UTF-8 encode such values, so i=3 here.
164 if (d > 0x10FFFF) {
165 if (skipOnError) {
166 return skip();
167 }
168 throw std::runtime_error(
169 "folly::utf8ToCodePoint encoding exceeds max unicode code point");
170 }
171 p += i + 1;
172 return d;
173 }
174 }
175
176 if (skipOnError) {
177 return skip();
178 }
179 throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
180 }
181
182 //////////////////////////////////////////////////////////////////////
183
184 } // namespace folly
185