1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <folly/Unicode.h>
18 
19 #include <folly/Conv.h>
20 
21 namespace folly {
22 
23 //////////////////////////////////////////////////////////////////////
24 
// Encodes a single code point as a UTF-8 byte sequence.
//
// The result holds between one and four bytes, chosen by the magnitude of cp.
// Values above 0x10FFFF yield an empty string. No surrogate check is made:
// values in 0xD800..0xDFFF are emitted as ordinary three-byte sequences.
std::string codePointToUtf8(char32_t cp) {
  // Byte layouts follow http://en.wikipedia.org/wiki/UTF-8.

  // Builds one continuation byte (10xxxxxx) out of the six bits of cp that
  // begin at bit position `shift`.
  const auto trailing = [cp](unsigned int shift) {
    return static_cast<char>(0x80 | (0x3f & (cp >> shift)));
  };

  std::string encoded;

  if (cp <= 0x7f) {
    // 0xxxxxxx
    encoded.push_back(static_cast<char>(cp));
  } else if (cp <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    encoded.push_back(static_cast<char>(0xC0 | (cp >> 6)));
    encoded.push_back(trailing(0));
  } else if (cp <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    encoded.push_back(static_cast<char>(0xE0 | (cp >> 12)));
    encoded.push_back(trailing(6));
    encoded.push_back(trailing(0));
  } else if (cp <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    encoded.push_back(static_cast<char>(0xF0 | (cp >> 18)));
    encoded.push_back(trailing(12));
    encoded.push_back(trailing(6));
    encoded.push_back(trailing(0));
  }

  return encoded;
}
52 
utf8ToCodePoint(const unsigned char * & p,const unsigned char * const e,bool skipOnError)53 char32_t utf8ToCodePoint(
54     const unsigned char*& p, const unsigned char* const e, bool skipOnError) {
55   // clang-format off
56   /** UTF encodings
57   *  | # of B | First CP |  Last CP  | Bit Pattern
58   *  |   1    |   0x0000 |   0x007F  | 0xxxxxxx
59   *  |   2    |   0x0080 |   0x07FF  | 110xxxxx 10xxxxxx
60   *  |   3    |   0x0800 |   0xFFFF  | 1110xxxx 10xxxxxx 10xxxxxx
61   *  |   4    |  0x10000 | 0x10FFFF  | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
62   *  |   5    |       -  |        -  | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
63   *  |   6    |       -  |        -  | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
64   *
65   *
66   * NOTE:
67   * - the 4B encoding can encode values up to 0x1FFFFF,
68   *   but Unicode defines 0x10FFFF to be the largest code point
69   * - the 5B & 6B encodings will all encode values larger than 0x1FFFFF
70   *   (so larger than the largest code point value 0x10FFFF) so they form invalid
71   *   unicode code points
72   *
73   * On invalid input (invalid encoding or code points larger than 0x10FFFF):
74   * - When skipOnError is true, this function will skip the first byte and return
75   *   U'\ufffd'. Potential optimization: skip the whole invalid range.
76   * - When skipOnError is false, throws.
77   */
78   // clang-format on
79 
80   const auto skip = [&] {
81     ++p;
82     return U'\ufffd';
83   };
84 
85   if (p >= e) {
86     if (skipOnError) {
87       return skip();
88     }
89     throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
90   }
91 
92   unsigned char fst = *p;
93   if (!(fst & 0x80)) {
94     // trivial case, 1 byte encoding
95     return *p++;
96   }
97 
98   static const uint32_t bitMask[] = {
99       (1 << 7) - 1,
100       (1 << 11) - 1,
101       (1 << 16) - 1,
102       (1 << 21) - 1,
103   };
104 
105   // upper control bits are masked out later
106   uint32_t d = fst;
107 
108   // multi-byte encoded values must start with bits 0b11. 0xC0 is 0b11000000
109   if ((fst & 0xC0) != 0xC0) {
110     if (skipOnError) {
111       return skip();
112     }
113     throw std::runtime_error(
114         to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
115   }
116 
117   fst <<= 1;
118 
119   for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
120     const unsigned char tmp = p[i];
121 
122     // from the second byte on, format should be 10xxxxxx
123     if ((tmp & 0xC0) != 0x80) {
124       if (skipOnError) {
125         return skip();
126       }
127       throw std::runtime_error(to<std::string>(
128           "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
129     }
130 
131     // gradually fill a 32 bit integer d with non control bits in tmp
132     // 0x3F is 0b00111111 which clears out the first 2 control bits
133     d = (d << 6) | (tmp & 0x3F);
134     fst <<= 1;
135 
136     if (!(fst & 0x80)) {
137       // We know the length of encoding now, since we encounter the first "0" in
138       // fst (the first byte). This branch processes the last byte of encoding.
139       d &= bitMask[i]; // d is now the code point
140 
141       // overlong, could have been encoded with i bytes
142       if ((d & ~bitMask[i - 1]) == 0) {
143         if (skipOnError) {
144           return skip();
145         }
146         throw std::runtime_error(
147             to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
148       }
149 
150       // check for surrogates only needed for 3 bytes
151       if (i == 2) {
152         if (d >= 0xD800 && d <= 0xDFFF) {
153           if (skipOnError) {
154             return skip();
155           }
156           throw std::runtime_error(
157               to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
158         }
159       }
160 
161       // While UTF-8 encoding can encode arbitrary numbers, 0x10FFFF is the
162       // largest defined Unicode code point.
163       // Only >=4 bytes can UTF-8 encode such values, so i=3 here.
164       if (d > 0x10FFFF) {
165         if (skipOnError) {
166           return skip();
167         }
168         throw std::runtime_error(
169             "folly::utf8ToCodePoint encoding exceeds max unicode code point");
170       }
171       p += i + 1;
172       return d;
173     }
174   }
175 
176   if (skipOnError) {
177     return skip();
178   }
179   throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
180 }
181 
182 //////////////////////////////////////////////////////////////////////
183 
184 } // namespace folly
185