1 // Copyright 2018 Google LLC.
2 // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3 
4 #include "src/utils/SkUTF.h"
5 
6 #include <climits>
7 
// Shifts |value| left by |shift| bits. The shift itself is carried out on the
// unsigned representation, so no signed left shift is ever performed; the
// result is then converted back to int32_t.
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
11 
// True when the lowest bit of |x| is clear, i.e. |x| is a multiple of 2.
template <typename T> static constexpr bool is_align2(T x) { return (x & 1) == 0; }
13 
// True when the two lowest bits of |x| are clear, i.e. |x| is a multiple of 4.
template <typename T> static constexpr bool is_align4(T x) { return (x & 3) == 0; }
15 
// True for UTF-16 code units in [0xD800, 0xDBFF] (high/leading surrogates).
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
17 
// True for UTF-16 code units in [0xDC00, 0xDFFF] (low/trailing surrogates).
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
19 
/** Classifies a single byte of a UTF-8 stream.

    @returns   -1  iff the byte can never appear in UTF-8 (0xC0, 0xC1, 0xF5..0xFF),
                0  iff it is a continuation byte (0x80..0xBF),
                1  iff it is an ASCII byte (0x00..0x7F),
                2  iff it leads a 2-byte sequence (0xC2..0xDF),
                3  iff it leads a 3-byte sequence (0xE0..0xEF), and
                4  iff it leads a 4-byte sequence (0xF0..0xF4).
      I.e.: a return value > 0 is the length in bytes of the sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;   // 0xxxxxxx
    }
    if (c < 0xC0) {
        return 0;   // 10xxxxxx
    }
    if (c < 0xC2 || c > 0xF4) {
        return -1;  // "octet values c0, c1, f5 to ff never appear"
    }
    if (c < 0xE0) {
        return 2;   // 110xxxxx
    }
    if (c < 0xF0) {
        return 3;   // 1110xxxx
    }
    return 4;       // 11110xxx
}
// |type| is a value returned by utf8_byte_type(); only the positive values
// (sequence lengths 1..4) denote a byte that may start a sequence.
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
42 
// True iff |c| has the form 10xxxxxx (0x80..0xBF), i.e. it is a UTF-8
// continuation byte.
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
44 
45 ////////////////////////////////////////////////////////////////////////////////
46 
CountUTF8(const char * utf8,size_t byteLength)47 int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
48     if (!utf8) {
49         return -1;
50     }
51     int count = 0;
52     const char* stop = utf8 + byteLength;
53     while (utf8 < stop) {
54         int type = utf8_byte_type(*(const uint8_t*)utf8);
55         if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
56             return -1;  // Sequence extends beyond end.
57         }
58         while(type-- > 1) {
59             ++utf8;
60             if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
61                 return -1;
62             }
63         }
64         ++utf8;
65         ++count;
66     }
67     return count;
68 }
69 
CountUTF16(const uint16_t * utf16,size_t byteLength)70 int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
71     if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
72         return -1;
73     }
74     const uint16_t* src = (const uint16_t*)utf16;
75     const uint16_t* stop = src + (byteLength >> 1);
76     int count = 0;
77     while (src < stop) {
78         unsigned c = *src++;
79         if (utf16_is_low_surrogate(c)) {
80             return -1;
81         }
82         if (utf16_is_high_surrogate(c)) {
83             if (src >= stop) {
84                 return -1;
85             }
86             c = *src++;
87             if (!utf16_is_low_surrogate(c)) {
88                 return -1;
89             }
90         }
91         count += 1;
92     }
93     return count;
94 }
95 
CountUTF32(const int32_t * utf32,size_t byteLength)96 int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
97     if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || byteLength >> 2 > INT_MAX) {
98         return -1;
99     }
100     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
101     const uint32_t* ptr = (const uint32_t*)utf32;
102     const uint32_t* stop = ptr + (byteLength >> 2);
103     while (ptr < stop) {
104         if (*ptr & kInvalidUnicharMask) {
105             return -1;
106         }
107         ptr += 1;
108     }
109     return (int)(byteLength >> 2);
110 }
111 
112 template <typename T>
next_fail(const T ** ptr,const T * end)113 static SkUnichar next_fail(const T** ptr, const T* end) {
114     *ptr = end;
115     return -1;
116 }
117 
NextUTF8(const char ** ptr,const char * end)118 SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
119     if (!ptr || !end ) {
120         return -1;
121     }
122     const uint8_t*  p = (const uint8_t*)*ptr;
123     if (!p || p >= (const uint8_t*)end) {
124         return next_fail(ptr, end);
125     }
126     int             c = *p;
127     int             hic = c << 24;
128 
129     if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
130         return next_fail(ptr, end);
131     }
132     if (hic < 0) {
133         uint32_t mask = (uint32_t)~0x3F;
134         hic = left_shift(hic, 1);
135         do {
136             ++p;
137             if (p >= (const uint8_t*)end) {
138                 return next_fail(ptr, end);
139             }
140             // check before reading off end of array.
141             uint8_t nextByte = *p;
142             if (!utf8_byte_is_continuation(nextByte)) {
143                 return next_fail(ptr, end);
144             }
145             c = (c << 6) | (nextByte & 0x3F);
146             mask <<= 5;
147         } while ((hic = left_shift(hic, 1)) < 0);
148         c &= ~mask;
149     }
150     *ptr = (char*)p + 1;
151     return c;
152 }
153 
NextUTF16(const uint16_t ** ptr,const uint16_t * end)154 SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
155     if (!ptr || !end ) {
156         return -1;
157     }
158     const uint16_t* src = *ptr;
159     if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
160         return next_fail(ptr, end);
161     }
162     uint16_t c = *src++;
163     SkUnichar result = c;
164     if (utf16_is_low_surrogate(c)) {
165         return next_fail(ptr, end);  // srcPtr should never point at low surrogate.
166     }
167     if (utf16_is_high_surrogate(c)) {
168         if (src + 1 > end) {
169             return next_fail(ptr, end);  // Truncated string.
170         }
171         uint16_t low = *src++;
172         if (!utf16_is_low_surrogate(low)) {
173             return next_fail(ptr, end);
174         }
175         /*
176         [paraphrased from wikipedia]
177         Take the high surrogate and subtract 0xD800, then multiply by 0x400.
178         Take the low surrogate and subtract 0xDC00.  Add these two results
179         together, and finally add 0x10000 to get the final decoded codepoint.
180 
181         unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
182         unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
183         unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
184         unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
185         */
186         result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
187     }
188     *ptr = src;
189     return result;
190 }
191 
NextUTF32(const int32_t ** ptr,const int32_t * end)192 SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
193     if (!ptr || !end ) {
194         return -1;
195     }
196     const int32_t* s = *ptr;
197     if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
198         return next_fail(ptr, end);
199     }
200     int32_t value = *s;
201     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
202     if (value & kInvalidUnicharMask) {
203         return next_fail(ptr, end);
204     }
205     *ptr = s + 1;
206     return value;
207 }
208 
ToUTF8(SkUnichar uni,char utf8[SkUTF::kMaxBytesInUTF8Sequence])209 size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
210     if ((uint32_t)uni > 0x10FFFF) {
211         return 0;
212     }
213     if (uni <= 127) {
214         if (utf8) {
215             *utf8 = (char)uni;
216         }
217         return 1;
218     }
219     char    tmp[4];
220     char*   p = tmp;
221     size_t  count = 1;
222     while (uni > 0x7F >> count) {
223         *p++ = (char)(0x80 | (uni & 0x3F));
224         uni >>= 6;
225         count += 1;
226     }
227     if (utf8) {
228         p = tmp;
229         utf8 += count;
230         while (p < tmp + count - 1) {
231             *--utf8 = *p++;
232         }
233         *--utf8 = (char)(~(0xFF >> count) | uni);
234     }
235     return count;
236 }
237 
ToUTF16(SkUnichar uni,uint16_t utf16[2])238 size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
239     if ((uint32_t)uni > 0x10FFFF) {
240         return 0;
241     }
242     int extra = (uni > 0xFFFF);
243     if (utf16) {
244         if (extra) {
245             utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
246             utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
247         } else {
248             utf16[0] = (uint16_t)uni;
249         }
250     }
251     return 1 + extra;
252 }
253 
254