1 /*
2 * Copyright 2006 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9 #include "SkUtils.h"
10
11 /* 0xxxxxxx 1 total
12 10xxxxxx // never a leading byte
13 110xxxxx 2 total
14 1110xxxx 3 total
15 11110xxx 4 total
16
17 11 10 01 01 xx xx xx xx 0...
18 0xE5XX0000
19 0xE5 << 24
20 */
21
// Returns true unless `c` is one of the octets that never occur in UTF-8:
// 0xC0, 0xC1, or anything from 0xF5 through 0xFF.
static bool utf8_byte_is_valid(uint8_t c) {
    if (c >= 0xF5) {
        return false;
    }
    return c != 0xC0 && c != 0xC1;
}
// Returns true when `c` has the bit pattern 10xxxxxx, i.e. it is a UTF-8
// continuation (trailing) byte.
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c >> 6) == 0x2;
}
utf8_byte_is_leading_byte(uint8_t c)28 static bool utf8_byte_is_leading_byte(uint8_t c) {
29 return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c);
30 }
31
32 #ifdef SK_DEBUG
assert_utf8_leadingbyte(unsigned c)33 static void assert_utf8_leadingbyte(unsigned c) {
34 SkASSERT(utf8_byte_is_leading_byte(SkToU8(c)));
35 }
36
SkUTF8_LeadByteToCount(unsigned c)37 int SkUTF8_LeadByteToCount(unsigned c) {
38 assert_utf8_leadingbyte(c);
39 return (((0xE5 << 24) >> (c >> 4 << 1)) & 3) + 1;
40 }
41 #else
42 #define assert_utf8_leadingbyte(c)
43 #endif
44
/**
 *  Classifies a single byte of UTF-8 text.
 *
 *  @returns -1 iff invalid UTF8 byte,
 *            0 iff UTF8 continuation byte,
 *            1 iff ASCII byte,
 *            2 iff leading byte of 2-byte sequence,
 *            3 iff leading byte of 3-byte sequence, and
 *            4 iff leading byte of 4-byte sequence.
 *
 *  I.e.: if return value > 0, then gives length of sequence.
 */
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    // "octet values C0, C1, F5 to FF never appear"
    if (c >= 0xF5 || (c & 0xFE) == 0xC0) {
        return -1;
    }
    // 0xE5 << 24 is a packed table of (length - 1), two bits per high nibble
    // of the lead byte. It is kept unsigned so that shifting a one into the
    // top bit and shifting back down are both fully defined.
    const unsigned table = 0xE5u << 24;
    const unsigned slot = ((unsigned)c >> 4) << 1;
    return (int)((table >> slot) & 3u) + 1;
}
// True when `type` (a result of utf8_byte_type) denotes a byte that can start
// a sequence, i.e. any strictly positive value.
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
67
SkUTF8_CountUnichars(const char utf8[])68 int SkUTF8_CountUnichars(const char utf8[]) {
69 SkASSERT(utf8);
70
71 int count = 0;
72
73 for (;;) {
74 int c = *(const uint8_t*)utf8;
75 if (c == 0) {
76 break;
77 }
78 utf8 += SkUTF8_LeadByteToCount(c);
79 count += 1;
80 }
81 return count;
82 }
83
84 // SAFE: returns -1 if invalid UTF-8
SkUTF8_CountUnichars(const void * text,size_t byteLength)85 int SkUTF8_CountUnichars(const void* text, size_t byteLength) {
86 SkASSERT(text);
87 const char* utf8 = static_cast<const char*>(text);
88 if (byteLength == 0) {
89 return 0;
90 }
91
92 int count = 0;
93 const char* stop = utf8 + byteLength;
94
95 while (utf8 < stop) {
96 int type = utf8_byte_type(*(const uint8_t*)utf8);
97 SkASSERT(type >= -1 && type <= 4);
98 if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
99 // Sequence extends beyond end.
100 return -1;
101 }
102 while(type-- > 1) {
103 ++utf8;
104 if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
105 return -1;
106 }
107 }
108 ++utf8;
109 ++count;
110 }
111 return count;
112 }
113
SkUTF8_ToUnichar(const char utf8[])114 SkUnichar SkUTF8_ToUnichar(const char utf8[]) {
115 SkASSERT(utf8);
116
117 const uint8_t* p = (const uint8_t*)utf8;
118 int c = *p;
119 int hic = c << 24;
120
121 assert_utf8_leadingbyte(c);
122
123 if (hic < 0) {
124 uint32_t mask = (uint32_t)~0x3F;
125 hic = SkLeftShift(hic, 1);
126 do {
127 c = (c << 6) | (*++p & 0x3F);
128 mask <<= 5;
129 } while ((hic = SkLeftShift(hic, 1)) < 0);
130 c &= ~mask;
131 }
132 return c;
133 }
134
135 // SAFE: returns -1 on invalid UTF-8 sequence.
SkUTF8_NextUnicharWithError(const char ** ptr,const char * end)136 SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) {
137 SkASSERT(ptr && *ptr);
138 SkASSERT(*ptr < end);
139 const uint8_t* p = (const uint8_t*)*ptr;
140 int c = *p;
141 int hic = c << 24;
142
143 if (!utf8_byte_is_leading_byte(c)) {
144 return -1;
145 }
146 if (hic < 0) {
147 uint32_t mask = (uint32_t)~0x3F;
148 hic = SkLeftShift(hic, 1);
149 do {
150 ++p;
151 if (p >= (const uint8_t*)end) {
152 return -1;
153 }
154 // check before reading off end of array.
155 uint8_t nextByte = *p;
156 if (!utf8_byte_is_continuation(nextByte)) {
157 return -1;
158 }
159 c = (c << 6) | (nextByte & 0x3F);
160 mask <<= 5;
161 } while ((hic = SkLeftShift(hic, 1)) < 0);
162 c &= ~mask;
163 }
164 *ptr = (char*)p + 1;
165 return c;
166 }
167
SkUTF8_NextUnichar(const char ** ptr)168 SkUnichar SkUTF8_NextUnichar(const char** ptr) {
169 SkASSERT(ptr && *ptr);
170
171 const uint8_t* p = (const uint8_t*)*ptr;
172 int c = *p;
173 int hic = c << 24;
174
175 assert_utf8_leadingbyte(c);
176
177 if (hic < 0) {
178 uint32_t mask = (uint32_t)~0x3F;
179 hic = SkLeftShift(hic, 1);
180 do {
181 c = (c << 6) | (*++p & 0x3F);
182 mask <<= 5;
183 } while ((hic = SkLeftShift(hic, 1)) < 0);
184 c &= ~mask;
185 }
186 *ptr = (char*)p + 1;
187 return c;
188 }
189
SkUTF8_PrevUnichar(const char ** ptr)190 SkUnichar SkUTF8_PrevUnichar(const char** ptr) {
191 SkASSERT(ptr && *ptr);
192
193 const char* p = *ptr;
194
195 if (*--p & 0x80) {
196 while (*--p & 0x40) {
197 ;
198 }
199 }
200
201 *ptr = (char*)p;
202 return SkUTF8_NextUnichar(&p);
203 }
204
SkUTF8_FromUnichar(SkUnichar uni,char utf8[])205 size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[]) {
206 if ((uint32_t)uni > 0x10FFFF) {
207 SkDEBUGFAIL("bad unichar");
208 return 0;
209 }
210
211 if (uni <= 127) {
212 if (utf8) {
213 *utf8 = (char)uni;
214 }
215 return 1;
216 }
217
218 char tmp[4];
219 char* p = tmp;
220 size_t count = 1;
221
222 SkDEBUGCODE(SkUnichar orig = uni;)
223
224 while (uni > 0x7F >> count) {
225 *p++ = (char)(0x80 | (uni & 0x3F));
226 uni >>= 6;
227 count += 1;
228 }
229
230 if (utf8) {
231 p = tmp;
232 utf8 += count;
233 while (p < tmp + count - 1) {
234 *--utf8 = *p++;
235 }
236 *--utf8 = (char)(~(0xFF >> count) | uni);
237 }
238
239 SkASSERT(utf8 == nullptr || orig == SkUTF8_ToUnichar(utf8));
240 return count;
241 }
242
243 ///////////////////////////////////////////////////////////////////////////////
244
SkUTF16_CountUnichars(const uint16_t src[])245 int SkUTF16_CountUnichars(const uint16_t src[]) {
246 SkASSERT(src);
247
248 int count = 0;
249 unsigned c;
250 while ((c = *src++) != 0) {
251 SkASSERT(!SkUTF16_IsLowSurrogate(c));
252 if (SkUTF16_IsHighSurrogate(c)) {
253 c = *src++;
254 SkASSERT(SkUTF16_IsLowSurrogate(c));
255 }
256 count += 1;
257 }
258 return count;
259 }
260
261 // returns -1 on error
SkUTF16_CountUnichars(const void * text,size_t byteLength)262 int SkUTF16_CountUnichars(const void* text, size_t byteLength) {
263 SkASSERT(text);
264 if (byteLength == 0) {
265 return 0;
266 }
267 if (!SkIsAlign2(intptr_t(text)) || !SkIsAlign2(byteLength)) {
268 return -1;
269 }
270
271 const uint16_t* src = static_cast<const uint16_t*>(text);
272 const uint16_t* stop = src + (byteLength >> 1);
273 int count = 0;
274 while (src < stop) {
275 unsigned c = *src++;
276 SkASSERT(!SkUTF16_IsLowSurrogate(c));
277 if (SkUTF16_IsHighSurrogate(c)) {
278 if (src >= stop) {
279 return -1;
280 }
281 c = *src++;
282 if (!SkUTF16_IsLowSurrogate(c)) {
283 return -1;
284 }
285 }
286 count += 1;
287 }
288 return count;
289 }
290
SkUTF16_NextUnichar(const uint16_t ** srcPtr)291 SkUnichar SkUTF16_NextUnichar(const uint16_t** srcPtr) {
292 SkASSERT(srcPtr && *srcPtr);
293
294 const uint16_t* src = *srcPtr;
295 SkUnichar c = *src++;
296
297 SkASSERT(!SkUTF16_IsLowSurrogate(c));
298 if (SkUTF16_IsHighSurrogate(c)) {
299 unsigned c2 = *src++;
300 SkASSERT(SkUTF16_IsLowSurrogate(c2));
301
302 // c = ((c & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000
303 // c = (((c & 0x3FF) + 64) << 10) + (c2 & 0x3FF)
304 c = (c << 10) + c2 + (0x10000 - (0xD800 << 10) - 0xDC00);
305 }
306 *srcPtr = src;
307 return c;
308 }
309
SkUTF16_PrevUnichar(const uint16_t ** srcPtr)310 SkUnichar SkUTF16_PrevUnichar(const uint16_t** srcPtr) {
311 SkASSERT(srcPtr && *srcPtr);
312
313 const uint16_t* src = *srcPtr;
314 SkUnichar c = *--src;
315
316 SkASSERT(!SkUTF16_IsHighSurrogate(c));
317 if (SkUTF16_IsLowSurrogate(c)) {
318 unsigned c2 = *--src;
319 SkASSERT(SkUTF16_IsHighSurrogate(c2));
320 c = (c2 << 10) + c + (0x10000 - (0xD800 << 10) - 0xDC00);
321 }
322 *srcPtr = src;
323 return c;
324 }
325
SkUTF16_FromUnichar(SkUnichar uni,uint16_t dst[])326 size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t dst[]) {
327 SkASSERT((unsigned)uni <= 0x10FFFF);
328
329 int extra = (uni > 0xFFFF);
330
331 if (dst) {
332 if (extra) {
333 // dst[0] = SkToU16(0xD800 | ((uni - 0x10000) >> 10));
334 // dst[0] = SkToU16(0xD800 | ((uni >> 10) - 64));
335 dst[0] = SkToU16((0xD800 - 64) + (uni >> 10));
336 dst[1] = SkToU16(0xDC00 | (uni & 0x3FF));
337
338 SkASSERT(SkUTF16_IsHighSurrogate(dst[0]));
339 SkASSERT(SkUTF16_IsLowSurrogate(dst[1]));
340 } else {
341 dst[0] = SkToU16(uni);
342 SkASSERT(!SkUTF16_IsHighSurrogate(dst[0]));
343 SkASSERT(!SkUTF16_IsLowSurrogate(dst[0]));
344 }
345 }
346 return 1 + extra;
347 }
348
SkUTF16_ToUTF8(const uint16_t utf16[],int numberOf16BitValues,char utf8[])349 size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues,
350 char utf8[]) {
351 SkASSERT(numberOf16BitValues >= 0);
352 if (numberOf16BitValues <= 0) {
353 return 0;
354 }
355
356 SkASSERT(utf16 != nullptr);
357
358 const uint16_t* stop = utf16 + numberOf16BitValues;
359 size_t size = 0;
360
361 if (utf8 == nullptr) { // just count
362 while (utf16 < stop) {
363 size += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), nullptr);
364 }
365 } else {
366 char* start = utf8;
367 while (utf16 < stop) {
368 utf8 += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), utf8);
369 }
370 size = utf8 - start;
371 }
372 return size;
373 }
374
375 const char SkHexadecimalDigits::gUpper[16] =
376 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
377 const char SkHexadecimalDigits::gLower[16] =
378 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
379
380
381 // returns -1 on error
SkUTF32_CountUnichars(const void * text,size_t byteLength)382 int SkUTF32_CountUnichars(const void* text, size_t byteLength) {
383 if (byteLength == 0) {
384 return 0;
385 }
386 if (!SkIsAlign4(intptr_t(text)) || !SkIsAlign4(byteLength)) {
387 return -1;
388 }
389 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
390 const uint32_t* ptr = static_cast<const uint32_t*>(text);
391 const uint32_t* stop = ptr + (byteLength >> 2);
392 while (ptr < stop) {
393 if (*ptr & kInvalidUnicharMask) {
394 return -1;
395 }
396 ptr += 1;
397 }
398 return SkToInt(byteLength >> 2);
399 }
400
401