1 /*
2  * Copyright 2006 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 
9 #include "SkUtils.h"
10 
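/*  UTF-8 leading-byte bit patterns and the total length of the sequence
    each one introduces:

    0xxxxxxx    1 total
    10xxxxxx    // never a leading byte
    110xxxxx    2 total
    1110xxxx    3 total
    11110xxx    4 total

    The lead-byte-to-length code below uses a packed lookup table: the value
    (length - 1) for each high nibble of the lead byte is stored as a 2-bit
    field at bit position (2 * nibble) of a 32-bit word. Nibbles 0xF, 0xE,
    0xD and 0xC hold 3, 2, 1 and 1; every lower nibble holds 0:

    11 10 01 01 xx xx xx xx 0...
    0xE5XX0000
    0xE5 << 24
*/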
21 
// Reports whether |c| is a byte value that may occur in UTF-8 at all.
// The values 0xC0, 0xC1 and 0xF5..0xFF are the ones rejected.
static bool utf8_byte_is_valid(uint8_t c) {
    if (c == 0xC0 || c == 0xC1) {
        return false;
    }
    return c <= 0xF4;
}
// Reports whether |c| has the 10xxxxxx bit pattern, i.e. its top two bits
// are exactly binary 10.
static bool utf8_byte_is_continuation(uint8_t c) {
    const unsigned topTwoBits = c >> 6;
    return topTwoBits == 0x2;
}
utf8_byte_is_leading_byte(uint8_t c)28 static bool utf8_byte_is_leading_byte(uint8_t c) {
29     return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c);
30 }
31 
#ifdef SK_DEBUG
    // Debug-only check that |c| is a byte that may start a UTF-8 sequence.
    static void assert_utf8_leadingbyte(unsigned c) {
        SkASSERT(utf8_byte_is_leading_byte(SkToU8(c)));
    }

    // Debug definition of SkUTF8_LeadByteToCount (the release definition is
    // not in this file; presumably it comes from SkUtils.h -- TODO confirm).
    // Asserts that |c| is a legal leading byte, then returns the total number
    // of bytes (1..4) in the sequence it starts.
    int SkUTF8_LeadByteToCount(unsigned c) {
        assert_utf8_leadingbyte(c);
        // 0xE5 << 24 is a packed table of 2-bit (length - 1) fields indexed by
        // the high nibble of |c|. The table is built and shifted as uint32_t:
        // as a plain int, 0xE5 << 24 overflows into the sign bit and the
        // subsequent right shift of a negative value is implementation-defined.
        const uint32_t kLengthTable = static_cast<uint32_t>(0xE5) << 24;
        return static_cast<int>((kLengthTable >> (c >> 4 << 1)) & 3) + 1;
    }
#else
    // Release builds: the leading-byte assertion compiles away to nothing.
    #define assert_utf8_leadingbyte(c)
#endif
44 
/**
 * Classifies a single byte of a UTF-8 stream.
 *
 * @param c  the byte to classify.
 * @returns -1  iff invalid UTF8 byte,
 *           0  iff UTF8 continuation byte,
 *           1  iff ASCII byte,
 *           2  iff leading byte of 2-byte sequence,
 *           3  iff leading byte of 3-byte sequence, and
 *           4  iff leading byte of 4-byte sequence.
 *
 * I.e.: if return value > 0, then gives length of sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c < 0xF5 && (c & 0xFE) != 0xC0) { // "octet values C0, C1, F5 to FF never appear"
        // Packed table of 2-bit (length - 1) fields indexed by the high nibble
        // of |c|. Kept in uint32_t so neither the left shift into bit 31 nor
        // the right shift depends on signed-integer shift behavior.
        const uint32_t kLengthTable = static_cast<uint32_t>(0xE5) << 24;
        const unsigned fieldShift = static_cast<unsigned>(c) >> 4 << 1;
        return static_cast<int>((kLengthTable >> fieldShift) & 3) + 1;
    }
    return -1;
}
// Interprets a utf8_byte_type() result: only the positive values (sequence
// lengths) denote a byte that can start a sequence.
static bool utf8_type_is_valid_leading_byte(int type) {
    return 1 <= type;
}
67 
SkUTF8_CountUnichars(const char utf8[])68 int SkUTF8_CountUnichars(const char utf8[]) {
69     SkASSERT(utf8);
70 
71     int count = 0;
72 
73     for (;;) {
74         int c = *(const uint8_t*)utf8;
75         if (c == 0) {
76             break;
77         }
78         utf8 += SkUTF8_LeadByteToCount(c);
79         count += 1;
80     }
81     return count;
82 }
83 
84 // SAFE: returns -1 if invalid UTF-8
SkUTF8_CountUnichars(const void * text,size_t byteLength)85 int SkUTF8_CountUnichars(const void* text, size_t byteLength) {
86     SkASSERT(text);
87     const char* utf8 = static_cast<const char*>(text);
88     if (byteLength == 0) {
89         return 0;
90     }
91 
92     int         count = 0;
93     const char* stop = utf8 + byteLength;
94 
95     while (utf8 < stop) {
96         int type = utf8_byte_type(*(const uint8_t*)utf8);
97         SkASSERT(type >= -1 && type <= 4);
98         if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
99             // Sequence extends beyond end.
100             return -1;
101         }
102         while(type-- > 1) {
103             ++utf8;
104             if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
105                 return -1;
106             }
107         }
108         ++utf8;
109         ++count;
110     }
111     return count;
112 }
113 
SkUTF8_ToUnichar(const char utf8[])114 SkUnichar SkUTF8_ToUnichar(const char utf8[]) {
115     SkASSERT(utf8);
116 
117     const uint8_t*  p = (const uint8_t*)utf8;
118     int             c = *p;
119     int             hic = c << 24;
120 
121     assert_utf8_leadingbyte(c);
122 
123     if (hic < 0) {
124         uint32_t mask = (uint32_t)~0x3F;
125         hic = SkLeftShift(hic, 1);
126         do {
127             c = (c << 6) | (*++p & 0x3F);
128             mask <<= 5;
129         } while ((hic = SkLeftShift(hic, 1)) < 0);
130         c &= ~mask;
131     }
132     return c;
133 }
134 
135 // SAFE: returns -1 on invalid UTF-8 sequence.
SkUTF8_NextUnicharWithError(const char ** ptr,const char * end)136 SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) {
137     SkASSERT(ptr && *ptr);
138     SkASSERT(*ptr < end);
139     const uint8_t*  p = (const uint8_t*)*ptr;
140     int             c = *p;
141     int             hic = c << 24;
142 
143     if (!utf8_byte_is_leading_byte(c)) {
144         return -1;
145     }
146     if (hic < 0) {
147         uint32_t mask = (uint32_t)~0x3F;
148         hic = SkLeftShift(hic, 1);
149         do {
150             ++p;
151             if (p >= (const uint8_t*)end) {
152                 return -1;
153             }
154             // check before reading off end of array.
155             uint8_t nextByte = *p;
156             if (!utf8_byte_is_continuation(nextByte)) {
157                 return -1;
158             }
159             c = (c << 6) | (nextByte & 0x3F);
160             mask <<= 5;
161         } while ((hic = SkLeftShift(hic, 1)) < 0);
162         c &= ~mask;
163     }
164     *ptr = (char*)p + 1;
165     return c;
166 }
167 
SkUTF8_NextUnichar(const char ** ptr)168 SkUnichar SkUTF8_NextUnichar(const char** ptr) {
169     SkASSERT(ptr && *ptr);
170 
171     const uint8_t*  p = (const uint8_t*)*ptr;
172     int             c = *p;
173     int             hic = c << 24;
174 
175     assert_utf8_leadingbyte(c);
176 
177     if (hic < 0) {
178         uint32_t mask = (uint32_t)~0x3F;
179         hic = SkLeftShift(hic, 1);
180         do {
181             c = (c << 6) | (*++p & 0x3F);
182             mask <<= 5;
183         } while ((hic = SkLeftShift(hic, 1)) < 0);
184         c &= ~mask;
185     }
186     *ptr = (char*)p + 1;
187     return c;
188 }
189 
SkUTF8_PrevUnichar(const char ** ptr)190 SkUnichar SkUTF8_PrevUnichar(const char** ptr) {
191     SkASSERT(ptr && *ptr);
192 
193     const char* p = *ptr;
194 
195     if (*--p & 0x80) {
196         while (*--p & 0x40) {
197             ;
198         }
199     }
200 
201     *ptr = (char*)p;
202     return SkUTF8_NextUnichar(&p);
203 }
204 
SkUTF8_FromUnichar(SkUnichar uni,char utf8[])205 size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[]) {
206     if ((uint32_t)uni > 0x10FFFF) {
207         SkDEBUGFAIL("bad unichar");
208         return 0;
209     }
210 
211     if (uni <= 127) {
212         if (utf8) {
213             *utf8 = (char)uni;
214         }
215         return 1;
216     }
217 
218     char    tmp[4];
219     char*   p = tmp;
220     size_t  count = 1;
221 
222     SkDEBUGCODE(SkUnichar orig = uni;)
223 
224     while (uni > 0x7F >> count) {
225         *p++ = (char)(0x80 | (uni & 0x3F));
226         uni >>= 6;
227         count += 1;
228     }
229 
230     if (utf8) {
231         p = tmp;
232         utf8 += count;
233         while (p < tmp + count - 1) {
234             *--utf8 = *p++;
235         }
236         *--utf8 = (char)(~(0xFF >> count) | uni);
237     }
238 
239     SkASSERT(utf8 == nullptr || orig == SkUTF8_ToUnichar(utf8));
240     return count;
241 }
242 
243 ///////////////////////////////////////////////////////////////////////////////
244 
SkUTF16_CountUnichars(const uint16_t src[])245 int SkUTF16_CountUnichars(const uint16_t src[]) {
246     SkASSERT(src);
247 
248     int count = 0;
249     unsigned c;
250     while ((c = *src++) != 0) {
251         SkASSERT(!SkUTF16_IsLowSurrogate(c));
252         if (SkUTF16_IsHighSurrogate(c)) {
253             c = *src++;
254             SkASSERT(SkUTF16_IsLowSurrogate(c));
255         }
256         count += 1;
257     }
258     return count;
259 }
260 
261 // returns -1 on error
SkUTF16_CountUnichars(const void * text,size_t byteLength)262 int SkUTF16_CountUnichars(const void* text, size_t byteLength) {
263     SkASSERT(text);
264     if (byteLength == 0) {
265         return 0;
266     }
267     if (!SkIsAlign2(intptr_t(text)) || !SkIsAlign2(byteLength)) {
268         return -1;
269     }
270 
271     const uint16_t* src = static_cast<const uint16_t*>(text);
272     const uint16_t* stop = src + (byteLength >> 1);
273     int count = 0;
274     while (src < stop) {
275         unsigned c = *src++;
276         SkASSERT(!SkUTF16_IsLowSurrogate(c));
277         if (SkUTF16_IsHighSurrogate(c)) {
278             if (src >= stop) {
279                 return -1;
280             }
281             c = *src++;
282             if (!SkUTF16_IsLowSurrogate(c)) {
283                 return -1;
284             }
285         }
286         count += 1;
287     }
288     return count;
289 }
290 
SkUTF16_NextUnichar(const uint16_t ** srcPtr)291 SkUnichar SkUTF16_NextUnichar(const uint16_t** srcPtr) {
292     SkASSERT(srcPtr && *srcPtr);
293 
294     const uint16_t* src = *srcPtr;
295     SkUnichar       c = *src++;
296 
297     SkASSERT(!SkUTF16_IsLowSurrogate(c));
298     if (SkUTF16_IsHighSurrogate(c)) {
299         unsigned c2 = *src++;
300         SkASSERT(SkUTF16_IsLowSurrogate(c2));
301 
302         // c = ((c & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000
303         // c = (((c & 0x3FF) + 64) << 10) + (c2 & 0x3FF)
304         c = (c << 10) + c2 + (0x10000 - (0xD800 << 10) - 0xDC00);
305     }
306     *srcPtr = src;
307     return c;
308 }
309 
SkUTF16_PrevUnichar(const uint16_t ** srcPtr)310 SkUnichar SkUTF16_PrevUnichar(const uint16_t** srcPtr) {
311     SkASSERT(srcPtr && *srcPtr);
312 
313     const uint16_t* src = *srcPtr;
314     SkUnichar       c = *--src;
315 
316     SkASSERT(!SkUTF16_IsHighSurrogate(c));
317     if (SkUTF16_IsLowSurrogate(c)) {
318         unsigned c2 = *--src;
319         SkASSERT(SkUTF16_IsHighSurrogate(c2));
320         c = (c2 << 10) + c + (0x10000 - (0xD800 << 10) - 0xDC00);
321     }
322     *srcPtr = src;
323     return c;
324 }
325 
SkUTF16_FromUnichar(SkUnichar uni,uint16_t dst[])326 size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t dst[]) {
327     SkASSERT((unsigned)uni <= 0x10FFFF);
328 
329     int extra = (uni > 0xFFFF);
330 
331     if (dst) {
332         if (extra) {
333             // dst[0] = SkToU16(0xD800 | ((uni - 0x10000) >> 10));
334             // dst[0] = SkToU16(0xD800 | ((uni >> 10) - 64));
335             dst[0] = SkToU16((0xD800 - 64) + (uni >> 10));
336             dst[1] = SkToU16(0xDC00 | (uni & 0x3FF));
337 
338             SkASSERT(SkUTF16_IsHighSurrogate(dst[0]));
339             SkASSERT(SkUTF16_IsLowSurrogate(dst[1]));
340         } else {
341             dst[0] = SkToU16(uni);
342             SkASSERT(!SkUTF16_IsHighSurrogate(dst[0]));
343             SkASSERT(!SkUTF16_IsLowSurrogate(dst[0]));
344         }
345     }
346     return 1 + extra;
347 }
348 
SkUTF16_ToUTF8(const uint16_t utf16[],int numberOf16BitValues,char utf8[])349 size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues,
350                       char utf8[]) {
351     SkASSERT(numberOf16BitValues >= 0);
352     if (numberOf16BitValues <= 0) {
353         return 0;
354     }
355 
356     SkASSERT(utf16 != nullptr);
357 
358     const uint16_t* stop = utf16 + numberOf16BitValues;
359     size_t          size = 0;
360 
361     if (utf8 == nullptr) {    // just count
362         while (utf16 < stop) {
363             size += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), nullptr);
364         }
365     } else {
366         char* start = utf8;
367         while (utf16 < stop) {
368             utf8 += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), utf8);
369         }
370         size = utf8 - start;
371     }
372     return size;
373 }
374 
375 const char SkHexadecimalDigits::gUpper[16] =
376            { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
377 const char SkHexadecimalDigits::gLower[16] =
378            { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
379 
380 
381 // returns -1 on error
SkUTF32_CountUnichars(const void * text,size_t byteLength)382 int SkUTF32_CountUnichars(const void* text, size_t byteLength) {
383     if (byteLength == 0) {
384         return 0;
385     }
386     if (!SkIsAlign4(intptr_t(text)) || !SkIsAlign4(byteLength)) {
387         return -1;
388     }
389     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
390     const uint32_t* ptr = static_cast<const uint32_t*>(text);
391     const uint32_t* stop = ptr + (byteLength >> 2);
392     while (ptr < stop) {
393         if (*ptr & kInvalidUnicharMask) {
394             return -1;
395         }
396         ptr += 1;
397     }
398     return SkToInt(byteLength >> 2);
399 }
400 
401