1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsUTF8Utils_h_
7 #define nsUTF8Utils_h_
8 
9 // NB: This code may be used from non-XPCOM code, in particular, the
10 // standalone updater executable.  That is, this file may be used in
11 // two ways: if MOZILLA_INTERNAL_API is defined, this file will
12 // provide signatures for the Mozilla abstract string types. It will
13 // use XPCOM assertion/debugging macros, etc.
14 
15 #include <type_traits>
16 
17 #include "mozilla/Assertions.h"
18 #include "mozilla/EndianUtils.h"
19 
20 #include "nsCharTraits.h"
21 
22 #ifdef MOZILLA_INTERNAL_API
23 #  define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
24 #else
25 #  define UTF8UTILS_WARNING(msg)
26 #endif
27 
28 class UTF8traits {
29  public:
isASCII(char aChar)30   static bool isASCII(char aChar) { return (aChar & 0x80) == 0x00; }
isInSeq(char aChar)31   static bool isInSeq(char aChar) { return (aChar & 0xC0) == 0x80; }
is2byte(char aChar)32   static bool is2byte(char aChar) { return (aChar & 0xE0) == 0xC0; }
is3byte(char aChar)33   static bool is3byte(char aChar) { return (aChar & 0xF0) == 0xE0; }
is4byte(char aChar)34   static bool is4byte(char aChar) { return (aChar & 0xF8) == 0xF0; }
is5byte(char aChar)35   static bool is5byte(char aChar) { return (aChar & 0xFC) == 0xF8; }
is6byte(char aChar)36   static bool is6byte(char aChar) { return (aChar & 0xFE) == 0xFC; }
37   // return the number of bytes in a sequence beginning with aChar
bytes(char aChar)38   static int bytes(char aChar) {
39     if (isASCII(aChar)) {
40       return 1;
41     }
42     if (is2byte(aChar)) {
43       return 2;
44     }
45     if (is3byte(aChar)) {
46       return 3;
47     }
48     if (is4byte(aChar)) {
49       return 4;
50     }
51     MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
52     return 1;
53   }
54 };
55 
56 /**
57  * Extract the next Unicode scalar value from the buffer and return it. The
58  * pointer passed in is advanced to the start of the next character in the
59  * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
60  * over the maximal valid prefix and *aErr is set to true (if aErr is not
61  * null).
62  *
63  * Note: This method never sets *aErr to false to allow error accumulation
64  * across multiple calls.
65  *
66  * Precondition: *aBuffer < aEnd
67  */
68 class UTF8CharEnumerator {
69  public:
70   static inline char32_t NextChar(const char** aBuffer, const char* aEnd,
71                                   bool* aErr = nullptr) {
72     MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
73     MOZ_ASSERT(aEnd, "null end pointer");
74 
75     const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
76     const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
77 
78     MOZ_ASSERT(p, "null buffer");
79     MOZ_ASSERT(p < end, "Bogus range");
80 
81     unsigned char first = *p;
82     ++p;
83 
84     if (MOZ_LIKELY(first < 0x80U)) {
85       *aBuffer = reinterpret_cast<const char*>(p);
86       return first;
87     }
88 
89     // Unsigned underflow is defined behavior
90     if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
91       *aBuffer = reinterpret_cast<const char*>(p);
92       if (aErr) {
93         *aErr = true;
94       }
95       return 0xFFFDU;
96     }
97 
98     unsigned char second = *p;
99 
100     if (first < 0xE0U) {
101       // Two-byte
102       if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
103         ++p;
104         *aBuffer = reinterpret_cast<const char*>(p);
105         return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
106       }
107       *aBuffer = reinterpret_cast<const char*>(p);
108       if (aErr) {
109         *aErr = true;
110       }
111       return 0xFFFDU;
112     }
113 
114     if (MOZ_LIKELY(first < 0xF0U)) {
115       // Three-byte
116       unsigned char lower = 0x80U;
117       unsigned char upper = 0xBFU;
118       if (first == 0xE0U) {
119         lower = 0xA0U;
120       } else if (first == 0xEDU) {
121         upper = 0x9FU;
122       }
123       if (MOZ_LIKELY(second >= lower && second <= upper)) {
124         ++p;
125         if (MOZ_LIKELY(p != end)) {
126           unsigned char third = *p;
127           if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
128             ++p;
129             *aBuffer = reinterpret_cast<const char*>(p);
130             return ((uint32_t(first) & 0xFU) << 12) |
131                    ((uint32_t(second) & 0x3FU) << 6) |
132                    (uint32_t(third) & 0x3FU);
133           }
134         }
135       }
136       *aBuffer = reinterpret_cast<const char*>(p);
137       if (aErr) {
138         *aErr = true;
139       }
140       return 0xFFFDU;
141     }
142 
143     // Four-byte
144     unsigned char lower = 0x80U;
145     unsigned char upper = 0xBFU;
146     if (first == 0xF0U) {
147       lower = 0x90U;
148     } else if (first == 0xF4U) {
149       upper = 0x8FU;
150     }
151     if (MOZ_LIKELY(second >= lower && second <= upper)) {
152       ++p;
153       if (MOZ_LIKELY(p != end)) {
154         unsigned char third = *p;
155         if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
156           ++p;
157           if (MOZ_LIKELY(p != end)) {
158             unsigned char fourth = *p;
159             if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
160               ++p;
161               *aBuffer = reinterpret_cast<const char*>(p);
162               return ((uint32_t(first) & 0x7U) << 18) |
163                      ((uint32_t(second) & 0x3FU) << 12) |
164                      ((uint32_t(third) & 0x3FU) << 6) |
165                      (uint32_t(fourth) & 0x3FU);
166             }
167           }
168         }
169       }
170     }
171     *aBuffer = reinterpret_cast<const char*>(p);
172     if (aErr) {
173       *aErr = true;
174     }
175     return 0xFFFDU;
176   }
177 };
178 
179 /**
180  * Extract the next Unicode scalar value from the buffer and return it. The
181  * pointer passed in is advanced to the start of the next character in the
182  * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
183  * the unpaired surrogate and *aErr is set to true (if aErr is not null).
184  *
185  * Note: This method never sets *aErr to false to allow error accumulation
186  * across multiple calls.
187  *
188  * Precondition: *aBuffer < aEnd
189  */
190 class UTF16CharEnumerator {
191  public:
192   static inline char32_t NextChar(const char16_t** aBuffer,
193                                   const char16_t* aEnd, bool* aErr = nullptr) {
194     MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
195     MOZ_ASSERT(aEnd, "null end pointer");
196 
197     const char16_t* p = *aBuffer;
198 
199     MOZ_ASSERT(p, "null buffer");
200     MOZ_ASSERT(p < aEnd, "Bogus range");
201 
202     char16_t c = *p++;
203 
204     // Let's use encoding_rs-style code golf here.
205     // Unsigned underflow is defined behavior
206     char16_t cMinusSurrogateStart = c - 0xD800U;
207     if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
208       *aBuffer = p;
209       return c;
210     }
211     if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
212       // High surrogate
213       if (MOZ_LIKELY(p != aEnd)) {
214         char16_t second = *p;
215         // Unsigned underflow is defined behavior
216         if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
217           *aBuffer = ++p;
218           return (uint32_t(c) << 10) + uint32_t(second) -
219                  (((0xD800U << 10) - 0x10000U) + 0xDC00U);
220         }
221       }
222     }
223     // Unpaired surrogate
224     *aBuffer = p;
225     if (aErr) {
226       *aErr = true;
227     }
228     return 0xFFFDU;
229   }
230 };
231 
/**
 * Walks backwards from |index| until it reaches a byte that is not a UTF-8
 * continuation byte (10xxxxxx), or until it reaches position 0, and returns
 * that position. The byte at position 0 is never inspected.
 *
 * |Char| must be one of the 8-bit character types and |UnsignedT| must be an
 * unsigned integer type; both requirements are enforced at compile time.
 */
template <typename Char, typename UnsignedT>
inline UnsignedT RewindToPriorUTF8Codepoint(const Char* utf8Chars,
                                            UnsignedT index) {
  constexpr bool kIsEightBitUnit = std::is_same_v<Char, char> ||
                                   std::is_same_v<Char, unsigned char> ||
                                   std::is_same_v<Char, signed char>;
  static_assert(kIsEightBitUnit, "UTF-8 data must be in 8-bit units");
  static_assert(std::is_unsigned_v<UnsignedT>, "index type must be unsigned");

  for (; index > 0; --index) {
    const bool isContinuation = (utf8Chars[index] & 0xC0) == 0x80;
    if (!isContinuation) {
      break;
    }
  }

  return index;
}
244 
245 #undef UTF8UTILS_WARNING
246 
247 #endif /* !defined(nsUTF8Utils_h_) */
248