1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "js/CharacterEncoding.h"
8 
9 #include "mozilla/Latin1.h"
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
12 #include "mozilla/Sprintf.h"
13 #include "mozilla/TextUtils.h"
14 #include "mozilla/Utf8.h"
15 
16 #include <algorithm>
17 #include <type_traits>
18 
19 #include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
20 #include "util/StringBuffer.h"
21 #include "util/Unicode.h"  // unicode::REPLACEMENT_CHARACTER
22 #include "vm/JSContext.h"
23 
24 using mozilla::AsChars;
25 using mozilla::AsciiValidUpTo;
26 using mozilla::AsWritableChars;
27 using mozilla::ConvertLatin1toUtf8Partial;
28 using mozilla::ConvertUtf16toUtf8Partial;
29 using mozilla::IsAscii;
30 using mozilla::IsUtf8Latin1;
31 using mozilla::LossyConvertUtf16toLatin1;
32 using mozilla::Span;
33 using mozilla::Tie;
34 using mozilla::Tuple;
35 using mozilla::Utf8Unit;
36 
37 using JS::Latin1CharsZ;
38 using JS::TwoByteCharsZ;
39 using JS::UTF8Chars;
40 using JS::UTF8CharsZ;
41 
42 using namespace js;
43 using namespace js::unicode;
44 
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const mozilla::Range<const char16_t> tbchars)45 Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ(
46     JSContext* cx, const mozilla::Range<const char16_t> tbchars) {
47   MOZ_ASSERT(cx);
48   size_t len = tbchars.length();
49   unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
50   if (!latin1) {
51     return Latin1CharsZ();
52   }
53   LossyConvertUtf16toLatin1(tbchars, AsWritableChars(Span(latin1, len)));
54   latin1[len] = '\0';
55   return Latin1CharsZ(latin1, len);
56 }
57 
58 template <typename CharT>
GetDeflatedUTF8StringLength(const CharT * chars,size_t nchars)59 static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) {
60   size_t nbytes = nchars;
61   for (const CharT* end = chars + nchars; chars < end; chars++) {
62     char16_t c = *chars;
63     if (c < 0x80) {
64       continue;
65     }
66     uint32_t v;
67     if (IsSurrogate(c)) {
68       /* nbytes sets 1 length since this is surrogate pair. */
69       if (IsTrailSurrogate(c) || (chars + 1) == end) {
70         nbytes += 2; /* Bad Surrogate */
71         continue;
72       }
73       char16_t c2 = chars[1];
74       if (!IsTrailSurrogate(c2)) {
75         nbytes += 2; /* Bad Surrogate */
76         continue;
77       }
78       v = UTF16Decode(c, c2);
79       nbytes--;
80       chars++;
81     } else {
82       v = c;
83     }
84     v >>= 11;
85     nbytes++;
86     while (v) {
87       v >>= 5;
88       nbytes++;
89     }
90   }
91   return nbytes;
92 }
93 
GetDeflatedUTF8StringLength(JSLinearString * s)94 JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSLinearString* s) {
95   JS::AutoCheckCannotGC nogc;
96   return s->hasLatin1Chars()
97              ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
98              : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc),
99                                              s->length());
100 }
101 
DeflateStringToUTF8Buffer(JSLinearString * src,mozilla::Span<char> dst)102 JS_PUBLIC_API size_t JS::DeflateStringToUTF8Buffer(JSLinearString* src,
103                                                    mozilla::Span<char> dst) {
104   JS::AutoCheckCannotGC nogc;
105   if (src->hasLatin1Chars()) {
106     auto source = AsChars(Span(src->latin1Chars(nogc), src->length()));
107     size_t read;
108     size_t written;
109     Tie(read, written) = ConvertLatin1toUtf8Partial(source, dst);
110     (void)read;
111     return written;
112   }
113   auto source = Span(src->twoByteChars(nogc), src->length());
114   size_t read;
115   size_t written;
116   Tie(read, written) = ConvertUtf16toUtf8Partial(source, dst);
117   (void)read;
118   return written;
119 }
120 
121 template <typename CharT>
122 void ConvertToUTF8(mozilla::Span<CharT> src, mozilla::Span<char> dst);
123 
124 template <>
ConvertToUTF8(mozilla::Span<const char16_t> src,mozilla::Span<char> dst)125 void ConvertToUTF8<const char16_t>(mozilla::Span<const char16_t> src,
126                                    mozilla::Span<char> dst) {
127   (void)ConvertUtf16toUtf8Partial(src, dst);
128 }
129 
130 template <>
ConvertToUTF8(mozilla::Span<const Latin1Char> src,mozilla::Span<char> dst)131 void ConvertToUTF8<const Latin1Char>(mozilla::Span<const Latin1Char> src,
132                                      mozilla::Span<char> dst) {
133   (void)ConvertLatin1toUtf8Partial(AsChars(src), dst);
134 }
135 
136 template <typename CharT>
CharsToNewUTF8CharsZ(JSContext * cx,const mozilla::Range<CharT> chars)137 UTF8CharsZ JS::CharsToNewUTF8CharsZ(JSContext* cx,
138                                     const mozilla::Range<CharT> chars) {
139   /* Get required buffer size. */
140   const CharT* str = chars.begin().get();
141   size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
142 
143   /* Allocate buffer. */
144   char* utf8 = cx->pod_malloc<char>(len + 1);
145   if (!utf8) {
146     return UTF8CharsZ();
147   }
148 
149   /* Encode to UTF8. */
150   ::ConvertToUTF8(Span(str, chars.length()), Span(utf8, len));
151   utf8[len] = '\0';
152 
153   return UTF8CharsZ(utf8, len);
154 }
155 
// Explicit instantiations of CharsToNewUTF8CharsZ for const and non-const
// ranges of Latin1Char and char16_t.
template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<Latin1Char> chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<char16_t> chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<const Latin1Char> chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<const char16_t> chars);
167 
// Sentinel returned by Utf8ToOneUcs4CharImpl when the decoded value is a
// non-shortest form or a surrogate code point.
static const uint32_t INVALID_UTF8 = UINT32_MAX;
169 
170 /*
171  * Convert a UTF-8 character sequence into a UCS-4 character and return that
172  * character. It is assumed that the caller already checked that the sequence
173  * is valid.
174  */
Utf8ToOneUcs4CharImpl(const uint8_t * utf8Buffer,int utf8Length)175 static uint32_t Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer,
176                                       int utf8Length) {
177   MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
178 
179   if (utf8Length == 1) {
180     MOZ_ASSERT(!(*utf8Buffer & 0x80));
181     return *utf8Buffer;
182   }
183 
184   /* from Unicode 3.1, non-shortest form is illegal */
185   static const uint32_t minucs4Table[] = {0x80, 0x800, NonBMPMin};
186 
187   MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
188              (0x100 - (1 << (8 - utf8Length))));
189   uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
190   uint32_t minucs4Char = minucs4Table[utf8Length - 2];
191   while (--utf8Length) {
192     MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
193     ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
194   }
195 
196   if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
197     return INVALID_UTF8;
198   }
199 
200   if (MOZ_UNLIKELY(IsSurrogate(ucs4Char))) {
201     return INVALID_UTF8;
202   }
203 
204   return ucs4Char;
205 }
206 
Utf8ToOneUcs4Char(const uint8_t * utf8Buffer,int utf8Length)207 uint32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) {
208   return Utf8ToOneUcs4CharImpl(utf8Buffer, utf8Length);
209 }
210 
ReportInvalidCharacter(JSContext * cx,uint32_t offset)211 static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) {
212   char buffer[10];
213   SprintfLiteral(buffer, "%u", offset);
214   JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
215                             JSMSG_MALFORMED_UTF8_CHAR, buffer);
216 }
217 
// Report JSMSG_BUFFER_TOO_SMALL on |cx|. |dummy| is unused; it is present so
// the function can be invoked as report(cx, arg) like the other reporters.
static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) {
  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                            JSMSG_BUFFER_TOO_SMALL);
}
222 
ReportTooBigCharacter(JSContext * cx,uint32_t v)223 static void ReportTooBigCharacter(JSContext* cx, uint32_t v) {
224   char buffer[11];
225   SprintfLiteral(buffer, "0x%x", v);
226   JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
227                             JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
228 }
229 
// Value returned by the output callbacks passed to InflateUTF8ToUTF16 to say
// whether scanning should go on.
enum class LoopDisposition {
  // Stop scanning the input.
  Break,
  // Keep scanning.
  Continue,
};
234 
// Policy selecting what InflateUTF8ToUTF16 does when it meets malformed input.
enum class OnUTF8Error {
  // Emit REPLACEMENT_CHARACTER in place of the bad sequence.
  InsertReplacementCharacter,
  // Emit '?' in place of the bad sequence.
  InsertQuestionMark,
  // Report an error on the JSContext and return false.
  Throw,
  // MOZ_CRASH.
  Crash,
};
241 
242 // Scan UTF-8 input and (internally, at least) convert it to a series of UTF-16
243 // code units. But you can also do odd things like pass an empty lambda for
244 // `dst`, in which case the output is discarded entirely--the only effect of
245 // calling the template that way is error-checking.
246 template <OnUTF8Error ErrorAction, typename OutputFn>
InflateUTF8ToUTF16(JSContext * cx,const UTF8Chars src,OutputFn dst)247 static bool InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src,
248                                OutputFn dst) {
249   size_t srclen = src.length();
250   for (uint32_t i = 0; i < srclen; i++) {
251     uint32_t v = uint32_t(src[i]);
252     if (!(v & 0x80)) {
253       // ASCII code unit.  Simple copy.
254       if (dst(uint16_t(v)) == LoopDisposition::Break) {
255         break;
256       }
257     } else {
258       // Non-ASCII code unit.  Determine its length in bytes (n).
259       uint32_t n = 1;
260       while (v & (0x80 >> n)) {
261         n++;
262       }
263 
264 #define INVALID(report, arg, n2)                                    \
265   do {                                                              \
266     if (ErrorAction == OnUTF8Error::Throw) {                        \
267       report(cx, arg);                                              \
268       return false;                                                 \
269     } else if (ErrorAction == OnUTF8Error::Crash) {                 \
270       MOZ_CRASH("invalid UTF-8 string: " #report);                  \
271     } else {                                                        \
272       char16_t replacement;                                         \
273       if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \
274         replacement = REPLACEMENT_CHARACTER;                        \
275       } else {                                                      \
276         MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
277         replacement = '?';                                          \
278       }                                                             \
279       if (dst(replacement) == LoopDisposition::Break) {             \
280         break;                                                      \
281       }                                                             \
282       n = n2;                                                       \
283       goto invalidMultiByteCodeUnit;                                \
284     }                                                               \
285   } while (0)
286 
287       // Check the leading byte.
288       if (n < 2 || n > 4) {
289         INVALID(ReportInvalidCharacter, i, 1);
290       }
291 
292       // Check that |src| is large enough to hold an n-byte code unit.
293       if (i + n > srclen) {
294         INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
295       }
296 
297       // Check the second byte.  From Unicode Standard v6.2, Table 3-7
298       // Well-Formed UTF-8 Byte Sequences.
299       if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
300           (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
301           (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
302           (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
303       {
304         INVALID(ReportInvalidCharacter, i, 1);
305       }
306 
307       // Check the continuation bytes.
308       for (uint32_t m = 1; m < n; m++) {
309         if ((src[i + m] & 0xC0) != 0x80) {
310           INVALID(ReportInvalidCharacter, i, m);
311         }
312       }
313 
314       // Determine the code unit's length in CharT and act accordingly.
315       v = Utf8ToOneUcs4CharImpl((uint8_t*)&src[i], n);
316       if (v < NonBMPMin) {
317         // The n-byte UTF8 code unit will fit in a single CharT.
318         if (dst(char16_t(v)) == LoopDisposition::Break) {
319           break;
320         }
321       } else if (v <= NonBMPMax) {
322         // The n-byte UTF8 code unit will fit in two CharT units.
323         if (dst(LeadSurrogate(v)) == LoopDisposition::Break) {
324           break;
325         }
326         if (dst(TrailSurrogate(v)) == LoopDisposition::Break) {
327           break;
328         }
329       } else {
330         // The n-byte UTF8 code unit won't fit in two CharT units.
331         INVALID(ReportTooBigCharacter, v, 1);
332       }
333 
334     invalidMultiByteCodeUnit:
335       // Move i to the last byte of the multi-byte code unit; the loop
336       // header will do the final i++ to move to the start of the next
337       // code unit.
338       i += n - 1;
339     }
340   }
341 
342   return true;
343 }
344 
345 template <OnUTF8Error ErrorAction, typename CharT>
CopyAndInflateUTF8IntoBuffer(JSContext * cx,const UTF8Chars src,CharT * dst,size_t outlen,bool allASCII)346 static void CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src,
347                                          CharT* dst, size_t outlen,
348                                          bool allASCII) {
349   if (allASCII) {
350     size_t srclen = src.length();
351     MOZ_ASSERT(outlen == srclen);
352     for (uint32_t i = 0; i < srclen; i++) {
353       dst[i] = CharT(src[i]);
354     }
355   } else {
356     size_t j = 0;
357     auto push = [dst, &j](char16_t c) -> LoopDisposition {
358       dst[j++] = CharT(c);
359       return LoopDisposition::Continue;
360     };
361     MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
362     MOZ_ASSERT(j == outlen);
363   }
364   dst[outlen] = CharT('\0');  // NUL char
365 }
366 
367 template <OnUTF8Error ErrorAction, typename CharsT>
InflateUTF8StringHelper(JSContext * cx,const UTF8Chars src,size_t * outlen,arena_id_t destArenaId)368 static CharsT InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src,
369                                       size_t* outlen, arena_id_t destArenaId) {
370   using CharT = typename CharsT::CharT;
371   static_assert(
372       std::is_same_v<CharT, char16_t> || std::is_same_v<CharT, Latin1Char>,
373       "bad CharT");
374 
375   *outlen = 0;
376 
377   size_t len = 0;
378   bool allASCII = true;
379   auto count = [&len, &allASCII](char16_t c) -> LoopDisposition {
380     len++;
381     allASCII &= (c < 0x80);
382     return LoopDisposition::Continue;
383   };
384   if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) {
385     return CharsT();
386   }
387   *outlen = len;
388 
389   CharT* dst = cx->pod_arena_malloc<CharT>(destArenaId,
390                                            *outlen + 1);  // +1 for NUL
391 
392   if (!dst) {
393     ReportOutOfMemory(cx);
394     return CharsT();
395   }
396 
397   constexpr OnUTF8Error errorMode =
398       std::is_same_v<CharT, Latin1Char>
399           ? OnUTF8Error::InsertQuestionMark
400           : OnUTF8Error::InsertReplacementCharacter;
401   CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);
402 
403   return CharsT(dst, *outlen);
404 }
405 
UTF8CharsToNewTwoByteCharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)406 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
407                                               const UTF8Chars utf8,
408                                               size_t* outlen,
409                                               arena_id_t destArenaId) {
410   return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
411       cx, utf8, outlen, destArenaId);
412 }
413 
UTF8CharsToNewTwoByteCharsZ(JSContext * cx,const ConstUTF8CharsZ & utf8,size_t * outlen,arena_id_t destArenaId)414 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
415                                               const ConstUTF8CharsZ& utf8,
416                                               size_t* outlen,
417                                               arena_id_t destArenaId) {
418   UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
419   return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
420       cx, chars, outlen, destArenaId);
421 }
422 
LossyUTF8CharsToNewTwoByteCharsZ(JSContext * cx,const JS::UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)423 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx,
424                                                    const JS::UTF8Chars utf8,
425                                                    size_t* outlen,
426                                                    arena_id_t destArenaId) {
427   return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
428                                  TwoByteCharsZ>(cx, utf8, outlen, destArenaId);
429 }
430 
LossyUTF8CharsToNewTwoByteCharsZ(JSContext * cx,const JS::ConstUTF8CharsZ & utf8,size_t * outlen,arena_id_t destArenaId)431 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(
432     JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen,
433     arena_id_t destArenaId) {
434   UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
435   return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
436                                  TwoByteCharsZ>(cx, chars, outlen, destArenaId);
437 }
438 
UpdateSmallestEncodingForChar(char16_t c,JS::SmallestEncoding * encoding)439 static void UpdateSmallestEncodingForChar(char16_t c,
440                                           JS::SmallestEncoding* encoding) {
441   JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
442   if (c >= 0x80) {
443     if (c < 0x100) {
444       newEncoding = JS::SmallestEncoding::Latin1;
445     } else {
446       newEncoding = JS::SmallestEncoding::UTF16;
447     }
448   }
449   if (newEncoding > *encoding) {
450     *encoding = newEncoding;
451   }
452 }
453 
FindSmallestEncoding(UTF8Chars utf8)454 JS::SmallestEncoding JS::FindSmallestEncoding(UTF8Chars utf8) {
455   Span<unsigned char> unsignedSpan = utf8;
456   auto charSpan = AsChars(unsignedSpan);
457   size_t upTo = AsciiValidUpTo(charSpan);
458   if (upTo == charSpan.Length()) {
459     return SmallestEncoding::ASCII;
460   }
461   if (IsUtf8Latin1(charSpan.From(upTo))) {
462     return SmallestEncoding::Latin1;
463   }
464   return SmallestEncoding::UTF16;
465 }
466 
UTF8CharsToNewLatin1CharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)467 Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
468                                             size_t* outlen,
469                                             arena_id_t destArenaId) {
470   return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>(
471       cx, utf8, outlen, destArenaId);
472 }
473 
LossyUTF8CharsToNewLatin1CharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)474 Latin1CharsZ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx,
475                                                  const UTF8Chars utf8,
476                                                  size_t* outlen,
477                                                  arena_id_t destArenaId) {
478   return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark, Latin1CharsZ>(
479       cx, utf8, outlen, destArenaId);
480 }
481 
482 /**
483  * Atomization Helpers.
484  *
485  * These functions are extremely single-use, and are not intended for general
486  * consumption.
487  */
488 
GetUTF8AtomizationData(JSContext * cx,const JS::UTF8Chars utf8,size_t * outlen,JS::SmallestEncoding * encoding,HashNumber * hashNum)489 bool GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8,
490                             size_t* outlen, JS::SmallestEncoding* encoding,
491                             HashNumber* hashNum) {
492   *outlen = 0;
493   *encoding = JS::SmallestEncoding::ASCII;
494   *hashNum = 0;
495 
496   auto getMetadata = [outlen, encoding,
497                       hashNum](char16_t c) -> LoopDisposition {
498     (*outlen)++;
499     UpdateSmallestEncodingForChar(c, encoding);
500     *hashNum = mozilla::AddToHash(*hashNum, c);
501     return LoopDisposition::Continue;
502   };
503   if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) {
504     return false;
505   }
506 
507   return true;
508 }
509 
510 template <typename CharT>
UTF8EqualsChars(const JS::UTF8Chars utfChars,const CharT * chars)511 bool UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars) {
512   size_t ind = 0;
513   bool isEqual = true;
514 
515   auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition {
516 #ifdef DEBUG
517     JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
518     UpdateSmallestEncodingForChar(c, &encoding);
519     if (std::is_same_v<CharT, JS::Latin1Char>) {
520       MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1);
521     } else if (!std::is_same_v<CharT, char16_t>) {
522       MOZ_CRASH("Invalid character type in UTF8EqualsChars");
523     }
524 #endif
525 
526     if (CharT(c) != chars[ind]) {
527       isEqual = false;
528       return LoopDisposition::Break;
529     }
530 
531     ind++;
532     return LoopDisposition::Continue;
533   };
534 
535   // To get here, you must have checked your work.
536   InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars,
537                                          checkEqual);
538 
539   return isEqual;
540 }
541 
// Explicit instantiations of UTF8EqualsChars for the two supported character
// types.
template bool UTF8EqualsChars(const JS::UTF8Chars, const char16_t*);
template bool UTF8EqualsChars(const JS::UTF8Chars, const JS::Latin1Char*);
544 
545 template <typename CharT>
InflateUTF8CharsToBufferAndTerminate(const JS::UTF8Chars src,CharT * dst,size_t dstLen,JS::SmallestEncoding encoding)546 void InflateUTF8CharsToBufferAndTerminate(const JS::UTF8Chars src, CharT* dst,
547                                           size_t dstLen,
548                                           JS::SmallestEncoding encoding) {
549   CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(
550       /* cx = */ nullptr, src, dst, dstLen,
551       encoding == JS::SmallestEncoding::ASCII);
552 }
553 
// Explicit instantiations of InflateUTF8CharsToBufferAndTerminate for the two
// supported destination character types.
template void InflateUTF8CharsToBufferAndTerminate(
    const UTF8Chars src, char16_t* dst, size_t dstLen,
    JS::SmallestEncoding encoding);
template void InflateUTF8CharsToBufferAndTerminate(
    const UTF8Chars src, JS::Latin1Char* dst, size_t dstLen,
    JS::SmallestEncoding encoding);
560 
#ifdef DEBUG
// Debug-only check: crash if the first |aLength| bytes of |data_| are not
// well-formed UTF-8.
void JS::ConstUTF8CharsZ::validate(size_t aLength) {
  MOZ_ASSERT(data_);

  // The decoded units are not needed; only the validation matters.
  auto discard = [](char16_t) -> LoopDisposition {
    return LoopDisposition::Continue;
  };

  UTF8Chars bytes(data_, aLength);
  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, bytes, discard);
}
#endif
571 
StringIsASCII(const char * s)572 bool JS::StringIsASCII(const char* s) {
573   while (*s) {
574     if (*s & 0x80) {
575       return false;
576     }
577     s++;
578   }
579   return true;
580 }
581 
StringIsASCII(Span<const char> s)582 bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); }
583 
append(const Utf8Unit * units,size_t len)584 bool StringBuffer::append(const Utf8Unit* units, size_t len) {
585   if (isLatin1()) {
586     Latin1CharBuffer& latin1 = latin1Chars();
587 
588     while (len > 0) {
589       if (!IsAscii(*units)) {
590         break;
591       }
592 
593       if (!latin1.append(units->toUnsignedChar())) {
594         return false;
595       }
596 
597       ++units;
598       --len;
599     }
600     if (len == 0) {
601       return true;
602     }
603 
604     // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
605     // |latin1|, but it's only possible for [U+0080, U+0100) code points,
606     // and handling the full complexity of UTF-8 only for that very small
607     // additional range isn't worth it.  Inflate to two-byte storage before
608     // appending the remaining code points.
609     if (!inflateChars()) {
610       return false;
611     }
612   }
613 
614   UTF8Chars remainingUtf8(units, len);
615 
616   // Determine how many UTF-16 code units are required to represent the
617   // remaining units.
618   size_t utf16Len = 0;
619   auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
620     utf16Len++;
621     return LoopDisposition::Continue;
622   };
623   if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx_, remainingUtf8,
624                                               countInflated)) {
625     return false;
626   }
627 
628   TwoByteCharBuffer& buf = twoByteChars();
629 
630   size_t i = buf.length();
631   if (!buf.growByUninitialized(utf16Len)) {
632     return false;
633   }
634   MOZ_ASSERT(i + utf16Len == buf.length(),
635              "growByUninitialized assumed to increase length immediately");
636 
637   char16_t* toFill = &buf[i];
638   auto appendUtf16 = [&toFill](char16_t unit) {
639     *toFill++ = unit;
640     return LoopDisposition::Continue;
641   };
642 
643   MOZ_ALWAYS_TRUE(
644       InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx_, remainingUtf8, appendUtf16));
645   MOZ_ASSERT(toFill == buf.end());
646   return true;
647 }
648