1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "js/CharacterEncoding.h"
8
9 #include "mozilla/Latin1.h"
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
12 #include "mozilla/Sprintf.h"
13 #include "mozilla/TextUtils.h"
14 #include "mozilla/Utf8.h"
15
16 #include <algorithm>
17 #include <type_traits>
18
19 #include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_*
20 #include "util/StringBuffer.h"
21 #include "util/Unicode.h" // unicode::REPLACEMENT_CHARACTER
22 #include "vm/JSContext.h"
23
24 using mozilla::AsChars;
25 using mozilla::AsciiValidUpTo;
26 using mozilla::AsWritableChars;
27 using mozilla::ConvertLatin1toUtf8Partial;
28 using mozilla::ConvertUtf16toUtf8Partial;
29 using mozilla::IsAscii;
30 using mozilla::IsUtf8Latin1;
31 using mozilla::LossyConvertUtf16toLatin1;
32 using mozilla::Span;
33 using mozilla::Tie;
34 using mozilla::Tuple;
35 using mozilla::Utf8Unit;
36
37 using JS::Latin1CharsZ;
38 using JS::TwoByteCharsZ;
39 using JS::UTF8Chars;
40 using JS::UTF8CharsZ;
41
42 using namespace js;
43 using namespace js::unicode;
44
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const mozilla::Range<const char16_t> tbchars)45 Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ(
46 JSContext* cx, const mozilla::Range<const char16_t> tbchars) {
47 MOZ_ASSERT(cx);
48 size_t len = tbchars.length();
49 unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
50 if (!latin1) {
51 return Latin1CharsZ();
52 }
53 LossyConvertUtf16toLatin1(tbchars, AsWritableChars(Span(latin1, len)));
54 latin1[len] = '\0';
55 return Latin1CharsZ(latin1, len);
56 }
57
58 template <typename CharT>
GetDeflatedUTF8StringLength(const CharT * chars,size_t nchars)59 static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) {
60 size_t nbytes = nchars;
61 for (const CharT* end = chars + nchars; chars < end; chars++) {
62 char16_t c = *chars;
63 if (c < 0x80) {
64 continue;
65 }
66 uint32_t v;
67 if (IsSurrogate(c)) {
68 /* nbytes sets 1 length since this is surrogate pair. */
69 if (IsTrailSurrogate(c) || (chars + 1) == end) {
70 nbytes += 2; /* Bad Surrogate */
71 continue;
72 }
73 char16_t c2 = chars[1];
74 if (!IsTrailSurrogate(c2)) {
75 nbytes += 2; /* Bad Surrogate */
76 continue;
77 }
78 v = UTF16Decode(c, c2);
79 nbytes--;
80 chars++;
81 } else {
82 v = c;
83 }
84 v >>= 11;
85 nbytes++;
86 while (v) {
87 v >>= 5;
88 nbytes++;
89 }
90 }
91 return nbytes;
92 }
93
GetDeflatedUTF8StringLength(JSLinearString * s)94 JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSLinearString* s) {
95 JS::AutoCheckCannotGC nogc;
96 return s->hasLatin1Chars()
97 ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
98 : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc),
99 s->length());
100 }
101
DeflateStringToUTF8Buffer(JSLinearString * src,mozilla::Span<char> dst)102 JS_PUBLIC_API size_t JS::DeflateStringToUTF8Buffer(JSLinearString* src,
103 mozilla::Span<char> dst) {
104 JS::AutoCheckCannotGC nogc;
105 if (src->hasLatin1Chars()) {
106 auto source = AsChars(Span(src->latin1Chars(nogc), src->length()));
107 size_t read;
108 size_t written;
109 Tie(read, written) = ConvertLatin1toUtf8Partial(source, dst);
110 (void)read;
111 return written;
112 }
113 auto source = Span(src->twoByteChars(nogc), src->length());
114 size_t read;
115 size_t written;
116 Tie(read, written) = ConvertUtf16toUtf8Partial(source, dst);
117 (void)read;
118 return written;
119 }
120
121 template <typename CharT>
122 void ConvertToUTF8(mozilla::Span<CharT> src, mozilla::Span<char> dst);
123
124 template <>
ConvertToUTF8(mozilla::Span<const char16_t> src,mozilla::Span<char> dst)125 void ConvertToUTF8<const char16_t>(mozilla::Span<const char16_t> src,
126 mozilla::Span<char> dst) {
127 (void)ConvertUtf16toUtf8Partial(src, dst);
128 }
129
130 template <>
ConvertToUTF8(mozilla::Span<const Latin1Char> src,mozilla::Span<char> dst)131 void ConvertToUTF8<const Latin1Char>(mozilla::Span<const Latin1Char> src,
132 mozilla::Span<char> dst) {
133 (void)ConvertLatin1toUtf8Partial(AsChars(src), dst);
134 }
135
136 template <typename CharT>
CharsToNewUTF8CharsZ(JSContext * cx,const mozilla::Range<CharT> chars)137 UTF8CharsZ JS::CharsToNewUTF8CharsZ(JSContext* cx,
138 const mozilla::Range<CharT> chars) {
139 /* Get required buffer size. */
140 const CharT* str = chars.begin().get();
141 size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
142
143 /* Allocate buffer. */
144 char* utf8 = cx->pod_malloc<char>(len + 1);
145 if (!utf8) {
146 return UTF8CharsZ();
147 }
148
149 /* Encode to UTF8. */
150 ::ConvertToUTF8(Span(str, chars.length()), Span(utf8, len));
151 utf8[len] = '\0';
152
153 return UTF8CharsZ(utf8, len);
154 }
155
156 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
157 JSContext* cx, const mozilla::Range<Latin1Char> chars);
158
159 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
160 JSContext* cx, const mozilla::Range<char16_t> chars);
161
162 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
163 JSContext* cx, const mozilla::Range<const Latin1Char> chars);
164
165 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
166 JSContext* cx, const mozilla::Range<const char16_t> chars);
167
168 static const uint32_t INVALID_UTF8 = UINT32_MAX;
169
170 /*
171 * Convert a UTF-8 character sequence into a UCS-4 character and return that
172 * character. It is assumed that the caller already checked that the sequence
173 * is valid.
174 */
Utf8ToOneUcs4CharImpl(const uint8_t * utf8Buffer,int utf8Length)175 static uint32_t Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer,
176 int utf8Length) {
177 MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
178
179 if (utf8Length == 1) {
180 MOZ_ASSERT(!(*utf8Buffer & 0x80));
181 return *utf8Buffer;
182 }
183
184 /* from Unicode 3.1, non-shortest form is illegal */
185 static const uint32_t minucs4Table[] = {0x80, 0x800, NonBMPMin};
186
187 MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
188 (0x100 - (1 << (8 - utf8Length))));
189 uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
190 uint32_t minucs4Char = minucs4Table[utf8Length - 2];
191 while (--utf8Length) {
192 MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
193 ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
194 }
195
196 if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
197 return INVALID_UTF8;
198 }
199
200 if (MOZ_UNLIKELY(IsSurrogate(ucs4Char))) {
201 return INVALID_UTF8;
202 }
203
204 return ucs4Char;
205 }
206
Utf8ToOneUcs4Char(const uint8_t * utf8Buffer,int utf8Length)207 uint32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) {
208 return Utf8ToOneUcs4CharImpl(utf8Buffer, utf8Length);
209 }
210
ReportInvalidCharacter(JSContext * cx,uint32_t offset)211 static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) {
212 char buffer[10];
213 SprintfLiteral(buffer, "%u", offset);
214 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
215 JSMSG_MALFORMED_UTF8_CHAR, buffer);
216 }
217
ReportBufferTooSmall(JSContext * cx,uint32_t dummy)218 static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) {
219 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
220 JSMSG_BUFFER_TOO_SMALL);
221 }
222
ReportTooBigCharacter(JSContext * cx,uint32_t v)223 static void ReportTooBigCharacter(JSContext* cx, uint32_t v) {
224 char buffer[11];
225 SprintfLiteral(buffer, "0x%x", v);
226 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
227 JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
228 }
229
// Value returned by the output functor handed to InflateUTF8ToUTF16, telling
// the scanner what to do after each emitted UTF-16 code unit.
enum class LoopDisposition {
  // Stop scanning.
  Break,
  // Keep scanning.
  Continue,
};
234
// Policy applied by InflateUTF8ToUTF16 when it meets malformed input.
enum class OnUTF8Error {
  // Emit unicode::REPLACEMENT_CHARACTER and resume after the bad bytes.
  InsertReplacementCharacter,
  // Emit '?' and resume after the bad bytes.
  InsertQuestionMark,
  // Report an error on the JSContext and make the scan return false.
  Throw,
  // MOZ_CRASH.
  Crash,
};
241
242 // Scan UTF-8 input and (internally, at least) convert it to a series of UTF-16
243 // code units. But you can also do odd things like pass an empty lambda for
244 // `dst`, in which case the output is discarded entirely--the only effect of
245 // calling the template that way is error-checking.
246 template <OnUTF8Error ErrorAction, typename OutputFn>
InflateUTF8ToUTF16(JSContext * cx,const UTF8Chars src,OutputFn dst)247 static bool InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src,
248 OutputFn dst) {
249 size_t srclen = src.length();
250 for (uint32_t i = 0; i < srclen; i++) {
251 uint32_t v = uint32_t(src[i]);
252 if (!(v & 0x80)) {
253 // ASCII code unit. Simple copy.
254 if (dst(uint16_t(v)) == LoopDisposition::Break) {
255 break;
256 }
257 } else {
258 // Non-ASCII code unit. Determine its length in bytes (n).
259 uint32_t n = 1;
260 while (v & (0x80 >> n)) {
261 n++;
262 }
263
264 #define INVALID(report, arg, n2) \
265 do { \
266 if (ErrorAction == OnUTF8Error::Throw) { \
267 report(cx, arg); \
268 return false; \
269 } else if (ErrorAction == OnUTF8Error::Crash) { \
270 MOZ_CRASH("invalid UTF-8 string: " #report); \
271 } else { \
272 char16_t replacement; \
273 if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \
274 replacement = REPLACEMENT_CHARACTER; \
275 } else { \
276 MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
277 replacement = '?'; \
278 } \
279 if (dst(replacement) == LoopDisposition::Break) { \
280 break; \
281 } \
282 n = n2; \
283 goto invalidMultiByteCodeUnit; \
284 } \
285 } while (0)
286
287 // Check the leading byte.
288 if (n < 2 || n > 4) {
289 INVALID(ReportInvalidCharacter, i, 1);
290 }
291
292 // Check that |src| is large enough to hold an n-byte code unit.
293 if (i + n > srclen) {
294 INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
295 }
296
297 // Check the second byte. From Unicode Standard v6.2, Table 3-7
298 // Well-Formed UTF-8 Byte Sequences.
299 if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF
300 (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F
301 (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF
302 (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F
303 {
304 INVALID(ReportInvalidCharacter, i, 1);
305 }
306
307 // Check the continuation bytes.
308 for (uint32_t m = 1; m < n; m++) {
309 if ((src[i + m] & 0xC0) != 0x80) {
310 INVALID(ReportInvalidCharacter, i, m);
311 }
312 }
313
314 // Determine the code unit's length in CharT and act accordingly.
315 v = Utf8ToOneUcs4CharImpl((uint8_t*)&src[i], n);
316 if (v < NonBMPMin) {
317 // The n-byte UTF8 code unit will fit in a single CharT.
318 if (dst(char16_t(v)) == LoopDisposition::Break) {
319 break;
320 }
321 } else if (v <= NonBMPMax) {
322 // The n-byte UTF8 code unit will fit in two CharT units.
323 if (dst(LeadSurrogate(v)) == LoopDisposition::Break) {
324 break;
325 }
326 if (dst(TrailSurrogate(v)) == LoopDisposition::Break) {
327 break;
328 }
329 } else {
330 // The n-byte UTF8 code unit won't fit in two CharT units.
331 INVALID(ReportTooBigCharacter, v, 1);
332 }
333
334 invalidMultiByteCodeUnit:
335 // Move i to the last byte of the multi-byte code unit; the loop
336 // header will do the final i++ to move to the start of the next
337 // code unit.
338 i += n - 1;
339 }
340 }
341
342 return true;
343 }
344
345 template <OnUTF8Error ErrorAction, typename CharT>
CopyAndInflateUTF8IntoBuffer(JSContext * cx,const UTF8Chars src,CharT * dst,size_t outlen,bool allASCII)346 static void CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src,
347 CharT* dst, size_t outlen,
348 bool allASCII) {
349 if (allASCII) {
350 size_t srclen = src.length();
351 MOZ_ASSERT(outlen == srclen);
352 for (uint32_t i = 0; i < srclen; i++) {
353 dst[i] = CharT(src[i]);
354 }
355 } else {
356 size_t j = 0;
357 auto push = [dst, &j](char16_t c) -> LoopDisposition {
358 dst[j++] = CharT(c);
359 return LoopDisposition::Continue;
360 };
361 MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
362 MOZ_ASSERT(j == outlen);
363 }
364 dst[outlen] = CharT('\0'); // NUL char
365 }
366
367 template <OnUTF8Error ErrorAction, typename CharsT>
InflateUTF8StringHelper(JSContext * cx,const UTF8Chars src,size_t * outlen,arena_id_t destArenaId)368 static CharsT InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src,
369 size_t* outlen, arena_id_t destArenaId) {
370 using CharT = typename CharsT::CharT;
371 static_assert(
372 std::is_same_v<CharT, char16_t> || std::is_same_v<CharT, Latin1Char>,
373 "bad CharT");
374
375 *outlen = 0;
376
377 size_t len = 0;
378 bool allASCII = true;
379 auto count = [&len, &allASCII](char16_t c) -> LoopDisposition {
380 len++;
381 allASCII &= (c < 0x80);
382 return LoopDisposition::Continue;
383 };
384 if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) {
385 return CharsT();
386 }
387 *outlen = len;
388
389 CharT* dst = cx->pod_arena_malloc<CharT>(destArenaId,
390 *outlen + 1); // +1 for NUL
391
392 if (!dst) {
393 ReportOutOfMemory(cx);
394 return CharsT();
395 }
396
397 constexpr OnUTF8Error errorMode =
398 std::is_same_v<CharT, Latin1Char>
399 ? OnUTF8Error::InsertQuestionMark
400 : OnUTF8Error::InsertReplacementCharacter;
401 CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);
402
403 return CharsT(dst, *outlen);
404 }
405
UTF8CharsToNewTwoByteCharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)406 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
407 const UTF8Chars utf8,
408 size_t* outlen,
409 arena_id_t destArenaId) {
410 return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
411 cx, utf8, outlen, destArenaId);
412 }
413
UTF8CharsToNewTwoByteCharsZ(JSContext * cx,const ConstUTF8CharsZ & utf8,size_t * outlen,arena_id_t destArenaId)414 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
415 const ConstUTF8CharsZ& utf8,
416 size_t* outlen,
417 arena_id_t destArenaId) {
418 UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
419 return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
420 cx, chars, outlen, destArenaId);
421 }
422
LossyUTF8CharsToNewTwoByteCharsZ(JSContext * cx,const JS::UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)423 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx,
424 const JS::UTF8Chars utf8,
425 size_t* outlen,
426 arena_id_t destArenaId) {
427 return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
428 TwoByteCharsZ>(cx, utf8, outlen, destArenaId);
429 }
430
LossyUTF8CharsToNewTwoByteCharsZ(JSContext * cx,const JS::ConstUTF8CharsZ & utf8,size_t * outlen,arena_id_t destArenaId)431 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(
432 JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen,
433 arena_id_t destArenaId) {
434 UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
435 return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
436 TwoByteCharsZ>(cx, chars, outlen, destArenaId);
437 }
438
UpdateSmallestEncodingForChar(char16_t c,JS::SmallestEncoding * encoding)439 static void UpdateSmallestEncodingForChar(char16_t c,
440 JS::SmallestEncoding* encoding) {
441 JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
442 if (c >= 0x80) {
443 if (c < 0x100) {
444 newEncoding = JS::SmallestEncoding::Latin1;
445 } else {
446 newEncoding = JS::SmallestEncoding::UTF16;
447 }
448 }
449 if (newEncoding > *encoding) {
450 *encoding = newEncoding;
451 }
452 }
453
FindSmallestEncoding(UTF8Chars utf8)454 JS::SmallestEncoding JS::FindSmallestEncoding(UTF8Chars utf8) {
455 Span<unsigned char> unsignedSpan = utf8;
456 auto charSpan = AsChars(unsignedSpan);
457 size_t upTo = AsciiValidUpTo(charSpan);
458 if (upTo == charSpan.Length()) {
459 return SmallestEncoding::ASCII;
460 }
461 if (IsUtf8Latin1(charSpan.From(upTo))) {
462 return SmallestEncoding::Latin1;
463 }
464 return SmallestEncoding::UTF16;
465 }
466
UTF8CharsToNewLatin1CharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)467 Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
468 size_t* outlen,
469 arena_id_t destArenaId) {
470 return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>(
471 cx, utf8, outlen, destArenaId);
472 }
473
LossyUTF8CharsToNewLatin1CharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen,arena_id_t destArenaId)474 Latin1CharsZ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx,
475 const UTF8Chars utf8,
476 size_t* outlen,
477 arena_id_t destArenaId) {
478 return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark, Latin1CharsZ>(
479 cx, utf8, outlen, destArenaId);
480 }
481
482 /**
483 * Atomization Helpers.
484 *
485 * These functions are extremely single-use, and are not intended for general
486 * consumption.
487 */
488
GetUTF8AtomizationData(JSContext * cx,const JS::UTF8Chars utf8,size_t * outlen,JS::SmallestEncoding * encoding,HashNumber * hashNum)489 bool GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8,
490 size_t* outlen, JS::SmallestEncoding* encoding,
491 HashNumber* hashNum) {
492 *outlen = 0;
493 *encoding = JS::SmallestEncoding::ASCII;
494 *hashNum = 0;
495
496 auto getMetadata = [outlen, encoding,
497 hashNum](char16_t c) -> LoopDisposition {
498 (*outlen)++;
499 UpdateSmallestEncodingForChar(c, encoding);
500 *hashNum = mozilla::AddToHash(*hashNum, c);
501 return LoopDisposition::Continue;
502 };
503 if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) {
504 return false;
505 }
506
507 return true;
508 }
509
510 template <typename CharT>
UTF8EqualsChars(const JS::UTF8Chars utfChars,const CharT * chars)511 bool UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars) {
512 size_t ind = 0;
513 bool isEqual = true;
514
515 auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition {
516 #ifdef DEBUG
517 JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
518 UpdateSmallestEncodingForChar(c, &encoding);
519 if (std::is_same_v<CharT, JS::Latin1Char>) {
520 MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1);
521 } else if (!std::is_same_v<CharT, char16_t>) {
522 MOZ_CRASH("Invalid character type in UTF8EqualsChars");
523 }
524 #endif
525
526 if (CharT(c) != chars[ind]) {
527 isEqual = false;
528 return LoopDisposition::Break;
529 }
530
531 ind++;
532 return LoopDisposition::Continue;
533 };
534
535 // To get here, you must have checked your work.
536 InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars,
537 checkEqual);
538
539 return isEqual;
540 }
541
542 template bool UTF8EqualsChars(const JS::UTF8Chars, const char16_t*);
543 template bool UTF8EqualsChars(const JS::UTF8Chars, const JS::Latin1Char*);
544
545 template <typename CharT>
InflateUTF8CharsToBufferAndTerminate(const JS::UTF8Chars src,CharT * dst,size_t dstLen,JS::SmallestEncoding encoding)546 void InflateUTF8CharsToBufferAndTerminate(const JS::UTF8Chars src, CharT* dst,
547 size_t dstLen,
548 JS::SmallestEncoding encoding) {
549 CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(
550 /* cx = */ nullptr, src, dst, dstLen,
551 encoding == JS::SmallestEncoding::ASCII);
552 }
553
554 template void InflateUTF8CharsToBufferAndTerminate(
555 const UTF8Chars src, char16_t* dst, size_t dstLen,
556 JS::SmallestEncoding encoding);
557 template void InflateUTF8CharsToBufferAndTerminate(
558 const UTF8Chars src, JS::Latin1Char* dst, size_t dstLen,
559 JS::SmallestEncoding encoding);
560
#ifdef DEBUG
// Debug-only check: scan the first |aLength| bytes of |data_| in
// OnUTF8Error::Crash mode, discarding the decoded output.
void JS::ConstUTF8CharsZ::validate(size_t aLength) {
  MOZ_ASSERT(data_);

  auto discard = [](char16_t) -> LoopDisposition {
    return LoopDisposition::Continue;
  };

  UTF8Chars bytes(data_, aLength);
  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, bytes, discard);
}
#endif
571
// Return true when no byte of the NUL-terminated string |s| has its high bit
// set. An empty string is ASCII.
bool JS::StringIsASCII(const char* s) {
  for (const char* cursor = s; *cursor != '\0'; ++cursor) {
    if ((*cursor & 0x80) != 0) {
      return false;
    }
  }
  return true;
}
581
StringIsASCII(Span<const char> s)582 bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); }
583
append(const Utf8Unit * units,size_t len)584 bool StringBuffer::append(const Utf8Unit* units, size_t len) {
585 if (isLatin1()) {
586 Latin1CharBuffer& latin1 = latin1Chars();
587
588 while (len > 0) {
589 if (!IsAscii(*units)) {
590 break;
591 }
592
593 if (!latin1.append(units->toUnsignedChar())) {
594 return false;
595 }
596
597 ++units;
598 --len;
599 }
600 if (len == 0) {
601 return true;
602 }
603
604 // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
605 // |latin1|, but it's only possible for [U+0080, U+0100) code points,
606 // and handling the full complexity of UTF-8 only for that very small
607 // additional range isn't worth it. Inflate to two-byte storage before
608 // appending the remaining code points.
609 if (!inflateChars()) {
610 return false;
611 }
612 }
613
614 UTF8Chars remainingUtf8(units, len);
615
616 // Determine how many UTF-16 code units are required to represent the
617 // remaining units.
618 size_t utf16Len = 0;
619 auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
620 utf16Len++;
621 return LoopDisposition::Continue;
622 };
623 if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx_, remainingUtf8,
624 countInflated)) {
625 return false;
626 }
627
628 TwoByteCharBuffer& buf = twoByteChars();
629
630 size_t i = buf.length();
631 if (!buf.growByUninitialized(utf16Len)) {
632 return false;
633 }
634 MOZ_ASSERT(i + utf16Len == buf.length(),
635 "growByUninitialized assumed to increase length immediately");
636
637 char16_t* toFill = &buf[i];
638 auto appendUtf16 = [&toFill](char16_t unit) {
639 *toFill++ = unit;
640 return LoopDisposition::Continue;
641 };
642
643 MOZ_ALWAYS_TRUE(
644 InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx_, remainingUtf8, appendUtf16));
645 MOZ_ASSERT(toFill == buf.end());
646 return true;
647 }
648