/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "js/CharacterEncoding.h"
8 
9 #include "mozilla/Range.h"
10 #include "mozilla/Sprintf.h"
11 
12 #include <algorithm>
13 #include <type_traits>
14 
15 #include "vm/JSContext.h"
16 
17 using namespace js;
18 
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const mozilla::Range<const char16_t> tbchars)19 Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ(
20     JSContext* cx, const mozilla::Range<const char16_t> tbchars) {
21   MOZ_ASSERT(cx);
22   size_t len = tbchars.length();
23   unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
24   if (!latin1) return Latin1CharsZ();
25   for (size_t i = 0; i < len; ++i)
26     latin1[i] = static_cast<unsigned char>(tbchars[i]);
27   latin1[len] = '\0';
28   return Latin1CharsZ(latin1, len);
29 }
30 
/*
 * Compute how many bytes the UTF-8 encoding of |nchars| code units starting
 * at |chars| occupies.  The tally starts at one byte per code unit and is
 * adjusted upwards for every unit that needs more:
 *
 *   - below 0x80:               1 byte
 *   - 0x80 .. 0x7FF:            2 bytes
 *   - other non-surrogates:     3 bytes
 *   - valid surrogate pair:     4 bytes for the two units together
 *   - unpaired surrogate:       3 bytes
 */
template <typename CharT>
static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) {
  size_t total = nchars;
  const CharT* const limit = chars + nchars;

  while (chars < limit) {
    const char16_t unit = *chars++;

    if (unit < 0x80) {
      // Already accounted for by the initial one-byte-per-unit tally.
      continue;
    }
    if (unit < 0x800) {
      total += 1;
      continue;
    }

    const bool isSurrogate = unit >= 0xD800 && unit <= 0xDFFF;
    if (!isSurrogate) {
      total += 2;
      continue;
    }

    // Only a lead surrogate immediately followed by a trail surrogate forms
    // a pair.
    const bool isLead = unit < 0xDC00;
    if (isLead && chars != limit) {
      const char16_t next = *chars;
      if (next >= 0xDC00 && next <= 0xDFFF) {
        // The two units were tallied as two bytes; the pair needs four.
        chars++;
        total += 2;
        continue;
      }
    }

    /* Bad Surrogate: tallied as three bytes. */
    total += 2;
  }

  return total;
}
64 
GetDeflatedUTF8StringLength(JSFlatString * s)65 JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSFlatString* s) {
66   JS::AutoCheckCannotGC nogc;
67   return s->hasLatin1Chars()
68              ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
69              : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc),
70                                              s->length());
71 }
72 
73 static const char16_t UTF8_REPLACEMENT_CHAR = 0xFFFD;
74 
75 template <typename CharT>
DeflateStringToUTF8Buffer(const CharT * src,size_t srclen,mozilla::RangedPtr<char> dst,size_t * dstlenp=nullptr,size_t * numcharsp=nullptr)76 static void DeflateStringToUTF8Buffer(const CharT* src, size_t srclen,
77                                       mozilla::RangedPtr<char> dst,
78                                       size_t* dstlenp = nullptr,
79                                       size_t* numcharsp = nullptr) {
80   size_t capacity = 0;
81   if (dstlenp) {
82     capacity = *dstlenp;
83     *dstlenp = 0;
84   }
85   if (numcharsp) *numcharsp = 0;
86 
87   while (srclen) {
88     uint32_t v;
89     char16_t c = *src++;
90     srclen--;
91     if (c >= 0xDC00 && c <= 0xDFFF) {
92       v = UTF8_REPLACEMENT_CHAR;
93     } else if (c < 0xD800 || c > 0xDBFF) {
94       v = c;
95     } else {
96       if (srclen < 1) {
97         v = UTF8_REPLACEMENT_CHAR;
98       } else {
99         char16_t c2 = *src;
100         if (c2 < 0xDC00 || c2 > 0xDFFF) {
101           v = UTF8_REPLACEMENT_CHAR;
102         } else {
103           src++;
104           srclen--;
105           v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
106         }
107       }
108     }
109 
110     size_t utf8Len;
111     if (v < 0x0080) {
112       /* no encoding necessary - performance hack */
113       if (dstlenp && *dstlenp + 1 > capacity) return;
114       *dst++ = char(v);
115       utf8Len = 1;
116     } else {
117       uint8_t utf8buf[4];
118       utf8Len = OneUcs4ToUtf8Char(utf8buf, v);
119       if (dstlenp && *dstlenp + utf8Len > capacity) return;
120       for (size_t i = 0; i < utf8Len; i++) *dst++ = char(utf8buf[i]);
121     }
122 
123     if (dstlenp) *dstlenp += utf8Len;
124     if (numcharsp) (*numcharsp)++;
125   }
126 }
127 
DeflateStringToUTF8Buffer(JSFlatString * src,mozilla::RangedPtr<char> dst,size_t * dstlenp,size_t * numcharsp)128 JS_PUBLIC_API void JS::DeflateStringToUTF8Buffer(JSFlatString* src,
129                                                  mozilla::RangedPtr<char> dst,
130                                                  size_t* dstlenp,
131                                                  size_t* numcharsp) {
132   JS::AutoCheckCannotGC nogc;
133   return src->hasLatin1Chars()
134              ? ::DeflateStringToUTF8Buffer(src->latin1Chars(nogc),
135                                            src->length(), dst, dstlenp,
136                                            numcharsp)
137              : ::DeflateStringToUTF8Buffer(src->twoByteChars(nogc),
138                                            src->length(), dst, dstlenp,
139                                            numcharsp);
140 }
141 
142 template <typename CharT>
CharsToNewUTF8CharsZ(JSContext * maybeCx,const mozilla::Range<CharT> chars)143 UTF8CharsZ JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
144                                     const mozilla::Range<CharT> chars) {
145   /* Get required buffer size. */
146   const CharT* str = chars.begin().get();
147   size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
148 
149   /* Allocate buffer. */
150   char* utf8;
151   if (maybeCx)
152     utf8 = maybeCx->pod_malloc<char>(len + 1);
153   else
154     utf8 = js_pod_malloc<char>(len + 1);
155   if (!utf8) return UTF8CharsZ();
156 
157   /* Encode to UTF8. */
158   ::DeflateStringToUTF8Buffer(str, chars.length(),
159                               mozilla::RangedPtr<char>(utf8, len));
160   utf8[len] = '\0';
161 
162   return UTF8CharsZ(utf8, len);
163 }
164 
165 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
166     JSContext* maybeCx, const mozilla::Range<Latin1Char> chars);
167 
168 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
169     JSContext* maybeCx, const mozilla::Range<char16_t> chars);
170 
171 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
172     JSContext* maybeCx, const mozilla::Range<const Latin1Char> chars);
173 
174 template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
175     JSContext* maybeCx, const mozilla::Range<const char16_t> chars);
176 
177 static const uint32_t INVALID_UTF8 = UINT32_MAX;
178 
179 /*
180  * Convert a utf8 character sequence into a UCS-4 character and return that
181  * character.  It is assumed that the caller already checked that the sequence
182  * is valid.
183  */
Utf8ToOneUcs4Char(const uint8_t * utf8Buffer,int utf8Length)184 uint32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) {
185   MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
186 
187   if (utf8Length == 1) {
188     MOZ_ASSERT(!(*utf8Buffer & 0x80));
189     return *utf8Buffer;
190   }
191 
192   /* from Unicode 3.1, non-shortest form is illegal */
193   static const uint32_t minucs4Table[] = {0x80, 0x800, 0x10000};
194 
195   MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
196              (0x100 - (1 << (8 - utf8Length))));
197   uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
198   uint32_t minucs4Char = minucs4Table[utf8Length - 2];
199   while (--utf8Length) {
200     MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
201     ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
202   }
203 
204   if (MOZ_UNLIKELY(ucs4Char < minucs4Char ||
205                    (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
206     return INVALID_UTF8;
207 
208   return ucs4Char;
209 }
210 
ReportInvalidCharacter(JSContext * cx,uint32_t offset)211 static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) {
212   char buffer[10];
213   SprintfLiteral(buffer, "%u", offset);
214   JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage,
215                                     nullptr, JSMSG_MALFORMED_UTF8_CHAR, buffer);
216 }
217 
ReportBufferTooSmall(JSContext * cx,uint32_t dummy)218 static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) {
219   JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
220                             JSMSG_BUFFER_TOO_SMALL);
221 }
222 
ReportTooBigCharacter(JSContext * cx,uint32_t v)223 static void ReportTooBigCharacter(JSContext* cx, uint32_t v) {
224   char buffer[10];
225   SprintfLiteral(buffer, "0x%x", v + 0x10000);
226   JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage,
227                                     nullptr, JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
228 }
229 
/* Selects what InflateUTF8StringToBuffer does while walking its input. */
enum InflateUTF8Action {
  // Count output code units; on invalid input, report an error and fail.
  CountAndReportInvalids,

  // Count output code units; invalid input is skipped over, one output unit
  // being counted for it.
  CountAndIgnoreInvalids,

  // Crash on invalid input; no length or encoding is written back.
  AssertNoInvalids,

  // Write the decoded code units to the destination buffer, substituting a
  // replacement character for invalid input.
  Copy,

  // Determine the smallest encoding only; returns as soon as a code point
  // above 0xFF is found.
  FindEncoding,
};
237 
238 static const char16_t REPLACE_UTF8 = 0xFFFD;
239 static const Latin1Char REPLACE_UTF8_LATIN1 = '?';
240 
241 // If making changes to this algorithm, make sure to also update
242 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
243 template <InflateUTF8Action Action, typename CharT, class ContextT>
InflateUTF8StringToBuffer(ContextT * cx,const UTF8Chars src,CharT * dst,size_t * dstlenp,JS::SmallestEncoding * smallestEncoding)244 static bool InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src,
245                                       CharT* dst, size_t* dstlenp,
246                                       JS::SmallestEncoding* smallestEncoding) {
247   if (Action != AssertNoInvalids)
248     *smallestEncoding = JS::SmallestEncoding::ASCII;
249   auto RequireLatin1 = [&smallestEncoding] {
250     *smallestEncoding =
251         std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
252   };
253   auto RequireUTF16 = [&smallestEncoding] {
254     *smallestEncoding = JS::SmallestEncoding::UTF16;
255   };
256 
257   // Count how many code units need to be in the inflated string.
258   // |i| is the index into |src|, and |j| is the the index into |dst|.
259   size_t srclen = src.length();
260   uint32_t j = 0;
261   for (uint32_t i = 0; i < srclen; i++, j++) {
262     uint32_t v = uint32_t(src[i]);
263     if (!(v & 0x80)) {
264       // ASCII code unit.  Simple copy.
265       if (Action == Copy) dst[j] = CharT(v);
266 
267     } else {
268       // Non-ASCII code unit.  Determine its length in bytes (n).
269       uint32_t n = 1;
270       while (v & (0x80 >> n)) n++;
271 
272 #define INVALID(report, arg, n2)                               \
273   do {                                                         \
274     if (Action == CountAndReportInvalids) {                    \
275       report(cx, arg);                                         \
276       return false;                                            \
277     } else if (Action == AssertNoInvalids) {                   \
278       MOZ_CRASH("invalid UTF-8 string: " #report);             \
279     } else {                                                   \
280       if (Action == Copy) {                                    \
281         if (std::is_same<decltype(dst[0]), Latin1Char>::value) \
282           dst[j] = CharT(REPLACE_UTF8_LATIN1);                 \
283         else                                                   \
284           dst[j] = CharT(REPLACE_UTF8);                        \
285       } else {                                                 \
286         MOZ_ASSERT(Action == CountAndIgnoreInvalids ||         \
287                    Action == FindEncoding);                    \
288       }                                                        \
289       n = n2;                                                  \
290       goto invalidMultiByteCodeUnit;                           \
291     }                                                          \
292   } while (0)
293 
294       // Check the leading byte.
295       if (n < 2 || n > 4) INVALID(ReportInvalidCharacter, i, 1);
296 
297       // Check that |src| is large enough to hold an n-byte code unit.
298       if (i + n > srclen) INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
299 
300       // Check the second byte.  From Unicode Standard v6.2, Table 3-7
301       // Well-Formed UTF-8 Byte Sequences.
302       if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
303           (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
304           (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
305           (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
306       {
307         INVALID(ReportInvalidCharacter, i, 1);
308       }
309 
310       // Check the continuation bytes.
311       for (uint32_t m = 1; m < n; m++) {
312         if ((src[i + m] & 0xC0) != 0x80) INVALID(ReportInvalidCharacter, i, m);
313       }
314 
315       // Determine the code unit's length in CharT and act accordingly.
316       v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
317       if (Action != AssertNoInvalids) {
318         if (v > 0xff) {
319           RequireUTF16();
320           if (Action == FindEncoding) {
321             MOZ_ASSERT(dst == nullptr);
322             return true;
323           }
324         } else {
325           RequireLatin1();
326         }
327       }
328       if (v < 0x10000) {
329         // The n-byte UTF8 code unit will fit in a single CharT.
330         if (Action == Copy) dst[j] = CharT(v);
331       } else {
332         v -= 0x10000;
333         if (v <= 0xFFFFF) {
334           // The n-byte UTF8 code unit will fit in two CharT units.
335           if (Action == Copy) dst[j] = CharT((v >> 10) + 0xD800);
336           j++;
337           if (Action == Copy) dst[j] = CharT((v & 0x3FF) + 0xDC00);
338 
339         } else {
340           // The n-byte UTF8 code unit won't fit in two CharT units.
341           INVALID(ReportTooBigCharacter, v, 1);
342         }
343       }
344 
345     invalidMultiByteCodeUnit:
346       // Move i to the last byte of the multi-byte code unit;  the loop
347       // header will do the final i++ to move to the start of the next
348       // code unit.
349       i += n - 1;
350       if (Action != AssertNoInvalids) RequireUTF16();
351     }
352   }
353 
354   if (Action != AssertNoInvalids && Action != FindEncoding) *dstlenp = j;
355 
356   return true;
357 }
358 
359 template <InflateUTF8Action Action, typename CharsT, class ContextT>
InflateUTF8StringHelper(ContextT * cx,const UTF8Chars src,size_t * outlen)360 static CharsT InflateUTF8StringHelper(ContextT* cx, const UTF8Chars src,
361                                       size_t* outlen) {
362   using CharT = typename CharsT::CharT;
363   *outlen = 0;
364 
365   JS::SmallestEncoding encoding;
366   if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr,
367                                                 outlen, &encoding))
368     return CharsT();
369 
370   CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
371   if (!dst) {
372     ReportOutOfMemory(cx);
373     return CharsT();
374   }
375 
376   if (encoding == JS::SmallestEncoding::ASCII) {
377     size_t srclen = src.length();
378     MOZ_ASSERT(*outlen == srclen);
379     for (uint32_t i = 0; i < srclen; i++) dst[i] = CharT(src[i]);
380   } else {
381     MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(
382         cx, src, dst, outlen, &encoding)));
383   }
384 
385   dst[*outlen] = 0;  // NUL char
386 
387   return CharsT(dst, *outlen);
388 }
389 
UTF8CharsToNewTwoByteCharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen)390 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
391                                               const UTF8Chars utf8,
392                                               size_t* outlen) {
393   return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(
394       cx, utf8, outlen);
395 }
396 
UTF8CharsToNewTwoByteCharsZ(JSContext * cx,const ConstUTF8CharsZ & utf8,size_t * outlen)397 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
398                                               const ConstUTF8CharsZ& utf8,
399                                               size_t* outlen) {
400   UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
401   return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(
402       cx, chars, outlen);
403 }
404 
LossyUTF8CharsToNewTwoByteCharsZ(JSContext * cx,const JS::UTF8Chars utf8,size_t * outlen)405 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx,
406                                                    const JS::UTF8Chars utf8,
407                                                    size_t* outlen) {
408   return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(
409       cx, utf8, outlen);
410 }
411 
LossyUTF8CharsToNewTwoByteCharsZ(JSContext * cx,const JS::ConstUTF8CharsZ & utf8,size_t * outlen)412 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(
413     JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen) {
414   UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
415   return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(
416       cx, chars, outlen);
417 }
418 
FindSmallestEncoding(UTF8Chars utf8)419 JS::SmallestEncoding JS::FindSmallestEncoding(UTF8Chars utf8) {
420   JS::SmallestEncoding encoding;
421   MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t, JSContext>(
422       /* cx = */ nullptr, utf8,
423       /* dst = */ nullptr,
424       /* dstlen = */ nullptr, &encoding)));
425   return encoding;
426 }
427 
UTF8CharsToNewLatin1CharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen)428 Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
429                                             size_t* outlen) {
430   return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8,
431                                                                        outlen);
432 }
433 
LossyUTF8CharsToNewLatin1CharsZ(JSContext * cx,const UTF8Chars utf8,size_t * outlen)434 Latin1CharsZ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx,
435                                                  const UTF8Chars utf8,
436                                                  size_t* outlen) {
437   return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8,
438                                                                        outlen);
439 }
440 
#ifdef DEBUG
/*
 * Debug-only check that the first |aLength| bytes of this string are valid
 * UTF-8.  The bytes are walked in AssertNoInvalids mode, which crashes on
 * invalid input and writes nothing back, so every out-parameter is null.
 */
void JS::ConstUTF8CharsZ::validate(size_t aLength) {
  MOZ_ASSERT(data_);

  JSContext* const noContext = nullptr;
  char16_t* const noDestination = nullptr;
  size_t* const noLength = nullptr;
  JS::SmallestEncoding* const noEncoding = nullptr;

  UTF8Chars view(data_, aLength);
  InflateUTF8StringToBuffer<AssertNoInvalids, char16_t, JSContext>(
      noContext, view, noDestination, noLength, noEncoding);
}
#endif
452 
StringIsASCII(const char * s)453 bool JS::StringIsASCII(const char* s) {
454   while (*s) {
455     if (*s & 0x80) return false;
456     s++;
457   }
458   return true;
459 }
460 
StringIsUTF8(const uint8_t * s,uint32_t length)461 bool JS::StringIsUTF8(const uint8_t* s, uint32_t length) {
462   const uint8_t* limit = s + length;
463   while (s < limit) {
464     uint32_t len;
465     uint32_t min;
466     uint32_t n = *s;
467     if ((n & 0x80) == 0) {
468       len = 1;
469       min = 0;
470     } else if ((n & 0xE0) == 0xC0) {
471       len = 2;
472       min = 0x80;
473       n &= 0x1F;
474     } else if ((n & 0xF0) == 0xE0) {
475       len = 3;
476       min = 0x800;
477       n &= 0x0F;
478     } else if ((n & 0xF8) == 0xF0) {
479       len = 4;
480       min = 0x10000;
481       n &= 0x07;
482     } else {
483       return false;
484     }
485     if (s + len > limit) return false;
486     for (uint32_t i = 1; i < len; i++) {
487       if ((s[i] & 0xC0) != 0x80) return false;
488       n = (n << 6) | (s[i] & 0x3F);
489     }
490     if (n < min || (0xD800 <= n && n < 0xE000) || n >= 0x110000) return false;
491     s += len;
492   }
493   return true;
494 }
495