1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9 
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
12 #include "mozilla/Utf8.h"
13 
14 #include "js/TypeDecls.h"
15 #include "js/Utility.h"
16 
17 class JSLinearString;
18 
19 namespace JS {
20 
21 /*
22  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
23  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
24  * byte is treated as a 2-byte character, and there is no way to pass in a
25  * string containing characters beyond U+00FF.
26  */
27 class Latin1Chars : public mozilla::Range<Latin1Char> {
28   typedef mozilla::Range<Latin1Char> Base;
29 
30  public:
31   using CharT = Latin1Char;
32 
33   Latin1Chars() = default;
Latin1Chars(char * aBytes,size_t aLength)34   Latin1Chars(char* aBytes, size_t aLength)
35       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)36   Latin1Chars(const Latin1Char* aBytes, size_t aLength)
37       : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const char * aBytes,size_t aLength)38   Latin1Chars(const char* aBytes, size_t aLength)
39       : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
40              aLength) {}
41 };
42 
43 /*
44  * Like Latin1Chars, but the chars are const.
45  */
46 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
47   typedef mozilla::Range<const Latin1Char> Base;
48 
49  public:
50   using CharT = Latin1Char;
51 
52   ConstLatin1Chars() = default;
ConstLatin1Chars(const Latin1Char * aChars,size_t aLength)53   ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
54       : Base(aChars, aLength) {}
55 };
56 
57 /*
58  * A Latin1Chars, but with \0 termination for C compatibility.
59  */
60 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
61   typedef mozilla::RangedPtr<Latin1Char> Base;
62 
63  public:
64   using CharT = Latin1Char;
65 
Latin1CharsZ()66   Latin1CharsZ() : Base(nullptr, 0) {}  // NOLINT
67 
Latin1CharsZ(char * aBytes,size_t aLength)68   Latin1CharsZ(char* aBytes, size_t aLength)
69       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
70     MOZ_ASSERT(aBytes[aLength] == '\0');
71   }
72 
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)73   Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
74     MOZ_ASSERT(aBytes[aLength] == '\0');
75   }
76 
77   using Base::operator=;
78 
c_str()79   char* c_str() { return reinterpret_cast<char*>(get()); }
80 };
81 
82 class UTF8Chars : public mozilla::Range<unsigned char> {
83   typedef mozilla::Range<unsigned char> Base;
84 
85  public:
86   using CharT = unsigned char;
87 
88   UTF8Chars() = default;
UTF8Chars(char * aBytes,size_t aLength)89   UTF8Chars(char* aBytes, size_t aLength)
90       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
UTF8Chars(const char * aBytes,size_t aLength)91   UTF8Chars(const char* aBytes, size_t aLength)
92       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
93              aLength) {}
UTF8Chars(mozilla::Utf8Unit * aUnits,size_t aLength)94   UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
95       : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
UTF8Chars(const mozilla::Utf8Unit * aUnits,size_t aLength)96   UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
97       : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
98 };
99 
100 /*
101  * Similar to UTF8Chars, but contains WTF-8.
102  * https://simonsapin.github.io/wtf-8/
103  */
104 class WTF8Chars : public mozilla::Range<unsigned char> {
105   typedef mozilla::Range<unsigned char> Base;
106 
107  public:
108   using CharT = unsigned char;
109 
110   WTF8Chars() = default;
WTF8Chars(char * aBytes,size_t aLength)111   WTF8Chars(char* aBytes, size_t aLength)
112       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
WTF8Chars(const char * aBytes,size_t aLength)113   WTF8Chars(const char* aBytes, size_t aLength)
114       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
115              aLength) {}
116 };
117 
118 /*
119  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
120  */
121 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
122   typedef mozilla::RangedPtr<unsigned char> Base;
123 
124  public:
125   using CharT = unsigned char;
126 
UTF8CharsZ()127   UTF8CharsZ() : Base(nullptr, 0) {}  // NOLINT
128 
UTF8CharsZ(char * aBytes,size_t aLength)129   UTF8CharsZ(char* aBytes, size_t aLength)
130       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
131     MOZ_ASSERT(aBytes[aLength] == '\0');
132   }
133 
UTF8CharsZ(unsigned char * aBytes,size_t aLength)134   UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
135     MOZ_ASSERT(aBytes[aLength] == '\0');
136   }
137 
UTF8CharsZ(mozilla::Utf8Unit * aUnits,size_t aLength)138   UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
139       : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
140 
141   using Base::operator=;
142 
c_str()143   char* c_str() { return reinterpret_cast<char*>(get()); }
144 };
145 
146 /*
147  * A wrapper for a "const char*" that is encoded using UTF-8.
148  * This class does not manage ownership of the data; that is left
149  * to others.  This differs from UTF8CharsZ in that the chars are
150  * const and it disallows assignment.
151  */
152 class JS_PUBLIC_API ConstUTF8CharsZ {
153   const char* data_;
154 
155  public:
156   using CharT = unsigned char;
157 
ConstUTF8CharsZ()158   ConstUTF8CharsZ() : data_(nullptr) {}
159 
ConstUTF8CharsZ(const char * aBytes,size_t aLength)160   ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
161     MOZ_ASSERT(aBytes[aLength] == '\0');
162 #ifdef DEBUG
163     validate(aLength);
164 #endif
165   }
166 
get()167   const void* get() const { return data_; }
168 
c_str()169   const char* c_str() const { return data_; }
170 
171   explicit operator bool() const { return data_ != nullptr; }
172 
173  private:
174 #ifdef DEBUG
175   void validate(size_t aLength);
176 #endif
177 };
178 
179 /*
180  * SpiderMonkey uses a 2-byte character representation: it is a
181  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
182  * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
183  * sufficiently dedicated JavaScript program to be fully unicode-aware by
184  * manually interpreting UTF-16 extension characters embedded in the JS
185  * string.
186  */
187 class TwoByteChars : public mozilla::Range<char16_t> {
188   typedef mozilla::Range<char16_t> Base;
189 
190  public:
191   using CharT = char16_t;
192 
193   TwoByteChars() = default;
TwoByteChars(char16_t * aChars,size_t aLength)194   TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)195   TwoByteChars(const char16_t* aChars, size_t aLength)
196       : Base(const_cast<char16_t*>(aChars), aLength) {}
197 };
198 
199 /*
200  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
201  */
202 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
203   typedef mozilla::RangedPtr<char16_t> Base;
204 
205  public:
206   using CharT = char16_t;
207 
TwoByteCharsZ()208   TwoByteCharsZ() : Base(nullptr, 0) {}  // NOLINT
209 
TwoByteCharsZ(char16_t * chars,size_t length)210   TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
211     MOZ_ASSERT(chars[length] == '\0');
212   }
213 
214   using Base::operator=;
215 };
216 
217 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
218 
219 /*
220  * Like TwoByteChars, but the chars are const.
221  */
222 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
223   typedef mozilla::Range<const char16_t> Base;
224 
225  public:
226   using CharT = char16_t;
227 
228   ConstTwoByteChars() = default;
ConstTwoByteChars(const char16_t * aChars,size_t aLength)229   ConstTwoByteChars(const char16_t* aChars, size_t aLength)
230       : Base(aChars, aLength) {}
231 };
232 
/*
 * Convert a 2-byte character sequence to "ISO-Latin-1" by truncating each
 * 2-byte unit in the sequence to a single byte. If the source contains any
 * UTF-16 extension characters, then this may give invalid Latin1 output.
 *
 * The returned string is zero terminated. The returned string or the
 * returned string's |start()| must be freed with JS_free or js_free,
 * respectively.
 * NOTE(review): |start()| may be a stale name -- elsewhere in this header the
 * raw pointer of a Latin1CharsZ is obtained with |get()|; confirm.
 *
 * If allocation fails, an OOM error will be set and the method will return a
 * nullptr chars (which can be tested for with the ! operator).
 *
 * This method cannot trigger GC.
 */
extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
    JSContext* cx, const mozilla::Range<const char16_t> tbchars);
245 
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const char16_t * begin,size_t length)246 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
247                                                        const char16_t* begin,
248                                                        size_t length) {
249   const mozilla::Range<const char16_t> tbchars(begin, length);
250   return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
251 }
252 
/*
 * Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): going by the name and signature, this presumably produces a
 * newly allocated, null-terminated UTF-8 copy of |chars|, and |maybeCx|
 * presumably may be null -- confirm both against the implementation.
 */
template <typename CharT>
extern UTF8CharsZ CharsToNewUTF8CharsZ(JSContext* maybeCx,
                                       const mozilla::Range<CharT> chars);
256 
/*
 * NOTE(review): undocumented in this header. Going by the name and signature,
 * this presumably decodes the single code point encoded by the |utf8Length|
 * bytes at |utf8Buffer| and returns it as a UCS-4 value -- confirm against
 * the implementation, including what is returned for malformed input.
 */
JS_PUBLIC_API uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
                                         int utf8Length);
259 
/*
 * Inflate bytes in UTF-8 encoding to char16_t.
 * - On error, returns an empty TwoByteCharsZ.
 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
 *   its length; the length value excludes the trailing null.
 *
 * NOTE(review): |destArenaId| presumably selects the arena the result is
 * allocated from -- confirm against the implementation.
 */
extern JS_PUBLIC_API TwoByteCharsZ
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
                            arena_id_t destArenaId);
269 
/*
 * Like UTF8CharsToNewTwoByteCharsZ, but the input is a WTF8Chars (WTF-8)
 * rather than a UTF8Chars.
 */
extern JS_PUBLIC_API TwoByteCharsZ
WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen,
                            arena_id_t destArenaId);
276 
/*
 * Like UTF8CharsToNewTwoByteCharsZ above, but the input is a ConstUTF8CharsZ,
 * i.e. a null-terminated const UTF-8 string passed without an explicit
 * length.
 */
extern JS_PUBLIC_API TwoByteCharsZ
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
                            size_t* outlen, arena_id_t destArenaId);
283 
/*
 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
 * characters will be replaced by \uFFFD. No exception will be thrown for
 * malformed UTF-8 input.
 *
 * Two overloads are declared: one taking a UTF8Chars range and one taking a
 * null-terminated ConstUTF8CharsZ.
 */
extern JS_PUBLIC_API TwoByteCharsZ
LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8,
                                 size_t* outlen, arena_id_t destArenaId);

extern JS_PUBLIC_API TwoByteCharsZ
LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
                                 size_t* outlen, arena_id_t destArenaId);
296 
/*
 * Returns the length, in bytes, of the char buffer required to encode |s| as
 * UTF-8. The returned length does not include the null-terminator.
 */
JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);
302 
/*
 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
 * exhausted or too little space is available in |dst| to fit the next scalar
 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Returns the
 * number of bytes of |dst| that were filled.
 *
 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
 * linear.
 *
 * Sizing |dst|: given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
 * - if |JS_StringHasLatin1Chars(str)|, then |src| is always fully converted
 *   if |dst.Length() >= JS_GetStringLength(str) * 2|;
 * - otherwise |src| is always fully converted if
 *   |dst.Length() >= JS_GetStringLength(str) * 3|.
 *
 * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
 */
JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
                                               mozilla::Span<char> dst);
321 
/*
 * Names the smallest character encoding that can fully represent a
 * particular string.
 */
enum class SmallestEncoding {
  ASCII,
  Latin1,
  UTF16
};
327 
/*
 * Returns the smallest encoding possible for the given string:
 * - ASCII if all codepoints are <128,
 * - otherwise Latin1 if all codepoints are <256,
 * - else UTF16.
 */
JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(UTF8Chars utf8);
334 
/*
 * Return a null-terminated Latin-1 string copied from the input string,
 * storing its length (excluding null terminator) in |*outlen|.
 *
 * Fails and reports an error if the string contains non-Latin-1 codepoints.
 * Returns Latin1CharsZ() on failure.
 *
 * NOTE(review): |destArenaId| presumably selects the arena the result is
 * allocated from -- confirm against the implementation.
 */
extern JS_PUBLIC_API Latin1CharsZ
UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
                           arena_id_t destArenaId);
344 
/*
 * Return a null-terminated Latin-1 string copied from the input string,
 * storing its length (excluding null terminator) in |*outlen|.
 *
 * Non-Latin-1 codepoints are replaced by '?' rather than causing a failure.
 * Returns Latin1CharsZ() on failure.
 */
extern JS_PUBLIC_API Latin1CharsZ
LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
                                size_t* outlen, arena_id_t destArenaId);
353 
/*
 * Returns true if all characters in the given null-terminated string |s| are
 * ASCII, i.e. < 0x80, and false otherwise.
 */
extern JS_PUBLIC_API bool StringIsASCII(const char* s);
359 
/*
 * Returns true if all characters in the given span |s| are ASCII,
 * i.e. < 0x80, and false otherwise. Unlike the |const char*| overload, the
 * input is delimited by the span rather than by a null terminator.
 */
extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
365 
366 }  // namespace JS
367 
JS_free(JS::Latin1CharsZ & ptr)368 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)369 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
370 
/**
 * DEPRECATED
 *
 * Allocate memory sufficient to contain the characters of |str| truncated to
 * Latin-1 and a trailing null terminator, fill the memory with the characters
 * interpreted in that manner plus the null terminator, and return a pointer to
 * the memory (as a JS::UniqueChars).
 *
 * This function *loses information* when it copies the characters of |str| if
 * |str| contains code units greater than 0xFF.  Additionally, users that
 * depend on null-termination will misinterpret the copied characters if |str|
 * contains any nulls.  Avoid using this function if possible, because it will
 * eventually be removed.
 *
 * NOTE(review): the result on failure (e.g. allocation failure) is not stated
 * here -- confirm against the implementation.
 */
extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
                                                             JSString* str);
387 
/**
 * DEPRECATED
 *
 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
 * Note that |str| is taken as a JS::Handle<JSString*> here, not a raw
 * JSString*.
 *
 * This function *loses information* when it copies the characters of |str| if
 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
 * instead.
 *
 * The returned string is also subject to misinterpretation if |str| contains
 * any nulls (which are faithfully transcribed into the returned string, but
 * which will implicitly truncate the string if it's passed to functions that
 * expect null-terminated strings).
 *
 * Avoid using this function if possible, because we'll remove it once we can
 * devise a better API for the task.
 */
extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
    JSContext* cx, JS::Handle<JSString*> str);
407 
/**
 * DEPRECATED
 *
 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
 *
 * This function asserts in debug mode that the input string contains only
 * ASCII characters.
 * NOTE(review): behavior for non-ASCII input in non-debug builds is not
 * stated here -- confirm against the implementation.
 *
 * The returned string is also subject to misinterpretation if |str| contains
 * any nulls (which are faithfully transcribed into the returned string, but
 * which will implicitly truncate the string if it's passed to functions that
 * expect null-terminated strings).
 *
 * Avoid using this function if possible, because we'll remove it once we can
 * devise a better API for the task.
 */
extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
                                                            JSString* str);
426 
427 #endif /* js_CharacterEncoding_h */
428