1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9 
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
12 
13 #include "js/TypeDecls.h"
14 #include "js/Utility.h"
15 
16 class JSLinearString;
17 
18 namespace mozilla {
19 union Utf8Unit;
20 }
21 
22 namespace JS {
23 
24 /*
25  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
26  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
27  * byte is treated as a 2-byte character, and there is no way to pass in a
28  * string containing characters beyond U+00FF.
29  */
30 class Latin1Chars : public mozilla::Range<Latin1Char> {
31   typedef mozilla::Range<Latin1Char> Base;
32 
33  public:
34   using CharT = Latin1Char;
35 
36   Latin1Chars() = default;
Latin1Chars(char * aBytes,size_t aLength)37   Latin1Chars(char* aBytes, size_t aLength)
38       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)39   Latin1Chars(const Latin1Char* aBytes, size_t aLength)
40       : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const char * aBytes,size_t aLength)41   Latin1Chars(const char* aBytes, size_t aLength)
42       : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
43              aLength) {}
44 };
45 
46 /*
47  * Like Latin1Chars, but the chars are const.
48  */
49 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
50   typedef mozilla::Range<const Latin1Char> Base;
51 
52  public:
53   using CharT = Latin1Char;
54 
55   ConstLatin1Chars() = default;
ConstLatin1Chars(const Latin1Char * aChars,size_t aLength)56   ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
57       : Base(aChars, aLength) {}
58 };
59 
60 /*
61  * A Latin1Chars, but with \0 termination for C compatibility.
62  */
63 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
64   typedef mozilla::RangedPtr<Latin1Char> Base;
65 
66  public:
67   using CharT = Latin1Char;
68 
Latin1CharsZ()69   Latin1CharsZ() : Base(nullptr, 0) {}  // NOLINT
70 
Latin1CharsZ(char * aBytes,size_t aLength)71   Latin1CharsZ(char* aBytes, size_t aLength)
72       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
73     MOZ_ASSERT(aBytes[aLength] == '\0');
74   }
75 
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)76   Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
77     MOZ_ASSERT(aBytes[aLength] == '\0');
78   }
79 
80   using Base::operator=;
81 
c_str()82   char* c_str() { return reinterpret_cast<char*>(get()); }
83 };
84 
85 class UTF8Chars : public mozilla::Range<unsigned char> {
86   typedef mozilla::Range<unsigned char> Base;
87 
88  public:
89   using CharT = unsigned char;
90 
91   UTF8Chars() = default;
UTF8Chars(char * aBytes,size_t aLength)92   UTF8Chars(char* aBytes, size_t aLength)
93       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
UTF8Chars(const char * aBytes,size_t aLength)94   UTF8Chars(const char* aBytes, size_t aLength)
95       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
96              aLength) {}
UTF8Chars(mozilla::Utf8Unit * aUnits,size_t aLength)97   UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
98       : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
UTF8Chars(const mozilla::Utf8Unit * aUnits,size_t aLength)99   UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
100       : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
101 };
102 
103 /*
104  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
105  */
106 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
107   typedef mozilla::RangedPtr<unsigned char> Base;
108 
109  public:
110   using CharT = unsigned char;
111 
UTF8CharsZ()112   UTF8CharsZ() : Base(nullptr, 0) {}  // NOLINT
113 
UTF8CharsZ(char * aBytes,size_t aLength)114   UTF8CharsZ(char* aBytes, size_t aLength)
115       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
116     MOZ_ASSERT(aBytes[aLength] == '\0');
117   }
118 
UTF8CharsZ(unsigned char * aBytes,size_t aLength)119   UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
120     MOZ_ASSERT(aBytes[aLength] == '\0');
121   }
122 
UTF8CharsZ(mozilla::Utf8Unit * aUnits,size_t aLength)123   UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
124       : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
125 
126   using Base::operator=;
127 
c_str()128   char* c_str() { return reinterpret_cast<char*>(get()); }
129 };
130 
131 /*
132  * A wrapper for a "const char*" that is encoded using UTF-8.
133  * This class does not manage ownership of the data; that is left
134  * to others.  This differs from UTF8CharsZ in that the chars are
135  * const and it disallows assignment.
136  */
137 class JS_PUBLIC_API ConstUTF8CharsZ {
138   const char* data_;
139 
140  public:
141   using CharT = unsigned char;
142 
ConstUTF8CharsZ()143   ConstUTF8CharsZ() : data_(nullptr) {}
144 
ConstUTF8CharsZ(const char * aBytes,size_t aLength)145   ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
146     MOZ_ASSERT(aBytes[aLength] == '\0');
147 #ifdef DEBUG
148     validate(aLength);
149 #endif
150   }
151 
get()152   const void* get() const { return data_; }
153 
c_str()154   const char* c_str() const { return data_; }
155 
156   explicit operator bool() const { return data_ != nullptr; }
157 
158  private:
159 #ifdef DEBUG
160   void validate(size_t aLength);
161 #endif
162 };
163 
164 /*
165  * SpiderMonkey uses a 2-byte character representation: it is a
166  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
167  * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
168  * sufficiently dedicated JavaScript program to be fully unicode-aware by
169  * manually interpreting UTF-16 extension characters embedded in the JS
170  * string.
171  */
172 class TwoByteChars : public mozilla::Range<char16_t> {
173   typedef mozilla::Range<char16_t> Base;
174 
175  public:
176   using CharT = char16_t;
177 
178   TwoByteChars() = default;
TwoByteChars(char16_t * aChars,size_t aLength)179   TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)180   TwoByteChars(const char16_t* aChars, size_t aLength)
181       : Base(const_cast<char16_t*>(aChars), aLength) {}
182 };
183 
184 /*
185  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
186  */
187 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
188   typedef mozilla::RangedPtr<char16_t> Base;
189 
190  public:
191   using CharT = char16_t;
192 
TwoByteCharsZ()193   TwoByteCharsZ() : Base(nullptr, 0) {}  // NOLINT
194 
TwoByteCharsZ(char16_t * chars,size_t length)195   TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
196     MOZ_ASSERT(chars[length] == '\0');
197   }
198 
199   using Base::operator=;
200 };
201 
202 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
203 
204 /*
205  * Like TwoByteChars, but the chars are const.
206  */
207 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
208   typedef mozilla::Range<const char16_t> Base;
209 
210  public:
211   using CharT = char16_t;
212 
213   ConstTwoByteChars() = default;
ConstTwoByteChars(const char16_t * aChars,size_t aLength)214   ConstTwoByteChars(const char16_t* aChars, size_t aLength)
215       : Base(aChars, aLength) {}
216 };
217 
218 /*
219  * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
220  * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
221  * contains any UTF-16 extension characters, then this may give invalid Latin1
222  * output. The returned string is zero terminated. The returned string or the
223  * returned string's |start()| must be freed with JS_free or js_free,
224  * respectively. If allocation fails, an OOM error will be set and the method
225  * will return a nullptr chars (which can be tested for with the ! operator).
226  * This method cannot trigger GC.
227  */
228 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
229     JSContext* cx, const mozilla::Range<const char16_t> tbchars);
230 
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const char16_t * begin,size_t length)231 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
232                                                        const char16_t* begin,
233                                                        size_t length) {
234   const mozilla::Range<const char16_t> tbchars(begin, length);
235   return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
236 }
237 
238 template <typename CharT>
239 extern UTF8CharsZ CharsToNewUTF8CharsZ(JSContext* cx,
240                                        const mozilla::Range<CharT> chars);
241 
/*
 * NOTE(review): no contract is documented in this header. Judging by the
 * name, this decodes the UTF-8 sequence at |utf8Buffer| (presumably
 * |utf8Length| bytes long) into a single UCS-4 code point -- confirm the
 * behavior on malformed input against the definition.
 */
JS_PUBLIC_API uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
                                         int utf8Length);
244 
245 /*
246  * Inflate bytes in UTF-8 encoding to char16_t.
247  * - On error, returns an empty TwoByteCharsZ.
248  * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
249  *   its length;  the length value excludes the trailing null.
250  */
251 extern JS_PUBLIC_API TwoByteCharsZ
252 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
253                             arena_id_t destArenaId);
254 
255 /*
256  * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
257  */
258 extern JS_PUBLIC_API TwoByteCharsZ
259 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
260                             size_t* outlen, arena_id_t destArenaId);
261 
262 /*
263  * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
264  * characters will be replaced by \uFFFD. No exception will be thrown for
265  * malformed UTF-8 input.
266  */
267 extern JS_PUBLIC_API TwoByteCharsZ
268 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8,
269                                  size_t* outlen, arena_id_t destArenaId);
270 
271 extern JS_PUBLIC_API TwoByteCharsZ
272 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
273                                  size_t* outlen, arena_id_t destArenaId);
274 
/*
 * Returns the length, in chars, of the buffer required to encode |s| as
 * UTF-8. The returned length does not include a null terminator.
 */
JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);
280 
/*
 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
 * exhausted or too little space is available in |dst| to fit the next scalar
 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Returns
 * the number of bytes of |dst| that were filled.
 *
 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
 * linear.
 *
 * Sizing |dst|: given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
 * - if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted
 *   when |dst.Length() >= JS_GetStringLength(str) * 2|;
 * - otherwise |src| is always fully converted when
 *   |dst.Length() >= JS_GetStringLength(str) * 3|.
 *
 * The exact space required is always |GetDeflatedUTF8StringLength(src)|.
 */
JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
                                               mozilla::Span<char> dst);
299 
/*
 * The smallest character encoding capable of fully representing a particular
 * string. See FindSmallestEncoding for how a value is chosen.
 */
enum class SmallestEncoding {
  ASCII,   // every code point is < 128
  Latin1,  // every code point is < 256
  UTF16    // anything else
};
305 
/*
 * Returns the smallest encoding possible for the given string:
 * - ASCII if all code points are < 128,
 * - otherwise Latin1 if all code points are < 256,
 * - otherwise UTF16.
 */
JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(UTF8Chars utf8);
312 
313 /*
314  * Return a null-terminated Latin-1 string copied from the input string,
315  * storing its length (excluding null terminator) in |*outlen|.  Fail and
316  * report an error if the string contains non-Latin-1 codepoints.  Returns
317  * Latin1CharsZ() on failure.
318  */
319 extern JS_PUBLIC_API Latin1CharsZ
320 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
321                            arena_id_t destArenaId);
322 
323 /*
324  * Return a null-terminated Latin-1 string copied from the input string,
325  * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
326  * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
327  */
328 extern JS_PUBLIC_API Latin1CharsZ
329 LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
330                                 size_t* outlen, arena_id_t destArenaId);
331 
332 /*
333  * Returns true if all characters in the given null-terminated string are
334  * ASCII, i.e. < 0x80, false otherwise.
335  */
336 extern JS_PUBLIC_API bool StringIsASCII(const char* s);
337 
338 /*
339  * Returns true if all characters in the given span are ASCII,
340  * i.e. < 0x80, false otherwise.
341  */
342 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
343 
344 }  // namespace JS
345 
JS_free(JS::Latin1CharsZ & ptr)346 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)347 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
348 
349 /**
350  * DEPRECATED
351  *
352  * Allocate memory sufficient to contain the characters of |str| truncated to
353  * Latin-1 and a trailing null terminator, fill the memory with the characters
354  * interpreted in that manner plus the null terminator, and return a pointer to
355  * the memory.
356  *
357  * This function *loses information* when it copies the characters of |str| if
358  * |str| contains code units greater than 0xFF.  Additionally, users that
359  * depend on null-termination will misinterpret the copied characters if |str|
360  * contains any nulls.  Avoid using this function if possible, because it will
361  * eventually be removed.
362  */
363 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
364                                                              JSString* str);
365 
366 /**
367  * DEPRECATED
368  *
369  * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
370  *
371  * This function *loses information* when it copies the characters of |str| if
372  * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
373  * instead.
374  *
375  * The returned string is also subject to misinterpretation if |str| contains
376  * any nulls (which are faithfully transcribed into the returned string, but
377  * which will implicitly truncate the string if it's passed to functions that
378  * expect null-terminated strings).
379  *
380  * Avoid using this function if possible, because we'll remove it once we can
381  * devise a better API for the task.
382  */
383 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
384     JSContext* cx, JS::Handle<JSString*> str);
385 
386 /**
387  * DEPRECATED
388  *
389  * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
390  *
391  * This function asserts in debug mode that the input string contains only
392  * ASCII characters.
393  *
394  * The returned string is also subject to misinterpretation if |str| contains
395  * any nulls (which are faithfully transcribed into the returned string, but
396  * which will implicitly truncate the string if it's passed to functions that
397  * expect null-terminated strings).
398  *
399  * Avoid using this function if possible, because we'll remove it once we can
400  * devise a better API for the task.
401  */
402 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
403                                                             JSString* str);
404 
405 #endif /* js_CharacterEncoding_h */
406