1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
12 #include "mozilla/Utf8.h"
13
14 #include "js/TypeDecls.h"
15 #include "js/Utility.h"
16
17 class JSLinearString;
18
19 namespace JS {
20
21 /*
22 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
23 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
24 * byte is treated as a 2-byte character, and there is no way to pass in a
25 * string containing characters beyond U+00FF.
26 */
27 class Latin1Chars : public mozilla::Range<Latin1Char> {
28 typedef mozilla::Range<Latin1Char> Base;
29
30 public:
31 using CharT = Latin1Char;
32
33 Latin1Chars() = default;
Latin1Chars(char * aBytes,size_t aLength)34 Latin1Chars(char* aBytes, size_t aLength)
35 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)36 Latin1Chars(const Latin1Char* aBytes, size_t aLength)
37 : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const char * aBytes,size_t aLength)38 Latin1Chars(const char* aBytes, size_t aLength)
39 : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
40 aLength) {}
41 };
42
43 /*
44 * Like Latin1Chars, but the chars are const.
45 */
46 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
47 typedef mozilla::Range<const Latin1Char> Base;
48
49 public:
50 using CharT = Latin1Char;
51
52 ConstLatin1Chars() = default;
ConstLatin1Chars(const Latin1Char * aChars,size_t aLength)53 ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
54 : Base(aChars, aLength) {}
55 };
56
57 /*
58 * A Latin1Chars, but with \0 termination for C compatibility.
59 */
60 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
61 typedef mozilla::RangedPtr<Latin1Char> Base;
62
63 public:
64 using CharT = Latin1Char;
65
Latin1CharsZ()66 Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT
67
Latin1CharsZ(char * aBytes,size_t aLength)68 Latin1CharsZ(char* aBytes, size_t aLength)
69 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
70 MOZ_ASSERT(aBytes[aLength] == '\0');
71 }
72
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)73 Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
74 MOZ_ASSERT(aBytes[aLength] == '\0');
75 }
76
77 using Base::operator=;
78
c_str()79 char* c_str() { return reinterpret_cast<char*>(get()); }
80 };
81
82 class UTF8Chars : public mozilla::Range<unsigned char> {
83 typedef mozilla::Range<unsigned char> Base;
84
85 public:
86 using CharT = unsigned char;
87
88 UTF8Chars() = default;
UTF8Chars(char * aBytes,size_t aLength)89 UTF8Chars(char* aBytes, size_t aLength)
90 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
UTF8Chars(const char * aBytes,size_t aLength)91 UTF8Chars(const char* aBytes, size_t aLength)
92 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
93 aLength) {}
UTF8Chars(mozilla::Utf8Unit * aUnits,size_t aLength)94 UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
95 : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
UTF8Chars(const mozilla::Utf8Unit * aUnits,size_t aLength)96 UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
97 : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
98 };
99
100 /*
101 * Similar to UTF8Chars, but contains WTF-8.
102 * https://simonsapin.github.io/wtf-8/
103 */
104 class WTF8Chars : public mozilla::Range<unsigned char> {
105 typedef mozilla::Range<unsigned char> Base;
106
107 public:
108 using CharT = unsigned char;
109
110 WTF8Chars() = default;
WTF8Chars(char * aBytes,size_t aLength)111 WTF8Chars(char* aBytes, size_t aLength)
112 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
WTF8Chars(const char * aBytes,size_t aLength)113 WTF8Chars(const char* aBytes, size_t aLength)
114 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
115 aLength) {}
116 };
117
118 /*
119 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
120 */
121 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
122 typedef mozilla::RangedPtr<unsigned char> Base;
123
124 public:
125 using CharT = unsigned char;
126
UTF8CharsZ()127 UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT
128
UTF8CharsZ(char * aBytes,size_t aLength)129 UTF8CharsZ(char* aBytes, size_t aLength)
130 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
131 MOZ_ASSERT(aBytes[aLength] == '\0');
132 }
133
UTF8CharsZ(unsigned char * aBytes,size_t aLength)134 UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
135 MOZ_ASSERT(aBytes[aLength] == '\0');
136 }
137
UTF8CharsZ(mozilla::Utf8Unit * aUnits,size_t aLength)138 UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
139 : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
140
141 using Base::operator=;
142
c_str()143 char* c_str() { return reinterpret_cast<char*>(get()); }
144 };
145
146 /*
147 * A wrapper for a "const char*" that is encoded using UTF-8.
148 * This class does not manage ownership of the data; that is left
149 * to others. This differs from UTF8CharsZ in that the chars are
150 * const and it disallows assignment.
151 */
152 class JS_PUBLIC_API ConstUTF8CharsZ {
153 const char* data_;
154
155 public:
156 using CharT = unsigned char;
157
ConstUTF8CharsZ()158 ConstUTF8CharsZ() : data_(nullptr) {}
159
ConstUTF8CharsZ(const char * aBytes,size_t aLength)160 ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
161 MOZ_ASSERT(aBytes[aLength] == '\0');
162 #ifdef DEBUG
163 validate(aLength);
164 #endif
165 }
166
get()167 const void* get() const { return data_; }
168
c_str()169 const char* c_str() const { return data_; }
170
171 explicit operator bool() const { return data_ != nullptr; }
172
173 private:
174 #ifdef DEBUG
175 void validate(size_t aLength);
176 #endif
177 };
178
179 /*
180 * SpiderMonkey uses a 2-byte character representation: it is a
181 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
182 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
183 * sufficiently dedicated JavaScript program to be fully unicode-aware by
184 * manually interpreting UTF-16 extension characters embedded in the JS
185 * string.
186 */
187 class TwoByteChars : public mozilla::Range<char16_t> {
188 typedef mozilla::Range<char16_t> Base;
189
190 public:
191 using CharT = char16_t;
192
193 TwoByteChars() = default;
TwoByteChars(char16_t * aChars,size_t aLength)194 TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)195 TwoByteChars(const char16_t* aChars, size_t aLength)
196 : Base(const_cast<char16_t*>(aChars), aLength) {}
197 };
198
199 /*
200 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
201 */
202 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
203 typedef mozilla::RangedPtr<char16_t> Base;
204
205 public:
206 using CharT = char16_t;
207
TwoByteCharsZ()208 TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT
209
TwoByteCharsZ(char16_t * chars,size_t length)210 TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
211 MOZ_ASSERT(chars[length] == '\0');
212 }
213
214 using Base::operator=;
215 };
216
217 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
218
219 /*
220 * Like TwoByteChars, but the chars are const.
221 */
222 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
223 typedef mozilla::Range<const char16_t> Base;
224
225 public:
226 using CharT = char16_t;
227
228 ConstTwoByteChars() = default;
ConstTwoByteChars(const char16_t * aChars,size_t aLength)229 ConstTwoByteChars(const char16_t* aChars, size_t aLength)
230 : Base(aChars, aLength) {}
231 };
232
233 /*
234 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
235 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
236 * contains any UTF-16 extension characters, then this may give invalid Latin1
237 * output. The returned string is zero terminated. The returned string or the
238 * returned string's |start()| must be freed with JS_free or js_free,
239 * respectively. If allocation fails, an OOM error will be set and the method
240 * will return a nullptr chars (which can be tested for with the ! operator).
241 * This method cannot trigger GC.
242 */
243 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
244 JSContext* cx, const mozilla::Range<const char16_t> tbchars);
245
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const char16_t * begin,size_t length)246 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
247 const char16_t* begin,
248 size_t length) {
249 const mozilla::Range<const char16_t> tbchars(begin, length);
250 return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
251 }
252
253 template <typename CharT>
254 extern UTF8CharsZ CharsToNewUTF8CharsZ(JSContext* maybeCx,
255 const mozilla::Range<CharT> chars);
256
257 JS_PUBLIC_API uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
258 int utf8Length);
259
260 /*
261 * Inflate bytes in UTF-8 encoding to char16_t.
262 * - On error, returns an empty TwoByteCharsZ.
263 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
264 * its length; the length value excludes the trailing null.
265 */
266 extern JS_PUBLIC_API TwoByteCharsZ
267 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
268 arena_id_t destArenaId);
269
270 /*
271 * Like UTF8CharsToNewTwoByteCharsZ, but for WTF8Chars.
272 */
273 extern JS_PUBLIC_API TwoByteCharsZ
274 WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen,
275 arena_id_t destArenaId);
276
277 /*
278 * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
279 */
280 extern JS_PUBLIC_API TwoByteCharsZ
281 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
282 size_t* outlen, arena_id_t destArenaId);
283
284 /*
285 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
286 * characters will be replaced by \uFFFD. No exception will be thrown for
287 * malformed UTF-8 input.
288 */
289 extern JS_PUBLIC_API TwoByteCharsZ
290 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8,
291 size_t* outlen, arena_id_t destArenaId);
292
293 extern JS_PUBLIC_API TwoByteCharsZ
294 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
295 size_t* outlen, arena_id_t destArenaId);
296
297 /*
298 * Returns the length of the char buffer required to encode |s| as UTF8.
299 * Does not include the null-terminator.
300 */
301 JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);
302
303 /*
304 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
305 * exhausted or too little space is available in |dst| to fit the scalar
306 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return
307 * the number of bytes of |dst| that were filled.
308 *
309 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
310 * linear.
311 *
312 * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
313 * if |JS_StringHasLatin1Chars(str)|, then |src| is always fully converted
314 * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is
315 * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|.
316 *
317 * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
318 */
319 JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
320 mozilla::Span<char> dst);
321
/*
 * Names the smallest character encoding that is capable of fully
 * representing a particular string.
 */
enum class SmallestEncoding {
  ASCII,
  Latin1,
  UTF16,
};
327
328 /*
329 * Returns the smallest encoding possible for the given string: if all
330 * codepoints are <128 then ASCII, otherwise if all codepoints are <256
331 * Latin-1, else UTF16.
332 */
333 JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(UTF8Chars utf8);
334
335 /*
336 * Return a null-terminated Latin-1 string copied from the input string,
337 * storing its length (excluding null terminator) in |*outlen|. Fail and
338 * report an error if the string contains non-Latin-1 codepoints. Returns
339 * Latin1CharsZ() on failure.
340 */
341 extern JS_PUBLIC_API Latin1CharsZ
342 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
343 arena_id_t destArenaId);
344
345 /*
346 * Return a null-terminated Latin-1 string copied from the input string,
347 * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1
348 * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure.
349 */
350 extern JS_PUBLIC_API Latin1CharsZ
351 LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
352 size_t* outlen, arena_id_t destArenaId);
353
354 /*
355 * Returns true if all characters in the given null-terminated string are
356 * ASCII, i.e. < 0x80, false otherwise.
357 */
358 extern JS_PUBLIC_API bool StringIsASCII(const char* s);
359
360 /*
361 * Returns true if all characters in the given span are ASCII,
362 * i.e. < 0x80, false otherwise.
363 */
364 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
365
366 } // namespace JS
367
JS_free(JS::Latin1CharsZ & ptr)368 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)369 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
370
371 /**
372 * DEPRECATED
373 *
374 * Allocate memory sufficient to contain the characters of |str| truncated to
375 * Latin-1 and a trailing null terminator, fill the memory with the characters
376 * interpreted in that manner plus the null terminator, and return a pointer to
377 * the memory.
378 *
379 * This function *loses information* when it copies the characters of |str| if
380 * |str| contains code units greater than 0xFF. Additionally, users that
381 * depend on null-termination will misinterpret the copied characters if |str|
382 * contains any nulls. Avoid using this function if possible, because it will
383 * eventually be removed.
384 */
385 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
386 JSString* str);
387
388 /**
389 * DEPRECATED
390 *
391 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
392 *
393 * This function *loses information* when it copies the characters of |str| if
394 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
395 * instead.
396 *
397 * The returned string is also subject to misinterpretation if |str| contains
398 * any nulls (which are faithfully transcribed into the returned string, but
399 * which will implicitly truncate the string if it's passed to functions that
400 * expect null-terminated strings).
401 *
402 * Avoid using this function if possible, because we'll remove it once we can
403 * devise a better API for the task.
404 */
405 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
406 JSContext* cx, JS::Handle<JSString*> str);
407
408 /**
409 * DEPRECATED
410 *
411 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
412 *
413 * This function asserts in debug mode that the input string contains only
414 * ASCII characters.
415 *
416 * The returned string is also subject to misinterpretation if |str| contains
417 * any nulls (which are faithfully transcribed into the returned string, but
418 * which will implicitly truncate the string if it's passed to functions that
419 * expect null-terminated strings).
420 *
421 * Avoid using this function if possible, because we'll remove it once we can
422 * devise a better API for the task.
423 */
424 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
425 JSString* str);
426
427 #endif /* js_CharacterEncoding_h */
428