1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
12
13 #include "js/TypeDecls.h"
14 #include "js/Utility.h"
15
16 class JSLinearString;
17
18 namespace mozilla {
19 union Utf8Unit;
20 }
21
22 namespace JS {
23
24 /*
25 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
26 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
27 * byte is treated as a 2-byte character, and there is no way to pass in a
28 * string containing characters beyond U+00FF.
29 */
30 class Latin1Chars : public mozilla::Range<Latin1Char> {
31 typedef mozilla::Range<Latin1Char> Base;
32
33 public:
34 using CharT = Latin1Char;
35
36 Latin1Chars() = default;
Latin1Chars(char * aBytes,size_t aLength)37 Latin1Chars(char* aBytes, size_t aLength)
38 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)39 Latin1Chars(const Latin1Char* aBytes, size_t aLength)
40 : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const char * aBytes,size_t aLength)41 Latin1Chars(const char* aBytes, size_t aLength)
42 : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
43 aLength) {}
44 };
45
46 /*
47 * Like Latin1Chars, but the chars are const.
48 */
49 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
50 typedef mozilla::Range<const Latin1Char> Base;
51
52 public:
53 using CharT = Latin1Char;
54
55 ConstLatin1Chars() = default;
ConstLatin1Chars(const Latin1Char * aChars,size_t aLength)56 ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
57 : Base(aChars, aLength) {}
58 };
59
60 /*
61 * A Latin1Chars, but with \0 termination for C compatibility.
62 */
63 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
64 typedef mozilla::RangedPtr<Latin1Char> Base;
65
66 public:
67 using CharT = Latin1Char;
68
Latin1CharsZ()69 Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT
70
Latin1CharsZ(char * aBytes,size_t aLength)71 Latin1CharsZ(char* aBytes, size_t aLength)
72 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
73 MOZ_ASSERT(aBytes[aLength] == '\0');
74 }
75
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)76 Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
77 MOZ_ASSERT(aBytes[aLength] == '\0');
78 }
79
80 using Base::operator=;
81
c_str()82 char* c_str() { return reinterpret_cast<char*>(get()); }
83 };
84
85 class UTF8Chars : public mozilla::Range<unsigned char> {
86 typedef mozilla::Range<unsigned char> Base;
87
88 public:
89 using CharT = unsigned char;
90
91 UTF8Chars() = default;
UTF8Chars(char * aBytes,size_t aLength)92 UTF8Chars(char* aBytes, size_t aLength)
93 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
UTF8Chars(const char * aBytes,size_t aLength)94 UTF8Chars(const char* aBytes, size_t aLength)
95 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
96 aLength) {}
UTF8Chars(mozilla::Utf8Unit * aUnits,size_t aLength)97 UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
98 : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
UTF8Chars(const mozilla::Utf8Unit * aUnits,size_t aLength)99 UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
100 : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
101 };
102
103 /*
104 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
105 */
106 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
107 typedef mozilla::RangedPtr<unsigned char> Base;
108
109 public:
110 using CharT = unsigned char;
111
UTF8CharsZ()112 UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT
113
UTF8CharsZ(char * aBytes,size_t aLength)114 UTF8CharsZ(char* aBytes, size_t aLength)
115 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
116 MOZ_ASSERT(aBytes[aLength] == '\0');
117 }
118
UTF8CharsZ(unsigned char * aBytes,size_t aLength)119 UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
120 MOZ_ASSERT(aBytes[aLength] == '\0');
121 }
122
UTF8CharsZ(mozilla::Utf8Unit * aUnits,size_t aLength)123 UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
124 : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
125
126 using Base::operator=;
127
c_str()128 char* c_str() { return reinterpret_cast<char*>(get()); }
129 };
130
131 /*
132 * A wrapper for a "const char*" that is encoded using UTF-8.
133 * This class does not manage ownership of the data; that is left
134 * to others. This differs from UTF8CharsZ in that the chars are
135 * const and it disallows assignment.
136 */
137 class JS_PUBLIC_API ConstUTF8CharsZ {
138 const char* data_;
139
140 public:
141 using CharT = unsigned char;
142
ConstUTF8CharsZ()143 ConstUTF8CharsZ() : data_(nullptr) {}
144
ConstUTF8CharsZ(const char * aBytes,size_t aLength)145 ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
146 MOZ_ASSERT(aBytes[aLength] == '\0');
147 #ifdef DEBUG
148 validate(aLength);
149 #endif
150 }
151
get()152 const void* get() const { return data_; }
153
c_str()154 const char* c_str() const { return data_; }
155
156 explicit operator bool() const { return data_ != nullptr; }
157
158 private:
159 #ifdef DEBUG
160 void validate(size_t aLength);
161 #endif
162 };
163
164 /*
165 * SpiderMonkey uses a 2-byte character representation: it is a
166 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
167 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
168 * sufficiently dedicated JavaScript program to be fully unicode-aware by
169 * manually interpreting UTF-16 extension characters embedded in the JS
170 * string.
171 */
172 class TwoByteChars : public mozilla::Range<char16_t> {
173 typedef mozilla::Range<char16_t> Base;
174
175 public:
176 using CharT = char16_t;
177
178 TwoByteChars() = default;
TwoByteChars(char16_t * aChars,size_t aLength)179 TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)180 TwoByteChars(const char16_t* aChars, size_t aLength)
181 : Base(const_cast<char16_t*>(aChars), aLength) {}
182 };
183
184 /*
185 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
186 */
187 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
188 typedef mozilla::RangedPtr<char16_t> Base;
189
190 public:
191 using CharT = char16_t;
192
TwoByteCharsZ()193 TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT
194
TwoByteCharsZ(char16_t * chars,size_t length)195 TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
196 MOZ_ASSERT(chars[length] == '\0');
197 }
198
199 using Base::operator=;
200 };
201
202 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
203
204 /*
205 * Like TwoByteChars, but the chars are const.
206 */
207 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
208 typedef mozilla::Range<const char16_t> Base;
209
210 public:
211 using CharT = char16_t;
212
213 ConstTwoByteChars() = default;
ConstTwoByteChars(const char16_t * aChars,size_t aLength)214 ConstTwoByteChars(const char16_t* aChars, size_t aLength)
215 : Base(aChars, aLength) {}
216 };
217
218 /*
219 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
220 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
221 * contains any UTF-16 extension characters, then this may give invalid Latin1
222 * output. The returned string is zero terminated. The returned string or the
223 * returned string's |start()| must be freed with JS_free or js_free,
224 * respectively. If allocation fails, an OOM error will be set and the method
225 * will return a nullptr chars (which can be tested for with the ! operator).
226 * This method cannot trigger GC.
227 */
228 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
229 JSContext* cx, const mozilla::Range<const char16_t> tbchars);
230
LossyTwoByteCharsToNewLatin1CharsZ(JSContext * cx,const char16_t * begin,size_t length)231 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
232 const char16_t* begin,
233 size_t length) {
234 const mozilla::Range<const char16_t> tbchars(begin, length);
235 return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
236 }
237
/*
 * Function template, declared here and defined elsewhere, that takes a range
 * of CharT and returns a UTF8CharsZ.
 * NOTE(review): judging by the name, the result is newly allocated and must
 * be released by the caller -- confirm against the definition.
 */
template <typename CharT>
extern UTF8CharsZ CharsToNewUTF8CharsZ(JSContext* cx,
                                       const mozilla::Range<CharT> chars);

/*
 * NOTE(review): presumably decodes the single code point encoded by the
 * |utf8Length| bytes at |utf8Buffer| and returns it as a UCS-4 value. The
 * behavior on malformed input is not visible from this header -- confirm.
 */
JS_PUBLIC_API uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
                                         int utf8Length);
244
/*
 * Inflate bytes in UTF-8 encoding to char16_t.
 * - On error, returns an empty TwoByteCharsZ.
 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
 *   its length; the length value excludes the trailing null.
 *
 * NOTE(review): |destArenaId| presumably selects the arena the result is
 * allocated from -- confirm against the definition.
 */
extern JS_PUBLIC_API TwoByteCharsZ
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
                            arena_id_t destArenaId);

/*
 * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
 */
extern JS_PUBLIC_API TwoByteCharsZ
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
                            size_t* outlen, arena_id_t destArenaId);

/*
 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
 * characters will be replaced by \uFFFD. No exception will be thrown for
 * malformed UTF-8 input.
 */
extern JS_PUBLIC_API TwoByteCharsZ
LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8,
                                 size_t* outlen, arena_id_t destArenaId);

/*
 * Like LossyUTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
 */
extern JS_PUBLIC_API TwoByteCharsZ
LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
                                 size_t* outlen, arena_id_t destArenaId);
274
/*
 * Returns the length of the char buffer required to encode |s| as UTF8.
 * Does not include the null-terminator.
 */
JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);

/*
 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
 * exhausted or too little space is available in |dst| to fit the scalar
 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return
 * the number of bytes of |dst| that were filled.
 *
 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
 * linear.
 *
 * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|:
 * - if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted
 *   if |dst.Length() >= JS_GetStringLength(str) * 2|;
 * - otherwise |src| is always fully converted if
 *   |dst.Length() >= JS_GetStringLength(str) * 3|.
 *
 * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
 */
JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
                                               mozilla::Span<char> dst);
299
/*
 * The smallest character encoding capable of fully representing a particular
 * string.
 */
enum class SmallestEncoding { ASCII, Latin1, UTF16 };

/*
 * Returns the smallest encoding possible for the given string: ASCII if all
 * codepoints are <128, otherwise Latin1 if all codepoints are <256, else
 * UTF16.
 */
JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(UTF8Chars utf8);
312
/*
 * Return a null-terminated Latin-1 string copied from the input string,
 * storing its length (excluding null terminator) in |*outlen|. Fail and
 * report an error if the string contains non-Latin-1 codepoints. Returns
 * Latin1CharsZ() on failure.
 *
 * NOTE(review): |destArenaId| presumably selects the arena the result is
 * allocated from -- confirm against the definition.
 */
extern JS_PUBLIC_API Latin1CharsZ
UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen,
                           arena_id_t destArenaId);

/*
 * Return a null-terminated Latin-1 string copied from the input string,
 * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1
 * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure.
 */
extern JS_PUBLIC_API Latin1CharsZ
LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
                                size_t* outlen, arena_id_t destArenaId);
331
/*
 * Returns true if all characters in the given null-terminated string are
 * ASCII, i.e. < 0x80, false otherwise.
 */
extern JS_PUBLIC_API bool StringIsASCII(const char* s);

/*
 * Returns true if all characters in the given span are ASCII,
 * i.e. < 0x80, false otherwise. Unlike the overload taking |const char*|,
 * the extent of the input is given by the span.
 */
extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
343
344 } // namespace JS
345
JS_free(JS::Latin1CharsZ & ptr)346 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)347 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
348
/**
 * DEPRECATED
 *
 * Allocate memory sufficient to contain the characters of |str| truncated to
 * Latin-1 and a trailing null terminator, fill the memory with the characters
 * interpreted in that manner plus the null terminator, and return a pointer to
 * the memory.
 *
 * This function *loses information* when it copies the characters of |str| if
 * |str| contains code units greater than 0xFF.  Additionally, users that
 * depend on null-termination will misinterpret the copied characters if |str|
 * contains any nulls.  Avoid using this function if possible, because it will
 * eventually be removed.
 *
 * NOTE(review): the failure behavior (e.g. what is returned if allocation
 * fails) is not stated here -- confirm against the definition.
 */
extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
                                                             JSString* str);

/**
 * DEPRECATED
 *
 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
 *
 * This function *loses information* when it copies the characters of |str| if
 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
 * instead.
 *
 * The returned string is also subject to misinterpretation if |str| contains
 * any nulls (which are faithfully transcribed into the returned string, but
 * which will implicitly truncate the string if it's passed to functions that
 * expect null-terminated strings).
 *
 * Avoid using this function if possible, because we'll remove it once we can
 * devise a better API for the task.
 */
extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
    JSContext* cx, JS::Handle<JSString*> str);

/**
 * DEPRECATED
 *
 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
 *
 * This function asserts in debug mode that the input string contains only
 * ASCII characters.
 *
 * The returned string is also subject to misinterpretation if |str| contains
 * any nulls (which are faithfully transcribed into the returned string, but
 * which will implicitly truncate the string if it's passed to functions that
 * expect null-terminated strings).
 *
 * Avoid using this function if possible, because we'll remove it once we can
 * devise a better API for the task.
 */
extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
                                                            JSString* str);
404
405 #endif /* js_CharacterEncoding_h */
406