/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9
10 #include "mozilla/Range.h"
11
12 #include "js/TypeDecls.h"
13 #include "js/Utility.h"
14
// Forward declarations: this header only refers to these types through
// pointers, so their full definitions are not needed here.
namespace js {
class ExclusiveContext;
} // namespace js

class JSFlatString;
20
21 namespace JS {
22
23 /*
24 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
25 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
26 * byte is treated as a 2-byte character, and there is no way to pass in a
27 * string containing characters beyond U+00FF.
28 */
29 class Latin1Chars : public mozilla::Range<Latin1Char>
30 {
31 typedef mozilla::Range<Latin1Char> Base;
32
33 public:
34 using CharT = Latin1Char;
35
Latin1Chars()36 Latin1Chars() : Base() {}
Latin1Chars(char * aBytes,size_t aLength)37 Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)38 Latin1Chars(const Latin1Char* aBytes, size_t aLength)
39 : Base(const_cast<Latin1Char*>(aBytes), aLength)
40 {}
Latin1Chars(const char * aBytes,size_t aLength)41 Latin1Chars(const char* aBytes, size_t aLength)
42 : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
43 {}
44 };
45
46 /*
47 * A Latin1Chars, but with \0 termination for C compatibility.
48 */
49 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
50 {
51 typedef mozilla::RangedPtr<Latin1Char> Base;
52
53 public:
54 using CharT = Latin1Char;
55
Latin1CharsZ()56 Latin1CharsZ() : Base(nullptr, 0) {}
57
Latin1CharsZ(char * aBytes,size_t aLength)58 Latin1CharsZ(char* aBytes, size_t aLength)
59 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
60 {
61 MOZ_ASSERT(aBytes[aLength] == '\0');
62 }
63
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)64 Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
65 : Base(aBytes, aLength)
66 {
67 MOZ_ASSERT(aBytes[aLength] == '\0');
68 }
69
70 using Base::operator=;
71
c_str()72 char* c_str() { return reinterpret_cast<char*>(get()); }
73 };
74
75 class UTF8Chars : public mozilla::Range<unsigned char>
76 {
77 typedef mozilla::Range<unsigned char> Base;
78
79 public:
80 using CharT = unsigned char;
81
UTF8Chars()82 UTF8Chars() : Base() {}
UTF8Chars(char * aBytes,size_t aLength)83 UTF8Chars(char* aBytes, size_t aLength)
84 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
85 {}
UTF8Chars(const char * aBytes,size_t aLength)86 UTF8Chars(const char* aBytes, size_t aLength)
87 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
88 {}
89 };
90
91 /*
92 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
93 */
94 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
95 {
96 typedef mozilla::RangedPtr<unsigned char> Base;
97
98 public:
99 using CharT = unsigned char;
100
UTF8CharsZ()101 UTF8CharsZ() : Base(nullptr, 0) {}
102
UTF8CharsZ(char * aBytes,size_t aLength)103 UTF8CharsZ(char* aBytes, size_t aLength)
104 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
105 {
106 MOZ_ASSERT(aBytes[aLength] == '\0');
107 }
108
UTF8CharsZ(unsigned char * aBytes,size_t aLength)109 UTF8CharsZ(unsigned char* aBytes, size_t aLength)
110 : Base(aBytes, aLength)
111 {
112 MOZ_ASSERT(aBytes[aLength] == '\0');
113 }
114
115 using Base::operator=;
116
c_str()117 char* c_str() { return reinterpret_cast<char*>(get()); }
118 };
119
120 /*
121 * A wrapper for a "const char*" that is encoded using UTF-8.
122 * This class does not manage ownership of the data; that is left
123 * to others. This differs from UTF8CharsZ in that the chars are
124 * const and it allows assignment.
125 */
JS_PUBLIC_API(ConstUTF8CharsZ)126 class JS_PUBLIC_API(ConstUTF8CharsZ)
127 {
128 const char* data_;
129
130 public:
131 using CharT = unsigned char;
132
133 ConstUTF8CharsZ() : data_(nullptr)
134 {}
135
136 ConstUTF8CharsZ(const char* aBytes, size_t aLength)
137 : data_(aBytes)
138 {
139 MOZ_ASSERT(aBytes[aLength] == '\0');
140 #ifdef DEBUG
141 validate(aLength);
142 #endif
143 }
144
145 const void* get() const { return data_; }
146
147 const char* c_str() const { return data_; }
148
149 explicit operator bool() const { return data_ != nullptr; }
150
151 private:
152 #ifdef DEBUG
153 void validate(size_t aLength);
154 #endif
155 };
156
157 /*
158 * SpiderMonkey uses a 2-byte character representation: it is a
159 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
160 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
161 * sufficiently dedicated JavaScript program to be fully unicode-aware by
162 * manually interpreting UTF-16 extension characters embedded in the JS
163 * string.
164 */
165 class TwoByteChars : public mozilla::Range<char16_t>
166 {
167 typedef mozilla::Range<char16_t> Base;
168
169 public:
170 using CharT = char16_t;
171
TwoByteChars()172 TwoByteChars() : Base() {}
TwoByteChars(char16_t * aChars,size_t aLength)173 TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)174 TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
175 };
176
177 /*
178 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
179 */
180 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
181 {
182 typedef mozilla::RangedPtr<char16_t> Base;
183
184 public:
185 using CharT = char16_t;
186
TwoByteCharsZ()187 TwoByteCharsZ() : Base(nullptr, 0) {}
188
TwoByteCharsZ(char16_t * chars,size_t length)189 TwoByteCharsZ(char16_t* chars, size_t length)
190 : Base(chars, length)
191 {
192 MOZ_ASSERT(chars[length] == '\0');
193 }
194
195 using Base::operator=;
196 };
197
198 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
199
200 /*
201 * Like TwoByteChars, but the chars are const.
202 */
203 class ConstTwoByteChars : public mozilla::Range<const char16_t>
204 {
205 typedef mozilla::Range<const char16_t> Base;
206
207 public:
208 using CharT = char16_t;
209
ConstTwoByteChars()210 ConstTwoByteChars() : Base() {}
ConstTwoByteChars(const char16_t * aChars,size_t aLength)211 ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
212 };
213
214 /*
215 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
216 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
217 * contains any UTF-16 extension characters, then this may give invalid Latin1
218 * output. The returned string is zero terminated. The returned string or the
219 * returned string's |start()| must be freed with JS_free or js_free,
220 * respectively. If allocation fails, an OOM error will be set and the method
221 * will return a nullptr chars (which can be tested for with the ! operator).
222 * This method cannot trigger GC.
223 */
224 extern Latin1CharsZ
225 LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
226 const mozilla::Range<const char16_t> tbchars);
227
228 inline Latin1CharsZ
LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext * cx,const char16_t * begin,size_t length)229 LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const char16_t* begin, size_t length)
230 {
231 const mozilla::Range<const char16_t> tbchars(begin, length);
232 return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
233 }
234
235 template <typename CharT>
236 extern UTF8CharsZ
237 CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<CharT> chars);
238
239 JS_PUBLIC_API(uint32_t)
240 Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
241
242 /*
243 * Inflate bytes in UTF-8 encoding to char16_t.
244 * - On error, returns an empty TwoByteCharsZ.
245 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
246 * its length; the length value excludes the trailing null.
247 */
248 extern JS_PUBLIC_API(TwoByteCharsZ)
249 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
250
251 /*
252 * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
253 */
254 extern JS_PUBLIC_API(TwoByteCharsZ)
255 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
256
257 /*
258 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
259 * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
260 * input.
261 */
262 extern JS_PUBLIC_API(TwoByteCharsZ)
263 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
264
265 extern JS_PUBLIC_API(TwoByteCharsZ)
266 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
267
268 /*
269 * Returns the length of the char buffer required to encode |s| as UTF8.
270 * Does not include the null-terminator.
271 */
272 JS_PUBLIC_API(size_t)
273 GetDeflatedUTF8StringLength(JSFlatString* s);
274
275 /*
276 * Encode |src| as UTF8. The caller must either ensure |dst| has enough space
277 * to encode the entire string or pass the length of the buffer as |dstlenp|,
278 * in which case the function will encode characters from the string until
279 * the buffer is exhausted. Does not write the null terminator.
280 *
281 * If |dstlenp| is provided, it will be updated to hold the number of bytes
282 * written to the buffer. If |numcharsp| is provided, it will be updated to hold
283 * the number of Unicode characters written to the buffer (which can be less
284 * than the length of the string, if the buffer is exhausted before the string
285 * is fully encoded).
286 */
287 JS_PUBLIC_API(void)
288 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
289 size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
290
/*
 * The smallest character encoding capable of fully representing a particular
 * string. Enumerators are listed from narrowest to widest.
 */
enum class SmallestEncoding {
    ASCII,
    Latin1,
    UTF16
};
300
301 /*
302 * Returns the smallest encoding possible for the given string: if all
303 * codepoints are <128 then ASCII, otherwise if all codepoints are <256
304 * Latin-1, else UTF16.
305 */
306 JS_PUBLIC_API(SmallestEncoding)
307 FindSmallestEncoding(UTF8Chars utf8);
308
309 /*
310 * Return a null-terminated Latin-1 string copied from the input string,
311 * storing its length (excluding null terminator) in |*outlen|. Fail and
312 * report an error if the string contains non-Latin-1 codepoints. Returns
313 * Latin1CharsZ() on failure.
314 */
315 extern JS_PUBLIC_API(Latin1CharsZ)
316 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
317
318 /*
319 * Return a null-terminated Latin-1 string copied from the input string,
320 * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1
321 * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure.
322 */
323 extern JS_PUBLIC_API(Latin1CharsZ)
324 LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
325
326 /*
327 * Returns true if all characters in the given null-terminated string are
328 * ASCII, i.e. < 0x80, false otherwise.
329 */
330 extern JS_PUBLIC_API(bool)
331 StringIsASCII(const char* s);
332
333 } // namespace JS
334
JS_free(JS::Latin1CharsZ & ptr)335 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)336 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
337
338 #endif /* js_CharacterEncoding_h */
339