/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9 
10 #include "mozilla/Range.h"
11 
12 #include "js/TypeDecls.h"
13 #include "js/Utility.h"
14 
// Forward declarations: this header only names these types through pointers,
// so their full definitions are not needed here.
namespace js {
class ExclusiveContext;
} // namespace js

class JSFlatString;
20 
21 namespace JS {
22 
23 /*
24  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
25  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
26  * byte is treated as a 2-byte character, and there is no way to pass in a
27  * string containing characters beyond U+00FF.
28  */
29 class Latin1Chars : public mozilla::Range<Latin1Char>
30 {
31     typedef mozilla::Range<Latin1Char> Base;
32 
33   public:
34     using CharT = Latin1Char;
35 
Latin1Chars()36     Latin1Chars() : Base() {}
Latin1Chars(char * aBytes,size_t aLength)37     Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)38     Latin1Chars(const Latin1Char* aBytes, size_t aLength)
39       : Base(const_cast<Latin1Char*>(aBytes), aLength)
40     {}
Latin1Chars(const char * aBytes,size_t aLength)41     Latin1Chars(const char* aBytes, size_t aLength)
42       : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
43     {}
44 };
45 
46 /*
47  * A Latin1Chars, but with \0 termination for C compatibility.
48  */
49 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
50 {
51     typedef mozilla::RangedPtr<Latin1Char> Base;
52 
53   public:
54     using CharT = Latin1Char;
55 
Latin1CharsZ()56     Latin1CharsZ() : Base(nullptr, 0) {}
57 
Latin1CharsZ(char * aBytes,size_t aLength)58     Latin1CharsZ(char* aBytes, size_t aLength)
59       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
60     {
61         MOZ_ASSERT(aBytes[aLength] == '\0');
62     }
63 
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)64     Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
65       : Base(aBytes, aLength)
66     {
67         MOZ_ASSERT(aBytes[aLength] == '\0');
68     }
69 
70     using Base::operator=;
71 
c_str()72     char* c_str() { return reinterpret_cast<char*>(get()); }
73 };
74 
75 class UTF8Chars : public mozilla::Range<unsigned char>
76 {
77     typedef mozilla::Range<unsigned char> Base;
78 
79   public:
80     using CharT = unsigned char;
81 
UTF8Chars()82     UTF8Chars() : Base() {}
UTF8Chars(char * aBytes,size_t aLength)83     UTF8Chars(char* aBytes, size_t aLength)
84       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
85     {}
UTF8Chars(const char * aBytes,size_t aLength)86     UTF8Chars(const char* aBytes, size_t aLength)
87       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
88     {}
89 };
90 
91 /*
92  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
93  */
94 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
95 {
96     typedef mozilla::RangedPtr<unsigned char> Base;
97 
98   public:
99     using CharT = unsigned char;
100 
UTF8CharsZ()101     UTF8CharsZ() : Base(nullptr, 0) {}
102 
UTF8CharsZ(char * aBytes,size_t aLength)103     UTF8CharsZ(char* aBytes, size_t aLength)
104       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
105     {
106         MOZ_ASSERT(aBytes[aLength] == '\0');
107     }
108 
UTF8CharsZ(unsigned char * aBytes,size_t aLength)109     UTF8CharsZ(unsigned char* aBytes, size_t aLength)
110       : Base(aBytes, aLength)
111     {
112         MOZ_ASSERT(aBytes[aLength] == '\0');
113     }
114 
115     using Base::operator=;
116 
c_str()117     char* c_str() { return reinterpret_cast<char*>(get()); }
118 };
119 
120 /*
121  * A wrapper for a "const char*" that is encoded using UTF-8.
122  * This class does not manage ownership of the data; that is left
123  * to others.  This differs from UTF8CharsZ in that the chars are
124  * const and it allows assignment.
125  */
JS_PUBLIC_API(ConstUTF8CharsZ)126 class JS_PUBLIC_API(ConstUTF8CharsZ)
127 {
128     const char* data_;
129 
130   public:
131     using CharT = unsigned char;
132 
133     ConstUTF8CharsZ() : data_(nullptr)
134     {}
135 
136     ConstUTF8CharsZ(const char* aBytes, size_t aLength)
137       : data_(aBytes)
138     {
139         MOZ_ASSERT(aBytes[aLength] == '\0');
140 #ifdef DEBUG
141         validate(aLength);
142 #endif
143     }
144 
145     const void* get() const { return data_; }
146 
147     const char* c_str() const { return data_; }
148 
149     explicit operator bool() const { return data_ != nullptr; }
150 
151   private:
152 #ifdef DEBUG
153     void validate(size_t aLength);
154 #endif
155 };
156 
157 /*
158  * SpiderMonkey uses a 2-byte character representation: it is a
159  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
160  * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
161  * sufficiently dedicated JavaScript program to be fully unicode-aware by
162  * manually interpreting UTF-16 extension characters embedded in the JS
163  * string.
164  */
165 class TwoByteChars : public mozilla::Range<char16_t>
166 {
167     typedef mozilla::Range<char16_t> Base;
168 
169   public:
170     using CharT = char16_t;
171 
TwoByteChars()172     TwoByteChars() : Base() {}
TwoByteChars(char16_t * aChars,size_t aLength)173     TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)174     TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
175 };
176 
177 /*
178  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
179  */
180 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
181 {
182     typedef mozilla::RangedPtr<char16_t> Base;
183 
184   public:
185     using CharT = char16_t;
186 
TwoByteCharsZ()187     TwoByteCharsZ() : Base(nullptr, 0) {}
188 
TwoByteCharsZ(char16_t * chars,size_t length)189     TwoByteCharsZ(char16_t* chars, size_t length)
190       : Base(chars, length)
191     {
192         MOZ_ASSERT(chars[length] == '\0');
193     }
194 
195     using Base::operator=;
196 };
197 
198 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
199 
200 /*
201  * Like TwoByteChars, but the chars are const.
202  */
203 class ConstTwoByteChars : public mozilla::Range<const char16_t>
204 {
205     typedef mozilla::Range<const char16_t> Base;
206 
207   public:
208     using CharT = char16_t;
209 
ConstTwoByteChars()210     ConstTwoByteChars() : Base() {}
ConstTwoByteChars(const char16_t * aChars,size_t aLength)211     ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
212 };
213 
214 /*
215  * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
216  * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
217  * contains any UTF-16 extension characters, then this may give invalid Latin1
218  * output. The returned string is zero terminated. The returned string or the
219  * returned string's |start()| must be freed with JS_free or js_free,
220  * respectively. If allocation fails, an OOM error will be set and the method
221  * will return a nullptr chars (which can be tested for with the ! operator).
222  * This method cannot trigger GC.
223  */
224 extern Latin1CharsZ
225 LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
226                                    const mozilla::Range<const char16_t> tbchars);
227 
228 inline Latin1CharsZ
LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext * cx,const char16_t * begin,size_t length)229 LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const char16_t* begin, size_t length)
230 {
231     const mozilla::Range<const char16_t> tbchars(begin, length);
232     return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
233 }
234 
235 template <typename CharT>
236 extern UTF8CharsZ
237 CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<CharT> chars);
238 
239 JS_PUBLIC_API(uint32_t)
240 Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
241 
242 /*
243  * Inflate bytes in UTF-8 encoding to char16_t.
244  * - On error, returns an empty TwoByteCharsZ.
245  * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
246  *   its length;  the length value excludes the trailing null.
247  */
248 extern JS_PUBLIC_API(TwoByteCharsZ)
249 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
250 
251 /*
252  * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
253  */
254 extern JS_PUBLIC_API(TwoByteCharsZ)
255 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
256 
257 /*
258  * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
259  * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
260  * input.
261  */
262 extern JS_PUBLIC_API(TwoByteCharsZ)
263 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
264 
265 extern JS_PUBLIC_API(TwoByteCharsZ)
266 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
267 
268 /*
269  * Returns the length of the char buffer required to encode |s| as UTF8.
270  * Does not include the null-terminator.
271  */
272 JS_PUBLIC_API(size_t)
273 GetDeflatedUTF8StringLength(JSFlatString* s);
274 
275 /*
276  * Encode |src| as UTF8. The caller must either ensure |dst| has enough space
277  * to encode the entire string or pass the length of the buffer as |dstlenp|,
278  * in which case the function will encode characters from the string until
279  * the buffer is exhausted. Does not write the null terminator.
280  *
281  * If |dstlenp| is provided, it will be updated to hold the number of bytes
282  * written to the buffer. If |numcharsp| is provided, it will be updated to hold
283  * the number of Unicode characters written to the buffer (which can be less
284  * than the length of the string, if the buffer is exhausted before the string
285  * is fully encoded).
286  */
287 JS_PUBLIC_API(void)
288 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
289                           size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
290 
/*
 * The smallest character encoding capable of fully representing a particular
 * string. Enumerators are listed from narrowest to widest.
 */
enum class SmallestEncoding {
    ASCII,
    Latin1,
    UTF16
};
300 
301 /*
302  * Returns the smallest encoding possible for the given string: if all
303  * codepoints are <128 then ASCII, otherwise if all codepoints are <256
304  * Latin-1, else UTF16.
305  */
306 JS_PUBLIC_API(SmallestEncoding)
307 FindSmallestEncoding(UTF8Chars utf8);
308 
309 /*
310   * Return a null-terminated Latin-1 string copied from the input string,
311   * storing its length (excluding null terminator) in |*outlen|.  Fail and
312   * report an error if the string contains non-Latin-1 codepoints.  Returns
313   * Latin1CharsZ() on failure.
314  */
315 extern JS_PUBLIC_API(Latin1CharsZ)
316 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
317 
318 /*
319  * Return a null-terminated Latin-1 string copied from the input string,
320  * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
321  * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
322  */
323 extern JS_PUBLIC_API(Latin1CharsZ)
324 LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
325 
326 /*
327  * Returns true if all characters in the given null-terminated string are
328  * ASCII, i.e. < 0x80, false otherwise.
329  */
330 extern JS_PUBLIC_API(bool)
331 StringIsASCII(const char* s);
332 
333 } // namespace JS
334 
JS_free(JS::Latin1CharsZ & ptr)335 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)336 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
337 
338 #endif /* js_CharacterEncoding_h */
339