1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2  * vim: set ts=8 sts=4 et sw=4 tw=99:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9 
10 #include "mozilla/Range.h"
11 
12 #include "js/TypeDecls.h"
13 #include "js/Utility.h"
14 
/*
 * Forward declarations. This header only refers to these types through
 * pointers in function signatures, so their definitions are not needed here.
 */
namespace js { class ExclusiveContext; }

class JSFlatString;
20 
21 namespace JS {
22 
23 /*
24  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
25  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
26  * byte is treated as a 2-byte character, and there is no way to pass in a
27  * string containing characters beyond U+00FF.
28  */
29 class Latin1Chars : public mozilla::Range<Latin1Char>
30 {
31     typedef mozilla::Range<Latin1Char> Base;
32 
33   public:
Latin1Chars()34     Latin1Chars() : Base() {}
Latin1Chars(char * aBytes,size_t aLength)35     Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)36     Latin1Chars(const Latin1Char* aBytes, size_t aLength)
37       : Base(const_cast<Latin1Char*>(aBytes), aLength)
38     {}
Latin1Chars(const char * aBytes,size_t aLength)39     Latin1Chars(const char* aBytes, size_t aLength)
40       : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
41     {}
42 };
43 
44 /*
45  * A Latin1Chars, but with \0 termination for C compatibility.
46  */
47 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
48 {
49     typedef mozilla::RangedPtr<Latin1Char> Base;
50 
51   public:
Latin1CharsZ()52     Latin1CharsZ() : Base(nullptr, 0) {}
53 
Latin1CharsZ(char * aBytes,size_t aLength)54     Latin1CharsZ(char* aBytes, size_t aLength)
55       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
56     {
57         MOZ_ASSERT(aBytes[aLength] == '\0');
58     }
59 
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)60     Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
61       : Base(aBytes, aLength)
62     {
63         MOZ_ASSERT(aBytes[aLength] == '\0');
64     }
65 
66     using Base::operator=;
67 
c_str()68     char* c_str() { return reinterpret_cast<char*>(get()); }
69 };
70 
71 class UTF8Chars : public mozilla::Range<unsigned char>
72 {
73     typedef mozilla::Range<unsigned char> Base;
74 
75   public:
UTF8Chars()76     UTF8Chars() : Base() {}
UTF8Chars(char * aBytes,size_t aLength)77     UTF8Chars(char* aBytes, size_t aLength)
78       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
79     {}
UTF8Chars(const char * aBytes,size_t aLength)80     UTF8Chars(const char* aBytes, size_t aLength)
81       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
82     {}
83 };
84 
85 /*
86  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
87  */
88 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
89 {
90     typedef mozilla::RangedPtr<unsigned char> Base;
91 
92   public:
UTF8CharsZ()93     UTF8CharsZ() : Base(nullptr, 0) {}
94 
UTF8CharsZ(char * aBytes,size_t aLength)95     UTF8CharsZ(char* aBytes, size_t aLength)
96       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
97     {
98         MOZ_ASSERT(aBytes[aLength] == '\0');
99     }
100 
UTF8CharsZ(unsigned char * aBytes,size_t aLength)101     UTF8CharsZ(unsigned char* aBytes, size_t aLength)
102       : Base(aBytes, aLength)
103     {
104         MOZ_ASSERT(aBytes[aLength] == '\0');
105     }
106 
107     using Base::operator=;
108 
c_str()109     char* c_str() { return reinterpret_cast<char*>(get()); }
110 };
111 
112 /*
113  * SpiderMonkey uses a 2-byte character representation: it is a
114  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
115  * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
116  * sufficiently dedicated JavaScript program to be fully unicode-aware by
117  * manually interpreting UTF-16 extension characters embedded in the JS
118  * string.
119  */
120 class TwoByteChars : public mozilla::Range<char16_t>
121 {
122     typedef mozilla::Range<char16_t> Base;
123 
124   public:
TwoByteChars()125     TwoByteChars() : Base() {}
TwoByteChars(char16_t * aChars,size_t aLength)126     TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)127     TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
128 };
129 
130 /*
131  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
132  */
133 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
134 {
135     typedef mozilla::RangedPtr<char16_t> Base;
136 
137   public:
TwoByteCharsZ()138     TwoByteCharsZ() : Base(nullptr, 0) {}
139 
TwoByteCharsZ(char16_t * chars,size_t length)140     TwoByteCharsZ(char16_t* chars, size_t length)
141       : Base(chars, length)
142     {
143         MOZ_ASSERT(chars[length] == '\0');
144     }
145 
146     using Base::operator=;
147 };
148 
149 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
150 
151 /*
152  * Like TwoByteChars, but the chars are const.
153  */
154 class ConstTwoByteChars : public mozilla::Range<const char16_t>
155 {
156     typedef mozilla::Range<const char16_t> Base;
157 
158   public:
ConstTwoByteChars()159     ConstTwoByteChars() : Base() {}
ConstTwoByteChars(const char16_t * aChars,size_t aLength)160     ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
161 };
162 
163 /*
164  * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
165  * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
166  * contains any UTF-16 extension characters, then this may give invalid Latin1
167  * output. The returned string is zero terminated. The returned string or the
168  * returned string's |start()| must be freed with JS_free or js_free,
169  * respectively. If allocation fails, an OOM error will be set and the method
170  * will return a nullptr chars (which can be tested for with the ! operator).
171  * This method cannot trigger GC.
172  */
173 extern Latin1CharsZ
174 LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
175                                    const mozilla::Range<const char16_t> tbchars);
176 
177 template <typename CharT>
178 extern UTF8CharsZ
179 CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<const CharT> chars);
180 
/*
 * NOTE(review): judging by the name and signature, this presumably decodes
 * the single UTF-8 sequence of |utf8Length| bytes at |utf8Buffer| into one
 * UCS-4 code point. Behaviour on malformed input is not visible from this
 * header -- confirm against the definition.
 */
uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
183 
184 /*
185  * Inflate bytes in UTF-8 encoding to char16_t.
186  * - On error, returns an empty TwoByteCharsZ.
187  * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
188  *   its length;  the length value excludes the trailing null.
189  */
190 extern TwoByteCharsZ
191 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
192 
193 /*
194  * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
195  * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
196  * input.
197  */
198 extern TwoByteCharsZ
199 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
200 
201 /*
202  * Returns the length of the char buffer required to encode |s| as UTF8.
203  * Does not include the null-terminator.
204  */
205 JS_PUBLIC_API(size_t)
206 GetDeflatedUTF8StringLength(JSFlatString* s);
207 
208 /*
209  * Encode |src| as UTF8. The caller must ensure |dst| has enough space.
210  * Does not write the null terminator.
211  */
212 JS_PUBLIC_API(void)
213 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst);
214 
215 } // namespace JS
216 
JS_free(JS::Latin1CharsZ & ptr)217 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)218 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
219 
220 #endif /* js_CharacterEncoding_h */
221