1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set ts=8 sts=4 et sw=4 tw=99:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
9
10 #include "mozilla/Range.h"
11
12 #include "js/TypeDecls.h"
13 #include "js/Utility.h"
14
/* Forward declarations: this header only names these types through pointers. */
namespace js { class ExclusiveContext; }

class JSFlatString;
20
21 namespace JS {
22
23 /*
24 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
25 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
26 * byte is treated as a 2-byte character, and there is no way to pass in a
27 * string containing characters beyond U+00FF.
28 */
29 class Latin1Chars : public mozilla::Range<Latin1Char>
30 {
31 typedef mozilla::Range<Latin1Char> Base;
32
33 public:
Latin1Chars()34 Latin1Chars() : Base() {}
Latin1Chars(char * aBytes,size_t aLength)35 Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
Latin1Chars(const Latin1Char * aBytes,size_t aLength)36 Latin1Chars(const Latin1Char* aBytes, size_t aLength)
37 : Base(const_cast<Latin1Char*>(aBytes), aLength)
38 {}
Latin1Chars(const char * aBytes,size_t aLength)39 Latin1Chars(const char* aBytes, size_t aLength)
40 : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
41 {}
42 };
43
44 /*
45 * A Latin1Chars, but with \0 termination for C compatibility.
46 */
47 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
48 {
49 typedef mozilla::RangedPtr<Latin1Char> Base;
50
51 public:
Latin1CharsZ()52 Latin1CharsZ() : Base(nullptr, 0) {}
53
Latin1CharsZ(char * aBytes,size_t aLength)54 Latin1CharsZ(char* aBytes, size_t aLength)
55 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
56 {
57 MOZ_ASSERT(aBytes[aLength] == '\0');
58 }
59
Latin1CharsZ(Latin1Char * aBytes,size_t aLength)60 Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
61 : Base(aBytes, aLength)
62 {
63 MOZ_ASSERT(aBytes[aLength] == '\0');
64 }
65
66 using Base::operator=;
67
c_str()68 char* c_str() { return reinterpret_cast<char*>(get()); }
69 };
70
71 class UTF8Chars : public mozilla::Range<unsigned char>
72 {
73 typedef mozilla::Range<unsigned char> Base;
74
75 public:
UTF8Chars()76 UTF8Chars() : Base() {}
UTF8Chars(char * aBytes,size_t aLength)77 UTF8Chars(char* aBytes, size_t aLength)
78 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
79 {}
UTF8Chars(const char * aBytes,size_t aLength)80 UTF8Chars(const char* aBytes, size_t aLength)
81 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
82 {}
83 };
84
85 /*
86 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
87 */
88 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
89 {
90 typedef mozilla::RangedPtr<unsigned char> Base;
91
92 public:
UTF8CharsZ()93 UTF8CharsZ() : Base(nullptr, 0) {}
94
UTF8CharsZ(char * aBytes,size_t aLength)95 UTF8CharsZ(char* aBytes, size_t aLength)
96 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
97 {
98 MOZ_ASSERT(aBytes[aLength] == '\0');
99 }
100
UTF8CharsZ(unsigned char * aBytes,size_t aLength)101 UTF8CharsZ(unsigned char* aBytes, size_t aLength)
102 : Base(aBytes, aLength)
103 {
104 MOZ_ASSERT(aBytes[aLength] == '\0');
105 }
106
107 using Base::operator=;
108
c_str()109 char* c_str() { return reinterpret_cast<char*>(get()); }
110 };
111
112 /*
113 * SpiderMonkey uses a 2-byte character representation: it is a
114 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
115 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
116 * sufficiently dedicated JavaScript program to be fully unicode-aware by
117 * manually interpreting UTF-16 extension characters embedded in the JS
118 * string.
119 */
120 class TwoByteChars : public mozilla::Range<char16_t>
121 {
122 typedef mozilla::Range<char16_t> Base;
123
124 public:
TwoByteChars()125 TwoByteChars() : Base() {}
TwoByteChars(char16_t * aChars,size_t aLength)126 TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
TwoByteChars(const char16_t * aChars,size_t aLength)127 TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
128 };
129
130 /*
131 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
132 */
133 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
134 {
135 typedef mozilla::RangedPtr<char16_t> Base;
136
137 public:
TwoByteCharsZ()138 TwoByteCharsZ() : Base(nullptr, 0) {}
139
TwoByteCharsZ(char16_t * chars,size_t length)140 TwoByteCharsZ(char16_t* chars, size_t length)
141 : Base(chars, length)
142 {
143 MOZ_ASSERT(chars[length] == '\0');
144 }
145
146 using Base::operator=;
147 };
148
149 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
150
151 /*
152 * Like TwoByteChars, but the chars are const.
153 */
154 class ConstTwoByteChars : public mozilla::Range<const char16_t>
155 {
156 typedef mozilla::Range<const char16_t> Base;
157
158 public:
ConstTwoByteChars()159 ConstTwoByteChars() : Base() {}
ConstTwoByteChars(const char16_t * aChars,size_t aLength)160 ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
161 };
162
163 /*
164 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
165 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
166 * contains any UTF-16 extension characters, then this may give invalid Latin1
167 * output. The returned string is zero terminated. The returned string or the
168 * returned string's |start()| must be freed with JS_free or js_free,
169 * respectively. If allocation fails, an OOM error will be set and the method
170 * will return a nullptr chars (which can be tested for with the ! operator).
171 * This method cannot trigger GC.
172 */
173 extern Latin1CharsZ
174 LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
175 const mozilla::Range<const char16_t> tbchars);
176
177 template <typename CharT>
178 extern UTF8CharsZ
179 CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<const CharT> chars);
180
/*
 * Declaration only. Takes a buffer of |utf8Length| bytes at |utf8Buffer| and
 * returns a 32-bit value.
 * NOTE(review): presumably decodes one UTF-8 sequence into a single UCS-4
 * code point -- confirm input requirements and error behaviour against the
 * definition.
 */
uint32_t
Utf8ToOneUcs4Char(
    const uint8_t* utf8Buffer,
    int utf8Length);
183
184 /*
185 * Inflate bytes in UTF-8 encoding to char16_t.
186 * - On error, returns an empty TwoByteCharsZ.
187 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
188 * its length; the length value excludes the trailing null.
189 */
190 extern TwoByteCharsZ
191 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
192
193 /*
194 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
195 * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
196 * input.
197 */
198 extern TwoByteCharsZ
199 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
200
201 /*
202 * Returns the length of the char buffer required to encode |s| as UTF8.
203 * Does not include the null-terminator.
204 */
205 JS_PUBLIC_API(size_t)
206 GetDeflatedUTF8StringLength(JSFlatString* s);
207
208 /*
209 * Encode |src| as UTF8. The caller must ensure |dst| has enough space.
210 * Does not write the null terminator.
211 */
212 JS_PUBLIC_API(void)
213 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst);
214
215 } // namespace JS
216
JS_free(JS::Latin1CharsZ & ptr)217 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
JS_free(JS::UTF8CharsZ & ptr)218 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
219
220 #endif /* js_CharacterEncoding_h */
221