1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
16 #define RAPIDJSON_ENCODEDSTREAM_H_
17 
18 #include "stream.h"
19 #include "memorystream.h"
20 
// Save the compiler diagnostic state and switch off the "effc++" diagnostic
// for this header. The RAPIDJSON_DIAG_* macros are defined outside this file;
// the matching RAPIDJSON_DIAG_POP is at the bottom of the header.
#ifdef __GNUC__
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(effc++)
#endif

// Under clang, push a second time and additionally switch off the "padded"
// diagnostic; it is popped separately at the bottom of the header.
#ifdef __clang__
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(padded)
#endif
30 
31 RAPIDJSON_NAMESPACE_BEGIN
32 
33 //! Input byte stream wrapper with a statically bound encoding.
34 /*!
35     \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
36     \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
37 */
38 template <typename Encoding, typename InputByteStream>
39 class EncodedInputStream {
40     RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
41 public:
42     typedef typename Encoding::Ch Ch;
43 
44     EncodedInputStream(InputByteStream& is) : is_(is) {
45         current_ = Encoding::TakeBOM(is_);
46     }
47 
48     Ch Peek() const { return current_; }
49     Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
50     size_t Tell() const { return is_.Tell(); }
51 
52     // Not implemented
53     void Put(Ch) { RAPIDJSON_ASSERT(false); }
54     void Flush() { RAPIDJSON_ASSERT(false); }
55     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
56     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
57 
58 private:
59     EncodedInputStream(const EncodedInputStream&);
60     EncodedInputStream& operator=(const EncodedInputStream&);
61 
62     InputByteStream& is_;
63     Ch current_;
64 };
65 
66 //! Specialized for UTF8 MemoryStream.
67 template <>
68 class EncodedInputStream<UTF8<>, MemoryStream> {
69 public:
70     typedef UTF8<>::Ch Ch;
71 
EncodedInputStream(MemoryStream & is)72     EncodedInputStream(MemoryStream& is) : is_(is) {
73         if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take();
74         if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take();
75         if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take();
76     }
Peek()77     Ch Peek() const { return is_.Peek(); }
Take()78     Ch Take() { return is_.Take(); }
Tell()79     size_t Tell() const { return is_.Tell(); }
80 
81     // Not implemented
Put(Ch)82     void Put(Ch) {}
Flush()83     void Flush() {}
PutBegin()84     Ch* PutBegin() { return 0; }
PutEnd(Ch *)85     size_t PutEnd(Ch*) { return 0; }
86 
87     MemoryStream& is_;
88 };
89 
90 //! Output byte stream wrapper with statically bound encoding.
91 /*!
92     \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
93     \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream.
94 */
95 template <typename Encoding, typename OutputByteStream>
96 class EncodedOutputStream {
97     RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
98 public:
99     typedef typename Encoding::Ch Ch;
100 
os_(os)101     EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
102         if (putBOM)
103             Encoding::PutBOM(os_);
104     }
105 
Put(Ch c)106     void Put(Ch c) { Encoding::Put(os_, c);  }
Flush()107     void Flush() { os_.Flush(); }
108 
109     // Not implemented
Peek()110     Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
Take()111     Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
Tell()112     size_t Tell() const { RAPIDJSON_ASSERT(false);  return 0; }
PutBegin()113     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)114     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
115 
116 private:
117     EncodedOutputStream(const EncodedOutputStream&);
118     EncodedOutputStream& operator=(const EncodedOutputStream&);
119 
120     OutputByteStream& os_;
121 };
122 
// Expands to a comma-separated list naming member `x` of each UTF encoding
// instantiated with `Ch`, in the order UTF8, UTF16LE, UTF16BE, UTF32LE,
// UTF32BE. It is used below to fill function-pointer tables that are indexed
// with a UTFType value, so a type named `Ch` must be in scope where it is
// expanded. NOTE(review): the order presumably mirrors the UTFType
// enumerators, which are declared outside this file -- confirm before
// reordering.
#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
124 
125 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
126 /*!
127     \tparam CharType Type of character for reading.
128     \tparam InputByteStream type of input byte stream to be wrapped.
129 */
130 template <typename CharType, typename InputByteStream>
131 class AutoUTFInputStream {
132     RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
133 public:
134     typedef CharType Ch;
135 
136     //! Constructor.
137     /*!
138         \param is input stream to be wrapped.
139         \param type UTF encoding type if it is not detected from the stream.
140     */
141     AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
142         RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
143         DetectType();
144         static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
145         takeFunc_ = f[type_];
146         current_ = takeFunc_(*is_);
147     }
148 
GetType()149     UTFType GetType() const { return type_; }
HasBOM()150     bool HasBOM() const { return hasBOM_; }
151 
Peek()152     Ch Peek() const { return current_; }
Take()153     Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
Tell()154     size_t Tell() const { return is_->Tell(); }
155 
156     // Not implemented
Put(Ch)157     void Put(Ch) { RAPIDJSON_ASSERT(false); }
Flush()158     void Flush() { RAPIDJSON_ASSERT(false); }
PutBegin()159     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)160     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
161 
162 private:
163     AutoUTFInputStream(const AutoUTFInputStream&);
164     AutoUTFInputStream& operator=(const AutoUTFInputStream&);
165 
166     // Detect encoding type with BOM or RFC 4627
DetectType()167     void DetectType() {
168         // BOM (Byte Order Mark):
169         // 00 00 FE FF  UTF-32BE
170         // FF FE 00 00  UTF-32LE
171         // FE FF        UTF-16BE
172         // FF FE        UTF-16LE
173         // EF BB BF     UTF-8
174 
175         const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4());
176         if (!c)
177             return;
178 
179         unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
180         hasBOM_ = false;
181         if (bom == 0xFFFE0000)                  { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
182         else if (bom == 0x0000FEFF)             { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
183         else if ((bom & 0xFFFF) == 0xFFFE)      { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take();                           }
184         else if ((bom & 0xFFFF) == 0xFEFF)      { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take();                           }
185         else if ((bom & 0xFFFFFF) == 0xBFBBEF)  { type_ = kUTF8;    hasBOM_ = true; is_->Take(); is_->Take(); is_->Take();              }
186 
187         // RFC 4627: Section 3
188         // "Since the first two characters of a JSON text will always be ASCII
189         // characters [RFC0020], it is possible to determine whether an octet
190         // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
191         // at the pattern of nulls in the first four octets."
192         // 00 00 00 xx  UTF-32BE
193         // 00 xx 00 xx  UTF-16BE
194         // xx 00 00 00  UTF-32LE
195         // xx 00 xx 00  UTF-16LE
196         // xx xx xx xx  UTF-8
197 
198         if (!hasBOM_) {
199             unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
200             switch (pattern) {
201             case 0x08: type_ = kUTF32BE; break;
202             case 0x0A: type_ = kUTF16BE; break;
203             case 0x01: type_ = kUTF32LE; break;
204             case 0x05: type_ = kUTF16LE; break;
205             case 0x0F: type_ = kUTF8;    break;
206             default: break; // Use type defined by user.
207             }
208         }
209 
210         // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
211         if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
212         if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
213     }
214 
215     typedef Ch (*TakeFunc)(InputByteStream& is);
216     InputByteStream* is_;
217     UTFType type_;
218     Ch current_;
219     TakeFunc takeFunc_;
220     bool hasBOM_;
221 };
222 
223 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
224 /*!
225     \tparam CharType Type of character for writing.
226     \tparam OutputByteStream type of output byte stream to be wrapped.
227 */
228 template <typename CharType, typename OutputByteStream>
229 class AutoUTFOutputStream {
230     RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
231 public:
232     typedef CharType Ch;
233 
234     //! Constructor.
235     /*!
236         \param os output stream to be wrapped.
237         \param type UTF encoding type.
238         \param putBOM Whether to write BOM at the beginning of the stream.
239     */
AutoUTFOutputStream(OutputByteStream & os,UTFType type,bool putBOM)240     AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
241         RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
242 
243         // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
244         if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
245         if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
246 
247         static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
248         putFunc_ = f[type_];
249 
250         if (putBOM)
251             PutBOM();
252     }
253 
GetType()254     UTFType GetType() const { return type_; }
255 
Put(Ch c)256     void Put(Ch c) { putFunc_(*os_, c); }
Flush()257     void Flush() { os_->Flush(); }
258 
259     // Not implemented
Peek()260     Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
Take()261     Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
Tell()262     size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
PutBegin()263     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)264     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
265 
266 private:
267     AutoUTFOutputStream(const AutoUTFOutputStream&);
268     AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
269 
PutBOM()270     void PutBOM() {
271         typedef void (*PutBOMFunc)(OutputByteStream&);
272         static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
273         f[type_](*os_);
274     }
275 
276     typedef void (*PutFunc)(OutputByteStream&, Ch);
277 
278     OutputByteStream* os_;
279     UTFType type_;
280     PutFunc putFunc_;
281 };
282 
// The helper macro is only needed for the function-pointer tables in this
// header; undefine it so it does not leak into files that include this one.
#undef RAPIDJSON_ENCODINGS_FUNC

RAPIDJSON_NAMESPACE_END

// Undo the diagnostic pushes made at the top of this header, in reverse
// order: the clang-only push first, then the __GNUC__ push.
#ifdef __clang__
RAPIDJSON_DIAG_POP
#endif

#ifdef __GNUC__
RAPIDJSON_DIAG_POP
#endif
294 
#endif // RAPIDJSON_ENCODEDSTREAM_H_
296