// Tencent is pleased to support the open source community by making RapidJSON available.
//
// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
//
// Licensed under the MIT License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://opensource.org/licenses/MIT
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef RAPIDJSON_ENCODINGS_H_
#define RAPIDJSON_ENCODINGS_H_

#include "rapidjson.h"

#ifdef _MSC_VER
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(4244) // conversion from 'type1' to 'type2', possible loss of data
RAPIDJSON_DIAG_OFF(4702)  // unreachable code
#elif defined(__GNUC__)
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(effc++)
RAPIDJSON_DIAG_OFF(overflow)
#endif

RAPIDJSON_NAMESPACE_BEGIN

///////////////////////////////////////////////////////////////////////////////
// Encoding

/*! \class rapidjson::Encoding
    \brief Concept for encoding of Unicode characters.

\code
concept Encoding {
    typename Ch;    //! Type of character. A "character" is actually a code unit in Unicode's definition.

    enum { supportUnicode = 1 }; // or 0 if not supporting unicode

    //! \brief Encode a Unicode codepoint to an output stream.
    //! \param os Output stream.
    //! \param codepoint A Unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively.
    template<typename OutputStream>
    static void Encode(OutputStream& os, unsigned codepoint);

    //! \brief Decode a Unicode codepoint from an input stream.
    //! \param is Input stream.
    //! \param codepoint Output of the Unicode codepoint.
    //! \return true if a valid codepoint can be decoded from the stream.
    template <typename InputStream>
    static bool Decode(InputStream& is, unsigned* codepoint);

    //! \brief Validate one Unicode codepoint from an encoded stream.
    //! \param is Input stream to obtain codepoint.
    //! \param os Output for copying one codepoint.
    //! \return true if it is valid.
    //! \note This function only validates and copies the codepoint without actually decoding it.
    template <typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os);

    // The following functions deal with byte streams.

    //! Take a character from input byte stream, skipping the BOM if it exists.
    template <typename InputByteStream>
    static Ch TakeBOM(InputByteStream& is);

    //! Take a character from input byte stream.
    template <typename InputByteStream>
    static Ch Take(InputByteStream& is);

    //! Put BOM to output byte stream.
    template <typename OutputByteStream>
    static void PutBOM(OutputByteStream& os);

    //! Put a character to output byte stream.
    template <typename OutputByteStream>
    static void Put(OutputByteStream& os, Ch c);
};
\endcode
*/

///////////////////////////////////////////////////////////////////////////////
// UTF8

//! UTF-8 encoding.
/*! http://en.wikipedia.org/wiki/UTF-8
    http://tools.ietf.org/html/rfc3629
    \tparam CharType Code unit for storing 8-bit UTF-8 data. Default is char.
93 \note implements Encoding concept 94 */ 95 template<typename CharType = char> 96 struct UTF8 { 97 typedef CharType Ch; 98 99 enum { supportUnicode = 1 }; 100 101 template<typename OutputStream> EncodeUTF8102 static void Encode(OutputStream& os, unsigned codepoint) { 103 if (codepoint <= 0x7F) 104 os.Put(static_cast<Ch>(codepoint & 0xFF)); 105 else if (codepoint <= 0x7FF) { 106 os.Put(static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF))); 107 os.Put(static_cast<Ch>(0x80 | ((codepoint & 0x3F)))); 108 } 109 else if (codepoint <= 0xFFFF) { 110 os.Put(static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF))); 111 os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F))); 112 os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F))); 113 } 114 else { 115 RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); 116 os.Put(static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF))); 117 os.Put(static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F))); 118 os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F))); 119 os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F))); 120 } 121 } 122 123 template<typename OutputStream> EncodeUnsafeUTF8124 static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { 125 if (codepoint <= 0x7F) 126 PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF)); 127 else if (codepoint <= 0x7FF) { 128 PutUnsafe(os, static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF))); 129 PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint & 0x3F)))); 130 } 131 else if (codepoint <= 0xFFFF) { 132 PutUnsafe(os, static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF))); 133 PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F))); 134 PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F))); 135 } 136 else { 137 RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); 138 PutUnsafe(os, static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF))); 139 PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F))); 140 PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F))); 141 PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 
0x3F))); 142 } 143 } 144 145 template <typename InputStream> DecodeUTF8146 static bool Decode(InputStream& is, unsigned* codepoint) { 147 #define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | (static_cast<unsigned char>(c) & 0x3Fu) 148 #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0) 149 #define TAIL() COPY(); TRANS(0x70) 150 typename InputStream::Ch c = is.Take(); 151 if (!(c & 0x80)) { 152 *codepoint = static_cast<unsigned char>(c); 153 return true; 154 } 155 156 unsigned char type = GetRange(static_cast<unsigned char>(c)); 157 if (type >= 32) { 158 *codepoint = 0; 159 } else { 160 *codepoint = (0xFFu >> type) & static_cast<unsigned char>(c); 161 } 162 bool result = true; 163 switch (type) { 164 case 2: TAIL(); return result; 165 case 3: TAIL(); TAIL(); return result; 166 case 4: COPY(); TRANS(0x50); TAIL(); return result; 167 case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result; 168 case 6: TAIL(); TAIL(); TAIL(); return result; 169 case 10: COPY(); TRANS(0x20); TAIL(); return result; 170 case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result; 171 default: return false; 172 } 173 #undef COPY 174 #undef TRANS 175 #undef TAIL 176 } 177 178 template <typename InputStream, typename OutputStream> ValidateUTF8179 static bool Validate(InputStream& is, OutputStream& os) { 180 #define COPY() os.Put(c = is.Take()) 181 #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0) 182 #define TAIL() COPY(); TRANS(0x70) 183 Ch c; 184 COPY(); 185 if (!(c & 0x80)) 186 return true; 187 188 bool result = true; 189 switch (GetRange(static_cast<unsigned char>(c))) { 190 case 2: TAIL(); return result; 191 case 3: TAIL(); TAIL(); return result; 192 case 4: COPY(); TRANS(0x50); TAIL(); return result; 193 case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result; 194 case 6: TAIL(); TAIL(); TAIL(); return result; 195 case 10: COPY(); TRANS(0x20); TAIL(); return result; 196 case 11: COPY(); TRANS(0x60); 
TAIL(); TAIL(); return result; 197 default: return false; 198 } 199 #undef COPY 200 #undef TRANS 201 #undef TAIL 202 } 203 GetRangeUTF8204 static unsigned char GetRange(unsigned char c) { 205 // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 206 // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. 207 static const unsigned char type[] = { 208 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 209 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 210 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 211 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 212 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10, 213 0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40, 214 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, 215 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, 216 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 217 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 218 }; 219 return type[c]; 220 } 221 222 template <typename InputByteStream> TakeBOMUTF8223 static CharType TakeBOM(InputByteStream& is) { 224 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 225 typename InputByteStream::Ch c = Take(is); 226 if (static_cast<unsigned char>(c) != 0xEFu) return c; 227 c = is.Take(); 228 if (static_cast<unsigned char>(c) != 0xBBu) return c; 229 c = is.Take(); 230 if (static_cast<unsigned char>(c) != 0xBFu) return c; 231 c = is.Take(); 232 return c; 233 } 234 235 template <typename InputByteStream> TakeUTF8236 static Ch Take(InputByteStream& is) { 237 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 238 return static_cast<Ch>(is.Take()); 239 } 240 241 template <typename OutputByteStream> PutBOMUTF8242 static void PutBOM(OutputByteStream& os) { 243 
RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 244 os.Put(static_cast<typename OutputByteStream::Ch>(0xEFu)); 245 os.Put(static_cast<typename OutputByteStream::Ch>(0xBBu)); 246 os.Put(static_cast<typename OutputByteStream::Ch>(0xBFu)); 247 } 248 249 template <typename OutputByteStream> PutUTF8250 static void Put(OutputByteStream& os, Ch c) { 251 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 252 os.Put(static_cast<typename OutputByteStream::Ch>(c)); 253 } 254 }; 255 256 /////////////////////////////////////////////////////////////////////////////// 257 // UTF16 258 259 //! UTF-16 encoding. 260 /*! http://en.wikipedia.org/wiki/UTF-16 261 http://tools.ietf.org/html/rfc2781 262 \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead. 263 \note implements Encoding concept 264 265 \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness. 266 For streaming, use UTF16LE and UTF16BE, which handle endianness. 
267 */ 268 template<typename CharType = wchar_t> 269 struct UTF16 { 270 typedef CharType Ch; 271 RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2); 272 273 enum { supportUnicode = 1 }; 274 275 template<typename OutputStream> EncodeUTF16276 static void Encode(OutputStream& os, unsigned codepoint) { 277 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2); 278 if (codepoint <= 0xFFFF) { 279 RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair 280 os.Put(static_cast<typename OutputStream::Ch>(codepoint)); 281 } 282 else { 283 RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); 284 unsigned v = codepoint - 0x10000; 285 os.Put(static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800)); 286 os.Put(static_cast<typename OutputStream::Ch>((v & 0x3FF) | 0xDC00)); 287 } 288 } 289 290 291 template<typename OutputStream> EncodeUnsafeUTF16292 static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { 293 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2); 294 if (codepoint <= 0xFFFF) { 295 RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair 296 PutUnsafe(os, static_cast<typename OutputStream::Ch>(codepoint)); 297 } 298 else { 299 RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); 300 unsigned v = codepoint - 0x10000; 301 PutUnsafe(os, static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800)); 302 PutUnsafe(os, static_cast<typename OutputStream::Ch>((v & 0x3FF) | 0xDC00)); 303 } 304 } 305 306 template <typename InputStream> DecodeUTF16307 static bool Decode(InputStream& is, unsigned* codepoint) { 308 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2); 309 typename InputStream::Ch c = is.Take(); 310 if (c < 0xD800 || c > 0xDFFF) { 311 *codepoint = static_cast<unsigned>(c); 312 return true; 313 } 314 else if (c <= 0xDBFF) { 315 *codepoint = (static_cast<unsigned>(c) & 0x3FF) << 10; 316 c = is.Take(); 317 *codepoint |= (static_cast<unsigned>(c) & 0x3FF); 318 
*codepoint += 0x10000; 319 return c >= 0xDC00 && c <= 0xDFFF; 320 } 321 return false; 322 } 323 324 template <typename InputStream, typename OutputStream> ValidateUTF16325 static bool Validate(InputStream& is, OutputStream& os) { 326 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2); 327 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2); 328 typename InputStream::Ch c; 329 os.Put(static_cast<typename OutputStream::Ch>(c = is.Take())); 330 if (c < 0xD800 || c > 0xDFFF) 331 return true; 332 else if (c <= 0xDBFF) { 333 os.Put(c = is.Take()); 334 return c >= 0xDC00 && c <= 0xDFFF; 335 } 336 return false; 337 } 338 }; 339 340 //! UTF-16 little endian encoding. 341 template<typename CharType = wchar_t> 342 struct UTF16LE : UTF16<CharType> { 343 template <typename InputByteStream> TakeBOMUTF16LE344 static CharType TakeBOM(InputByteStream& is) { 345 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 346 CharType c = Take(is); 347 return static_cast<uint16_t>(c) == 0xFEFFu ? 
Take(is) : c; 348 } 349 350 template <typename InputByteStream> TakeUTF16LE351 static CharType Take(InputByteStream& is) { 352 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 353 unsigned c = static_cast<uint8_t>(is.Take()); 354 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8; 355 return static_cast<CharType>(c); 356 } 357 358 template <typename OutputByteStream> PutBOMUTF16LE359 static void PutBOM(OutputByteStream& os) { 360 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 361 os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu)); 362 os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu)); 363 } 364 365 template <typename OutputByteStream> PutUTF16LE366 static void Put(OutputByteStream& os, CharType c) { 367 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 368 os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu)); 369 os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu)); 370 } 371 }; 372 373 //! UTF-16 big endian encoding. 374 template<typename CharType = wchar_t> 375 struct UTF16BE : UTF16<CharType> { 376 template <typename InputByteStream> TakeBOMUTF16BE377 static CharType TakeBOM(InputByteStream& is) { 378 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 379 CharType c = Take(is); 380 return static_cast<uint16_t>(c) == 0xFEFFu ? 
Take(is) : c; 381 } 382 383 template <typename InputByteStream> TakeUTF16BE384 static CharType Take(InputByteStream& is) { 385 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 386 unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8; 387 c |= static_cast<uint8_t>(is.Take()); 388 return static_cast<CharType>(c); 389 } 390 391 template <typename OutputByteStream> PutBOMUTF16BE392 static void PutBOM(OutputByteStream& os) { 393 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 394 os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu)); 395 os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu)); 396 } 397 398 template <typename OutputByteStream> PutUTF16BE399 static void Put(OutputByteStream& os, CharType c) { 400 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 401 os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu)); 402 os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu)); 403 } 404 }; 405 406 /////////////////////////////////////////////////////////////////////////////// 407 // UTF32 408 409 //! UTF-32 encoding. 410 /*! http://en.wikipedia.org/wiki/UTF-32 411 \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead. 412 \note implements Encoding concept 413 414 \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness. 415 For streaming, use UTF32LE and UTF32BE, which handle endianness. 
416 */ 417 template<typename CharType = unsigned> 418 struct UTF32 { 419 typedef CharType Ch; 420 RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4); 421 422 enum { supportUnicode = 1 }; 423 424 template<typename OutputStream> EncodeUTF32425 static void Encode(OutputStream& os, unsigned codepoint) { 426 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4); 427 RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); 428 os.Put(codepoint); 429 } 430 431 template<typename OutputStream> EncodeUnsafeUTF32432 static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { 433 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4); 434 RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); 435 PutUnsafe(os, codepoint); 436 } 437 438 template <typename InputStream> DecodeUTF32439 static bool Decode(InputStream& is, unsigned* codepoint) { 440 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4); 441 Ch c = is.Take(); 442 *codepoint = c; 443 return c <= 0x10FFFF; 444 } 445 446 template <typename InputStream, typename OutputStream> ValidateUTF32447 static bool Validate(InputStream& is, OutputStream& os) { 448 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4); 449 Ch c; 450 os.Put(c = is.Take()); 451 return c <= 0x10FFFF; 452 } 453 }; 454 455 //! UTF-32 little endian enocoding. 456 template<typename CharType = unsigned> 457 struct UTF32LE : UTF32<CharType> { 458 template <typename InputByteStream> TakeBOMUTF32LE459 static CharType TakeBOM(InputByteStream& is) { 460 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 461 CharType c = Take(is); 462 return static_cast<uint32_t>(c) == 0x0000FEFFu ? 
Take(is) : c; 463 } 464 465 template <typename InputByteStream> TakeUTF32LE466 static CharType Take(InputByteStream& is) { 467 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 468 unsigned c = static_cast<uint8_t>(is.Take()); 469 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8; 470 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16; 471 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24; 472 return static_cast<CharType>(c); 473 } 474 475 template <typename OutputByteStream> PutBOMUTF32LE476 static void PutBOM(OutputByteStream& os) { 477 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 478 os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu)); 479 os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu)); 480 os.Put(static_cast<typename OutputByteStream::Ch>(0x00u)); 481 os.Put(static_cast<typename OutputByteStream::Ch>(0x00u)); 482 } 483 484 template <typename OutputByteStream> PutUTF32LE485 static void Put(OutputByteStream& os, CharType c) { 486 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 487 os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu)); 488 os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu)); 489 os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu)); 490 os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu)); 491 } 492 }; 493 494 //! UTF-32 big endian encoding. 495 template<typename CharType = unsigned> 496 struct UTF32BE : UTF32<CharType> { 497 template <typename InputByteStream> TakeBOMUTF32BE498 static CharType TakeBOM(InputByteStream& is) { 499 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 500 CharType c = Take(is); 501 return static_cast<uint32_t>(c) == 0x0000FEFFu ? 
Take(is) : c; 502 } 503 504 template <typename InputByteStream> TakeUTF32BE505 static CharType Take(InputByteStream& is) { 506 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 507 unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24; 508 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16; 509 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8; 510 c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())); 511 return static_cast<CharType>(c); 512 } 513 514 template <typename OutputByteStream> PutBOMUTF32BE515 static void PutBOM(OutputByteStream& os) { 516 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 517 os.Put(static_cast<typename OutputByteStream::Ch>(0x00u)); 518 os.Put(static_cast<typename OutputByteStream::Ch>(0x00u)); 519 os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu)); 520 os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu)); 521 } 522 523 template <typename OutputByteStream> PutUTF32BE524 static void Put(OutputByteStream& os, CharType c) { 525 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 526 os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu)); 527 os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu)); 528 os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu)); 529 os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu)); 530 } 531 }; 532 533 /////////////////////////////////////////////////////////////////////////////// 534 // ASCII 535 536 //! ASCII encoding. 537 /*! http://en.wikipedia.org/wiki/ASCII 538 \tparam CharType Code unit for storing 7-bit ASCII data. Default is char. 
539 \note implements Encoding concept 540 */ 541 template<typename CharType = char> 542 struct ASCII { 543 typedef CharType Ch; 544 545 enum { supportUnicode = 0 }; 546 547 template<typename OutputStream> EncodeASCII548 static void Encode(OutputStream& os, unsigned codepoint) { 549 RAPIDJSON_ASSERT(codepoint <= 0x7F); 550 os.Put(static_cast<Ch>(codepoint & 0xFF)); 551 } 552 553 template<typename OutputStream> EncodeUnsafeASCII554 static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { 555 RAPIDJSON_ASSERT(codepoint <= 0x7F); 556 PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF)); 557 } 558 559 template <typename InputStream> DecodeASCII560 static bool Decode(InputStream& is, unsigned* codepoint) { 561 uint8_t c = static_cast<uint8_t>(is.Take()); 562 *codepoint = c; 563 return c <= 0X7F; 564 } 565 566 template <typename InputStream, typename OutputStream> ValidateASCII567 static bool Validate(InputStream& is, OutputStream& os) { 568 uint8_t c = static_cast<uint8_t>(is.Take()); 569 os.Put(static_cast<typename OutputStream::Ch>(c)); 570 return c <= 0x7F; 571 } 572 573 template <typename InputByteStream> TakeBOMASCII574 static CharType TakeBOM(InputByteStream& is) { 575 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 576 uint8_t c = static_cast<uint8_t>(Take(is)); 577 return static_cast<Ch>(c); 578 } 579 580 template <typename InputByteStream> TakeASCII581 static Ch Take(InputByteStream& is) { 582 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 583 return static_cast<Ch>(is.Take()); 584 } 585 586 template <typename OutputByteStream> PutBOMASCII587 static void PutBOM(OutputByteStream& os) { 588 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 589 (void)os; 590 } 591 592 template <typename OutputByteStream> PutASCII593 static void Put(OutputByteStream& os, Ch c) { 594 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 595 os.Put(static_cast<typename OutputByteStream::Ch>(c)); 596 } 597 
}; 598 599 /////////////////////////////////////////////////////////////////////////////// 600 // AutoUTF 601 602 //! Runtime-specified UTF encoding type of a stream. 603 enum UTFType { 604 kUTF8 = 0, //!< UTF-8. 605 kUTF16LE = 1, //!< UTF-16 little endian. 606 kUTF16BE = 2, //!< UTF-16 big endian. 607 kUTF32LE = 3, //!< UTF-32 little endian. 608 kUTF32BE = 4 //!< UTF-32 big endian. 609 }; 610 611 //! Dynamically select encoding according to stream's runtime-specified UTF encoding type. 612 /*! \note This class can be used with AutoUTFInputtStream and AutoUTFOutputStream, which provides GetType(). 613 */ 614 template<typename CharType> 615 struct AutoUTF { 616 typedef CharType Ch; 617 618 enum { supportUnicode = 1 }; 619 620 #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x 621 622 template<typename OutputStream> EncodeAutoUTF623 RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) { 624 typedef void (*EncodeFunc)(OutputStream&, unsigned); 625 static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Encode) }; 626 (*f[os.GetType()])(os, codepoint); 627 } 628 629 template<typename OutputStream> EncodeUnsafeAutoUTF630 RAPIDJSON_FORCEINLINE static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { 631 typedef void (*EncodeFunc)(OutputStream&, unsigned); 632 static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(EncodeUnsafe) }; 633 (*f[os.GetType()])(os, codepoint); 634 } 635 636 template <typename InputStream> DecodeAutoUTF637 RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) { 638 typedef bool (*DecodeFunc)(InputStream&, unsigned*); 639 static const DecodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Decode) }; 640 return (*f[is.GetType()])(is, codepoint); 641 } 642 643 template <typename InputStream, typename OutputStream> ValidateAutoUTF644 RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { 645 typedef bool 
(*ValidateFunc)(InputStream&, OutputStream&); 646 static const ValidateFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Validate) }; 647 return (*f[is.GetType()])(is, os); 648 } 649 650 #undef RAPIDJSON_ENCODINGS_FUNC 651 }; 652 653 /////////////////////////////////////////////////////////////////////////////// 654 // Transcoder 655 656 //! Encoding conversion. 657 template<typename SourceEncoding, typename TargetEncoding> 658 struct Transcoder { 659 //! Take one Unicode codepoint from source encoding, convert it to target encoding and put it to the output stream. 660 template<typename InputStream, typename OutputStream> TranscodeTranscoder661 RAPIDJSON_FORCEINLINE static bool Transcode(InputStream& is, OutputStream& os) { 662 unsigned codepoint; 663 if (!SourceEncoding::Decode(is, &codepoint)) 664 return false; 665 TargetEncoding::Encode(os, codepoint); 666 return true; 667 } 668 669 template<typename InputStream, typename OutputStream> TranscodeUnsafeTranscoder670 RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) { 671 unsigned codepoint; 672 if (!SourceEncoding::Decode(is, &codepoint)) 673 return false; 674 TargetEncoding::EncodeUnsafe(os, codepoint); 675 return true; 676 } 677 678 //! Validate one Unicode codepoint from an encoded stream. 679 template<typename InputStream, typename OutputStream> ValidateTranscoder680 RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { 681 return Transcode(is, os); // Since source/target encoding is different, must transcode. 682 } 683 }; 684 685 // Forward declaration. 686 template<typename Stream> 687 inline void PutUnsafe(Stream& stream, typename Stream::Ch c); 688 689 //! Specialization of Transcoder with same source and target encoding. 
690 template<typename Encoding> 691 struct Transcoder<Encoding, Encoding> { 692 template<typename InputStream, typename OutputStream> 693 RAPIDJSON_FORCEINLINE static bool Transcode(InputStream& is, OutputStream& os) { 694 os.Put(is.Take()); // Just copy one code unit. This semantic is different from primary template class. 695 return true; 696 } 697 698 template<typename InputStream, typename OutputStream> 699 RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) { 700 PutUnsafe(os, is.Take()); // Just copy one code unit. This semantic is different from primary template class. 701 return true; 702 } 703 704 template<typename InputStream, typename OutputStream> 705 RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { 706 return Encoding::Validate(is, os); // source/target encoding are the same 707 } 708 }; 709 710 RAPIDJSON_NAMESPACE_END 711 712 #if defined(__GNUC__) || defined(_MSC_VER) 713 RAPIDJSON_DIAG_POP 714 #endif 715 716 #endif // RAPIDJSON_ENCODINGS_H_ 717