1 #include <string> 2 #include <climits> 3 #include "char_ucs.h" 4 5 /* 6 7 copyright (c) 2006, 2015 squell <squell@alumina.nl> 8 9 use, modification, copying and distribution of this software is permitted 10 under the conditions described in the file 'COPYING'. 11 12 */ 13 14 namespace charset { 15 using namespace std; 16 17 namespace { 18 union wide { wide(wchar_t wc)19 wide(wchar_t wc) : code(wc) { } 20 wchar_t code; 21 char raw[sizeof(wchar_t)]; 22 }; 23 24 template<class T> inline operator +=(std::basic_string<T> & str,const wide w)25 std::basic_string<T>& operator+=(std::basic_string<T>& str, const wide w) 26 { 27 return str += w.code; 28 } 29 operator +=(std::string & str,const wide w)30 inline std::string& operator+=(std::string& str, const wide w) 31 { 32 return str.append(w.raw, sizeof w.raw); 33 } 34 } 35 decode(const char * s,size_t len,byte_order ord)36 conv<>::data conv_wide::decode(const char* s, size_t len, byte_order ord) 37 { 38 if(!(len &= ~1U)) return conv<>::data(); // force len to 2k, k > 0 39 const char* end = s+len; 40 41 conv<>::data build; 42 build.reserve(len / sizeof(wchar_t)); 43 bool i = (ord == big_endian); 44 45 switch(wide( s[0^i] & 0xFF | s[1^i]<<8 & 0xFF00U ).code) { 46 default: break; 47 case 0xFFFE: i = !i; 48 case 0xFEFF: s += 2; 49 } 50 51 for( ; s < end; s+=2) { 52 wide ch( s[0^i] & 0xFF | s[1^i]<<8 & 0xFF00U ); 53 if(ch.code < 0xD800 || ch.code >= 0xE000) 54 build += ch; 55 else if(ch.code < 0xDC00 && (s+=2) < end) { // UTF-16 surrogate 56 wide lo( s[0^i] & 0xFF | s[1^i]<<8 & 0xFF00U ); 57 if(lo.code >= 0xDC00 && lo.code < 0xE000) 58 build += wide((ch.code&0x3FF)<<10 | (lo.code&0x3FF) | 0x10000); 59 } 60 } 61 return build; 62 } 63 encode(const void * p,size_t len,byte_order ord)64 string conv_wide::encode(const void* p, size_t len, byte_order ord) 65 { 66 const wchar_t* w = (wchar_t*)p; 67 std::string build; 68 build.reserve(len); 69 int i = (ord == big_endian) * 8; 70 71 if(ord == marked) { // write BOM 72 (build += '\xFF') += '\xFE'; 73 } 74 75 for( ; len--; ) { 76 wchar_t c = *w++; 77 if(c < 0x10000) // innocent warning by gcc 78 (build += c>>i & 0xFF) += c>>(8^i) & 0xFF; 79 else { // encode a UTF16 surrogate pair 80 c -= 0x10000; 81 wchar_t hi = (c>>10)&0x3FF | 0xD800, lo = c&0x3FF | 0xDC00; 82 (build += hi>>i & 0xFF) += hi>>(8^i) & 0xFF; 83 (build += lo>>i & 0xFF) += lo>>(8^i) & 0xFF; 84 } 85 } 86 return build; 87 } 88 89 } 90 91