// @file Utf8_16.h
// Copyright (C) 2002 Scott Kirkwood
//
// Permission to use, copy, modify, distribute and sell this code
// and its documentation for any purpose is hereby granted without fee,
// provided that the above copyright notice appear in all copies or
// any derived copies. Scott Kirkwood makes no representations
// about the suitability of this software for any purpose.
// It is provided "as is" without express or implied warranty.
//
// Notes: Used the UTF information I found at:
// http://www.cl.cam.ac.uk/~mgk25/unicode.html
////////////////////////////////////////////////////////////////////////////////

#ifndef UTF8_16_H
#define UTF8_16_H

// Included here so the header compiles on its own: the declarations below
// use assert(), std::size_t and std::FILE.
#include <cassert>
#include <cstddef>
#include <cstdio>

// Common base class: supplies the code-unit typedefs, the encoding
// enumeration and the k_Boms table declaration used by the classes below.
class Utf8_16 {
public:
	typedef unsigned short utf16; // 16 bits
	typedef unsigned char utf8; // 8 bits
	typedef unsigned char ubyte;
	enum encodingType {
		eUnknown,
		eUtf16BigEndian,
		eUtf16LittleEndian, // Default on Windows
		eUtf8,
		eLast
	};
	// One 3-byte row per encodingType value below eLast; defined outside this
	// header. Presumably the byte-order marks of each encoding - confirm
	// against the definition.
	static const utf8 k_Boms[eLast][3];
};

// Reads UTF-16 and outputs UTF-8
class Utf16_Iter : public Utf8_16 {
public:
	Utf16_Iter() noexcept;
	void reset() noexcept;
	// Attaches the iterator to nLen bytes starting at pBuf.
	// NOTE(review): the role of endSurrogate is not visible from this header;
	// see the implementation before relying on it.
	void set(const ubyte *pBuf, std::size_t nLen, encodingType eEncoding, ubyte *endSurrogate) noexcept;
	// Returns the current output byte (m_nCur).
	utf8 get() const noexcept {
		return m_nCur;
	}
	void operator++() noexcept;
	// True while the read position has not moved past m_pEnd.
	operator bool() const noexcept { return m_pRead <= m_pEnd; }
	utf16 read(const ubyte *pRead) const noexcept;

protected:
	enum eState {
		eStart,
		eSecondOf4Bytes,
		ePenultimate,
		eFinal
	};
protected:
	encodingType m_eEncoding;
	eState m_eState;
	utf8 m_nCur;
	int m_nCur16;
	const ubyte *m_pBuf;
	const ubyte *m_pRead;
	const ubyte *m_pEnd;
};

// Reads UTF-8 and outputs UTF-16
class Utf8_Iter : public Utf8_16 {
public:
	Utf8_Iter() noexcept;
	void reset() noexcept;
	void set(const ubyte *pBuf, std::size_t nLen, encodingType eEncoding);
	// Returns the current value (m_nCur). Only valid in the eStart state,
	// which is asserted; callers can check with canGet() first.
	int get() const noexcept {
		assert(m_eState == eStart);
		return m_nCur;
	}
	// True when the iterator is in the eStart state, i.e. get() may be called.
	bool canGet() const noexcept { return m_eState == eStart; }
	void operator++() noexcept;
	// True while the read position has not moved past m_pEnd.
	operator bool() const noexcept { return m_pRead <= m_pEnd; }

protected:
	void toStart() noexcept; // Put to start state
	enum eState {
		eStart,
		eSecondOf4Bytes,
		ePenultimate,
		eFinal
	};
protected:
	encodingType m_eEncoding;
	eState m_eState;
	int m_nCur;
	const ubyte *m_pBuf;
	const ubyte *m_pRead;
	const ubyte *m_pEnd;
};

// Reads UTF16 and outputs UTF8
class Utf8_16_Read : public Utf8_16 {
public:
	Utf8_16_Read();
	~Utf8_16_Read();

	// The class declares a destructor and holds raw pointers, so the
	// compiler-generated copy/move operations would duplicate those pointers.
	// Deleted so Utf8_16_Read objects can not be copied or moved.
	Utf8_16_Read(const Utf8_16_Read &) = delete;
	Utf8_16_Read(Utf8_16_Read &&) = delete;
	Utf8_16_Read &operator=(const Utf8_16_Read &) = delete;
	Utf8_16_Read &operator=(Utf8_16_Read &&) = delete;

	std::size_t convert(char *buf, std::size_t len);
	// Returns m_pNewBuf viewed as a char pointer.
	char *getNewBuf() noexcept { return reinterpret_cast<char *>(m_pNewBuf); }

	// Returns the encoding currently stored in m_eEncoding.
	encodingType getEncoding() const noexcept { return m_eEncoding; }
protected:
	int determineEncoding() noexcept;
private:
	encodingType m_eEncoding;
	ubyte *m_pBuf;
	ubyte *m_pNewBuf;
	std::size_t m_nBufSize;
	bool m_bFirstRead;
	ubyte m_leadSurrogate[2];
	std::size_t m_nLen;
	Utf16_Iter m_Iter16;
};

// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
class Utf8_16_Write : public Utf8_16 {
public:
	Utf8_16_Write();
	~Utf8_16_Write();

	// The class declares a destructor and holds raw pointers (file handle and
	// buffer), so the compiler-generated copy/move operations would duplicate
	// those pointers.
	// Deleted so Utf8_16_Write objects can not be copied or moved.
	Utf8_16_Write(const Utf8_16_Write &) = delete;
	Utf8_16_Write(Utf8_16_Write &&) = delete;
	Utf8_16_Write &operator=(const Utf8_16_Write &) = delete;
	Utf8_16_Write &operator=(Utf8_16_Write &&) = delete;

	void setEncoding(encodingType eType) noexcept;

	void setfile(std::FILE *pFile) noexcept;
	std::size_t fwrite(const void *p, std::size_t _size);
	int fclose() noexcept;
protected:
	encodingType m_eEncoding;
	std::FILE *m_pFile;
	utf16 *m_pBuf;
	std::size_t m_nBufSize;
	bool m_bFirstWrite;
};

#endif