1 // @file Utf8_16.h
2 // Copyright (C) 2002 Scott Kirkwood
3 //
4 // Permission to use, copy, modify, distribute and sell this code
5 // and its documentation for any purpose is hereby granted without fee,
6 // provided that the above copyright notice appear in all copies or
7 // any derived copies.  Scott Kirkwood makes no representations
8 // about the suitability of this software for any purpose.
9 // It is provided "as is" without express or implied warranty.
10 //
11 // Notes: Used the UTF information I found at:
12 //   http://www.cl.cam.ac.uk/~mgk25/unicode.html
13 ////////////////////////////////////////////////////////////////////////////////
14 
15 #ifndef UTF8_16_H
16 #define UTF8_16_H
17 
// Common base for the converter classes in this header. It only supplies
// shared vocabulary: code-unit type aliases, the encoding enumeration and
// the k_Boms table.
class Utf8_16 {
public:
	using utf16 = unsigned short; // one UTF-16 code unit (intended as 16 bits)
	using utf8 = unsigned char;   // one UTF-8 code unit (8 bits)
	using ubyte = unsigned char;  // raw, not-yet-interpreted byte

	// Encodings distinguished by this code. eLast is not an encoding; it is
	// the number of preceding enumerators and is used to size k_Boms.
	enum encodingType {
		eUnknown = 0,
		eUtf16BigEndian,
		eUtf16LittleEndian,  // Default on Windows
		eUtf8,
		eLast
	};

	// One row of three bytes per encodingType value (presumably the byte
	// order mark for that encoding). Only declared here; the definition
	// lives in the implementation file.
	static const utf8 k_Boms[eLast][3];
};
32 
33 // Reads UTF-16 and outputs UTF-8
34 class Utf16_Iter : public Utf8_16 {
35 public:
36 	Utf16_Iter() noexcept;
37 	void reset() noexcept;
38 	void set(const ubyte *pBuf, size_t nLen, encodingType eEncoding, ubyte *endSurrogate) noexcept;
get()39 	utf8 get() const noexcept {
40 		return m_nCur;
41 	}
42 	void operator++() noexcept;
43 	operator bool() const noexcept { return m_pRead <= m_pEnd; }
44 	utf16 read(const ubyte *pRead) const noexcept;
45 
46 protected:
47 	enum eState {
48 		eStart,
49 		eSecondOf4Bytes,
50 		ePenultimate,
51 		eFinal
52 	};
53 protected:
54 	encodingType m_eEncoding;
55 	eState m_eState;
56 	utf8 m_nCur;
57 	int m_nCur16;
58 	const ubyte *m_pBuf;
59 	const ubyte *m_pRead;
60 	const ubyte *m_pEnd;
61 };
62 
63 // Reads UTF-8 and outputs UTF-16
64 class Utf8_Iter : public Utf8_16 {
65 public:
66 	Utf8_Iter() noexcept;
67 	void reset() noexcept;
68 	void set(const ubyte *pBuf, size_t nLen, encodingType eEncoding);
get()69 	int get() const noexcept {
70 		assert(m_eState == eStart);
71 		return m_nCur;
72 	}
canGet()73 	bool canGet() const noexcept { return m_eState == eStart; }
74 	void operator++() noexcept;
75 	operator bool() const noexcept { return m_pRead <= m_pEnd; }
76 
77 protected:
78 	void toStart() noexcept; // Put to start state
79 	enum eState {
80 		eStart,
81 		eSecondOf4Bytes,
82 		ePenultimate,
83 		eFinal
84 	};
85 protected:
86 	encodingType m_eEncoding;
87 	eState m_eState;
88 	int m_nCur;
89 	const ubyte *m_pBuf;
90 	const ubyte *m_pRead;
91 	const ubyte *m_pEnd;
92 };
93 
94 // Reads UTF16 and outputs UTF8
95 class Utf8_16_Read : public Utf8_16 {
96 public:
97 	Utf8_16_Read();
98 	~Utf8_16_Read();
99 
100 	size_t convert(char *buf, size_t len);
getNewBuf()101 	char *getNewBuf() noexcept { return reinterpret_cast<char *>(m_pNewBuf); }
102 
getEncoding()103 	encodingType getEncoding() const noexcept { return m_eEncoding; }
104 protected:
105 	int determineEncoding() noexcept;
106 private:
107 	encodingType m_eEncoding;
108 	ubyte *m_pBuf;
109 	ubyte *m_pNewBuf;
110 	size_t m_nBufSize;
111 	bool m_bFirstRead;
112 	ubyte m_leadSurrogate[2];
113 	size_t m_nLen;
114 	Utf16_Iter m_Iter16;
115 };
116 
117 // Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
118 class Utf8_16_Write : public Utf8_16 {
119 public:
120 	Utf8_16_Write();
121 	~Utf8_16_Write();
122 
123 	void setEncoding(encodingType eType) noexcept;
124 
125 	void setfile(FILE *pFile) noexcept;
126 	size_t fwrite(const void *p, size_t _size);
127 	int fclose() noexcept;
128 protected:
129 	encodingType m_eEncoding;
130 	FILE *m_pFile;
131 	utf16 *m_pBuf;
132 	size_t m_nBufSize;
133 	bool m_bFirstWrite;
134 };
135 
136 #endif
137