1 /*
2 Copyright (C) 2001-2006, William Joseph.
3 All Rights Reserved.
4
5 This file is part of GtkRadiant.
6
7 GtkRadiant is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 GtkRadiant is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GtkRadiant; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #if !defined(INCLUDED_CONVERT_H)
23 #define INCLUDED_CONVERT_H
24
25 /// \file
26 /// \brief Character encoding conversion.
27
28 #include "debugging/debugging.h"
29 #include <algorithm>
30 #include <glib.h>
31 #include <glib.h>
32
33 #include "character.h"
34
35 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
utf8_character_length(const char * character)36 inline std::size_t utf8_character_length(const char* character)
37 {
38 if((*character & 0xE0) == 0xC0) // 110xxxxx
39 {
40 return 2;
41 }
42 else if((*character & 0xF0) == 0xE0) // 1110xxxx
43 {
44 return 3;
45 }
46 else if((*character & 0xF8) == 0xF0) // 11110xxx
47 {
48 return 4;
49 }
50 else if((*character & 0xFC) == 0xF8) // 111110xx
51 {
52 return 5;
53 }
54 else if((*character & 0xFE) == 0xFC) // 1111110x
55 {
56 return 6;
57 }
58 ERROR_MESSAGE("");
59 return 0;
60 }
61
62 struct UTF8Character
63 {
64 const char* buffer;
65 std::size_t length;
UTF8CharacterUTF8Character66 UTF8Character() : buffer(0), length(0)
67 {
68 }
UTF8CharacterUTF8Character69 UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes))
70 {
71 }
72 };
73
74 inline bool operator<(const UTF8Character& self, const UTF8Character& other)
75 {
76 return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length);
77 }
78
79 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
80 template<typename TextOutputStreamType>
ostream_write(TextOutputStreamType & ostream,const UTF8Character & c)81 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)
82 {
83 for(const char* p = c.buffer; p != c.buffer + c.length; ++p)
84 {
85 ostream << HexChar(*p);
86 }
87 return ostream;
88 }
89
90
91
92 /// \brief The character-set encoding for the current C locale.
93 ///
94 /// Obtain the global instance with globalCharacterSet().
95 class CharacterSet
96 {
97 const char* m_charSet;
98 public:
CharacterSet()99 CharacterSet()
100 {
101 if(g_get_charset(&m_charSet) != FALSE)
102 {
103 m_charSet = 0;
104 }
105 }
isUTF8()106 bool isUTF8() const
107 {
108 return m_charSet == 0;
109 }
get()110 const char* get() const
111 {
112 return m_charSet;
113 }
114 };
115
116 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
117
118 /// \brief Returns the global instance of CharacterSet.
globalCharacterSet()119 inline CharacterSet& globalCharacterSet()
120 {
121 return GlobalCharacterSet::instance();
122 }
123
124
125 class UTF8CharacterToExtendedASCII
126 {
127 public:
128 UTF8Character m_utf8;
129 char m_c;
UTF8CharacterToExtendedASCII()130 UTF8CharacterToExtendedASCII() : m_c('\0')
131 {
132 }
UTF8CharacterToExtendedASCII(const UTF8Character & utf8,char c)133 UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c)
134 {
135 }
136 };
137
138 inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
139 {
140 return self.m_utf8 < other.m_utf8;
141 }
142
/// \brief Returns the low seven bits of \p c as a table index in the range [0, 127].
inline std::size_t extended_ascii_to_index(char c)
{
  const int lowSevenBits = c & 0x7F;
  return static_cast<std::size_t>(lowSevenBits);
}
147
/// \brief Returns the character obtained by setting the high bit (0x80) on index \p i.
inline char extended_ascii_for_index(std::size_t i)
{
  const std::size_t withHighBit = i | 0x80;
  return static_cast<char>(withHighBit);
}
152
153 /// \brief The active extended-ascii character set encoding.
154 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
155 ///
156 /// Obtain the global instance with globalExtendedASCIICharacterSet().
157 class ExtendedASCIICharacterSet
158 {
159 typedef char UTF8CharBuffer[6];
160 UTF8CharBuffer m_converted[128];
161 UTF8Character m_decodeMap[128];
162 UTF8CharacterToExtendedASCII m_encodeMap[128];
163 public:
ExtendedASCIICharacterSet()164 ExtendedASCIICharacterSet()
165 {
166 if(!globalCharacterSet().isUTF8())
167 {
168 GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
169 for(std::size_t i = 1; i < 128; ++i)
170 {
171 char c = extended_ascii_for_index(i);
172 char* inbuf = &c;
173 std::size_t inbytesleft = 1;
174 char* outbuf = m_converted[i];
175 std::size_t outbytesleft = 6;
176 if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))
177 {
178 UTF8Character utf8(m_converted[i]);
179 m_decodeMap[i] = utf8;
180 m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
181 }
182 }
183 g_iconv_close(descriptor);
184 std::sort(m_encodeMap, m_encodeMap + 128);
185 }
186 }
187 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
188 /// Useful for debugging.
print()189 void print() const
190 {
191 globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
192 for(std::size_t i = 1; i < 128; ++i)
193 {
194 if(m_decodeMap[i].buffer != 0)
195 {
196 globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
197 }
198 }
199 }
200 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
201 /// \p c must be an extended-ascii character.
decode(char c)202 const UTF8Character& decode(char c) const
203 {
204 ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
205 ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
206 ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
207 return m_decodeMap[extended_ascii_to_index(c)];
208 }
209 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
210 /// \p c must map to an extended-ascii character.
encode(const UTF8Character & c)211 char encode(const UTF8Character& c) const
212 {
213 ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
214 ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
215 std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
216 = std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
217 ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
218 return (*range.first).m_c;
219 }
220 };
221
222 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
223
224 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
globalExtendedASCIICharacterSet()225 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()
226 {
227 return GlobalExtendedASCIICharacterSet::instance();
228 }
229
230 class ConvertUTF8ToLocale
231 {
232 public:
233 StringRange m_range;
ConvertUTF8ToLocale(const char * string)234 ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string)))
235 {
236 }
ConvertUTF8ToLocale(const StringRange & range)237 ConvertUTF8ToLocale(const StringRange& range) : m_range(range)
238 {
239 }
240 };
241
242 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
243 template<typename TextOutputStreamType>
ostream_write(TextOutputStreamType & ostream,const ConvertUTF8ToLocale & convert)244 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
245 {
246 if(globalCharacterSet().isUTF8())
247 {
248 return ostream << convert.m_range;
249 }
250
251 for(const char* p = convert.m_range.begin; p != convert.m_range.end;)
252 {
253 if(!char_is_ascii(*p))
254 {
255 UTF8Character c(p);
256 ostream << globalExtendedASCIICharacterSet().encode(c);
257 p += c.length;
258 }
259 else
260 {
261 ostream << *p++;
262 }
263 }
264 return ostream;
265 }
266
267
268 class ConvertLocaleToUTF8
269 {
270 public:
271 StringRange m_range;
ConvertLocaleToUTF8(const char * string)272 ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string)))
273 {
274 }
ConvertLocaleToUTF8(const StringRange & range)275 ConvertLocaleToUTF8(const StringRange& range) : m_range(range)
276 {
277 }
278 };
279
280 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
281 template<typename TextOutputStreamType>
ostream_write(TextOutputStreamType & ostream,const ConvertLocaleToUTF8 & convert)282 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
283 {
284 if(globalCharacterSet().isUTF8())
285 {
286 return ostream << convert.m_range;
287 }
288
289 for(const char* p = convert.m_range.begin; p != convert.m_range.end; ++p)
290 {
291 if(!char_is_ascii(*p))
292 {
293 UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
294 ostream.write(c.buffer, c.length);
295 }
296 else
297 {
298 ostream << *p;
299 }
300 }
301 return ostream;
302 }
303
304
305 #endif
306