1 /*
2 * Copyright 2008-2014 Fabrice Colin
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18
19 #include <errno.h>
20 #include <iostream>
21 #include <glibmm/convert.h>
22 #include <glibmm/ustring.h>
23
24 #include "StringManip.h"
25 #include "TextConverter.h"
26
27 using std::clog;
28 using std::endl;
29 using std::string;
30 using namespace Glib;
31
TextConverter(unsigned int maxErrors)32 TextConverter::TextConverter(unsigned int maxErrors) :
33 m_utf8Locale(false),
34 m_maxErrors(maxErrors),
35 m_conversionErrors(0)
36 {
37 // Get the locale charset
38 m_utf8Locale = get_charset(m_localeCharset);
39 }
40
~TextConverter()41 TextConverter::~TextConverter()
42 {
43 }
44
/**
 * Converts text from one charset to another.
 *
 * The input is run through the converter in chunks, using a fixed-size
 * output buffer. When an invalid sequence is met (EILSEQ), one input byte
 * is skipped and m_conversionErrors is incremented; each run of consecutive
 * invalid bytes is replaced with a single "?" in the output.
 *
 * @param text the text to convert.
 * @param fromCharset the charset of text. If setting up or running the
 * conversion raises an Error and this name contains an underscore, it is
 * overwritten with the same name where "_" is replaced with "-", and the
 * conversion is attempted again with that name.
 * @param toCharset the charset to convert to.
 * @return the converted text. The input text is returned unchanged if
 * m_conversionErrors reaches m_maxErrors or if the conversion fails with an
 * error other than EILSEQ or E2BIG. An empty string is returned if an
 * exception was caught and no retry was possible.
 */
dstring TextConverter::convert(const dstring &text,
	string &fromCharset, const string &toCharset)
{
	dstring outputText;
	char outputBuffer[8192];
	// The conversion API takes a non-const pointer to the input
	char *pInput = const_cast<char *>(text.c_str());
	gsize inputSize = static_cast<gsize>(text.length());
	// True while we are skipping over a run of consecutive invalid bytes
	bool invalidSequence = false;

	try
	{
		IConv converter(toCharset, fromCharset);

		while (inputSize > 0)
		{
			char *pOutput = outputBuffer;
			gsize outputSize = sizeof(outputBuffer);

			size_t conversions = converter.iconv(&pInput, &inputSize, &pOutput, &outputSize);
			// Save errno right away, before anything else can overwrite it
			int errorCode = errno;
			const gsize convertedSize = sizeof(outputBuffer) - outputSize;

			if (convertedSize > 0)
			{
				// Valid text was converted since the last invalid byte, so the
				// previous run of invalid bytes is over : the next invalid byte
				// starts a new run and gets its own placeholder
				invalidSequence = false;
			}

			if (conversions == static_cast<size_t>(-1))
			{
				if (errorCode == EILSEQ)
				{
					// Conversion was only partially successful
					++m_conversionErrors;
#ifdef DEBUG
					clog << "TextConverter::convert: invalid sequence" << endl;
#endif
					if (m_conversionErrors >= m_maxErrors)
					{
						// Give up
						return text;
					}
					converter.reset();

					// Keep what was converted before the invalid sequence
					outputText.append(outputBuffer, convertedSize);
					if (invalidSequence == false)
					{
						// One placeholder per run of invalid bytes
						outputText += "?";
						invalidSequence = true;
					}

					// Skip that
					++pInput;
					--inputSize;
					continue;
				}
				else if (errorCode != E2BIG)
				{
#ifdef DEBUG
					clog << "TextConverter::convert: unknown error " << errorCode << endl;
#endif
					return text;
				}
				// E2BIG : the output buffer is full, flush it below and carry on
			}
			else
			{
				invalidSequence = false;
			}

			// Append what was successfully converted
			outputText.append(outputBuffer, convertedSize);
		}

#ifdef DEBUG
		clog << "TextConverter::convert: " << m_conversionErrors << " conversion errors" << endl;
#endif
	}
	catch (const Error &ce)
	{
#ifdef DEBUG
		clog << "TextConverter::convert: " << ce.what() << endl;
#endif
		outputText.clear();

		// Try again with underscores turned into dashes; the fixed name no
		// longer has underscores, so this can't recurse more than once
		string::size_type pos = fromCharset.find('_');
		if (pos != string::npos)
		{
			string fixedCharset(StringManip::replaceSubString(fromCharset, "_", "-"));

#ifdef DEBUG
			clog << "TextConverter::convert: trying with charset " << fixedCharset << endl;
#endif
			fromCharset = fixedCharset;
			outputText = convert(text, fromCharset, toCharset);
		}
	}
	catch (...)
	{
#ifdef DEBUG
		clog << "TextConverter::convert: unknown exception" << endl;
#endif
		outputText.clear();
	}

	return outputText;
}
144
/**
 * Converts text to UTF-8.
 *
 * The error count is reset on every call. The charset argument is only
 * read here; the conversion works on a lower-cased copy of it.
 *
 * @param text the text to convert.
 * @param charset the charset of text; if empty, the locale's charset is
 * assumed.
 * @return text itself when it is empty, when charset is "utf-8" in any
 * letter case, or when charset is empty and the locale uses UTF-8;
 * otherwise whatever convert() returns.
 */
dstring TextConverter::toUTF8(const dstring &text,
	string &charset)
{
	string sourceCharset(StringManip::toLowerCase(charset));

	// Start counting errors afresh
	m_conversionErrors = 0;

	const bool alreadyUTF8 = (sourceCharset == "utf-8");
	if (text.empty() || alreadyUTF8)
	{
		// Nothing to do
		return text;
	}

	if (sourceCharset.empty())
	{
		// No charset was specified, fall back to the locale's
		if (m_utf8Locale)
		{
			// ...which is UTF-8 already
			return text;
		}

		sourceCharset = m_localeCharset;
	}

	return convert(text, sourceCharset, "UTF-8");
}
172
/**
 * Converts UTF-8 text to the given charset.
 *
 * @param text the UTF-8 text to convert.
 * @param charset the charset to convert to.
 * @return whatever convert() returns for a conversion from "UTF-8".
 */
dstring TextConverter::fromUTF8(const dstring &text,
	const string &charset)
{
	// convert() wants a modifiable source charset name
	string utf8Charset("UTF-8");

	return convert(text, utf8Charset, charset);
}
180
/**
 * Converts UTF-8 text with locale_from_utf8().
 *
 * @param text the UTF-8 text to convert.
 * @return the converted text, or an empty string if the conversion
 * raised an exception.
 */
string TextConverter::fromUTF8(const string &text)
{
	// Stays empty if the conversion fails
	string localeText;

	try
	{
		localeText = locale_from_utf8(text);
	}
	catch (Error &ce)
	{
#ifdef DEBUG
		clog << "TextConverter::fromUTF8: " << ce.what() << endl;
#endif
	}
	catch (...)
	{
#ifdef DEBUG
		clog << "TextConverter::fromUTF8: unknown exception" << endl;
#endif
	}

	return localeText;
}
202
getErrorsCount(void) const203 unsigned int TextConverter::getErrorsCount(void) const
204 {
205 return m_conversionErrors;
206 }
207
208