1 /*
2  *  Copyright 2008-2014 Fabrice Colin
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, write to the Free Software
16  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  */
18 
19 #include <errno.h>
20 #include <iostream>
21 #include <glibmm/convert.h>
22 #include <glibmm/ustring.h>
23 
24 #include "StringManip.h"
25 #include "TextConverter.h"
26 
27 using std::clog;
28 using std::endl;
29 using std::string;
30 using namespace Glib;
31 
TextConverter(unsigned int maxErrors)32 TextConverter::TextConverter(unsigned int maxErrors) :
33 	m_utf8Locale(false),
34 	m_maxErrors(maxErrors),
35 	m_conversionErrors(0)
36 {
37 	// Get the locale charset
38 	m_utf8Locale = get_charset(m_localeCharset);
39 }
40 
~TextConverter()41 TextConverter::~TextConverter()
42 {
43 }
44 
convert(const dstring & text,string & fromCharset,const string & toCharset)45 dstring TextConverter::convert(const dstring &text,
46 	string &fromCharset, const string &toCharset)
47 {
48 	dstring outputText;
49 	char outputBuffer[8192];
50 	char *pInput = const_cast<char *>(text.c_str());
51 	gsize inputSize = (gsize)text.length();
52 	bool invalidSequence = false;
53 
54 	outputText.clear();
55 	try
56 	{
57 		IConv converter(toCharset, fromCharset);
58 
59 		while (inputSize > 0)
60 		{
61 			char *pOutput = outputBuffer;
62 			gsize outputSize = 8192;
63 
64 			size_t conversions = converter.iconv(&pInput, &inputSize, &pOutput, &outputSize);
65 			int errorCode = errno;
66 			if (conversions == static_cast<size_t>(-1))
67 			{
68 				if (errorCode == EILSEQ)
69 				{
70 					// Conversion was only partially successful
71 					++m_conversionErrors;
72 #ifdef DEBUG
73 					clog << "TextConverter::convert: invalid sequence" << endl;
74 #endif
75 					if (m_conversionErrors >= m_maxErrors)
76 					{
77 						// Give up
78 						return text;
79 					}
80 					converter.reset();
81 
82 					outputText.append(outputBuffer, 8192 - outputSize);
83 					if (invalidSequence == false)
84 					{
85 						outputText += "?";
86 						invalidSequence = true;
87 					}
88 
89 					// Skip that
90 					++pInput;
91 					--inputSize;
92 					continue;
93 				}
94 				else if (errorCode != E2BIG)
95 				{
96 #ifdef DEBUG
97 					clog << "TextConverter::convert: unknown error " << errorCode << endl;
98 #endif
99 					return text;
100 				}
101 			}
102 			else
103 			{
104 				invalidSequence = false;
105 			}
106 
107 			// Append what was successfully converted
108 			outputText.append(outputBuffer, 8192 - outputSize);
109 		}
110 
111 #ifdef DEBUG
112 		clog << "TextConverter::convert: " << m_conversionErrors << " conversion errors" << endl;
113 #endif
114 	}
115 	catch (Error &ce)
116 	{
117 #ifdef DEBUG
118 		clog << "TextConverter::convert: " << ce.what() << endl;
119 #endif
120 		outputText.clear();
121 
122 		string::size_type pos = fromCharset.find('_');
123 		if (pos != string::npos)
124 		{
125 			string fixedCharset(StringManip::replaceSubString(fromCharset, "_", "-"));
126 
127 #ifdef DEBUG
128 			clog << "TextConverter::convert: trying with charset " << fixedCharset << endl;
129 #endif
130 			fromCharset = fixedCharset;
131 			outputText = convert(text, fromCharset, toCharset);
132 		}
133 	}
134 	catch (...)
135 	{
136 #ifdef DEBUG
137 		clog << "TextConverter::convert: unknown exception" << endl;
138 #endif
139 		outputText.clear();
140 	}
141 
142 	return outputText;
143 }
144 
// Converts text to UTF-8 and resets the conversion error count first.
//
// charset names the encoding of text (compared case-insensitively); when it
// is empty, the locale's charset is assumed. The text is returned as is if
// it is empty, if charset is "utf-8", or if charset is empty and the locale
// uses UTF-8. charset itself is not modified here.
dstring TextConverter::toUTF8(const dstring &text,
	string &charset)
{
	string sourceCharset(StringManip::toLowerCase(charset));

	m_conversionErrors = 0;

	const bool charsetUnknown = sourceCharset.empty();
	const bool alreadyUtf8 = text.empty() ||
		(sourceCharset == "utf-8") ||
		(charsetUnknown && m_utf8Locale);
	if (alreadyUtf8)
	{
		// Nothing to convert
		return text;
	}

	if (charsetUnknown)
	{
		// Fall back to the charset of the current locale
		sourceCharset = m_localeCharset;
	}

	return convert(text, sourceCharset, "UTF-8");
}
172 
// Converts UTF-8 text to the given charset.
// NOTE(review): unlike toUTF8(), this doesn't reset m_conversionErrors
// before converting - confirm that carrying the count over is intended.
dstring TextConverter::fromUTF8(const dstring &text,
	const string &charset)
{
	// convert() takes the source charset by non-const reference,
	// so it needs a modifiable copy of the name
	string utf8Charset("UTF-8");

	return convert(text, utf8Charset, charset);
}
180 
// Converts UTF-8 text to the encoding of the current locale.
// Returns an empty string if the conversion throws.
string TextConverter::fromUTF8(const string &text)
{
	string localeText;

	try
	{
		localeText = locale_from_utf8(text);
	}
	catch (Error &ce)
	{
#ifdef DEBUG
		clog << "TextConverter::fromUTF8: " << ce.what() << endl;
#endif
		localeText.clear();
	}
	catch (...)
	{
#ifdef DEBUG
		clog << "TextConverter::fromUTF8: unknown exception" << endl;
#endif
		localeText.clear();
	}

	return localeText;
}
202 
getErrorsCount(void) const203 unsigned int TextConverter::getErrorsCount(void) const
204 {
205 	return m_conversionErrors;
206 }
207 
208