1 /* utf8convert.cc: convert a string to UTF-8 encoding.
2 *
3 * Copyright (C) 2006,2007,2008,2010 Olly Betts
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20 #include <config.h>
21
22 #include "utf8convert.h"
23
24 #include <algorithm>
25 #include <string>
26
27 #include "safeerrno.h"
28 #ifdef USE_ICONV
29 # include <iconv.h>
30 #else
31 # include <xapian.h>
32 #endif
33 #include "strcasecmp.h"
34 #include "stringutils.h"
35
36 using namespace std;
37
38 void
convert_to_utf8(string & text,const string & charset)39 convert_to_utf8(string & text, const string & charset)
40 {
41 // Shortcut if it's already in utf8!
42 if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
43 return;
44 if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
45 return;
46
47 // Nobody has told us what charset it's in, so do as little work as
48 // possible!
49 if (charset.empty())
50 return;
51
52 char buf[1024];
53
54 #ifdef USE_ICONV
55 iconv_t conv = iconv_open("UTF-8", charset.c_str());
56 if (conv == (iconv_t)-1) {
57 if (charset.size() < 4 || charset[3] == '-')
58 return;
59
60 // Try correcting common misspellings of UTF-16 and UCS-2 charsets.
61 // In particular, handle ' ' or '_' instead of '-', and a missing '-',
62 // so: UCS2 -> UCS-2, UTF_16 -> UTF-16, etc.
63 //
64 // Note: libiconv on OSX doesn't support these misspellings, though
65 // libiconv on Ubuntu does.
66 if (strncasecmp(charset.c_str(), "ucs", 3) != 0 &&
67 strncasecmp(charset.c_str(), "utf", 3) != 0) {
68 return;
69 }
70
71 string adjusted_charset(charset, 0, 3);
72 adjusted_charset += '-';
73 if (charset[3] == ' ' || charset[3] == '_') {
74 adjusted_charset.append(charset, 4, string::npos);
75 } else {
76 adjusted_charset.append(charset, 3, string::npos);
77 }
78
79 conv = iconv_open("UTF-8", adjusted_charset.c_str());
80 if (conv == (iconv_t)-1) return;
81 }
82
83 string tmp;
84
85 ICONV_INPUT_TYPE in = const_cast<char *>(text.c_str());
86 size_t in_len = text.size();
87 while (in_len) {
88 char * out = buf;
89 size_t out_len = sizeof(buf);
90 if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
91 errno != E2BIG) {
92 // FIXME: how to handle this?
93 break;
94 }
95 tmp.append(buf, out - buf);
96 }
97
98 (void)iconv_close(conv);
99 #else
100 /* If we don't have iconv, handle iso-8859-1, utf-16/ucs-2,
101 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
102 string tmp;
103 const char * p = charset.c_str();
104
105 bool utf16 = false;
106 if (strncasecmp(p, "utf", 3) == 0) {
107 p += 3;
108 if (*p == '-' || *p == '_' || *p == ' ') ++p;
109 if (*p != '1' || p[1] != '6') return;
110 p += 2;
111 utf16 = true;
112 } else if (strncasecmp(p, "ucs", 3) == 0) {
113 p += 3;
114 if (*p == '-' || *p == '_' || *p == ' ') ++p;
115 if (*p != '2') return;
116 ++p;
117 utf16 = true;
118 }
119
120 if (utf16) {
121 if (text.size() < 2) return;
122
123 bool big_endian = true;
124 string::const_iterator i = text.begin();
125 if (*p == '\0') {
126 if (startswith(text, "\xfe\xff")) {
127 i += 2;
128 } else if (startswith(text, "\xff\xfe")) {
129 big_endian = false;
130 i += 2;
131 }
132 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
133 // speaking, we're not meant to assume anything for UCS-2 with
134 // no BOM, but we've got to do something, so we might as well
135 // assume it's UTF-16 mislabelled, which is easy and sane.
136 } else if (strcasecmp(p, "LE") == 0) {
137 big_endian = false;
138 } else if (!(strcasecmp(p, "BE") == 0)) {
139 return;
140 }
141
142 tmp.reserve(text.size() / 2);
143
144 size_t start = 0;
145 if (text.size() & 1) {
146 // If there's a half-character at the end, nuke it now to make the
147 // conversion loop below simpler.
148 text.resize(text.size() - 1);
149 }
150
151 while (i != text.end()) {
152 unsigned ch = static_cast<unsigned char>(*i++);
153 unsigned ch2 = static_cast<unsigned char>(*i++);
154 if (big_endian) {
155 ch = (ch << 8) | ch2;
156 } else {
157 ch = (ch2 << 8) | ch;
158 }
159 if (ch >> 10 == 0xd800 >> 10) {
160 // Surrogate pair.
161 if (i == text.end()) break;
162 unsigned hi = (ch & 0x3ff);
163 ch = static_cast<unsigned char>(*i++);
164 ch2 = static_cast<unsigned char>(*i++);
165 if (big_endian) {
166 ch = (ch << 8) | ch2;
167 } else {
168 ch = (ch2 << 8) | ch;
169 }
170 if (ch >> 10 == 0xdc00 >> 10) {
171 ch &= 0x3ff;
172 ch |= (hi << 10);
173 ch += 0x10000;
174 }
175 }
176 start += Xapian::Unicode::to_utf8(ch, buf + start);
177 if (start >= sizeof(buf) - 4) {
178 tmp.append(buf, start);
179 start = 0;
180 }
181 }
182 if (start) tmp.append(buf, start);
183 } else {
184 if (strncasecmp(p, "iso", 3) == 0) {
185 p += 3;
186 if (*p == '-' || *p == '_' || *p == ' ') ++p;
187 }
188 if (strncmp(p, "8859", 4) != 0) return;
189 p += 4;
190 if (*p == '-' || *p == '_' || *p == ' ') ++p;
191 if (strcmp(p, "1") != 0) return;
192
193 // FIXME: pull this out as a standard "normalise utf-8" function?
194 tmp.reserve(text.size());
195
196 size_t start = 0;
197 for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
198 unsigned ch = static_cast<unsigned char>(*i);
199 start += Xapian::Unicode::to_utf8(ch, buf + start);
200 if (start >= sizeof(buf) - 4) {
201 tmp.append(buf, start);
202 start = 0;
203 }
204 }
205 if (start) tmp.append(buf, start);
206 }
207 #endif
208
209 swap(text, tmp);
210 }
211