1 /* utf8convert.cc: convert a string to UTF-8 encoding.
2  *
3  * Copyright (C) 2006,2007,2008,2010 Olly Betts
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
18  */
19 
20 #include <config.h>
21 
22 #include "utf8convert.h"
23 
24 #include <algorithm>
25 #include <string>
26 
27 #include "safeerrno.h"
28 #ifdef USE_ICONV
29 # include <iconv.h>
30 #else
31 # include <xapian.h>
32 #endif
33 #include "strcasecmp.h"
34 #include "stringutils.h"
35 
36 using namespace std;
37 
38 void
convert_to_utf8(string & text,const string & charset)39 convert_to_utf8(string & text, const string & charset)
40 {
41     // Shortcut if it's already in utf8!
42     if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
43 	return;
44     if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
45 	return;
46 
47     // Nobody has told us what charset it's in, so do as little work as
48     // possible!
49     if (charset.empty())
50 	return;
51 
52     char buf[1024];
53 
54 #ifdef USE_ICONV
55     iconv_t conv = iconv_open("UTF-8", charset.c_str());
56     if (conv == (iconv_t)-1) {
57 	if (charset.size() < 4 || charset[3] == '-')
58 	    return;
59 
60 	// Try correcting common misspellings of UTF-16 and UCS-2 charsets.
61 	// In particular, handle ' ' or '_' instead of '-', and a missing '-',
62 	// so: UCS2 -> UCS-2, UTF_16 -> UTF-16, etc.
63 	//
64 	// Note: libiconv on OSX doesn't support these misspellings, though
65 	// libiconv on Ubuntu does.
66 	if (strncasecmp(charset.c_str(), "ucs", 3) != 0 &&
67 	    strncasecmp(charset.c_str(), "utf", 3) != 0) {
68 	    return;
69 	}
70 
71 	string adjusted_charset(charset, 0, 3);
72 	adjusted_charset += '-';
73 	if (charset[3] == ' ' || charset[3] == '_') {
74 	    adjusted_charset.append(charset, 4, string::npos);
75 	} else {
76 	    adjusted_charset.append(charset, 3, string::npos);
77 	}
78 
79 	conv = iconv_open("UTF-8", adjusted_charset.c_str());
80 	if (conv == (iconv_t)-1) return;
81     }
82 
83     string tmp;
84 
85     ICONV_INPUT_TYPE in = const_cast<char *>(text.c_str());
86     size_t in_len = text.size();
87     while (in_len) {
88 	char * out = buf;
89 	size_t out_len = sizeof(buf);
90 	if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
91 	    errno != E2BIG) {
92 	    // FIXME: how to handle this?
93 	    break;
94 	}
95 	tmp.append(buf, out - buf);
96     }
97 
98     (void)iconv_close(conv);
99 #else
100     /* If we don't have iconv, handle iso-8859-1, utf-16/ucs-2,
101      * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
102     string tmp;
103     const char * p = charset.c_str();
104 
105     bool utf16 = false;
106     if (strncasecmp(p, "utf", 3) == 0) {
107 	p += 3;
108 	if (*p == '-' || *p == '_' || *p == ' ') ++p;
109 	if (*p != '1' || p[1] != '6') return;
110 	p += 2;
111 	utf16 = true;
112     } else if (strncasecmp(p, "ucs", 3) == 0) {
113 	p += 3;
114 	if (*p == '-' || *p == '_' || *p == ' ') ++p;
115 	if (*p != '2') return;
116 	++p;
117 	utf16 = true;
118     }
119 
120     if (utf16) {
121 	if (text.size() < 2) return;
122 
123 	bool big_endian = true;
124 	string::const_iterator i = text.begin();
125 	if (*p == '\0') {
126 	    if (startswith(text, "\xfe\xff")) {
127 		i += 2;
128 	    } else if (startswith(text, "\xff\xfe")) {
129 		big_endian = false;
130 		i += 2;
131 	    }
132 	    // UTF-16 with no BOM is meant to be assumed to be BE.  Strictly
133 	    // speaking, we're not meant to assume anything for UCS-2 with
134 	    // no BOM, but we've got to do something, so we might as well
135 	    // assume it's UTF-16 mislabelled, which is easy and sane.
136 	} else if (strcasecmp(p, "LE") == 0) {
137 	    big_endian = false;
138 	} else if (!(strcasecmp(p, "BE") == 0)) {
139 	    return;
140 	}
141 
142 	tmp.reserve(text.size() / 2);
143 
144 	size_t start = 0;
145 	if (text.size() & 1) {
146 	    // If there's a half-character at the end, nuke it now to make the
147 	    // conversion loop below simpler.
148 	    text.resize(text.size() - 1);
149 	}
150 
151 	while (i != text.end()) {
152 	    unsigned ch = static_cast<unsigned char>(*i++);
153 	    unsigned ch2 = static_cast<unsigned char>(*i++);
154 	    if (big_endian) {
155 		ch = (ch << 8) | ch2;
156 	    } else {
157 		ch = (ch2 << 8) | ch;
158 	    }
159 	    if (ch >> 10 == 0xd800 >> 10) {
160 		// Surrogate pair.
161 		if (i == text.end()) break;
162 		unsigned hi = (ch & 0x3ff);
163 		ch = static_cast<unsigned char>(*i++);
164 		ch2 = static_cast<unsigned char>(*i++);
165 		if (big_endian) {
166 		    ch = (ch << 8) | ch2;
167 		} else {
168 		    ch = (ch2 << 8) | ch;
169 		}
170 		if (ch >> 10 == 0xdc00 >> 10) {
171 		    ch &= 0x3ff;
172 		    ch |= (hi << 10);
173 		    ch += 0x10000;
174 		}
175 	    }
176 	    start += Xapian::Unicode::to_utf8(ch, buf + start);
177 	    if (start >= sizeof(buf) - 4) {
178 		tmp.append(buf, start);
179 		start = 0;
180 	    }
181 	}
182 	if (start) tmp.append(buf, start);
183     } else {
184 	if (strncasecmp(p, "iso", 3) == 0) {
185 	    p += 3;
186 	    if (*p == '-' || *p == '_' || *p == ' ') ++p;
187 	}
188 	if (strncmp(p, "8859", 4) != 0) return;
189 	p += 4;
190 	if (*p == '-' || *p == '_' || *p == ' ') ++p;
191 	if (strcmp(p, "1") != 0) return;
192 
193 	// FIXME: pull this out as a standard "normalise utf-8" function?
194 	tmp.reserve(text.size());
195 
196 	size_t start = 0;
197 	for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
198 	    unsigned ch = static_cast<unsigned char>(*i);
199 	    start += Xapian::Unicode::to_utf8(ch, buf + start);
200 	    if (start >= sizeof(buf) - 4) {
201 		tmp.append(buf, start);
202 		start = 0;
203 	    }
204 	}
205 	if (start) tmp.append(buf, start);
206     }
207 #endif
208 
209     swap(text, tmp);
210 }
211