1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39
40 #include <QtCore/private/qglobal_p.h>
41
42 #include "qiconvcodec_p.h"
43 #include "qtextcodec_p.h"
44 #include <qdebug.h>
45 #include <qthreadstorage.h>
46
47 #include <errno.h>
48 #include <locale.h>
49 #include <stdio.h>
50 #include <dlfcn.h>
51
52 // unistd.h is needed for the _XOPEN_UNIX macro
53 #include <unistd.h>
54 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX)
55 # include <langinfo.h>
56 #endif
57
58 #if defined(Q_OS_HPUX)
59 # define NO_BOM
60 # define UTF16 "ucs2"
61 #elif defined(Q_OS_AIX)
62 # define NO_BOM
63 # define UTF16 "UCS-2"
64 #elif defined(Q_OS_FREEBSD)
65 # define NO_BOM
66 # if Q_BYTE_ORDER == Q_BIG_ENDIAN
67 # define UTF16 "UTF-16BE"
68 # else
69 # define UTF16 "UTF-16LE"
70 # endif
71 #else
72 # define UTF16 "UTF-16"
73 #endif
74
75 QT_BEGIN_NAMESPACE
76
QIconvCodec()77 QIconvCodec::QIconvCodec()
78 : utf16Codec(0)
79 {
80 }
81
init() const82 void QIconvCodec::init() const
83 {
84 utf16Codec = QTextCodec::codecForMib(1015);
85 Q_ASSERT_X(utf16Codec != 0,
86 "QIconvCodec::convertToUnicode",
87 "internal error, UTF-16 codec not found");
88 if (!utf16Codec) {
89 fprintf(stderr, "QIconvCodec::convertToUnicode: internal error, UTF-16 codec not found\n");
90 utf16Codec = reinterpret_cast<QTextCodec *>(~0);
91 }
92 }
93
~QIconvCodec()94 QIconvCodec::~QIconvCodec()
95 {
96 }
97
IconvState(iconv_t x)98 QIconvCodec::IconvState::IconvState(iconv_t x)
99 : buffer(array), bufferLen(sizeof array), cd(x)
100 {
101 }
102
~IconvState()103 QIconvCodec::IconvState::~IconvState()
104 {
105 if (cd != reinterpret_cast<iconv_t>(-1))
106 iconv_close(cd);
107 if (buffer != array)
108 delete[] buffer;
109 }
110
saveChars(const char * c,int count)111 void QIconvCodec::IconvState::saveChars(const char *c, int count)
112 {
113 if (count > bufferLen) {
114 if (buffer != array)
115 delete[] buffer;
116 buffer = new char[bufferLen = count];
117 }
118
119 memcpy(buffer, c, count);
120 }
121
qIconvCodecStateFree(QTextCodec::ConverterState * state)122 static void qIconvCodecStateFree(QTextCodec::ConverterState *state)
123 {
124 delete reinterpret_cast<QIconvCodec::IconvState *>(state->d);
125 }
126
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState * >,toUnicodeState)127 Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, toUnicodeState)
128
129 QString QIconvCodec::convertToUnicode(const char* chars, int len, ConverterState *convState) const
130 {
131 if (utf16Codec == reinterpret_cast<QTextCodec *>(~0))
132 return QString::fromLatin1(chars, len);
133
134 int invalidCount = 0;
135 int remainingCount = 0;
136 char *remainingBuffer = 0;
137 IconvState *temporaryState = 0;
138 IconvState **pstate;
139
140 if (convState) {
141 // stateful conversion
142 pstate = reinterpret_cast<IconvState **>(&convState->d);
143 if (convState->d) {
144 // restore state
145 remainingCount = convState->remainingChars;
146 remainingBuffer = (*pstate)->buffer;
147 } else {
148 // first time
149 convState->flags |= FreeFunction;
150 QTextCodecUnalignedPointer::encode(convState->state_data, qIconvCodecStateFree);
151 }
152 } else {
153 QThreadStorage<QIconvCodec::IconvState *> *ts = toUnicodeState();
154 if (!ts) {
155 // we're running after the Q_GLOBAL_STATIC has been deleted
156 // or before the QCoreApplication initialization
157 // bad programmer, no cookie for you
158 pstate = &temporaryState;
159 } else {
160 // stateless conversion -- use thread-local data
161 pstate = &toUnicodeState()->localData();
162 }
163 }
164
165 if (!*pstate) {
166 // first time, create the state
167 iconv_t cd = createIconv_t(UTF16, 0);
168 if (cd == reinterpret_cast<iconv_t>(-1)) {
169 static int reported = 0;
170 if (!reported++) {
171 fprintf(stderr,
172 "QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv_open failed\n");
173 }
174 return QString::fromLatin1(chars, len);
175 }
176
177 *pstate = new IconvState(cd);
178 }
179
180 IconvState *state = *pstate;
181 size_t inBytesLeft = len;
182 // best case assumption, each byte is converted into one UTF-16 character, plus 2 bytes for the BOM
183 #if !QT_CONFIG(posix_libiconv)
184 // GNU doesn't disagree with POSIX :/
185 const char *inBytes = chars;
186 #else
187 char *inBytes = const_cast<char *>(chars);
188 #endif
189
190 QByteArray in;
191 if (remainingCount) {
192 // we have to prepend the remaining bytes from the previous conversion
193 inBytesLeft += remainingCount;
194 in.resize(inBytesLeft);
195 inBytes = in.data();
196
197 memcpy(in.data(), remainingBuffer, remainingCount);
198 memcpy(in.data() + remainingCount, chars, len);
199
200 remainingCount = 0;
201 }
202
203 size_t outBytesLeft = len * 2 + 2;
204 QByteArray ba(outBytesLeft, Qt::Uninitialized);
205 char *outBytes = ba.data();
206 do {
207 size_t ret = iconv(state->cd, &inBytes, &inBytesLeft, &outBytes, &outBytesLeft);
208 if (ret == (size_t) -1) {
209 if (errno == E2BIG) {
210 int offset = ba.size() - outBytesLeft;
211 ba.resize(ba.size() * 2);
212 outBytes = ba.data() + offset;
213 outBytesLeft = ba.size() - offset;
214
215 continue;
216 }
217
218 if (errno == EILSEQ) {
219 // conversion stopped because of an invalid character in the sequence
220 ++invalidCount;
221 } else if (errno == EINVAL && convState) {
222 // conversion stopped because the remaining inBytesLeft make up
223 // an incomplete multi-byte sequence; save them for later
224 state->saveChars(inBytes, inBytesLeft);
225 remainingCount = inBytesLeft;
226 break;
227 }
228
229 if (errno == EILSEQ || errno == EINVAL) {
230 // skip the next character
231 ++inBytes;
232 --inBytesLeft;
233 continue;
234 }
235
236 // some other error
237 // note, cannot use qWarning() since we are implementing the codecForLocale :)
238 perror("QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv failed");
239
240 if (!convState) {
241 // reset state
242 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
243 }
244
245 delete temporaryState;
246 return QString::fromLatin1(chars, len);
247 }
248 } while (inBytesLeft != 0);
249
250 QString s;
251
252 if (convState) {
253 s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft, &state->internalState);
254
255 convState->invalidChars = invalidCount;
256 convState->remainingChars = remainingCount;
257 } else {
258 s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft);
259
260 // reset state
261 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
262 }
263
264 delete temporaryState;
265 return s;
266 }
267
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState * >,fromUnicodeState)268 Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, fromUnicodeState)
269
270 static bool setByteOrder(iconv_t cd)
271 {
272 #if !defined(NO_BOM)
273 // give iconv() a BOM
274 char buf[4];
275 ushort bom[] = { QChar::ByteOrderMark };
276
277 char *outBytes = buf;
278 char *inBytes = reinterpret_cast<char *>(bom);
279 size_t outBytesLeft = sizeof buf;
280 size_t inBytesLeft = sizeof bom;
281
282 #if !QT_CONFIG(posix_libiconv)
283 const char **inBytesPtr = const_cast<const char **>(&inBytes);
284 #else
285 char **inBytesPtr = &inBytes;
286 #endif
287
288 if (iconv(cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
289 return false;
290 }
291 #else
292 Q_UNUSED(cd);
293 #endif // NO_BOM
294
295 return true;
296 }
297
convertFromUnicode(const QChar * uc,int len,ConverterState * convState) const298 QByteArray QIconvCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *convState) const
299 {
300 char *inBytes;
301 char *outBytes;
302 size_t inBytesLeft;
303
304 #if !QT_CONFIG(posix_libiconv)
305 const char **inBytesPtr = const_cast<const char **>(&inBytes);
306 #else
307 char **inBytesPtr = &inBytes;
308 #endif
309
310 IconvState *temporaryState = 0;
311 QThreadStorage<QIconvCodec::IconvState *> *ts = fromUnicodeState();
312 IconvState *&state = ts ? ts->localData() : temporaryState;
313 if (!state) {
314 iconv_t cd = createIconv_t(0, UTF16);
315 if (cd != reinterpret_cast<iconv_t>(-1)) {
316 if (!setByteOrder(cd)) {
317 perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed for BOM");
318
319 iconv_close(cd);
320 cd = reinterpret_cast<iconv_t>(-1);
321
322 return QString(uc, len).toLatin1();
323 }
324 }
325 state = new IconvState(cd);
326 }
327 if (state->cd == reinterpret_cast<iconv_t>(-1)) {
328 static int reported = 0;
329 if (!reported++) {
330 fprintf(stderr,
331 "QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv_open failed\n");
332 }
333 delete temporaryState;
334 return QString(uc, len).toLatin1();
335 }
336
337 size_t outBytesLeft = len;
338 QByteArray ba(outBytesLeft, Qt::Uninitialized);
339 outBytes = ba.data();
340
341 // now feed iconv() the real data
342 inBytes = const_cast<char *>(reinterpret_cast<const char *>(uc));
343 inBytesLeft = len * sizeof(QChar);
344
345 QByteArray in;
346 if (convState && convState->remainingChars) {
347 // we have one surrogate char to be prepended
348 in.resize(sizeof(QChar) + len);
349 inBytes = in.data();
350
351 QChar remaining = convState->state_data[0];
352 memcpy(in.data(), &remaining, sizeof(QChar));
353 memcpy(in.data() + sizeof(QChar), uc, inBytesLeft);
354
355 inBytesLeft += sizeof(QChar);
356 convState->remainingChars = 0;
357 }
358
359 int invalidCount = 0;
360 while (inBytesLeft != 0) {
361 if (iconv(state->cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
362 if (errno == EINVAL && convState) {
363 // buffer ends in a surrogate
364 Q_ASSERT(inBytesLeft == 2);
365 convState->remainingChars = 1;
366 convState->state_data[0] = uc[len - 1].unicode();
367 break;
368 }
369
370 switch (errno) {
371 case EILSEQ:
372 ++invalidCount;
373 Q_FALLTHROUGH();
374 case EINVAL:
375 {
376 inBytes += sizeof(QChar);
377 inBytesLeft -= sizeof(QChar);
378 break;
379 }
380 case E2BIG:
381 {
382 int offset = ba.size() - outBytesLeft;
383 ba.resize(ba.size() * 2);
384 outBytes = ba.data() + offset;
385 outBytesLeft = ba.size() - offset;
386 break;
387 }
388 default:
389 {
390 // note, cannot use qWarning() since we are implementing the codecForLocale :)
391 perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed");
392
393 // reset to initial state
394 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
395
396 delete temporaryState;
397 return QString(uc, len).toLatin1();
398 }
399 }
400 }
401 }
402
403 // reset to initial state
404 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
405 setByteOrder(state->cd);
406
407 ba.resize(ba.size() - outBytesLeft);
408
409 if (convState)
410 convState->invalidChars = invalidCount;
411
412 delete temporaryState;
413 return ba;
414 }
415
name() const416 QByteArray QIconvCodec::name() const
417 {
418 return "System";
419 }
420
mibEnum() const421 int QIconvCodec::mibEnum() const
422 {
423 return 0;
424 }
425
createIconv_t(const char * to,const char * from) const426 iconv_t QIconvCodec::createIconv_t(const char *to, const char *from) const
427 {
428 Q_ASSERT((to == 0 && from != 0) || (to != 0 && from == 0));
429
430 if (!utf16Codec)
431 init();
432
433 iconv_t cd = (iconv_t) -1;
434 #if defined(__GLIBC__) || !QT_CONFIG(posix_libiconv) || defined(Q_OS_QNX)
435 #if defined(Q_OS_QNX)
436 // on QNX the default locale is UTF-8, and an empty string will cause iconv_open to fail
437 static const char empty_codeset[] = "UTF-8";
438 #else
439 // both GLIBC and libgnuiconv will use the locale's encoding if from or to is an empty string
440 static const char empty_codeset[] = "";
441 #endif
442 const char *codeset = empty_codeset;
443 cd = iconv_open(to ? to : codeset, from ? from : codeset);
444 #else
445 char *codeset = 0;
446 #endif
447
448 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX)
449 if (cd == (iconv_t) -1) {
450 codeset = nl_langinfo(CODESET);
451 if (codeset)
452 cd = iconv_open(to ? to : codeset, from ? from : codeset);
453 }
454 #endif
455
456 if (cd == (iconv_t) -1) {
457 // Very poorly defined and followed standards causes lots of
458 // code to try to get all the cases... This logic is
459 // duplicated in QTextCodec, so if you change it here, change
460 // it there too.
461
462 // Try to determine locale codeset from locale name assigned to
463 // LC_CTYPE category.
464
465 // First part is getting that locale name. First try setlocale() which
466 // definitely knows it, but since we cannot fully trust it, get ready
467 // to fall back to environment variables.
468 char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
469
470 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
471 // environment variables.
472 char * lang = qstrdup(qgetenv("LC_ALL").constData());
473 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
474 if (lang) delete [] lang;
475 lang = qstrdup(qgetenv("LC_CTYPE").constData());
476 }
477 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
478 if (lang) delete [] lang;
479 lang = qstrdup(qgetenv("LANG").constData());
480 }
481
482 // Now try these in order:
483 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
484 // 2. CODESET from lang if it contains a .CODESET part
485 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
486 // 4. locale (ditto)
487 // 5. check for "@euro"
488
489 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
490 codeset = ctype ? strchr(ctype, '.') : 0;
491 if (codeset && *codeset == '.') {
492 ++codeset;
493 cd = iconv_open(to ? to : codeset, from ? from : codeset);
494 }
495
496 // 2. CODESET from lang if it contains a .CODESET part
497 codeset = lang ? strchr(lang, '.') : 0;
498 if (cd == (iconv_t) -1 && codeset && *codeset == '.') {
499 ++codeset;
500 cd = iconv_open(to ? to : codeset, from ? from : codeset);
501 }
502
503 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
504 if (cd == (iconv_t) -1 && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
505 cd = iconv_open(to ? to : ctype, from ? from : ctype);
506
507
508 // 4. locale (ditto)
509 if (cd == (iconv_t) -1 && lang && *lang != 0)
510 cd = iconv_open(to ? to : lang, from ? from : lang);
511
512 // 5. "@euro"
513 if ((cd == (iconv_t) -1 && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
514 cd = iconv_open(to ? to : "ISO8859-15", from ? from : "ISO8859-15");
515
516 delete [] ctype;
517 delete [] lang;
518 }
519
520 return cd;
521 }
522
523 QT_END_NAMESPACE
524