1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file. Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41
42 #include "qiconvcodec_p.h"
43 #include "qtextcodec_p.h"
44 #include <qlibrary.h>
45 #include <qdebug.h>
46 #include <qthreadstorage.h>
47
48 #include <errno.h>
49 #include <locale.h>
50 #include <stdio.h>
51 #include <dlfcn.h>
52
53 // unistd.h is needed for the _XOPEN_UNIX macro
54 #include <unistd.h>
55 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
56 # include <langinfo.h>
57 #endif
58
// NO_BOM is defined on the platforms listed here; setByteOrder() only feeds a
// byte order mark to iconv when NO_BOM is not defined.
#if defined(Q_OS_HPUX) || defined(Q_OS_AIX) || defined(Q_OS_FREEBSD) || defined(Q_OS_MAC)
#  define NO_BOM
#endif

// UTF16 is the encoding name handed to iconv_open() for the UTF-16 side of
// every conversion in this file.
#if defined(Q_OS_HPUX)
#  define UTF16 "ucs2"
#elif defined(Q_OS_AIX)
#  define UTF16 "UCS-2"
#elif defined(Q_OS_FREEBSD) || defined(Q_OS_MAC)
   // name the byte order explicitly, matching the host
#  if Q_BYTE_ORDER == Q_BIG_ENDIAN
#    define UTF16 "UTF-16BE"
#  else
#    define UTF16 "UTF-16LE"
#  endif
#else
#  define UTF16 "UTF-16"
#endif
75
#ifdef Q_OS_MAC
// On OS X the iconv entry points are resolved at run time (see the
// QIconvCodec constructor) and called through these pointers.
#  if !defined(GNU_LIBICONV)
#    define GNU_LIBICONV
#  endif

typedef iconv_t (*Ptr_iconv_open) (const char*, const char*);
typedef size_t (*Ptr_iconv) (iconv_t, const char **, size_t *, char **, size_t *);
typedef int (*Ptr_iconv_close) (iconv_t);

// All three stay null until the first QIconvCodec is constructed.
static Ptr_iconv_open ptr_iconv_open = 0;
static Ptr_iconv ptr_iconv = 0;
static Ptr_iconv_close ptr_iconv_close = 0;
#endif // Q_OS_MAC
88
89 QT_BEGIN_NAMESPACE
90
91 extern bool qt_locale_initialized;
92
QIconvCodec()93 QIconvCodec::QIconvCodec()
94 : utf16Codec(0)
95 {
96 utf16Codec = QTextCodec::codecForMib(1015);
97 Q_ASSERT_X(utf16Codec != 0,
98 "QIconvCodec::convertToUnicode",
99 "internal error, UTF-16 codec not found");
100 if (!utf16Codec) {
101 fprintf(stderr, "QIconvCodec::convertToUnicode: internal error, UTF-16 codec not found\n");
102 utf16Codec = reinterpret_cast<QTextCodec *>(~0);
103 }
104 #if defined(Q_OS_MAC)
105 if (ptr_iconv_open == 0) {
106 QLibrary libiconv(QLatin1String("/usr/lib/libiconv"));
107 libiconv.setLoadHints(QLibrary::ExportExternalSymbolsHint);
108
109 ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("libiconv_open"));
110 if (!ptr_iconv_open)
111 ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("iconv_open"));
112 ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("libiconv"));
113 if (!ptr_iconv)
114 ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("iconv"));
115 ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("libiconv_close"));
116 if (!ptr_iconv_close)
117 ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("iconv_close"));
118
119 Q_ASSERT_X(ptr_iconv_open && ptr_iconv && ptr_iconv_close,
120 "QIconvCodec::QIconvCodec()",
121 "internal error, could not resolve the iconv functions");
122
123 # undef iconv_open
124 # define iconv_open ptr_iconv_open
125 # undef iconv
126 # define iconv ptr_iconv
127 # undef iconv_close
128 # define iconv_close ptr_iconv_close
129 }
130 #endif
131 }
132
~QIconvCodec()133 QIconvCodec::~QIconvCodec()
134 {
135 }
136
IconvState(iconv_t x)137 QIconvCodec::IconvState::IconvState(iconv_t x)
138 : buffer(array), bufferLen(sizeof array), cd(x)
139 {
140 }
141
~IconvState()142 QIconvCodec::IconvState::~IconvState()
143 {
144 if (cd != reinterpret_cast<iconv_t>(-1))
145 iconv_close(cd);
146 if (buffer != array)
147 delete[] buffer;
148 }
149
saveChars(const char * c,int count)150 void QIconvCodec::IconvState::saveChars(const char *c, int count)
151 {
152 if (count > bufferLen) {
153 if (buffer != array)
154 delete[] buffer;
155 buffer = new char[bufferLen = count];
156 }
157
158 memcpy(buffer, c, count);
159 }
160
qIconvCodecStateFree(QTextCodec::ConverterState * state)161 static void qIconvCodecStateFree(QTextCodec::ConverterState *state)
162 {
163 delete reinterpret_cast<QIconvCodec::IconvState *>(state->d);
164 }
165
// Per-thread iconv state used by convertToUnicode() for stateless conversions.
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, toUnicodeState)

// Converts len bytes at chars from the locale encoding to a QString by running
// them through iconv into UTF-16 and decoding that with utf16Codec.
//
// With a convState the conversion is stateful: an incomplete trailing
// multi-byte sequence is saved and prepended to the next call's input, and the
// invalid/remaining counters of convState are updated. Without one, the iconv
// descriptor is reset after each call.
//
// Falls back to QString::fromLatin1() when the UTF-16 codec is missing, when
// no iconv descriptor can be opened, or when iconv reports an unexpected error.
QString QIconvCodec::convertToUnicode(const char* chars, int len, ConverterState *convState) const
{
    // sentinel stored by the constructor when no UTF-16 codec exists
    if (utf16Codec == reinterpret_cast<QTextCodec *>(~0))
        return QString::fromLatin1(chars, len);

    int invalidCount = 0;
    int carriedCount = 0;
    char *carriedBytes = 0;
    IconvState *temporaryState = 0;
    IconvState **pstate;

    if (!convState) {
        QThreadStorage<QIconvCodec::IconvState *> *ts = toUnicodeState();
        if (qt_locale_initialized && ts) {
            // stateless conversion -- use thread-local data
            pstate = &ts->localData();
        } else {
            // we're running after the Q_GLOBAL_STATIC has been deleted
            // or before the QCoreApplication initialization:
            // use a state that lives only for this call
            pstate = &temporaryState;
        }
    } else {
        // stateful conversion
        pstate = reinterpret_cast<IconvState **>(&convState->d);
        if (convState->d) {
            // pick up the bytes left over by the previous call
            carriedCount = convState->remainingChars;
            carriedBytes = (*pstate)->buffer;
        } else {
            // first use of this ConverterState: register the cleanup function
            convState->flags |= FreeFunction;
            QTextCodecUnalignedPointer::encode(convState->state_data, qIconvCodecStateFree);
        }
    }

    if (*pstate == 0) {
        // first time, create the state
        iconv_t descriptor = QIconvCodec::createIconv_t(UTF16, 0);
        if (descriptor == reinterpret_cast<iconv_t>(-1)) {
            static int reported = 0;
            if (!reported++) {
                fprintf(stderr,
                        "QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv_open failed\n");
            }
            return QString::fromLatin1(chars, len);
        }

        *pstate = new IconvState(descriptor);
    }

    IconvState *state = *pstate;
    size_t inBytesLeft = len;
    char *inBytes = const_cast<char *>(chars);

    QByteArray joinedInput;
    if (carriedCount) {
        // we have to prepend the remaining bytes from the previous conversion
        inBytesLeft += carriedCount;
        joinedInput.resize(inBytesLeft);
        inBytes = joinedInput.data();

        memcpy(joinedInput.data(), carriedBytes, carriedCount);
        memcpy(joinedInput.data() + carriedCount, chars, len);

        carriedCount = 0;
    }

    // best case assumption, each byte is converted into one UTF-16 character, plus 2 bytes for the BOM
    size_t outBytesLeft = len * 2 + 2;
    QByteArray ba(outBytesLeft, Qt::Uninitialized);
    char *outBytes = ba.data();
    do {
        const size_t rc = iconv(state->cd, &inBytes, &inBytesLeft, &outBytes, &outBytesLeft);
        if (rc != (size_t) -1)
            continue;

        const int failure = errno;

        if (failure == E2BIG) {
            // output buffer exhausted: double it and carry on where we stopped
            int written = ba.size() - outBytesLeft;
            ba.resize(ba.size() * 2);
            outBytes = ba.data() + written;
            outBytesLeft = ba.size() - written;
            continue;
        }

        const bool invalidSequence = (failure == EILSEQ);
        const bool incompleteSequence = (failure == EINVAL);

        if (incompleteSequence && convState) {
            // conversion stopped because the remaining inBytesLeft make up
            // an incomplete multi-byte sequence; save them for later
            state->saveChars(inBytes, inBytesLeft);
            carriedCount = inBytesLeft;
            break;
        }

        if (invalidSequence || incompleteSequence) {
            // only an invalid character in the sequence counts as invalid
            if (invalidSequence)
                ++invalidCount;

            // skip the next character
            ++inBytes;
            --inBytesLeft;
            continue;
        }

        // some other error
        // note, cannot use qWarning() since we are implementing the codecForLocale :)
        perror("QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv failed");

        if (!convState) {
            // reset state
            iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
        }

        delete temporaryState;
        return QString::fromLatin1(chars, len);
    } while (inBytesLeft != 0);

    const int producedBytes = ba.size() - outBytesLeft;
    QString result;

    if (!convState) {
        result = utf16Codec->toUnicode(ba.constData(), producedBytes);

        // reset state
        iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
    } else {
        result = utf16Codec->toUnicode(ba.constData(), producedBytes, &state->internalState);

        convState->invalidChars = invalidCount;
        convState->remainingChars = carriedCount;
    }

    delete temporaryState;
    return result;
}
301
// Per-thread iconv state used by convertFromUnicode().
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, fromUnicodeState)

// Feeds a UTF-16 byte order mark through cd, discarding the output, so that
// the descriptor knows the byte order of the data that follows. Does nothing
// when NO_BOM is defined. Returns false only if that iconv() call fails.
static bool setByteOrder(iconv_t cd)
{
#if !defined(NO_BOM)
    // give iconv() a BOM
    ushort byteOrderMark[] = { QChar::ByteOrderMark };
    char scratch[4];

    char *source = reinterpret_cast<char *>(byteOrderMark);
    size_t sourceLeft = sizeof(byteOrderMark);
    char *target = scratch;
    size_t targetLeft = sizeof(scratch);

    const size_t rc = iconv(cd, &source, &sourceLeft, &target, &targetLeft);
    if (rc == (size_t) -1)
        return false;
#endif // NO_BOM

    return true;
}
325
convertFromUnicode(const QChar * uc,int len,ConverterState * convState) const326 QByteArray QIconvCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *convState) const
327 {
328 char *inBytes;
329 char *outBytes;
330 size_t inBytesLeft;
331
332 char **inBytesPtr = &inBytes;
333
334 IconvState *temporaryState = 0;
335 QThreadStorage<QIconvCodec::IconvState *> *ts = fromUnicodeState();
336 IconvState *&state = (qt_locale_initialized && ts) ? ts->localData() : temporaryState;
337 if (!state) {
338 iconv_t cd = QIconvCodec::createIconv_t(0, UTF16);
339 if (cd != reinterpret_cast<iconv_t>(-1)) {
340 if (!setByteOrder(cd)) {
341 perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed for BOM");
342
343 iconv_close(cd);
344 cd = reinterpret_cast<iconv_t>(-1);
345
346 return QString(uc, len).toLatin1();
347 }
348 }
349 state = new IconvState(cd);
350 }
351 if (state->cd == reinterpret_cast<iconv_t>(-1)) {
352 static int reported = 0;
353 if (!reported++) {
354 fprintf(stderr,
355 "QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv_open failed\n");
356 }
357 delete temporaryState;
358 return QString(uc, len).toLatin1();
359 }
360
361 size_t outBytesLeft = len;
362 QByteArray ba(outBytesLeft, Qt::Uninitialized);
363 outBytes = ba.data();
364
365 // now feed iconv() the real data
366 inBytes = const_cast<char *>(reinterpret_cast<const char *>(uc));
367 inBytesLeft = len * sizeof(QChar);
368
369 QByteArray in;
370 if (convState && convState->remainingChars) {
371 // we have one surrogate char to be prepended
372 in.resize(sizeof(QChar) + len);
373 inBytes = in.data();
374
375 QChar remaining = convState->state_data[0];
376 memcpy(in.data(), &remaining, sizeof(QChar));
377 memcpy(in.data() + sizeof(QChar), uc, inBytesLeft);
378
379 inBytesLeft += sizeof(QChar);
380 convState->remainingChars = 0;
381 }
382
383 int invalidCount = 0;
384 while (inBytesLeft != 0) {
385 if (iconv(state->cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
386 if (errno == EINVAL && convState) {
387 // buffer ends in a surrogate
388 Q_ASSERT(inBytesLeft == 2);
389 convState->remainingChars = 1;
390 convState->state_data[0] = uc[len - 1].unicode();
391 break;
392 }
393
394 switch (errno) {
395 case EILSEQ:
396 ++invalidCount;
397 // fall through
398 case EINVAL:
399 {
400 inBytes += sizeof(QChar);
401 inBytesLeft -= sizeof(QChar);
402 break;
403 }
404 case E2BIG:
405 {
406 int offset = ba.size() - outBytesLeft;
407 ba.resize(ba.size() * 2);
408 outBytes = ba.data() + offset;
409 outBytesLeft = ba.size() - offset;
410 break;
411 }
412 default:
413 {
414 // note, cannot use qWarning() since we are implementing the codecForLocale :)
415 perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed");
416
417 // reset to initial state
418 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
419
420 delete temporaryState;
421 return QString(uc, len).toLatin1();
422 }
423 }
424 }
425 }
426
427 // reset to initial state
428 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
429 setByteOrder(state->cd);
430
431 ba.resize(ba.size() - outBytesLeft);
432
433 if (convState)
434 convState->invalidChars = invalidCount;
435
436 delete temporaryState;
437 return ba;
438 }
439
name() const440 QByteArray QIconvCodec::name() const
441 {
442 return "System";
443 }
444
mibEnum() const445 int QIconvCodec::mibEnum() const
446 {
447 return 0;
448 }
449
createIconv_t(const char * to,const char * from)450 iconv_t QIconvCodec::createIconv_t(const char *to, const char *from)
451 {
452 Q_ASSERT((to == 0 && from != 0) || (to != 0 && from == 0));
453
454 iconv_t cd = (iconv_t) -1;
455 #if defined(__GLIBC__) || defined(GNU_LIBICONV) || defined(Q_OS_QNX)
456 #if defined(Q_OS_QNX)
457 // on QNX the default locale is UTF-8, and an empty string will cause iconv_open to fail
458 static const char empty_codeset[] = "UTF-8";
459 #else
460 // both GLIBC and libgnuiconv will use the locale's encoding if from or to is an empty string
461 static const char empty_codeset[] = "";
462 #endif
463 const char *codeset = empty_codeset;
464 cd = iconv_open(to ? to : codeset, from ? from : codeset);
465 #else
466 char *codeset = 0;
467 #endif
468
469 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
470 if (cd == (iconv_t) -1) {
471 codeset = nl_langinfo(CODESET);
472 if (codeset)
473 cd = iconv_open(to ? to : codeset, from ? from : codeset);
474 }
475 #endif
476
477 if (cd == (iconv_t) -1) {
478 // Very poorly defined and followed standards causes lots of
479 // code to try to get all the cases... This logic is
480 // duplicated in QTextCodec, so if you change it here, change
481 // it there too.
482
483 // Try to determine locale codeset from locale name assigned to
484 // LC_CTYPE category.
485
486 // First part is getting that locale name. First try setlocale() which
487 // definitely knows it, but since we cannot fully trust it, get ready
488 // to fall back to environment variables.
489 char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
490
491 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
492 // environment variables.
493 char * lang = qstrdup(qgetenv("LC_ALL").constData());
494 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
495 if (lang) delete [] lang;
496 lang = qstrdup(qgetenv("LC_CTYPE").constData());
497 }
498 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
499 if (lang) delete [] lang;
500 lang = qstrdup(qgetenv("LANG").constData());
501 }
502
503 // Now try these in order:
504 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
505 // 2. CODESET from lang if it contains a .CODESET part
506 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
507 // 4. locale (ditto)
508 // 5. check for "@euro"
509
510 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
511 codeset = ctype ? strchr(ctype, '.') : 0;
512 if (codeset && *codeset == '.') {
513 ++codeset;
514 cd = iconv_open(to ? to : codeset, from ? from : codeset);
515 }
516
517 // 2. CODESET from lang if it contains a .CODESET part
518 codeset = lang ? strchr(lang, '.') : 0;
519 if (cd == (iconv_t) -1 && codeset && *codeset == '.') {
520 ++codeset;
521 cd = iconv_open(to ? to : codeset, from ? from : codeset);
522 }
523
524 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
525 if (cd == (iconv_t) -1 && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
526 cd = iconv_open(to ? to : ctype, from ? from : ctype);
527
528
529 // 4. locale (ditto)
530 if (cd == (iconv_t) -1 && lang && *lang != 0)
531 cd = iconv_open(to ? to : lang, from ? from : lang);
532
533 // 5. "@euro"
534 if ((cd == (iconv_t) -1 && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
535 cd = iconv_open(to ? to : "ISO8859-15", from ? from : "ISO8859-15");
536
537 delete [] ctype;
538 delete [] lang;
539 }
540
541 return cd;
542 }
543
544 QT_END_NAMESPACE
545