1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include <QtCore/private/qglobal_p.h>
41 
42 #include "qiconvcodec_p.h"
43 #include "qtextcodec_p.h"
44 #include <qdebug.h>
45 #include <qthreadstorage.h>
46 
47 #include <errno.h>
48 #include <locale.h>
49 #include <stdio.h>
50 #include <dlfcn.h>
51 
52 // unistd.h is needed for the _XOPEN_UNIX macro
53 #include <unistd.h>
54 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX)
55 #  include <langinfo.h>
56 #endif
57 
// Per-platform iconv configuration:
//  - UTF16 is the encoding name handed to createIconv_t() for the UTF-16
//    side of every conversion in this file.
//  - NO_BOM, when defined, makes setByteOrder() skip feeding a byte order
//    mark to the converter.
// On FreeBSD the UTF-16 name spells out the byte order explicitly, chosen
// from Q_BYTE_ORDER.
#if defined(Q_OS_HPUX)
#  define NO_BOM
#  define UTF16 "ucs2"
#elif defined(Q_OS_AIX)
#  define NO_BOM
#  define UTF16 "UCS-2"
#elif defined(Q_OS_FREEBSD)
#  define NO_BOM
#  if Q_BYTE_ORDER == Q_BIG_ENDIAN
#    define UTF16 "UTF-16BE"
#  else
#    define UTF16 "UTF-16LE"
#  endif
#else
#  define UTF16 "UTF-16"
#endif
74 
75 QT_BEGIN_NAMESPACE
76 
QIconvCodec()77 QIconvCodec::QIconvCodec()
78     : utf16Codec(0)
79 {
80 }
81 
init() const82 void QIconvCodec::init() const
83 {
84     utf16Codec = QTextCodec::codecForMib(1015);
85     Q_ASSERT_X(utf16Codec != 0,
86                "QIconvCodec::convertToUnicode",
87                "internal error, UTF-16 codec not found");
88     if (!utf16Codec) {
89         fprintf(stderr, "QIconvCodec::convertToUnicode: internal error, UTF-16 codec not found\n");
90         utf16Codec = reinterpret_cast<QTextCodec *>(~0);
91     }
92 }
93 
~QIconvCodec()94 QIconvCodec::~QIconvCodec()
95 {
96 }
97 
IconvState(iconv_t x)98 QIconvCodec::IconvState::IconvState(iconv_t x)
99     : buffer(array), bufferLen(sizeof array), cd(x)
100 {
101 }
102 
~IconvState()103 QIconvCodec::IconvState::~IconvState()
104 {
105     if (cd != reinterpret_cast<iconv_t>(-1))
106         iconv_close(cd);
107     if (buffer != array)
108         delete[] buffer;
109 }
110 
saveChars(const char * c,int count)111 void QIconvCodec::IconvState::saveChars(const char *c, int count)
112 {
113     if (count > bufferLen) {
114         if (buffer != array)
115             delete[] buffer;
116         buffer = new char[bufferLen = count];
117     }
118 
119     memcpy(buffer, c, count);
120 }
121 
qIconvCodecStateFree(QTextCodec::ConverterState * state)122 static void qIconvCodecStateFree(QTextCodec::ConverterState *state)
123 {
124     delete reinterpret_cast<QIconvCodec::IconvState *>(state->d);
125 }
126 
// Per-thread IconvState used by convertToUnicode() when the caller supplies
// no ConverterState.
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, toUnicodeState)
128 
129 QString QIconvCodec::convertToUnicode(const char* chars, int len, ConverterState *convState) const
130 {
131     if (utf16Codec == reinterpret_cast<QTextCodec *>(~0))
132         return QString::fromLatin1(chars, len);
133 
134     int invalidCount = 0;
135     int remainingCount = 0;
136     char *remainingBuffer = 0;
137     IconvState *temporaryState = 0;
138     IconvState **pstate;
139 
140     if (convState) {
141         // stateful conversion
142         pstate = reinterpret_cast<IconvState **>(&convState->d);
143         if (convState->d) {
144             // restore state
145             remainingCount = convState->remainingChars;
146             remainingBuffer = (*pstate)->buffer;
147         } else {
148             // first time
149             convState->flags |= FreeFunction;
150             QTextCodecUnalignedPointer::encode(convState->state_data, qIconvCodecStateFree);
151         }
152     } else {
153         QThreadStorage<QIconvCodec::IconvState *> *ts = toUnicodeState();
154         if (!ts) {
155             // we're running after the Q_GLOBAL_STATIC has been deleted
156             // or before the QCoreApplication initialization
157             // bad programmer, no cookie for you
158             pstate = &temporaryState;
159         } else {
160             // stateless conversion -- use thread-local data
161             pstate = &toUnicodeState()->localData();
162         }
163     }
164 
165     if (!*pstate) {
166         // first time, create the state
167         iconv_t cd = createIconv_t(UTF16, 0);
168         if (cd == reinterpret_cast<iconv_t>(-1)) {
169             static int reported = 0;
170             if (!reported++) {
171                 fprintf(stderr,
172                         "QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv_open failed\n");
173             }
174             return QString::fromLatin1(chars, len);
175         }
176 
177         *pstate = new IconvState(cd);
178     }
179 
180     IconvState *state = *pstate;
181     size_t inBytesLeft = len;
182     // best case assumption, each byte is converted into one UTF-16 character, plus 2 bytes for the BOM
183 #if !QT_CONFIG(posix_libiconv)
184     // GNU doesn't disagree with POSIX :/
185     const char *inBytes = chars;
186 #else
187     char *inBytes = const_cast<char *>(chars);
188 #endif
189 
190     QByteArray in;
191     if (remainingCount) {
192         // we have to prepend the remaining bytes from the previous conversion
193         inBytesLeft += remainingCount;
194         in.resize(inBytesLeft);
195         inBytes = in.data();
196 
197         memcpy(in.data(), remainingBuffer, remainingCount);
198         memcpy(in.data() + remainingCount, chars, len);
199 
200         remainingCount = 0;
201     }
202 
203     size_t outBytesLeft = len * 2 + 2;
204     QByteArray ba(outBytesLeft, Qt::Uninitialized);
205     char *outBytes = ba.data();
206     do {
207         size_t ret = iconv(state->cd, &inBytes, &inBytesLeft, &outBytes, &outBytesLeft);
208         if (ret == (size_t) -1) {
209             if (errno == E2BIG) {
210                 int offset = ba.size() - outBytesLeft;
211                 ba.resize(ba.size() * 2);
212                 outBytes = ba.data() + offset;
213                 outBytesLeft = ba.size() - offset;
214 
215                 continue;
216             }
217 
218             if (errno == EILSEQ) {
219                 // conversion stopped because of an invalid character in the sequence
220                 ++invalidCount;
221             } else if (errno == EINVAL && convState) {
222                 // conversion stopped because the remaining inBytesLeft make up
223                 // an incomplete multi-byte sequence; save them for later
224                 state->saveChars(inBytes, inBytesLeft);
225                 remainingCount = inBytesLeft;
226                 break;
227             }
228 
229             if (errno == EILSEQ || errno == EINVAL) {
230                 // skip the next character
231                 ++inBytes;
232                 --inBytesLeft;
233                 continue;
234             }
235 
236             // some other error
237             // note, cannot use qWarning() since we are implementing the codecForLocale :)
238             perror("QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv failed");
239 
240             if (!convState) {
241                 // reset state
242                 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
243             }
244 
245             delete temporaryState;
246             return QString::fromLatin1(chars, len);
247         }
248     } while (inBytesLeft != 0);
249 
250     QString s;
251 
252     if (convState) {
253         s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft, &state->internalState);
254 
255         convState->invalidChars = invalidCount;
256         convState->remainingChars = remainingCount;
257     } else {
258         s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft);
259 
260         // reset state
261         iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
262     }
263 
264     delete temporaryState;
265     return s;
266 }
267 
// Per-thread IconvState used by convertFromUnicode().
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, fromUnicodeState)
269 
270 static bool setByteOrder(iconv_t cd)
271 {
272 #if !defined(NO_BOM)
273     // give iconv() a BOM
274     char buf[4];
275     ushort bom[] = { QChar::ByteOrderMark };
276 
277     char *outBytes = buf;
278     char *inBytes = reinterpret_cast<char *>(bom);
279     size_t outBytesLeft = sizeof buf;
280     size_t inBytesLeft = sizeof bom;
281 
282 #if !QT_CONFIG(posix_libiconv)
283     const char **inBytesPtr = const_cast<const char **>(&inBytes);
284 #else
285     char **inBytesPtr = &inBytes;
286 #endif
287 
288     if (iconv(cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
289         return false;
290     }
291 #else
292     Q_UNUSED(cd);
293 #endif // NO_BOM
294 
295     return true;
296 }
297 
convertFromUnicode(const QChar * uc,int len,ConverterState * convState) const298 QByteArray QIconvCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *convState) const
299 {
300     char *inBytes;
301     char *outBytes;
302     size_t inBytesLeft;
303 
304 #if !QT_CONFIG(posix_libiconv)
305     const char **inBytesPtr = const_cast<const char **>(&inBytes);
306 #else
307     char **inBytesPtr = &inBytes;
308 #endif
309 
310     IconvState *temporaryState = 0;
311     QThreadStorage<QIconvCodec::IconvState *> *ts = fromUnicodeState();
312     IconvState *&state = ts ? ts->localData() : temporaryState;
313     if (!state) {
314         iconv_t cd = createIconv_t(0, UTF16);
315         if (cd != reinterpret_cast<iconv_t>(-1)) {
316             if (!setByteOrder(cd)) {
317                 perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed for BOM");
318 
319                 iconv_close(cd);
320                 cd = reinterpret_cast<iconv_t>(-1);
321 
322                 return QString(uc, len).toLatin1();
323             }
324         }
325         state = new IconvState(cd);
326     }
327     if (state->cd == reinterpret_cast<iconv_t>(-1)) {
328         static int reported = 0;
329         if (!reported++) {
330             fprintf(stderr,
331                     "QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv_open failed\n");
332         }
333         delete temporaryState;
334         return QString(uc, len).toLatin1();
335     }
336 
337     size_t outBytesLeft = len;
338     QByteArray ba(outBytesLeft, Qt::Uninitialized);
339     outBytes = ba.data();
340 
341     // now feed iconv() the real data
342     inBytes = const_cast<char *>(reinterpret_cast<const char *>(uc));
343     inBytesLeft = len * sizeof(QChar);
344 
345     QByteArray in;
346     if (convState && convState->remainingChars) {
347         // we have one surrogate char to be prepended
348         in.resize(sizeof(QChar) + len);
349         inBytes = in.data();
350 
351         QChar remaining = convState->state_data[0];
352         memcpy(in.data(), &remaining, sizeof(QChar));
353         memcpy(in.data() + sizeof(QChar), uc, inBytesLeft);
354 
355         inBytesLeft += sizeof(QChar);
356         convState->remainingChars = 0;
357     }
358 
359     int invalidCount = 0;
360     while (inBytesLeft != 0) {
361         if (iconv(state->cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
362             if (errno == EINVAL && convState) {
363                 // buffer ends in a surrogate
364                 Q_ASSERT(inBytesLeft == 2);
365                 convState->remainingChars = 1;
366                 convState->state_data[0] = uc[len - 1].unicode();
367                 break;
368             }
369 
370             switch (errno) {
371             case EILSEQ:
372                 ++invalidCount;
373                 Q_FALLTHROUGH();
374             case EINVAL:
375                 {
376                     inBytes += sizeof(QChar);
377                     inBytesLeft -= sizeof(QChar);
378                     break;
379                 }
380             case E2BIG:
381                 {
382                     int offset = ba.size() - outBytesLeft;
383                     ba.resize(ba.size() * 2);
384                     outBytes = ba.data() + offset;
385                     outBytesLeft = ba.size() - offset;
386                     break;
387                 }
388             default:
389                 {
390                     // note, cannot use qWarning() since we are implementing the codecForLocale :)
391                     perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed");
392 
393                     // reset to initial state
394                     iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
395 
396                     delete temporaryState;
397                     return QString(uc, len).toLatin1();
398                 }
399             }
400         }
401     }
402 
403     // reset to initial state
404     iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
405     setByteOrder(state->cd);
406 
407     ba.resize(ba.size() - outBytesLeft);
408 
409     if (convState)
410         convState->invalidChars = invalidCount;
411 
412     delete temporaryState;
413     return ba;
414 }
415 
name() const416 QByteArray QIconvCodec::name() const
417 {
418     return "System";
419 }
420 
mibEnum() const421 int QIconvCodec::mibEnum() const
422 {
423     return 0;
424 }
425 
createIconv_t(const char * to,const char * from) const426 iconv_t QIconvCodec::createIconv_t(const char *to, const char *from) const
427 {
428     Q_ASSERT((to == 0 && from != 0) || (to != 0 && from == 0));
429 
430     if (!utf16Codec)
431         init();
432 
433     iconv_t cd = (iconv_t) -1;
434 #if defined(__GLIBC__) || !QT_CONFIG(posix_libiconv) || defined(Q_OS_QNX)
435 #if defined(Q_OS_QNX)
436     // on QNX the default locale is UTF-8, and an empty string will cause iconv_open to fail
437     static const char empty_codeset[] = "UTF-8";
438 #else
439     // both GLIBC and libgnuiconv will use the locale's encoding if from or to is an empty string
440     static const char empty_codeset[] = "";
441 #endif
442     const char *codeset = empty_codeset;
443     cd = iconv_open(to ? to : codeset, from ? from : codeset);
444 #else
445     char *codeset = 0;
446 #endif
447 
448 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX)
449     if (cd == (iconv_t) -1) {
450         codeset = nl_langinfo(CODESET);
451         if (codeset)
452             cd = iconv_open(to ? to : codeset, from ? from : codeset);
453     }
454 #endif
455 
456     if (cd == (iconv_t) -1) {
457         // Very poorly defined and followed standards causes lots of
458         // code to try to get all the cases... This logic is
459         // duplicated in QTextCodec, so if you change it here, change
460         // it there too.
461 
462         // Try to determine locale codeset from locale name assigned to
463         // LC_CTYPE category.
464 
465         // First part is getting that locale name.  First try setlocale() which
466         // definitely knows it, but since we cannot fully trust it, get ready
467         // to fall back to environment variables.
468         char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
469 
470         // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
471         // environment variables.
472         char * lang = qstrdup(qgetenv("LC_ALL").constData());
473         if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
474             if (lang) delete [] lang;
475             lang = qstrdup(qgetenv("LC_CTYPE").constData());
476         }
477         if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
478             if (lang) delete [] lang;
479             lang = qstrdup(qgetenv("LANG").constData());
480         }
481 
482         // Now try these in order:
483         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
484         // 2. CODESET from lang if it contains a .CODESET part
485         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
486         // 4. locale (ditto)
487         // 5. check for "@euro"
488 
489         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
490         codeset = ctype ? strchr(ctype, '.') : 0;
491         if (codeset && *codeset == '.') {
492             ++codeset;
493             cd = iconv_open(to ? to : codeset, from ? from : codeset);
494         }
495 
496         // 2. CODESET from lang if it contains a .CODESET part
497         codeset = lang ? strchr(lang, '.') : 0;
498         if (cd == (iconv_t) -1 && codeset && *codeset == '.') {
499             ++codeset;
500             cd = iconv_open(to ? to : codeset, from ? from : codeset);
501         }
502 
503         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
504         if (cd == (iconv_t) -1 && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
505             cd = iconv_open(to ? to : ctype, from ? from : ctype);
506 
507 
508         // 4. locale (ditto)
509         if (cd == (iconv_t) -1 && lang && *lang != 0)
510             cd = iconv_open(to ? to : lang, from ? from : lang);
511 
512         // 5. "@euro"
513         if ((cd == (iconv_t) -1 && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
514             cd = iconv_open(to ? to : "ISO8859-15", from ? from : "ISO8859-15");
515 
516         delete [] ctype;
517         delete [] lang;
518     }
519 
520     return cd;
521 }
522 
523 QT_END_NAMESPACE
524