1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file.  Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include "qiconvcodec_p.h"
43 #include "qtextcodec_p.h"
44 #include <qlibrary.h>
45 #include <qdebug.h>
46 #include <qthreadstorage.h>
47 
48 #include <errno.h>
49 #include <locale.h>
50 #include <stdio.h>
51 #include <dlfcn.h>
52 
53 // unistd.h is needed for the _XOPEN_UNIX macro
54 #include <unistd.h>
55 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
56 #  include <langinfo.h>
57 #endif
58 
// Platform configuration for the UTF-16 side of the conversion:
//  - UTF16 is the encoding name passed to createIconv_t() (and from there to
//    iconv_open()) by convertToUnicode() and convertFromUnicode().
//  - NO_BOM, when defined, makes setByteOrder() skip feeding a byte order
//    mark to the conversion descriptor.
#if defined(Q_OS_HPUX)
#  define NO_BOM
#  define UTF16 "ucs2"
#elif defined(Q_OS_AIX)
#  define NO_BOM
#  define UTF16 "UCS-2"
#elif defined(Q_OS_FREEBSD) || defined(Q_OS_MAC)
#  define NO_BOM
// No BOM is sent on these platforms, so the byte order is spelled out in the
// encoding name instead.
#  if Q_BYTE_ORDER == Q_BIG_ENDIAN
#    define UTF16 "UTF-16BE"
#  else
#    define UTF16 "UTF-16LE"
#  endif
#else
#  define UTF16 "UTF-16"
#endif
75 
// On OS X the iconv entry points are not called directly: the QIconvCodec
// constructor resolves them from /usr/lib/libiconv at run time and stores
// them in the function pointers declared here.
#if defined(Q_OS_MAC)
#ifndef GNU_LIBICONV
#define GNU_LIBICONV
#endif
typedef iconv_t (*Ptr_iconv_open) (const char*, const char*);
typedef size_t (*Ptr_iconv) (iconv_t, const char **, size_t *, char **, size_t *);
typedef int (*Ptr_iconv_close) (iconv_t);

// All three start out null; a null ptr_iconv_open is what triggers the
// resolution in the constructor.
static Ptr_iconv_open ptr_iconv_open = 0;
static Ptr_iconv ptr_iconv = 0;
static Ptr_iconv_close ptr_iconv_close = 0;
#endif
88 
89 QT_BEGIN_NAMESPACE
90 
// Defined outside this file. Both converters consult it before touching
// their thread-local iconv state and fall back to a temporary state when it
// is false.
extern bool qt_locale_initialized;
92 
QIconvCodec()93 QIconvCodec::QIconvCodec()
94     : utf16Codec(0)
95 {
96     utf16Codec = QTextCodec::codecForMib(1015);
97     Q_ASSERT_X(utf16Codec != 0,
98                "QIconvCodec::convertToUnicode",
99                "internal error, UTF-16 codec not found");
100     if (!utf16Codec) {
101         fprintf(stderr, "QIconvCodec::convertToUnicode: internal error, UTF-16 codec not found\n");
102         utf16Codec = reinterpret_cast<QTextCodec *>(~0);
103     }
104 #if defined(Q_OS_MAC)
105     if (ptr_iconv_open == 0) {
106         QLibrary libiconv(QLatin1String("/usr/lib/libiconv"));
107         libiconv.setLoadHints(QLibrary::ExportExternalSymbolsHint);
108 
109         ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("libiconv_open"));
110         if (!ptr_iconv_open)
111             ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("iconv_open"));
112         ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("libiconv"));
113         if (!ptr_iconv)
114             ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("iconv"));
115         ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("libiconv_close"));
116         if (!ptr_iconv_close)
117             ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("iconv_close"));
118 
119         Q_ASSERT_X(ptr_iconv_open && ptr_iconv && ptr_iconv_close,
120         "QIconvCodec::QIconvCodec()",
121         "internal error, could not resolve the iconv functions");
122 
123 #       undef iconv_open
124 #       define iconv_open ptr_iconv_open
125 #       undef iconv
126 #       define iconv ptr_iconv
127 #       undef iconv_close
128 #       define iconv_close ptr_iconv_close
129     }
130 #endif
131 }
132 
// Nothing to release here: the destructor body is empty.
QIconvCodec::~QIconvCodec()
{
}
136 
// Wraps the conversion descriptor x (closed again by the destructor unless it
// is the invalid value -1). The save buffer initially points at the in-object
// array; saveChars() replaces it with heap storage when more room is needed.
QIconvCodec::IconvState::IconvState(iconv_t x)
    : buffer(array), bufferLen(sizeof array), cd(x)
{
}
141 
~IconvState()142 QIconvCodec::IconvState::~IconvState()
143 {
144     if (cd != reinterpret_cast<iconv_t>(-1))
145         iconv_close(cd);
146     if (buffer != array)
147         delete[] buffer;
148 }
149 
saveChars(const char * c,int count)150 void QIconvCodec::IconvState::saveChars(const char *c, int count)
151 {
152     if (count > bufferLen) {
153         if (buffer != array)
154             delete[] buffer;
155         buffer = new char[bufferLen = count];
156     }
157 
158     memcpy(buffer, c, count);
159 }
160 
qIconvCodecStateFree(QTextCodec::ConverterState * state)161 static void qIconvCodecStateFree(QTextCodec::ConverterState *state)
162 {
163     delete reinterpret_cast<QIconvCodec::IconvState *>(state->d);
164 }
165 
// Per-thread IconvState used by convertToUnicode() when the caller supplies
// no ConverterState.
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, toUnicodeState)
167 
168 QString QIconvCodec::convertToUnicode(const char* chars, int len, ConverterState *convState) const
169 {
170     if (utf16Codec == reinterpret_cast<QTextCodec *>(~0))
171         return QString::fromLatin1(chars, len);
172 
173     int invalidCount = 0;
174     int remainingCount = 0;
175     char *remainingBuffer = 0;
176     IconvState *temporaryState = 0;
177     IconvState **pstate;
178 
179     if (convState) {
180         // stateful conversion
181         pstate = reinterpret_cast<IconvState **>(&convState->d);
182         if (convState->d) {
183             // restore state
184             remainingCount = convState->remainingChars;
185             remainingBuffer = (*pstate)->buffer;
186         } else {
187             // first time
188             convState->flags |= FreeFunction;
189             QTextCodecUnalignedPointer::encode(convState->state_data, qIconvCodecStateFree);
190         }
191     } else {
192         QThreadStorage<QIconvCodec::IconvState *> *ts = toUnicodeState();
193         if (!qt_locale_initialized || !ts) {
194             // we're running after the Q_GLOBAL_STATIC has been deleted
195             // or before the QCoreApplication initialization
196             // bad programmer, no cookie for you
197             pstate = &temporaryState;
198         } else {
199             // stateless conversion -- use thread-local data
200             pstate = &toUnicodeState()->localData();
201         }
202     }
203 
204     if (!*pstate) {
205         // first time, create the state
206         iconv_t cd = QIconvCodec::createIconv_t(UTF16, 0);
207         if (cd == reinterpret_cast<iconv_t>(-1)) {
208             static int reported = 0;
209             if (!reported++) {
210                 fprintf(stderr,
211                         "QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv_open failed\n");
212             }
213             return QString::fromLatin1(chars, len);
214         }
215 
216         *pstate = new IconvState(cd);
217     }
218 
219     IconvState *state = *pstate;
220     size_t inBytesLeft = len;
221     // best case assumption, each byte is converted into one UTF-16 character, plus 2 bytes for the BOM
222     char *inBytes = const_cast<char *>(chars);
223 
224     QByteArray in;
225     if (remainingCount) {
226         // we have to prepend the remaining bytes from the previous conversion
227         inBytesLeft += remainingCount;
228         in.resize(inBytesLeft);
229         inBytes = in.data();
230 
231         memcpy(in.data(), remainingBuffer, remainingCount);
232         memcpy(in.data() + remainingCount, chars, len);
233 
234         remainingCount = 0;
235     }
236 
237     size_t outBytesLeft = len * 2 + 2;
238     QByteArray ba(outBytesLeft, Qt::Uninitialized);
239     char *outBytes = ba.data();
240     do {
241         size_t ret = iconv(state->cd, &inBytes, &inBytesLeft, &outBytes, &outBytesLeft);
242         if (ret == (size_t) -1) {
243             if (errno == E2BIG) {
244                 int offset = ba.size() - outBytesLeft;
245                 ba.resize(ba.size() * 2);
246                 outBytes = ba.data() + offset;
247                 outBytesLeft = ba.size() - offset;
248 
249                 continue;
250             }
251 
252             if (errno == EILSEQ) {
253                 // conversion stopped because of an invalid character in the sequence
254                 ++invalidCount;
255             } else if (errno == EINVAL && convState) {
256                 // conversion stopped because the remaining inBytesLeft make up
257                 // an incomplete multi-byte sequence; save them for later
258                 state->saveChars(inBytes, inBytesLeft);
259                 remainingCount = inBytesLeft;
260                 break;
261             }
262 
263             if (errno == EILSEQ || errno == EINVAL) {
264                 // skip the next character
265                 ++inBytes;
266                 --inBytesLeft;
267                 continue;
268             }
269 
270             // some other error
271             // note, cannot use qWarning() since we are implementing the codecForLocale :)
272             perror("QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv failed");
273 
274             if (!convState) {
275                 // reset state
276                 iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
277             }
278 
279             delete temporaryState;
280             return QString::fromLatin1(chars, len);
281         }
282     } while (inBytesLeft != 0);
283 
284     QString s;
285 
286     if (convState) {
287         s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft, &state->internalState);
288 
289         convState->invalidChars = invalidCount;
290         convState->remainingChars = remainingCount;
291     } else {
292         s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft);
293 
294         // reset state
295         iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
296     }
297 
298     delete temporaryState;
299     return s;
300 }
301 
// Per-thread IconvState used by convertFromUnicode().
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, fromUnicodeState)
303 
304 static bool setByteOrder(iconv_t cd)
305 {
306 #if !defined(NO_BOM)
307     // give iconv() a BOM
308     char buf[4];
309     ushort bom[] = { QChar::ByteOrderMark };
310 
311     char *outBytes = buf;
312     char *inBytes = reinterpret_cast<char *>(bom);
313     size_t outBytesLeft = sizeof buf;
314     size_t inBytesLeft = sizeof bom;
315 
316     char **inBytesPtr = &inBytes;
317 
318     if (iconv(cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
319         return false;
320     }
321 #endif // NO_BOM
322 
323     return true;
324 }
325 
convertFromUnicode(const QChar * uc,int len,ConverterState * convState) const326 QByteArray QIconvCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *convState) const
327 {
328     char *inBytes;
329     char *outBytes;
330     size_t inBytesLeft;
331 
332     char **inBytesPtr = &inBytes;
333 
334     IconvState *temporaryState = 0;
335     QThreadStorage<QIconvCodec::IconvState *> *ts = fromUnicodeState();
336     IconvState *&state = (qt_locale_initialized && ts) ? ts->localData() : temporaryState;
337     if (!state) {
338         iconv_t cd = QIconvCodec::createIconv_t(0, UTF16);
339         if (cd != reinterpret_cast<iconv_t>(-1)) {
340             if (!setByteOrder(cd)) {
341                 perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed for BOM");
342 
343                 iconv_close(cd);
344                 cd = reinterpret_cast<iconv_t>(-1);
345 
346                 return QString(uc, len).toLatin1();
347             }
348         }
349         state = new IconvState(cd);
350     }
351     if (state->cd == reinterpret_cast<iconv_t>(-1)) {
352         static int reported = 0;
353         if (!reported++) {
354             fprintf(stderr,
355                     "QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv_open failed\n");
356         }
357         delete temporaryState;
358         return QString(uc, len).toLatin1();
359     }
360 
361     size_t outBytesLeft = len;
362     QByteArray ba(outBytesLeft, Qt::Uninitialized);
363     outBytes = ba.data();
364 
365     // now feed iconv() the real data
366     inBytes = const_cast<char *>(reinterpret_cast<const char *>(uc));
367     inBytesLeft = len * sizeof(QChar);
368 
369     QByteArray in;
370     if (convState && convState->remainingChars) {
371         // we have one surrogate char to be prepended
372         in.resize(sizeof(QChar) + len);
373         inBytes = in.data();
374 
375         QChar remaining = convState->state_data[0];
376         memcpy(in.data(), &remaining, sizeof(QChar));
377         memcpy(in.data() + sizeof(QChar), uc, inBytesLeft);
378 
379         inBytesLeft += sizeof(QChar);
380         convState->remainingChars = 0;
381     }
382 
383     int invalidCount = 0;
384     while (inBytesLeft != 0) {
385         if (iconv(state->cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) {
386             if (errno == EINVAL && convState) {
387                 // buffer ends in a surrogate
388                 Q_ASSERT(inBytesLeft == 2);
389                 convState->remainingChars = 1;
390                 convState->state_data[0] = uc[len - 1].unicode();
391                 break;
392             }
393 
394             switch (errno) {
395             case EILSEQ:
396                 ++invalidCount;
397                 // fall through
398             case EINVAL:
399                 {
400                     inBytes += sizeof(QChar);
401                     inBytesLeft -= sizeof(QChar);
402                     break;
403                 }
404             case E2BIG:
405                 {
406                     int offset = ba.size() - outBytesLeft;
407                     ba.resize(ba.size() * 2);
408                     outBytes = ba.data() + offset;
409                     outBytesLeft = ba.size() - offset;
410                     break;
411                 }
412             default:
413                 {
414                     // note, cannot use qWarning() since we are implementing the codecForLocale :)
415                     perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed");
416 
417                     // reset to initial state
418                     iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
419 
420                     delete temporaryState;
421                     return QString(uc, len).toLatin1();
422                 }
423             }
424         }
425     }
426 
427     // reset to initial state
428     iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft);
429     setByteOrder(state->cd);
430 
431     ba.resize(ba.size() - outBytesLeft);
432 
433     if (convState)
434         convState->invalidChars = invalidCount;
435 
436     delete temporaryState;
437     return ba;
438 }
439 
// Codec name reported for this codec: always "System".
QByteArray QIconvCodec::name() const
{
    return "System";
}
444 
// MIB enum reported for this codec: always 0.
int QIconvCodec::mibEnum() const
{
    return 0;
}
449 
createIconv_t(const char * to,const char * from)450 iconv_t QIconvCodec::createIconv_t(const char *to, const char *from)
451 {
452     Q_ASSERT((to == 0 && from != 0) || (to != 0 && from == 0));
453 
454     iconv_t cd = (iconv_t) -1;
455 #if defined(__GLIBC__) || defined(GNU_LIBICONV) || defined(Q_OS_QNX)
456 #if defined(Q_OS_QNX)
457     // on QNX the default locale is UTF-8, and an empty string will cause iconv_open to fail
458     static const char empty_codeset[] = "UTF-8";
459 #else
460     // both GLIBC and libgnuiconv will use the locale's encoding if from or to is an empty string
461     static const char empty_codeset[] = "";
462 #endif
463     const char *codeset = empty_codeset;
464     cd = iconv_open(to ? to : codeset, from ? from : codeset);
465 #else
466     char *codeset = 0;
467 #endif
468 
469 #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
470     if (cd == (iconv_t) -1) {
471         codeset = nl_langinfo(CODESET);
472         if (codeset)
473             cd = iconv_open(to ? to : codeset, from ? from : codeset);
474     }
475 #endif
476 
477     if (cd == (iconv_t) -1) {
478         // Very poorly defined and followed standards causes lots of
479         // code to try to get all the cases... This logic is
480         // duplicated in QTextCodec, so if you change it here, change
481         // it there too.
482 
483         // Try to determine locale codeset from locale name assigned to
484         // LC_CTYPE category.
485 
486         // First part is getting that locale name.  First try setlocale() which
487         // definitely knows it, but since we cannot fully trust it, get ready
488         // to fall back to environment variables.
489         char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
490 
491         // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
492         // environment variables.
493         char * lang = qstrdup(qgetenv("LC_ALL").constData());
494         if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
495             if (lang) delete [] lang;
496             lang = qstrdup(qgetenv("LC_CTYPE").constData());
497         }
498         if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
499             if (lang) delete [] lang;
500             lang = qstrdup(qgetenv("LANG").constData());
501         }
502 
503         // Now try these in order:
504         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
505         // 2. CODESET from lang if it contains a .CODESET part
506         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
507         // 4. locale (ditto)
508         // 5. check for "@euro"
509 
510         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
511         codeset = ctype ? strchr(ctype, '.') : 0;
512         if (codeset && *codeset == '.') {
513             ++codeset;
514             cd = iconv_open(to ? to : codeset, from ? from : codeset);
515         }
516 
517         // 2. CODESET from lang if it contains a .CODESET part
518         codeset = lang ? strchr(lang, '.') : 0;
519         if (cd == (iconv_t) -1 && codeset && *codeset == '.') {
520             ++codeset;
521             cd = iconv_open(to ? to : codeset, from ? from : codeset);
522         }
523 
524         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
525         if (cd == (iconv_t) -1 && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
526             cd = iconv_open(to ? to : ctype, from ? from : ctype);
527 
528 
529         // 4. locale (ditto)
530         if (cd == (iconv_t) -1 && lang && *lang != 0)
531             cd = iconv_open(to ? to : lang, from ? from : lang);
532 
533         // 5. "@euro"
534         if ((cd == (iconv_t) -1 && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
535             cd = iconv_open(to ? to : "ISO8859-15", from ? from : "ISO8859-15");
536 
537         delete [] ctype;
538         delete [] lang;
539     }
540 
541     return cd;
542 }
543 
544 QT_END_NAMESPACE
545