1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include "qicucodec_p.h"
41 
42 #include "qtextcodec_p.h"
43 #include "qutfcodec_p.h"
44 #include "qlatincodec_p.h"
45 #include "qsimplecodec_p.h"
46 #include "private/qcoreglobaldata_p.h"
47 #include "qdebug.h"
48 
49 #include "unicode/ucnv.h"
50 
51 #if QT_CONFIG(codecs)
52 #include "qtsciicodec_p.h"
53 #include "qisciicodec_p.h"
54 #endif
55 
56 QT_BEGIN_NAMESPACE
57 
58 typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
59 typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
60 
qIcuCodecStateFree(QTextCodec::ConverterState * state)61 static void qIcuCodecStateFree(QTextCodec::ConverterState *state)
62 {
63     ucnv_close(static_cast<UConverter *>(state->d));
64 }
65 
qTextCodecNameMatch(const char * n,const char * h)66 bool qTextCodecNameMatch(const char *n, const char *h)
67 {
68     return ucnv_compareNames(n, h) == 0;
69 }
70 
71 /* The list below is generated from http://www.iana.org/assignments/character-sets/
72    using the snippet of code below:
73 
74 #include <QtCore>
75 #include <unicode/ucnv.h>
76 
77 int main(int argc, char **argv)
78 {
79     QCoreApplication app(argc, argv);
80 
81     QFile file("character-sets.txt");
82     file.open(QFile::ReadOnly);
83     QByteArray name;
84     int mib = -1;
85     QByteArray nameList;
86     int pos = 0;
87     while (!file.atEnd()) {
88         QByteArray s = file.readLine().trimmed();
89         if (s.isEmpty()) {
90             if (mib != -1) {
91                 UErrorCode error = U_ZERO_ERROR;
92                 const char *standard_name = ucnv_getStandardName(name, "MIME", &error);
93                 if (U_FAILURE(error) || !standard_name) {
94                     error = U_ZERO_ERROR;
95                     standard_name = ucnv_getStandardName(name, "IANA", &error);
96                 }
97                 UConverter *conv = ucnv_open(standard_name, &error);
98                 if (!U_FAILURE(error) && conv && standard_name) {
99                     ucnv_close(conv);
100                     printf("    { %d, %d },\n", mib, pos);
101                     nameList += "\"";
102                     nameList += standard_name;
103                     nameList += "\\0\"\n";
104                     pos += strlen(standard_name) + 1;
105                 }
106             }
107             name = QByteArray();
108             mib = -1;
109         }
110         if (s.startsWith("Name: ")) {
111             name = s.mid(5).trimmed();
112             if (name.indexOf(' ') > 0)
113                 name = name.left(name.indexOf(' '));
114         }
115         if (s.startsWith("MIBenum:"))
116             mib = s.mid(8).trimmed().toInt();
117         if (s.startsWith("Alias:") && s.contains("MIME")) {
118             name = s.mid(6).trimmed();
119             name = name.left(name.indexOf(' ')).trimmed();
120         }
121     }
122     qDebug() << nameList;
123 }
124 */
125 
// One row of the MIB lookup table: an IANA MIBenum value paired with the byte
// offset of the matching NUL-terminated name inside mibToNameTable.
struct MibToName {
    short mib;    // MIBenum value
    short index;  // byte offset into mibToNameTable
};

// MIBenum -> name-offset table. The trailing comment on each row is the name
// stored at that offset in mibToNameTable.
static const MibToName mibToName[] = {
    { 3, 0 },        // US-ASCII
    { 4, 9 },        // ISO-8859-1
    { 5, 20 },       // ISO-8859-2
    { 6, 31 },       // ISO-8859-3
    { 7, 42 },       // ISO-8859-4
    { 8, 53 },       // ISO-8859-5
    { 9, 64 },       // ISO-8859-6
    { 10, 75 },      // ISO-8859-7
    { 11, 86 },      // ISO-8859-8
    { 12, 97 },      // ISO-8859-9
    { 13, 108 },     // ISO-8859-10
    { 16, 120 },     // ISO-2022-JP-1
    { 17, 134 },     // Shift_JIS
    { 18, 144 },     // EUC-JP
    { 30, 151 },     // US-ASCII
    { 36, 160 },     // EUC-KR
    { 37, 167 },     // ISO-2022-KR
    { 38, 179 },     // EUC-KR
    { 39, 186 },     // ISO-2022-JP
    { 40, 198 },     // ISO-2022-JP-2
    { 57, 212 },     // GB_2312-80
    { 81, 223 },     // ISO-8859-6
    { 82, 234 },     // ISO-8859-6
    { 84, 245 },     // ISO-8859-8
    { 85, 256 },     // ISO-8859-8
    { 104, 267 },    // ISO-2022-CN
    { 105, 279 },    // ISO-2022-CN-EXT
    { 106, 295 },    // UTF-8
    { 109, 301 },    // ISO-8859-13
    { 110, 313 },    // ISO-8859-14
    { 111, 325 },    // ISO-8859-15
    { 113, 337 },    // GBK
    { 114, 341 },    // GB18030
    { 1000, 349 },   // UTF-16
    { 1001, 356 },   // UTF-32
    { 1011, 363 },   // SCSU
    { 1012, 368 },   // UTF-7
    { 1013, 374 },   // UTF-16BE
    { 1014, 383 },   // UTF-16LE
    { 1015, 392 },   // UTF-16
    { 1016, 399 },   // CESU-8
    { 1017, 406 },   // UTF-32
    { 1018, 413 },   // UTF-32BE
    { 1019, 422 },   // UTF-32LE
    { 1020, 431 },   // BOCU-1
    { 2004, 438 },   // hp-roman8
    { 2005, 448 },   // Adobe-Standard-Encoding
    { 2009, 472 },   // IBM850
    { 2013, 479 },   // IBM862
    { 2016, 486 },   // IBM-Thai
    { 2024, 495 },   // Shift_JIS
    { 2025, 505 },   // GB2312
    { 2026, 512 },   // Big5
    { 2027, 517 },   // macintosh
    { 2028, 527 },   // IBM037
    { 2030, 534 },   // IBM273
    { 2033, 541 },   // IBM277
    { 2034, 548 },   // IBM278
    { 2035, 555 },   // IBM280
    { 2037, 562 },   // IBM284
    { 2038, 569 },   // IBM285
    { 2039, 576 },   // IBM290
    { 2040, 583 },   // IBM297
    { 2041, 590 },   // IBM420
    { 2043, 597 },   // IBM424
    { 2011, 604 },   // IBM437
    { 2044, 611 },   // IBM500
    { 2045, 618 },   // cp851
    { 2010, 624 },   // IBM852
    { 2046, 631 },   // IBM855
    { 2047, 638 },   // IBM857
    { 2048, 645 },   // IBM860
    { 2049, 652 },   // IBM861
    { 2050, 659 },   // IBM863
    { 2051, 666 },   // IBM864
    { 2052, 673 },   // IBM865
    { 2053, 680 },   // IBM868
    { 2054, 687 },   // IBM869
    { 2055, 694 },   // IBM870
    { 2056, 701 },   // IBM871
    { 2062, 708 },   // IBM918
    { 2063, 715 },   // IBM1026
    { 2084, 723 },   // KOI8-R
    { 2085, 730 },   // HZ-GB-2312
    { 2086, 741 },   // IBM866
    { 2087, 748 },   // IBM775
    { 2088, 755 },   // KOI8-U
    { 2089, 762 },   // IBM00858
    { 2091, 771 },   // IBM01140
    { 2092, 780 },   // IBM01141
    { 2093, 789 },   // IBM01142
    { 2094, 798 },   // IBM01143
    { 2095, 807 },   // IBM01144
    { 2096, 816 },   // IBM01145
    { 2097, 825 },   // IBM01146
    { 2098, 834 },   // IBM01147
    { 2099, 843 },   // IBM01148
    { 2100, 852 },   // IBM01149
    { 2101, 861 },   // Big5-HKSCS
    { 2102, 872 },   // IBM1047
    { 2250, 880 },   // windows-1250
    { 2251, 893 },   // windows-1251
    { 2252, 906 },   // windows-1252
    { 2253, 919 },   // windows-1253
    { 2254, 932 },   // windows-1254
    { 2255, 945 },   // windows-1255
    { 2256, 958 },   // windows-1256
    { 2257, 971 },   // windows-1257
    { 2258, 984 },   // windows-1258
    { 2259, 997 },   // TIS-620
};

// Number of rows in mibToName.
int mibToNameSize = sizeof(mibToName) / sizeof(mibToName[0]);
244 
// Packed table of NUL-terminated codec names addressed by MibToName::index.
// The trailing comment on each line is the byte offset at which that name
// starts.
static const char mibToNameTable[] =
    "US-ASCII\0"                 //   0
    "ISO-8859-1\0"               //   9
    "ISO-8859-2\0"               //  20
    "ISO-8859-3\0"               //  31
    "ISO-8859-4\0"               //  42
    "ISO-8859-5\0"               //  53
    "ISO-8859-6\0"               //  64
    "ISO-8859-7\0"               //  75
    "ISO-8859-8\0"               //  86
    "ISO-8859-9\0"               //  97
    "ISO-8859-10\0"              // 108
    "ISO-2022-JP-1\0"            // 120
    "Shift_JIS\0"                // 134
    "EUC-JP\0"                   // 144
    "US-ASCII\0"                 // 151
    "EUC-KR\0"                   // 160
    "ISO-2022-KR\0"              // 167
    "EUC-KR\0"                   // 179
    "ISO-2022-JP\0"              // 186
    "ISO-2022-JP-2\0"            // 198
    "GB_2312-80\0"               // 212
    "ISO-8859-6\0"               // 223
    "ISO-8859-6\0"               // 234
    "ISO-8859-8\0"               // 245
    "ISO-8859-8\0"               // 256
    "ISO-2022-CN\0"              // 267
    "ISO-2022-CN-EXT\0"          // 279
    "UTF-8\0"                    // 295
    "ISO-8859-13\0"              // 301
    "ISO-8859-14\0"              // 313
    "ISO-8859-15\0"              // 325
    "GBK\0"                      // 337
    "GB18030\0"                  // 341
    "UTF-16\0"                   // 349
    "UTF-32\0"                   // 356
    "SCSU\0"                     // 363
    "UTF-7\0"                    // 368
    "UTF-16BE\0"                 // 374
    "UTF-16LE\0"                 // 383
    "UTF-16\0"                   // 392
    "CESU-8\0"                   // 399
    "UTF-32\0"                   // 406
    "UTF-32BE\0"                 // 413
    "UTF-32LE\0"                 // 422
    "BOCU-1\0"                   // 431
    "hp-roman8\0"                // 438
    "Adobe-Standard-Encoding\0"  // 448
    "IBM850\0"                   // 472
    "IBM862\0"                   // 479
    "IBM-Thai\0"                 // 486
    "Shift_JIS\0"                // 495
    "GB2312\0"                   // 505
    "Big5\0"                     // 512
    "macintosh\0"                // 517
    "IBM037\0"                   // 527
    "IBM273\0"                   // 534
    "IBM277\0"                   // 541
    "IBM278\0"                   // 548
    "IBM280\0"                   // 555
    "IBM284\0"                   // 562
    "IBM285\0"                   // 569
    "IBM290\0"                   // 576
    "IBM297\0"                   // 583
    "IBM420\0"                   // 590
    "IBM424\0"                   // 597
    "IBM437\0"                   // 604
    "IBM500\0"                   // 611
    "cp851\0"                    // 618
    "IBM852\0"                   // 624
    "IBM855\0"                   // 631
    "IBM857\0"                   // 638
    "IBM860\0"                   // 645
    "IBM861\0"                   // 652
    "IBM863\0"                   // 659
    "IBM864\0"                   // 666
    "IBM865\0"                   // 673
    "IBM868\0"                   // 680
    "IBM869\0"                   // 687
    "IBM870\0"                   // 694
    "IBM871\0"                   // 701
    "IBM918\0"                   // 708
    "IBM1026\0"                  // 715
    "KOI8-R\0"                   // 723
    "HZ-GB-2312\0"               // 730
    "IBM866\0"                   // 741
    "IBM775\0"                   // 748
    "KOI8-U\0"                   // 755
    "IBM00858\0"                 // 762
    "IBM01140\0"                 // 771
    "IBM01141\0"                 // 780
    "IBM01142\0"                 // 789
    "IBM01143\0"                 // 798
    "IBM01144\0"                 // 807
    "IBM01145\0"                 // 816
    "IBM01146\0"                 // 825
    "IBM01147\0"                 // 834
    "IBM01148\0"                 // 843
    "IBM01149\0"                 // 852
    "Big5-HKSCS\0"               // 861
    "IBM1047\0"                  // 872
    "windows-1250\0"             // 880
    "windows-1251\0"             // 893
    "windows-1252\0"             // 906
    "windows-1253\0"             // 919
    "windows-1254\0"             // 932
    "windows-1255\0"             // 945
    "windows-1256\0"             // 958
    "windows-1257\0"             // 971
    "windows-1258\0"             // 984
    "TIS-620\0";                 // 997
356 
loadQtCodec(const char * name)357 static QTextCodec *loadQtCodec(const char *name)
358 {
359     if (!strcmp(name, "UTF-8"))
360         return new QUtf8Codec;
361     if (!strcmp(name, "UTF-16"))
362         return new QUtf16Codec;
363     if (!strcmp(name, "ISO-8859-1"))
364         return new QLatin1Codec;
365     if (!strcmp(name, "UTF-16BE"))
366         return new QUtf16BECodec;
367     if (!strcmp(name, "UTF-16LE"))
368         return new QUtf16LECodec;
369     if (!strcmp(name, "UTF-32"))
370         return new QUtf32Codec;
371     if (!strcmp(name, "UTF-32BE"))
372         return new QUtf32BECodec;
373     if (!strcmp(name, "UTF-32LE"))
374         return new QUtf32LECodec;
375     if (!strcmp(name, "ISO-8859-16") || !strcmp(name, "latin10") || !strcmp(name, "iso-ir-226"))
376         return new QSimpleTextCodec(13 /* == 8859-16*/);
377 #if QT_CONFIG(codecs)
378     if (!strcmp(name, "TSCII"))
379         return new QTsciiCodec;
380     if (!qstrnicmp(name, "iscii", 5))
381         return QIsciiCodec::create(name);
382 #endif
383 
384     return nullptr;
385 }
386 
387 /// \threadsafe
availableCodecs()388 QList<QByteArray> QIcuCodec::availableCodecs()
389 {
390     QList<QByteArray> codecs;
391     int n = ucnv_countAvailable();
392     for (int i = 0; i < n; ++i) {
393         const char *name = ucnv_getAvailableName(i);
394 
395         UErrorCode error = U_ZERO_ERROR;
396         const char *standardName = ucnv_getStandardName(name, "MIME", &error);
397         if (U_FAILURE(error) || !standardName) {
398             error = U_ZERO_ERROR;
399             standardName = ucnv_getStandardName(name, "IANA", &error);
400         }
401         if (U_FAILURE(error))
402             continue;
403 
404         error = U_ZERO_ERROR;
405         int ac = ucnv_countAliases(standardName, &error);
406         if (U_FAILURE(error))
407             continue;
408         for (int j = 0; j < ac; ++j) {
409             error = U_ZERO_ERROR;
410             const char *alias = ucnv_getAlias(standardName, j, &error);
411             if (!U_SUCCESS(error))
412                 continue;
413             codecs += alias;
414         }
415     }
416 
417     // handled by Qt and not in ICU:
418     codecs += "TSCII";
419 
420     return codecs;
421 }
422 
423 /// \threadsafe
availableMibs()424 QList<int> QIcuCodec::availableMibs()
425 {
426     QList<int> mibs;
427     mibs.reserve(mibToNameSize + 1);
428     for (int i = 0; i < mibToNameSize; ++i)
429         mibs += mibToName[i].mib;
430 
431     // handled by Qt and not in ICU:
432     mibs += 2107; // TSCII
433 
434     return mibs;
435 }
436 
defaultCodecUnlocked()437 QTextCodec *QIcuCodec::defaultCodecUnlocked()
438 {
439     QCoreGlobalData *globalData = QCoreGlobalData::instance();
440     if (!globalData)
441         return nullptr;
442     QTextCodec *c = globalData->codecForLocale.loadAcquire();
443     if (c)
444         return c;
445 
446 #if defined(QT_LOCALE_IS_UTF8)
447     const char *name = "UTF-8";
448 #else
449     const char *name = ucnv_getDefaultName();
450 #endif
451     c = codecForNameUnlocked(name);
452     globalData->codecForLocale.storeRelease(c);
453     return c;
454 }
455 
456 
codecForNameUnlocked(const char * name)457 QTextCodec *QIcuCodec::codecForNameUnlocked(const char *name)
458 {
459     // backwards compatibility with Qt 4.x
460     if (!qstrcmp(name, "CP949"))
461         name = "windows-949";
462     else if (!qstrcmp(name, "Apple Roman"))
463         name = "macintosh";
464     // these are broken data in ICU 4.4, and can't be resolved even though they are aliases to tis-620
465     if (!qstrcmp(name, "windows-874-2000")
466         || !qstrcmp(name, "windows-874")
467         || !qstrcmp(name, "MS874")
468         || !qstrcmp(name, "x-windows-874")
469         || !qstrcmp(name, "ISO 8859-11"))
470         name = "TIS-620";
471 
472     UErrorCode error = U_ZERO_ERROR;
473     // MIME gives better default names
474     const char *standardName = ucnv_getStandardName(name, "MIME", &error);
475     if (U_FAILURE(error) || !standardName) {
476         error = U_ZERO_ERROR;
477         standardName = ucnv_getStandardName(name, "IANA", &error);
478     }
479     bool qt_only = false;
480     if (U_FAILURE(error) || !standardName) {
481         standardName = name;
482         qt_only = true;
483     } else {
484         // correct some issues where the ICU data set contains duplicated entries.
485         // Where this happens it's because one data set is a subset of another. We
486         // always use the larger data set.
487 
488         if (qstrcmp(standardName, "GB2312") == 0 || qstrcmp(standardName, "GB_2312-80") == 0)
489             standardName = "GBK";
490         else if (qstrcmp(standardName, "KSC_5601") == 0 || qstrcmp(standardName, "EUC-KR") == 0 || qstrcmp(standardName, "cp1363") == 0)
491             standardName = "windows-949";
492     }
493 
494     QCoreGlobalData *globalData = QCoreGlobalData::instance();
495     QTextCodecCache *cache = &globalData->codecCache;
496 
497     QTextCodec *codec;
498     if (cache) {
499         codec = cache->value(standardName);
500         if (codec)
501             return codec;
502     }
503 
504     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
505         QTextCodec *cursor = *it;
506         if (qTextCodecNameMatch(cursor->name(), standardName)) {
507             if (cache)
508                 cache->insert(standardName, cursor);
509             return cursor;
510         }
511         QList<QByteArray> aliases = cursor->aliases();
512         for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
513             if (qTextCodecNameMatch(*ait, standardName)) {
514                 if (cache)
515                     cache->insert(standardName, cursor);
516                 return cursor;
517             }
518         }
519     }
520 
521     QTextCodec *c = loadQtCodec(standardName);
522     if (c)
523         return c;
524 
525     if (qt_only)
526         return nullptr;
527 
528     // check whether there is really a converter for the name available.
529     UConverter *conv = ucnv_open(standardName, &error);
530     if (!conv) {
531         qDebug("codecForName: ucnv_open failed %s %s", standardName, u_errorName(error));
532         return nullptr;
533     }
534     //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName;
535     ucnv_close(conv);
536 
537 
538     c = new QIcuCodec(standardName);
539     if (cache)
540         cache->insert(standardName, c);
541     return c;
542 }
543 
544 
codecForMibUnlocked(int mib)545 QTextCodec *QIcuCodec::codecForMibUnlocked(int mib)
546 {
547     for (int i = 0; i < mibToNameSize; ++i) {
548         if (mibToName[i].mib == mib)
549             return codecForNameUnlocked(mibToNameTable + mibToName[i].index);
550     }
551 
552     if (mib == 2107)
553         return codecForNameUnlocked("TSCII");
554 
555     return nullptr;
556 }
557 
558 
QIcuCodec(const char * name)559 QIcuCodec::QIcuCodec(const char *name)
560     : m_name(name)
561 {
562 }
563 
~QIcuCodec()564 QIcuCodec::~QIcuCodec()
565 {
566 }
567 
getConverter(QTextCodec::ConverterState * state) const568 UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const
569 {
570     UConverter *conv = nullptr;
571     if (state) {
572         if (!state->d) {
573             // first time
574             state->flags |= QTextCodec::FreeFunction;
575             QTextCodecUnalignedPointer::encode(state->state_data, qIcuCodecStateFree);
576             UErrorCode error = U_ZERO_ERROR;
577             state->d = ucnv_open(m_name, &error);
578             ucnv_setSubstChars(static_cast<UConverter *>(state->d),
579                                state->flags & QTextCodec::ConvertInvalidToNull ? "\0" : "?", 1, &error);
580             if (U_FAILURE(error))
581                 qDebug("getConverter(state) ucnv_open failed %s %s", m_name, u_errorName(error));
582         }
583         conv = static_cast<UConverter *>(state->d);
584     }
585     if (!conv) {
586         // stateless conversion
587         UErrorCode error = U_ZERO_ERROR;
588         conv = ucnv_open(m_name, &error);
589         ucnv_setSubstChars(conv, "?", 1, &error);
590         if (U_FAILURE(error))
591             qDebug("getConverter(no state) ucnv_open failed %s %s", m_name, u_errorName(error));
592     }
593     return conv;
594 }
595 
convertToUnicode(const char * chars,int length,QTextCodec::ConverterState * state) const596 QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const
597 {
598     UConverter *conv = getConverter(state);
599 
600     QString string(length + 2, Qt::Uninitialized);
601 
602     const char *end = chars + length;
603     int convertedChars = 0;
604     while (1) {
605         UChar *uc = (UChar *)string.data();
606         UChar *ucEnd = uc + string.length();
607         uc += convertedChars;
608         UErrorCode error = U_ZERO_ERROR;
609         ucnv_toUnicode(conv,
610                        &uc, ucEnd,
611                        &chars, end,
612                        nullptr, false, &error);
613         if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) {
614             qDebug("convertToUnicode failed: %s", u_errorName(error));
615             break;
616         }
617 
618         convertedChars = uc - (UChar *)string.data();
619         if (chars >= end)
620             break;
621         string.resize(string.length()*2);
622     }
623     string.resize(convertedChars);
624 
625     if (!state)
626         ucnv_close(conv);
627     return string;
628 }
629 
630 
convertFromUnicode(const QChar * unicode,int length,QTextCodec::ConverterState * state) const631 QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const
632 {
633     UConverter *conv = getConverter(state);
634 
635     int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv));
636     QByteArray string(requiredLength, Qt::Uninitialized);
637 
638     const UChar *uc = (const UChar *)unicode;
639     const UChar *end = uc + length;
640     int convertedChars = 0;
641     while (1) {
642         char *ch = (char *)string.data();
643         char *chEnd = ch + string.length();
644         ch += convertedChars;
645         UErrorCode error = U_ZERO_ERROR;
646         ucnv_fromUnicode(conv,
647                          &ch, chEnd,
648                          &uc, end,
649                          nullptr, false, &error);
650         if (!U_SUCCESS(error))
651             qDebug("convertFromUnicode failed: %s", u_errorName(error));
652         convertedChars = ch - string.data();
653         if (uc >= end)
654             break;
655         string.resize(string.length()*2);
656     }
657     string.resize(convertedChars);
658 
659     if (!state)
660         ucnv_close(conv);
661 
662     return string;
663 }
664 
665 
name() const666 QByteArray QIcuCodec::name() const
667 {
668     return m_name;
669 }
670 
671 
aliases() const672 QList<QByteArray> QIcuCodec::aliases() const
673 {
674     UErrorCode error = U_ZERO_ERROR;
675 
676     int n = ucnv_countAliases(m_name, &error);
677 
678     QList<QByteArray> aliases;
679     for (int i = 0; i < n; ++i) {
680         const char *a = ucnv_getAlias(m_name, i, &error);
681         // skip the canonical name
682         if (!a || !qstrcmp(a, m_name))
683             continue;
684         aliases += a;
685     }
686 
687     return aliases;
688 }
689 
690 
mibEnum() const691 int QIcuCodec::mibEnum() const
692 {
693     for (int i = 0; i < mibToNameSize; ++i) {
694         if (qTextCodecNameMatch(m_name, (mibToNameTable + mibToName[i].index)))
695             return mibToName[i].mib;
696     }
697 
698     return 0;
699 }
700 
701 QT_END_NAMESPACE
702