1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39
40 #include "qicucodec_p.h"
41
42 #include "qtextcodec_p.h"
43 #include "qutfcodec_p.h"
44 #include "qlatincodec_p.h"
45 #include "qsimplecodec_p.h"
46 #include "private/qcoreglobaldata_p.h"
47 #include "qdebug.h"
48
49 #include "unicode/ucnv.h"
50
51 #if QT_CONFIG(codecs)
52 #include "qtsciicodec_p.h"
53 #include "qisciicodec_p.h"
54 #endif
55
56 QT_BEGIN_NAMESPACE
57
58 typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
59 typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
60
qIcuCodecStateFree(QTextCodec::ConverterState * state)61 static void qIcuCodecStateFree(QTextCodec::ConverterState *state)
62 {
63 ucnv_close(static_cast<UConverter *>(state->d));
64 }
65
qTextCodecNameMatch(const char * n,const char * h)66 bool qTextCodecNameMatch(const char *n, const char *h)
67 {
68 return ucnv_compareNames(n, h) == 0;
69 }
70
71 /* The list below is generated from http://www.iana.org/assignments/character-sets/
72 using the snippet of code below:
73
74 #include <QtCore>
75 #include <unicode/ucnv.h>
76
77 int main(int argc, char **argv)
78 {
79 QCoreApplication app(argc, argv);
80
81 QFile file("character-sets.txt");
82 file.open(QFile::ReadOnly);
83 QByteArray name;
84 int mib = -1;
85 QByteArray nameList;
86 int pos = 0;
87 while (!file.atEnd()) {
88 QByteArray s = file.readLine().trimmed();
89 if (s.isEmpty()) {
90 if (mib != -1) {
91 UErrorCode error = U_ZERO_ERROR;
92 const char *standard_name = ucnv_getStandardName(name, "MIME", &error);
93 if (U_FAILURE(error) || !standard_name) {
94 error = U_ZERO_ERROR;
95 standard_name = ucnv_getStandardName(name, "IANA", &error);
96 }
97 UConverter *conv = ucnv_open(standard_name, &error);
98 if (!U_FAILURE(error) && conv && standard_name) {
99 ucnv_close(conv);
100 printf(" { %d, %d },\n", mib, pos);
101 nameList += "\"";
102 nameList += standard_name;
103 nameList += "\\0\"\n";
104 pos += strlen(standard_name) + 1;
105 }
106 }
107 name = QByteArray();
108 mib = -1;
109 }
110 if (s.startsWith("Name: ")) {
111 name = s.mid(5).trimmed();
112 if (name.indexOf(' ') > 0)
113 name = name.left(name.indexOf(' '));
114 }
115 if (s.startsWith("MIBenum:"))
116 mib = s.mid(8).trimmed().toInt();
117 if (s.startsWith("Alias:") && s.contains("MIME")) {
118 name = s.mid(6).trimmed();
119 name = name.left(name.indexOf(' ')).trimmed();
120 }
121 }
122 qDebug() << nameList;
123 }
124 */
125
// One row of the MIB lookup table.
struct MibToName {
    short mib;    // IANA MIBenum value
    short index;  // byte offset of the NUL-terminated name inside mibToNameTable
};

// MIBenum -> name-offset table, produced by the generator snippet in the
// comment above. Each index must stay in sync with the layout of
// mibToNameTable; do not edit one without the other.
static const MibToName mibToName[] = {
    { 3, 0 },
    { 4, 9 },
    { 5, 20 },
    { 6, 31 },
    { 7, 42 },
    { 8, 53 },
    { 9, 64 },
    { 10, 75 },
    { 11, 86 },
    { 12, 97 },
    { 13, 108 },
    { 16, 120 },
    { 17, 134 },
    { 18, 144 },
    { 30, 151 },
    { 36, 160 },
    { 37, 167 },
    { 38, 179 },
    { 39, 186 },
    { 40, 198 },
    { 57, 212 },
    { 81, 223 },
    { 82, 234 },
    { 84, 245 },
    { 85, 256 },
    { 104, 267 },
    { 105, 279 },
    { 106, 295 },
    { 109, 301 },
    { 110, 313 },
    { 111, 325 },
    { 113, 337 },
    { 114, 341 },
    { 1000, 349 },
    { 1001, 356 },
    { 1011, 363 },
    { 1012, 368 },
    { 1013, 374 },
    { 1014, 383 },
    { 1015, 392 },
    { 1016, 399 },
    { 1017, 406 },
    { 1018, 413 },
    { 1019, 422 },
    { 1020, 431 },
    { 2004, 438 },
    { 2005, 448 },
    { 2009, 472 },
    { 2013, 479 },
    { 2016, 486 },
    { 2024, 495 },
    { 2025, 505 },
    { 2026, 512 },
    { 2027, 517 },
    { 2028, 527 },
    { 2030, 534 },
    { 2033, 541 },
    { 2034, 548 },
    { 2035, 555 },
    { 2037, 562 },
    { 2038, 569 },
    { 2039, 576 },
    { 2040, 583 },
    { 2041, 590 },
    { 2043, 597 },
    { 2011, 604 },
    { 2044, 611 },
    { 2045, 618 },
    { 2010, 624 },
    { 2046, 631 },
    { 2047, 638 },
    { 2048, 645 },
    { 2049, 652 },
    { 2050, 659 },
    { 2051, 666 },
    { 2052, 673 },
    { 2053, 680 },
    { 2054, 687 },
    { 2055, 694 },
    { 2056, 701 },
    { 2062, 708 },
    { 2063, 715 },
    { 2084, 723 },
    { 2085, 730 },
    { 2086, 741 },
    { 2087, 748 },
    { 2088, 755 },
    { 2089, 762 },
    { 2091, 771 },
    { 2092, 780 },
    { 2093, 789 },
    { 2094, 798 },
    { 2095, 807 },
    { 2096, 816 },
    { 2097, 825 },
    { 2098, 834 },
    { 2099, 843 },
    { 2100, 852 },
    { 2101, 861 },
    { 2102, 872 },
    { 2250, 880 },
    { 2251, 893 },
    { 2252, 906 },
    { 2253, 919 },
    { 2254, 932 },
    { 2255, 945 },
    { 2256, 958 },
    { 2257, 971 },
    { 2258, 984 },
    { 2259, 997 },
};

// Number of rows in mibToName. Kept file-local and immutable: it is a
// compile-time property of the table above and must never diverge from it.
static const int mibToNameSize = sizeof(mibToName) / sizeof(mibToName[0]);
244
// Concatenated, NUL-separated converter names. Entries are addressed by the
// byte offsets stored in MibToName::index (see codecForMibUnlocked() and
// mibEnum()), so the order and spelling of every string here is significant.
static const char mibToNameTable[] =
    "US-ASCII\0"
    "ISO-8859-1\0"
    "ISO-8859-2\0"
    "ISO-8859-3\0"
    "ISO-8859-4\0"
    "ISO-8859-5\0"
    "ISO-8859-6\0"
    "ISO-8859-7\0"
    "ISO-8859-8\0"
    "ISO-8859-9\0"
    "ISO-8859-10\0"
    "ISO-2022-JP-1\0"
    "Shift_JIS\0"
    "EUC-JP\0"
    "US-ASCII\0"
    "EUC-KR\0"
    "ISO-2022-KR\0"
    "EUC-KR\0"
    "ISO-2022-JP\0"
    "ISO-2022-JP-2\0"
    "GB_2312-80\0"
    "ISO-8859-6\0"
    "ISO-8859-6\0"
    "ISO-8859-8\0"
    "ISO-8859-8\0"
    "ISO-2022-CN\0"
    "ISO-2022-CN-EXT\0"
    "UTF-8\0"
    "ISO-8859-13\0"
    "ISO-8859-14\0"
    "ISO-8859-15\0"
    "GBK\0"
    "GB18030\0"
    "UTF-16\0"
    "UTF-32\0"
    "SCSU\0"
    "UTF-7\0"
    "UTF-16BE\0"
    "UTF-16LE\0"
    "UTF-16\0"
    "CESU-8\0"
    "UTF-32\0"
    "UTF-32BE\0"
    "UTF-32LE\0"
    "BOCU-1\0"
    "hp-roman8\0"
    "Adobe-Standard-Encoding\0"
    "IBM850\0"
    "IBM862\0"
    "IBM-Thai\0"
    "Shift_JIS\0"
    "GB2312\0"
    "Big5\0"
    "macintosh\0"
    "IBM037\0"
    "IBM273\0"
    "IBM277\0"
    "IBM278\0"
    "IBM280\0"
    "IBM284\0"
    "IBM285\0"
    "IBM290\0"
    "IBM297\0"
    "IBM420\0"
    "IBM424\0"
    "IBM437\0"
    "IBM500\0"
    "cp851\0"
    "IBM852\0"
    "IBM855\0"
    "IBM857\0"
    "IBM860\0"
    "IBM861\0"
    "IBM863\0"
    "IBM864\0"
    "IBM865\0"
    "IBM868\0"
    "IBM869\0"
    "IBM870\0"
    "IBM871\0"
    "IBM918\0"
    "IBM1026\0"
    "KOI8-R\0"
    "HZ-GB-2312\0"
    "IBM866\0"
    "IBM775\0"
    "KOI8-U\0"
    "IBM00858\0"
    "IBM01140\0"
    "IBM01141\0"
    "IBM01142\0"
    "IBM01143\0"
    "IBM01144\0"
    "IBM01145\0"
    "IBM01146\0"
    "IBM01147\0"
    "IBM01148\0"
    "IBM01149\0"
    "Big5-HKSCS\0"
    "IBM1047\0"
    "windows-1250\0"
    "windows-1251\0"
    "windows-1252\0"
    "windows-1253\0"
    "windows-1254\0"
    "windows-1255\0"
    "windows-1256\0"
    "windows-1257\0"
    "windows-1258\0"
    "TIS-620\0";
356
loadQtCodec(const char * name)357 static QTextCodec *loadQtCodec(const char *name)
358 {
359 if (!strcmp(name, "UTF-8"))
360 return new QUtf8Codec;
361 if (!strcmp(name, "UTF-16"))
362 return new QUtf16Codec;
363 if (!strcmp(name, "ISO-8859-1"))
364 return new QLatin1Codec;
365 if (!strcmp(name, "UTF-16BE"))
366 return new QUtf16BECodec;
367 if (!strcmp(name, "UTF-16LE"))
368 return new QUtf16LECodec;
369 if (!strcmp(name, "UTF-32"))
370 return new QUtf32Codec;
371 if (!strcmp(name, "UTF-32BE"))
372 return new QUtf32BECodec;
373 if (!strcmp(name, "UTF-32LE"))
374 return new QUtf32LECodec;
375 if (!strcmp(name, "ISO-8859-16") || !strcmp(name, "latin10") || !strcmp(name, "iso-ir-226"))
376 return new QSimpleTextCodec(13 /* == 8859-16*/);
377 #if QT_CONFIG(codecs)
378 if (!strcmp(name, "TSCII"))
379 return new QTsciiCodec;
380 if (!qstrnicmp(name, "iscii", 5))
381 return QIsciiCodec::create(name);
382 #endif
383
384 return nullptr;
385 }
386
387 /// \threadsafe
availableCodecs()388 QList<QByteArray> QIcuCodec::availableCodecs()
389 {
390 QList<QByteArray> codecs;
391 int n = ucnv_countAvailable();
392 for (int i = 0; i < n; ++i) {
393 const char *name = ucnv_getAvailableName(i);
394
395 UErrorCode error = U_ZERO_ERROR;
396 const char *standardName = ucnv_getStandardName(name, "MIME", &error);
397 if (U_FAILURE(error) || !standardName) {
398 error = U_ZERO_ERROR;
399 standardName = ucnv_getStandardName(name, "IANA", &error);
400 }
401 if (U_FAILURE(error))
402 continue;
403
404 error = U_ZERO_ERROR;
405 int ac = ucnv_countAliases(standardName, &error);
406 if (U_FAILURE(error))
407 continue;
408 for (int j = 0; j < ac; ++j) {
409 error = U_ZERO_ERROR;
410 const char *alias = ucnv_getAlias(standardName, j, &error);
411 if (!U_SUCCESS(error))
412 continue;
413 codecs += alias;
414 }
415 }
416
417 // handled by Qt and not in ICU:
418 codecs += "TSCII";
419
420 return codecs;
421 }
422
423 /// \threadsafe
availableMibs()424 QList<int> QIcuCodec::availableMibs()
425 {
426 QList<int> mibs;
427 mibs.reserve(mibToNameSize + 1);
428 for (int i = 0; i < mibToNameSize; ++i)
429 mibs += mibToName[i].mib;
430
431 // handled by Qt and not in ICU:
432 mibs += 2107; // TSCII
433
434 return mibs;
435 }
436
defaultCodecUnlocked()437 QTextCodec *QIcuCodec::defaultCodecUnlocked()
438 {
439 QCoreGlobalData *globalData = QCoreGlobalData::instance();
440 if (!globalData)
441 return nullptr;
442 QTextCodec *c = globalData->codecForLocale.loadAcquire();
443 if (c)
444 return c;
445
446 #if defined(QT_LOCALE_IS_UTF8)
447 const char *name = "UTF-8";
448 #else
449 const char *name = ucnv_getDefaultName();
450 #endif
451 c = codecForNameUnlocked(name);
452 globalData->codecForLocale.storeRelease(c);
453 return c;
454 }
455
456
codecForNameUnlocked(const char * name)457 QTextCodec *QIcuCodec::codecForNameUnlocked(const char *name)
458 {
459 // backwards compatibility with Qt 4.x
460 if (!qstrcmp(name, "CP949"))
461 name = "windows-949";
462 else if (!qstrcmp(name, "Apple Roman"))
463 name = "macintosh";
464 // these are broken data in ICU 4.4, and can't be resolved even though they are aliases to tis-620
465 if (!qstrcmp(name, "windows-874-2000")
466 || !qstrcmp(name, "windows-874")
467 || !qstrcmp(name, "MS874")
468 || !qstrcmp(name, "x-windows-874")
469 || !qstrcmp(name, "ISO 8859-11"))
470 name = "TIS-620";
471
472 UErrorCode error = U_ZERO_ERROR;
473 // MIME gives better default names
474 const char *standardName = ucnv_getStandardName(name, "MIME", &error);
475 if (U_FAILURE(error) || !standardName) {
476 error = U_ZERO_ERROR;
477 standardName = ucnv_getStandardName(name, "IANA", &error);
478 }
479 bool qt_only = false;
480 if (U_FAILURE(error) || !standardName) {
481 standardName = name;
482 qt_only = true;
483 } else {
484 // correct some issues where the ICU data set contains duplicated entries.
485 // Where this happens it's because one data set is a subset of another. We
486 // always use the larger data set.
487
488 if (qstrcmp(standardName, "GB2312") == 0 || qstrcmp(standardName, "GB_2312-80") == 0)
489 standardName = "GBK";
490 else if (qstrcmp(standardName, "KSC_5601") == 0 || qstrcmp(standardName, "EUC-KR") == 0 || qstrcmp(standardName, "cp1363") == 0)
491 standardName = "windows-949";
492 }
493
494 QCoreGlobalData *globalData = QCoreGlobalData::instance();
495 QTextCodecCache *cache = &globalData->codecCache;
496
497 QTextCodec *codec;
498 if (cache) {
499 codec = cache->value(standardName);
500 if (codec)
501 return codec;
502 }
503
504 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
505 QTextCodec *cursor = *it;
506 if (qTextCodecNameMatch(cursor->name(), standardName)) {
507 if (cache)
508 cache->insert(standardName, cursor);
509 return cursor;
510 }
511 QList<QByteArray> aliases = cursor->aliases();
512 for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
513 if (qTextCodecNameMatch(*ait, standardName)) {
514 if (cache)
515 cache->insert(standardName, cursor);
516 return cursor;
517 }
518 }
519 }
520
521 QTextCodec *c = loadQtCodec(standardName);
522 if (c)
523 return c;
524
525 if (qt_only)
526 return nullptr;
527
528 // check whether there is really a converter for the name available.
529 UConverter *conv = ucnv_open(standardName, &error);
530 if (!conv) {
531 qDebug("codecForName: ucnv_open failed %s %s", standardName, u_errorName(error));
532 return nullptr;
533 }
534 //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName;
535 ucnv_close(conv);
536
537
538 c = new QIcuCodec(standardName);
539 if (cache)
540 cache->insert(standardName, c);
541 return c;
542 }
543
544
codecForMibUnlocked(int mib)545 QTextCodec *QIcuCodec::codecForMibUnlocked(int mib)
546 {
547 for (int i = 0; i < mibToNameSize; ++i) {
548 if (mibToName[i].mib == mib)
549 return codecForNameUnlocked(mibToNameTable + mibToName[i].index);
550 }
551
552 if (mib == 2107)
553 return codecForNameUnlocked("TSCII");
554
555 return nullptr;
556 }
557
558
// Creates a codec for the ICU converter called \a name. The value is kept in
// m_name and passed to ucnv_open() whenever a converter is needed.
// NOTE(review): m_name appears to be a plain pointer (it is handed straight to
// ucnv_open() and "%s"), so \a name presumably must outlive the codec - confirm
// against the class declaration.
QIcuCodec::QIcuCodec(const char *name)
    : m_name(name)
{
}
563
~QIcuCodec()564 QIcuCodec::~QIcuCodec()
565 {
566 }
567
getConverter(QTextCodec::ConverterState * state) const568 UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const
569 {
570 UConverter *conv = nullptr;
571 if (state) {
572 if (!state->d) {
573 // first time
574 state->flags |= QTextCodec::FreeFunction;
575 QTextCodecUnalignedPointer::encode(state->state_data, qIcuCodecStateFree);
576 UErrorCode error = U_ZERO_ERROR;
577 state->d = ucnv_open(m_name, &error);
578 ucnv_setSubstChars(static_cast<UConverter *>(state->d),
579 state->flags & QTextCodec::ConvertInvalidToNull ? "\0" : "?", 1, &error);
580 if (U_FAILURE(error))
581 qDebug("getConverter(state) ucnv_open failed %s %s", m_name, u_errorName(error));
582 }
583 conv = static_cast<UConverter *>(state->d);
584 }
585 if (!conv) {
586 // stateless conversion
587 UErrorCode error = U_ZERO_ERROR;
588 conv = ucnv_open(m_name, &error);
589 ucnv_setSubstChars(conv, "?", 1, &error);
590 if (U_FAILURE(error))
591 qDebug("getConverter(no state) ucnv_open failed %s %s", m_name, u_errorName(error));
592 }
593 return conv;
594 }
595
convertToUnicode(const char * chars,int length,QTextCodec::ConverterState * state) const596 QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const
597 {
598 UConverter *conv = getConverter(state);
599
600 QString string(length + 2, Qt::Uninitialized);
601
602 const char *end = chars + length;
603 int convertedChars = 0;
604 while (1) {
605 UChar *uc = (UChar *)string.data();
606 UChar *ucEnd = uc + string.length();
607 uc += convertedChars;
608 UErrorCode error = U_ZERO_ERROR;
609 ucnv_toUnicode(conv,
610 &uc, ucEnd,
611 &chars, end,
612 nullptr, false, &error);
613 if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) {
614 qDebug("convertToUnicode failed: %s", u_errorName(error));
615 break;
616 }
617
618 convertedChars = uc - (UChar *)string.data();
619 if (chars >= end)
620 break;
621 string.resize(string.length()*2);
622 }
623 string.resize(convertedChars);
624
625 if (!state)
626 ucnv_close(conv);
627 return string;
628 }
629
630
convertFromUnicode(const QChar * unicode,int length,QTextCodec::ConverterState * state) const631 QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const
632 {
633 UConverter *conv = getConverter(state);
634
635 int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv));
636 QByteArray string(requiredLength, Qt::Uninitialized);
637
638 const UChar *uc = (const UChar *)unicode;
639 const UChar *end = uc + length;
640 int convertedChars = 0;
641 while (1) {
642 char *ch = (char *)string.data();
643 char *chEnd = ch + string.length();
644 ch += convertedChars;
645 UErrorCode error = U_ZERO_ERROR;
646 ucnv_fromUnicode(conv,
647 &ch, chEnd,
648 &uc, end,
649 nullptr, false, &error);
650 if (!U_SUCCESS(error))
651 qDebug("convertFromUnicode failed: %s", u_errorName(error));
652 convertedChars = ch - string.data();
653 if (uc >= end)
654 break;
655 string.resize(string.length()*2);
656 }
657 string.resize(convertedChars);
658
659 if (!state)
660 ucnv_close(conv);
661
662 return string;
663 }
664
665
name() const666 QByteArray QIcuCodec::name() const
667 {
668 return m_name;
669 }
670
671
aliases() const672 QList<QByteArray> QIcuCodec::aliases() const
673 {
674 UErrorCode error = U_ZERO_ERROR;
675
676 int n = ucnv_countAliases(m_name, &error);
677
678 QList<QByteArray> aliases;
679 for (int i = 0; i < n; ++i) {
680 const char *a = ucnv_getAlias(m_name, i, &error);
681 // skip the canonical name
682 if (!a || !qstrcmp(a, m_name))
683 continue;
684 aliases += a;
685 }
686
687 return aliases;
688 }
689
690
mibEnum() const691 int QIcuCodec::mibEnum() const
692 {
693 for (int i = 0; i < mibToNameSize; ++i) {
694 if (qTextCodecNameMatch(m_name, (mibToNameTable + mibToName[i].index)))
695 return mibToName[i].mib;
696 }
697
698 return 0;
699 }
700
701 QT_END_NAMESPACE
702