1 /****************************************************************************
2 **
3 ** Copyright (C) 2018 The Qt Company Ltd.
4 ** Copyright (C) 2018 Intel Corporation.
5 ** Contact: https://www.qt.io/licensing/
6 **
7 ** This file is part of the QtCore module of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** Commercial License Usage
11 ** Licensees holding valid commercial Qt licenses may use this file in
12 ** accordance with the commercial license agreement provided with the
13 ** Software or, alternatively, in accordance with the terms contained in
14 ** a written agreement between you and The Qt Company. For licensing terms
15 ** and conditions see https://www.qt.io/terms-conditions. For further
16 ** information use the contact form at https://www.qt.io/contact-us.
17 **
18 ** GNU Lesser General Public License Usage
19 ** Alternatively, this file may be used under the terms of the GNU Lesser
20 ** General Public License version 3 as published by the Free Software
21 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
22 ** packaging of this file. Please review the following information to
23 ** ensure the GNU Lesser General Public License version 3 requirements
24 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25 **
26 ** GNU General Public License Usage
27 ** Alternatively, this file may be used under the terms of the GNU
28 ** General Public License version 2.0 or (at your option) the GNU General
29 ** Public license version 3 or any later version approved by the KDE Free
30 ** Qt Foundation. The licenses are as published by the Free Software
31 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32 ** included in the packaging of this file. Please review the following
33 ** information to ensure the GNU General Public License requirements will
34 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35 ** https://www.gnu.org/licenses/gpl-3.0.html.
36 **
37 ** $QT_END_LICENSE$
38 **
39 ****************************************************************************/
40 
41 #include "qplatformdefs.h"
42 
43 #include "qtextcodec.h"
44 #include "qtextcodec_p.h"
45 
46 #include "qbytearraymatcher.h"
47 #include "qendian.h"
48 #include "qfile.h"
49 #include "qlist.h"
50 #include <private/qlocking_p.h>
51 #include "qstringlist.h"
52 #include "qvarlengtharray.h"
53 #if !defined(QT_BOOTSTRAPPED)
54 #include <private/qcoreapplication_p.h>
55 #endif
56 #include "private/qcoreglobaldata_p.h"
57 
58 #include "qutfcodec_p.h"
59 #include "qlatincodec_p.h"
60 
61 #if !defined(QT_BOOTSTRAPPED)
62 #if QT_CONFIG(codecs)
63 #  include "qtsciicodec_p.h"
64 #  include "qisciicodec_p.h"
65 #endif
66 #if QT_CONFIG(icu)
67 #include "qicucodec_p.h"
68 #else
69 #if QT_CONFIG(iconv)
70 #  include "qiconvcodec_p.h"
71 #endif
72 #ifdef Q_OS_WIN
73 #  include "qwindowscodec_p.h"
74 #endif
75 #  include "qsimplecodec_p.h"
76 #if QT_CONFIG(big_codecs)
77 #  ifndef Q_OS_INTEGRITY
78 #    include "qgb18030codec_p.h"
79 #    include "qeucjpcodec_p.h"
80 #    include "qjiscodec_p.h"
81 #    include "qsjiscodec_p.h"
82 #    include "qeuckrcodec_p.h"
83 #    include "qbig5codec_p.h"
84 #  endif // !Q_OS_INTEGRITY
85 #endif // big_codecs
86 
87 #endif // icu
88 #endif // QT_BOOTSTRAPPED
89 
90 #include <mutex>
91 
92 #include <stdlib.h>
93 #include <ctype.h>
94 #include <locale.h>
95 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
96 # include <langinfo.h>
97 #endif
98 
99 QT_BEGIN_NAMESPACE
100 
101 typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
102 typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
103 
104 Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
105 
106 class TextCodecsMutexLocker
107 {
108     using Lock = decltype(qt_unique_lock(std::declval<QRecursiveMutex&>()));
109     // ### FIXME: this is used when textCodecsMutex already == nullptr
110     const Lock lock = qt_unique_lock(textCodecsMutex());
111 public:
TextCodecsMutexLocker()112     TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
113 };
114 
115 #if !QT_CONFIG(icu)
// ASCII-only lowercasing: 'A'..'Z' map to 'a'..'z', every other
// character is returned unchanged (no locale involved).
static char qtolower(char c)
{
    const bool isUpperAscii = (c >= 'A' && c <= 'Z');
    return isUpperAscii ? c + 0x20 : c;
}
// ASCII-only test for a decimal digit or a letter of either case.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z'.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
120 
qTextCodecNameMatch(const char * n,const char * h)121 bool qTextCodecNameMatch(const char *n, const char *h)
122 {
123     if (qstricmp(n, h) == 0)
124         return true;
125 
126     // if the letters and numbers are the same, we have a match
127     while (*n != '\0') {
128         if (qisalnum(*n)) {
129             for (;;) {
130                 if (*h == '\0')
131                     return false;
132                 if (qisalnum(*h))
133                     break;
134                 ++h;
135             }
136             if (qtolower(*n) != qtolower(*h))
137                 return false;
138             ++h;
139         }
140         ++n;
141     }
142     while (*h && !qisalnum(*h))
143            ++h;
144     return (*h == '\0');
145 }
146 
147 
148 #if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
checkForCodec(const QByteArray & name)149 static QTextCodec *checkForCodec(const QByteArray &name) {
150     QTextCodec *c = QTextCodec::codecForName(name);
151     if (!c) {
152         const int index = name.indexOf('@');
153         if (index != -1) {
154             c = QTextCodec::codecForName(name.left(index));
155         }
156     }
157     return c;
158 }
159 #endif
160 
161 static void setup();
162 
163 // \threadsafe
164 // this returns the codec the method sets up as locale codec to
165 // avoid a race condition in codecForLocale() when
166 // setCodecForLocale(0) is called at the same time.
setupLocaleMapper()167 static QTextCodec *setupLocaleMapper()
168 {
169     QCoreGlobalData *globalData = QCoreGlobalData::instance();
170 
171     QTextCodec *locale = nullptr;
172 
173     {
174         const TextCodecsMutexLocker locker;
175         if (globalData->allCodecs.isEmpty())
176             setup();
177     }
178 
179 #if !defined(QT_BOOTSTRAPPED)
180     QCoreApplicationPrivate::initLocale();
181 #endif
182 
183 #if defined(QT_LOCALE_IS_UTF8)
184     locale = QTextCodec::codecForName("UTF-8");
185 #elif defined(Q_OS_WIN)
186     locale = QTextCodec::codecForName("System");
187 #else
188 
189     // First try getting the codecs name from nl_langinfo and see
190     // if we have a builtin codec for it.
191     // Only fall back to using iconv if we can't find a builtin codec
192     // This is because the builtin utf8 codec is around 5 times faster
193     // then the using QIconvCodec
194 
195 #if defined (_XOPEN_UNIX)
196     char *charset = nl_langinfo(CODESET);
197     if (charset)
198         locale = QTextCodec::codecForName(charset);
199 #endif
200 #if QT_CONFIG(iconv)
201     if (!locale) {
202         // no builtin codec for the locale found, let's try using iconv
203         (void) new QIconvCodec();
204         locale = QTextCodec::codecForName("System");
205     }
206 #endif
207 
208     if (!locale) {
209         // Very poorly defined and followed standards causes lots of
210         // code to try to get all the cases... This logic is
211         // duplicated in QIconvCodec, so if you change it here, change
212         // it there too.
213 
214         // Try to determine locale codeset from locale name assigned to
215         // LC_CTYPE category.
216 
217         // First part is getting that locale name.  First try setlocale() which
218         // definitely knows it, but since we cannot fully trust it, get ready
219         // to fall back to environment variables.
220         const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
221 
222         // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
223         // environment variables.
224         QByteArray lang = qgetenv("LC_ALL");
225         if (lang.isEmpty() || lang == "C") {
226             lang = qgetenv("LC_CTYPE");
227         }
228         if (lang.isEmpty() || lang == "C") {
229             lang = qgetenv("LANG");
230         }
231 
232         // Now try these in order:
233         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
234         // 2. CODESET from lang if it contains a .CODESET part
235         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
236         // 4. locale (ditto)
237         // 5. check for "@euro"
238         // 6. guess locale from ctype unless ctype is "C"
239         // 7. guess locale from lang
240 
241         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
242         int indexOfDot = ctype.indexOf('.');
243         if (indexOfDot != -1)
244             locale = checkForCodec( ctype.mid(indexOfDot + 1) );
245 
246         // 2. CODESET from lang if it contains a .CODESET part
247         if (!locale) {
248             indexOfDot = lang.indexOf('.');
249             if (indexOfDot != -1)
250                 locale = checkForCodec( lang.mid(indexOfDot + 1) );
251         }
252 
253         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
254         if (!locale && !ctype.isEmpty() && ctype != "C")
255             locale = checkForCodec(ctype);
256 
257         // 4. locale (ditto)
258         if (!locale && !lang.isEmpty())
259             locale = checkForCodec(lang);
260 
261         // 5. "@euro"
262         if ((!locale && ctype.contains("@euro")) || lang.contains("@euro"))
263             locale = checkForCodec("ISO 8859-15");
264     }
265 
266 #endif
267     // If everything failed, we default to 8859-1
268     if (!locale)
269         locale = QTextCodec::codecForName("ISO 8859-1");
270     globalData->codecForLocale.storeRelease(locale);
271     return locale;
272 }
273 
274 
275 // textCodecsMutex need to be locked to enter this function
setup()276 static void setup()
277 {
278     static bool initialized = false;
279     if (initialized)
280         return;
281     initialized = true;
282 
283 #if QT_CONFIG(codecs) && !defined(QT_BOOTSTRAPPED)
284     (void)new QTsciiCodec;
285     for (int i = 0; i < 9; ++i)
286         (void)new QIsciiCodec(i);
287     for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
288         (void)new QSimpleTextCodec(i);
289 
290 #  if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
291     (void)new QGb18030Codec;
292     (void)new QGbkCodec;
293     (void)new QGb2312Codec;
294     (void)new QEucJpCodec;
295     (void)new QJisCodec;
296     (void)new QSjisCodec;
297     (void)new QEucKrCodec;
298     (void)new QCP949Codec;
299     (void)new QBig5Codec;
300     (void)new QBig5hkscsCodec;
301 #  endif // big_codecs && !Q_OS_INTEGRITY
302 #if QT_CONFIG(iconv)
303     (void) new QIconvCodec;
304 #endif
305 #if defined(Q_OS_WIN32)
306     (void) new QWindowsLocalCodec;
307 #endif // Q_OS_WIN32
308 #endif // codecs && !QT_BOOTSTRAPPED
309 
310     (void)new QUtf16Codec;
311     (void)new QUtf16BECodec;
312     (void)new QUtf16LECodec;
313     (void)new QUtf32Codec;
314     (void)new QUtf32BECodec;
315     (void)new QUtf32LECodec;
316     (void)new QLatin15Codec;
317     (void)new QLatin1Codec;
318     (void)new QUtf8Codec;
319 }
320 #else
// ICU build: no built-in codecs are created up front, so this is a no-op.
static void setup()
{
}
322 #endif // icu
323 
324 /*!
325     \enum QTextCodec::ConversionFlag
326 
327     \value DefaultConversion  No flag is set.
328     \value ConvertInvalidToNull  If this flag is set, each invalid input
329                                  character is output as a null character.
330     \value IgnoreHeader  Ignore any Unicode byte-order mark and don't generate any.
331 
332     \omitvalue FreeFunction
333 */
334 
335 /*!
336     \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
337 
338     Constructs a ConverterState object initialized with the given \a flags.
339 */
340 
341 /*!
342     Destroys the ConverterState object.
343 */
~ConverterState()344 QTextCodec::ConverterState::~ConverterState()
345 {
346     if (flags & FreeFunction)
347         (QTextCodecUnalignedPointer::decode(state_data))(this);
348     else if (d)
349         free(d);
350 }
351 
352 /*!
353     \class QTextCodec
354     \inmodule QtCore
355     \brief The QTextCodec class provides conversions between text encodings.
356     \reentrant
357     \ingroup i18n
358 
359     Qt uses Unicode to store, draw and manipulate strings. In many
360     situations you may wish to deal with data that uses a different
361     encoding. For example, most Japanese documents are still stored
362     in Shift-JIS or ISO 2022-JP, while Russian users often have their
363     documents in KOI8-R or Windows-1251.
364 
365     Qt provides a set of QTextCodec classes to help with converting
366     non-Unicode formats to and from Unicode. You can also create your
367     own codec classes.
368 
369     The supported encodings are:
370 
371     \list
372     \li \l{Big5 Text Codec}{Big5}
373     \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
374     \li CP949
375     \li \l{EUC-JP Text Codec}{EUC-JP}
376     \li \l{EUC-KR Text Codec}{EUC-KR}
377     \li \l{GBK Text Codec}{GB18030}
378     \li HP-ROMAN8
379     \li IBM 850
380     \li IBM 866
381     \li IBM 874
382     \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
383     \li ISO 8859-1 to 10
384     \li ISO 8859-13 to 16
385     \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
386     \li KOI8-R
387     \li KOI8-U
388     \li Macintosh
389     \li \l{Shift-JIS Text Codec}{Shift-JIS}
390     \li TIS-620
391     \li \l{TSCII Text Codec}{TSCII}
392     \li UTF-8
393     \li UTF-16
394     \li UTF-16BE
395     \li UTF-16LE
396     \li UTF-32
397     \li UTF-32BE
398     \li UTF-32LE
399     \li Windows-1250 to 1258
400     \endlist
401 
402     If Qt is compiled with ICU support enabled, most codecs supported by
403     ICU will also be available to the application.
404 
405     \l {QTextCodec}s can be used as follows to convert some locally encoded
406     string to Unicode. Suppose you have some string encoded in Russian
407     KOI8-R encoding, and want to convert it to Unicode. The simple way
408     to do it is like this:
409 
410     \snippet code/src_corelib_codecs_qtextcodec.cpp 0
411 
412     After this, \c string holds the text converted to Unicode.
413     Converting a string from Unicode to the local encoding is just as
414     easy:
415 
416     \snippet code/src_corelib_codecs_qtextcodec.cpp 1
417 
418     To read or write files in various encodings, use QTextStream and
419     its \l{QTextStream::setCodec()}{setCodec()} function. See the
420     \l{tools/codecs}{Codecs} example for an application of QTextCodec
421     to file I/O.
422 
423     Some care must be taken when trying to convert the data in chunks,
424     for example, when receiving it over a network. In such cases it is
425     possible that a multi-byte character will be split over two
426     chunks. At best this might result in the loss of a character and
427     at worst cause the entire conversion to fail.
428 
429     The approach to use in these situations is to create a QTextDecoder
430     object for the codec and use this QTextDecoder for the whole
431     decoding process, as shown below:
432 
433     \snippet code/src_corelib_codecs_qtextcodec.cpp 2
434 
435     The QTextDecoder object maintains state between chunks and therefore
436     works correctly even if a multi-byte character is split between
437     chunks.
438 
439     \section1 Creating Your Own Codec Class
440 
441     Support for new text encodings can be added to Qt by creating
442     QTextCodec subclasses.
443 
444     The pure virtual functions describe the encoder to the system and
445     the coder is used as required in the different text file formats
446     supported by QTextStream, and under X11, for the locale-specific
447     character input and output.
448 
449     To add support for another encoding to Qt, make a subclass of
450     QTextCodec and implement the functions listed in the table below.
451 
452     \table
453     \header \li Function \li Description
454 
455     \row \li name()
456          \li Returns the official name for the encoding. If the
457             encoding is listed in the
458             \l{IANA character-sets encoding file}, the name
459             should be the preferred MIME name for the encoding.
460 
461     \row \li aliases()
462          \li Returns a list of alternative names for the encoding.
463             QTextCodec provides a default implementation that returns
464             an empty list. For example, "ISO-8859-1" has "latin1",
465             "CP819", "IBM819", and "iso-ir-100" as aliases.
466 
467     \row \li \l{QTextCodec::mibEnum()}{mibEnum()}
468          \li Return the MIB enum for the encoding if it is listed in
469             the \l{IANA character-sets encoding file}.
470 
471     \row \li convertToUnicode()
472          \li Converts an 8-bit character string to Unicode.
473 
474     \row \li convertFromUnicode()
475          \li Converts a Unicode string to an 8-bit character string.
476     \endtable
477 
478     \sa QTextStream, QTextDecoder, QTextEncoder, {Text Codecs Example}
479 */
480 
481 /*!
482     Constructs a QTextCodec, and gives it the highest precedence. The
483     QTextCodec should always be constructed on the heap (i.e. with \c
484     new). Qt takes ownership and will delete it when the application
485     terminates.
486 */
QTextCodec()487 QTextCodec::QTextCodec()
488 {
489     const TextCodecsMutexLocker locker;
490 
491     QCoreGlobalData *globalInstance = QCoreGlobalData::instance();
492     if (globalInstance->allCodecs.isEmpty())
493         setup();
494 
495     globalInstance->allCodecs.prepend(this);
496 }
497 
498 
499 /*!
500     \nonreentrant
501 
502     Destroys the QTextCodec. Note that you should not delete codecs
503     yourself: once created they become Qt's responsibility.
504 */
~QTextCodec()505 QTextCodec::~QTextCodec()
506 {
507     QCoreGlobalData *globalData = QCoreGlobalData::instance();
508     if (!globalData)
509         return;
510 
511     globalData->codecForLocale.testAndSetRelaxed(this, nullptr);
512 
513     const TextCodecsMutexLocker locker;
514 
515     globalData->allCodecs.removeOne(this);
516 
517     auto it = globalData->codecCache.begin();
518 
519     while (it != globalData->codecCache.end()) {
520         if (it.value() == this)
521             it = globalData->codecCache.erase(it);
522         else
523             ++it;
524     }
525 }
526 
527 /*!
528     \fn QTextCodec *QTextCodec::codecForName(const char *name)
529 
530     Searches all installed QTextCodec objects and returns the one
531     which best matches \a name; the match is case-insensitive. Returns
532     0 if no codec matching the name \a name could be found.
533 */
534 
535 /*!
536     \threadsafe
537     Searches all installed QTextCodec objects and returns the one
538     which best matches \a name; the match is case-insensitive. Returns
539     0 if no codec matching the name \a name could be found.
540 */
codecForName(const QByteArray & name)541 QTextCodec *QTextCodec::codecForName(const QByteArray &name)
542 {
543     if (name.isEmpty())
544         return nullptr;
545 
546     const TextCodecsMutexLocker locker;
547 
548     QCoreGlobalData *globalData = QCoreGlobalData::instance();
549     if (!globalData)
550         return nullptr;
551     setup();
552 
553 #if !QT_CONFIG(icu)
554     QTextCodecCache *cache = &globalData->codecCache;
555     QTextCodec *codec;
556     codec = cache->value(name);
557     if (codec)
558         return codec;
559 
560     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
561         QTextCodec *cursor = *it;
562         if (qTextCodecNameMatch(cursor->name(), name)) {
563             if (cache)
564                 cache->insert(name, cursor);
565             return cursor;
566         }
567         QList<QByteArray> aliases = cursor->aliases();
568         for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
569             if (qTextCodecNameMatch(*ait, name)) {
570                 cache->insert(name, cursor);
571                 return cursor;
572             }
573         }
574     }
575 
576     return nullptr;
577 #else
578     return QIcuCodec::codecForNameUnlocked(name);
579 #endif
580 }
581 
582 
583 /*!
584     \threadsafe
585     Returns the QTextCodec which matches the
586     \l{QTextCodec::mibEnum()}{MIBenum} \a mib.
587 */
codecForMib(int mib)588 QTextCodec* QTextCodec::codecForMib(int mib)
589 {
590     const TextCodecsMutexLocker locker;
591 
592     QCoreGlobalData *globalData = QCoreGlobalData::instance();
593     if (!globalData)
594         return nullptr;
595     if (globalData->allCodecs.isEmpty())
596         setup();
597 
598     QByteArray key = "MIB: " + QByteArray::number(mib);
599 
600     QTextCodecCache *cache = &globalData->codecCache;
601     QTextCodec *codec;
602     if (cache) {
603         codec = cache->value(key);
604         if (codec)
605             return codec;
606     }
607 
608     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
609         QTextCodec *cursor = *it;
610         if (cursor->mibEnum() == mib) {
611             if (cache)
612                 cache->insert(key, cursor);
613             return cursor;
614         }
615     }
616 
617 #if QT_CONFIG(icu)
618     return QIcuCodec::codecForMibUnlocked(mib);
619 #else
620     return nullptr;
621 #endif
622 }
623 
624 /*!
625     \threadsafe
626     Returns the list of all available codecs, by name. Call
627     QTextCodec::codecForName() to obtain the QTextCodec for the name.
628 
629     The list may contain many mentions of the same codec
630     if the codec has aliases.
631 
632     \sa availableMibs(), name(), aliases()
633 */
availableCodecs()634 QList<QByteArray> QTextCodec::availableCodecs()
635 {
636     const TextCodecsMutexLocker locker;
637 
638     QCoreGlobalData *globalData = QCoreGlobalData::instance();
639     if (globalData->allCodecs.isEmpty())
640         setup();
641 
642     QList<QByteArray> codecs;
643 
644     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
645         codecs += (*it)->name();
646         codecs += (*it)->aliases();
647     }
648 
649 #if QT_CONFIG(icu)
650     codecs += QIcuCodec::availableCodecs();
651 #endif
652 
653     return codecs;
654 }
655 
656 /*!
657     \threadsafe
658     Returns the list of MIBs for all available codecs. Call
659     QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
660 
661     \sa availableCodecs(), mibEnum()
662 */
availableMibs()663 QList<int> QTextCodec::availableMibs()
664 {
665 #if QT_CONFIG(icu)
666     return QIcuCodec::availableMibs();
667 #else
668     const TextCodecsMutexLocker locker;
669 
670     QCoreGlobalData *globalData = QCoreGlobalData::instance();
671     if (globalData->allCodecs.isEmpty())
672         setup();
673 
674     QList<int> codecs;
675 
676     for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
677         codecs += (*it)->mibEnum();
678 
679     return codecs;
680 #endif
681 }
682 
683 /*!
684     \nonreentrant
685 
686     Set the codec to \a c; this will be returned by
687     codecForLocale(). If \a c is \nullptr, the codec is reset to
688     the default.
689 
690     This might be needed for some applications that want to use their
691     own mechanism for setting the locale.
692 
693     \sa codecForLocale()
694 */
setCodecForLocale(QTextCodec * c)695 void QTextCodec::setCodecForLocale(QTextCodec *c)
696 {
697     QCoreGlobalData::instance()->codecForLocale.storeRelease(c);
698 }
699 
700 /*!
701     \threadsafe
702     Returns a pointer to the codec most suitable for this locale.
703 
704     The codec will be retrieved from ICU where that backend is in use, otherwise
705     it may be obtained from an OS-specific API.  In the latter case, the codec's
706     name may be "System".
707 */
708 
codecForLocale()709 QTextCodec* QTextCodec::codecForLocale()
710 {
711     QCoreGlobalData *globalData = QCoreGlobalData::instance();
712     if (!globalData)
713         return nullptr;
714 
715     QTextCodec *codec = globalData->codecForLocale.loadAcquire();
716     if (!codec) {
717 #if QT_CONFIG(icu)
718         const TextCodecsMutexLocker locker;
719         codec = QIcuCodec::defaultCodecUnlocked();
720 #else
721         // setupLocaleMapper locks as necessary
722         codec = setupLocaleMapper();
723 #endif
724     }
725 
726     return codec;
727 }
728 
729 
730 /*!
731     \fn QByteArray QTextCodec::name() const
732 
733     QTextCodec subclasses must reimplement this function. It returns
734     the name of the encoding supported by the subclass.
735 
736     If the codec is registered as a character set in the
737     \l{IANA character-sets encoding file} this method should
738     return the preferred mime name for the codec if defined,
739     otherwise its name.
740 */
741 
742 /*!
743     \fn int QTextCodec::mibEnum() const
744 
745     Subclasses of QTextCodec must reimplement this function. It
746     returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
747     for more information). It is important that each QTextCodec
748     subclass returns the correct unique value for this function.
749 */
750 
751 /*!
752   Subclasses can return a number of aliases for the codec in question.
753 
754   Standard aliases for codecs can be found in the
755   \l{IANA character-sets encoding file}.
756 */
aliases() const757 QList<QByteArray> QTextCodec::aliases() const
758 {
759     return QList<QByteArray>();
760 }
761 
762 /*!
763     \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
764                                              ConverterState *state) const
765 
766     QTextCodec subclasses must reimplement this function.
767 
768     Converts the first \a len characters of \a chars from the
769     encoding of the subclass to Unicode, and returns the result in a
770     QString.
771 
772     \a state can be \nullptr, in which case the conversion is stateless and
773     default conversion rules should be used. If state is not 0, the
774     codec should save the state after the conversion in \a state, and
775     adjust the \c remainingChars and \c invalidChars members of the struct.
776 */
777 
778 /*!
779     \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
780                                                   ConverterState *state) const
781 
782     QTextCodec subclasses must reimplement this function.
783 
784     Converts the first \a number of characters from the \a input array
785     from Unicode to the encoding of the subclass, and returns the result
786     in a QByteArray.
787 
788     \a state can be \nullptr in which case the conversion is stateless and
789     default conversion rules should be used. If state is not 0, the
790     codec should save the state after the conversion in \a state, and
791     adjust the \c remainingChars and \c invalidChars members of the struct.
792 */
793 
794 /*!
795     Creates a QTextDecoder with a specified \a flags to decode chunks
796     of \c{char *} data to create chunks of Unicode data.
797 
798     The caller is responsible for deleting the returned object.
799 
800     \since 4.7
801 */
makeDecoder(QTextCodec::ConversionFlags flags) const802 QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
803 {
804     return new QTextDecoder(this, flags);
805 }
806 
807 /*!
808     Creates a QTextEncoder with a specified \a flags to encode chunks
809     of Unicode data as \c{char *} data.
810 
811     The caller is responsible for deleting the returned object.
812 
813     \since 4.7
814 */
makeEncoder(QTextCodec::ConversionFlags flags) const815 QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
816 {
817     return new QTextEncoder(this, flags);
818 }
819 
820 /*!
821     \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
822                                            ConverterState *state) const
823 
824     Converts the first \a number of characters from the \a input array
825     from Unicode to the encoding of this codec, and returns the result
826     in a QByteArray.
827 
828     The \a state of the convertor used is updated.
829 */
830 
831 #if QT_STRINGVIEW_LEVEL < 2
832 /*!
833     Converts \a str from Unicode to the encoding of this codec, and
834     returns the result in a QByteArray.
835 */
fromUnicode(const QString & str) const836 QByteArray QTextCodec::fromUnicode(const QString& str) const
837 {
838     return convertFromUnicode(str.constData(), str.length(), nullptr);
839 }
840 #endif
841 
842 /*!
843     \overload
844     \since 5.10
845 
846     Converts \a str from Unicode to the encoding of this codec, and
847     returns the result in a QByteArray.
848 */
fromUnicode(QStringView str) const849 QByteArray QTextCodec::fromUnicode(QStringView str) const
850 {
851     return convertFromUnicode(str.data(), str.length(), nullptr);
852 }
853 
854 /*!
855     \fn QString QTextCodec::toUnicode(const char *input, int size,
856                                       ConverterState *state) const
857 
858     Converts the first \a size characters from the \a input from the
859     encoding of this codec to Unicode, and returns the result in a
860     QString.
861 
862     The \a state of the convertor used is updated.
863 */
864 
865 /*!
866     Converts \a a from the encoding of this codec to Unicode, and
867     returns the result in a QString.
868 */
toUnicode(const QByteArray & a) const869 QString QTextCodec::toUnicode(const QByteArray& a) const
870 {
871     return convertToUnicode(a.constData(), a.length(), nullptr);
872 }
873 
874 /*!
875     Returns \c true if the Unicode character \a ch can be fully encoded
876     with this codec; otherwise returns \c false.
877 */
canEncode(QChar ch) const878 bool QTextCodec::canEncode(QChar ch) const
879 {
880     ConverterState state;
881     state.flags = ConvertInvalidToNull;
882     convertFromUnicode(&ch, 1, &state);
883     return (state.invalidChars == 0);
884 }
885 
886 #if QT_STRINGVIEW_LEVEL < 2
887 /*!
888     \overload
889 
890     \a s contains the string being tested for encode-ability.
891 */
canEncode(const QString & s) const892 bool QTextCodec::canEncode(const QString& s) const
893 {
894     ConverterState state;
895     state.flags = ConvertInvalidToNull;
896     convertFromUnicode(s.constData(), s.length(), &state);
897     return (state.invalidChars == 0);
898 }
899 #endif
900 
901 /*!
902     \overload
903     \since 5.10
904 
905     Returns \c true if the Unicode string \a s can be fully encoded
906     with this codec; otherwise returns \c false.
907 */
canEncode(QStringView s) const908 bool QTextCodec::canEncode(QStringView s) const
909 {
910     ConverterState state;
911     state.flags = ConvertInvalidToNull;
912     convertFromUnicode(s.data(), s.length(), &state);
913     return !state.invalidChars;
914 }
915 /*!
916     \overload
917 
918     \a chars contains the source characters.
919 */
toUnicode(const char * chars) const920 QString QTextCodec::toUnicode(const char *chars) const
921 {
922     int len = qstrlen(chars);
923     return convertToUnicode(chars, len, nullptr);
924 }
925 
926 
927 /*!
928     \class QTextEncoder
929     \inmodule QtCore
930     \brief The QTextEncoder class provides a state-based encoder.
931     \reentrant
932     \ingroup i18n
933 
934     A text encoder converts text from Unicode into an encoded text format
935     using a specific codec.
936 
937     The encoder converts Unicode into another format, remembering any
938     state that is required between calls.
939 
940     \sa QTextCodec::makeEncoder(), QTextDecoder
941 */
942 
943 /*!
944     \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
945 
946     Constructs a text encoder for the given \a codec.
947 */
948 
949 /*!
950     Constructs a text encoder for the given \a codec and conversion \a flags.
951 
952     \since 4.7
953 */
QTextEncoder(const QTextCodec * codec,QTextCodec::ConversionFlags flags)954 QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
955     : c(codec), state()
956 {
957     state.flags = flags;
958 }
959 
960 /*!
961     Destroys the encoder.
962 */
~QTextEncoder()963 QTextEncoder::~QTextEncoder()
964 {
965 }
966 
967 /*!
968     \internal
969     \since 4.5
970     Determines whether the eecoder encountered a failure while decoding the input. If
971     an error was encountered, the produced result is undefined, and gets converted as according
972     to the conversion flags.
973  */
hasFailure() const974 bool QTextEncoder::hasFailure() const
975 {
976     return state.invalidChars != 0;
977 }
978 
979 #if QT_STRINGVIEW_LEVEL < 2
980 /*!
981     Converts the Unicode string \a str into an encoded QByteArray.
982 */
fromUnicode(const QString & str)983 QByteArray QTextEncoder::fromUnicode(const QString& str)
984 {
985     QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
986     return result;
987 }
988 #endif
989 
990 /*!
991     \overload
992     \since 5.10
993     Converts the Unicode string \a str into an encoded QByteArray.
994 */
fromUnicode(QStringView str)995 QByteArray QTextEncoder::fromUnicode(QStringView str)
996 {
997     return c->fromUnicode(str.data(), str.length(), &state);
998 }
999 
1000 /*!
1001     \overload
1002 
1003     Converts \a len characters (not bytes) from \a uc, and returns the
1004     result in a QByteArray.
1005 */
fromUnicode(const QChar * uc,int len)1006 QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1007 {
1008     QByteArray result = c->fromUnicode(uc, len, &state);
1009     return result;
1010 }
1011 
1012 /*!
1013     \class QTextDecoder
1014     \inmodule QtCore
1015     \brief The QTextDecoder class provides a state-based decoder.
1016     \reentrant
1017     \ingroup i18n
1018 
1019     A text decoder converts text from an encoded text format into Unicode
1020     using a specific codec.
1021 
1022     The decoder converts text in this format into Unicode, remembering any
1023     state that is required between calls.
1024 
1025     \sa QTextCodec::makeDecoder(), QTextEncoder
1026 */
1027 
1028 /*!
1029     \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1030 
1031     Constructs a text decoder for the given \a codec.
1032 */
1033 
1034 /*!
1035     Constructs a text decoder for the given \a codec and conversion \a flags.
1036 
1037     \since 4.7
1038 */
1039 
QTextDecoder(const QTextCodec * codec,QTextCodec::ConversionFlags flags)1040 QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1041     : c(codec), state()
1042 {
1043     state.flags = flags;
1044 }
1045 
1046 /*!
1047     Destroys the decoder.
1048 */
~QTextDecoder()1049 QTextDecoder::~QTextDecoder()
1050 {
1051 }
1052 
1053 /*!
1054     \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1055 
1056     Converts the first \a len bytes in \a chars to Unicode, returning
1057     the result.
1058 
1059     If not all characters are used (e.g. if only part of a multi-byte
1060     encoding is at the end of the characters), the decoder remembers
1061     enough state to continue with the next call to this function.
1062 */
toUnicode(const char * chars,int len)1063 QString QTextDecoder::toUnicode(const char *chars, int len)
1064 {
1065     return c->toUnicode(chars, len, &state);
1066 }
1067 
1068 // in qstring.cpp:
1069 void qt_from_latin1(ushort *dst, const char *str, size_t size) noexcept;
1070 
1071 /*! \overload
1072 
1073     The converted string is returned in \a target.
1074  */
toUnicode(QString * target,const char * chars,int len)1075 void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1076 {
1077     Q_ASSERT(target);
1078     switch (c->mibEnum()) {
1079     case 106: // utf8
1080         static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1081         break;
1082     case 4: // latin1
1083         target->resize(len);
1084         qt_from_latin1((ushort*)target->data(), chars, len);
1085         break;
1086     default:
1087         *target = c->toUnicode(chars, len, &state);
1088     }
1089 }
1090 
1091 
1092 /*!
1093     \overload
1094 
1095     Converts the bytes in the byte array specified by \a ba to Unicode
1096     and returns the result.
1097 */
toUnicode(const QByteArray & ba)1098 QString QTextDecoder::toUnicode(const QByteArray &ba)
1099 {
1100     return c->toUnicode(ba.constData(), ba.length(), &state);
1101 }
1102 
1103 /*!
1104     \since 4.4
1105 
1106     Tries to detect the encoding of the provided snippet of HTML in
1107     the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1108     and the content-type meta header and returns a QTextCodec instance
1109     that is capable of decoding the html to unicode.  If the codec
1110     cannot be detected from the content provided, \a defaultCodec is
1111     returned.
1112 
1113     \sa codecForUtfText()
1114 */
codecForHtml(const QByteArray & ba,QTextCodec * defaultCodec)1115 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1116 {
1117     // determine charset
1118     QTextCodec *c = QTextCodec::codecForUtfText(ba, nullptr);
1119     if (!c) {
1120         static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher("meta ");
1121         QByteArray header = ba.left(1024).toLower();
1122         int pos = matcher.indexIn(header);
1123         if (pos != -1) {
1124             static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher("charset=");
1125             pos = matcher.indexIn(header, pos);
1126             if (pos != -1) {
1127                 pos += qstrlen("charset=");
1128 
1129                 int pos2 = pos;
1130                 // The attribute can be closed with either """, "'", ">" or "/",
1131                 // none of which are valid charset characters.
1132                 while (++pos2 < header.size()) {
1133                     char ch = header.at(pos2);
1134                     if (ch == '\"' || ch == '\'' || ch == '>') {
1135                         QByteArray name = header.mid(pos, pos2 - pos);
1136                         if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1137                             name = QByteArrayLiteral("UTF-8");
1138                         c = QTextCodec::codecForName(name);
1139                         return c ? c : defaultCodec;
1140                     }
1141                 }
1142             }
1143         }
1144     }
1145     if (!c)
1146         c = defaultCodec;
1147 
1148     return c;
1149 }
1150 
1151 /*!
1152     \overload
1153 
1154     Tries to detect the encoding of the provided snippet of HTML in
1155     the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1156     and the content-type meta header and returns a QTextCodec instance
1157     that is capable of decoding the html to unicode. If the codec cannot
1158     be detected, this overload returns a Latin-1 QTextCodec.
1159 */
codecForHtml(const QByteArray & ba)1160 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1161 {
1162     return codecForHtml(ba, QTextCodec::codecForName("ISO-8859-1"));
1163 }
1164 
1165 /*!
1166     \since 4.6
1167 
1168     Tries to detect the encoding of the provided snippet \a ba by
1169     using the BOM (Byte Order Mark) and returns a QTextCodec instance
1170     that is capable of decoding the text to unicode. This function can
1171     detect one of the following codecs:
1172 
1173     \list
1174       \li UTF-32 Little Endian
1175       \li UTF-32 Big Endian
1176       \li UTF-16 Little Endian
1177       \li UTF-16 Big Endian
1178       \li UTF-8
1179     \endlist
1180 
1181     If the codec cannot be detected from the content provided, \a defaultCodec
1182     is returned.
1183 
1184     \sa codecForHtml()
1185 */
codecForUtfText(const QByteArray & ba,QTextCodec * defaultCodec)1186 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1187 {
1188     const int arraySize = ba.size();
1189     const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
1190     const uint bom = 0xfeff;
1191 
1192     if (arraySize > 3) {
1193         uint uc = qFromUnaligned<uint>(buf);
1194         if (uc == qToBigEndian(bom))
1195             return QTextCodec::codecForMib(1018); // utf-32 be
1196         else if (uc == qToLittleEndian(bom))
1197             return QTextCodec::codecForMib(1019); // utf-32 le
1198     }
1199 
1200     if (arraySize < 2)
1201         return defaultCodec;
1202 
1203     ushort uc = qFromUnaligned<ushort>(buf);
1204     if (uc == qToBigEndian(ushort(bom)))
1205         return QTextCodec::codecForMib(1013); // utf16 be
1206     else if (uc == qToLittleEndian(ushort(bom)))
1207         return QTextCodec::codecForMib(1014); // utf16 le
1208 
1209     if (arraySize < 3)
1210         return defaultCodec;
1211 
1212     static const char utf8bom[] = "\xef\xbb\xbf";
1213     if (memcmp(buf, utf8bom, sizeof(utf8bom) - 1) == 0)
1214         return QTextCodec::codecForMib(106); // utf-8
1215 
1216     return defaultCodec;
1217 }
1218 
1219 /*!
1220     \overload
1221 
1222     Tries to detect the encoding of the provided snippet \a ba by
1223     using the BOM (Byte Order Mark) and returns a QTextCodec instance
1224     that is capable of decoding the text to unicode. This function can
1225     detect one of the following codecs:
1226 
1227     \list
1228       \li UTF-32 Little Endian
1229       \li UTF-32 Big Endian
1230       \li UTF-16 Little Endian
1231       \li UTF-16 Big Endian
1232       \li UTF-8
1233     \endlist
1234 
1235     If the codec cannot be detected from the content provided, this overload
1236     returns a Latin-1 QTextCodec.
1237 
1238     \sa codecForHtml()
1239 */
codecForUtfText(const QByteArray & ba)1240 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1241 {
1242     return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1243 }
1244 
1245 /*!
1246     \fn QTextCodec * QTextCodec::codecForTr ()
1247     \obsolete
1248 
1249     Returns the codec used by QObject::tr() on its argument. If this
1250     function returns \nullptr (the default), tr() assumes Latin-1.
1251 */
1252 
1253 /*!
1254     \internal
1255     \since 4.3
1256     Determines whether the decoder encountered a failure while decoding the
1257     input. If an error was encountered, the produced result is undefined, and
1258     gets converted as according to the conversion flags.
1259  */
hasFailure() const1260 bool QTextDecoder::hasFailure() const
1261 {
1262     return state.invalidChars != 0;
1263 }
1264 
1265 /*!
1266     \internal
1267     \since 5.12
1268 
1269     Determines whether the decoder needs more bytes to continue decoding. That
1270     is, this signifies that the input string ended in the middle of a
1271     multi-byte sequence. Note that it's possible some codecs do not report this.
1272  */
needsMoreData() const1273 bool QTextDecoder::needsMoreData() const
1274 {
1275     return state.remainingChars;
1276 }
1277 
1278 QT_END_NAMESPACE
1279