1 /****************************************************************************
2 **
3 ** Copyright (C) 2018 The Qt Company Ltd.
4 ** Copyright (C) 2018 Intel Corporation.
5 ** Contact: https://www.qt.io/licensing/
6 **
7 ** This file is part of the QtCore module of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** Commercial License Usage
11 ** Licensees holding valid commercial Qt licenses may use this file in
12 ** accordance with the commercial license agreement provided with the
13 ** Software or, alternatively, in accordance with the terms contained in
14 ** a written agreement between you and The Qt Company. For licensing terms
15 ** and conditions see https://www.qt.io/terms-conditions. For further
16 ** information use the contact form at https://www.qt.io/contact-us.
17 **
18 ** GNU Lesser General Public License Usage
19 ** Alternatively, this file may be used under the terms of the GNU Lesser
20 ** General Public License version 3 as published by the Free Software
21 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
22 ** packaging of this file. Please review the following information to
23 ** ensure the GNU Lesser General Public License version 3 requirements
24 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25 **
26 ** GNU General Public License Usage
27 ** Alternatively, this file may be used under the terms of the GNU
28 ** General Public License version 2.0 or (at your option) the GNU General
29 ** Public license version 3 or any later version approved by the KDE Free
30 ** Qt Foundation. The licenses are as published by the Free Software
31 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32 ** included in the packaging of this file. Please review the following
33 ** information to ensure the GNU General Public License requirements will
34 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35 ** https://www.gnu.org/licenses/gpl-3.0.html.
36 **
37 ** $QT_END_LICENSE$
38 **
39 ****************************************************************************/
40 
41 #ifndef QUTFCODEC_P_H
42 #define QUTFCODEC_P_H
43 
44 //
45 //  W A R N I N G
46 //  -------------
47 //
48 // This file is not part of the Qt API.  It exists purely as an
49 // implementation detail.  This header file may change from version to
50 // version without notice, or even be removed.
51 //
52 // We mean it.
53 //
54 
55 #include <QtCore/qstring.h>
56 #include <QtCore/qlist.h>
57 
58 #if QT_CONFIG(textcodec)
59 #include "QtCore/qtextcodec.h"
60 #endif
61 
62 #include "private/qtextcodec_p.h"
63 
64 QT_BEGIN_NAMESPACE
65 
66 struct QUtf8BaseTraits
67 {
68     static const bool isTrusted = false;
69     static const bool allowNonCharacters = true;
70     static const bool skipAsciiHandling = false;
71     static const int Error = -1;
72     static const int EndOfString = -2;
73 
isValidCharacterQUtf8BaseTraits74     static bool isValidCharacter(uint u)
75     { return int(u) >= 0; }
76 
appendByteQUtf8BaseTraits77     static void appendByte(uchar *&ptr, uchar b)
78     { *ptr++ = b; }
79 
80     static uchar peekByte(const uchar *ptr, int n = 0)
81     { return ptr[n]; }
82 
availableBytesQUtf8BaseTraits83     static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
84     { return end - ptr; }
85 
86     static void advanceByte(const uchar *&ptr, int n = 1)
87     { ptr += n; }
88 
appendUtf16QUtf8BaseTraits89     static void appendUtf16(ushort *&ptr, ushort uc)
90     { *ptr++ = uc; }
91 
appendUcs4QUtf8BaseTraits92     static void appendUcs4(ushort *&ptr, uint uc)
93     {
94         appendUtf16(ptr, QChar::highSurrogate(uc));
95         appendUtf16(ptr, QChar::lowSurrogate(uc));
96     }
97 
98     static ushort peekUtf16(const ushort *ptr, int n = 0)
99     { return ptr[n]; }
100 
availableUtf16QUtf8BaseTraits101     static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
102     { return end - ptr; }
103 
104     static void advanceUtf16(const ushort *&ptr, int n = 1)
105     { ptr += n; }
106 
107     // it's possible to output to UCS-4 too
appendUtf16QUtf8BaseTraits108     static void appendUtf16(uint *&ptr, ushort uc)
109     { *ptr++ = uc; }
110 
appendUcs4QUtf8BaseTraits111     static void appendUcs4(uint *&ptr, uint uc)
112     { *ptr++ = uc; }
113 };
114 
115 struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
116 {
117     static const bool skipAsciiHandling = true;
118 };
119 
120 namespace QUtf8Functions
121 {
122     /// returns 0 on success; errors can only happen if \a u is a surrogate:
123     /// Error if \a u is a low surrogate;
124     /// if \a u is a high surrogate, Error if the next isn't a low one,
125     /// EndOfString if we run into the end of the string.
126     template <typename Traits, typename OutputPtr, typename InputPtr> inline
toUtf8(ushort u,OutputPtr & dst,InputPtr & src,InputPtr end)127     int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
128     {
129         if (!Traits::skipAsciiHandling && u < 0x80) {
130             // U+0000 to U+007F (US-ASCII) - one byte
131             Traits::appendByte(dst, uchar(u));
132             return 0;
133         } else if (u < 0x0800) {
134             // U+0080 to U+07FF - two bytes
135             // first of two bytes
136             Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
137         } else {
138             if (!QChar::isSurrogate(u)) {
139                 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
140                 if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
141                     return Traits::Error;
142 
143                 // first of three bytes
144                 Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
145             } else {
146                 // U+10000 to U+10FFFF - four bytes
147                 // need to get one extra codepoint
148                 if (Traits::availableUtf16(src, end) == 0)
149                     return Traits::EndOfString;
150 
151                 ushort low = Traits::peekUtf16(src);
152                 if (!QChar::isHighSurrogate(u))
153                     return Traits::Error;
154                 if (!QChar::isLowSurrogate(low))
155                     return Traits::Error;
156 
157                 Traits::advanceUtf16(src);
158                 uint ucs4 = QChar::surrogateToUcs4(u, low);
159 
160                 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
161                     return Traits::Error;
162 
163                 // first byte
164                 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
165 
166                 // second of four bytes
167                 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
168 
169                 // for the rest of the bytes
170                 u = ushort(ucs4);
171             }
172 
173             // second to last byte
174             Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
175         }
176 
177         // last byte
178         Traits::appendByte(dst, 0x80 | (u & 0x3f));
179         return 0;
180     }
181 
isContinuationByte(uchar b)182     inline bool isContinuationByte(uchar b)
183     {
184         return (b & 0xc0) == 0x80;
185     }
186 
187     /// returns the number of characters consumed (including \a b) in case of success;
188     /// returns negative in case of error: Traits::Error or Traits::EndOfString
189     template <typename Traits, typename OutputPtr, typename InputPtr> inline
fromUtf8(uchar b,OutputPtr & dst,InputPtr & src,InputPtr end)190     int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
191     {
192         int charsNeeded;
193         uint min_uc;
194         uint uc;
195 
196         if (!Traits::skipAsciiHandling && b < 0x80) {
197             // US-ASCII
198             Traits::appendUtf16(dst, b);
199             return 1;
200         }
201 
202         if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
203             // an UTF-8 first character must be at least 0xC0
204             // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
205             return Traits::Error;
206         } else if (b < 0xe0) {
207             charsNeeded = 2;
208             min_uc = 0x80;
209             uc = b & 0x1f;
210         } else if (b < 0xf0) {
211             charsNeeded = 3;
212             min_uc = 0x800;
213             uc = b & 0x0f;
214         } else if (b < 0xf5) {
215             charsNeeded = 4;
216             min_uc = 0x10000;
217             uc = b & 0x07;
218         } else {
219             // the last Unicode character is U+10FFFF
220             // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
221             // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
222             return Traits::Error;
223         }
224 
225         int bytesAvailable = Traits::availableBytes(src, end);
226         if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
227             // it's possible that we have an error instead of just unfinished bytes
228             if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
229                 return Traits::Error;
230             if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
231                 return Traits::Error;
232             return Traits::EndOfString;
233         }
234 
235         // first continuation character
236         b = Traits::peekByte(src, 0);
237         if (!isContinuationByte(b))
238             return Traits::Error;
239         uc <<= 6;
240         uc |= b & 0x3f;
241 
242         if (charsNeeded > 2) {
243             // second continuation character
244             b = Traits::peekByte(src, 1);
245             if (!isContinuationByte(b))
246                 return Traits::Error;
247             uc <<= 6;
248             uc |= b & 0x3f;
249 
250             if (charsNeeded > 3) {
251                 // third continuation character
252                 b = Traits::peekByte(src, 2);
253                 if (!isContinuationByte(b))
254                     return Traits::Error;
255                 uc <<= 6;
256                 uc |= b & 0x3f;
257             }
258         }
259 
260         // we've decoded something; safety-check it
261         if (!Traits::isTrusted) {
262             if (uc < min_uc)
263                 return Traits::Error;
264             if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
265                 return Traits::Error;
266             if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
267                 return Traits::Error;
268         }
269 
270         // write the UTF-16 sequence
271         if (!QChar::requiresSurrogates(uc)) {
272             // UTF-8 decoded and no surrogates are required
273             // detach if necessary
274             Traits::appendUtf16(dst, ushort(uc));
275         } else {
276             // UTF-8 decoded to something that requires a surrogate pair
277             Traits::appendUcs4(dst, uc);
278         }
279 
280         Traits::advanceByte(src, charsNeeded - 1);
281         return charsNeeded;
282     }
283 }
284 
/// Byte-order selector accepted by the QUtf16 / QUtf32 conversion functions
/// and stored by the UTF-16 / UTF-32 codec classes. The values are written
/// out explicitly; they equal the ones an unadorned enum would assign.
enum DataEndianness
{
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2
};
291 
292 struct QUtf8
293 {
294     static QChar *convertToUnicode(QChar *, const char *, int) noexcept;
295     static QString convertToUnicode(const char *, int);
296     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
297     static QByteArray convertFromUnicode(const QChar *, int);
298     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
299     struct ValidUtf8Result {
300         bool isValidUtf8;
301         bool isValidAscii;
302     };
303     static ValidUtf8Result isValidUtf8(const char *, qsizetype);
304     static int compareUtf8(const char *, qsizetype, const QChar *, int);
305     static int compareUtf8(const char *, qsizetype, QLatin1String s);
306 };
307 
308 struct QUtf16
309 {
310     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
311     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
312 };
313 
314 struct QUtf32
315 {
316     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
317     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
318 };
319 
320 #if QT_CONFIG(textcodec)
321 
322 class QUtf8Codec : public QTextCodec {
323 public:
324     ~QUtf8Codec();
325 
326     QByteArray name() const override;
327     int mibEnum() const override;
328 
329     QString convertToUnicode(const char *, int, ConverterState *) const override;
330     QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override;
331     void convertToUnicode(QString *target, const char *, int, ConverterState *) const;
332 };
333 
334 class QUtf16Codec : public QTextCodec {
335 protected:
336 public:
QUtf16Codec()337     QUtf16Codec() { e = DetectEndianness; }
338     ~QUtf16Codec();
339 
340     QByteArray name() const override;
341     QList<QByteArray> aliases() const override;
342     int mibEnum() const override;
343 
344     QString convertToUnicode(const char *, int, ConverterState *) const override;
345     QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override;
346 
347 protected:
348     DataEndianness e;
349 };
350 
351 class QUtf16BECodec : public QUtf16Codec {
352 public:
QUtf16BECodec()353     QUtf16BECodec() : QUtf16Codec() { e = BigEndianness; }
354     QByteArray name() const override;
355     QList<QByteArray> aliases() const override;
356     int mibEnum() const override;
357 };
358 
359 class QUtf16LECodec : public QUtf16Codec {
360 public:
QUtf16LECodec()361     QUtf16LECodec() : QUtf16Codec() { e = LittleEndianness; }
362     QByteArray name() const override;
363     QList<QByteArray> aliases() const override;
364     int mibEnum() const override;
365 };
366 
367 class QUtf32Codec : public QTextCodec {
368 public:
QUtf32Codec()369     QUtf32Codec() { e = DetectEndianness; }
370     ~QUtf32Codec();
371 
372     QByteArray name() const override;
373     QList<QByteArray> aliases() const override;
374     int mibEnum() const override;
375 
376     QString convertToUnicode(const char *, int, ConverterState *) const override;
377     QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override;
378 
379 protected:
380     DataEndianness e;
381 };
382 
383 class QUtf32BECodec : public QUtf32Codec {
384 public:
QUtf32BECodec()385     QUtf32BECodec() : QUtf32Codec() { e = BigEndianness; }
386     QByteArray name() const override;
387     QList<QByteArray> aliases() const override;
388     int mibEnum() const override;
389 };
390 
391 class QUtf32LECodec : public QUtf32Codec {
392 public:
QUtf32LECodec()393     QUtf32LECodec() : QUtf32Codec() { e = LittleEndianness; }
394     QByteArray name() const override;
395     QList<QByteArray> aliases() const override;
396     int mibEnum() const override;
397 };
398 
399 
400 #endif // textcodec
401 
402 QT_END_NAMESPACE
403 
404 #endif // QUTFCODEC_P_H
405