1 /**************************************************************************** 2 ** 3 ** Copyright (C) 2018 The Qt Company Ltd. 4 ** Copyright (C) 2018 Intel Corporation. 5 ** Contact: https://www.qt.io/licensing/ 6 ** 7 ** This file is part of the QtCore module of the Qt Toolkit. 8 ** 9 ** $QT_BEGIN_LICENSE:LGPL$ 10 ** Commercial License Usage 11 ** Licensees holding valid commercial Qt licenses may use this file in 12 ** accordance with the commercial license agreement provided with the 13 ** Software or, alternatively, in accordance with the terms contained in 14 ** a written agreement between you and The Qt Company. For licensing terms 15 ** and conditions see https://www.qt.io/terms-conditions. For further 16 ** information use the contact form at https://www.qt.io/contact-us. 17 ** 18 ** GNU Lesser General Public License Usage 19 ** Alternatively, this file may be used under the terms of the GNU Lesser 20 ** General Public License version 3 as published by the Free Software 21 ** Foundation and appearing in the file LICENSE.LGPL3 included in the 22 ** packaging of this file. Please review the following information to 23 ** ensure the GNU Lesser General Public License version 3 requirements 24 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. 25 ** 26 ** GNU General Public License Usage 27 ** Alternatively, this file may be used under the terms of the GNU 28 ** General Public License version 2.0 or (at your option) the GNU General 29 ** Public license version 3 or any later version approved by the KDE Free 30 ** Qt Foundation. The licenses are as published by the Free Software 31 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 32 ** included in the packaging of this file. Please review the following 33 ** information to ensure the GNU General Public License requirements will 34 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and 35 ** https://www.gnu.org/licenses/gpl-3.0.html. 36 ** 37 ** $QT_END_LICENSE$ 38 ** 39 ****************************************************************************/ 40 41 #ifndef QUTFCODEC_P_H 42 #define QUTFCODEC_P_H 43 44 // 45 // W A R N I N G 46 // ------------- 47 // 48 // This file is not part of the Qt API. It exists purely as an 49 // implementation detail. This header file may change from version to 50 // version without notice, or even be removed. 51 // 52 // We mean it. 53 // 54 55 #include <QtCore/qstring.h> 56 #include <QtCore/qlist.h> 57 58 #if QT_CONFIG(textcodec) 59 #include "QtCore/qtextcodec.h" 60 #endif 61 62 #include "private/qtextcodec_p.h" 63 64 QT_BEGIN_NAMESPACE 65 66 struct QUtf8BaseTraits 67 { 68 static const bool isTrusted = false; 69 static const bool allowNonCharacters = true; 70 static const bool skipAsciiHandling = false; 71 static const int Error = -1; 72 static const int EndOfString = -2; 73 isValidCharacterQUtf8BaseTraits74 static bool isValidCharacter(uint u) 75 { return int(u) >= 0; } 76 appendByteQUtf8BaseTraits77 static void appendByte(uchar *&ptr, uchar b) 78 { *ptr++ = b; } 79 80 static uchar peekByte(const uchar *ptr, int n = 0) 81 { return ptr[n]; } 82 availableBytesQUtf8BaseTraits83 static qptrdiff availableBytes(const uchar *ptr, const uchar *end) 84 { return end - ptr; } 85 86 static void advanceByte(const uchar *&ptr, int n = 1) 87 { ptr += n; } 88 appendUtf16QUtf8BaseTraits89 static void appendUtf16(ushort *&ptr, ushort uc) 90 { *ptr++ = uc; } 91 appendUcs4QUtf8BaseTraits92 static void appendUcs4(ushort *&ptr, uint uc) 93 { 94 appendUtf16(ptr, QChar::highSurrogate(uc)); 95 appendUtf16(ptr, QChar::lowSurrogate(uc)); 96 } 97 98 static ushort peekUtf16(const ushort *ptr, int n = 0) 99 { return ptr[n]; } 100 availableUtf16QUtf8BaseTraits101 static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) 102 { return end - ptr; } 103 104 static void advanceUtf16(const ushort *&ptr, int n = 1) 105 { ptr += n; } 106 107 // it's possible to output to UCS-4 too appendUtf16QUtf8BaseTraits108 static void appendUtf16(uint *&ptr, ushort uc) 109 { *ptr++ = uc; } 110 appendUcs4QUtf8BaseTraits111 static void appendUcs4(uint *&ptr, uint uc) 112 { *ptr++ = uc; } 113 }; 114 115 struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits 116 { 117 static const bool skipAsciiHandling = true; 118 }; 119 120 namespace QUtf8Functions 121 { 122 /// returns 0 on success; errors can only happen if \a u is a surrogate: 123 /// Error if \a u is a low surrogate; 124 /// if \a u is a high surrogate, Error if the next isn't a low one, 125 /// EndOfString if we run into the end of the string. 126 template <typename Traits, typename OutputPtr, typename InputPtr> inline toUtf8(ushort u,OutputPtr & dst,InputPtr & src,InputPtr end)127 int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) 128 { 129 if (!Traits::skipAsciiHandling && u < 0x80) { 130 // U+0000 to U+007F (US-ASCII) - one byte 131 Traits::appendByte(dst, uchar(u)); 132 return 0; 133 } else if (u < 0x0800) { 134 // U+0080 to U+07FF - two bytes 135 // first of two bytes 136 Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); 137 } else { 138 if (!QChar::isSurrogate(u)) { 139 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes 140 if (!Traits::allowNonCharacters && QChar::isNonCharacter(u)) 141 return Traits::Error; 142 143 // first of three bytes 144 Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); 145 } else { 146 // U+10000 to U+10FFFF - four bytes 147 // need to get one extra codepoint 148 if (Traits::availableUtf16(src, end) == 0) 149 return Traits::EndOfString; 150 151 ushort low = Traits::peekUtf16(src); 152 if (!QChar::isHighSurrogate(u)) 153 return Traits::Error; 154 if (!QChar::isLowSurrogate(low)) 155 return Traits::Error; 156 157 Traits::advanceUtf16(src); 158 uint ucs4 = QChar::surrogateToUcs4(u, low); 159 160 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) 161 return Traits::Error; 162 163 // first byte 164 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); 165 166 // second of four bytes 167 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); 168 169 // for the rest of the bytes 170 u = ushort(ucs4); 171 } 172 173 // second to last byte 174 Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); 175 } 176 177 // last byte 178 Traits::appendByte(dst, 0x80 | (u & 0x3f)); 179 return 0; 180 } 181 isContinuationByte(uchar b)182 inline bool isContinuationByte(uchar b) 183 { 184 return (b & 0xc0) == 0x80; 185 } 186 187 /// returns the number of characters consumed (including \a b) in case of success; 188 /// returns negative in case of error: Traits::Error or Traits::EndOfString 189 template <typename Traits, typename OutputPtr, typename InputPtr> inline fromUtf8(uchar b,OutputPtr & dst,InputPtr & src,InputPtr end)190 int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) 191 { 192 int charsNeeded; 193 uint min_uc; 194 uint uc; 195 196 if (!Traits::skipAsciiHandling && b < 0x80) { 197 // US-ASCII 198 Traits::appendUtf16(dst, b); 199 return 1; 200 } 201 202 if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { 203 // an UTF-8 first character must be at least 0xC0 204 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences 205 return Traits::Error; 206 } else if (b < 0xe0) { 207 charsNeeded = 2; 208 min_uc = 0x80; 209 uc = b & 0x1f; 210 } else if (b < 0xf0) { 211 charsNeeded = 3; 212 min_uc = 0x800; 213 uc = b & 0x0f; 214 } else if (b < 0xf5) { 215 charsNeeded = 4; 216 min_uc = 0x10000; 217 uc = b & 0x07; 218 } else { 219 // the last Unicode character is U+10FFFF 220 // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" 221 // therefore, a byte higher than 0xF4 is not the UTF-8 first byte 222 return Traits::Error; 223 } 224 225 int bytesAvailable = Traits::availableBytes(src, end); 226 if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { 227 // it's possible that we have an error instead of just unfinished bytes 228 if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) 229 return Traits::Error; 230 if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) 231 return Traits::Error; 232 return Traits::EndOfString; 233 } 234 235 // first continuation character 236 b = Traits::peekByte(src, 0); 237 if (!isContinuationByte(b)) 238 return Traits::Error; 239 uc <<= 6; 240 uc |= b & 0x3f; 241 242 if (charsNeeded > 2) { 243 // second continuation character 244 b = Traits::peekByte(src, 1); 245 if (!isContinuationByte(b)) 246 return Traits::Error; 247 uc <<= 6; 248 uc |= b & 0x3f; 249 250 if (charsNeeded > 3) { 251 // third continuation character 252 b = Traits::peekByte(src, 2); 253 if (!isContinuationByte(b)) 254 return Traits::Error; 255 uc <<= 6; 256 uc |= b & 0x3f; 257 } 258 } 259 260 // we've decoded something; safety-check it 261 if (!Traits::isTrusted) { 262 if (uc < min_uc) 263 return Traits::Error; 264 if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) 265 return Traits::Error; 266 if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) 267 return Traits::Error; 268 } 269 270 // write the UTF-16 sequence 271 if (!QChar::requiresSurrogates(uc)) { 272 // UTF-8 decoded and no surrogates are required 273 // detach if necessary 274 Traits::appendUtf16(dst, ushort(uc)); 275 } else { 276 // UTF-8 decoded to something that requires a surrogate pair 277 Traits::appendUcs4(dst, uc); 278 } 279 280 Traits::advanceByte(src, charsNeeded - 1); 281 return charsNeeded; 282 } 283 } 284 285 enum DataEndianness 286 { 287 DetectEndianness, 288 BigEndianness, 289 LittleEndianness 290 }; 291 292 struct QUtf8 293 { 294 static QChar *convertToUnicode(QChar *, const char *, int) noexcept; 295 static QString convertToUnicode(const char *, int); 296 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); 297 static QByteArray convertFromUnicode(const QChar *, int); 298 static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); 299 struct ValidUtf8Result { 300 bool isValidUtf8; 301 bool isValidAscii; 302 }; 303 static ValidUtf8Result isValidUtf8(const char *, qsizetype); 304 static int compareUtf8(const char *, qsizetype, const QChar *, int); 305 static int compareUtf8(const char *, qsizetype, QLatin1String s); 306 }; 307 308 struct QUtf16 309 { 310 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); 311 static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); 312 }; 313 314 struct QUtf32 315 { 316 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); 317 static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); 318 }; 319 320 #if QT_CONFIG(textcodec) 321 322 class QUtf8Codec : public QTextCodec { 323 public: 324 ~QUtf8Codec(); 325 326 QByteArray name() const override; 327 int mibEnum() const override; 328 329 QString convertToUnicode(const char *, int, ConverterState *) const override; 330 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; 331 void convertToUnicode(QString *target, const char *, int, ConverterState *) const; 332 }; 333 334 class QUtf16Codec : public QTextCodec { 335 protected: 336 public: QUtf16Codec()337 QUtf16Codec() { e = DetectEndianness; } 338 ~QUtf16Codec(); 339 340 QByteArray name() const override; 341 QList<QByteArray> aliases() const override; 342 int mibEnum() const override; 343 344 QString convertToUnicode(const char *, int, ConverterState *) const override; 345 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; 346 347 protected: 348 DataEndianness e; 349 }; 350 351 class QUtf16BECodec : public QUtf16Codec { 352 public: QUtf16BECodec()353 QUtf16BECodec() : QUtf16Codec() { e = BigEndianness; } 354 QByteArray name() const override; 355 QList<QByteArray> aliases() const override; 356 int mibEnum() const override; 357 }; 358 359 class QUtf16LECodec : public QUtf16Codec { 360 public: QUtf16LECodec()361 QUtf16LECodec() : QUtf16Codec() { e = LittleEndianness; } 362 QByteArray name() const override; 363 QList<QByteArray> aliases() const override; 364 int mibEnum() const override; 365 }; 366 367 class QUtf32Codec : public QTextCodec { 368 public: QUtf32Codec()369 QUtf32Codec() { e = DetectEndianness; } 370 ~QUtf32Codec(); 371 372 QByteArray name() const override; 373 QList<QByteArray> aliases() const override; 374 int mibEnum() const override; 375 376 QString convertToUnicode(const char *, int, ConverterState *) const override; 377 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; 378 379 protected: 380 DataEndianness e; 381 }; 382 383 class QUtf32BECodec : public QUtf32Codec { 384 public: QUtf32BECodec()385 QUtf32BECodec() : QUtf32Codec() { e = BigEndianness; } 386 QByteArray name() const override; 387 QList<QByteArray> aliases() const override; 388 int mibEnum() const override; 389 }; 390 391 class QUtf32LECodec : public QUtf32Codec { 392 public: QUtf32LECodec()393 QUtf32LECodec() : QUtf32Codec() { e = LittleEndianness; } 394 QByteArray name() const override; 395 QList<QByteArray> aliases() const override; 396 int mibEnum() const override; 397 }; 398 399 400 #endif // textcodec 401 402 QT_END_NAMESPACE 403 404 #endif // QUTFCODEC_P_H 405