1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 Intel Corporation.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include "qurl.h"
41 #include "private/qutfcodec_p.h"
42 #include "private/qtools_p.h"
43 #include "private/qsimd_p.h"
44 
45 QT_BEGIN_NAMESPACE
46 
47 // ### move to qurl_p.h
// What to do with a character while recoding. The numeric values matter:
// the action tables in this file store them as raw uchar entries (0, 1, 2)
// and maskTable() clears entries to 0, i.e. DecodeCharacter.
enum EncodingAction {
    DecodeCharacter = 0,    // a %XX form is decoded; a literal character is kept
    LeaveCharacter = 1,     // the character is kept in whichever form the input used
    EncodeCharacter = 2     // a literal character is percent-encoded; a %XX form is kept
};
53 
// From RFC 3986, Appendix A Collected ABNF for URI
//    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
//    reserved      = gen-delims / sub-delims
//    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
//    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
//                  / "*" / "+" / "," / ";" / "="
//
// Default action for each character from U+0020 to U+007F, indexed by
// (c - ' '). Each entry is an EncodingAction value:
//    0 = DecodeCharacter, 1 = LeaveCharacter, 2 = EncodeCharacter
static const uchar defaultActionTable[96] = {
    2, // space
    1, // '!' (sub-delim)
    2, // '"'
    1, // '#' (gen-delim)
    1, // '$' (sub-delim)
    2, // '%' (percent)
    1, // '&' (sub-delim)
    1, // "'" (sub-delim)
    1, // '(' (sub-delim)
    1, // ')' (sub-delim)
    1, // '*' (sub-delim)
    1, // '+' (sub-delim)
    1, // ',' (sub-delim)
    0, // '-' (unreserved)
    0, // '.' (unreserved)
    1, // '/' (gen-delim)

    0, 0, 0, 0, 0,  // '0' to '4' (unreserved)
    0, 0, 0, 0, 0,  // '5' to '9' (unreserved)
    1, // ':' (gen-delim)
    1, // ';' (sub-delim)
    2, // '<'
    1, // '=' (sub-delim)
    2, // '>'
    1, // '?' (gen-delim)

    1, // '@' (gen-delim)
    0, 0, 0, 0, 0,  // 'A' to 'E' (unreserved)
    0, 0, 0, 0, 0,  // 'F' to 'J' (unreserved)
    0, 0, 0, 0, 0,  // 'K' to 'O' (unreserved)
    0, 0, 0, 0, 0,  // 'P' to 'T' (unreserved)
    0, 0, 0, 0, 0, 0,  // 'U' to 'Z' (unreserved)
    1, // '[' (gen-delim)
    2, // '\'
    1, // ']' (gen-delim)
    2, // '^'
    0, // '_' (unreserved)

    2, // '`'
    0, 0, 0, 0, 0,  // 'a' to 'e' (unreserved)
    0, 0, 0, 0, 0,  // 'f' to 'j' (unreserved)
    0, 0, 0, 0, 0,  // 'k' to 'o' (unreserved)
    0, 0, 0, 0, 0,  // 'p' to 't' (unreserved)
    0, 0, 0, 0, 0, 0,  // 'u' to 'z' (unreserved)
    2, // '{'
    2, // '|'
    2, // '}'
    0, // '~' (unreserved)

    2  // 0x7F (DEL)
};
112 
// mask tables, in negative polarity
// 0x00 if it belongs to this category
// 0xff if it doesn't
//
// A mask is combined with an action table by maskTable(), which ANDs the two
// entry by entry: a 0x00 entry forces the action to 0 (DecodeCharacter) and
// a 0xff entry keeps whatever action the table already had.

// Applied by qt_urlRecode() when QUrl::DecodeReserved is set. Indexed by
// (c - ' ') like defaultActionTable. The 0x00 entries are the characters
// that are neither unreserved nor gen-delims/sub-delims.
static const uchar reservedMask[96] = {
    0xff, // space
    0xff, // '!' (sub-delim)
    0x00, // '"'
    0xff, // '#' (gen-delim)
    0xff, // '$' (sub-delim)
    0xff, // '%' (percent)
    0xff, // '&' (sub-delim)
    0xff, // "'" (sub-delim)
    0xff, // '(' (sub-delim)
    0xff, // ')' (sub-delim)
    0xff, // '*' (sub-delim)
    0xff, // '+' (sub-delim)
    0xff, // ',' (sub-delim)
    0xff, // '-' (unreserved)
    0xff, // '.' (unreserved)
    0xff, // '/' (gen-delim)

    0xff, 0xff, 0xff, 0xff, 0xff,  // '0' to '4' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // '5' to '9' (unreserved)
    0xff, // ':' (gen-delim)
    0xff, // ';' (sub-delim)
    0x00, // '<'
    0xff, // '=' (sub-delim)
    0x00, // '>'
    0xff, // '?' (gen-delim)

    0xff, // '@' (gen-delim)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'A' to 'E' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'F' to 'J' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'K' to 'O' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'P' to 'T' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  // 'U' to 'Z' (unreserved)
    0xff, // '[' (gen-delim)
    0x00, // '\'
    0xff, // ']' (gen-delim)
    0x00, // '^'
    0xff, // '_' (unreserved)

    0x00, // '`'
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'a' to 'e' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'f' to 'j' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'k' to 'o' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'p' to 't' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  // 'u' to 'z' (unreserved)
    0x00, // '{'
    0x00, // '|'
    0x00, // '}'
    0xff, // '~' (unreserved)

    0xff  // 0x7F (DEL)
};
169 
// Returns true if c is an ASCII hexadecimal digit (0-9, A-F or a-f).
static inline bool isHex(ushort c)
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'A' && c <= 'F')
        return true;
    return c >= 'a' && c <= 'f';
}
176 
// Returns true if the hex digit c needs no uppercasing, i.e. it is a decimal
// digit or one of 'A' to 'F'. The check is a plain comparison against 0x60,
// so the answer is only meaningful when c is known to be a hex digit.
static inline bool isUpperHex(ushort c)
{
    const ushort lowercaseRowStart = 0x60;
    return c < lowercaseRowStart;
}
182 
toUpperHex(ushort c)183 static inline ushort toUpperHex(ushort c)
184 {
185     return isUpperHex(c) ? c : c - 0x20;
186 }
187 
// Converts the hex digit c to its numeric value (0 to 15). The caller is
// expected to have validated c with isHex(); no checking is done here.
static inline ushort decodeNibble(ushort c)
{
    if (c >= 'a')
        return c - 'a' + 0xA;
    if (c >= 'A')
        return c - 'A' + 0xA;
    return c - '0';
}
193 
194 // if the sequence at input is 2*HEXDIG, returns its decoding
195 // returns -1 if it isn't.
196 // assumes that the range has been checked already
decodePercentEncoding(const ushort * input)197 static inline ushort decodePercentEncoding(const ushort *input)
198 {
199     ushort c1 = input[1];
200     ushort c2 = input[2];
201     if (!isHex(c1) || !isHex(c2))
202         return ushort(-1);
203     return decodeNibble(c1) << 4 | decodeNibble(c2);
204 }
205 
encodeNibble(ushort c)206 static inline ushort encodeNibble(ushort c)
207 {
208     return ushort(QtMiscUtils::toHexUpper(c));
209 }
210 
ensureDetached(QString & result,ushort * & output,const ushort * begin,const ushort * input,const ushort * end,int add=0)211 static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end,
212                            int add = 0)
213 {
214     if (!output) {
215         // now detach
216         // create enough space if the rest of the string needed to be percent-encoded
217         int charsProcessed = input - begin;
218         int charsRemaining = end - input;
219         int spaceNeeded = end - begin + 2 * charsRemaining + add;
220         int origSize = result.size();
221         result.resize(origSize + spaceNeeded);
222 
223         // we know that resize() above detached, so we bypass the reference count check
224         output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()))
225                  + origSize;
226 
227         // copy the chars we've already processed
228         int i;
229         for (i = 0; i < charsProcessed; ++i)
230             output[i] = begin[i];
231         output += i;
232     }
233 }
234 
235 namespace {
236 struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii
237 {
238     // From RFC 3987:
239     //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
240     //
241     //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
242     //                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
243     //                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
244     //                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
245     //                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
246     //                   / %xD0000-DFFFD / %xE1000-EFFFD
247     //
248     //    iprivate       = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
249     //
250     // That RFC allows iprivate only as part of iquery, but we don't know here
251     // whether we're looking at a query or another part of an URI, so we accept
252     // them too. The definition above excludes U+FFF0 to U+FFFD from appearing
253     // unencoded, but we see no reason for its exclusion, so we allow them to
254     // be decoded (and we need U+FFFD the replacement character to indicate
255     // failure to decode).
256     //
257     // That means we must disallow:
258     //  * unpaired surrogates (QUtf8Functions takes care of that for us)
259     //  * non-characters
260     static const bool allowNonCharacters = false;
261 
262     // override: our "bytes" are three percent-encoded UTF-16 characters
appendByte__anon62611b050111::QUrlUtf8Traits263     static void appendByte(ushort *&ptr, uchar b)
264     {
265         // b >= 0x80, by construction, so percent-encode
266         *ptr++ = '%';
267         *ptr++ = encodeNibble(b >> 4);
268         *ptr++ = encodeNibble(b & 0xf);
269     }
270 
peekByte__anon62611b050111::QUrlUtf8Traits271     static uchar peekByte(const ushort *ptr, int n = 0)
272     {
273         // decodePercentEncoding returns ushort(-1) if it can't decode,
274         // which means we return 0xff, which is not a valid continuation byte.
275         // If ptr[i * 3] is not '%', we'll multiply by zero and return 0,
276         // also not a valid continuation byte (if it's '%', we multiply by 1).
277         return uchar(decodePercentEncoding(ptr + n * 3))
278                 * uchar(ptr[n * 3] == '%');
279     }
280 
availableBytes__anon62611b050111::QUrlUtf8Traits281     static qptrdiff availableBytes(const ushort *ptr, const ushort *end)
282     {
283         return (end - ptr) / 3;
284     }
285 
advanceByte__anon62611b050111::QUrlUtf8Traits286     static void advanceByte(const ushort *&ptr, int n = 1)
287     {
288         ptr += n * 3;
289     }
290 };
291 }
292 
293 // returns true if we performed an UTF-8 decoding
encodedUtf8ToUtf16(QString & result,ushort * & output,const ushort * begin,const ushort * & input,const ushort * end,ushort decoded)294 static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
295                                const ushort *end, ushort decoded)
296 {
297     uint ucs4, *dst = &ucs4;
298     const ushort *src = input + 3;// skip the %XX that yielded \a decoded
299     int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(decoded, dst, src, end);
300     if (charsNeeded < 0)
301         return false;
302 
303     if (!QChar::requiresSurrogates(ucs4)) {
304         // UTF-8 decoded and no surrogates are required
305         // detach if necessary
306         // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char
307         ensureDetached(result, output, begin, input, end, -3 * charsNeeded + 1);
308         *output++ = ucs4;
309     } else {
310         // UTF-8 decoded to something that requires a surrogate pair
311         // compressing from %XX%XX%XX%XX (12 chars) to two
312         ensureDetached(result, output, begin, input, end, -10);
313         *output++ = QChar::highSurrogate(ucs4);
314         *output++ = QChar::lowSurrogate(ucs4);
315     }
316 
317     input = src - 1;
318     return true;
319 }
320 
unicodeToEncodedUtf8(QString & result,ushort * & output,const ushort * begin,const ushort * & input,const ushort * end,ushort decoded)321 static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin,
322                                  const ushort *&input, const ushort *end, ushort decoded)
323 {
324     // calculate the utf8 length and ensure enough space is available
325     int utf8len = QChar::isHighSurrogate(decoded) ? 4 : decoded >= 0x800 ? 3 : 2;
326 
327     // detach
328     if (!output) {
329         // we need 3 * utf8len for the encoded UTF-8 sequence
330         // but ensureDetached already adds 3 for the char we're processing
331         ensureDetached(result, output, begin, input, end, 3*utf8len - 3);
332     } else {
333         // verify that there's enough space or expand
334         int charsRemaining = end - input - 1; // not including this one
335         int pos = output - reinterpret_cast<const ushort *>(result.constData());
336         int spaceRemaining = result.size() - pos;
337         if (spaceRemaining < 3*charsRemaining + 3*utf8len) {
338             // must resize
339             result.resize(result.size() + 3*utf8len);
340 
341             // we know that resize() above detached, so we bypass the reference count check
342             output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()));
343             output += pos;
344         }
345     }
346 
347     ++input;
348     int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(decoded, output, input, end);
349     --input;
350     if (res < 0) {
351         // bad surrogate pair sequence
352         // we will encode bad UTF-16 to UTF-8
353         // but they don't get decoded back
354 
355         // first of three bytes
356         uchar c = 0xe0 | uchar(decoded >> 12);
357         *output++ = '%';
358         *output++ = 'E';
359         *output++ = encodeNibble(c & 0xf);
360 
361         // second byte
362         c = 0x80 | (uchar(decoded >> 6) & 0x3f);
363         *output++ = '%';
364         *output++ = encodeNibble(c >> 4);
365         *output++ = encodeNibble(c & 0xf);
366 
367         // third byte
368         c = 0x80 | (decoded & 0x3f);
369         *output++ = '%';
370         *output++ = encodeNibble(c >> 4);
371         *output++ = encodeNibble(c & 0xf);
372     }
373 }
374 
// Core of qt_urlRecode(): walks the UTF-16 range from \a begin to \a end and
// applies, to each character, the action found in \a actionTable (indexed by
// c - ' ' for characters 0x20 to 0x7f) together with the Unicode handling
// selected by \a encoding.
//
// Output is produced lazily. \a output stays null for as long as the input
// needs no change; the first transformation calls ensureDetached() (directly
// or through the UTF-8 helpers), which grows \a result and copies the prefix
// scanned so far. From then on every character is written through \a output.
//
// \a retryBadEncoding is false on the initial call. If an invalid percent
// sequence is met, everything written so far is discarded and the function
// calls itself with \a retryBadEncoding set to true, in which mode every '%'
// is written as "%25".
//
// Returns the number of characters appended to \a result, or 0 if no output
// was produced.
static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding,
                  const uchar *actionTable, bool retryBadEncoding)
{
    const int origSize = result.size();
    const ushort *input = begin;
    ushort *output = nullptr;   // null until the first modification

    EncodingAction action = EncodeCharacter;
    for ( ; input != end; ++input) {
        ushort c;
        // try a run where no change is necessary
        for ( ; input != end; ++input) {
            c = *input;
            // control characters have no table entry: they are always encoded
            if (c < 0x20U)
                action = EncodeCharacter;
            if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U)
                goto non_trivial;
            action = EncodingAction(actionTable[c - ' ']);
            if (action == EncodeCharacter)
                goto non_trivial;
            // Decode or Leave on a literal character: it is kept as it is
            if (output)
                *output++ = c;
        }
        break;      // reached the end of the input inside the fast path

non_trivial:
        // c is a control character, a non-ASCII character, or an ASCII
        // character whose table action is EncodeCharacter (this is how '%'
        // gets here)
        uint decoded;
        if (c == '%' && retryBadEncoding) {
            // always write "%25"
            ensureDetached(result, output, begin, input, end);
            *output++ = '%';
            *output++ = '2';
            *output++ = '5';
            continue;
        } else if (c == '%') {
            // check if the input is valid
            if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
                // not valid, retry
                // (drop whatever was appended and start over in "%25" mode)
                result.resize(origSize);
                return recode(result, begin, end, encoding, actionTable, true);
            }

            if (decoded >= 0x80) {
                // decode the UTF-8 sequence
                // (on success the helper has written the character and moved
                // input to the end of the sequence)
                if (!(encoding & QUrl::EncodeUnicode) &&
                        encodedUtf8ToUtf16(result, output, begin, input, end, decoded))
                    continue;

                // decoding the encoded UTF-8 failed
                // (or was not requested): keep the %XX as it is
                action = LeaveCharacter;
            } else if (decoded >= 0x20) {
                // the action is that of the character the sequence stands for
                action = EncodingAction(actionTable[decoded - ' ']);
            }
            // decoded < 0x20: action is still EncodeCharacter from the table
            // lookup of '%' itself, so the sequence stays encoded
        } else {
            decoded = c;
            if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) {
                // encode the UTF-8 sequence
                unicodeToEncodedUtf8(result, output, begin, input, end, decoded);
                continue;
            } else if (decoded >= 0x80) {
                // non-ASCII is kept as UTF-16 when EncodeUnicode is not set
                if (output)
                    *output++ = c;
                continue;
            }
        }

        // there are six possibilities:
        //  current \ action  | DecodeCharacter | LeaveCharacter | EncodeCharacter
        //      decoded       |    1:leave      |    2:leave     |    3:encode
        //      encoded       |    4:decode     |    5:leave     |    6:leave
        // cases 1 and 2 were handled before this section

        if (c == '%' && action != DecodeCharacter) {
            // cases 5 and 6: it's encoded and we're leaving it as it is
            // except we're pedantic and we'll uppercase the hex
            // If nothing has been written yet and both digits are already
            // uppercase, nothing is emitted and input is not advanced here:
            // the two digits are then scanned as ordinary characters.
            if (output || !isUpperHex(input[1]) || !isUpperHex(input[2])) {
                ensureDetached(result, output, begin, input, end);
                *output++ = '%';
                *output++ = toUpperHex(*++input);
                *output++ = toUpperHex(*++input);
            }
        } else if (c == '%' && action == DecodeCharacter) {
            // case 4: we need to decode
            ensureDetached(result, output, begin, input, end);
            *output++ = decoded;
            input += 2;     // skip the two hex digits; the loop skips the '%'
        } else {
            // must be case 3: we need to encode
            ensureDetached(result, output, begin, input, end);
            *output++ = '%';
            *output++ = encodeNibble(c >> 4);
            *output++ = encodeNibble(c & 0xf);
        }
    }

    if (output) {
        // more space than needed was reserved: cut result at the write position
        int len = output - reinterpret_cast<const ushort *>(result.constData());
        result.truncate(len);
        return len - origSize;
    }
    return 0;
}
477 
/*
 * Copies characters from \a input to \a output using vector loads and stores,
 * looking for the next '%'. Both \a input and \a output are advanced past the
 * characters that were examined and found not to be '%'.
 *
 * Returns true if the input it checked (if it checked anything) is not
 * encoded. A return of false indicates there's a percent at \a input that
 * needs to be decoded.
 *
 * Fewer than 4 characters are never examined, and the two smaller branches
 * examine a single chunk only, so the caller must finish the scan with a
 * scalar loop whenever this returns true.
 */
#ifdef __SSE2__
static bool simdCheckNonEncoded(ushort *&output, const ushort *&input, const ushort *end)
{
#  ifdef __AVX2__
    const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%'));
    const __m128i percents = _mm256_castsi256_si128(percents256);
#  else
    const __m128i percents = _mm_set1_epi16('%');
#  endif

    // idx: position (in UTF-16 code units) of the first '%' inside the last
    //      chunk examined
    // mask: byte mask from movemask; each 16-bit lane contributes two bits,
    //       which is why bit positions are halved to obtain idx
    uint idx = 0;
    quint32 mask = 0;
    if (input + 16 <= end) {
        qptrdiff offset = 0;
        for ( ; input + offset + 16 <= end; offset += 16) {
#  ifdef __AVX2__
            // do 32 bytes at a time using AVX2
            __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset));
            __m256i comparison = _mm256_cmpeq_epi16(data, percents256);
            mask = _mm256_movemask_epi8(comparison);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data);
#  else
            // do 32 bytes at a time using unrolled SSE2
            __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + offset));
            __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + offset + 8));
            __m128i comparison1 = _mm_cmpeq_epi16(data1, percents);
            __m128i comparison2 = _mm_cmpeq_epi16(data2, percents);
            uint mask1 = _mm_movemask_epi8(comparison1);
            uint mask2 = _mm_movemask_epi8(comparison2);

            // the second half is only stored if the first half has no '%'
            _mm_storeu_si128(reinterpret_cast<__m128i *>(output + offset), data1);
            if (!mask1)
                _mm_storeu_si128(reinterpret_cast<__m128i *>(output + offset + 8), data2);
            mask = mask1 | (mask2 << 16);
#  endif

            if (mask) {
                // found a '%' in this chunk: offset is not advanced past it
                idx = qCountTrailingZeroBits(mask) / 2;
                break;
            }
        }

        // skip the chunks that contained no '%' (idx is still 0 if the loop
        // ran out of full chunks without finding one)
        input += offset;
        // NOTE(review): output has already been written through by the stores
        // above and is advanced unconditionally below, so this null check
        // looks ineffective - confirm before relying on it.
        if (output)
            output += offset;
    } else if (input + 8 <= end) {
        // do 16 bytes at a time
        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input));
        __m128i comparison = _mm_cmpeq_epi16(data, percents);
        mask = _mm_movemask_epi8(comparison);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
        // presumably yields the full bit width when mask is 0, making idx
        // cover the whole chunk - confirm against qCountTrailingZeroBits
        idx = qCountTrailingZeroBits(quint16(mask)) / 2;
    } else if (input + 4 <= end) {
        // do 8 bytes only
        __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(input));
        __m128i comparison = _mm_cmpeq_epi16(data, percents);
        // only the low 8 mask bits correspond to the 4 characters loaded
        mask = _mm_movemask_epi8(comparison) & 0xffu;
        _mm_storel_epi64(reinterpret_cast<__m128i *>(output), data);
        idx = qCountTrailingZeroBits(quint8(mask)) / 2;
    } else {
        // no percents found (because we didn't check)
        return true;
    }

    // advance to the next non-encoded
    input += idx;
    output += idx;

    return !mask;
}
#else
// Without SSE2 nothing is examined; returning true makes the caller's scalar
// loop do all the work.
static bool simdCheckNonEncoded(...)
{
    return true;
}
#endif
559 
560 /*!
561     \since 5.0
562     \internal
563 
564     This function decodes a percent-encoded string located from \a begin to \a
565     end, by appending each character to \a appendTo. It returns the number of
566     characters appended. Each percent-encoded sequence is decoded as follows:
567 
568     \list
569       \li from %00 to %7F: the exact decoded value is appended;
570       \li from %80 to %FF: QChar::ReplacementCharacter is appended;
571       \li bad encoding: original input is copied to the output, undecoded.
572     \endlist
573 
574     Given the above, it's important for the input to already have all UTF-8
575     percent sequences decoded by qt_urlRecode (that is, the input should not
576     have been processed with QUrl::EncodeUnicode).
577 
578     The input should also be a valid percent-encoded sequence (the output of
579     qt_urlRecode is always valid).
580 */
decode(QString & appendTo,const ushort * begin,const ushort * end)581 static int decode(QString &appendTo, const ushort *begin, const ushort *end)
582 {
583     // fast check whether there's anything to be decoded in the first place
584     const ushort *input = QtPrivate::qustrchr(QStringView(begin, end), '%');
585     if (Q_LIKELY(input == end))
586         return 0;           // nothing to do, it was already decoded!
587 
588     // detach
589     const int origSize = appendTo.size();
590     appendTo.resize(origSize + (end - begin));
591     ushort *output = reinterpret_cast<ushort *>(appendTo.begin()) + origSize;
592     memcpy(static_cast<void *>(output), static_cast<const void *>(begin), (input - begin) * sizeof(ushort));
593     output += input - begin;
594 
595     while (input != end) {
596         // something was encoded
597         Q_ASSERT(*input == '%');
598 
599         if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) {
600             // badly-encoded data
601             appendTo.resize(origSize + (end - begin));
602             memcpy(static_cast<void *>(appendTo.begin() + origSize), static_cast<const void *>(begin), (end - begin) * sizeof(ushort));
603             return end - begin;
604         }
605 
606         ++input;
607         *output++ = decodeNibble(input[0]) << 4 | decodeNibble(input[1]);
608         if (output[-1] >= 0x80)
609             output[-1] = QChar::ReplacementCharacter;
610         input += 2;
611 
612         // search for the next percent, copying from input to output
613         if (simdCheckNonEncoded(output, input, end)) {
614             while (input != end) {
615                 ushort uc = *input;
616                 if (uc == '%')
617                     break;
618                 *output++ = uc;
619                 ++input;
620             }
621         }
622     }
623 
624     int len = output - reinterpret_cast<ushort *>(appendTo.begin());
625     appendTo.truncate(len);
626     return len - origSize;
627 }
628 
629 template <size_t N>
maskTable(uchar (& table)[N],const uchar (& mask)[N])630 static void maskTable(uchar (&table)[N], const uchar (&mask)[N])
631 {
632     for (size_t i = 0; i < N; ++i)
633         table[i] &= mask[i];
634 }
635 
636 /*!
637     \internal
638 
639     Recodes the string from \a begin to \a end. If any transformations are
640     done, append them to \a appendTo and return the number of characters added.
641     If no transformations were required, return 0.
642 
643     The \a encoding option modifies the default behaviour:
644     \list
645     \li QUrl::DecodeReserved: if set, reserved characters will be decoded;
646                               if unset, reserved characters will be encoded
647     \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " "
648     \li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8
649                              percent-encoded form; if unset, they will be decoded to UTF-16
650     \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences,
651                             including that of the percent character. The resulting string
652                             will not be percent-encoded anymore. Use with caution!
653                             In this mode, the behaviour is undefined if the input string
654                             contains any percent-encoding sequences above %80.
655                             Also, the function will not correct bad % sequences.
656     \endlist
657 
658     Other flags are ignored (including QUrl::EncodeReserved).
659 
660     The \a tableModifications argument can be used to supply extra
661     modifications to the tables, to be applied after the flags above are
662     handled. It consists of a sequence of 16-bit values, where the low 8 bits
663     indicate the character in question and the high 8 bits are either \c
664     EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter.
665 
666     This function corrects percent-encoded errors by interpreting every '%' as
667     meaning "%25" (all percents in the same content).
668  */
669 
670 Q_AUTOTEST_EXPORT int
qt_urlRecode(QString & appendTo,const QChar * begin,const QChar * end,QUrl::ComponentFormattingOptions encoding,const ushort * tableModifications)671 qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end,
672              QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications)
673 {
674     uchar actionTable[sizeof defaultActionTable];
675     if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) {
676         return decode(appendTo, reinterpret_cast<const ushort *>(begin), reinterpret_cast<const ushort *>(end));
677     }
678 
679     memcpy(actionTable, defaultActionTable, sizeof actionTable);
680     if (encoding & QUrl::DecodeReserved)
681         maskTable(actionTable, reservedMask);
682     if (!(encoding & QUrl::EncodeSpaces))
683         actionTable[0] = DecodeCharacter; // decode
684 
685     if (tableModifications) {
686         for (const ushort *p = tableModifications; *p; ++p)
687             actionTable[uchar(*p) - ' '] = *p >> 8;
688     }
689 
690     return recode(appendTo, reinterpret_cast<const ushort *>(begin), reinterpret_cast<const ushort *>(end),
691                   encoding, actionTable, false);
692 }
693 
694 // qstring.cpp
695 bool qt_is_ascii(const char *&ptr, const char *end) noexcept;
696 
697 /*!
698     \internal
699     \since 5.0
700 
701     \a ba contains an 8-bit form of the component and it might be
702     percent-encoded already. We can't use QString::fromUtf8 because it might
703     contain non-UTF8 sequences. We can't use QByteArray::toPercentEncoding
704     because it might already contain percent-encoded sequences. We can't use
705     qt_urlRecode because it needs UTF-16 input.
706 */
707 Q_AUTOTEST_EXPORT
qt_urlRecodeByteArray(const QByteArray & ba)708 QString qt_urlRecodeByteArray(const QByteArray &ba)
709 {
710     if (ba.isNull())
711         return QString();
712 
713     // scan ba for anything above or equal to 0x80
714     // control points below 0x20 are fine in QString
715     const char *in = ba.constData();
716     const char *const end = ba.constEnd();
717     if (qt_is_ascii(in, end)) {
718         // no non-ASCII found, we're safe to convert to QString
719         return QString::fromLatin1(ba, ba.size());
720     }
721 
722     // we found something that we need to encode
723     QByteArray intermediate = ba;
724     intermediate.resize(ba.size() * 3 - (in - ba.constData()));
725     uchar *out = reinterpret_cast<uchar *>(intermediate.data() + (in - ba.constData()));
726     for ( ; in < end; ++in) {
727         if (*in & 0x80) {
728             // encode
729             *out++ = '%';
730             *out++ = encodeNibble(uchar(*in) >> 4);
731             *out++ = encodeNibble(uchar(*in) & 0xf);
732         } else {
733             // keep
734             *out++ = uchar(*in);
735         }
736     }
737 
738     // now it's safe to call fromLatin1
739     return QString::fromLatin1(intermediate, out - reinterpret_cast<uchar *>(intermediate.data()));
740 }
741 
742 QT_END_NAMESPACE
743