1 /*
2   ==============================================================================
3 
4    This file is part of the JUCE library.
5    Copyright (c) 2020 - Raw Material Software Limited
6 
7    JUCE is an open source library subject to commercial or open-source
8    licensing.
9 
10    The code included in this file is provided under the terms of the ISC license
11    http://www.isc.org/downloads/software-support-policy/isc-license. Permission
12    To use, copy, modify, and/or distribute this software for any purpose with or
13    without fee is hereby granted provided that the above copyright notice and
14    this permission notice appear in all copies.
15 
16    JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
17    EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
18    DISCLAIMED.
19 
20   ==============================================================================
21 */
22 
23 namespace juce
24 {
25 
26 //==============================================================================
27 /**
28     Wraps a pointer to a null-terminated UTF-8 character string, and provides
29     various methods to operate on the data.
30     @see CharPointer_UTF16, CharPointer_UTF32
31 
32     @tags{Core}
33 */
34 class CharPointer_UTF8  final
35 {
36 public:
37     using CharType = char;
38 
CharPointer_UTF8(const CharType * rawPointer)39     explicit CharPointer_UTF8 (const CharType* rawPointer) noexcept
40         : data (const_cast<CharType*> (rawPointer))
41     {
42     }
43 
44     CharPointer_UTF8 (const CharPointer_UTF8& other) = default;
45 
46     CharPointer_UTF8 operator= (CharPointer_UTF8 other) noexcept
47     {
48         data = other.data;
49         return *this;
50     }
51 
52     CharPointer_UTF8 operator= (const CharType* text) noexcept
53     {
54         data = const_cast<CharType*> (text);
55         return *this;
56     }
57 
58     /** This is a pointer comparison, it doesn't compare the actual text. */
59     bool operator== (CharPointer_UTF8 other) const noexcept      { return data == other.data; }
60     bool operator!= (CharPointer_UTF8 other) const noexcept      { return data != other.data; }
61     bool operator<= (CharPointer_UTF8 other) const noexcept      { return data <= other.data; }
62     bool operator<  (CharPointer_UTF8 other) const noexcept      { return data <  other.data; }
63     bool operator>= (CharPointer_UTF8 other) const noexcept      { return data >= other.data; }
64     bool operator>  (CharPointer_UTF8 other) const noexcept      { return data >  other.data; }
65 
66     /** Returns the address that this pointer is pointing to. */
getAddress()67     CharType* getAddress() const noexcept        { return data; }
68 
69     /** Returns the address that this pointer is pointing to. */
70     operator const CharType*() const noexcept    { return data; }
71 
72     /** Returns true if this pointer is pointing to a null character. */
isEmpty()73     bool isEmpty() const noexcept                { return *data == 0; }
74 
75     /** Returns true if this pointer is not pointing to a null character. */
isNotEmpty()76     bool isNotEmpty() const noexcept             { return *data != 0; }
77 
78     /** Returns the unicode character that this pointer is pointing to. */
79     juce_wchar operator*() const noexcept
80     {
81         auto byte = (signed char) *data;
82 
83         if (byte >= 0)
84             return (juce_wchar) (uint8) byte;
85 
86         uint32 n = (uint32) (uint8) byte;
87         uint32 mask = 0x7f;
88         uint32 bit = 0x40;
89         int numExtraValues = 0;
90 
91         while ((n & bit) != 0 && bit > 0x8)
92         {
93             mask >>= 1;
94             ++numExtraValues;
95             bit >>= 1;
96         }
97 
98         n &= mask;
99 
100         for (int i = 1; i <= numExtraValues; ++i)
101         {
102             auto nextByte = (uint32) (uint8) data[i];
103 
104             if ((nextByte & 0xc0) != 0x80)
105                 break;
106 
107             n <<= 6;
108             n |= (nextByte & 0x3f);
109         }
110 
111         return (juce_wchar) n;
112     }
113 
114     /** Moves this pointer along to the next character in the string. */
115     CharPointer_UTF8& operator++() noexcept
116     {
117         jassert (*data != 0); // trying to advance past the end of the string?
118         auto n = (signed char) *data++;
119 
120         if (n < 0)
121         {
122             uint8 bit = 0x40;
123 
124             while ((static_cast<uint8> (n) & bit) != 0 && bit > 0x8)
125             {
126                 ++data;
127                 bit = static_cast<uint8> (bit >> 1);
128             }
129         }
130 
131         return *this;
132     }
133 
134     /** Moves this pointer back to the previous character in the string. */
135     CharPointer_UTF8 operator--() noexcept
136     {
137         int count = 0;
138 
139         while ((*--data & 0xc0) == 0x80 && ++count < 4)
140         {}
141 
142         return *this;
143     }
144 
145     /** Returns the character that this pointer is currently pointing to, and then
146         advances the pointer to point to the next character. */
getAndAdvance()147     juce_wchar getAndAdvance() noexcept
148     {
149         auto byte = (signed char) *data++;
150 
151         if (byte >= 0)
152             return (juce_wchar) (uint8) byte;
153 
154         uint32 n = (uint32) (uint8) byte;
155         uint32 mask = 0x7f;
156         uint32 bit = 0x40;
157         int numExtraValues = 0;
158 
159         while ((n & bit) != 0 && bit > 0x8)
160         {
161             mask >>= 1;
162             ++numExtraValues;
163             bit >>= 1;
164         }
165 
166         n &= mask;
167 
168         while (--numExtraValues >= 0)
169         {
170             auto nextByte = (uint32) (uint8) *data;
171 
172             if ((nextByte & 0xc0) != 0x80)
173                 break;
174 
175             ++data;
176             n <<= 6;
177             n |= (nextByte & 0x3f);
178         }
179 
180         return (juce_wchar) n;
181     }
182 
183     /** Moves this pointer along to the next character in the string. */
184     CharPointer_UTF8 operator++ (int) noexcept
185     {
186         CharPointer_UTF8 temp (*this);
187         ++*this;
188         return temp;
189     }
190 
191     /** Moves this pointer forwards by the specified number of characters. */
192     void operator+= (int numToSkip) noexcept
193     {
194         if (numToSkip < 0)
195         {
196             while (++numToSkip <= 0)
197                 --*this;
198         }
199         else
200         {
201             while (--numToSkip >= 0)
202                 ++*this;
203         }
204     }
205 
206     /** Moves this pointer backwards by the specified number of characters. */
207     void operator-= (int numToSkip) noexcept
208     {
209         operator+= (-numToSkip);
210     }
211 
212     /** Returns the character at a given character index from the start of the string. */
213     juce_wchar operator[] (int characterIndex) const noexcept
214     {
215         auto p (*this);
216         p += characterIndex;
217         return *p;
218     }
219 
220     /** Returns a pointer which is moved forwards from this one by the specified number of characters. */
221     CharPointer_UTF8 operator+ (int numToSkip) const noexcept
222     {
223         auto p (*this);
224         p += numToSkip;
225         return p;
226     }
227 
228     /** Returns a pointer which is moved backwards from this one by the specified number of characters. */
229     CharPointer_UTF8 operator- (int numToSkip) const noexcept
230     {
231         auto p (*this);
232         p += -numToSkip;
233         return p;
234     }
235 
236     /** Returns the number of characters in this string. */
length()237     size_t length() const noexcept
238     {
239         auto* d = data;
240         size_t count = 0;
241 
242         for (;;)
243         {
244             auto n = (uint32) (uint8) *d++;
245 
246             if ((n & 0x80) != 0)
247             {
248                 while ((*d & 0xc0) == 0x80)
249                     ++d;
250             }
251             else if (n == 0)
252                 break;
253 
254             ++count;
255         }
256 
257         return count;
258     }
259 
260     /** Returns the number of characters in this string, or the given value, whichever is lower. */
lengthUpTo(const size_t maxCharsToCount)261     size_t lengthUpTo (const size_t maxCharsToCount) const noexcept
262     {
263         return CharacterFunctions::lengthUpTo (*this, maxCharsToCount);
264     }
265 
266     /** Returns the number of characters in this string, or up to the given end pointer, whichever is lower. */
lengthUpTo(const CharPointer_UTF8 end)267     size_t lengthUpTo (const CharPointer_UTF8 end) const noexcept
268     {
269         return CharacterFunctions::lengthUpTo (*this, end);
270     }
271 
272     /** Returns the number of bytes that are used to represent this string.
273         This includes the terminating null character.
274     */
sizeInBytes()275     size_t sizeInBytes() const noexcept
276     {
277         jassert (data != nullptr);
278         return strlen (data) + 1;
279     }
280 
281     /** Returns the number of bytes that would be needed to represent the given
282         unicode character in this encoding format.
283     */
getBytesRequiredFor(const juce_wchar charToWrite)284     static size_t getBytesRequiredFor (const juce_wchar charToWrite) noexcept
285     {
286         size_t num = 1;
287         auto c = (uint32) charToWrite;
288 
289         if (c >= 0x80)
290         {
291             ++num;
292             if (c >= 0x800)
293             {
294                 ++num;
295                 if (c >= 0x10000)
296                     ++num;
297             }
298         }
299 
300         return num;
301     }
302 
303     /** Returns the number of bytes that would be needed to represent the given
304         string in this encoding format.
305         The value returned does NOT include the terminating null character.
306     */
307     template <class CharPointer>
getBytesRequiredFor(CharPointer text)308     static size_t getBytesRequiredFor (CharPointer text) noexcept
309     {
310         size_t count = 0;
311 
312         while (auto n = text.getAndAdvance())
313             count += getBytesRequiredFor (n);
314 
315         return count;
316     }
317 
318     /** Returns a pointer to the null character that terminates this string. */
findTerminatingNull()319     CharPointer_UTF8 findTerminatingNull() const noexcept
320     {
321         return CharPointer_UTF8 (data + strlen (data));
322     }
323 
324     /** Writes a unicode character to this string, and advances this pointer to point to the next position. */
write(const juce_wchar charToWrite)325     void write (const juce_wchar charToWrite) noexcept
326     {
327         auto c = (uint32) charToWrite;
328 
329         if (c >= 0x80)
330         {
331             int numExtraBytes = 1;
332             if (c >= 0x800)
333             {
334                 ++numExtraBytes;
335                 if (c >= 0x10000)
336                     ++numExtraBytes;
337             }
338 
339             *data++ = (CharType) ((uint32) (0xff << (7 - numExtraBytes)) | (c >> (numExtraBytes * 6)));
340 
341             while (--numExtraBytes >= 0)
342                 *data++ = (CharType) (0x80 | (0x3f & (c >> (numExtraBytes * 6))));
343         }
344         else
345         {
346             *data++ = (CharType) c;
347         }
348     }
349 
350     /** Writes a null character to this string (leaving the pointer's position unchanged). */
writeNull()351     void writeNull() const noexcept
352     {
353         *data = 0;
354     }
355 
356     /** Copies a source string to this pointer, advancing this pointer as it goes. */
357     template <typename CharPointer>
writeAll(const CharPointer src)358     void writeAll (const CharPointer src) noexcept
359     {
360         CharacterFunctions::copyAll (*this, src);
361     }
362 
363     /** Copies a source string to this pointer, advancing this pointer as it goes. */
writeAll(const CharPointer_UTF8 src)364     void writeAll (const CharPointer_UTF8 src) noexcept
365     {
366         auto* s = src.data;
367 
368         while ((*data = *s) != 0)
369         {
370             ++data;
371             ++s;
372         }
373     }
374 
375     /** Copies a source string to this pointer, advancing this pointer as it goes.
376         The maxDestBytes parameter specifies the maximum number of bytes that can be written
377         to the destination buffer before stopping.
378     */
379     template <typename CharPointer>
writeWithDestByteLimit(const CharPointer src,const size_t maxDestBytes)380     size_t writeWithDestByteLimit (const CharPointer src, const size_t maxDestBytes) noexcept
381     {
382         return CharacterFunctions::copyWithDestByteLimit (*this, src, maxDestBytes);
383     }
384 
385     /** Copies a source string to this pointer, advancing this pointer as it goes.
386         The maxChars parameter specifies the maximum number of characters that can be
387         written to the destination buffer before stopping (including the terminating null).
388     */
389     template <typename CharPointer>
writeWithCharLimit(const CharPointer src,const int maxChars)390     void writeWithCharLimit (const CharPointer src, const int maxChars) noexcept
391     {
392         CharacterFunctions::copyWithCharLimit (*this, src, maxChars);
393     }
394 
395     /** Compares this string with another one. */
396     template <typename CharPointer>
compare(const CharPointer other)397     int compare (const CharPointer other) const noexcept
398     {
399         return CharacterFunctions::compare (*this, other);
400     }
401 
402     /** Compares this string with another one, up to a specified number of characters. */
403     template <typename CharPointer>
compareUpTo(const CharPointer other,const int maxChars)404     int compareUpTo (const CharPointer other, const int maxChars) const noexcept
405     {
406         return CharacterFunctions::compareUpTo (*this, other, maxChars);
407     }
408 
409     /** Compares this string with another one. */
410     template <typename CharPointer>
compareIgnoreCase(const CharPointer other)411     int compareIgnoreCase (const CharPointer other) const noexcept
412     {
413         return CharacterFunctions::compareIgnoreCase (*this, other);
414     }
415 
416     /** Compares this string with another one. */
compareIgnoreCase(const CharPointer_UTF8 other)417     int compareIgnoreCase (const CharPointer_UTF8 other) const noexcept
418     {
419         return CharacterFunctions::compareIgnoreCase (*this, other);
420     }
421 
422     /** Compares this string with another one, up to a specified number of characters. */
423     template <typename CharPointer>
compareIgnoreCaseUpTo(const CharPointer other,const int maxChars)424     int compareIgnoreCaseUpTo (const CharPointer other, const int maxChars) const noexcept
425     {
426         return CharacterFunctions::compareIgnoreCaseUpTo (*this, other, maxChars);
427     }
428 
429     /** Returns the character index of a substring, or -1 if it isn't found. */
430     template <typename CharPointer>
indexOf(const CharPointer stringToFind)431     int indexOf (const CharPointer stringToFind) const noexcept
432     {
433         return CharacterFunctions::indexOf (*this, stringToFind);
434     }
435 
436     /** Returns the character index of a unicode character, or -1 if it isn't found. */
indexOf(const juce_wchar charToFind)437     int indexOf (const juce_wchar charToFind) const noexcept
438     {
439         return CharacterFunctions::indexOfChar (*this, charToFind);
440     }
441 
442     /** Returns the character index of a unicode character, or -1 if it isn't found. */
indexOf(const juce_wchar charToFind,const bool ignoreCase)443     int indexOf (const juce_wchar charToFind, const bool ignoreCase) const noexcept
444     {
445         return ignoreCase ? CharacterFunctions::indexOfCharIgnoreCase (*this, charToFind)
446                           : CharacterFunctions::indexOfChar (*this, charToFind);
447     }
448 
449     /** Returns true if the first character of this string is whitespace. */
isWhitespace()450     bool isWhitespace() const noexcept          { const CharType c = *data; return c == ' ' || (c <= 13 && c >= 9); }
451     /** Returns true if the first character of this string is a digit. */
isDigit()452     bool isDigit() const noexcept               { const CharType c = *data; return c >= '0' && c <= '9'; }
453     /** Returns true if the first character of this string is a letter. */
isLetter()454     bool isLetter() const noexcept              { return CharacterFunctions::isLetter (operator*()) != 0; }
455     /** Returns true if the first character of this string is a letter or digit. */
isLetterOrDigit()456     bool isLetterOrDigit() const noexcept       { return CharacterFunctions::isLetterOrDigit (operator*()) != 0; }
457     /** Returns true if the first character of this string is upper-case. */
isUpperCase()458     bool isUpperCase() const noexcept           { return CharacterFunctions::isUpperCase (operator*()) != 0; }
459     /** Returns true if the first character of this string is lower-case. */
isLowerCase()460     bool isLowerCase() const noexcept           { return CharacterFunctions::isLowerCase (operator*()) != 0; }
461 
462     /** Returns an upper-case version of the first character of this string. */
toUpperCase()463     juce_wchar toUpperCase() const noexcept     { return CharacterFunctions::toUpperCase (operator*()); }
464     /** Returns a lower-case version of the first character of this string. */
toLowerCase()465     juce_wchar toLowerCase() const noexcept     { return CharacterFunctions::toLowerCase (operator*()); }
466 
467     /** Parses this string as a 32-bit integer. */
getIntValue32()468     int getIntValue32() const noexcept          { return atoi (data); }
469 
470     /** Parses this string as a 64-bit integer. */
getIntValue64()471     int64 getIntValue64() const noexcept
472     {
473        #if JUCE_WINDOWS && ! JUCE_MINGW
474         return _atoi64 (data);
475        #else
476         return atoll (data);
477        #endif
478     }
479 
480     /** Parses this string as a floating point double. */
getDoubleValue()481     double getDoubleValue() const noexcept                      { return CharacterFunctions::getDoubleValue (*this); }
482 
483     /** Returns the first non-whitespace character in the string. */
findEndOfWhitespace()484     CharPointer_UTF8 findEndOfWhitespace() const noexcept       { return CharacterFunctions::findEndOfWhitespace (*this); }
485 
486     /** Returns true if the given unicode character can be represented in this encoding. */
canRepresent(juce_wchar character)487     static bool canRepresent (juce_wchar character) noexcept
488     {
489         return ((uint32) character) < (uint32) 0x10ffff;
490     }
491 
492     /** Returns true if this data contains a valid string in this encoding. */
isValidString(const CharType * dataToTest,int maxBytesToRead)493     static bool isValidString (const CharType* dataToTest, int maxBytesToRead)
494     {
495         while (--maxBytesToRead >= 0 && *dataToTest != 0)
496         {
497             auto byte = (signed char) *dataToTest++;
498 
499             if (byte < 0)
500             {
501                 int bit = 0x40;
502                 int numExtraValues = 0;
503 
504                 while ((byte & bit) != 0)
505                 {
506                     if (bit < 8)
507                         return false;
508 
509                     ++numExtraValues;
510                     bit >>= 1;
511 
512                     if (bit == 8 && (numExtraValues > maxBytesToRead
513                                        || *CharPointer_UTF8 (dataToTest - 1) > 0x10ffff))
514                         return false;
515                 }
516 
517                 if (numExtraValues == 0)
518                     return false;
519 
520                 maxBytesToRead -= numExtraValues;
521                 if (maxBytesToRead < 0)
522                     return false;
523 
524                 while (--numExtraValues >= 0)
525                     if ((*dataToTest++ & 0xc0) != 0x80)
526                         return false;
527             }
528         }
529 
530         return true;
531     }
532 
533     /** Atomically swaps this pointer for a new value, returning the previous value. */
atomicSwap(const CharPointer_UTF8 newValue)534     CharPointer_UTF8 atomicSwap (const CharPointer_UTF8 newValue)
535     {
536         return CharPointer_UTF8 (reinterpret_cast<Atomic<CharType*>&> (data).exchange (newValue.data));
537     }
538 
539     /** These values are the byte-order mark (BOM) values for a UTF-8 stream. */
540     enum
541     {
542         byteOrderMark1 = 0xef,
543         byteOrderMark2 = 0xbb,
544         byteOrderMark3 = 0xbf
545     };
546 
547     /** Returns true if the first three bytes in this pointer are the UTF8 byte-order mark (BOM).
548         The pointer must not be null, and must point to at least 3 valid bytes.
549     */
isByteOrderMark(const void * possibleByteOrder)550     static bool isByteOrderMark (const void* possibleByteOrder) noexcept
551     {
552         jassert (possibleByteOrder != nullptr);
553         auto c = static_cast<const uint8*> (possibleByteOrder);
554 
555         return c[0] == (uint8) byteOrderMark1
556             && c[1] == (uint8) byteOrderMark2
557             && c[2] == (uint8) byteOrderMark3;
558     }
559 
560 private:
561     CharType* data;
562 };
563 
564 } // namespace juce
565