1 /*
2   ==============================================================================
3 
4    This file is part of the Water library.
5    Copyright (c) 2016 ROLI Ltd.
6    Copyright (C) 2017 Filipe Coelho <falktx@falktx.com>
7 
8    Permission is granted to use this software under the terms of the ISC license
9    http://www.isc.org/downloads/software-support-policy/isc-license/
10 
11    Permission to use, copy, modify, and/or distribute this software for any
12    purpose with or without fee is hereby granted, provided that the above
13    copyright notice and this permission notice appear in all copies.
14 
15    THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH REGARD
16    TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17    FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
18    OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
19    USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
20    TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
21    OF THIS SOFTWARE.
22 
23   ==============================================================================
24 */
25 
26 #ifndef WATER_CHARPOINTER_UTF8_H_INCLUDED
27 #define WATER_CHARPOINTER_UTF8_H_INCLUDED
28 
29 #include "CharacterFunctions.h"
30 #include "../memory/Atomic.h"
31 
32 #include "CarlaUtils.hpp"
33 
34 namespace water {
35 
36 //==============================================================================
37 /**
38     Wraps a pointer to a null-terminated UTF-8 character string, and provides
39     various methods to operate on the data.
40     @see CharPointer_UTF16, CharPointer_UTF32
41 */
42 class CharPointer_UTF8
43 {
44 public:
45     typedef char CharType;
46 
CharPointer_UTF8(const CharType * const rawPointer)47     inline explicit CharPointer_UTF8 (const CharType* const rawPointer) noexcept
48         : data (const_cast<CharType*> (rawPointer))
49     {
50     }
51 
CharPointer_UTF8(const CharPointer_UTF8 & other)52     inline CharPointer_UTF8 (const CharPointer_UTF8& other) noexcept
53         : data (other.data)
54     {
55     }
56 
57     inline CharPointer_UTF8& operator= (CharPointer_UTF8 other) noexcept
58     {
59         data = other.data;
60         return *this;
61     }
62 
63     inline CharPointer_UTF8& operator= (const CharType* text) noexcept
64     {
65         data = const_cast<CharType*> (text);
66         return *this;
67     }
68 
69     /** This is a pointer comparison, it doesn't compare the actual text. */
70     inline bool operator== (CharPointer_UTF8 other) const noexcept      { return data == other.data; }
71     inline bool operator!= (CharPointer_UTF8 other) const noexcept      { return data != other.data; }
72     inline bool operator<= (CharPointer_UTF8 other) const noexcept      { return data <= other.data; }
73     inline bool operator<  (CharPointer_UTF8 other) const noexcept      { return data <  other.data; }
74     inline bool operator>= (CharPointer_UTF8 other) const noexcept      { return data >= other.data; }
75     inline bool operator>  (CharPointer_UTF8 other) const noexcept      { return data >  other.data; }
76 
77     /** Returns the address that this pointer is pointing to. */
getAddress()78     inline CharType* getAddress() const noexcept        { return data; }
79 
80     /** Returns the address that this pointer is pointing to. */
81     inline operator const CharType*() const noexcept    { return data; }
82 
83     /** Returns true if this pointer is pointing to a null character. */
isEmpty()84     inline bool isEmpty() const noexcept                { return *data == 0; }
85 
86     /** Returns the unicode character that this pointer is pointing to. */
87     water_uchar operator*() const noexcept
88     {
89         const signed char byte = (signed char) *data;
90 
91         if (byte >= 0)
92             return (water_uchar) (uint8) byte;
93 
94         uint32 n = (uint32) (uint8) byte;
95         uint32 mask = 0x7f;
96         uint32 bit = 0x40;
97         int numExtraValues = 0;
98 
99         while ((n & bit) != 0 && bit > 0x8)
100         {
101             mask >>= 1;
102             ++numExtraValues;
103             bit >>= 1;
104         }
105 
106         n &= mask;
107 
108         for (int i = 1; i <= numExtraValues; ++i)
109         {
110             const uint32 nextByte = (uint32) (uint8) data[i];
111 
112             if ((nextByte & 0xc0) != 0x80)
113                 break;
114 
115             n <<= 6;
116             n |= (nextByte & 0x3f);
117         }
118 
119         return (water_uchar) n;
120     }
121 
122     /** Moves this pointer along to the next character in the string. */
123     CharPointer_UTF8& operator++() noexcept
124     {
125         wassert (*data != 0); // trying to advance past the end of the string?
126         const signed char n = (signed char) *data++;
127 
128         if (n < 0)
129         {
130             water_uchar bit = 0x40;
131 
132             while ((static_cast<unsigned char>(n) & bit) != 0 && bit > 0x8)
133             {
134                 ++data;
135                 bit >>= 1;
136             }
137         }
138 
139         return *this;
140     }
141 
142     /** Moves this pointer back to the previous character in the string. */
143     CharPointer_UTF8& operator--() noexcept
144     {
145         int count = 0;
146 
147         while ((*--data & 0xc0) == 0x80 && ++count < 4)
148         {}
149 
150         return *this;
151     }
152 
153     /** Returns the character that this pointer is currently pointing to, and then
154         advances the pointer to point to the next character. */
getAndAdvance()155     water_uchar getAndAdvance() noexcept
156     {
157         const signed char byte = (signed char) *data++;
158 
159         if (byte >= 0)
160             return (water_uchar) (uint8) byte;
161 
162         uint32 n = (uint32) (uint8) byte;
163         uint32 mask = 0x7f;
164         uint32 bit = 0x40;
165         int numExtraValues = 0;
166 
167         while ((n & bit) != 0 && bit > 0x8)
168         {
169             mask >>= 1;
170             ++numExtraValues;
171             bit >>= 1;
172         }
173 
174         n &= mask;
175 
176         while (--numExtraValues >= 0)
177         {
178             const uint32 nextByte = (uint32) (uint8) *data;
179 
180             if ((nextByte & 0xc0) != 0x80)
181                 break;
182 
183             ++data;
184             n <<= 6;
185             n |= (nextByte & 0x3f);
186         }
187 
188         return (water_uchar) n;
189     }
190 
191     /** Moves this pointer along to the next character in the string. */
192     CharPointer_UTF8 operator++ (int) noexcept
193     {
194         CharPointer_UTF8 temp (*this);
195         ++*this;
196         return temp;
197     }
198 
199     /** Moves this pointer forwards by the specified number of characters. */
200     void operator+= (int numToSkip) noexcept
201     {
202         if (numToSkip < 0)
203         {
204             while (++numToSkip <= 0)
205                 --*this;
206         }
207         else
208         {
209             while (--numToSkip >= 0)
210                 ++*this;
211         }
212     }
213 
214     /** Moves this pointer backwards by the specified number of characters. */
215     void operator-= (int numToSkip) noexcept
216     {
217         operator+= (-numToSkip);
218     }
219 
220     /** Returns the character at a given character index from the start of the string. */
221     water_uchar operator[] (int characterIndex) const noexcept
222     {
223         CharPointer_UTF8 p (*this);
224         p += characterIndex;
225         return *p;
226     }
227 
228     /** Returns a pointer which is moved forwards from this one by the specified number of characters. */
229     CharPointer_UTF8 operator+ (int numToSkip) const noexcept
230     {
231         CharPointer_UTF8 p (*this);
232         p += numToSkip;
233         return p;
234     }
235 
236     /** Returns a pointer which is moved backwards from this one by the specified number of characters. */
237     CharPointer_UTF8 operator- (int numToSkip) const noexcept
238     {
239         CharPointer_UTF8 p (*this);
240         p += -numToSkip;
241         return p;
242     }
243 
244     /** Returns the number of characters in this string. */
length()245     size_t length() const noexcept
246     {
247         const CharType* d = data;
248         size_t count = 0;
249 
250         for (;;)
251         {
252             const uint32 n = (uint32) (uint8) *d++;
253 
254             if ((n & 0x80) != 0)
255             {
256                 while ((*d & 0xc0) == 0x80)
257                     ++d;
258             }
259             else if (n == 0)
260                 break;
261 
262             ++count;
263         }
264 
265         return count;
266     }
267 
268     /** Returns the number of characters in this string, or the given value, whichever is lower. */
lengthUpTo(const size_t maxCharsToCount)269     size_t lengthUpTo (const size_t maxCharsToCount) const noexcept
270     {
271         return CharacterFunctions::lengthUpTo (*this, maxCharsToCount);
272     }
273 
274     /** Returns the number of characters in this string, or up to the given end pointer, whichever is lower. */
lengthUpTo(const CharPointer_UTF8 end)275     size_t lengthUpTo (const CharPointer_UTF8 end) const noexcept
276     {
277         return CharacterFunctions::lengthUpTo (*this, end);
278     }
279 
280     /** Returns the number of bytes that are used to represent this string.
281         This includes the terminating null character.
282     */
sizeInBytes()283     size_t sizeInBytes() const noexcept
284     {
285         wassert (data != nullptr);
286         return strlen (data) + 1;
287     }
288 
289     /** Returns the number of bytes that would be needed to represent the given
290         unicode character in this encoding format.
291     */
getBytesRequiredFor(const water_uchar charToWrite)292     static size_t getBytesRequiredFor (const water_uchar charToWrite) noexcept
293     {
294         size_t num = 1;
295         const uint32 c = (uint32) charToWrite;
296 
297         if (c >= 0x80)
298         {
299             ++num;
300             if (c >= 0x800)
301             {
302                 ++num;
303                 if (c >= 0x10000)
304                     ++num;
305             }
306         }
307 
308         return num;
309     }
310 
311     /** Returns the number of bytes that would be needed to represent the given
312         string in this encoding format.
313         The value returned does NOT include the terminating null character.
314     */
315     template <class CharPointer>
getBytesRequiredFor(CharPointer text)316     static size_t getBytesRequiredFor (CharPointer text) noexcept
317     {
318         size_t count = 0;
319 
320         while (water_uchar n = text.getAndAdvance())
321             count += getBytesRequiredFor (n);
322 
323         return count;
324     }
325 
326     /** Returns a pointer to the null character that terminates this string. */
findTerminatingNull()327     CharPointer_UTF8 findTerminatingNull() const noexcept
328     {
329         return CharPointer_UTF8 (data + strlen (data));
330     }
331 
332     /** Writes a unicode character to this string, and advances this pointer to point to the next position. */
write(const water_uchar charToWrite)333     void write (const water_uchar charToWrite) noexcept
334     {
335         const uint32 c = (uint32) charToWrite;
336 
337         if (c >= 0x80)
338         {
339             int numExtraBytes = 1;
340             if (c >= 0x800)
341             {
342                 ++numExtraBytes;
343                 if (c >= 0x10000)
344                     ++numExtraBytes;
345             }
346 
347             *data++ = (CharType) ((uint32) (0xff << (7 - numExtraBytes)) | (c >> (numExtraBytes * 6)));
348 
349             while (--numExtraBytes >= 0)
350                 *data++ = (CharType) (0x80 | (0x3f & (c >> (numExtraBytes * 6))));
351         }
352         else
353         {
354             *data++ = (CharType) c;
355         }
356     }
357 
358     /** Writes a null character to this string (leaving the pointer's position unchanged). */
writeNull()359     inline void writeNull() const noexcept
360     {
361         *data = 0;
362     }
363 
364     /** Copies a source string to this pointer, advancing this pointer as it goes. */
365     template <typename CharPointer>
writeAll(const CharPointer src)366     void writeAll (const CharPointer src) noexcept
367     {
368         CharacterFunctions::copyAll (*this, src);
369     }
370 
371     /** Copies a source string to this pointer, advancing this pointer as it goes. */
writeAll(const CharPointer_UTF8 src)372     void writeAll (const CharPointer_UTF8 src) noexcept
373     {
374         const CharType* s = src.data;
375 
376         while ((*data = *s) != 0)
377         {
378             ++data;
379             ++s;
380         }
381     }
382 
383     /** Copies a source string to this pointer, advancing this pointer as it goes.
384         The maxDestBytes parameter specifies the maximum number of bytes that can be written
385         to the destination buffer before stopping.
386     */
387     template <typename CharPointer>
writeWithDestByteLimit(const CharPointer src,const size_t maxDestBytes)388     size_t writeWithDestByteLimit (const CharPointer src, const size_t maxDestBytes) noexcept
389     {
390         return CharacterFunctions::copyWithDestByteLimit (*this, src, maxDestBytes);
391     }
392 
393     /** Copies a source string to this pointer, advancing this pointer as it goes.
394         The maxChars parameter specifies the maximum number of characters that can be
395         written to the destination buffer before stopping (including the terminating null).
396     */
397     template <typename CharPointer>
writeWithCharLimit(const CharPointer src,const int maxChars)398     void writeWithCharLimit (const CharPointer src, const int maxChars) noexcept
399     {
400         CharacterFunctions::copyWithCharLimit (*this, src, maxChars);
401     }
402 
403     /** Compares this string with another one. */
404     template <typename CharPointer>
compare(const CharPointer other)405     int compare (const CharPointer other) const noexcept
406     {
407         return CharacterFunctions::compare (*this, other);
408     }
409 
410     /** Compares this string with another one, up to a specified number of characters. */
411     template <typename CharPointer>
compareUpTo(const CharPointer other,const int maxChars)412     int compareUpTo (const CharPointer other, const int maxChars) const noexcept
413     {
414         return CharacterFunctions::compareUpTo (*this, other, maxChars);
415     }
416 
417     /** Compares this string with another one. */
418     template <typename CharPointer>
compareIgnoreCase(const CharPointer other)419     int compareIgnoreCase (const CharPointer other) const noexcept
420     {
421         return CharacterFunctions::compareIgnoreCase (*this, other);
422     }
423 
424     /** Compares this string with another one. */
compareIgnoreCase(const CharPointer_UTF8 other)425     int compareIgnoreCase (const CharPointer_UTF8 other) const noexcept
426     {
427         return CharacterFunctions::compareIgnoreCase (*this, other);
428     }
429 
430     /** Compares this string with another one, up to a specified number of characters. */
431     template <typename CharPointer>
compareIgnoreCaseUpTo(const CharPointer other,const int maxChars)432     int compareIgnoreCaseUpTo (const CharPointer other, const int maxChars) const noexcept
433     {
434         return CharacterFunctions::compareIgnoreCaseUpTo (*this, other, maxChars);
435     }
436 
437     /** Returns the character index of a substring, or -1 if it isn't found. */
438     template <typename CharPointer>
indexOf(const CharPointer stringToFind)439     int indexOf (const CharPointer stringToFind) const noexcept
440     {
441         return CharacterFunctions::indexOf (*this, stringToFind);
442     }
443 
444     /** Returns the character index of a unicode character, or -1 if it isn't found. */
indexOf(const water_uchar charToFind)445     int indexOf (const water_uchar charToFind) const noexcept
446     {
447         return CharacterFunctions::indexOfChar (*this, charToFind);
448     }
449 
450     /** Returns the character index of a unicode character, or -1 if it isn't found. */
indexOf(const water_uchar charToFind,const bool ignoreCase)451     int indexOf (const water_uchar charToFind, const bool ignoreCase) const noexcept
452     {
453         return ignoreCase ? CharacterFunctions::indexOfCharIgnoreCase (*this, charToFind)
454                           : CharacterFunctions::indexOfChar (*this, charToFind);
455     }
456 
457     /** Returns true if the first character of this string is whitespace. */
isWhitespace()458     bool isWhitespace() const noexcept      { const CharType c = *data; return c == ' ' || (c <= 13 && c >= 9); }
459     /** Returns true if the first character of this string is a digit. */
isDigit()460     bool isDigit() const noexcept           { const CharType c = *data; return c >= '0' && c <= '9'; }
461     /** Returns true if the first character of this string is a letter. */
isLetter()462     bool isLetter() const noexcept          { return CharacterFunctions::isLetter (operator*()) != 0; }
463     /** Returns true if the first character of this string is a letter or digit. */
isLetterOrDigit()464     bool isLetterOrDigit() const noexcept   { return CharacterFunctions::isLetterOrDigit (operator*()) != 0; }
465     /** Returns true if the first character of this string is upper-case. */
isUpperCase()466     bool isUpperCase() const noexcept       { return CharacterFunctions::isUpperCase (operator*()) != 0; }
467     /** Returns true if the first character of this string is lower-case. */
isLowerCase()468     bool isLowerCase() const noexcept       { return CharacterFunctions::isLowerCase (operator*()) != 0; }
469 
470     /** Returns an upper-case version of the first character of this string. */
toUpperCase()471     water_uchar toUpperCase() const noexcept { return CharacterFunctions::toUpperCase (operator*()); }
472     /** Returns a lower-case version of the first character of this string. */
toLowerCase()473     water_uchar toLowerCase() const noexcept { return CharacterFunctions::toLowerCase (operator*()); }
474 
475     /** Parses this string as a 32-bit integer. */
getIntValue32()476     int getIntValue32() const noexcept      { return atoi (data); }
477 
478     /** Parses this string as a 64-bit integer. */
getIntValue64()479     int64 getIntValue64() const noexcept
480     {
481         return atoll (data);
482        #if 0
483         return CharacterFunctions::getIntValue <int64, CharPointer_UTF8> (*this);
484        #endif
485     }
486 
487     /** Parses this string as a floating point double. */
getDoubleValue()488     double getDoubleValue() const noexcept  { return CharacterFunctions::getDoubleValue (*this); }
489 
490     /** Returns the first non-whitespace character in the string. */
findEndOfWhitespace()491     CharPointer_UTF8 findEndOfWhitespace() const noexcept   { return CharacterFunctions::findEndOfWhitespace (*this); }
492 
493     /** Returns true if the given unicode character can be represented in this encoding. */
canRepresent(water_uchar character)494     static bool canRepresent (water_uchar character) noexcept
495     {
496         return ((unsigned int) character) < (unsigned int) 0x10ffff;
497     }
498 
499     /** Returns true if this data contains a valid string in this encoding. */
isValidString(const CharType * dataToTest,int maxBytesToRead)500     static bool isValidString (const CharType* dataToTest, int maxBytesToRead)
501     {
502         while (--maxBytesToRead >= 0 && *dataToTest != 0)
503         {
504             const signed char byte = (signed char) *dataToTest++;
505 
506             if (byte < 0)
507             {
508                 int bit = 0x40;
509                 int numExtraValues = 0;
510 
511                 while ((byte & bit) != 0)
512                 {
513                     if (bit < 8)
514                         return false;
515 
516                     ++numExtraValues;
517                     bit >>= 1;
518 
519                     if (bit == 8 && (numExtraValues > maxBytesToRead
520                                        || *CharPointer_UTF8 (dataToTest - 1) > 0x10ffff))
521                         return false;
522                 }
523 
524                 if (numExtraValues == 0)
525                     return false;
526 
527                 maxBytesToRead -= numExtraValues;
528                 if (maxBytesToRead < 0)
529                     return false;
530 
531                 while (--numExtraValues >= 0)
532                     if ((*dataToTest++ & 0xc0) != 0x80)
533                         return false;
534             }
535         }
536 
537         return true;
538     }
539 
540     /** Atomically swaps this pointer for a new value, returning the previous value. */
atomicSwap(const CharPointer_UTF8 newValue)541     CharPointer_UTF8 atomicSwap (const CharPointer_UTF8 newValue)
542     {
543         return CharPointer_UTF8 (reinterpret_cast<Atomic<CharType*>&> (data).exchange (newValue.data));
544     }
545 
546     /** These values are the byte-order mark (BOM) values for a UTF-8 stream. */
547     enum
548     {
549         byteOrderMark1 = 0xef,
550         byteOrderMark2 = 0xbb,
551         byteOrderMark3 = 0xbf
552     };
553 
554     /** Returns true if the first three bytes in this pointer are the UTF8 byte-order mark (BOM).
555         The pointer must not be null, and must point to at least 3 valid bytes.
556     */
isByteOrderMark(const void * possibleByteOrder)557     static bool isByteOrderMark (const void* possibleByteOrder) noexcept
558     {
559         wassert (possibleByteOrder != nullptr);
560         const uint8* const c = static_cast<const uint8*> (possibleByteOrder);
561 
562         return c[0] == (uint8) byteOrderMark1
563             && c[1] == (uint8) byteOrderMark2
564             && c[2] == (uint8) byteOrderMark3;
565     }
566 
567 private:
568     CharType* data;
569 };
570 
571 }
572 
573 #endif // WATER_CHARPOINTER_UTF8_H_INCLUDED
574