1 /* 2 ============================================================================== 3 4 This file is part of the Water library. 5 Copyright (c) 2016 ROLI Ltd. 6 Copyright (C) 2017 Filipe Coelho <falktx@falktx.com> 7 8 Permission is granted to use this software under the terms of the ISC license 9 http://www.isc.org/downloads/software-support-policy/isc-license/ 10 11 Permission to use, copy, modify, and/or distribute this software for any 12 purpose with or without fee is hereby granted, provided that the above 13 copyright notice and this permission notice appear in all copies. 14 15 THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH REGARD 16 TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17 FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 18 OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 19 USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 20 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 21 OF THIS SOFTWARE. 22 23 ============================================================================== 24 */ 25 26 #ifndef WATER_CHARPOINTER_UTF8_H_INCLUDED 27 #define WATER_CHARPOINTER_UTF8_H_INCLUDED 28 29 #include "CharacterFunctions.h" 30 #include "../memory/Atomic.h" 31 32 #include "CarlaUtils.hpp" 33 34 namespace water { 35 36 //============================================================================== 37 /** 38 Wraps a pointer to a null-terminated UTF-8 character string, and provides 39 various methods to operate on the data. 40 @see CharPointer_UTF16, CharPointer_UTF32 41 */ 42 class CharPointer_UTF8 43 { 44 public: 45 typedef char CharType; 46 CharPointer_UTF8(const CharType * const rawPointer)47 inline explicit CharPointer_UTF8 (const CharType* const rawPointer) noexcept 48 : data (const_cast<CharType*> (rawPointer)) 49 { 50 } 51 CharPointer_UTF8(const CharPointer_UTF8 & other)52 inline CharPointer_UTF8 (const CharPointer_UTF8& other) noexcept 53 : data (other.data) 54 { 55 } 56 57 inline CharPointer_UTF8& operator= (CharPointer_UTF8 other) noexcept 58 { 59 data = other.data; 60 return *this; 61 } 62 63 inline CharPointer_UTF8& operator= (const CharType* text) noexcept 64 { 65 data = const_cast<CharType*> (text); 66 return *this; 67 } 68 69 /** This is a pointer comparison, it doesn't compare the actual text. */ 70 inline bool operator== (CharPointer_UTF8 other) const noexcept { return data == other.data; } 71 inline bool operator!= (CharPointer_UTF8 other) const noexcept { return data != other.data; } 72 inline bool operator<= (CharPointer_UTF8 other) const noexcept { return data <= other.data; } 73 inline bool operator< (CharPointer_UTF8 other) const noexcept { return data < other.data; } 74 inline bool operator>= (CharPointer_UTF8 other) const noexcept { return data >= other.data; } 75 inline bool operator> (CharPointer_UTF8 other) const noexcept { return data > other.data; } 76 77 /** Returns the address that this pointer is pointing to. */ getAddress()78 inline CharType* getAddress() const noexcept { return data; } 79 80 /** Returns the address that this pointer is pointing to. */ 81 inline operator const CharType*() const noexcept { return data; } 82 83 /** Returns true if this pointer is pointing to a null character. */ isEmpty()84 inline bool isEmpty() const noexcept { return *data == 0; } 85 86 /** Returns the unicode character that this pointer is pointing to. */ 87 water_uchar operator*() const noexcept 88 { 89 const signed char byte = (signed char) *data; 90 91 if (byte >= 0) 92 return (water_uchar) (uint8) byte; 93 94 uint32 n = (uint32) (uint8) byte; 95 uint32 mask = 0x7f; 96 uint32 bit = 0x40; 97 int numExtraValues = 0; 98 99 while ((n & bit) != 0 && bit > 0x8) 100 { 101 mask >>= 1; 102 ++numExtraValues; 103 bit >>= 1; 104 } 105 106 n &= mask; 107 108 for (int i = 1; i <= numExtraValues; ++i) 109 { 110 const uint32 nextByte = (uint32) (uint8) data[i]; 111 112 if ((nextByte & 0xc0) != 0x80) 113 break; 114 115 n <<= 6; 116 n |= (nextByte & 0x3f); 117 } 118 119 return (water_uchar) n; 120 } 121 122 /** Moves this pointer along to the next character in the string. */ 123 CharPointer_UTF8& operator++() noexcept 124 { 125 wassert (*data != 0); // trying to advance past the end of the string? 126 const signed char n = (signed char) *data++; 127 128 if (n < 0) 129 { 130 water_uchar bit = 0x40; 131 132 while ((static_cast<unsigned char>(n) & bit) != 0 && bit > 0x8) 133 { 134 ++data; 135 bit >>= 1; 136 } 137 } 138 139 return *this; 140 } 141 142 /** Moves this pointer back to the previous character in the string. */ 143 CharPointer_UTF8& operator--() noexcept 144 { 145 int count = 0; 146 147 while ((*--data & 0xc0) == 0x80 && ++count < 4) 148 {} 149 150 return *this; 151 } 152 153 /** Returns the character that this pointer is currently pointing to, and then 154 advances the pointer to point to the next character. */ getAndAdvance()155 water_uchar getAndAdvance() noexcept 156 { 157 const signed char byte = (signed char) *data++; 158 159 if (byte >= 0) 160 return (water_uchar) (uint8) byte; 161 162 uint32 n = (uint32) (uint8) byte; 163 uint32 mask = 0x7f; 164 uint32 bit = 0x40; 165 int numExtraValues = 0; 166 167 while ((n & bit) != 0 && bit > 0x8) 168 { 169 mask >>= 1; 170 ++numExtraValues; 171 bit >>= 1; 172 } 173 174 n &= mask; 175 176 while (--numExtraValues >= 0) 177 { 178 const uint32 nextByte = (uint32) (uint8) *data; 179 180 if ((nextByte & 0xc0) != 0x80) 181 break; 182 183 ++data; 184 n <<= 6; 185 n |= (nextByte & 0x3f); 186 } 187 188 return (water_uchar) n; 189 } 190 191 /** Moves this pointer along to the next character in the string. */ 192 CharPointer_UTF8 operator++ (int) noexcept 193 { 194 CharPointer_UTF8 temp (*this); 195 ++*this; 196 return temp; 197 } 198 199 /** Moves this pointer forwards by the specified number of characters. */ 200 void operator+= (int numToSkip) noexcept 201 { 202 if (numToSkip < 0) 203 { 204 while (++numToSkip <= 0) 205 --*this; 206 } 207 else 208 { 209 while (--numToSkip >= 0) 210 ++*this; 211 } 212 } 213 214 /** Moves this pointer backwards by the specified number of characters. */ 215 void operator-= (int numToSkip) noexcept 216 { 217 operator+= (-numToSkip); 218 } 219 220 /** Returns the character at a given character index from the start of the string. */ 221 water_uchar operator[] (int characterIndex) const noexcept 222 { 223 CharPointer_UTF8 p (*this); 224 p += characterIndex; 225 return *p; 226 } 227 228 /** Returns a pointer which is moved forwards from this one by the specified number of characters. */ 229 CharPointer_UTF8 operator+ (int numToSkip) const noexcept 230 { 231 CharPointer_UTF8 p (*this); 232 p += numToSkip; 233 return p; 234 } 235 236 /** Returns a pointer which is moved backwards from this one by the specified number of characters. */ 237 CharPointer_UTF8 operator- (int numToSkip) const noexcept 238 { 239 CharPointer_UTF8 p (*this); 240 p += -numToSkip; 241 return p; 242 } 243 244 /** Returns the number of characters in this string. */ length()245 size_t length() const noexcept 246 { 247 const CharType* d = data; 248 size_t count = 0; 249 250 for (;;) 251 { 252 const uint32 n = (uint32) (uint8) *d++; 253 254 if ((n & 0x80) != 0) 255 { 256 while ((*d & 0xc0) == 0x80) 257 ++d; 258 } 259 else if (n == 0) 260 break; 261 262 ++count; 263 } 264 265 return count; 266 } 267 268 /** Returns the number of characters in this string, or the given value, whichever is lower. */ lengthUpTo(const size_t maxCharsToCount)269 size_t lengthUpTo (const size_t maxCharsToCount) const noexcept 270 { 271 return CharacterFunctions::lengthUpTo (*this, maxCharsToCount); 272 } 273 274 /** Returns the number of characters in this string, or up to the given end pointer, whichever is lower. */ lengthUpTo(const CharPointer_UTF8 end)275 size_t lengthUpTo (const CharPointer_UTF8 end) const noexcept 276 { 277 return CharacterFunctions::lengthUpTo (*this, end); 278 } 279 280 /** Returns the number of bytes that are used to represent this string. 281 This includes the terminating null character. 282 */ sizeInBytes()283 size_t sizeInBytes() const noexcept 284 { 285 wassert (data != nullptr); 286 return strlen (data) + 1; 287 } 288 289 /** Returns the number of bytes that would be needed to represent the given 290 unicode character in this encoding format. 291 */ getBytesRequiredFor(const water_uchar charToWrite)292 static size_t getBytesRequiredFor (const water_uchar charToWrite) noexcept 293 { 294 size_t num = 1; 295 const uint32 c = (uint32) charToWrite; 296 297 if (c >= 0x80) 298 { 299 ++num; 300 if (c >= 0x800) 301 { 302 ++num; 303 if (c >= 0x10000) 304 ++num; 305 } 306 } 307 308 return num; 309 } 310 311 /** Returns the number of bytes that would be needed to represent the given 312 string in this encoding format. 313 The value returned does NOT include the terminating null character. 314 */ 315 template <class CharPointer> getBytesRequiredFor(CharPointer text)316 static size_t getBytesRequiredFor (CharPointer text) noexcept 317 { 318 size_t count = 0; 319 320 while (water_uchar n = text.getAndAdvance()) 321 count += getBytesRequiredFor (n); 322 323 return count; 324 } 325 326 /** Returns a pointer to the null character that terminates this string. */ findTerminatingNull()327 CharPointer_UTF8 findTerminatingNull() const noexcept 328 { 329 return CharPointer_UTF8 (data + strlen (data)); 330 } 331 332 /** Writes a unicode character to this string, and advances this pointer to point to the next position. */ write(const water_uchar charToWrite)333 void write (const water_uchar charToWrite) noexcept 334 { 335 const uint32 c = (uint32) charToWrite; 336 337 if (c >= 0x80) 338 { 339 int numExtraBytes = 1; 340 if (c >= 0x800) 341 { 342 ++numExtraBytes; 343 if (c >= 0x10000) 344 ++numExtraBytes; 345 } 346 347 *data++ = (CharType) ((uint32) (0xff << (7 - numExtraBytes)) | (c >> (numExtraBytes * 6))); 348 349 while (--numExtraBytes >= 0) 350 *data++ = (CharType) (0x80 | (0x3f & (c >> (numExtraBytes * 6)))); 351 } 352 else 353 { 354 *data++ = (CharType) c; 355 } 356 } 357 358 /** Writes a null character to this string (leaving the pointer's position unchanged). */ writeNull()359 inline void writeNull() const noexcept 360 { 361 *data = 0; 362 } 363 364 /** Copies a source string to this pointer, advancing this pointer as it goes. */ 365 template <typename CharPointer> writeAll(const CharPointer src)366 void writeAll (const CharPointer src) noexcept 367 { 368 CharacterFunctions::copyAll (*this, src); 369 } 370 371 /** Copies a source string to this pointer, advancing this pointer as it goes. */ writeAll(const CharPointer_UTF8 src)372 void writeAll (const CharPointer_UTF8 src) noexcept 373 { 374 const CharType* s = src.data; 375 376 while ((*data = *s) != 0) 377 { 378 ++data; 379 ++s; 380 } 381 } 382 383 /** Copies a source string to this pointer, advancing this pointer as it goes. 384 The maxDestBytes parameter specifies the maximum number of bytes that can be written 385 to the destination buffer before stopping. 386 */ 387 template <typename CharPointer> writeWithDestByteLimit(const CharPointer src,const size_t maxDestBytes)388 size_t writeWithDestByteLimit (const CharPointer src, const size_t maxDestBytes) noexcept 389 { 390 return CharacterFunctions::copyWithDestByteLimit (*this, src, maxDestBytes); 391 } 392 393 /** Copies a source string to this pointer, advancing this pointer as it goes. 394 The maxChars parameter specifies the maximum number of characters that can be 395 written to the destination buffer before stopping (including the terminating null). 396 */ 397 template <typename CharPointer> writeWithCharLimit(const CharPointer src,const int maxChars)398 void writeWithCharLimit (const CharPointer src, const int maxChars) noexcept 399 { 400 CharacterFunctions::copyWithCharLimit (*this, src, maxChars); 401 } 402 403 /** Compares this string with another one. */ 404 template <typename CharPointer> compare(const CharPointer other)405 int compare (const CharPointer other) const noexcept 406 { 407 return CharacterFunctions::compare (*this, other); 408 } 409 410 /** Compares this string with another one, up to a specified number of characters. */ 411 template <typename CharPointer> compareUpTo(const CharPointer other,const int maxChars)412 int compareUpTo (const CharPointer other, const int maxChars) const noexcept 413 { 414 return CharacterFunctions::compareUpTo (*this, other, maxChars); 415 } 416 417 /** Compares this string with another one. */ 418 template <typename CharPointer> compareIgnoreCase(const CharPointer other)419 int compareIgnoreCase (const CharPointer other) const noexcept 420 { 421 return CharacterFunctions::compareIgnoreCase (*this, other); 422 } 423 424 /** Compares this string with another one. */ compareIgnoreCase(const CharPointer_UTF8 other)425 int compareIgnoreCase (const CharPointer_UTF8 other) const noexcept 426 { 427 return CharacterFunctions::compareIgnoreCase (*this, other); 428 } 429 430 /** Compares this string with another one, up to a specified number of characters. */ 431 template <typename CharPointer> compareIgnoreCaseUpTo(const CharPointer other,const int maxChars)432 int compareIgnoreCaseUpTo (const CharPointer other, const int maxChars) const noexcept 433 { 434 return CharacterFunctions::compareIgnoreCaseUpTo (*this, other, maxChars); 435 } 436 437 /** Returns the character index of a substring, or -1 if it isn't found. */ 438 template <typename CharPointer> indexOf(const CharPointer stringToFind)439 int indexOf (const CharPointer stringToFind) const noexcept 440 { 441 return CharacterFunctions::indexOf (*this, stringToFind); 442 } 443 444 /** Returns the character index of a unicode character, or -1 if it isn't found. */ indexOf(const water_uchar charToFind)445 int indexOf (const water_uchar charToFind) const noexcept 446 { 447 return CharacterFunctions::indexOfChar (*this, charToFind); 448 } 449 450 /** Returns the character index of a unicode character, or -1 if it isn't found. */ indexOf(const water_uchar charToFind,const bool ignoreCase)451 int indexOf (const water_uchar charToFind, const bool ignoreCase) const noexcept 452 { 453 return ignoreCase ? CharacterFunctions::indexOfCharIgnoreCase (*this, charToFind) 454 : CharacterFunctions::indexOfChar (*this, charToFind); 455 } 456 457 /** Returns true if the first character of this string is whitespace. */ isWhitespace()458 bool isWhitespace() const noexcept { const CharType c = *data; return c == ' ' || (c <= 13 && c >= 9); } 459 /** Returns true if the first character of this string is a digit. */ isDigit()460 bool isDigit() const noexcept { const CharType c = *data; return c >= '0' && c <= '9'; } 461 /** Returns true if the first character of this string is a letter. */ isLetter()462 bool isLetter() const noexcept { return CharacterFunctions::isLetter (operator*()) != 0; } 463 /** Returns true if the first character of this string is a letter or digit. */ isLetterOrDigit()464 bool isLetterOrDigit() const noexcept { return CharacterFunctions::isLetterOrDigit (operator*()) != 0; } 465 /** Returns true if the first character of this string is upper-case. */ isUpperCase()466 bool isUpperCase() const noexcept { return CharacterFunctions::isUpperCase (operator*()) != 0; } 467 /** Returns true if the first character of this string is lower-case. */ isLowerCase()468 bool isLowerCase() const noexcept { return CharacterFunctions::isLowerCase (operator*()) != 0; } 469 470 /** Returns an upper-case version of the first character of this string. */ toUpperCase()471 water_uchar toUpperCase() const noexcept { return CharacterFunctions::toUpperCase (operator*()); } 472 /** Returns a lower-case version of the first character of this string. */ toLowerCase()473 water_uchar toLowerCase() const noexcept { return CharacterFunctions::toLowerCase (operator*()); } 474 475 /** Parses this string as a 32-bit integer. */ getIntValue32()476 int getIntValue32() const noexcept { return atoi (data); } 477 478 /** Parses this string as a 64-bit integer. */ getIntValue64()479 int64 getIntValue64() const noexcept 480 { 481 return atoll (data); 482 #if 0 483 return CharacterFunctions::getIntValue <int64, CharPointer_UTF8> (*this); 484 #endif 485 } 486 487 /** Parses this string as a floating point double. */ getDoubleValue()488 double getDoubleValue() const noexcept { return CharacterFunctions::getDoubleValue (*this); } 489 490 /** Returns the first non-whitespace character in the string. */ findEndOfWhitespace()491 CharPointer_UTF8 findEndOfWhitespace() const noexcept { return CharacterFunctions::findEndOfWhitespace (*this); } 492 493 /** Returns true if the given unicode character can be represented in this encoding. */ canRepresent(water_uchar character)494 static bool canRepresent (water_uchar character) noexcept 495 { 496 return ((unsigned int) character) < (unsigned int) 0x10ffff; 497 } 498 499 /** Returns true if this data contains a valid string in this encoding. */ isValidString(const CharType * dataToTest,int maxBytesToRead)500 static bool isValidString (const CharType* dataToTest, int maxBytesToRead) 501 { 502 while (--maxBytesToRead >= 0 && *dataToTest != 0) 503 { 504 const signed char byte = (signed char) *dataToTest++; 505 506 if (byte < 0) 507 { 508 int bit = 0x40; 509 int numExtraValues = 0; 510 511 while ((byte & bit) != 0) 512 { 513 if (bit < 8) 514 return false; 515 516 ++numExtraValues; 517 bit >>= 1; 518 519 if (bit == 8 && (numExtraValues > maxBytesToRead 520 || *CharPointer_UTF8 (dataToTest - 1) > 0x10ffff)) 521 return false; 522 } 523 524 if (numExtraValues == 0) 525 return false; 526 527 maxBytesToRead -= numExtraValues; 528 if (maxBytesToRead < 0) 529 return false; 530 531 while (--numExtraValues >= 0) 532 if ((*dataToTest++ & 0xc0) != 0x80) 533 return false; 534 } 535 } 536 537 return true; 538 } 539 540 /** Atomically swaps this pointer for a new value, returning the previous value. */ atomicSwap(const CharPointer_UTF8 newValue)541 CharPointer_UTF8 atomicSwap (const CharPointer_UTF8 newValue) 542 { 543 return CharPointer_UTF8 (reinterpret_cast<Atomic<CharType*>&> (data).exchange (newValue.data)); 544 } 545 546 /** These values are the byte-order mark (BOM) values for a UTF-8 stream. */ 547 enum 548 { 549 byteOrderMark1 = 0xef, 550 byteOrderMark2 = 0xbb, 551 byteOrderMark3 = 0xbf 552 }; 553 554 /** Returns true if the first three bytes in this pointer are the UTF8 byte-order mark (BOM). 555 The pointer must not be null, and must point to at least 3 valid bytes. 556 */ isByteOrderMark(const void * possibleByteOrder)557 static bool isByteOrderMark (const void* possibleByteOrder) noexcept 558 { 559 wassert (possibleByteOrder != nullptr); 560 const uint8* const c = static_cast<const uint8*> (possibleByteOrder); 561 562 return c[0] == (uint8) byteOrderMark1 563 && c[1] == (uint8) byteOrderMark2 564 && c[2] == (uint8) byteOrderMark3; 565 } 566 567 private: 568 CharType* data; 569 }; 570 571 } 572 573 #endif // WATER_CHARPOINTER_UTF8_H_INCLUDED 574