1 /* 2 ============================================================================== 3 4 This file is part of the JUCE library. 5 Copyright (c) 2020 - Raw Material Software Limited 6 7 JUCE is an open source library subject to commercial or open-source 8 licensing. 9 10 The code included in this file is provided under the terms of the ISC license 11 http://www.isc.org/downloads/software-support-policy/isc-license. Permission 12 To use, copy, modify, and/or distribute this software for any purpose with or 13 without fee is hereby granted provided that the above copyright notice and 14 this permission notice appear in all copies. 15 16 JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER 17 EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE 18 DISCLAIMED. 19 20 ============================================================================== 21 */ 22 23 namespace juce 24 { 25 26 //============================================================================== 27 /** 28 Wraps a pointer to a null-terminated UTF-8 character string, and provides 29 various methods to operate on the data. 30 @see CharPointer_UTF16, CharPointer_UTF32 31 32 @tags{Core} 33 */ 34 class CharPointer_UTF8 final 35 { 36 public: 37 using CharType = char; 38 CharPointer_UTF8(const CharType * rawPointer)39 explicit CharPointer_UTF8 (const CharType* rawPointer) noexcept 40 : data (const_cast<CharType*> (rawPointer)) 41 { 42 } 43 44 CharPointer_UTF8 (const CharPointer_UTF8& other) = default; 45 46 CharPointer_UTF8 operator= (CharPointer_UTF8 other) noexcept 47 { 48 data = other.data; 49 return *this; 50 } 51 52 CharPointer_UTF8 operator= (const CharType* text) noexcept 53 { 54 data = const_cast<CharType*> (text); 55 return *this; 56 } 57 58 /** This is a pointer comparison, it doesn't compare the actual text. */ 59 bool operator== (CharPointer_UTF8 other) const noexcept { return data == other.data; } 60 bool operator!= (CharPointer_UTF8 other) const noexcept { return data != other.data; } 61 bool operator<= (CharPointer_UTF8 other) const noexcept { return data <= other.data; } 62 bool operator< (CharPointer_UTF8 other) const noexcept { return data < other.data; } 63 bool operator>= (CharPointer_UTF8 other) const noexcept { return data >= other.data; } 64 bool operator> (CharPointer_UTF8 other) const noexcept { return data > other.data; } 65 66 /** Returns the address that this pointer is pointing to. */ getAddress()67 CharType* getAddress() const noexcept { return data; } 68 69 /** Returns the address that this pointer is pointing to. */ 70 operator const CharType*() const noexcept { return data; } 71 72 /** Returns true if this pointer is pointing to a null character. */ isEmpty()73 bool isEmpty() const noexcept { return *data == 0; } 74 75 /** Returns true if this pointer is not pointing to a null character. */ isNotEmpty()76 bool isNotEmpty() const noexcept { return *data != 0; } 77 78 /** Returns the unicode character that this pointer is pointing to. */ 79 juce_wchar operator*() const noexcept 80 { 81 auto byte = (signed char) *data; 82 83 if (byte >= 0) 84 return (juce_wchar) (uint8) byte; 85 86 uint32 n = (uint32) (uint8) byte; 87 uint32 mask = 0x7f; 88 uint32 bit = 0x40; 89 int numExtraValues = 0; 90 91 while ((n & bit) != 0 && bit > 0x8) 92 { 93 mask >>= 1; 94 ++numExtraValues; 95 bit >>= 1; 96 } 97 98 n &= mask; 99 100 for (int i = 1; i <= numExtraValues; ++i) 101 { 102 auto nextByte = (uint32) (uint8) data[i]; 103 104 if ((nextByte & 0xc0) != 0x80) 105 break; 106 107 n <<= 6; 108 n |= (nextByte & 0x3f); 109 } 110 111 return (juce_wchar) n; 112 } 113 114 /** Moves this pointer along to the next character in the string. */ 115 CharPointer_UTF8& operator++() noexcept 116 { 117 jassert (*data != 0); // trying to advance past the end of the string? 118 auto n = (signed char) *data++; 119 120 if (n < 0) 121 { 122 uint8 bit = 0x40; 123 124 while ((static_cast<uint8> (n) & bit) != 0 && bit > 0x8) 125 { 126 ++data; 127 bit = static_cast<uint8> (bit >> 1); 128 } 129 } 130 131 return *this; 132 } 133 134 /** Moves this pointer back to the previous character in the string. */ 135 CharPointer_UTF8 operator--() noexcept 136 { 137 int count = 0; 138 139 while ((*--data & 0xc0) == 0x80 && ++count < 4) 140 {} 141 142 return *this; 143 } 144 145 /** Returns the character that this pointer is currently pointing to, and then 146 advances the pointer to point to the next character. */ getAndAdvance()147 juce_wchar getAndAdvance() noexcept 148 { 149 auto byte = (signed char) *data++; 150 151 if (byte >= 0) 152 return (juce_wchar) (uint8) byte; 153 154 uint32 n = (uint32) (uint8) byte; 155 uint32 mask = 0x7f; 156 uint32 bit = 0x40; 157 int numExtraValues = 0; 158 159 while ((n & bit) != 0 && bit > 0x8) 160 { 161 mask >>= 1; 162 ++numExtraValues; 163 bit >>= 1; 164 } 165 166 n &= mask; 167 168 while (--numExtraValues >= 0) 169 { 170 auto nextByte = (uint32) (uint8) *data; 171 172 if ((nextByte & 0xc0) != 0x80) 173 break; 174 175 ++data; 176 n <<= 6; 177 n |= (nextByte & 0x3f); 178 } 179 180 return (juce_wchar) n; 181 } 182 183 /** Moves this pointer along to the next character in the string. */ 184 CharPointer_UTF8 operator++ (int) noexcept 185 { 186 CharPointer_UTF8 temp (*this); 187 ++*this; 188 return temp; 189 } 190 191 /** Moves this pointer forwards by the specified number of characters. */ 192 void operator+= (int numToSkip) noexcept 193 { 194 if (numToSkip < 0) 195 { 196 while (++numToSkip <= 0) 197 --*this; 198 } 199 else 200 { 201 while (--numToSkip >= 0) 202 ++*this; 203 } 204 } 205 206 /** Moves this pointer backwards by the specified number of characters. */ 207 void operator-= (int numToSkip) noexcept 208 { 209 operator+= (-numToSkip); 210 } 211 212 /** Returns the character at a given character index from the start of the string. */ 213 juce_wchar operator[] (int characterIndex) const noexcept 214 { 215 auto p (*this); 216 p += characterIndex; 217 return *p; 218 } 219 220 /** Returns a pointer which is moved forwards from this one by the specified number of characters. */ 221 CharPointer_UTF8 operator+ (int numToSkip) const noexcept 222 { 223 auto p (*this); 224 p += numToSkip; 225 return p; 226 } 227 228 /** Returns a pointer which is moved backwards from this one by the specified number of characters. */ 229 CharPointer_UTF8 operator- (int numToSkip) const noexcept 230 { 231 auto p (*this); 232 p += -numToSkip; 233 return p; 234 } 235 236 /** Returns the number of characters in this string. */ length()237 size_t length() const noexcept 238 { 239 auto* d = data; 240 size_t count = 0; 241 242 for (;;) 243 { 244 auto n = (uint32) (uint8) *d++; 245 246 if ((n & 0x80) != 0) 247 { 248 while ((*d & 0xc0) == 0x80) 249 ++d; 250 } 251 else if (n == 0) 252 break; 253 254 ++count; 255 } 256 257 return count; 258 } 259 260 /** Returns the number of characters in this string, or the given value, whichever is lower. */ lengthUpTo(const size_t maxCharsToCount)261 size_t lengthUpTo (const size_t maxCharsToCount) const noexcept 262 { 263 return CharacterFunctions::lengthUpTo (*this, maxCharsToCount); 264 } 265 266 /** Returns the number of characters in this string, or up to the given end pointer, whichever is lower. */ lengthUpTo(const CharPointer_UTF8 end)267 size_t lengthUpTo (const CharPointer_UTF8 end) const noexcept 268 { 269 return CharacterFunctions::lengthUpTo (*this, end); 270 } 271 272 /** Returns the number of bytes that are used to represent this string. 273 This includes the terminating null character. 274 */ sizeInBytes()275 size_t sizeInBytes() const noexcept 276 { 277 jassert (data != nullptr); 278 return strlen (data) + 1; 279 } 280 281 /** Returns the number of bytes that would be needed to represent the given 282 unicode character in this encoding format. 283 */ getBytesRequiredFor(const juce_wchar charToWrite)284 static size_t getBytesRequiredFor (const juce_wchar charToWrite) noexcept 285 { 286 size_t num = 1; 287 auto c = (uint32) charToWrite; 288 289 if (c >= 0x80) 290 { 291 ++num; 292 if (c >= 0x800) 293 { 294 ++num; 295 if (c >= 0x10000) 296 ++num; 297 } 298 } 299 300 return num; 301 } 302 303 /** Returns the number of bytes that would be needed to represent the given 304 string in this encoding format. 305 The value returned does NOT include the terminating null character. 306 */ 307 template <class CharPointer> getBytesRequiredFor(CharPointer text)308 static size_t getBytesRequiredFor (CharPointer text) noexcept 309 { 310 size_t count = 0; 311 312 while (auto n = text.getAndAdvance()) 313 count += getBytesRequiredFor (n); 314 315 return count; 316 } 317 318 /** Returns a pointer to the null character that terminates this string. */ findTerminatingNull()319 CharPointer_UTF8 findTerminatingNull() const noexcept 320 { 321 return CharPointer_UTF8 (data + strlen (data)); 322 } 323 324 /** Writes a unicode character to this string, and advances this pointer to point to the next position. */ write(const juce_wchar charToWrite)325 void write (const juce_wchar charToWrite) noexcept 326 { 327 auto c = (uint32) charToWrite; 328 329 if (c >= 0x80) 330 { 331 int numExtraBytes = 1; 332 if (c >= 0x800) 333 { 334 ++numExtraBytes; 335 if (c >= 0x10000) 336 ++numExtraBytes; 337 } 338 339 *data++ = (CharType) ((uint32) (0xff << (7 - numExtraBytes)) | (c >> (numExtraBytes * 6))); 340 341 while (--numExtraBytes >= 0) 342 *data++ = (CharType) (0x80 | (0x3f & (c >> (numExtraBytes * 6)))); 343 } 344 else 345 { 346 *data++ = (CharType) c; 347 } 348 } 349 350 /** Writes a null character to this string (leaving the pointer's position unchanged). */ writeNull()351 void writeNull() const noexcept 352 { 353 *data = 0; 354 } 355 356 /** Copies a source string to this pointer, advancing this pointer as it goes. */ 357 template <typename CharPointer> writeAll(const CharPointer src)358 void writeAll (const CharPointer src) noexcept 359 { 360 CharacterFunctions::copyAll (*this, src); 361 } 362 363 /** Copies a source string to this pointer, advancing this pointer as it goes. */ writeAll(const CharPointer_UTF8 src)364 void writeAll (const CharPointer_UTF8 src) noexcept 365 { 366 auto* s = src.data; 367 368 while ((*data = *s) != 0) 369 { 370 ++data; 371 ++s; 372 } 373 } 374 375 /** Copies a source string to this pointer, advancing this pointer as it goes. 376 The maxDestBytes parameter specifies the maximum number of bytes that can be written 377 to the destination buffer before stopping. 378 */ 379 template <typename CharPointer> writeWithDestByteLimit(const CharPointer src,const size_t maxDestBytes)380 size_t writeWithDestByteLimit (const CharPointer src, const size_t maxDestBytes) noexcept 381 { 382 return CharacterFunctions::copyWithDestByteLimit (*this, src, maxDestBytes); 383 } 384 385 /** Copies a source string to this pointer, advancing this pointer as it goes. 386 The maxChars parameter specifies the maximum number of characters that can be 387 written to the destination buffer before stopping (including the terminating null). 388 */ 389 template <typename CharPointer> writeWithCharLimit(const CharPointer src,const int maxChars)390 void writeWithCharLimit (const CharPointer src, const int maxChars) noexcept 391 { 392 CharacterFunctions::copyWithCharLimit (*this, src, maxChars); 393 } 394 395 /** Compares this string with another one. */ 396 template <typename CharPointer> compare(const CharPointer other)397 int compare (const CharPointer other) const noexcept 398 { 399 return CharacterFunctions::compare (*this, other); 400 } 401 402 /** Compares this string with another one, up to a specified number of characters. */ 403 template <typename CharPointer> compareUpTo(const CharPointer other,const int maxChars)404 int compareUpTo (const CharPointer other, const int maxChars) const noexcept 405 { 406 return CharacterFunctions::compareUpTo (*this, other, maxChars); 407 } 408 409 /** Compares this string with another one. */ 410 template <typename CharPointer> compareIgnoreCase(const CharPointer other)411 int compareIgnoreCase (const CharPointer other) const noexcept 412 { 413 return CharacterFunctions::compareIgnoreCase (*this, other); 414 } 415 416 /** Compares this string with another one. */ compareIgnoreCase(const CharPointer_UTF8 other)417 int compareIgnoreCase (const CharPointer_UTF8 other) const noexcept 418 { 419 return CharacterFunctions::compareIgnoreCase (*this, other); 420 } 421 422 /** Compares this string with another one, up to a specified number of characters. */ 423 template <typename CharPointer> compareIgnoreCaseUpTo(const CharPointer other,const int maxChars)424 int compareIgnoreCaseUpTo (const CharPointer other, const int maxChars) const noexcept 425 { 426 return CharacterFunctions::compareIgnoreCaseUpTo (*this, other, maxChars); 427 } 428 429 /** Returns the character index of a substring, or -1 if it isn't found. */ 430 template <typename CharPointer> indexOf(const CharPointer stringToFind)431 int indexOf (const CharPointer stringToFind) const noexcept 432 { 433 return CharacterFunctions::indexOf (*this, stringToFind); 434 } 435 436 /** Returns the character index of a unicode character, or -1 if it isn't found. */ indexOf(const juce_wchar charToFind)437 int indexOf (const juce_wchar charToFind) const noexcept 438 { 439 return CharacterFunctions::indexOfChar (*this, charToFind); 440 } 441 442 /** Returns the character index of a unicode character, or -1 if it isn't found. */ indexOf(const juce_wchar charToFind,const bool ignoreCase)443 int indexOf (const juce_wchar charToFind, const bool ignoreCase) const noexcept 444 { 445 return ignoreCase ? CharacterFunctions::indexOfCharIgnoreCase (*this, charToFind) 446 : CharacterFunctions::indexOfChar (*this, charToFind); 447 } 448 449 /** Returns true if the first character of this string is whitespace. */ isWhitespace()450 bool isWhitespace() const noexcept { const CharType c = *data; return c == ' ' || (c <= 13 && c >= 9); } 451 /** Returns true if the first character of this string is a digit. */ isDigit()452 bool isDigit() const noexcept { const CharType c = *data; return c >= '0' && c <= '9'; } 453 /** Returns true if the first character of this string is a letter. */ isLetter()454 bool isLetter() const noexcept { return CharacterFunctions::isLetter (operator*()) != 0; } 455 /** Returns true if the first character of this string is a letter or digit. */ isLetterOrDigit()456 bool isLetterOrDigit() const noexcept { return CharacterFunctions::isLetterOrDigit (operator*()) != 0; } 457 /** Returns true if the first character of this string is upper-case. */ isUpperCase()458 bool isUpperCase() const noexcept { return CharacterFunctions::isUpperCase (operator*()) != 0; } 459 /** Returns true if the first character of this string is lower-case. */ isLowerCase()460 bool isLowerCase() const noexcept { return CharacterFunctions::isLowerCase (operator*()) != 0; } 461 462 /** Returns an upper-case version of the first character of this string. */ toUpperCase()463 juce_wchar toUpperCase() const noexcept { return CharacterFunctions::toUpperCase (operator*()); } 464 /** Returns a lower-case version of the first character of this string. */ toLowerCase()465 juce_wchar toLowerCase() const noexcept { return CharacterFunctions::toLowerCase (operator*()); } 466 467 /** Parses this string as a 32-bit integer. */ getIntValue32()468 int getIntValue32() const noexcept { return atoi (data); } 469 470 /** Parses this string as a 64-bit integer. */ getIntValue64()471 int64 getIntValue64() const noexcept 472 { 473 #if JUCE_WINDOWS && ! JUCE_MINGW 474 return _atoi64 (data); 475 #else 476 return atoll (data); 477 #endif 478 } 479 480 /** Parses this string as a floating point double. */ getDoubleValue()481 double getDoubleValue() const noexcept { return CharacterFunctions::getDoubleValue (*this); } 482 483 /** Returns the first non-whitespace character in the string. */ findEndOfWhitespace()484 CharPointer_UTF8 findEndOfWhitespace() const noexcept { return CharacterFunctions::findEndOfWhitespace (*this); } 485 486 /** Returns true if the given unicode character can be represented in this encoding. */ canRepresent(juce_wchar character)487 static bool canRepresent (juce_wchar character) noexcept 488 { 489 return ((uint32) character) < (uint32) 0x10ffff; 490 } 491 492 /** Returns true if this data contains a valid string in this encoding. */ isValidString(const CharType * dataToTest,int maxBytesToRead)493 static bool isValidString (const CharType* dataToTest, int maxBytesToRead) 494 { 495 while (--maxBytesToRead >= 0 && *dataToTest != 0) 496 { 497 auto byte = (signed char) *dataToTest++; 498 499 if (byte < 0) 500 { 501 int bit = 0x40; 502 int numExtraValues = 0; 503 504 while ((byte & bit) != 0) 505 { 506 if (bit < 8) 507 return false; 508 509 ++numExtraValues; 510 bit >>= 1; 511 512 if (bit == 8 && (numExtraValues > maxBytesToRead 513 || *CharPointer_UTF8 (dataToTest - 1) > 0x10ffff)) 514 return false; 515 } 516 517 if (numExtraValues == 0) 518 return false; 519 520 maxBytesToRead -= numExtraValues; 521 if (maxBytesToRead < 0) 522 return false; 523 524 while (--numExtraValues >= 0) 525 if ((*dataToTest++ & 0xc0) != 0x80) 526 return false; 527 } 528 } 529 530 return true; 531 } 532 533 /** Atomically swaps this pointer for a new value, returning the previous value. */ atomicSwap(const CharPointer_UTF8 newValue)534 CharPointer_UTF8 atomicSwap (const CharPointer_UTF8 newValue) 535 { 536 return CharPointer_UTF8 (reinterpret_cast<Atomic<CharType*>&> (data).exchange (newValue.data)); 537 } 538 539 /** These values are the byte-order mark (BOM) values for a UTF-8 stream. */ 540 enum 541 { 542 byteOrderMark1 = 0xef, 543 byteOrderMark2 = 0xbb, 544 byteOrderMark3 = 0xbf 545 }; 546 547 /** Returns true if the first three bytes in this pointer are the UTF8 byte-order mark (BOM). 548 The pointer must not be null, and must point to at least 3 valid bytes. 549 */ isByteOrderMark(const void * possibleByteOrder)550 static bool isByteOrderMark (const void* possibleByteOrder) noexcept 551 { 552 jassert (possibleByteOrder != nullptr); 553 auto c = static_cast<const uint8*> (possibleByteOrder); 554 555 return c[0] == (uint8) byteOrderMark1 556 && c[1] == (uint8) byteOrderMark2 557 && c[2] == (uint8) byteOrderMark3; 558 } 559 560 private: 561 CharType* data; 562 }; 563 564 } // namespace juce 565