1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef nsCharTraits_h___ 8 #define nsCharTraits_h___ 9 10 #include <ctype.h> // for |EOF|, |WEOF| 11 #include <stdint.h> // for |uint32_t| 12 #include <string.h> // for |memcpy|, et al 13 #include "mozilla/MemoryChecking.h" 14 15 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in 16 // particular the standalone software updater. In that case stub out 17 // the macros provided by nsDebug.h which are only usable when linking XPCOM 18 19 #ifdef NS_NO_XPCOM 20 # define NS_WARNING(msg) 21 # define NS_ASSERTION(cond, msg) 22 # define NS_ERROR(msg) 23 #else 24 # include "nsDebug.h" // for NS_ASSERTION 25 #endif 26 27 /* 28 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar 29 * values. 30 * 31 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by 32 * using "surrogate pairs". These consist of a high surrogate, i.e. a code 33 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point 34 * in the range U+DC00 - U+DFFF, like this: 35 * 36 * U+D800 U+DC00 = U+10000 37 * U+D800 U+DC01 = U+10001 38 * ... 39 * U+DBFF U+DFFE = U+10FFFE 40 * U+DBFF U+DFFF = U+10FFFF 41 * 42 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode 43 * scalar values and are not well-formed UTF-16 except as high-surrogate / 44 * low-surrogate pairs. 45 */ 46 47 #define PLANE1_BASE uint32_t(0x00010000) 48 // High surrogates are in the range 0xD800 -- OxDBFF 49 #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800) 50 // Low surrogates are in the range 0xDC00 -- 0xDFFF 51 #define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00) 52 // Easier to type than NS_IS_HIGH_SURROGATE && NS_IS_LOW_SURROGATE 53 #define NS_IS_SURROGATE_PAIR(h, l) \ 54 (NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l)) 55 // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE 56 #define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800) 57 58 // Everything else is not a surrogate: 0x000 -- 0xD7FF, 0xE000 -- 0xFFFF 59 60 // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00) 61 // I wonder whether we could somehow assert that H is a high surrogate 62 // and L is a low surrogate 63 #define SURROGATE_TO_UCS4(h, l) \ 64 (((uint32_t(h) & 0x03FF) << 10) + (uint32_t(l) & 0x03FF) + PLANE1_BASE) 65 66 // Extract surrogates from a UCS4 char 67 // Reference: the Unicode standard 4.0, section 3.9 68 // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and 69 // 0xD7C0 == 0xD800 - 0x0080, 70 // ((c - 0x10000) >> 10) + 0xD800 can be simplified to 71 #define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + char16_t(0xD7C0)) 72 // where it's to be noted that 0xD7C0 is not bitwise-OR'd 73 // but added. 74 75 // Since 0x10000 & 0x03FF == 0, 76 // (c - 0x10000) & 0x03FF == c & 0x03FF so that 77 // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to 78 #define L_SURROGATE(c) \ 79 char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | char16_t(0xDC00)) 80 81 #define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE) 82 #define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD) 83 84 #define UCS_END uint32_t(0x00110000) 85 #define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c)) 86 #define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR) 87 88 template <class CharT> 89 struct nsCharTraits {}; 90 91 template <> 92 struct nsCharTraits<char16_t> { 93 typedef char16_t char_type; 94 typedef uint16_t unsigned_char_type; 95 typedef char incompatible_char_type; 96 97 static char_type* const sEmptyBuffer; 98 99 // integer representation of characters: 100 typedef int int_type; 101 102 static char_type to_char_type(int_type aChar) { return char_type(aChar); } 103 104 static int_type to_int_type(char_type aChar) { 105 return int_type(static_cast<unsigned_char_type>(aChar)); 106 } 107 108 static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; } 109 110 // |char_type| comparisons: 111 112 static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; } 113 114 static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; } 115 116 // operations on s[n] arrays: 117 118 static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) { 119 return static_cast<char_type*>( 120 memmove(aStr1, aStr2, aN * sizeof(char_type))); 121 } 122 123 static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) { 124 return static_cast<char_type*>( 125 memcpy(aStr1, aStr2, aN * sizeof(char_type))); 126 } 127 128 static void uninitialize(char_type* aStr, size_t aN) { 129 #ifdef DEBUG 130 memset(aStr, 0xE4, aN * sizeof(char_type)); 131 #endif 132 MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type)); 133 } 134 135 static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) { 136 for (char_type* s = aStr1; aN--; ++s, ++aStr2) { 137 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 138 *s = static_cast<char_type>(*aStr2); 139 } 140 return aStr1; 141 } 142 143 static int compare(const char_type* aStr1, const char_type* aStr2, 144 size_t aN) { 145 for (; aN--; ++aStr1, ++aStr2) { 146 if (!eq(*aStr1, *aStr2)) { 147 return to_int_type(*aStr1) - to_int_type(*aStr2); 148 } 149 } 150 151 return 0; 152 } 153 154 static int compareASCII(const char_type* aStr1, const char* aStr2, 155 size_t aN) { 156 for (; aN--; ++aStr1, ++aStr2) { 157 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 158 if (!eq_int_type(to_int_type(*aStr1), 159 to_int_type(static_cast<char_type>(*aStr2)))) { 160 return to_int_type(*aStr1) - 161 to_int_type(static_cast<char_type>(*aStr2)); 162 } 163 } 164 165 return 0; 166 } 167 168 static bool equalsLatin1(const char_type* aStr1, const char* aStr2, 169 const size_t aN) { 170 for (size_t i = aN; i > 0; --i, ++aStr1, ++aStr2) { 171 if (*aStr1 != static_cast<char_type>(*aStr2)) { 172 return false; 173 } 174 } 175 176 return true; 177 } 178 179 // this version assumes that s2 is null-terminated and s1 has length n. 180 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 181 // we return 1. 182 static int compareASCIINullTerminated(const char_type* aStr1, size_t aN, 183 const char* aStr2) { 184 for (; aN--; ++aStr1, ++aStr2) { 185 if (!*aStr2) { 186 return 1; 187 } 188 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 189 if (!eq_int_type(to_int_type(*aStr1), 190 to_int_type(static_cast<char_type>(*aStr2)))) { 191 return to_int_type(*aStr1) - 192 to_int_type(static_cast<char_type>(*aStr2)); 193 } 194 } 195 196 if (*aStr2) { 197 return -1; 198 } 199 200 return 0; 201 } 202 203 /** 204 * Convert c to its lower-case form, but only if c is in the ASCII 205 * range. Otherwise leave it alone. 206 */ 207 static char_type ASCIIToLower(char_type aChar) { 208 if (aChar >= 'A' && aChar <= 'Z') { 209 return char_type(aChar + ('a' - 'A')); 210 } 211 212 return aChar; 213 } 214 215 static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, 216 size_t aN) { 217 for (; aN--; ++aStr1, ++aStr2) { 218 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 219 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), 220 "Unexpected uppercase character"); 221 char_type lower_s1 = ASCIIToLower(*aStr1); 222 if (lower_s1 != static_cast<char_type>(*aStr2)) { 223 return to_int_type(lower_s1) - 224 to_int_type(static_cast<char_type>(*aStr2)); 225 } 226 } 227 228 return 0; 229 } 230 231 // this version assumes that s2 is null-terminated and s1 has length n. 232 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 233 // we return 1. 234 static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1, 235 size_t aN, 236 const char* aStr2) { 237 for (; aN--; ++aStr1, ++aStr2) { 238 if (!*aStr2) { 239 return 1; 240 } 241 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 242 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), 243 "Unexpected uppercase character"); 244 char_type lower_s1 = ASCIIToLower(*aStr1); 245 if (lower_s1 != static_cast<char_type>(*aStr2)) { 246 return to_int_type(lower_s1) - 247 to_int_type(static_cast<char_type>(*aStr2)); 248 } 249 } 250 251 if (*aStr2) { 252 return -1; 253 } 254 255 return 0; 256 } 257 258 static size_t length(const char_type* aStr) { 259 size_t result = 0; 260 while (!eq(*aStr++, char_type(0))) { 261 ++result; 262 } 263 return result; 264 } 265 266 static const char_type* find(const char_type* aStr, size_t aN, 267 char_type aChar) { 268 while (aN--) { 269 if (eq(*aStr, aChar)) { 270 return aStr; 271 } 272 ++aStr; 273 } 274 275 return 0; 276 } 277 }; 278 279 template <> 280 struct nsCharTraits<char> { 281 typedef char char_type; 282 typedef unsigned char unsigned_char_type; 283 typedef char16_t incompatible_char_type; 284 285 static char_type* const sEmptyBuffer; 286 287 // integer representation of characters: 288 289 typedef int int_type; 290 291 static char_type to_char_type(int_type aChar) { return char_type(aChar); } 292 293 static int_type to_int_type(char_type aChar) { 294 return int_type(static_cast<unsigned_char_type>(aChar)); 295 } 296 297 static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; } 298 299 // |char_type| comparisons: 300 301 static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; } 302 303 static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; } 304 305 // operations on s[n] arrays: 306 307 static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) { 308 return static_cast<char_type*>( 309 memmove(aStr1, aStr2, aN * sizeof(char_type))); 310 } 311 312 static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) { 313 return static_cast<char_type*>( 314 memcpy(aStr1, aStr2, aN * sizeof(char_type))); 315 } 316 317 static void uninitialize(char_type* aStr, size_t aN) { 318 #ifdef DEBUG 319 memset(aStr, 0xE4, aN * sizeof(char_type)); 320 #endif 321 MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type)); 322 } 323 324 static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) { 325 return copy(aStr1, aStr2, aN); 326 } 327 328 static int compare(const char_type* aStr1, const char_type* aStr2, 329 size_t aN) { 330 return memcmp(aStr1, aStr2, aN); 331 } 332 333 static int compareASCII(const char_type* aStr1, const char* aStr2, 334 size_t aN) { 335 #ifdef DEBUG 336 for (size_t i = 0; i < aN; ++i) { 337 NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character"); 338 } 339 #endif 340 return compare(aStr1, aStr2, aN); 341 } 342 343 static bool equalsLatin1(const char_type* aStr1, const char* aStr2, 344 size_t aN) { 345 return memcmp(aStr1, aStr2, aN) == 0; 346 } 347 348 // this version assumes that s2 is null-terminated and s1 has length n. 349 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 350 // we return 1. 351 static int compareASCIINullTerminated(const char_type* aStr1, size_t aN, 352 const char* aStr2) { 353 // can't use strcmp here because we don't want to stop when aStr1 354 // contains a null 355 for (; aN--; ++aStr1, ++aStr2) { 356 if (!*aStr2) { 357 return 1; 358 } 359 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 360 if (*aStr1 != *aStr2) { 361 return to_int_type(*aStr1) - to_int_type(*aStr2); 362 } 363 } 364 365 if (*aStr2) { 366 return -1; 367 } 368 369 return 0; 370 } 371 372 /** 373 * Convert c to its lower-case form, but only if c is ASCII. 374 */ 375 static char_type ASCIIToLower(char_type aChar) { 376 if (aChar >= 'A' && aChar <= 'Z') { 377 return char_type(aChar + ('a' - 'A')); 378 } 379 380 return aChar; 381 } 382 383 static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, 384 size_t aN) { 385 for (; aN--; ++aStr1, ++aStr2) { 386 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 387 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), 388 "Unexpected uppercase character"); 389 char_type lower_s1 = ASCIIToLower(*aStr1); 390 if (lower_s1 != *aStr2) { 391 return to_int_type(lower_s1) - to_int_type(*aStr2); 392 } 393 } 394 return 0; 395 } 396 397 // this version assumes that s2 is null-terminated and s1 has length n. 398 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 399 // we return 1. 400 static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1, 401 size_t aN, 402 const char* aStr2) { 403 for (; aN--; ++aStr1, ++aStr2) { 404 if (!*aStr2) { 405 return 1; 406 } 407 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); 408 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), 409 "Unexpected uppercase character"); 410 char_type lower_s1 = ASCIIToLower(*aStr1); 411 if (lower_s1 != *aStr2) { 412 return to_int_type(lower_s1) - to_int_type(*aStr2); 413 } 414 } 415 416 if (*aStr2) { 417 return -1; 418 } 419 420 return 0; 421 } 422 423 static size_t length(const char_type* aStr) { return strlen(aStr); } 424 425 static const char_type* find(const char_type* aStr, size_t aN, 426 char_type aChar) { 427 return reinterpret_cast<const char_type*>( 428 memchr(aStr, to_int_type(aChar), aN)); 429 } 430 }; 431 432 template <class InputIterator> 433 struct nsCharSourceTraits { 434 typedef typename InputIterator::difference_type difference_type; 435 436 static difference_type readable_distance(const InputIterator& aFirst, 437 const InputIterator& aLast) { 438 // assumes single fragment 439 return aLast.get() - aFirst.get(); 440 } 441 442 static const typename InputIterator::value_type* read( 443 const InputIterator& aIter) { 444 return aIter.get(); 445 } 446 447 static void advance(InputIterator& aStr, difference_type aN) { 448 aStr.advance(aN); 449 } 450 }; 451 452 template <class CharT> 453 struct nsCharSourceTraits<CharT*> { 454 typedef ptrdiff_t difference_type; 455 456 static difference_type readable_distance(CharT* aStr) { 457 return nsCharTraits<CharT>::length(aStr); 458 } 459 460 static difference_type readable_distance(CharT* aFirst, CharT* aLast) { 461 return aLast - aFirst; 462 } 463 464 static const CharT* read(CharT* aStr) { return aStr; } 465 466 static void advance(CharT*& aStr, difference_type aN) { aStr += aN; } 467 }; 468 469 template <class OutputIterator> 470 struct nsCharSinkTraits { 471 static void write(OutputIterator& aIter, 472 const typename OutputIterator::value_type* aStr, 473 size_t aN) { 474 aIter.write(aStr, aN); 475 } 476 }; 477 478 template <class CharT> 479 struct nsCharSinkTraits<CharT*> { 480 static void write(CharT*& aIter, const CharT* aStr, size_t aN) { 481 nsCharTraits<CharT>::move(aIter, aStr, aN); 482 aIter += aN; 483 } 484 }; 485 486 #endif // !defined(nsCharTraits_h___) 487