1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef nsCharTraits_h___
8 #define nsCharTraits_h___
9 
10 #include <ctype.h>   // for |EOF|, |WEOF|
11 #include <stdint.h>  // for |uint32_t|
12 #include <string.h>  // for |memcpy|, et al
13 #include "mozilla/MemoryChecking.h"
14 
15 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
16 // particular the standalone software updater. In that case stub out
17 // the macros provided by nsDebug.h which are only usable when linking XPCOM
18 
19 #ifdef NS_NO_XPCOM
20 #  define NS_WARNING(msg)
21 #  define NS_ASSERTION(cond, msg)
22 #  define NS_ERROR(msg)
23 #else
24 #  include "nsDebug.h"  // for NS_ASSERTION
25 #endif
26 
27 /*
28  * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
29  * values.
30  *
31  * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
32  * using "surrogate pairs". These consist of a high surrogate, i.e. a code
33  * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
34  * in the range U+DC00 - U+DFFF, like this:
35  *
36  *  U+D800 U+DC00 =  U+10000
37  *  U+D800 U+DC01 =  U+10001
38  *  ...
39  *  U+DBFF U+DFFE = U+10FFFE
40  *  U+DBFF U+DFFF = U+10FFFF
41  *
42  * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
43  * scalar values and are not well-formed UTF-16 except as high-surrogate /
44  * low-surrogate pairs.
45  */
46 
// First code point beyond the Basic Multilingual Plane.
#define PLANE1_BASE uint32_t(0x00010000)
// High surrogates are in the range 0xD800 -- 0xDBFF, i.e. everything above
// the low ten bits matches 0xD800.
#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) >> 10) == (0xD800 >> 10))
// Low surrogates are in the range 0xDC00 -- 0xDFFF.
#define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) >> 10) == (0xDC00 >> 10))
// Shorthand for NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l).
#define NS_IS_SURROGATE_PAIR(h, l) \
  (NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l))
// One test for the whole range 0xD800 -- 0xDFFF; cheaper than checking
// NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE.
#define IS_SURROGATE(u) ((uint32_t(u) >> 11) == (0xD800 >> 11))

// Every other code unit is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// The two ten-bit payloads do not overlap, so they can be OR'd together
// before the plane-1 offset is added. Neither argument is checked: the macro
// simply masks, so callers must pass a real high/low surrogate pair.
#define SURROGATE_TO_UCS4(h, l) \
  (PLANE1_BASE + (((uint32_t(h) & 0x03FF) << 10) | (uint32_t(l) & 0x03FF)))

// Extract surrogates from a UCS4 char
// Reference: the Unicode standard 4.0, section 3.9
// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0040 and
// 0xD7C0 == 0xD800 - 0x0040,
// ((c - 0x10000) >> 10) + 0xD800 can be simplified to
#define H_SURROGATE(c) char16_t((uint32_t(c) >> 10) + 0xD7C0)
// where it's to be noted that 0xD7C0 is added, not bitwise-OR'd. The sum is
// truncated to 16 bits by the char16_t conversion.

// Since 0x10000 & 0x03FF == 0,
// (c - 0x10000) & 0x03FF == c & 0x03FF so that
// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
#define L_SURROGATE(c) char16_t((uint32_t(c) & 0x03FF) | 0xDC00)

// True when the code point fits in a single UTF-16 code unit.
#define IS_IN_BMP(ucs) (PLANE1_BASE > uint32_t(ucs))
#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

// One past the last Unicode code point (U+10FFFF).
#define UCS_END uint32_t(0x00110000)
// A valid scalar value is below UCS_END and is not a surrogate code point.
#define IS_VALID_CHAR(c) (!((uint32_t(c) >= UCS_END) || IS_SURROGATE(c)))
// Yields |c| unchanged when valid, U+FFFD otherwise. Note that |c| is
// evaluated more than once.
#define ENSURE_VALID_CHAR(c) (!IS_VALID_CHAR(c) ? UCS2_REPLACEMENT_CHAR : (c))
87 
// Primary template: intentionally empty. Character operations are supplied
// only by the explicit specializations for |char16_t| and |char|.
template <typename CharT>
struct nsCharTraits {};
90 
91 template <>
92 struct nsCharTraits<char16_t> {
93   typedef char16_t char_type;
94   typedef uint16_t unsigned_char_type;
95   typedef char incompatible_char_type;
96 
97   static char_type* const sEmptyBuffer;
98 
99   // integer representation of characters:
100   typedef int int_type;
101 
102   static char_type to_char_type(int_type aChar) { return char_type(aChar); }
103 
104   static int_type to_int_type(char_type aChar) {
105     return int_type(static_cast<unsigned_char_type>(aChar));
106   }
107 
108   static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; }
109 
110   // |char_type| comparisons:
111 
112   static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; }
113 
114   static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; }
115 
116   // operations on s[n] arrays:
117 
118   static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) {
119     return static_cast<char_type*>(
120         memmove(aStr1, aStr2, aN * sizeof(char_type)));
121   }
122 
123   static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) {
124     return static_cast<char_type*>(
125         memcpy(aStr1, aStr2, aN * sizeof(char_type)));
126   }
127 
128   static void uninitialize(char_type* aStr, size_t aN) {
129 #ifdef DEBUG
130     memset(aStr, 0xE4, aN * sizeof(char_type));
131 #endif
132     MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type));
133   }
134 
135   static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) {
136     for (char_type* s = aStr1; aN--; ++s, ++aStr2) {
137       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
138       *s = static_cast<char_type>(*aStr2);
139     }
140     return aStr1;
141   }
142 
143   static int compare(const char_type* aStr1, const char_type* aStr2,
144                      size_t aN) {
145     for (; aN--; ++aStr1, ++aStr2) {
146       if (!eq(*aStr1, *aStr2)) {
147         return to_int_type(*aStr1) - to_int_type(*aStr2);
148       }
149     }
150 
151     return 0;
152   }
153 
154   static int compareASCII(const char_type* aStr1, const char* aStr2,
155                           size_t aN) {
156     for (; aN--; ++aStr1, ++aStr2) {
157       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
158       if (!eq_int_type(to_int_type(*aStr1),
159                        to_int_type(static_cast<char_type>(*aStr2)))) {
160         return to_int_type(*aStr1) -
161                to_int_type(static_cast<char_type>(*aStr2));
162       }
163     }
164 
165     return 0;
166   }
167 
168   static bool equalsLatin1(const char_type* aStr1, const char* aStr2,
169                            const size_t aN) {
170     for (size_t i = aN; i > 0; --i, ++aStr1, ++aStr2) {
171       if (*aStr1 != static_cast<char_type>(*aStr2)) {
172         return false;
173       }
174     }
175 
176     return true;
177   }
178 
179   // this version assumes that s2 is null-terminated and s1 has length n.
180   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
181   // we return 1.
182   static int compareASCIINullTerminated(const char_type* aStr1, size_t aN,
183                                         const char* aStr2) {
184     for (; aN--; ++aStr1, ++aStr2) {
185       if (!*aStr2) {
186         return 1;
187       }
188       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
189       if (!eq_int_type(to_int_type(*aStr1),
190                        to_int_type(static_cast<char_type>(*aStr2)))) {
191         return to_int_type(*aStr1) -
192                to_int_type(static_cast<char_type>(*aStr2));
193       }
194     }
195 
196     if (*aStr2) {
197       return -1;
198     }
199 
200     return 0;
201   }
202 
203   /**
204    * Convert c to its lower-case form, but only if c is in the ASCII
205    * range. Otherwise leave it alone.
206    */
207   static char_type ASCIIToLower(char_type aChar) {
208     if (aChar >= 'A' && aChar <= 'Z') {
209       return char_type(aChar + ('a' - 'A'));
210     }
211 
212     return aChar;
213   }
214 
215   static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2,
216                                      size_t aN) {
217     for (; aN--; ++aStr1, ++aStr2) {
218       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
219       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
220                    "Unexpected uppercase character");
221       char_type lower_s1 = ASCIIToLower(*aStr1);
222       if (lower_s1 != static_cast<char_type>(*aStr2)) {
223         return to_int_type(lower_s1) -
224                to_int_type(static_cast<char_type>(*aStr2));
225       }
226     }
227 
228     return 0;
229   }
230 
231   // this version assumes that s2 is null-terminated and s1 has length n.
232   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
233   // we return 1.
234   static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
235                                                    size_t aN,
236                                                    const char* aStr2) {
237     for (; aN--; ++aStr1, ++aStr2) {
238       if (!*aStr2) {
239         return 1;
240       }
241       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
242       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
243                    "Unexpected uppercase character");
244       char_type lower_s1 = ASCIIToLower(*aStr1);
245       if (lower_s1 != static_cast<char_type>(*aStr2)) {
246         return to_int_type(lower_s1) -
247                to_int_type(static_cast<char_type>(*aStr2));
248       }
249     }
250 
251     if (*aStr2) {
252       return -1;
253     }
254 
255     return 0;
256   }
257 
258   static size_t length(const char_type* aStr) {
259     size_t result = 0;
260     while (!eq(*aStr++, char_type(0))) {
261       ++result;
262     }
263     return result;
264   }
265 
266   static const char_type* find(const char_type* aStr, size_t aN,
267                                char_type aChar) {
268     while (aN--) {
269       if (eq(*aStr, aChar)) {
270         return aStr;
271       }
272       ++aStr;
273     }
274 
275     return 0;
276   }
277 };
278 
279 template <>
280 struct nsCharTraits<char> {
281   typedef char char_type;
282   typedef unsigned char unsigned_char_type;
283   typedef char16_t incompatible_char_type;
284 
285   static char_type* const sEmptyBuffer;
286 
287   // integer representation of characters:
288 
289   typedef int int_type;
290 
291   static char_type to_char_type(int_type aChar) { return char_type(aChar); }
292 
293   static int_type to_int_type(char_type aChar) {
294     return int_type(static_cast<unsigned_char_type>(aChar));
295   }
296 
297   static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; }
298 
299   // |char_type| comparisons:
300 
301   static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; }
302 
303   static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; }
304 
305   // operations on s[n] arrays:
306 
307   static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) {
308     return static_cast<char_type*>(
309         memmove(aStr1, aStr2, aN * sizeof(char_type)));
310   }
311 
312   static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) {
313     return static_cast<char_type*>(
314         memcpy(aStr1, aStr2, aN * sizeof(char_type)));
315   }
316 
317   static void uninitialize(char_type* aStr, size_t aN) {
318 #ifdef DEBUG
319     memset(aStr, 0xE4, aN * sizeof(char_type));
320 #endif
321     MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type));
322   }
323 
324   static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) {
325     return copy(aStr1, aStr2, aN);
326   }
327 
328   static int compare(const char_type* aStr1, const char_type* aStr2,
329                      size_t aN) {
330     return memcmp(aStr1, aStr2, aN);
331   }
332 
333   static int compareASCII(const char_type* aStr1, const char* aStr2,
334                           size_t aN) {
335 #ifdef DEBUG
336     for (size_t i = 0; i < aN; ++i) {
337       NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character");
338     }
339 #endif
340     return compare(aStr1, aStr2, aN);
341   }
342 
343   static bool equalsLatin1(const char_type* aStr1, const char* aStr2,
344                            size_t aN) {
345     return memcmp(aStr1, aStr2, aN) == 0;
346   }
347 
348   // this version assumes that s2 is null-terminated and s1 has length n.
349   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
350   // we return 1.
351   static int compareASCIINullTerminated(const char_type* aStr1, size_t aN,
352                                         const char* aStr2) {
353     // can't use strcmp here because we don't want to stop when aStr1
354     // contains a null
355     for (; aN--; ++aStr1, ++aStr2) {
356       if (!*aStr2) {
357         return 1;
358       }
359       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
360       if (*aStr1 != *aStr2) {
361         return to_int_type(*aStr1) - to_int_type(*aStr2);
362       }
363     }
364 
365     if (*aStr2) {
366       return -1;
367     }
368 
369     return 0;
370   }
371 
372   /**
373    * Convert c to its lower-case form, but only if c is ASCII.
374    */
375   static char_type ASCIIToLower(char_type aChar) {
376     if (aChar >= 'A' && aChar <= 'Z') {
377       return char_type(aChar + ('a' - 'A'));
378     }
379 
380     return aChar;
381   }
382 
383   static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2,
384                                      size_t aN) {
385     for (; aN--; ++aStr1, ++aStr2) {
386       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
387       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
388                    "Unexpected uppercase character");
389       char_type lower_s1 = ASCIIToLower(*aStr1);
390       if (lower_s1 != *aStr2) {
391         return to_int_type(lower_s1) - to_int_type(*aStr2);
392       }
393     }
394     return 0;
395   }
396 
397   // this version assumes that s2 is null-terminated and s1 has length n.
398   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
399   // we return 1.
400   static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
401                                                    size_t aN,
402                                                    const char* aStr2) {
403     for (; aN--; ++aStr1, ++aStr2) {
404       if (!*aStr2) {
405         return 1;
406       }
407       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
408       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
409                    "Unexpected uppercase character");
410       char_type lower_s1 = ASCIIToLower(*aStr1);
411       if (lower_s1 != *aStr2) {
412         return to_int_type(lower_s1) - to_int_type(*aStr2);
413       }
414     }
415 
416     if (*aStr2) {
417       return -1;
418     }
419 
420     return 0;
421   }
422 
423   static size_t length(const char_type* aStr) { return strlen(aStr); }
424 
425   static const char_type* find(const char_type* aStr, size_t aN,
426                                char_type aChar) {
427     return reinterpret_cast<const char_type*>(
428         memchr(aStr, to_int_type(aChar), aN));
429   }
430 };
431 
/**
 * Read-side access for iterator classes. The iterator type must supply
 * |difference_type| and |value_type| typedefs together with |get()| and
 * |advance()| members.
 */
template <typename InputIterator>
struct nsCharSourceTraits {
  using difference_type = typename InputIterator::difference_type;

  // Distance between whatever the two iterators' get() calls return.
  static difference_type readable_distance(const InputIterator& aFirst,
                                           const InputIterator& aLast) {
    // assumes single fragment
    difference_type distance = aLast.get() - aFirst.get();
    return distance;
  }

  // Pointer to the element the iterator currently refers to.
  static const typename InputIterator::value_type* read(
      const InputIterator& aIter) {
    return aIter.get();
  }

  // Moves the iterator by aN, delegating to its own advance().
  static void advance(InputIterator& aStr, difference_type aN) {
    aStr.advance(aN);
  }
};
451 
452 template <class CharT>
453 struct nsCharSourceTraits<CharT*> {
454   typedef ptrdiff_t difference_type;
455 
456   static difference_type readable_distance(CharT* aStr) {
457     return nsCharTraits<CharT>::length(aStr);
458   }
459 
460   static difference_type readable_distance(CharT* aFirst, CharT* aLast) {
461     return aLast - aFirst;
462   }
463 
464   static const CharT* read(CharT* aStr) { return aStr; }
465 
466   static void advance(CharT*& aStr, difference_type aN) { aStr += aN; }
467 };
468 
/**
 * Write-side access for output iterator classes. The iterator type must
 * supply a |value_type| typedef and a |write(ptr, count)| member.
 */
template <typename OutputIterator>
struct nsCharSinkTraits {
  // Forwards the aN elements starting at aStr to the iterator's own write().
  static void write(OutputIterator& aIter,
                    const typename OutputIterator::value_type* aStr,
                    size_t aN) {
    aIter.write(aStr, aN);
  }
};
477 
478 template <class CharT>
479 struct nsCharSinkTraits<CharT*> {
480   static void write(CharT*& aIter, const CharT* aStr, size_t aN) {
481     nsCharTraits<CharT>::move(aIter, aStr, aN);
482     aIter += aN;
483   }
484 };
485 
486 #endif  // !defined(nsCharTraits_h___)
487