1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "nsUnicharUtils.h"
7 #include "nsUTF8Utils.h"
8 #include "nsUnicodeProperties.h"
9 #include "mozilla/Likely.h"
10 #include "mozilla/HashFunctions.h"
11 
12 // We map x -> x, except for upper-case letters,
13 // which we map to their lower-case equivalents.
14 static const uint8_t gASCIIToLower[128] = {
15     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
16     0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
17     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
18     0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
19     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
20     0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
21     0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
22     0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
23     0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
24     0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
25     0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
26 };
27 
28 #define IS_ASCII(u) ((u) < 0x80)
29 #define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z'))
30 #define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z'))
31 #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
32 #define IS_ASCII_SPACE(u) (' ' == (u))
33 
34 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
35 // when they're called from within the case-insensitive comparators, so we
36 // define inlined versions.
ToLowerCase_inline(uint32_t aChar)37 static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) {
38   if (IS_ASCII(aChar)) {
39     return gASCIIToLower[aChar];
40   }
41 
42   return mozilla::unicode::GetLowercase(aChar);
43 }
44 
45 static MOZ_ALWAYS_INLINE uint32_t
ToLowerCaseASCII_inline(const uint32_t aChar)46 ToLowerCaseASCII_inline(const uint32_t aChar) {
47   if (IS_ASCII(aChar)) {
48     return gASCIIToLower[aChar];
49   }
50 
51   return aChar;
52 }
53 
ToLowerCase(nsAString & aString)54 void ToLowerCase(nsAString& aString) {
55   char16_t* buf = aString.BeginWriting();
56   ToLowerCase(buf, buf, aString.Length());
57 }
58 
ToLowerCase(const nsAString & aSource,nsAString & aDest)59 void ToLowerCase(const nsAString& aSource, nsAString& aDest) {
60   const char16_t* in = aSource.BeginReading();
61   uint32_t len = aSource.Length();
62 
63   aDest.SetLength(len);
64   char16_t* out = aDest.BeginWriting();
65 
66   ToLowerCase(in, out, len);
67 }
68 
ToLowerCaseASCII(const uint32_t aChar)69 uint32_t ToLowerCaseASCII(const uint32_t aChar) {
70   return ToLowerCaseASCII_inline(aChar);
71 }
72 
ToUpperCase(nsAString & aString)73 void ToUpperCase(nsAString& aString) {
74   char16_t* buf = aString.BeginWriting();
75   ToUpperCase(buf, buf, aString.Length());
76 }
77 
ToUpperCase(const nsAString & aSource,nsAString & aDest)78 void ToUpperCase(const nsAString& aSource, nsAString& aDest) {
79   const char16_t* in = aSource.BeginReading();
80   uint32_t len = aSource.Length();
81 
82   aDest.SetLength(len);
83   char16_t* out = aDest.BeginWriting();
84 
85   ToUpperCase(in, out, len);
86 }
87 
88 #ifdef MOZILLA_INTERNAL_API
89 
operator ()(const char16_t * lhs,const char16_t * rhs,uint32_t lLength,uint32_t rLength) const90 int32_t nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
91                                                       const char16_t* rhs,
92                                                       uint32_t lLength,
93                                                       uint32_t rLength) const {
94   return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength)
95                               : (lLength > rLength) ? 1 : -1;
96 }
97 
operator ()(const char * lhs,const char * rhs,uint32_t lLength,uint32_t rLength) const98 int32_t nsCaseInsensitiveUTF8StringComparator::operator()(
99     const char* lhs, const char* rhs, uint32_t lLength,
100     uint32_t rLength) const {
101   return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
102 }
103 
operator ()(const char16_t * lhs,const char16_t * rhs,uint32_t lLength,uint32_t rLength) const104 int32_t nsASCIICaseInsensitiveStringComparator::operator()(
105     const char16_t* lhs, const char16_t* rhs, uint32_t lLength,
106     uint32_t rLength) const {
107   if (lLength != rLength) {
108     if (lLength > rLength) return 1;
109     return -1;
110   }
111 
112   while (rLength) {
113     // we don't care about surrogates here, because we're only
114     // lowercasing the ASCII range
115     char16_t l = *lhs++;
116     char16_t r = *rhs++;
117     if (l != r) {
118       l = ToLowerCaseASCII_inline(l);
119       r = ToLowerCaseASCII_inline(r);
120 
121       if (l > r)
122         return 1;
123       else if (r > l)
124         return -1;
125     }
126     rLength--;
127   }
128 
129   return 0;
130 }
131 
132 #endif  // MOZILLA_INTERNAL_API
133 
ToLowerCase(uint32_t aChar)134 uint32_t ToLowerCase(uint32_t aChar) { return ToLowerCase_inline(aChar); }
135 
ToLowerCase(const char16_t * aIn,char16_t * aOut,uint32_t aLen)136 void ToLowerCase(const char16_t* aIn, char16_t* aOut, uint32_t aLen) {
137   for (uint32_t i = 0; i < aLen; i++) {
138     uint32_t ch = aIn[i];
139     if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
140         NS_IS_LOW_SURROGATE(aIn[i + 1])) {
141       ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
142       NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
143       aOut[i++] = H_SURROGATE(ch);
144       aOut[i] = L_SURROGATE(ch);
145       continue;
146     }
147     aOut[i] = ToLowerCase(ch);
148   }
149 }
150 
ToUpperCase(uint32_t aChar)151 uint32_t ToUpperCase(uint32_t aChar) {
152   if (IS_ASCII(aChar)) {
153     if (IS_ASCII_LOWER(aChar)) {
154       return aChar - 0x20;
155     }
156     return aChar;
157   }
158 
159   return mozilla::unicode::GetUppercase(aChar);
160 }
161 
ToUpperCase(const char16_t * aIn,char16_t * aOut,uint32_t aLen)162 void ToUpperCase(const char16_t* aIn, char16_t* aOut, uint32_t aLen) {
163   for (uint32_t i = 0; i < aLen; i++) {
164     uint32_t ch = aIn[i];
165     if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
166         NS_IS_LOW_SURROGATE(aIn[i + 1])) {
167       ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
168       NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
169       aOut[i++] = H_SURROGATE(ch);
170       aOut[i] = L_SURROGATE(ch);
171       continue;
172     }
173     aOut[i] = ToUpperCase(ch);
174   }
175 }
176 
ToTitleCase(uint32_t aChar)177 uint32_t ToTitleCase(uint32_t aChar) {
178   if (IS_ASCII(aChar)) {
179     return ToUpperCase(aChar);
180   }
181 
182   return mozilla::unicode::GetTitlecaseForLower(aChar);
183 }
184 
CaseInsensitiveCompare(const char16_t * a,const char16_t * b,uint32_t len)185 int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b,
186                                uint32_t len) {
187   NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
188 
189   if (len) {
190     do {
191       uint32_t c1 = *a++;
192       uint32_t c2 = *b++;
193 
194       // Unfortunately, we need to check for surrogates BEFORE we check
195       // for equality, because we could have identical high surrogates
196       // but non-identical characters, so we can't just skip them
197 
198       // If c1 isn't a surrogate, we don't bother to check c2;
199       // in the case where it _is_ a surrogate, we're definitely going to get
200       // a mismatch, and don't need to interpret and lowercase it
201 
202       if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
203         c1 = SURROGATE_TO_UCS4(c1, *a++);
204         if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
205           c2 = SURROGATE_TO_UCS4(c2, *b++);
206         }
207         // If c2 wasn't a surrogate, decrementing len means we'd stop
208         // short of the end of string b, but that doesn't actually matter
209         // because we're going to find a mismatch and return early
210         --len;
211       }
212 
213       if (c1 != c2) {
214         c1 = ToLowerCase_inline(c1);
215         c2 = ToLowerCase_inline(c2);
216         if (c1 != c2) {
217           if (c1 < c2) {
218             return -1;
219           }
220           return 1;
221         }
222       }
223     } while (--len != 0);
224   }
225   return 0;
226 }
227 
228 // Inlined definition of GetLowerUTF8Codepoint, which we use because we want
229 // to be fast when called from the case-insensitive comparators.
GetLowerUTF8Codepoint_inline(const char * aStr,const char * aEnd,const char ** aNext)230 static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline(
231     const char* aStr, const char* aEnd, const char** aNext) {
232   // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
233   // sign extend.
234   const unsigned char* str = (unsigned char*)aStr;
235 
236   if (UTF8traits::isASCII(str[0])) {
237     // It's ASCII; just convert to lower-case and return it.
238     *aNext = aStr + 1;
239     return gASCIIToLower[*str];
240   }
241   if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
242     // It's a two-byte sequence, so it looks like
243     //  110XXXXX 10XXXXXX.
244     // This is definitely in the BMP, so we can store straightaway into a
245     // uint16_t.
246 
247     uint16_t c;
248     c = (str[0] & 0x1F) << 6;
249     c += (str[1] & 0x3F);
250 
251     // we don't go through ToLowerCase here, because we know this isn't
252     // an ASCII character so the ASCII fast-path there is useless
253     c = mozilla::unicode::GetLowercase(c);
254 
255     *aNext = aStr + 2;
256     return c;
257   }
258   if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
259     // It's a three-byte sequence, so it looks like
260     //  1110XXXX 10XXXXXX 10XXXXXX.
261     // This will just barely fit into 16-bits, so store into a uint16_t.
262 
263     uint16_t c;
264     c = (str[0] & 0x0F) << 12;
265     c += (str[1] & 0x3F) << 6;
266     c += (str[2] & 0x3F);
267 
268     c = mozilla::unicode::GetLowercase(c);
269 
270     *aNext = aStr + 3;
271     return c;
272   }
273   if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
274     // It's a four-byte sequence, so it looks like
275     //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
276 
277     uint32_t c;
278     c = (str[0] & 0x07) << 18;
279     c += (str[1] & 0x3F) << 12;
280     c += (str[2] & 0x3F) << 6;
281     c += (str[3] & 0x3F);
282 
283     c = mozilla::unicode::GetLowercase(c);
284 
285     *aNext = aStr + 4;
286     return c;
287   }
288 
289   // Hm, we don't understand this sequence.
290   return -1;
291 }
292 
GetLowerUTF8Codepoint(const char * aStr,const char * aEnd,const char ** aNext)293 uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd,
294                                const char** aNext) {
295   return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
296 }
297 
CaseInsensitiveCompare(const char * aLeft,const char * aRight,uint32_t aLeftBytes,uint32_t aRightBytes)298 int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight,
299                                uint32_t aLeftBytes, uint32_t aRightBytes) {
300   const char* leftEnd = aLeft + aLeftBytes;
301   const char* rightEnd = aRight + aRightBytes;
302 
303   while (aLeft < leftEnd && aRight < rightEnd) {
304     uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
305     if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) return -1;
306 
307     uint32_t rightChar =
308         GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
309     if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) return -1;
310 
311     // Now leftChar and rightChar are lower-case, so we can compare them.
312     if (leftChar != rightChar) {
313       if (leftChar > rightChar) return 1;
314       return -1;
315     }
316   }
317 
318   // Make sure that if one string is longer than the other we return the
319   // correct result.
320   if (aLeft < leftEnd) return 1;
321   if (aRight < rightEnd) return -1;
322 
323   return 0;
324 }
325 
CaseInsensitiveUTF8CharsEqual(const char * aLeft,const char * aRight,const char * aLeftEnd,const char * aRightEnd,const char ** aLeftNext,const char ** aRightNext,bool * aErr)326 bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
327                                    const char* aLeftEnd, const char* aRightEnd,
328                                    const char** aLeftNext,
329                                    const char** aRightNext, bool* aErr) {
330   NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
331   NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
332   NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
333   NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
334   NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
335 
336   uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext);
337   if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
338     *aErr = true;
339     return false;
340   }
341 
342   uint32_t rightChar =
343       GetLowerUTF8Codepoint_inline(aRight, aRightEnd, aRightNext);
344   if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
345     *aErr = true;
346     return false;
347   }
348 
349   // Can't have an error past this point.
350   *aErr = false;
351 
352   return leftChar == rightChar;
353 }
354 
355 namespace mozilla {
356 
HashUTF8AsUTF16(const char * aUTF8,uint32_t aLength,bool * aErr)357 uint32_t HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr) {
358   uint32_t hash = 0;
359   const char* s = aUTF8;
360   const char* end = aUTF8 + aLength;
361 
362   *aErr = false;
363 
364   while (s < end) {
365     uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
366     if (*aErr) {
367       return 0;
368     }
369 
370     if (ucs4 < PLANE1_BASE) {
371       hash = AddToHash(hash, ucs4);
372     } else {
373       hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
374     }
375   }
376 
377   return hash;
378 }
379 
IsSegmentBreakSkipChar(uint32_t u)380 bool IsSegmentBreakSkipChar(uint32_t u) {
381   return unicode::IsEastAsianWidthFWH(u) &&
382          unicode::GetScriptCode(u) != unicode::Script::HANGUL;
383 }
384 
385 }  // namespace mozilla
386