1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim:set ts=4 sw=2 sts=2 et cindent: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "nsUnicodeProperties.h"
8 #include "nsUnicodePropertyData.cpp"
9 
10 #include "mozilla/ArrayUtils.h"
11 #include "mozilla/HashTable.h"
12 #include "nsCharTraits.h"
13 
14 #include "BaseChars.h"
15 
// One past the last code point of the Basic Multilingual Plane (U+FFFF);
// any code point >= this value lives in a supplementary plane.
#define UNICODE_BMP_LIMIT 0x10000
// One past the highest Unicode code point (U+10FFFF).
#define UNICODE_LIMIT 0x110000
18 
GetCharProps2(uint32_t aCh)19 const nsCharProps2& GetCharProps2(uint32_t aCh) {
20   if (aCh < UNICODE_BMP_LIMIT) {
21     return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
22                            [aCh & ((1 << kCharProp2CharBits) - 1)];
23   }
24   if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
25     return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
26                                            [(aCh & 0xffff) >>
27                                             kCharProp2CharBits]]
28                            [aCh & ((1 << kCharProp2CharBits) - 1)];
29   }
30 
31   MOZ_ASSERT_UNREACHABLE(
32       "Getting CharProps for codepoint outside Unicode "
33       "range");
34 
35   // Default values for unassigned
36   using namespace mozilla::unicode;
37   static const nsCharProps2 undefined = {
38       VERTICAL_ORIENTATION_R,
39       0  // IdentifierType
40   };
41   return undefined;
42 }
43 
44 namespace mozilla {
45 
46 namespace unicode {
47 
/*
To store properties for a million Unicode codepoints compactly, we use
a three-level array structure, with the Unicode values considered as
three elements: Plane, Page, and Char.

Space optimization happens because multiple Planes can refer to the same
Page array, and multiple Pages can refer to the same Char array holding
the actual values. In practice, most of the higher planes are empty and
thus share the same data; and within the BMP, there are also many pages
that repeat the same data for any given property.

Plane is usually zero, so we skip a lookup in this case, and require
that the Plane 0 pages are always the first set of entries in the Page
array.

The division of the remaining 16 bits into Page and Char fields is
adjusted for each property (by experiment using the generation tool)
to provide the most compact storage, depending on the distribution
of values.
*/
68 
69 const nsUGenCategory sDetailedToGeneralCategory[] = {
70     // clang-format off
71   /*
72    * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
73    * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
74    */
75   /* CONTROL */             nsUGenCategory::kOther,
76   /* FORMAT */              nsUGenCategory::kOther,
77   /* UNASSIGNED */          nsUGenCategory::kOther,
78   /* PRIVATE_USE */         nsUGenCategory::kOther,
79   /* SURROGATE */           nsUGenCategory::kOther,
80   /* LOWERCASE_LETTER */    nsUGenCategory::kLetter,
81   /* MODIFIER_LETTER */     nsUGenCategory::kLetter,
82   /* OTHER_LETTER */        nsUGenCategory::kLetter,
83   /* TITLECASE_LETTER */    nsUGenCategory::kLetter,
84   /* UPPERCASE_LETTER */    nsUGenCategory::kLetter,
85   /* COMBINING_MARK */      nsUGenCategory::kMark,
86   /* ENCLOSING_MARK */      nsUGenCategory::kMark,
87   /* NON_SPACING_MARK */    nsUGenCategory::kMark,
88   /* DECIMAL_NUMBER */      nsUGenCategory::kNumber,
89   /* LETTER_NUMBER */       nsUGenCategory::kNumber,
90   /* OTHER_NUMBER */        nsUGenCategory::kNumber,
91   /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
92   /* DASH_PUNCTUATION */    nsUGenCategory::kPunctuation,
93   /* CLOSE_PUNCTUATION */   nsUGenCategory::kPunctuation,
94   /* FINAL_PUNCTUATION */   nsUGenCategory::kPunctuation,
95   /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
96   /* OTHER_PUNCTUATION */   nsUGenCategory::kPunctuation,
97   /* OPEN_PUNCTUATION */    nsUGenCategory::kPunctuation,
98   /* CURRENCY_SYMBOL */     nsUGenCategory::kSymbol,
99   /* MODIFIER_SYMBOL */     nsUGenCategory::kSymbol,
100   /* MATH_SYMBOL */         nsUGenCategory::kSymbol,
101   /* OTHER_SYMBOL */        nsUGenCategory::kSymbol,
102   /* LINE_SEPARATOR */      nsUGenCategory::kSeparator,
103   /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
104   /* SPACE_SEPARATOR */     nsUGenCategory::kSeparator
105     // clang-format on
106 };
107 
108 const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
109     // clang-format off
110   HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
111   HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
112   HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
113   HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
114   HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
115   HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
116   HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
117   HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
118   HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
119   HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
120   HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
121   HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
122   HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
123   HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
124   HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
125   HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
126   HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
127   HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
128   HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
129   HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
130   HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
131   HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
132   HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
133   HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
134   HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
135   HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
136   HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
137   HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
138   HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
139   HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
140     // clang-format on
141 };
142 
// Defines `uint32_t Get<prefix_>(uint32_t aCh)`, a lookup in a two-level
// (Page, Char) table that only covers the BMP.
//
// The expansion expects these names to be in scope:
//   k<prefix_>CharBits - number of low bits of aCh that index within a page
//   s<prefix_>Pages    - indexed by the remaining high bits, yields a page
//   s<prefix_>Values   - indexed by [page][low bits], yields the mapping
//
// A stored value of 0 means "no mapping": aCh is returned unchanged, as it
// is for every code point at or above UNICODE_BMP_LIMIT.
#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_)             \
  uint32_t Get##prefix_(uint32_t aCh) {                         \
    if (aCh >= UNICODE_BMP_LIMIT) {                             \
      return aCh;                                               \
    }                                                           \
    auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
    auto index = aCh & ((1 << k##prefix_##CharBits) - 1);       \
    uint32_t v = s##prefix_##Values[page][index];               \
    return v ? v : aCh;                                         \
  }
153 
154 // full-width mappings only exist for BMP characters; all others are
155 // returned unchanged
156 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)157 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
158 
159 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
160   return (
161       (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
162        aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
163       (aCh >= 0x200c && aCh <= 0x200d) ||    // ZWJ, ZWNJ
164       (aCh >= 0xff9e && aCh <= 0xff9f) ||    // katakana sound marks
165       (aCh >= 0x1F3FB && aCh <= 0x1F3FF) ||  // fitzpatrick skin tone modifiers
166       (aCh >= 0xe0020 && aCh <= 0xe007f));   // emoji (flag) tag characters
167 }
168 
169 enum HSType {
170   HST_NONE = U_HST_NOT_APPLICABLE,
171   HST_L = U_HST_LEADING_JAMO,
172   HST_V = U_HST_VOWEL_JAMO,
173   HST_T = U_HST_TRAILING_JAMO,
174   HST_LV = U_HST_LV_SYLLABLE,
175   HST_LVT = U_HST_LVT_SYLLABLE
176 };
177 
GetHangulSyllableType(uint32_t aCh)178 static HSType GetHangulSyllableType(uint32_t aCh) {
179   return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
180 }
181 
Next()182 void ClusterIterator::Next() {
183   if (AtEnd()) {
184     NS_WARNING("ClusterIterator has already reached the end");
185     return;
186   }
187 
188   uint32_t ch = *mPos++;
189 
190   if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
191     ch = SURROGATE_TO_UCS4(ch, *mPos++);
192   } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
193              (ch >= 0xac00 && ch <= 0xd7ff)) {
194     // Handle conjoining Jamo that make Hangul syllables
195     HSType hangulState = GetHangulSyllableType(ch);
196     while (mPos < mLimit) {
197       ch = *mPos;
198       HSType hangulType = GetHangulSyllableType(ch);
199       switch (hangulType) {
200         case HST_L:
201         case HST_LV:
202         case HST_LVT:
203           if (hangulState == HST_L) {
204             hangulState = hangulType;
205             mPos++;
206             continue;
207           }
208           break;
209         case HST_V:
210           if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
211               (hangulState != HST_LVT)) {
212             hangulState = hangulType;
213             mPos++;
214             continue;
215           }
216           break;
217         case HST_T:
218           if (hangulState != HST_NONE && hangulState != HST_L) {
219             hangulState = hangulType;
220             mPos++;
221             continue;
222           }
223           break;
224         default:
225           break;
226       }
227       break;
228     }
229   }
230 
231   const uint32_t kVS16 = 0xfe0f;
232   const uint32_t kZWJ = 0x200d;
233   // UTF-16 surrogate values for Fitzpatrick type modifiers
234   const uint32_t kFitzpatrickHigh = 0xD83C;
235   const uint32_t kFitzpatrickLowFirst = 0xDFFB;
236   const uint32_t kFitzpatrickLowLast = 0xDFFF;
237 
238   bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
239                      (GetEmojiPresentation(ch) == TextDefault &&
240                       ((mPos < mLimit && *mPos == kVS16) ||
241                        (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
242                         *(mPos + 1) >= kFitzpatrickLowFirst &&
243                         *(mPos + 1) <= kFitzpatrickLowLast)));
244   bool prevWasZwj = false;
245 
246   while (mPos < mLimit) {
247     ch = *mPos;
248     size_t chLen = 1;
249 
250     // Check for surrogate pairs; note that isolated surrogates will just
251     // be treated as generic (non-cluster-extending) characters here,
252     // which is fine for cluster-iterating purposes
253     if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
254       ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
255       chLen = 2;
256     }
257 
258     bool extendCluster =
259         IsClusterExtender(ch) ||
260         (baseIsEmoji && prevWasZwj &&
261          ((GetEmojiPresentation(ch) == EmojiDefault) ||
262           (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
263            *(mPos + chLen) == kVS16)));
264     if (!extendCluster) {
265       break;
266     }
267 
268     prevWasZwj = (ch == kZWJ);
269     mPos += chLen;
270   }
271 
272   NS_ASSERTION(mText < mPos && mPos <= mLimit,
273                "ClusterIterator::Next has overshot the string!");
274 }
275 
Next()276 void ClusterReverseIterator::Next() {
277   if (AtEnd()) {
278     NS_WARNING("ClusterReverseIterator has already reached the end");
279     return;
280   }
281 
282   uint32_t ch;
283   do {
284     ch = *--mPos;
285 
286     if (mPos > mLimit && NS_IS_SURROGATE_PAIR(*(mPos - 1), ch)) {
287       ch = SURROGATE_TO_UCS4(*--mPos, ch);
288     }
289 
290     if (!IsClusterExtender(ch)) {
291       break;
292     }
293   } while (mPos > mLimit);
294 
295   // XXX May need to handle conjoining Jamo
296 
297   NS_ASSERTION(mPos >= mLimit,
298                "ClusterReverseIterator::Next has overshot the string!");
299 }
300 
CountGraphemeClusters(const char16_t * aText,uint32_t aLength)301 uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
302   ClusterIterator iter(aText, aLength);
303   uint32_t result = 0;
304   while (!iter.AtEnd()) {
305     ++result;
306     iter.Next();
307   }
308   return result;
309 }
310 
GetNaked(uint32_t aCh)311 uint32_t GetNaked(uint32_t aCh) {
312   uint32_t index = aCh >> 8;
313   if (index >= MOZ_ARRAY_LENGTH(BASE_CHAR_MAPPING_BLOCK_INDEX)) {
314     return aCh;
315   }
316   index = BASE_CHAR_MAPPING_BLOCK_INDEX[index];
317   if (index == 0xff) {
318     return aCh;
319   }
320   const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
321   uint8_t lo = aCh & 0xff;
322   if (lo < block.mFirst || lo > block.mLast) {
323     return aCh;
324   }
325   return (aCh & 0xffff0000) |
326          BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
327 }
328 
329 }  // end namespace unicode
330 
331 }  // end namespace mozilla
332