1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* vim:set ts=4 sw=4 sts=4 et cindent: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "nsUnicodeProperties.h"
8 #include "nsUnicodePropertyData.cpp"
9 
10 #include "mozilla/ArrayUtils.h"
11 #include "nsCharTraits.h"
12 
13 #define UNICODE_BMP_LIMIT 0x10000
14 #define UNICODE_LIMIT 0x110000
15 
GetCharProps2(uint32_t aCh)16 const nsCharProps2& GetCharProps2(uint32_t aCh) {
17   if (aCh < UNICODE_BMP_LIMIT) {
18     return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
19                            [aCh & ((1 << kCharProp2CharBits) - 1)];
20   }
21   if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
22     return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
23                                            [(aCh & 0xffff) >>
24                                             kCharProp2CharBits]]
25                            [aCh & ((1 << kCharProp2CharBits) - 1)];
26   }
27 
28   NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
29   // Default values for unassigned
30   using namespace mozilla::unicode;
31   static const nsCharProps2 undefined = {
32       VERTICAL_ORIENTATION_R,
33       0  // IdentifierType
34   };
35   return undefined;
36 }
37 
38 namespace mozilla {
39 
40 namespace unicode {
41 
42 /*
43 To store properties for a million Unicode codepoints compactly, we use
44 a three-level array structure, with the Unicode values considered as
45 three elements: Plane, Page, and Char.
46 
47 Space optimization happens because multiple Planes can refer to the same
48 Page array, and multiple Pages can refer to the same Char array holding
49 the actual values. In practice, most of the higher planes are empty and
50 thus share the same data; and within the BMP, there are also many pages
51 that repeat the same data for any given property.
52 
53 Plane is usually zero, so we skip a lookup in this case, and require
54 that the Plane 0 pages are always the first set of entries in the Page
55 array.
56 
57 The division of the remaining 16 bits into Page and Char fields is
58 adjusted for each property (by experiment using the generation tool)
59 to provide the most compact storage, depending on the distribution
60 of values.
61 */
62 
63 const nsUGenCategory sDetailedToGeneralCategory[] = {
64     // clang-format off
65   /*
66    * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
67    * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
68    */
69   /* CONTROL */             nsUGenCategory::kOther,
70   /* FORMAT */              nsUGenCategory::kOther,
71   /* UNASSIGNED */          nsUGenCategory::kOther,
72   /* PRIVATE_USE */         nsUGenCategory::kOther,
73   /* SURROGATE */           nsUGenCategory::kOther,
74   /* LOWERCASE_LETTER */    nsUGenCategory::kLetter,
75   /* MODIFIER_LETTER */     nsUGenCategory::kLetter,
76   /* OTHER_LETTER */        nsUGenCategory::kLetter,
77   /* TITLECASE_LETTER */    nsUGenCategory::kLetter,
78   /* UPPERCASE_LETTER */    nsUGenCategory::kLetter,
79   /* COMBINING_MARK */      nsUGenCategory::kMark,
80   /* ENCLOSING_MARK */      nsUGenCategory::kMark,
81   /* NON_SPACING_MARK */    nsUGenCategory::kMark,
82   /* DECIMAL_NUMBER */      nsUGenCategory::kNumber,
83   /* LETTER_NUMBER */       nsUGenCategory::kNumber,
84   /* OTHER_NUMBER */        nsUGenCategory::kNumber,
85   /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
86   /* DASH_PUNCTUATION */    nsUGenCategory::kPunctuation,
87   /* CLOSE_PUNCTUATION */   nsUGenCategory::kPunctuation,
88   /* FINAL_PUNCTUATION */   nsUGenCategory::kPunctuation,
89   /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
90   /* OTHER_PUNCTUATION */   nsUGenCategory::kPunctuation,
91   /* OPEN_PUNCTUATION */    nsUGenCategory::kPunctuation,
92   /* CURRENCY_SYMBOL */     nsUGenCategory::kSymbol,
93   /* MODIFIER_SYMBOL */     nsUGenCategory::kSymbol,
94   /* MATH_SYMBOL */         nsUGenCategory::kSymbol,
95   /* OTHER_SYMBOL */        nsUGenCategory::kSymbol,
96   /* LINE_SEPARATOR */      nsUGenCategory::kSeparator,
97   /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
98   /* SPACE_SEPARATOR */     nsUGenCategory::kSeparator
99     // clang-format on
100 };
101 
102 const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
103     // clang-format off
104   HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
105   HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
106   HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
107   HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
108   HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
109   HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
110   HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
111   HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
112   HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
113   HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
114   HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
115   HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
116   HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
117   HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
118   HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
119   HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
120   HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
121   HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
122   HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
123   HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
124   HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
125   HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
126   HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
127   HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
128   HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
129   HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
130   HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
131   HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
132   HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
133   HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
134     // clang-format on
135 };
136 
137 #define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_)             \
138   uint32_t Get##prefix_(uint32_t aCh) {                         \
139     if (aCh >= UNICODE_BMP_LIMIT) {                             \
140       return aCh;                                               \
141     }                                                           \
142     auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
143     auto index = aCh & ((1 << k##prefix_##CharBits) - 1);       \
144     uint32_t v = s##prefix_##Values[page][index];               \
145     return v ? v : aCh;                                         \
146   }
147 
148 // full-width mappings only exist for BMP characters; all others are
149 // returned unchanged
150 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)151 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
152 
153 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
154   return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
155            aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
156           (aCh >= 0x200c && aCh <= 0x200d) ||  // ZWJ, ZWNJ
157           (aCh >= 0xff9e && aCh <= 0xff9f));   // katakana sound marks
158 }
159 
160 enum HSType {
161   HST_NONE = U_HST_NOT_APPLICABLE,
162   HST_L = U_HST_LEADING_JAMO,
163   HST_V = U_HST_VOWEL_JAMO,
164   HST_T = U_HST_TRAILING_JAMO,
165   HST_LV = U_HST_LV_SYLLABLE,
166   HST_LVT = U_HST_LVT_SYLLABLE
167 };
168 
GetHangulSyllableType(uint32_t aCh)169 static HSType GetHangulSyllableType(uint32_t aCh) {
170   return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
171 }
172 
Next()173 void ClusterIterator::Next() {
174   if (AtEnd()) {
175     NS_WARNING("ClusterIterator has already reached the end");
176     return;
177   }
178 
179   uint32_t ch = *mPos++;
180 
181   if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && NS_IS_LOW_SURROGATE(*mPos)) {
182     ch = SURROGATE_TO_UCS4(ch, *mPos++);
183   } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
184              (ch >= 0xac00 && ch <= 0xd7ff)) {
185     // Handle conjoining Jamo that make Hangul syllables
186     HSType hangulState = GetHangulSyllableType(ch);
187     while (mPos < mLimit) {
188       ch = *mPos;
189       HSType hangulType = GetHangulSyllableType(ch);
190       switch (hangulType) {
191         case HST_L:
192         case HST_LV:
193         case HST_LVT:
194           if (hangulState == HST_L) {
195             hangulState = hangulType;
196             mPos++;
197             continue;
198           }
199           break;
200         case HST_V:
201           if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
202               (hangulState != HST_LVT)) {
203             hangulState = hangulType;
204             mPos++;
205             continue;
206           }
207           break;
208         case HST_T:
209           if (hangulState != HST_NONE && hangulState != HST_L) {
210             hangulState = hangulType;
211             mPos++;
212             continue;
213           }
214           break;
215         default:
216           break;
217       }
218       break;
219     }
220   }
221 
222   while (mPos < mLimit) {
223     ch = *mPos;
224 
225     // Check for surrogate pairs; note that isolated surrogates will just
226     // be treated as generic (non-cluster-extending) characters here,
227     // which is fine for cluster-iterating purposes
228     if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
229         NS_IS_LOW_SURROGATE(*(mPos + 1))) {
230       ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
231     }
232 
233     if (!IsClusterExtender(ch)) {
234       break;
235     }
236 
237     mPos++;
238     if (!IS_IN_BMP(ch)) {
239       mPos++;
240     }
241   }
242 
243   NS_ASSERTION(mText < mPos && mPos <= mLimit,
244                "ClusterIterator::Next has overshot the string!");
245 }
246 
Next()247 void ClusterReverseIterator::Next() {
248   if (AtEnd()) {
249     NS_WARNING("ClusterReverseIterator has already reached the end");
250     return;
251   }
252 
253   uint32_t ch;
254   do {
255     ch = *--mPos;
256 
257     if (NS_IS_LOW_SURROGATE(ch) && mPos > mLimit &&
258         NS_IS_HIGH_SURROGATE(*(mPos - 1))) {
259       ch = SURROGATE_TO_UCS4(*--mPos, ch);
260     }
261 
262     if (!IsClusterExtender(ch)) {
263       break;
264     }
265   } while (mPos > mLimit);
266 
267   // XXX May need to handle conjoining Jamo
268 
269   NS_ASSERTION(mPos >= mLimit,
270                "ClusterReverseIterator::Next has overshot the string!");
271 }
272 
CountGraphemeClusters(const char16_t * aText,uint32_t aLength)273 uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
274   ClusterIterator iter(aText, aLength);
275   uint32_t result = 0;
276   while (!iter.AtEnd()) {
277     ++result;
278     iter.Next();
279   }
280   return result;
281 }
282 
283 }  // end namespace unicode
284 
285 }  // end namespace mozilla
286