1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* vim:set ts=4 sw=4 sts=4 et cindent: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "nsUnicodeProperties.h"
8 #include "nsUnicodePropertyData.cpp"
9
10 #include "mozilla/ArrayUtils.h"
11 #include "nsCharTraits.h"
12
13 #define UNICODE_BMP_LIMIT 0x10000
14 #define UNICODE_LIMIT 0x110000
15
GetCharProps2(uint32_t aCh)16 const nsCharProps2& GetCharProps2(uint32_t aCh) {
17 if (aCh < UNICODE_BMP_LIMIT) {
18 return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
19 [aCh & ((1 << kCharProp2CharBits) - 1)];
20 }
21 if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
22 return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
23 [(aCh & 0xffff) >>
24 kCharProp2CharBits]]
25 [aCh & ((1 << kCharProp2CharBits) - 1)];
26 }
27
28 NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
29 // Default values for unassigned
30 using namespace mozilla::unicode;
31 static const nsCharProps2 undefined = {
32 VERTICAL_ORIENTATION_R,
33 0 // IdentifierType
34 };
35 return undefined;
36 }
37
38 namespace mozilla {
39
40 namespace unicode {
41
42 /*
43 To store properties for a million Unicode codepoints compactly, we use
44 a three-level array structure, with the Unicode values considered as
45 three elements: Plane, Page, and Char.
46
47 Space optimization happens because multiple Planes can refer to the same
48 Page array, and multiple Pages can refer to the same Char array holding
49 the actual values. In practice, most of the higher planes are empty and
50 thus share the same data; and within the BMP, there are also many pages
51 that repeat the same data for any given property.
52
53 Plane is usually zero, so we skip a lookup in this case, and require
54 that the Plane 0 pages are always the first set of entries in the Page
55 array.
56
57 The division of the remaining 16 bits into Page and Char fields is
58 adjusted for each property (by experiment using the generation tool)
59 to provide the most compact storage, depending on the distribution
60 of values.
61 */
62
63 const nsUGenCategory sDetailedToGeneralCategory[] = {
64 // clang-format off
65 /*
66 * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
67 * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
68 */
69 /* CONTROL */ nsUGenCategory::kOther,
70 /* FORMAT */ nsUGenCategory::kOther,
71 /* UNASSIGNED */ nsUGenCategory::kOther,
72 /* PRIVATE_USE */ nsUGenCategory::kOther,
73 /* SURROGATE */ nsUGenCategory::kOther,
74 /* LOWERCASE_LETTER */ nsUGenCategory::kLetter,
75 /* MODIFIER_LETTER */ nsUGenCategory::kLetter,
76 /* OTHER_LETTER */ nsUGenCategory::kLetter,
77 /* TITLECASE_LETTER */ nsUGenCategory::kLetter,
78 /* UPPERCASE_LETTER */ nsUGenCategory::kLetter,
79 /* COMBINING_MARK */ nsUGenCategory::kMark,
80 /* ENCLOSING_MARK */ nsUGenCategory::kMark,
81 /* NON_SPACING_MARK */ nsUGenCategory::kMark,
82 /* DECIMAL_NUMBER */ nsUGenCategory::kNumber,
83 /* LETTER_NUMBER */ nsUGenCategory::kNumber,
84 /* OTHER_NUMBER */ nsUGenCategory::kNumber,
85 /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
86 /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation,
87 /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation,
88 /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
89 /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
90 /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation,
91 /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation,
92 /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol,
93 /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol,
94 /* MATH_SYMBOL */ nsUGenCategory::kSymbol,
95 /* OTHER_SYMBOL */ nsUGenCategory::kSymbol,
96 /* LINE_SEPARATOR */ nsUGenCategory::kSeparator,
97 /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
98 /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator
99 // clang-format on
100 };
101
102 const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
103 // clang-format off
104 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
105 HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
106 HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
107 HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
108 HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
109 HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
110 HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
111 HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
112 HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
113 HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
114 HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
115 HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
116 HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
117 HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
118 HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
119 HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
120 HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
121 HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
122 HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
123 HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
124 HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
125 HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
126 HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
127 HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
128 HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
129 HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
130 HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
131 HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
132 HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
133 HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
134 // clang-format on
135 };
136
137 #define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \
138 uint32_t Get##prefix_(uint32_t aCh) { \
139 if (aCh >= UNICODE_BMP_LIMIT) { \
140 return aCh; \
141 } \
142 auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
143 auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \
144 uint32_t v = s##prefix_##Values[page][index]; \
145 return v ? v : aCh; \
146 }
147
148 // full-width mappings only exist for BMP characters; all others are
149 // returned unchanged
150 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)151 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
152
153 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
154 return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
155 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
156 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
157 (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks
158 }
159
160 enum HSType {
161 HST_NONE = U_HST_NOT_APPLICABLE,
162 HST_L = U_HST_LEADING_JAMO,
163 HST_V = U_HST_VOWEL_JAMO,
164 HST_T = U_HST_TRAILING_JAMO,
165 HST_LV = U_HST_LV_SYLLABLE,
166 HST_LVT = U_HST_LVT_SYLLABLE
167 };
168
GetHangulSyllableType(uint32_t aCh)169 static HSType GetHangulSyllableType(uint32_t aCh) {
170 return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
171 }
172
Next()173 void ClusterIterator::Next() {
174 if (AtEnd()) {
175 NS_WARNING("ClusterIterator has already reached the end");
176 return;
177 }
178
179 uint32_t ch = *mPos++;
180
181 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && NS_IS_LOW_SURROGATE(*mPos)) {
182 ch = SURROGATE_TO_UCS4(ch, *mPos++);
183 } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
184 (ch >= 0xac00 && ch <= 0xd7ff)) {
185 // Handle conjoining Jamo that make Hangul syllables
186 HSType hangulState = GetHangulSyllableType(ch);
187 while (mPos < mLimit) {
188 ch = *mPos;
189 HSType hangulType = GetHangulSyllableType(ch);
190 switch (hangulType) {
191 case HST_L:
192 case HST_LV:
193 case HST_LVT:
194 if (hangulState == HST_L) {
195 hangulState = hangulType;
196 mPos++;
197 continue;
198 }
199 break;
200 case HST_V:
201 if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
202 (hangulState != HST_LVT)) {
203 hangulState = hangulType;
204 mPos++;
205 continue;
206 }
207 break;
208 case HST_T:
209 if (hangulState != HST_NONE && hangulState != HST_L) {
210 hangulState = hangulType;
211 mPos++;
212 continue;
213 }
214 break;
215 default:
216 break;
217 }
218 break;
219 }
220 }
221
222 while (mPos < mLimit) {
223 ch = *mPos;
224
225 // Check for surrogate pairs; note that isolated surrogates will just
226 // be treated as generic (non-cluster-extending) characters here,
227 // which is fine for cluster-iterating purposes
228 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
229 NS_IS_LOW_SURROGATE(*(mPos + 1))) {
230 ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
231 }
232
233 if (!IsClusterExtender(ch)) {
234 break;
235 }
236
237 mPos++;
238 if (!IS_IN_BMP(ch)) {
239 mPos++;
240 }
241 }
242
243 NS_ASSERTION(mText < mPos && mPos <= mLimit,
244 "ClusterIterator::Next has overshot the string!");
245 }
246
Next()247 void ClusterReverseIterator::Next() {
248 if (AtEnd()) {
249 NS_WARNING("ClusterReverseIterator has already reached the end");
250 return;
251 }
252
253 uint32_t ch;
254 do {
255 ch = *--mPos;
256
257 if (NS_IS_LOW_SURROGATE(ch) && mPos > mLimit &&
258 NS_IS_HIGH_SURROGATE(*(mPos - 1))) {
259 ch = SURROGATE_TO_UCS4(*--mPos, ch);
260 }
261
262 if (!IsClusterExtender(ch)) {
263 break;
264 }
265 } while (mPos > mLimit);
266
267 // XXX May need to handle conjoining Jamo
268
269 NS_ASSERTION(mPos >= mLimit,
270 "ClusterReverseIterator::Next has overshot the string!");
271 }
272
CountGraphemeClusters(const char16_t * aText,uint32_t aLength)273 uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
274 ClusterIterator iter(aText, aLength);
275 uint32_t result = 0;
276 while (!iter.AtEnd()) {
277 ++result;
278 iter.Next();
279 }
280 return result;
281 }
282
283 } // end namespace unicode
284
285 } // end namespace mozilla
286