1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim:set ts=4 sw=2 sts=2 et cindent: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "nsUnicodeProperties.h"
8 #include "nsUnicodePropertyData.cpp"
9
10 #include "mozilla/ArrayUtils.h"
11 #include "mozilla/HashTable.h"
12 #include "nsCharTraits.h"
13
14 #include "BaseChars.h"
15
16 #define UNICODE_BMP_LIMIT 0x10000
17 #define UNICODE_LIMIT 0x110000
18
GetCharProps2(uint32_t aCh)19 const nsCharProps2& GetCharProps2(uint32_t aCh) {
20 if (aCh < UNICODE_BMP_LIMIT) {
21 return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
22 [aCh & ((1 << kCharProp2CharBits) - 1)];
23 }
24 if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
25 return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
26 [(aCh & 0xffff) >>
27 kCharProp2CharBits]]
28 [aCh & ((1 << kCharProp2CharBits) - 1)];
29 }
30
31 MOZ_ASSERT_UNREACHABLE(
32 "Getting CharProps for codepoint outside Unicode "
33 "range");
34
35 // Default values for unassigned
36 using namespace mozilla::unicode;
37 static const nsCharProps2 undefined = {
38 VERTICAL_ORIENTATION_R,
39 0 // IdentifierType
40 };
41 return undefined;
42 }
43
44 namespace mozilla {
45
46 namespace unicode {
47
48 /*
49 To store properties for a million Unicode codepoints compactly, we use
50 a three-level array structure, with the Unicode values considered as
51 three elements: Plane, Page, and Char.
52
53 Space optimization happens because multiple Planes can refer to the same
54 Page array, and multiple Pages can refer to the same Char array holding
55 the actual values. In practice, most of the higher planes are empty and
56 thus share the same data; and within the BMP, there are also many pages
57 that repeat the same data for any given property.
58
59 Plane is usually zero, so we skip a lookup in this case, and require
60 that the Plane 0 pages are always the first set of entries in the Page
61 array.
62
63 The division of the remaining 16 bits into Page and Char fields is
64 adjusted for each property (by experiment using the generation tool)
65 to provide the most compact storage, depending on the distribution
66 of values.
67 */
68
// Maps a detailed general category (used as the array index) to the coarse
// nsUGenCategory it belongs to.
const nsUGenCategory sDetailedToGeneralCategory[] = {
    // clang-format off
  /*
   * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
   * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
   */
  /* CONTROL */             nsUGenCategory::kOther,
  /* FORMAT */              nsUGenCategory::kOther,
  /* UNASSIGNED */          nsUGenCategory::kOther,
  /* PRIVATE_USE */         nsUGenCategory::kOther,
  /* SURROGATE */           nsUGenCategory::kOther,
  /* LOWERCASE_LETTER */    nsUGenCategory::kLetter,
  /* MODIFIER_LETTER */     nsUGenCategory::kLetter,
  /* OTHER_LETTER */        nsUGenCategory::kLetter,
  /* TITLECASE_LETTER */    nsUGenCategory::kLetter,
  /* UPPERCASE_LETTER */    nsUGenCategory::kLetter,
  /* SPACING_MARK */        nsUGenCategory::kMark,
  /* ENCLOSING_MARK */      nsUGenCategory::kMark,
  /* NON_SPACING_MARK */    nsUGenCategory::kMark,
  /* DECIMAL_NUMBER */      nsUGenCategory::kNumber,
  /* LETTER_NUMBER */       nsUGenCategory::kNumber,
  /* OTHER_NUMBER */        nsUGenCategory::kNumber,
  /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
  /* DASH_PUNCTUATION */    nsUGenCategory::kPunctuation,
  /* CLOSE_PUNCTUATION */   nsUGenCategory::kPunctuation,
  /* FINAL_PUNCTUATION */   nsUGenCategory::kPunctuation,
  /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
  /* OTHER_PUNCTUATION */   nsUGenCategory::kPunctuation,
  /* OPEN_PUNCTUATION */    nsUGenCategory::kPunctuation,
  /* CURRENCY_SYMBOL */     nsUGenCategory::kSymbol,
  /* MODIFIER_SYMBOL */     nsUGenCategory::kSymbol,
  /* MATH_SYMBOL */         nsUGenCategory::kSymbol,
  /* OTHER_SYMBOL */        nsUGenCategory::kSymbol,
  /* LINE_SEPARATOR */      nsUGenCategory::kSeparator,
  /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
  /* SPACE_SEPARATOR */     nsUGenCategory::kSeparator
    // clang-format on
};
107
// Translation table from ICU general-category values (the array index, in the
// order noted in the trailing comments) to the matching
// hb_unicode_general_category_t value. It has U_CHAR_CATEGORY_COUNT entries,
// one per ICU category.
const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
    // clang-format off
  HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,          // U_GENERAL_OTHER_TYPES = 0,
  HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER,    // U_UPPERCASE_LETTER = 1,
  HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER,    // U_LOWERCASE_LETTER = 2,
  HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER,    // U_TITLECASE_LETTER = 3,
  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER,     // U_MODIFIER_LETTER = 4,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER,        // U_OTHER_LETTER = 5,
  HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK,    // U_NON_SPACING_MARK = 6,
  HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK,      // U_ENCLOSING_MARK = 7,
  HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK,        // U_COMBINING_SPACING_MARK = 8,
  HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER,      // U_DECIMAL_DIGIT_NUMBER = 9,
  HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER,       // U_LETTER_NUMBER = 10,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER,        // U_OTHER_NUMBER = 11,
  HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR,     // U_SPACE_SEPARATOR = 12,
  HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR,      // U_LINE_SEPARATOR = 13,
  HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
  HB_UNICODE_GENERAL_CATEGORY_CONTROL,             // U_CONTROL_CHAR = 15,
  HB_UNICODE_GENERAL_CATEGORY_FORMAT,              // U_FORMAT_CHAR = 16,
  HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE,         // U_PRIVATE_USE_CHAR = 17,
  HB_UNICODE_GENERAL_CATEGORY_SURROGATE,           // U_SURROGATE = 18,
  HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION,    // U_DASH_PUNCTUATION = 19,
  HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION,    // U_START_PUNCTUATION = 20,
  HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION,   // U_END_PUNCTUATION = 21,
  HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION,   // U_OTHER_PUNCTUATION = 23,
  HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL,         // U_MATH_SYMBOL = 24,
  HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL,     // U_CURRENCY_SYMBOL = 25,
  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL,     // U_MODIFIER_SYMBOL = 26,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL,        // U_OTHER_SYMBOL = 27,
  HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
  HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION,   // U_FINAL_PUNCTUATION = 29,
    // clang-format on
};
142
// Generates `uint32_t Get<prefix_>(uint32_t aCh)`, a lookup into the two-level
// tables s<prefix_>Pages / s<prefix_>Values, split at k<prefix_>CharBits.
// The generated function returns the stored mapping when it is non-zero;
// otherwise, and for every codepoint at or above UNICODE_BMP_LIMIT, it
// returns aCh unchanged.
#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_)                         \
  uint32_t Get##prefix_(uint32_t aCh) {                                     \
    if (aCh < UNICODE_BMP_LIMIT) {                                          \
      const auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits];     \
      const auto offset = aCh & ((1 << k##prefix_##CharBits) - 1);          \
      if (const uint32_t mapped = s##prefix_##Values[page][offset]) {       \
        return mapped;                                                      \
      }                                                                     \
    }                                                                       \
    return aCh;                                                             \
  }
153
// Instantiate GetFullWidth() and GetFullWidthInverse() from
// DEFINE_BMP_1PLANE_MAPPING_GET_FUNC. Each expansion reads the matching
// s<Name>Pages / s<Name>Values tables and the k<Name>CharBits constant.
// Full-width mappings only exist for BMP characters; all others are
// returned unchanged.
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
158
159 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
160 return (
161 (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
162 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
163 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
164 (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks
165 (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers
166 (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters
167 }
168
// Hangul syllable types. The enumerators take their values from the ICU
// U_HST_* constants, so an ICU Hangul_Syllable_Type property value can be
// converted to HSType with a plain cast.
enum HSType {
  HST_NONE = U_HST_NOT_APPLICABLE,  // not applicable
  HST_L = U_HST_LEADING_JAMO,       // leading jamo
  HST_V = U_HST_VOWEL_JAMO,         // vowel jamo
  HST_T = U_HST_TRAILING_JAMO,      // trailing jamo
  HST_LV = U_HST_LV_SYLLABLE,       // LV syllable
  HST_LVT = U_HST_LVT_SYLLABLE      // LVT syllable
};
177
GetHangulSyllableType(uint32_t aCh)178 static HSType GetHangulSyllableType(uint32_t aCh) {
179 return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
180 }
181
Next()182 void ClusterIterator::Next() {
183 if (AtEnd()) {
184 NS_WARNING("ClusterIterator has already reached the end");
185 return;
186 }
187
188 uint32_t ch = *mPos++;
189
190 if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
191 ch = SURROGATE_TO_UCS4(ch, *mPos++);
192 } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
193 (ch >= 0xac00 && ch <= 0xd7ff)) {
194 // Handle conjoining Jamo that make Hangul syllables
195 HSType hangulState = GetHangulSyllableType(ch);
196 while (mPos < mLimit) {
197 ch = *mPos;
198 HSType hangulType = GetHangulSyllableType(ch);
199 switch (hangulType) {
200 case HST_L:
201 case HST_LV:
202 case HST_LVT:
203 if (hangulState == HST_L) {
204 hangulState = hangulType;
205 mPos++;
206 continue;
207 }
208 break;
209 case HST_V:
210 if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
211 (hangulState != HST_LVT)) {
212 hangulState = hangulType;
213 mPos++;
214 continue;
215 }
216 break;
217 case HST_T:
218 if (hangulState != HST_NONE && hangulState != HST_L) {
219 hangulState = hangulType;
220 mPos++;
221 continue;
222 }
223 break;
224 default:
225 break;
226 }
227 break;
228 }
229 }
230
231 const uint32_t kVS16 = 0xfe0f;
232 const uint32_t kZWJ = 0x200d;
233 // UTF-16 surrogate values for Fitzpatrick type modifiers
234 const uint32_t kFitzpatrickHigh = 0xD83C;
235 const uint32_t kFitzpatrickLowFirst = 0xDFFB;
236 const uint32_t kFitzpatrickLowLast = 0xDFFF;
237
238 bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
239 (GetEmojiPresentation(ch) == TextDefault &&
240 ((mPos < mLimit && *mPos == kVS16) ||
241 (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
242 *(mPos + 1) >= kFitzpatrickLowFirst &&
243 *(mPos + 1) <= kFitzpatrickLowLast)));
244 bool prevWasZwj = false;
245
246 while (mPos < mLimit) {
247 ch = *mPos;
248 size_t chLen = 1;
249
250 // Check for surrogate pairs; note that isolated surrogates will just
251 // be treated as generic (non-cluster-extending) characters here,
252 // which is fine for cluster-iterating purposes
253 if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
254 ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
255 chLen = 2;
256 }
257
258 bool extendCluster =
259 IsClusterExtender(ch) ||
260 (baseIsEmoji && prevWasZwj &&
261 ((GetEmojiPresentation(ch) == EmojiDefault) ||
262 (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
263 *(mPos + chLen) == kVS16)));
264 if (!extendCluster) {
265 break;
266 }
267
268 prevWasZwj = (ch == kZWJ);
269 mPos += chLen;
270 }
271
272 NS_ASSERTION(mText < mPos && mPos <= mLimit,
273 "ClusterIterator::Next has overshot the string!");
274 }
275
Next()276 void ClusterReverseIterator::Next() {
277 if (AtEnd()) {
278 NS_WARNING("ClusterReverseIterator has already reached the end");
279 return;
280 }
281
282 uint32_t ch;
283 do {
284 ch = *--mPos;
285
286 if (mPos > mLimit && NS_IS_SURROGATE_PAIR(*(mPos - 1), ch)) {
287 ch = SURROGATE_TO_UCS4(*--mPos, ch);
288 }
289
290 if (!IsClusterExtender(ch)) {
291 break;
292 }
293 } while (mPos > mLimit);
294
295 // XXX May need to handle conjoining Jamo
296
297 NS_ASSERTION(mPos >= mLimit,
298 "ClusterReverseIterator::Next has overshot the string!");
299 }
300
CountGraphemeClusters(const char16_t * aText,uint32_t aLength)301 uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
302 ClusterIterator iter(aText, aLength);
303 uint32_t result = 0;
304 while (!iter.AtEnd()) {
305 ++result;
306 iter.Next();
307 }
308 return result;
309 }
310
GetNaked(uint32_t aCh)311 uint32_t GetNaked(uint32_t aCh) {
312 uint32_t index = aCh >> 8;
313 if (index >= MOZ_ARRAY_LENGTH(BASE_CHAR_MAPPING_BLOCK_INDEX)) {
314 return aCh;
315 }
316 index = BASE_CHAR_MAPPING_BLOCK_INDEX[index];
317 if (index == 0xff) {
318 return aCh;
319 }
320 const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
321 uint8_t lo = aCh & 0xff;
322 if (lo < block.mFirst || lo > block.mLast) {
323 return aCh;
324 }
325 return (aCh & 0xffff0000) |
326 BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
327 }
328
329 } // end namespace unicode
330
331 } // end namespace mozilla
332