1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "third_party/blink/renderer/platform/fonts/script_run_iterator.h"
6
7 #include "testing/gtest/include/gtest/gtest.h"
8 #include "third_party/blink/renderer/platform/wtf/assertions.h"
9 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
10 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
11 #include "third_party/blink/renderer/platform/wtf/threading.h"
12
13 namespace blink {
14
15 struct ScriptTestRun {
16 const char* const text;
17 UScriptCode code;
18 };
19
20 struct ScriptExpectedRun {
21 unsigned limit;
22 UScriptCode code;
23
ScriptExpectedRunblink::ScriptExpectedRun24 ScriptExpectedRun(unsigned the_limit, UScriptCode the_code)
25 : limit(the_limit), code(the_code) {}
26 };
27
28 class MockScriptData : public ScriptData {
29 public:
30 ~MockScriptData() override = default;
31
Instance()32 static const MockScriptData* Instance() {
33 DEFINE_THREAD_SAFE_STATIC_LOCAL(const MockScriptData, mock_script_data, ());
34 return &mock_script_data;
35 }
36
GetScripts(UChar32 ch,UScriptCodeList & dst) const37 void GetScripts(UChar32 ch, UScriptCodeList& dst) const override {
38 DCHECK_GE(ch, kMockCharMin);
39 DCHECK_LT(ch, kMockCharLimit);
40
41 int code = ch - kMockCharMin;
42 dst.clear();
43 switch (code & kCodeSpecialMask) {
44 case kCodeSpecialCommon:
45 dst.push_back(USCRIPT_COMMON);
46 break;
47 case kCodeSpecialInherited:
48 dst.push_back(USCRIPT_INHERITED);
49 break;
50 default:
51 break;
52 }
53 int list_bits = kTable[code & kCodeListIndexMask];
54 if (dst.IsEmpty() && list_bits == 0) {
55 dst.push_back(USCRIPT_UNKNOWN);
56 return;
57 }
58 while (list_bits) {
59 switch (list_bits & kListMask) {
60 case 0:
61 break;
62 case kLatin:
63 dst.push_back(USCRIPT_LATIN);
64 break;
65 case kHan:
66 dst.push_back(USCRIPT_HAN);
67 break;
68 case kGreek:
69 dst.push_back(USCRIPT_GREEK);
70 break;
71 }
72 list_bits >>= kListShift;
73 }
74 }
75
GetPairedBracket(UChar32 ch) const76 UChar32 GetPairedBracket(UChar32 ch) const override {
77 switch (GetPairedBracketType(ch)) {
78 case PairedBracketType::kBracketTypeClose:
79 return ch - kBracketDelta;
80 case PairedBracketType::kBracketTypeOpen:
81 return ch + kBracketDelta;
82 default:
83 return ch;
84 }
85 }
86
GetPairedBracketType(UChar32 ch) const87 PairedBracketType GetPairedBracketType(UChar32 ch) const override {
88 DCHECK_GE(ch, kMockCharMin);
89 DCHECK_LT(ch, kMockCharLimit);
90 int code = ch - kMockCharMin;
91 if ((code & kCodeBracketBit) == 0) {
92 return PairedBracketType::kBracketTypeNone;
93 }
94 if (code & kCodeBracketCloseBit) {
95 return PairedBracketType::kBracketTypeClose;
96 }
97 return PairedBracketType::kBracketTypeOpen;
98 }
99
TableLookup(int value)100 static int TableLookup(int value) {
101 for (int i = 0; i < 16; ++i) {
102 if (kTable[i] == value) {
103 return i;
104 }
105 }
106 DLOG(ERROR) << "Table does not contain value 0x" << std::hex << value;
107 return 0;
108 }
109
ToTestString(const std::string & input)110 static String ToTestString(const std::string& input) {
111 StringBuilder result;
112 result.Ensure16Bit();
113 bool in_set = false;
114 int seen = 0;
115 int code = 0;
116 int list = 0;
117 int current_shift = 0;
118 for (char c : input) {
119 if (in_set) {
120 switch (c) {
121 case '(':
122 DCHECK_EQ(seen, 0);
123 seen |= kSawBracket;
124 code |= kCodeBracketBit;
125 break;
126 case '[':
127 DCHECK_EQ(seen, 0);
128 seen |= kSawBracket;
129 code |= kCodeBracketBit | kCodeSquareBracketBit;
130 break;
131 case ')':
132 DCHECK_EQ(seen, 0);
133 seen |= kSawBracket;
134 code |= kCodeBracketBit | kCodeBracketCloseBit;
135 break;
136 case ']':
137 DCHECK_EQ(seen, 0);
138 seen |= kSawBracket;
139 code |=
140 kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
141 break;
142 case 'i':
143 DCHECK_EQ(seen, 0); // brackets can't be inherited
144 seen |= kSawSpecial;
145 code |= kCodeSpecialInherited;
146 break;
147 case 'c':
148 DCHECK_EQ((seen & ~kSawBracket), 0);
149 seen |= kSawSpecial;
150 code |= kCodeSpecialCommon;
151 break;
152 case 'l':
153 DCHECK_EQ((seen & kSawLatin), 0);
154 DCHECK_LT(current_shift, 3);
155 seen |= kSawLatin;
156 list |= kLatin << (2 * current_shift++);
157 break;
158 case 'h':
159 DCHECK_EQ((seen & kSawHan), 0);
160 DCHECK_LT(current_shift, 3);
161 seen |= kSawHan;
162 list |= kHan << (2 * current_shift++);
163 break;
164 case 'g':
165 DCHECK_EQ((seen & kSawGreek), 0);
166 DCHECK_LT(current_shift, 3);
167 seen |= kSawGreek;
168 list |= kGreek << (2 * current_shift++);
169 break;
170 case '>':
171 DCHECK_NE(seen, 0);
172 code |= TableLookup(list);
173 result.Append(static_cast<UChar>(kMockCharMin + code));
174 in_set = false;
175 break;
176 default:
177 DLOG(ERROR) << "Illegal mock string set char: '" << c << "'";
178 break;
179 }
180 continue;
181 }
182 // not in set
183 switch (c) {
184 case '<':
185 seen = 0;
186 code = 0;
187 list = 0;
188 current_shift = 0;
189 in_set = true;
190 break;
191 case '(':
192 code = kCodeBracketBit | kCodeSpecialCommon;
193 break;
194 case '[':
195 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
196 break;
197 case ')':
198 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
199 break;
200 case ']':
201 code = kCodeBracketBit | kCodeSquareBracketBit |
202 kCodeBracketCloseBit | kCodeSpecialCommon;
203 break;
204 case 'i':
205 code = kCodeSpecialInherited;
206 break;
207 case 'c':
208 code = kCodeSpecialCommon;
209 break;
210 case 'l':
211 code = kLatin;
212 break;
213 case 'h':
214 code = kHan;
215 break;
216 case 'g':
217 code = kGreek;
218 break;
219 case '?':
220 code = 0; // unknown
221 break;
222 default:
223 DLOG(ERROR) << "Illegal mock string set char: '" << c << "'";
224 }
225 if (!in_set) {
226 result.Append(static_cast<UChar>(kMockCharMin + code));
227 }
228 }
229 return result.ToString();
230 }
231
232 // We determine properties based on the offset from kMockCharMin:
233 // bits 0-3 represent the list of l, h, c scripts (index into table)
234 // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
235 // bit 6 clear means non-bracket, open means bracket
236 // bit 7 clear means open bracket, set means close bracket
237 // bit 8 clear means paren, set means bracket
238 // if it's a bracket, the matching bracket is 64 code points away
239 static const UChar32 kMockCharMin = 0xe000;
240 static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
241 static const int kLatin = 1;
242 static const int kHan = 2;
243 static const int kGreek = 3;
244 static const int kCodeListIndexMask = 0xf;
245 static const int kCodeSpecialMask = 0x30;
246 static const int kCodeSpecialCommon = 0x10;
247 static const int kCodeSpecialInherited = 0x20;
248 static const int kCodeBracketCloseBit = 0x40;
249 static const int kCodeBracketBit = 0x80;
250 static const int kCodeSquareBracketBit = 0x100;
251 static const int kListShift = 2;
252 static const int kListMask = 0x3;
253 static const int kBracketDelta = kCodeBracketCloseBit;
254 static const int kTable[16];
255
256 static const int kSawBracket = 0x1;
257 static const int kSawSpecial = 0x2;
258 static const int kSawLatin = 0x4;
259 static const int kSawHan = 0x8;
260 static const int kSawGreek = 0x10;
261 };
262
263 static const int kLatin2 = MockScriptData::kLatin << 2;
264 static const int kHan2 = MockScriptData::kHan << 2;
265 static const int kGreek2 = MockScriptData::kGreek << 2;
266 static const int kLatin3 = MockScriptData::kLatin << 4;
267 static const int kHan3 = MockScriptData::kHan << 4;
268 static const int kGreek3 = MockScriptData::kGreek << 4;
269 const int MockScriptData::kTable[] = {
270 0,
271 kLatin,
272 kHan,
273 kGreek,
274 kLatin2 + kHan,
275 kLatin2 + kGreek,
276 kHan2 + kLatin,
277 kHan2 + kGreek,
278 kGreek2 + kLatin,
279 kGreek2 + kHan,
280 kLatin3 + kHan2 + kGreek,
281 kLatin3 + kGreek2 + kHan,
282 kHan3 + kLatin2 + kGreek,
283 kHan3 + kGreek2 + kLatin,
284 kGreek3 + kLatin2 + kHan,
285 kGreek3 + kHan2 + kLatin,
286 };
287
288 class ScriptRunIteratorTest : public testing::Test {
289 protected:
CheckRuns(const Vector<ScriptTestRun> & runs)290 void CheckRuns(const Vector<ScriptTestRun>& runs) {
291 StringBuilder text;
292 text.Ensure16Bit();
293 Vector<ScriptExpectedRun> expect;
294 for (auto& run : runs) {
295 text.Append(String::FromUTF8(run.text));
296 expect.push_back(ScriptExpectedRun(text.length(), run.code));
297 }
298 ScriptRunIterator script_run_iterator(text.Characters16(), text.length());
299 VerifyRuns(&script_run_iterator, expect);
300 }
301
302 // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
303 // suitable equivalent real codepoint sequences instead.
CheckMockRuns(const Vector<ScriptTestRun> & runs)304 void CheckMockRuns(const Vector<ScriptTestRun>& runs) {
305 StringBuilder text;
306 text.Ensure16Bit();
307 Vector<ScriptExpectedRun> expect;
308 for (const ScriptTestRun& run : runs) {
309 text.Append(MockScriptData::ToTestString(run.text));
310 expect.push_back(ScriptExpectedRun(text.length(), run.code));
311 }
312
313 ScriptRunIterator script_run_iterator(text.Characters16(), text.length(),
314 MockScriptData::Instance());
315 VerifyRuns(&script_run_iterator, expect);
316 }
317
VerifyRuns(ScriptRunIterator * script_run_iterator,const Vector<ScriptExpectedRun> & expect)318 void VerifyRuns(ScriptRunIterator* script_run_iterator,
319 const Vector<ScriptExpectedRun>& expect) {
320 unsigned limit;
321 UScriptCode code;
322 size_t run_count = 0;
323 while (script_run_iterator->Consume(&limit, &code)) {
324 ASSERT_LT(run_count, expect.size());
325 ASSERT_EQ(expect[run_count].limit, limit);
326 ASSERT_EQ(expect[run_count].code, code);
327 ++run_count;
328 }
329 ASSERT_EQ(expect.size(), run_count);
330 }
331 };
332
// An empty string produces no runs and leaves the output parameters untouched.
TEST_F(ScriptRunIteratorTest, Empty) {
  String empty(g_empty_string16_bit);
  ScriptRunIterator script_run_iterator(empty.Characters16(), empty.length());
  unsigned limit = 0;
  UScriptCode code = USCRIPT_INVALID_CODE;
  // Use a gtest assertion rather than DCHECK: DCHECK is compiled out when
  // DCHECKs are disabled, in which case Consume() would never be called and
  // a failure would not be reported through the test framework.
  ASSERT_FALSE(script_run_iterator.Consume(&limit, &code));
  ASSERT_EQ(limit, 0u);
  ASSERT_EQ(code, USCRIPT_INVALID_CODE);
}
342
// Some of our compilers cannot initialize a vector from an array yet, so the
// braced list is first stored in a static array and then appended to a local
// Vector<ScriptTestRun> named |runs|.
#define DECLARE_SCRIPT_RUNSVECTOR(...)                   \
  static const ScriptTestRun kRunsArray[] = __VA_ARGS__; \
  Vector<ScriptTestRun> runs;                            \
  runs.Append(kRunsArray, sizeof(kRunsArray) / sizeof(kRunsArray[0]));

// Declares |runs| from the braced list and checks it with CheckRuns().
#define CHECK_SCRIPT_RUNS(...)           \
  DECLARE_SCRIPT_RUNSVECTOR(__VA_ARGS__) \
  CheckRuns(runs);

// Declares |runs| from the braced list and checks it with CheckMockRuns().
#define CHECK_MOCK_SCRIPT_RUNS(...)      \
  DECLARE_SCRIPT_RUNSVECTOR(__VA_ARGS__) \
  CheckMockRuns(runs);
356
TEST_F(ScriptRunIteratorTest, Whitespace) {
  // Spaces and a tab alone are expected as one common-script run.
  CheckRuns({{" \t ", USCRIPT_COMMON}});
}
360
TEST_F(ScriptRunIteratorTest, Common) {
  // Spaces and ASCII punctuation alone are expected as one common-script run.
  CheckRuns({{" ... !?", USCRIPT_COMMON}});
}
364
TEST_F(ScriptRunIteratorTest, CombiningCircle) {
  // Dotted circles carrying combining marks are expected as one common run.
  CheckRuns({{"◌́◌̀◌̈◌̂◌̄◌̊", USCRIPT_COMMON}});
}
368
TEST_F(ScriptRunIteratorTest, Latin) {
  // ASCII letters are expected as one Latin run.
  CheckRuns({{"latin", USCRIPT_LATIN}});
}
372
TEST_F(ScriptRunIteratorTest, Chinese) {
  // Han ideographs are expected as one Han run.
  CheckRuns({{"萬國碼", USCRIPT_HAN}});
}
376
377 struct JapaneseMixedScript {
378 const char* string;
379 // The expected primary_script when the string alone was evaluated.
380 UScriptCode script;
381 } japanese_mixed_scripts[] = {{"あ", USCRIPT_HIRAGANA},
382 // Katakana should be normalized to Hiragana
383 {"ア", USCRIPT_HIRAGANA},
384 // Script_Extensions=Hira Kana
385 {"\u30FC", USCRIPT_HIRAGANA},
386 // Script_Extensions=Hani Hira Kana
387 {"\u303C", USCRIPT_HAN},
388 // Script_Extensions=Bopo Hang Hani Hira Kana
389 {"\u3003", USCRIPT_BOPOMOFO},
390 // Script_Extensions=Bopo Hang Hani Hira Kana Yiii
391 {"\u3001", USCRIPT_BOPOMOFO}};
392
393 class JapaneseMixedScriptTest
394 : public ScriptRunIteratorTest,
395 public testing::WithParamInterface<JapaneseMixedScript> {};
396
397 INSTANTIATE_TEST_SUITE_P(ScriptRunIteratorTest,
398 JapaneseMixedScriptTest,
399 testing::ValuesIn(japanese_mixed_scripts));
400
TEST_P(JapaneseMixedScriptTest, Data) {
  const JapaneseMixedScript& param = GetParam();
  const std::string sample(param.string);

  // Evaluated alone, the string resolves to its own expected script.
  CheckRuns({{sample.c_str(), param.script}});

  // If the string follows Hiragana or Katakana, or is followed by Hiragana or
  // Katakana, it should be normalized as Hiragana.
  const std::string hiragana("か");
  const std::string katakana("カ");
  const std::string combined[] = {
      hiragana + sample,
      sample + hiragana,
      katakana + sample,
      sample + katakana,
      hiragana + sample + katakana,
      katakana + sample + hiragana,
  };
  for (const std::string& text : combined) {
    CheckRuns({{text.c_str(), USCRIPT_HIRAGANA}});
  }
}
420
421 // Close bracket without matching open is ignored
TEST_F(ScriptRunIteratorTest, UnbalancedParens1) {
  // The stray "]" stays in the Latin run; ")" pairs with the leading "(".
  CheckRuns({
      {"(萬", USCRIPT_HAN},
      {"a]", USCRIPT_LATIN},
      {")", USCRIPT_HAN},
  });
}
426
427 // Open bracket without matching close is popped when inside
428 // matching close brackets, so doesn't match later close.
TEST_F(ScriptRunIteratorTest, UnbalancedParens2) {
  // Both trailing close brackets are expected in the Han run.
  CheckRuns({
      {"(萬", USCRIPT_HAN},
      {"a[", USCRIPT_LATIN},
      {")]", USCRIPT_HAN},
  });
}
433
434 // space goes with leading script
TEST_F(ScriptRunIteratorTest, LatinHan) {
  // The separating space belongs to the preceding Latin run.
  CheckRuns({{"Unicode ", USCRIPT_LATIN}, {"萬國碼", USCRIPT_HAN}});
}
438
439 // space goes with leading script
TEST_F(ScriptRunIteratorTest, HanLatin) {
  // The separating space belongs to the preceding Han run.
  CheckRuns({{"萬國碼 ", USCRIPT_HAN}, {"Unicode", USCRIPT_LATIN}});
}
443
TEST_F(ScriptRunIteratorTest, ParenEmptyParen) {
  // A bare pair of parentheses is expected as one common-script run.
  CheckRuns({{"()", USCRIPT_COMMON}});
}
447
TEST_F(ScriptRunIteratorTest, ParenChineseParen) {
  // Parentheses around Han text are expected inside the Han run.
  CheckRuns({{"(萬國碼)", USCRIPT_HAN}});
}
451
TEST_F(ScriptRunIteratorTest, ParenLatinParen) {
  // Parentheses around Latin text are expected inside the Latin run.
  CheckRuns({{"(Unicode)", USCRIPT_LATIN}});
}
455
456 // open paren gets leading script
TEST_F(ScriptRunIteratorTest, LatinParenChineseParen) {
  // The close paren is expected to take the script of its open paren (Latin).
  CheckRuns({
      {"Unicode (", USCRIPT_LATIN},
      {"萬國碼", USCRIPT_HAN},
      {")", USCRIPT_LATIN},
  });
}
462
463 // open paren gets first trailing script if no leading script
TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin) {
  // With nothing before it, the open paren is expected in the Han run.
  CheckRuns({{"(萬國碼) ", USCRIPT_HAN}, {"Unicode", USCRIPT_LATIN}});
}
467
468 // leading common and open paren get first trailing script.
469 // TODO(dougfelt): we don't do quote matching, but probably should figure out
// something better than doing nothing.
TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote) {
  // The opening quote is expected in the Han run and the closing quote in the
  // Latin run.
  CheckRuns({
      {"\"(萬國碼) ", USCRIPT_HAN},
      {"Unicode\"", USCRIPT_LATIN},
  });
}
475
476 // Emojies are resolved to the leading script.
TEST_F(ScriptRunIteratorTest,EmojiCommon)477 TEST_F(ScriptRunIteratorTest, EmojiCommon) {
478 CHECK_SCRIPT_RUNS({{"百家姓", USCRIPT_HAN}});
479 }
480
481 // Unmatched close brace gets leading context
TEST_F(ScriptRunIteratorTest, UnmatchedClose) {
  // "]" has no open bracket and is expected in the Han run; ")" is expected
  // to return to the Latin script of "(".
  CheckRuns({
      {"Unicode (", USCRIPT_LATIN},
      {"萬國碼] ", USCRIPT_HAN},
      {") Unicode\"", USCRIPT_LATIN},
  });
}
487
488 // Match up to 32 bracket pairs
TEST_F(ScriptRunIteratorTest, Match32Brackets) {
  // The final "]" is still expected to pair with the leading "[" (Han) after
  // the nested parentheses have been matched.
  CheckRuns({
      {"[萬國碼 ", USCRIPT_HAN},
      {"Unicode (((((((((((((((((((((((((((((((!"
       ")))))))))))))))))))))))))))))))",
       USCRIPT_LATIN},
      {"]", USCRIPT_HAN},
  });
}
496
497 // Matches 32 most recent bracket pairs. More than that, and we revert to
498 // surrounding script.
TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets) {
  // The three leading "(" are the oldest brackets; their closing ")))" is
  // expected in the surrounding Latin run.
  CheckRuns({
      {"((([萬國碼 ", USCRIPT_HAN},
      {"Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN},
      {"萬國碼!", USCRIPT_HAN},
      {")))))))))))))))))))))))))))))))", USCRIPT_LATIN},
      {"]", USCRIPT_HAN},
      {"But )))", USCRIPT_LATIN},
  });
}
507
508 // A char with multiple scripts that match both leading and trailing context
509 // gets the leading context.
TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext) {
  // <lh> could join either neighbour; it is expected in the leading Han run.
  CheckMockRuns({{"h<lh>", USCRIPT_HAN}, {"l", USCRIPT_LATIN}});
}
513
514 // A char with multiple scripts that only match trailing context gets the
515 // trailing context.
TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext) {
  // <gl> does not include Han, so it is expected in the trailing Latin run.
  CheckMockRuns({{"h", USCRIPT_HAN}, {"<gl>l", USCRIPT_LATIN}});
}
519
520 // Retain first established priority script. <lhg><gh> produce the script <gh>
521 // with g as priority, because of the two priority scripts l and g, only g
522 // remains. Then <gh><hgl> retains g as priority, because of the two priority
523 // scripts g and h that remain, g was encountered first.
TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript) {
  // All three multi-script characters are expected in a single Greek run.
  CheckMockRuns({{"<lhg><gh><hgl>", USCRIPT_GREEK}});
}
527
528 // Parens can have scripts that break script runs.
TEST_F(ScriptRunIteratorTest, ExtensionsParens) {
  // Each close bracket is expected to resolve to the script of its matching
  // open bracket.
  CheckMockRuns({
      {"<gl><(lg>", USCRIPT_GREEK},
      {"h<[hl>", USCRIPT_HAN},
      {"l", USCRIPT_LATIN},
      {"<]hl>", USCRIPT_HAN},
      {"<)lg>", USCRIPT_GREEK},
  });
}
536
537 // The close paren might be encountered before we've established the open
538 // paren's script, but when this is the case the current set is still valid, so
539 // this doesn't affect it nor break the run.
TEST_F(ScriptRunIteratorTest, ExtensionsParens2) {
  // Open paren, middle character and close paren are expected in one Greek
  // run.
  CheckMockRuns({{"<(lhg><gh><)lhg>", USCRIPT_GREEK}});
}
543
544 // A common script with a single extension should be treated as common, but
545 // with the extended script as a default. If we encounter anything other than
546 // common, that takes priority. If we encounter other common scripts with a
547 // single extension, the current priority remains.
TEST_F(ScriptRunIteratorTest, CommonWithPriority) {
  // A lone common character extended with Han is expected to resolve to Han.
  CheckMockRuns({{"<ch>", USCRIPT_HAN}});
}
551
TEST_F(ScriptRunIteratorTest, CommonWithPriority2) {
  // The non-common <lh> that follows is expected to make the run Latin.
  CheckMockRuns({{"<ch><lh>", USCRIPT_LATIN}});
}
555
TEST_F(ScriptRunIteratorTest, CommonWithPriority3) {
  // Later common characters with other single extensions are expected to
  // leave the first priority (Han) in place.
  CheckMockRuns({{"<ch><cl><cg>", USCRIPT_HAN}});
}
559
560 // UDatta (\xE0\xA5\x91) is inherited with LATIN, DEVANAGARI, BENGALI and
561 // other Indic scripts. Since it has LATIN, and the
562 // dotted circle U+25CC (\xE2\x97\x8C) is COMMON and has adopted the
563 // preceding LATIN, it gets the LATIN. This is standard.
TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta) {
  // U+25CC followed by U+0951 is expected to stay in the Latin run.
  CheckRuns({{"Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN}});
}
567
568 // In this situation, UDatta U+0951 (\xE0\xA5\x91) doesn't share a script
569 // with the value inherited by the dotted circle U+25CC (\xE2\x97\x8C).
570 // It captures the preceding dotted circle and breaks it from the run it would
571 // normally have been in. U+0951 is used in multiple scripts (DEVA, BENG, LATN,
572 // etc) and has multiple values for Script_Extension property. At the moment,
573 // getScripts() treats the script with the lowest script code as 'true' primary,
574 // and BENG comes before DEVA in the script enum so that we get BENGALI.
575 // Taking into account a Unicode block and returning DEVANAGARI would be
576 // slightly better.
TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta) {
  // U+25CC and U+0951 are expected to split off into a Bengali run.
  CheckRuns({
      {"萬國碼 ", USCRIPT_HAN},
      {"\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_BENGALI},
  });
}
581
582 // Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
583 // common, that of Fathatan is inherited. The script extensions for Fathatan
584 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
585 // preferred script for Fathatan is Arabic, according to Behdad's
586 // heuristic. This is exactly analogous to the Udatta tests above, except
587 // Tatweel is Lm. But we don't take properties into account, only scripts.
TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan) {
  // Tatweel and Fathatan are expected to split off into an Arabic run.
  CheckRuns({
      {"Latin ", USCRIPT_LATIN},
      {"\xD9\x80\xD9\x8B", USCRIPT_ARABIC},
  });
}
592
593 // Another case where if the mark accepts a script that was inherited by the
594 // preceding common-script character, they both continue in that script.
595 // SYRIAC LETTER NUN \xDC\xA2
596 // ARABIC TATWEEL \xD9\x80
// ARABIC FATHATAN \xD9\x8B
TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan) {
  // Nun, Tatweel and Fathatan are expected in one Syriac run.
  CheckRuns({{"\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC}});
}
601
602 // The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
603 // is not common.
TEST_F(ScriptRunIteratorTest, HanUdatta) {
  // U+0951 directly after Han text is expected to stay in the Han run.
  CheckRuns({{"萬國碼\xE0\xA5\x91", USCRIPT_HAN}});
}
607
// The Udatta U+0951 (\xE0\xA5\x91) is inherited, and will capture the space
// and turn it into Bengali because USCRIPT_BENGALI is 4 and
// USCRIPT_DEVANAGARI is 10. See TODO comment for |getScripts| and
// HanDottedCircleUdatta.
TEST_F(ScriptRunIteratorTest, HanSpaceUdatta) {
  // The space and U+0951 are expected to form a separate Bengali run.
  CheckRuns({
      {"萬國碼", USCRIPT_HAN},
      {" \xE0\xA5\x91", USCRIPT_BENGALI},
  });
}
615
616 // Corresponds to one test in RunSegmenter, where orientation of the
// space character is sideways in vertical.
TEST_F(ScriptRunIteratorTest, Hangul) {
  // The embedded space is expected to stay inside the single Hangul run.
  CheckRuns({{"키스의 고유조건은", USCRIPT_HANGUL}});
}
621
622 // Corresponds to one test in RunSegmenter, which tests that the punctuation
623 // characters mixed in are actually sideways in vertical. The ScriptIterator
624 // should report one run, but the RunSegmenter should report three, with the
625 // middle one rotated sideways.
TEST_F(ScriptRunIteratorTest, HiraganaMixedPunctuation) {
  // The punctuation in the middle is expected to stay in the Hiragana run.
  CheckRuns({{"いろはに.…¡ほへと", USCRIPT_HIRAGANA}});
}
629
630 // Make sure Mock code works too.
TEST_F(ScriptRunIteratorTest, MockHanInheritedGL) {
  // An inherited character directly after Han is expected in the Han run.
  CheckMockRuns({{"h<igl>", USCRIPT_HAN}});
}
634
TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) {
  // The common character and <igl> are expected to form a new Greek run.
  CheckMockRuns({{"h", USCRIPT_HAN}, {"c<igl>", USCRIPT_GREEK}});
}
638
639 // Leading inherited just act like common, except there's no preferred script.
TEST_F(ScriptRunIteratorTest, MockLeadingInherited) {
  // A lone inherited character is expected as a common-script run.
  CheckMockRuns({{"<igl>", USCRIPT_COMMON}});
}
643
644 // Leading inherited just act like common, except there's no preferred script.
TEST_F(ScriptRunIteratorTest, MockLeadingInherited2) {
  // Two inherited characters alone are still expected as one common run.
  CheckMockRuns({{"<igl><ih>", USCRIPT_COMMON}});
}
648
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) {
  // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
  // The leading mark is expected to join the Han run that follows it.
  CheckRuns({{"\xE0\xA5\x91萬國碼", USCRIPT_HAN}});
}
653
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) {
  // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
  // ARABIC FATHATAN \xD9\x8B
  // Both leading marks are expected to join the Han run that follows them.
  CheckRuns({{"\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN}});
}
659
TEST_F(ScriptRunIteratorTest, OddLatinString) {
  // A Latin letter with an extra combining mark is expected as one Latin run.
  CheckRuns({{"ç̈", USCRIPT_LATIN}});
}
663
TEST_F(ScriptRunIteratorTest, CommonMalayalam) {
  // The leading digits and hyphen are expected to join the Malayalam run.
  CheckRuns({{"100-ാം", USCRIPT_MALAYALAM}});
}
667
668 class ScriptRunIteratorICUDataTest : public testing::Test {
669 public:
ScriptRunIteratorICUDataTest()670 ScriptRunIteratorICUDataTest()
671 : max_extensions_(0), max_extensions_codepoint_(0xffff) {
672 int max_extensions = 0;
673 UChar32 max_extensionscp = 0;
674 for (UChar32 cp = 0; cp < 0x11000; ++cp) {
675 UErrorCode status = U_ZERO_ERROR;
676 int count = uscript_getScriptExtensions(cp, nullptr, 0, &status);
677 if (count > max_extensions) {
678 max_extensions = count;
679 max_extensionscp = cp;
680 }
681 }
682 max_extensions_ = max_extensions;
683 max_extensions_codepoint_ = max_extensionscp;
684 }
685
686 protected:
GetACharWithMaxExtensions(int * num_extensions)687 UChar32 GetACharWithMaxExtensions(int* num_extensions) {
688 if (num_extensions) {
689 *num_extensions = max_extensions_;
690 }
691 return max_extensions_codepoint_;
692 }
693
694 private:
695 int max_extensions_;
696 UChar32 max_extensions_codepoint_;
697 };
698
699 // Validate that ICU never returns more than our maximum expected number of
700 // script extensions.
TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions) {
  int extension_count = 0;
  const UChar32 widest = GetACharWithMaxExtensions(&extension_count);
  // The code point is printed in hex if the limit is exceeded.
  ASSERT_LE(extension_count, ScriptData::kMaxScriptCount)
      << "char " << std::hex << widest << std::dec;
}
707
708 // Check that ICUScriptData returns all of a character's scripts.
709 // This only checks one likely character, but doesn't check all cases.
TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions) {
  int extension_count = 0;
  const UChar32 widest = GetACharWithMaxExtensions(&extension_count);
  ScriptData::UScriptCodeList scripts;
  ICUScriptData::Instance()->GetScripts(widest, scripts);

  // It's possible that GetScripts adds the primary script to the list of
  // extensions, resulting in one more script than the raw extension count.
  // Hence "at least", not "exactly".
  const int reported = static_cast<int>(scripts.size());
  ASSERT_GE(reported, extension_count)
      << "char " << std::hex << widest << std::dec;
}
721
// For every code point whose first reported script is USCRIPT_COMMON,
// GetScripts must return at most two entries.
TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) {
  ScriptData::UScriptCodeList scripts;
  for (UChar32 cp = 0; cp < 0x110000; ++cp) {
    ICUScriptData::Instance()->GetScripts(cp, scripts);
    if (scripts.at(0) != USCRIPT_COMMON) {
      continue;
    }
    ASSERT_LE(scripts.size(), 2ul) << "cp: " << std::hex << cp << std::dec;
  }
}
732
733 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
734 // ignore this for now, as I think it shouldn't matter which run it ends up
735 // in. HarfBuzz needs to be able to use it as context and shape each
736 // neighboring character appropriately no matter what run it got assigned to.
737
738 } // namespace blink
739