1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "third_party/blink/renderer/platform/fonts/script_run_iterator.h"
6 
7 #include "testing/gtest/include/gtest/gtest.h"
8 #include "third_party/blink/renderer/platform/wtf/assertions.h"
9 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
10 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
11 #include "third_party/blink/renderer/platform/wtf/threading.h"
12 
13 namespace blink {
14 
// One piece of test input: a text fragment together with the script code the
// whole fragment is expected to be assigned to.
struct ScriptTestRun {
  // Fragment text. CheckRuns() decodes it as UTF-8; CheckMockRuns() passes it
  // to MockScriptData::ToTestString().
  const char* const text;
  // Script expected for the run that ends at the end of |text|.
  UScriptCode code;
};
19 
20 struct ScriptExpectedRun {
21   unsigned limit;
22   UScriptCode code;
23 
ScriptExpectedRunblink::ScriptExpectedRun24   ScriptExpectedRun(unsigned the_limit, UScriptCode the_code)
25       : limit(the_limit), code(the_code) {}
26 };
27 
28 class MockScriptData : public ScriptData {
29  public:
30   ~MockScriptData() override = default;
31 
Instance()32   static const MockScriptData* Instance() {
33     DEFINE_THREAD_SAFE_STATIC_LOCAL(const MockScriptData, mock_script_data, ());
34     return &mock_script_data;
35   }
36 
GetScripts(UChar32 ch,UScriptCodeList & dst) const37   void GetScripts(UChar32 ch, UScriptCodeList& dst) const override {
38     DCHECK_GE(ch, kMockCharMin);
39     DCHECK_LT(ch, kMockCharLimit);
40 
41     int code = ch - kMockCharMin;
42     dst.clear();
43     switch (code & kCodeSpecialMask) {
44       case kCodeSpecialCommon:
45         dst.push_back(USCRIPT_COMMON);
46         break;
47       case kCodeSpecialInherited:
48         dst.push_back(USCRIPT_INHERITED);
49         break;
50       default:
51         break;
52     }
53     int list_bits = kTable[code & kCodeListIndexMask];
54     if (dst.IsEmpty() && list_bits == 0) {
55       dst.push_back(USCRIPT_UNKNOWN);
56       return;
57     }
58     while (list_bits) {
59       switch (list_bits & kListMask) {
60         case 0:
61           break;
62         case kLatin:
63           dst.push_back(USCRIPT_LATIN);
64           break;
65         case kHan:
66           dst.push_back(USCRIPT_HAN);
67           break;
68         case kGreek:
69           dst.push_back(USCRIPT_GREEK);
70           break;
71       }
72       list_bits >>= kListShift;
73     }
74   }
75 
GetPairedBracket(UChar32 ch) const76   UChar32 GetPairedBracket(UChar32 ch) const override {
77     switch (GetPairedBracketType(ch)) {
78       case PairedBracketType::kBracketTypeClose:
79         return ch - kBracketDelta;
80       case PairedBracketType::kBracketTypeOpen:
81         return ch + kBracketDelta;
82       default:
83         return ch;
84     }
85   }
86 
GetPairedBracketType(UChar32 ch) const87   PairedBracketType GetPairedBracketType(UChar32 ch) const override {
88     DCHECK_GE(ch, kMockCharMin);
89     DCHECK_LT(ch, kMockCharLimit);
90     int code = ch - kMockCharMin;
91     if ((code & kCodeBracketBit) == 0) {
92       return PairedBracketType::kBracketTypeNone;
93     }
94     if (code & kCodeBracketCloseBit) {
95       return PairedBracketType::kBracketTypeClose;
96     }
97     return PairedBracketType::kBracketTypeOpen;
98   }
99 
TableLookup(int value)100   static int TableLookup(int value) {
101     for (int i = 0; i < 16; ++i) {
102       if (kTable[i] == value) {
103         return i;
104       }
105     }
106     DLOG(ERROR) << "Table does not contain value 0x" << std::hex << value;
107     return 0;
108   }
109 
ToTestString(const std::string & input)110   static String ToTestString(const std::string& input) {
111     StringBuilder result;
112     result.Ensure16Bit();
113     bool in_set = false;
114     int seen = 0;
115     int code = 0;
116     int list = 0;
117     int current_shift = 0;
118     for (char c : input) {
119       if (in_set) {
120         switch (c) {
121           case '(':
122             DCHECK_EQ(seen, 0);
123             seen |= kSawBracket;
124             code |= kCodeBracketBit;
125             break;
126           case '[':
127             DCHECK_EQ(seen, 0);
128             seen |= kSawBracket;
129             code |= kCodeBracketBit | kCodeSquareBracketBit;
130             break;
131           case ')':
132             DCHECK_EQ(seen, 0);
133             seen |= kSawBracket;
134             code |= kCodeBracketBit | kCodeBracketCloseBit;
135             break;
136           case ']':
137             DCHECK_EQ(seen, 0);
138             seen |= kSawBracket;
139             code |=
140                 kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
141             break;
142           case 'i':
143             DCHECK_EQ(seen, 0);  // brackets can't be inherited
144             seen |= kSawSpecial;
145             code |= kCodeSpecialInherited;
146             break;
147           case 'c':
148             DCHECK_EQ((seen & ~kSawBracket), 0);
149             seen |= kSawSpecial;
150             code |= kCodeSpecialCommon;
151             break;
152           case 'l':
153             DCHECK_EQ((seen & kSawLatin), 0);
154             DCHECK_LT(current_shift, 3);
155             seen |= kSawLatin;
156             list |= kLatin << (2 * current_shift++);
157             break;
158           case 'h':
159             DCHECK_EQ((seen & kSawHan), 0);
160             DCHECK_LT(current_shift, 3);
161             seen |= kSawHan;
162             list |= kHan << (2 * current_shift++);
163             break;
164           case 'g':
165             DCHECK_EQ((seen & kSawGreek), 0);
166             DCHECK_LT(current_shift, 3);
167             seen |= kSawGreek;
168             list |= kGreek << (2 * current_shift++);
169             break;
170           case '>':
171             DCHECK_NE(seen, 0);
172             code |= TableLookup(list);
173             result.Append(static_cast<UChar>(kMockCharMin + code));
174             in_set = false;
175             break;
176           default:
177             DLOG(ERROR) << "Illegal mock string set char: '" << c << "'";
178             break;
179         }
180         continue;
181       }
182       // not in set
183       switch (c) {
184         case '<':
185           seen = 0;
186           code = 0;
187           list = 0;
188           current_shift = 0;
189           in_set = true;
190           break;
191         case '(':
192           code = kCodeBracketBit | kCodeSpecialCommon;
193           break;
194         case '[':
195           code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
196           break;
197         case ')':
198           code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
199           break;
200         case ']':
201           code = kCodeBracketBit | kCodeSquareBracketBit |
202                  kCodeBracketCloseBit | kCodeSpecialCommon;
203           break;
204         case 'i':
205           code = kCodeSpecialInherited;
206           break;
207         case 'c':
208           code = kCodeSpecialCommon;
209           break;
210         case 'l':
211           code = kLatin;
212           break;
213         case 'h':
214           code = kHan;
215           break;
216         case 'g':
217           code = kGreek;
218           break;
219         case '?':
220           code = 0;  // unknown
221           break;
222         default:
223           DLOG(ERROR) << "Illegal mock string set char: '" << c << "'";
224       }
225       if (!in_set) {
226         result.Append(static_cast<UChar>(kMockCharMin + code));
227       }
228     }
229     return result.ToString();
230   }
231 
232   // We determine properties based on the offset from kMockCharMin:
233   // bits 0-3 represent the list of l, h, c scripts (index into table)
234   // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
235   // bit 6 clear means non-bracket, open means bracket
236   // bit 7 clear means open bracket, set means close bracket
237   // bit 8 clear means paren, set means bracket
238   // if it's a bracket, the matching bracket is 64 code points away
239   static const UChar32 kMockCharMin = 0xe000;
240   static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
241   static const int kLatin = 1;
242   static const int kHan = 2;
243   static const int kGreek = 3;
244   static const int kCodeListIndexMask = 0xf;
245   static const int kCodeSpecialMask = 0x30;
246   static const int kCodeSpecialCommon = 0x10;
247   static const int kCodeSpecialInherited = 0x20;
248   static const int kCodeBracketCloseBit = 0x40;
249   static const int kCodeBracketBit = 0x80;
250   static const int kCodeSquareBracketBit = 0x100;
251   static const int kListShift = 2;
252   static const int kListMask = 0x3;
253   static const int kBracketDelta = kCodeBracketCloseBit;
254   static const int kTable[16];
255 
256   static const int kSawBracket = 0x1;
257   static const int kSawSpecial = 0x2;
258   static const int kSawLatin = 0x4;
259   static const int kSawHan = 0x8;
260   static const int kSawGreek = 0x10;
261 };
262 
263 static const int kLatin2 = MockScriptData::kLatin << 2;
264 static const int kHan2 = MockScriptData::kHan << 2;
265 static const int kGreek2 = MockScriptData::kGreek << 2;
266 static const int kLatin3 = MockScriptData::kLatin << 4;
267 static const int kHan3 = MockScriptData::kHan << 4;
268 static const int kGreek3 = MockScriptData::kGreek << 4;
269 const int MockScriptData::kTable[] = {
270     0,
271     kLatin,
272     kHan,
273     kGreek,
274     kLatin2 + kHan,
275     kLatin2 + kGreek,
276     kHan2 + kLatin,
277     kHan2 + kGreek,
278     kGreek2 + kLatin,
279     kGreek2 + kHan,
280     kLatin3 + kHan2 + kGreek,
281     kLatin3 + kGreek2 + kHan,
282     kHan3 + kLatin2 + kGreek,
283     kHan3 + kGreek2 + kLatin,
284     kGreek3 + kLatin2 + kHan,
285     kGreek3 + kHan2 + kLatin,
286 };
287 
288 class ScriptRunIteratorTest : public testing::Test {
289  protected:
CheckRuns(const Vector<ScriptTestRun> & runs)290   void CheckRuns(const Vector<ScriptTestRun>& runs) {
291     StringBuilder text;
292     text.Ensure16Bit();
293     Vector<ScriptExpectedRun> expect;
294     for (auto& run : runs) {
295       text.Append(String::FromUTF8(run.text));
296       expect.push_back(ScriptExpectedRun(text.length(), run.code));
297     }
298     ScriptRunIterator script_run_iterator(text.Characters16(), text.length());
299     VerifyRuns(&script_run_iterator, expect);
300   }
301 
302   // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
303   // suitable equivalent real codepoint sequences instead.
CheckMockRuns(const Vector<ScriptTestRun> & runs)304   void CheckMockRuns(const Vector<ScriptTestRun>& runs) {
305     StringBuilder text;
306     text.Ensure16Bit();
307     Vector<ScriptExpectedRun> expect;
308     for (const ScriptTestRun& run : runs) {
309       text.Append(MockScriptData::ToTestString(run.text));
310       expect.push_back(ScriptExpectedRun(text.length(), run.code));
311     }
312 
313     ScriptRunIterator script_run_iterator(text.Characters16(), text.length(),
314                                           MockScriptData::Instance());
315     VerifyRuns(&script_run_iterator, expect);
316   }
317 
VerifyRuns(ScriptRunIterator * script_run_iterator,const Vector<ScriptExpectedRun> & expect)318   void VerifyRuns(ScriptRunIterator* script_run_iterator,
319                   const Vector<ScriptExpectedRun>& expect) {
320     unsigned limit;
321     UScriptCode code;
322     size_t run_count = 0;
323     while (script_run_iterator->Consume(&limit, &code)) {
324       ASSERT_LT(run_count, expect.size());
325       ASSERT_EQ(expect[run_count].limit, limit);
326       ASSERT_EQ(expect[run_count].code, code);
327       ++run_count;
328     }
329     ASSERT_EQ(expect.size(), run_count);
330   }
331 };
332 
// An empty string produces no runs and leaves the out-parameters untouched.
TEST_F(ScriptRunIteratorTest, Empty) {
  String empty(g_empty_string16_bit);
  ScriptRunIterator script_run_iterator(empty.Characters16(), empty.length());
  unsigned limit = 0;
  UScriptCode code = USCRIPT_INVALID_CODE;
  // This must be a gtest assertion: a DCHECK is compiled out when DCHECKs are
  // disabled, in which case Consume() would never be called and the test
  // would check nothing.
  ASSERT_FALSE(script_run_iterator.Consume(&limit, &code));
  ASSERT_EQ(limit, 0u);
  ASSERT_EQ(code, USCRIPT_INVALID_CODE);
}
342 
// Some of our compilers cannot initialize a vector from an array yet.
// Declares a static array |kRunsArray| holding the given braced list of
// ScriptTestRun entries and a local Vector |runs| filled from it. Because it
// introduces these two names, it can be expanded only once per scope.
#define DECLARE_SCRIPT_RUNSVECTOR(...)                   \
  static const ScriptTestRun kRunsArray[] = __VA_ARGS__; \
  Vector<ScriptTestRun> runs;                            \
  runs.Append(kRunsArray, sizeof(kRunsArray) / sizeof(*kRunsArray));

// Builds |runs| from the braced list and passes it to the fixture's
// CheckRuns().
#define CHECK_SCRIPT_RUNS(...)            \
  DECLARE_SCRIPT_RUNSVECTOR(__VA_ARGS__); \
  CheckRuns(runs);

// Builds |runs| from the braced list and passes it to the fixture's
// CheckMockRuns().
#define CHECK_MOCK_SCRIPT_RUNS(...)       \
  DECLARE_SCRIPT_RUNSVECTOR(__VA_ARGS__); \
  CheckMockRuns(runs);
356 
// Spaces and a tab on their own are expected to form a single COMMON run.
TEST_F(ScriptRunIteratorTest, Whitespace) {
  CheckRuns({{" \t ", USCRIPT_COMMON}});
}
360 
// Spaces and punctuation on their own are expected to form a single COMMON
// run.
TEST_F(ScriptRunIteratorTest, Common) {
  CheckRuns({{" ... !?", USCRIPT_COMMON}});
}
364 
// Dotted circles carrying combining marks are expected to form a single
// COMMON run.
TEST_F(ScriptRunIteratorTest, CombiningCircle) {
  CheckRuns({{"◌́◌̀◌̈◌̂◌̄◌̊", USCRIPT_COMMON}});
}
368 
// Plain ASCII letters are expected to form a single LATIN run.
TEST_F(ScriptRunIteratorTest, Latin) {
  CheckRuns({{"latin", USCRIPT_LATIN}});
}
372 
// Han ideographs are expected to form a single HAN run.
TEST_F(ScriptRunIteratorTest, Chinese) {
  CheckRuns({{"萬國碼", USCRIPT_HAN}});
}
376 
377 struct JapaneseMixedScript {
378   const char* string;
379   // The expected primary_script when the string alone was evaluated.
380   UScriptCode script;
381 } japanese_mixed_scripts[] = {{"あ", USCRIPT_HIRAGANA},
382                               // Katakana should be normalized to Hiragana
383                               {"ア", USCRIPT_HIRAGANA},
384                               // Script_Extensions=Hira Kana
385                               {"\u30FC", USCRIPT_HIRAGANA},
386                               // Script_Extensions=Hani Hira Kana
387                               {"\u303C", USCRIPT_HAN},
388                               // Script_Extensions=Bopo Hang Hani Hira Kana
389                               {"\u3003", USCRIPT_BOPOMOFO},
390                               // Script_Extensions=Bopo Hang Hani Hira Kana Yiii
391                               {"\u3001", USCRIPT_BOPOMOFO}};
392 
// Value-parameterized fixture: reuses the ScriptRunIteratorTest helpers and
// receives one JapaneseMixedScript entry per test instance.
class JapaneseMixedScriptTest
    : public ScriptRunIteratorTest,
      public testing::WithParamInterface<JapaneseMixedScript> {};

// Runs every JapaneseMixedScriptTest case once per entry of
// |japanese_mixed_scripts|.
INSTANTIATE_TEST_SUITE_P(ScriptRunIteratorTest,
                         JapaneseMixedScriptTest,
                         testing::ValuesIn(japanese_mixed_scripts));
400 
// Checks the parameter string on its own and then surrounded by kana.
TEST_P(JapaneseMixedScriptTest, Data) {
  const JapaneseMixedScript& param = GetParam();
  const std::string sample(param.string);

  // On its own the string resolves to the script given by the parameter.
  CheckRuns({{sample.c_str(), param.script}});

  // If the string follows Hiragana or Katakana, or is followed by Hiragana or
  // Katakana, it should be normalized as Hiragana.
  const std::string hiragana("か");
  const std::string katakana("カ");
  const std::string with_kana[] = {
      hiragana + sample,
      sample + hiragana,
      katakana + sample,
      sample + katakana,
      hiragana + sample + katakana,
      katakana + sample + hiragana,
  };
  for (const std::string& text : with_kana) {
    CheckRuns({{text.c_str(), USCRIPT_HIRAGANA}});
  }
}
420 
421 // Close bracket without matching open is ignored
TEST_F(ScriptRunIteratorTest,UnbalancedParens1)422 TEST_F(ScriptRunIteratorTest, UnbalancedParens1) {
423   CHECK_SCRIPT_RUNS(
424       {{"(萬", USCRIPT_HAN}, {"a]", USCRIPT_LATIN}, {")", USCRIPT_HAN}});
425 }
426 
427 // Open bracket without matching close is popped when inside
428 // matching close brackets, so doesn't match later close.
TEST_F(ScriptRunIteratorTest,UnbalancedParens2)429 TEST_F(ScriptRunIteratorTest, UnbalancedParens2) {
430   CHECK_SCRIPT_RUNS(
431       {{"(萬", USCRIPT_HAN}, {"a[", USCRIPT_LATIN}, {")]", USCRIPT_HAN}});
432 }
433 
434 // space goes with leading script
TEST_F(ScriptRunIteratorTest,LatinHan)435 TEST_F(ScriptRunIteratorTest, LatinHan) {
436   CHECK_SCRIPT_RUNS({{"Unicode ", USCRIPT_LATIN}, {"萬國碼", USCRIPT_HAN}});
437 }
438 
439 // space goes with leading script
TEST_F(ScriptRunIteratorTest,HanLatin)440 TEST_F(ScriptRunIteratorTest, HanLatin) {
441   CHECK_SCRIPT_RUNS({{"萬國碼 ", USCRIPT_HAN}, {"Unicode", USCRIPT_LATIN}});
442 }
443 
// A pair of parens with nothing else is expected to form a single COMMON run.
TEST_F(ScriptRunIteratorTest, ParenEmptyParen) {
  CheckRuns({{"()", USCRIPT_COMMON}});
}
447 
// Parens around Han text are expected to join the HAN run.
TEST_F(ScriptRunIteratorTest, ParenChineseParen) {
  CheckRuns({{"(萬國碼)", USCRIPT_HAN}});
}
451 
// Parens around Latin text are expected to join the LATIN run.
TEST_F(ScriptRunIteratorTest, ParenLatinParen) {
  CheckRuns({{"(Unicode)", USCRIPT_LATIN}});
}
455 
456 // open paren gets leading script
TEST_F(ScriptRunIteratorTest,LatinParenChineseParen)457 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen) {
458   CHECK_SCRIPT_RUNS({{"Unicode (", USCRIPT_LATIN},
459                      {"萬國碼", USCRIPT_HAN},
460                      {")", USCRIPT_LATIN}});
461 }
462 
463 // open paren gets first trailing script if no leading script
TEST_F(ScriptRunIteratorTest,ParenChineseParenLatin)464 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin) {
465   CHECK_SCRIPT_RUNS({{"(萬國碼) ", USCRIPT_HAN}, {"Unicode", USCRIPT_LATIN}});
466 }
467 
468 // leading common and open paren get first trailing script.
469 // TODO(dougfelt): we don't do quote matching, but probably should figure out
470 // something better then doing nothing.
TEST_F(ScriptRunIteratorTest,QuoteParenChineseParenLatinQuote)471 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote) {
472   CHECK_SCRIPT_RUNS(
473       {{"\"(萬國碼) ", USCRIPT_HAN}, {"Unicode\"", USCRIPT_LATIN}});
474 }
475 
476 // Emojies are resolved to the leading script.
TEST_F(ScriptRunIteratorTest,EmojiCommon)477 TEST_F(ScriptRunIteratorTest, EmojiCommon) {
478   CHECK_SCRIPT_RUNS({{"百家姓��������", USCRIPT_HAN}});
479 }
480 
481 // Unmatched close brace gets leading context
TEST_F(ScriptRunIteratorTest,UnmatchedClose)482 TEST_F(ScriptRunIteratorTest, UnmatchedClose) {
483   CHECK_SCRIPT_RUNS({{"Unicode (", USCRIPT_LATIN},
484                      {"萬國碼] ", USCRIPT_HAN},
485                      {") Unicode\"", USCRIPT_LATIN}});
486 }
487 
488 // Match up to 32 bracket pairs
TEST_F(ScriptRunIteratorTest,Match32Brackets)489 TEST_F(ScriptRunIteratorTest, Match32Brackets) {
490   CHECK_SCRIPT_RUNS({{"[萬國碼 ", USCRIPT_HAN},
491                      {"Unicode (((((((((((((((((((((((((((((((!"
492                       ")))))))))))))))))))))))))))))))",
493                       USCRIPT_LATIN},
494                      {"]", USCRIPT_HAN}});
495 }
496 
497 // Matches 32 most recent bracket pairs. More than that, and we revert to
498 // surrounding script.
TEST_F(ScriptRunIteratorTest,Match32MostRecentBrackets)499 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets) {
500   CHECK_SCRIPT_RUNS({{"((([萬國碼 ", USCRIPT_HAN},
501                      {"Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN},
502                      {"萬國碼!", USCRIPT_HAN},
503                      {")))))))))))))))))))))))))))))))", USCRIPT_LATIN},
504                      {"]", USCRIPT_HAN},
505                      {"But )))", USCRIPT_LATIN}});
506 }
507 
508 // A char with multiple scripts that match both leading and trailing context
509 // gets the leading context.
TEST_F(ScriptRunIteratorTest,ExtensionsPreferLeadingContext)510 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext) {
511   CHECK_MOCK_SCRIPT_RUNS({{"h<lh>", USCRIPT_HAN}, {"l", USCRIPT_LATIN}});
512 }
513 
514 // A char with multiple scripts that only match trailing context gets the
515 // trailing context.
TEST_F(ScriptRunIteratorTest,ExtensionsMatchTrailingContext)516 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext) {
517   CHECK_MOCK_SCRIPT_RUNS({{"h", USCRIPT_HAN}, {"<gl>l", USCRIPT_LATIN}});
518 }
519 
520 // Retain first established priority script.  <lhg><gh> produce the script <gh>
521 // with g as priority, because of the two priority scripts l and g, only g
522 // remains.  Then <gh><hgl> retains g as priority, because of the two priority
523 // scripts g and h that remain, g was encountered first.
TEST_F(ScriptRunIteratorTest,ExtensionsRetainFirstPriorityScript)524 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript) {
525   CHECK_MOCK_SCRIPT_RUNS({{"<lhg><gh><hgl>", USCRIPT_GREEK}});
526 }
527 
528 // Parens can have scripts that break script runs.
TEST_F(ScriptRunIteratorTest,ExtensionsParens)529 TEST_F(ScriptRunIteratorTest, ExtensionsParens) {
530   CHECK_MOCK_SCRIPT_RUNS({{"<gl><(lg>", USCRIPT_GREEK},
531                           {"h<[hl>", USCRIPT_HAN},
532                           {"l", USCRIPT_LATIN},
533                           {"<]hl>", USCRIPT_HAN},
534                           {"<)lg>", USCRIPT_GREEK}});
535 }
536 
537 // The close paren might be encountered before we've established the open
538 // paren's script, but when this is the case the current set is still valid, so
539 // this doesn't affect it nor break the run.
TEST_F(ScriptRunIteratorTest,ExtensionsParens2)540 TEST_F(ScriptRunIteratorTest, ExtensionsParens2) {
541   CHECK_MOCK_SCRIPT_RUNS({{"<(lhg><gh><)lhg>", USCRIPT_GREEK}});
542 }
543 
544 // A common script with a single extension should be treated as common, but
545 // with the extended script as a default.  If we encounter anything other than
546 // common, that takes priority.  If we encounter other common scripts with a
547 // single extension, the current priority remains.
TEST_F(ScriptRunIteratorTest,CommonWithPriority)548 TEST_F(ScriptRunIteratorTest, CommonWithPriority) {
549   CHECK_MOCK_SCRIPT_RUNS({{"<ch>", USCRIPT_HAN}});
550 }
551 
// A common char defaulting to Han followed by a Latin/Han char is expected to
// resolve to LATIN.
TEST_F(ScriptRunIteratorTest, CommonWithPriority2) {
  CheckMockRuns({{"<ch><lh>", USCRIPT_LATIN}});
}
555 
// Several common chars with different single extensions are expected to
// resolve to the first one's script (HAN).
TEST_F(ScriptRunIteratorTest, CommonWithPriority3) {
  CheckMockRuns({{"<ch><cl><cg>", USCRIPT_HAN}});
}
559 
560 // UDatta (\xE0\xA5\x91) is inherited with LATIN, DEVANAGARI, BENGALI and
561 // other Indic scripts. Since it has LATIN, and the
562 // dotted circle U+25CC (\xE2\x97\x8C) is COMMON and has adopted the
563 // preceding LATIN, it gets the LATIN. This is standard.
TEST_F(ScriptRunIteratorTest,LatinDottedCircleUdatta)564 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta) {
565   CHECK_SCRIPT_RUNS({{"Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN}});
566 }
567 
568 // In this situation, UDatta U+0951 (\xE0\xA5\x91) doesn't share a script
569 // with the value inherited by the dotted circle U+25CC (\xE2\x97\x8C).
570 // It captures the preceding dotted circle and breaks it from the run it would
571 // normally have been in. U+0951 is used in multiple scripts (DEVA, BENG, LATN,
572 // etc) and has multiple values for Script_Extension property. At the moment,
573 // getScripts() treats the script with the lowest script code as 'true' primary,
574 // and BENG comes before DEVA in the script enum so that we get BENGALI.
575 // Taking into account a Unicode block and returning DEVANAGARI would be
576 // slightly better.
TEST_F(ScriptRunIteratorTest,HanDottedCircleUdatta)577 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta) {
578   CHECK_SCRIPT_RUNS({{"萬國碼 ", USCRIPT_HAN},
579                      {"\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_BENGALI}});
580 }
581 
582 // Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
583 // common, that of Fathatan is inherited.  The script extensions for Fathatan
584 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
585 // preferred script for Fathatan is Arabic, according to Behdad's
586 // heuristic. This is exactly analogous to the Udatta tests above, except
587 // Tatweel is Lm. But we don't take properties into account, only scripts.
TEST_F(ScriptRunIteratorTest,LatinTatweelFathatan)588 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan) {
589   CHECK_SCRIPT_RUNS(
590       {{"Latin ", USCRIPT_LATIN}, {"\xD9\x80\xD9\x8B", USCRIPT_ARABIC}});
591 }
592 
593 // Another case where if the mark accepts a script that was inherited by the
594 // preceding common-script character, they both continue in that script.
595 // SYRIAC LETTER NUN \xDC\xA2
596 // ARABIC TATWEEL \xD9\x80
597 // ARABIC FATHATAN \xD9\x82
TEST_F(ScriptRunIteratorTest,SyriacTatweelFathatan)598 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan) {
599   CHECK_SCRIPT_RUNS({{"\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC}});
600 }
601 
602 // The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
603 // is not common.
TEST_F(ScriptRunIteratorTest,HanUdatta)604 TEST_F(ScriptRunIteratorTest, HanUdatta) {
605   CHECK_SCRIPT_RUNS({{"萬國碼\xE0\xA5\x91", USCRIPT_HAN}});
606 }
607 
608 // The Udatta U+0951 (\xE0\xA5\x91) is inherited, and will capture the space
609 // and turn it into Bengali because SCRIPT_BENAGLI is 4 and SCRIPT_DEVANAGARI
610 // is 10. See TODO comment for |getScripts| and HanDottedCircleUdatta.
TEST_F(ScriptRunIteratorTest,HanSpaceUdatta)611 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta) {
612   CHECK_SCRIPT_RUNS(
613       {{"萬國碼", USCRIPT_HAN}, {" \xE0\xA5\x91", USCRIPT_BENGALI}});
614 }
615 
616 // Corresponds to one test in RunSegmenter, where orientation of the
617 // space character is sidesways in vertical.
TEST_F(ScriptRunIteratorTest,Hangul)618 TEST_F(ScriptRunIteratorTest, Hangul) {
619   CHECK_SCRIPT_RUNS({{"키스의 고유조건은", USCRIPT_HANGUL}});
620 }
621 
622 // Corresponds to one test in RunSegmenter, which tests that the punctuation
623 // characters mixed in are actually sideways in vertical. The ScriptIterator
624 // should report one run, but the RunSegmenter should report three, with the
625 // middle one rotated sideways.
TEST_F(ScriptRunIteratorTest,HiraganaMixedPunctuation)626 TEST_F(ScriptRunIteratorTest, HiraganaMixedPunctuation) {
627   CHECK_SCRIPT_RUNS({{"いろはに.…¡ほへと", USCRIPT_HIRAGANA}});
628 }
629 
630 // Make sure Mock code works too.
TEST_F(ScriptRunIteratorTest,MockHanInheritedGL)631 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL) {
632   CHECK_MOCK_SCRIPT_RUNS({{"h<igl>", USCRIPT_HAN}});
633 }
634 
// A common char followed by an inherited Greek/Latin char after Han is
// expected to start a new GREEK run.
TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) {
  CheckMockRuns({{"h", USCRIPT_HAN}, {"c<igl>", USCRIPT_GREEK}});
}
638 
639 // Leading inherited just act like common, except there's no preferred script.
TEST_F(ScriptRunIteratorTest,MockLeadingInherited)640 TEST_F(ScriptRunIteratorTest, MockLeadingInherited) {
641   CHECK_MOCK_SCRIPT_RUNS({{"<igl>", USCRIPT_COMMON}});
642 }
643 
644 // Leading inherited just act like common, except there's no preferred script.
TEST_F(ScriptRunIteratorTest,MockLeadingInherited2)645 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2) {
646   CHECK_MOCK_SCRIPT_RUNS({{"<igl><ih>", USCRIPT_COMMON}});
647 }
648 
// A leading inherited mark followed by Han text is expected to form a single
// HAN run.
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) {
  // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
  CheckRuns({{"\xE0\xA5\x91萬國碼", USCRIPT_HAN}});
}
653 
// Two leading inherited marks followed by Han text are expected to form a
// single HAN run.
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) {
  // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
  // ARABIC FATHATAN \xD9\x8B
  CheckRuns({{"\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN}});
}
659 
// A Latin letter carrying combining marks is expected to form a single LATIN
// run.
TEST_F(ScriptRunIteratorTest, OddLatinString) {
  CheckRuns({{"ç̈", USCRIPT_LATIN}});
}
663 
// Digits and a hyphen followed by Malayalam characters are expected to form
// a single MALAYALAM run.
TEST_F(ScriptRunIteratorTest, CommonMalayalam) {
  CheckRuns({{"100-ാം", USCRIPT_MALAYALAM}});
}
667 
668 class ScriptRunIteratorICUDataTest : public testing::Test {
669  public:
ScriptRunIteratorICUDataTest()670   ScriptRunIteratorICUDataTest()
671       : max_extensions_(0), max_extensions_codepoint_(0xffff) {
672     int max_extensions = 0;
673     UChar32 max_extensionscp = 0;
674     for (UChar32 cp = 0; cp < 0x11000; ++cp) {
675       UErrorCode status = U_ZERO_ERROR;
676       int count = uscript_getScriptExtensions(cp, nullptr, 0, &status);
677       if (count > max_extensions) {
678         max_extensions = count;
679         max_extensionscp = cp;
680       }
681     }
682     max_extensions_ = max_extensions;
683     max_extensions_codepoint_ = max_extensionscp;
684   }
685 
686  protected:
GetACharWithMaxExtensions(int * num_extensions)687   UChar32 GetACharWithMaxExtensions(int* num_extensions) {
688     if (num_extensions) {
689       *num_extensions = max_extensions_;
690     }
691     return max_extensions_codepoint_;
692   }
693 
694  private:
695   int max_extensions_;
696   UChar32 max_extensions_codepoint_;
697 };
698 
699 // Validate that ICU never returns more than our maximum expected number of
700 // script extensions.
TEST_F(ScriptRunIteratorICUDataTest,ValidateICUMaxScriptExtensions)701 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions) {
702   int max_extensions;
703   UChar32 cp = GetACharWithMaxExtensions(&max_extensions);
704   ASSERT_LE(max_extensions, ScriptData::kMaxScriptCount)
705       << "char " << std::hex << cp << std::dec;
706 }
707 
708 // Check that ICUScriptData returns all of a character's scripts.
709 // This only checks one likely character, but doesn't check all cases.
TEST_F(ScriptRunIteratorICUDataTest,ICUDataGetScriptsReturnsAllExtensions)710 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions) {
711   int max_extensions;
712   UChar32 cp = GetACharWithMaxExtensions(&max_extensions);
713   ScriptData::UScriptCodeList extensions;
714   ICUScriptData::Instance()->GetScripts(cp, extensions);
715 
716   // It's possible that GetScripts adds the primary script to the list of
717   // extensions, resulting in one more script than the raw extension count.
718   ASSERT_GE(static_cast<int>(extensions.size()), max_extensions)
719       << "char " << std::hex << cp << std::dec;
720 }
721 
// For every code point whose first reported script is COMMON, GetScripts must
// report at most two entries in total.
TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) {
  ScriptData::UScriptCodeList scripts;
  for (UChar32 code_point = 0; code_point < 0x110000; ++code_point) {
    ICUScriptData::Instance()->GetScripts(code_point, scripts);
    if (scripts.at(0) != USCRIPT_COMMON) {
      continue;
    }
    ASSERT_LE(scripts.size(), 2ul)
        << "cp: " << std::hex << code_point << std::dec;
  }
}
732 
733 // ZWJ is \u200D Cf (Format, other) and its script is inherited.  I'm going to
734 // ignore this for now, as I think it shouldn't matter which run it ends up
735 // in. HarfBuzz needs to be able to use it as context and shape each
736 // neighboring character appropriately no matter what run it got assigned to.
737 
738 }  // namespace blink
739