// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"bytes"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"testing"
	"unicode/utf8"

	"golang.org/x/text/internal/testtext"
	"golang.org/x/text/transform"
)

var (
	// testn restricts the append tests to a single table index; -1 runs all.
	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
)

// pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
func pc(s string) []byte {
	b := bytes.NewBuffer(make([]byte, 0, len(s)))
	for i := 0; i < len(s); {
		r, sz := utf8.DecodeRuneInString(s[i:])
		n := 0
		if sz == 1 {
			// Special-case one-byte case to handle repetition for invalid UTF-8.
			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
			}
		} else {
			// Count how many times the (multi-byte) rune repeats from i.
			for _, r2 := range s[i:] {
				if r2 != r {
					break
				}
				n++
			}
		}
		b.WriteString(s[i : i+sz])
		if n > 1 {
			fmt.Fprintf(b, "{%d}", n)
		}
		i += sz * n
	}
	return b.Bytes()
}

// pidx finds the index from which two strings start to differ, plus context.
// It returns the index and ellipsis if the index is greater than 0.
func pidx(a, b string) (i int, prefix string) {
	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
	}
	if i < 8 {
		return 0, ""
	}
	i -= 3 // ensure taking at least one full rune before the difference.
	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
	}
	return i, "..."
}

// PositionTest pairs an input with an expected position result; the meaning
// of pos depends on the positionFunc under test (boundary index, bytes
// consumed, or a 0/1 boolean).
type PositionTest struct {
	input  string
	pos    int
	buffer string // expected contents of reorderBuffer, if applicable
}

// positionFunc runs the functionality under test on s, returning a position
// and, optionally, resulting buffer contents for comparison.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)

// runPosTests initializes a reorderBuffer for form f, runs fn on every test
// case, and reports mismatches in the returned position or buffer contents.
func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
	rb := reorderBuffer{}
	rb.init(f, nil)
	for i, test := range tests {
		rb.reset()
		rb.src = inputString(test.input)
		rb.nsrc = len(test.input)
		pos, out := fn(&rb, test.input)
		if pos != test.pos {
			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
		}
		if outs := string(out); outs != test.buffer {
			// Trim the common prefix so the diff is readable.
			k, pfx := pidx(outs, test.buffer)
			t.Errorf("%s:%d: buffer \nwas %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
		}
	}
}

// grave returns n copies of U+0300 COMBINING GRAVE ACCENT.
func grave(n int) string {
	return rep(0x0300, n)
}

// rep returns the rune r repeated n times.
func rep(r rune, n int) string {
	return strings.Repeat(string(r), n)
}

const segSize = maxByteBufferSize

// cgj is shorthand for the GraphemeJoiner inserted on segment overflow.
var cgj = GraphemeJoiner

var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+04DA MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}

// decomposeSegmentF adapts decomposeSegment to the positionFunc signature,
// decomposing the first segment of s under NFD and returning the number of
// bytes consumed plus the flushed output.
func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
	rb.initString(NFD, s)
	rb.setFlusher(nil, appendFlush)
	p := decomposeSegment(rb, 0, true)
	return p, rb.out
}

func TestDecomposeSegment(t *testing.T) {
	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
}

// firstBoundaryTests: pos is the index of the first boundary, or -1 if none.
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}

// firstBoundaryF exercises the []byte variant, Form.FirstBoundary.
func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundary([]byte(s)), nil
}

// firstBoundaryStringF exercises the string variant, FirstBoundaryInString.
func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundaryInString(s), nil
}

func TestFirstBoundary(t *testing.T) {
	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
}

func TestNextBoundary(t *testing.T) {
	testCases := []struct {
		input string
		atEOF bool
		want  int
	}{
		// no boundary
		{"", true, 0},
		{"", false, -1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\x80\x80", true, 1},
		{"\x80\x80", false, 1},
		// illegal runes
		{"\xff", false, 1},
		{"\u0300\xff", false, 2},
		{"\u0300\xc0\x80\x80", false, 2},
		{"\xc2\x80\x80", false, 2},
		{"\xc2", false, -1},
		{"\xc2", true, 1},
		{"a\u0300\xc2", false, -1},
		{"a\u0300\xc2", true, 3},
		// boundaries
		{"a", true, 1},
		{"a", false, -1},
		{"aa", false, 1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\u0300a", false, 2},
		// Hangul
		{"\u1103\u1161", true, 6},
		{"\u1103\u1161", false, -1},
		{"\u110B\u1173\u11B7", false, -1},
		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
		{"\u1161\u110B\u1173\u11B7", false, 3},
		{"\u1173\u11B7\u1103\u1161", false, 6},
		// too many combining characters.
		{grave(maxNonStarters - 1), false, -1},
		{grave(maxNonStarters), false, 60},
		{grave(maxNonStarters + 1), false, 60},
	}

	// Check the []byte and string variants against the same expectations.
	for _, tc := range testCases {
		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
	}
}

var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}

// decomposeToLast adapts decomposeToLastBoundary to the positionFunc
// signature; the position is len(rb.out) after flushing and buf is the
// flushed buffer contents.
func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
	rb.setFlusher([]byte(s), appendFlush)
	decomposeToLastBoundary(rb)
	buf := rb.flush(nil)
	return len(rb.out), buf
}

func TestDecomposeToLastBoundary(t *testing.T) {
	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
}

// lastBoundaryTests: pos is the index of the last boundary, or -1 if none.
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}

// lastBoundaryF exercises Form.LastBoundary.
func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.LastBoundary([]byte(s)), nil
}

func TestLastBoundary(t *testing.T) {
	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
}

// spanTest is a table entry for Form.Span/SpanString: n is the expected
// number of bytes spanned and err the expected error, if any.
type spanTest struct {
	input string
	atEOF bool
	n     int
	err   error
}

var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}

var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}

var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether last
	// characters combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}

// runSpanTests checks Form.Span ([]byte variant) for each case and, only if
// that subtest passes, the SpanString variant as well.
func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
	for i, tc := range testCases {
		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		ok := testtext.Run(t, s, func(t *testing.T) {
			n, err := f.Span([]byte(tc.input), tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
		if !ok {
			continue // Don't do the String variant if the Bytes variant failed.
		}
		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		testtext.Run(t, s, func(t *testing.T) {
			n, err := f.SpanString(tc.input, tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
	}
}

func TestSpan(t *testing.T) {
	runSpanTests(t, "NFD", NFD, quickSpanTests)
	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
	runSpanTests(t, "NFC", NFC, quickSpanTests)
	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
}

// For the isNormal tables, pos acts as a boolean: 1 means the input is in
// the form under test, 0 means it is not (see isNormalF below).
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}

var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}

// isNormalF adapts Form.IsNormal to the positionFunc signature, mapping
// true/false to positions 1/0.
func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormal([]byte(s)) {
		return 1, nil
	}
	return 0, nil
}

// isNormalStringF is the IsNormalString counterpart of isNormalF.
func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormalString(s) {
		return 1, nil
	}
	return 0, nil
}

func TestIsNormal(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
}

func TestIsNormalString(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
}

// AppendTest describes appending right to an existing buffer left, with out
// the expected normalized result.
type AppendTest struct {
	left  string
	right string
	out   string
}

// appendFunc is the append-style operation under test (Append, AppendString,
// Bytes, or String, adapted to a common signature).
type appendFunc func(f Form, out []byte, s string) []byte

// fstr names the forms, indexed by Form value.
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}

// runNormTests runs fn against the per-form table in normTests for each of
// the four normal forms.
func runNormTests(t *testing.T, name string, fn appendFunc) {
	for f := NFC; f <= NFKD; f++ {
		runAppendTests(t, name, f, fn, normTests[f])
	}
}

// runAppendTests checks fn for form f against each test case, and then
// cross-checks fn under every other form against that form's String result
// on the same (normalized) input.
func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
	for i, test := range tests {
		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
			id := pc(test.left + test.right)
			// Honor the -testn flag to run a single table entry.
			if *testn >= 0 && i != *testn {
				return
			}
			t.Run("fn", func(t *testing.T) {
				out := []byte(test.left)
				have := string(fn(f, out, test.right))
				if len(have) != len(test.out) {
					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
				}
				if have != test.out {
					k, pf := pidx(have, test.out)
					t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
				}
			})

			// Bootstrap by normalizing input. Ensures that the various variants
			// behave the same.
			for g := NFC; g <= NFKD; g++ {
				if f == g {
					continue
				}
				t.Run(fstr[g], func(t *testing.T) {
					want := g.String(test.left + test.right)
					have := string(fn(g, g.AppendString(nil, test.left), test.right))
					if len(have) != len(want) {
						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
					}
					if have != want {
						k, pf := pidx(have, want)
						t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
					}
				})
			}
		})
	}
}

// normTests is indexed by Form (NFC, NFD, NFKC, NFKD).
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}

var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+04DA MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}

var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
729} 730 731var appendTestsNFKC = []AppendTest{ 732 // empty buffers 733 {"", "", ""}, 734 {"a", "", "a"}, 735 {"", "a", "a"}, 736 {"", "\u0041\u0307\u0304", "\u01E0"}, 737 // segment split across buffers 738 {"", "a\u0300b", "\u00E0b"}, 739 {"a", "\u0300b", "\u00E0b"}, 740 {"a", "\u0300\u0316", "\u00E0\u0316"}, 741 {"a", "\u0316\u0300", "\u00E0\u0316"}, 742 {"a", "\u0300a\u0300", "\u00E0\u00E0"}, 743 {"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"}, 744 {"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"}, 745 {"a\u0300", "\u0327", "\u00E0\u0327"}, 746 {"a\u0327", "\u0300", "\u00E0\u0327"}, 747 {"a\u0316", "\u0300", "\u00E0\u0316"}, 748 {"\u0041\u0307", "\u0304", "\u01E0"}, 749 // Hangul 750 {"", "\u110B\u1173", "\uC73C"}, 751 {"", "\u1103\u1161", "\uB2E4"}, 752 {"", "\u110B\u1173\u11B7", "\uC74C"}, 753 {"", "\u320E", "\x28\uAC00\x29"}, 754 {"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"}, 755 {"\u1103", "\u1161", "\uB2E4"}, 756 {"\u110B", "\u1173\u11B7", "\uC74C"}, 757 {"\u110B\u1173", "\u11B7", "\uC74C"}, 758 {"\uC73C", "\u11B7", "\uC74C"}, 759 // UTF-8 encoding split across buffers 760 {"a\xCC", "\x80", "\u00E0"}, 761 {"a\xCC", "\x80b", "\u00E0b"}, 762 {"a\xCC", "\x80a\u0300", "\u00E0\u00E0"}, 763 {"a\xCC", "\x80\x80", "\u00E0\x80"}, 764 {"a\xCC", "\x80\xCC", "\u00E0\xCC"}, 765 {"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"}, 766 // ending in incomplete UTF-8 encoding 767 {"", "\xCC", "\xCC"}, 768 {"a", "\xCC", "a\xCC"}, 769 {"a", "b\xCC", "ab\xCC"}, 770 {"\u0226", "\xCC", "\u0226\xCC"}, 771 // illegal runes 772 {"", "\x80", "\x80"}, 773 {"", "\x80\x80\x80", "\x80\x80\x80"}, 774 {"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"}, 775 {"", "a\x80", "a\x80"}, 776 {"", "a\x80\x80\x80", "a\x80\x80\x80"}, 777 {"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"}, 778 {"a", "\x80\x80\x80", "a\x80\x80\x80"}, 779 // overflow 780 {"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)}, 781 {strings.Repeat("\x80", 33), "", 
strings.Repeat("\x80", 33)}, 782 {strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)}, 783 // overflow of combining characters 784 {"", grave(34), grave(30) + cgj + grave(4)}, 785 {"", grave(36), grave(30) + cgj + grave(6)}, 786 {grave(29), grave(5), grave(30) + cgj + grave(4)}, 787 {grave(30), grave(4), grave(30) + cgj + grave(4)}, 788 {grave(30), grave(3), grave(30) + cgj + grave(3)}, 789 {grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)}, 790 {"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)}, 791 {"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)}, 792 // - First rune has a trailing non-starter. 793 {"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)}, 794 // - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be 795 // inserted even when FF9E starts a new segment. 796 {"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)}, 797 {grave(30), "\uff9e", grave(30) + cgj + "\u3099"}, 798 // - Many non-starter decompositions in a row causing overflow. 799 {"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"}, 800 {"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"}, 801 802 {"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"}, 803 {"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)}, 804 {"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)}, 805 806 // U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers. 
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}

var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}

// TestAppend exercises Form.Append via the shared norm-test tables.
func TestAppend(t *testing.T) {
	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
		return f.Append(out, []byte(s)...)
	})
}

// TestAppendString exercises Form.AppendString via the shared tables.
func TestAppendString(t *testing.T) {
	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
		return f.AppendString(out, s)
	})
}

// TestBytes adapts Form.Bytes to the append-style signature by concatenating
// left and right before normalizing.
func TestBytes(t *testing.T) {
	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
		buf := []byte{}
		buf = append(buf, out...)
		buf = append(buf, s...)
		return f.Bytes(buf)
	})
}

// TestString adapts Form.String to the append-style signature.
func TestString(t *testing.T) {
	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
		outs := string(out) + s
		return []byte(f.String(outs))
	})
}

// runNM writes code to a temp directory, builds it with the go tool from
// GOROOT, and returns the output of "go tool nm" on the resulting binary.
func runNM(code string) (string, error) {
	// Write the file.
	tmpdir, err := ioutil.TempDir(os.TempDir(), "normalize_test")
	if err != nil {
		return "", fmt.Errorf("failed to create tmpdir: %v", err)
	}
	defer os.RemoveAll(tmpdir)
	goTool := filepath.Join(runtime.GOROOT(), "bin", "go")
	filename := filepath.Join(tmpdir, "main.go")
	if err := ioutil.WriteFile(filename, []byte(code), 0644); err != nil {
		return "", fmt.Errorf("failed to write main.go: %v", err)
	}
	outputFile := filepath.Join(tmpdir, "main")

	// Build the binary.
	out, err := exec.Command(goTool, "build", "-o", outputFile, filename).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("failed to execute command: %v", err)
	}

	// Get the symbols.
949 out, err = exec.Command(goTool, "tool", "nm", outputFile).CombinedOutput() 950 return string(out), err 951} 952 953func TestLinking(t *testing.T) { 954 const prog = ` 955 package main 956 import "fmt" 957 import "golang.org/x/text/unicode/norm" 958 func main() { fmt.Println(norm.%s) } 959 ` 960 961 baseline, errB := runNM(fmt.Sprintf(prog, "MaxSegmentSize")) 962 withTables, errT := runNM(fmt.Sprintf(prog, `NFC.String("")`)) 963 if errB != nil || errT != nil { 964 t.Skipf("TestLinking failed: %v and %v", errB, errT) 965 } 966 967 symbols := []string{"norm.formTable", "norm.nfkcValues", "norm.decomps"} 968 for _, symbol := range symbols { 969 if strings.Contains(baseline, symbol) { 970 t.Errorf("found: %q unexpectedly", symbol) 971 } 972 if !strings.Contains(withTables, symbol) { 973 t.Errorf("didn't find: %q unexpectedly", symbol) 974 } 975 } 976} 977 978func appendBench(f Form, in []byte) func() { 979 buf := make([]byte, 0, 4*len(in)) 980 return func() { 981 f.Append(buf, in...) 982 } 983} 984 985func bytesBench(f Form, in []byte) func() { 986 return func() { 987 f.Bytes(in) 988 } 989} 990 991func iterBench(f Form, in []byte) func() { 992 iter := Iter{} 993 return func() { 994 iter.Init(f, in) 995 for !iter.Done() { 996 iter.Next() 997 } 998 } 999} 1000 1001func transformBench(f Form, in []byte) func() { 1002 buf := make([]byte, 4*len(in)) 1003 return func() { 1004 if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n { 1005 log.Panic(n, len(in), err) 1006 } 1007 } 1008} 1009 1010func readerBench(f Form, in []byte) func() { 1011 buf := make([]byte, 4*len(in)) 1012 return func() { 1013 r := f.Reader(bytes.NewReader(in)) 1014 var err error 1015 for err == nil { 1016 _, err = r.Read(buf) 1017 } 1018 if err != io.EOF { 1019 panic("") 1020 } 1021 } 1022} 1023 1024func writerBench(f Form, in []byte) func() { 1025 buf := make([]byte, 0, 4*len(in)) 1026 return func() { 1027 r := f.Writer(bytes.NewBuffer(buf)) 1028 if _, err := r.Write(in); err != nil { 
1029 panic("") 1030 } 1031 } 1032} 1033 1034func appendBenchmarks(bm []func(), f Form, in []byte) []func() { 1035 bm = append(bm, appendBench(f, in)) 1036 bm = append(bm, iterBench(f, in)) 1037 bm = append(bm, transformBench(f, in)) 1038 bm = append(bm, readerBench(f, in)) 1039 bm = append(bm, writerBench(f, in)) 1040 return bm 1041} 1042 1043func doFormBenchmark(b *testing.B, inf, f Form, s string) { 1044 b.StopTimer() 1045 in := inf.Bytes([]byte(s)) 1046 bm := appendBenchmarks(nil, f, in) 1047 b.SetBytes(int64(len(in) * len(bm))) 1048 b.StartTimer() 1049 for i := 0; i < b.N; i++ { 1050 for _, fn := range bm { 1051 fn() 1052 } 1053 } 1054} 1055 1056func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) { 1057 b.StopTimer() 1058 fn := f(NFC, s) 1059 b.SetBytes(int64(len(s))) 1060 b.StartTimer() 1061 for i := 0; i < b.N; i++ { 1062 fn() 1063 } 1064} 1065 1066var ( 1067 smallNoChange = []byte("nörmalization") 1068 smallChange = []byte("No\u0308rmalization") 1069 ascii = strings.Repeat("There is nothing to change here! ", 500) 1070) 1071 1072func lowerBench(f Form, in []byte) func() { 1073 // Use package strings instead of bytes as it doesn't allocate memory 1074 // if there aren't any changes. 
	s := string(in)
	return func() {
		strings.ToLower(s)
	}
}

func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}

// quickSpanBench returns a closure that benchmarks Form.QuickSpan on in.
func quickSpanBench(f Form, in []byte) func() {
	return func() {
		f.QuickSpan(in)
	}
}

// NOTE(review): despite "Change" in its name, this benchmark runs on
// smallNoChange; confirm whether smallChange (or a *NoChange* name)
// was intended.
func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}

func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}

func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}

func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}

func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}

func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}

// The NormalizeX2Y benchmarks convert text that is already in form X to
// form Y, covering all NFC/NFD source/destination combinations.
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}

// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}

// forms lists all four normal forms exercised by doTextBenchmark.
var forms = []Form{NFC, NFD, NFKC, NFKD}

// doTextBenchmark times every API variant for every normal form on s.
// SetBytes accounts for each iteration running all collected closures.
func doTextBenchmark(b *testing.B, s string) {
	b.StopTimer()
	in := []byte(s)
	bm := []func(){}
	for _, f := range forms {
		bm = appendBenchmarks(bm, f, in)
	}
	b.SetBytes(int64(len(s) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, f := range bm {
			f()
		}
	}
}

func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}

// overflow is a very long run of a single combining character followed
// by a different one; input for BenchmarkOverflow.
var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"

// Tests sampled from the Canonical ordering tests (Part 2) of
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
// (Raw string: the \uXXXX sequences below are literal text, not escapes.)
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`

// Russian sample (last line is Greek).
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`

// Greek sample.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`

// Arabic sample.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`

// Hebrew sample.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`

// Samples whose runes are (mostly) two bytes in UTF-8.
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il

// Korean sample; input for the Hangul benchmarks.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`

// Thai sample.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`

// Sample whose runes are (mostly) three bytes in UTF-8.
const threeByteUtf8 = txt_th

// Japanese sample.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`

// Chinese sample.
// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

const
txt_cjk = txt_cn + txt_jp + txt_kr

// txt_all aggregates all language samples above; it is the input for the
// cross-form normalization benchmarks (e.g. BenchmarkNormalizeNFC2NFD).
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk

// txt_all_bytes is txt_all pre-converted to []byte for the byte-oriented
// benchmarks (e.g. BenchmarkAppendLargeNFC, BenchmarkIterLargeNFC).
var txt_all_bytes = []byte(txt_all)