1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build go1.10 6//go:generate go run gen.go gen_trieval.go gen_common.go 7 8// Package idna implements IDNA2008 using the compatibility processing 9// defined by UTS (Unicode Technical Standard) #46, which defines a standard to 10// deal with the transition from IDNA2003. 11// 12// IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC 13// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894. 14// UTS #46 is defined in https://www.unicode.org/reports/tr46. 15// See https://unicode.org/cldr/utility/idna.jsp for a visualization of the 16// differences between these two standards. 17package idna // import "golang.org/x/text/internal/export/idna" 18 19import ( 20 "fmt" 21 "strings" 22 "unicode/utf8" 23 24 "golang.org/x/text/secure/bidirule" 25 "golang.org/x/text/unicode/bidi" 26 "golang.org/x/text/unicode/norm" 27) 28 29// NOTE: Unlike common practice in Go APIs, the functions will return a 30// sanitized domain name in case of errors. Browsers sometimes use a partially 31// evaluated string as lookup. 32// TODO: the current error handling is, in my opinion, the least opinionated. 33// Other strategies are also viable, though: 34// Option 1) Return an empty string in case of error, but allow the user to 35// specify explicitly which errors to ignore. 36// Option 2) Return the partially evaluated string if it is itself a valid 37// string, otherwise return the empty string in case of error. 38// Option 3) Option 1 and 2. 39// Option 4) Always return an empty string for now and implement Option 1 as 40// needed, and document that the return string may not be empty in case of 41// error in the future. 42// I think Option 1 is best, but it is quite opinionated. 43 44// ToASCII is a wrapper for Punycode.ToASCII. 
45func ToASCII(s string) (string, error) { 46 return Punycode.process(s, true) 47} 48 49// ToUnicode is a wrapper for Punycode.ToUnicode. 50func ToUnicode(s string) (string, error) { 51 return Punycode.process(s, false) 52} 53 54// An Option configures a Profile at creation time. 55type Option func(*options) 56 57// Transitional sets a Profile to use the Transitional mapping as defined in UTS 58// #46. This will cause, for example, "ß" to be mapped to "ss". Using the 59// transitional mapping provides a compromise between IDNA2003 and IDNA2008 60// compatibility. It is used by most browsers when resolving domain names. This 61// option is only meaningful if combined with MapForLookup. 62func Transitional(transitional bool) Option { 63 return func(o *options) { o.transitional = true } 64} 65 66// VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 67// are longer than allowed by the RFC. 68// 69// This option corresponds to the VerifyDnsLength flag in UTS #46. 70func VerifyDNSLength(verify bool) Option { 71 return func(o *options) { o.verifyDNSLength = verify } 72} 73 74// RemoveLeadingDots removes leading label separators. Leading runes that map to 75// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 76func RemoveLeadingDots(remove bool) Option { 77 return func(o *options) { o.removeLeadingDots = remove } 78} 79 80// ValidateLabels sets whether to check the mandatory label validation criteria 81// as defined in Section 5.4 of RFC 5891. This includes testing for correct use 82// of hyphens ('-'), normalization, validity of runes, and the context rules. 83// In particular, ValidateLabels also sets the CheckHyphens and CheckJoiners flags 84// in UTS #46. 85func ValidateLabels(enable bool) Option { 86 return func(o *options) { 87 // Don't override existing mappings, but set one that at least checks 88 // normalization if it is not set. 
89 if o.mapping == nil && enable { 90 o.mapping = normalize 91 } 92 o.trie = trie 93 o.checkJoiners = enable 94 o.checkHyphens = enable 95 if enable { 96 o.fromPuny = validateFromPunycode 97 } else { 98 o.fromPuny = nil 99 } 100 } 101} 102 103// CheckHyphens sets whether to check for correct use of hyphens ('-') in 104// labels. Most web browsers do not have this option set, since labels such as 105// "r3---sn-apo3qvuoxuxbt-j5pe" are in common use. 106// 107// This option corresponds to the CheckHyphens flag in UTS #46. 108func CheckHyphens(enable bool) Option { 109 return func(o *options) { o.checkHyphens = enable } 110} 111 112// CheckJoiners sets whether to check the ContextJ rules as defined in Appendix 113// A of RFC 5892, concerning the use of joiner runes. 114// 115// This option corresponds to the CheckJoiners flag in UTS #46. 116func CheckJoiners(enable bool) Option { 117 return func(o *options) { 118 o.trie = trie 119 o.checkJoiners = enable 120 } 121} 122 123// StrictDomainName limits the set of permissible ASCII characters to those 124// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 125// hyphen). This is set by default for MapForLookup and ValidateForRegistration, 126// but is only useful if ValidateLabels is set. 127// 128// This option is useful, for instance, for browsers that allow characters 129// outside this range, for example a '_' (U+005F LOW LINE). See 130// http://www.rfc-editor.org/std/std3.txt for more details. 131// 132// This option corresponds to the UseSTD3ASCIIRules flag in UTS #46. 133func StrictDomainName(use bool) Option { 134 return func(o *options) { o.useSTD3Rules = use } 135} 136 137// NOTE: the following options pull in tables. The tables should not be linked 138// in as long as the options are not used. 139 140// BidiRule enables the Bidi rule as defined in RFC 5893. Any application 141// that relies on proper validation of labels should include this rule. 
142// 143// This option corresponds to the CheckBidi flag in UTS #46. 144func BidiRule() Option { 145 return func(o *options) { o.bidirule = bidirule.ValidString } 146} 147 148// ValidateForRegistration sets validation options to verify that a given IDN is 149// properly formatted for registration as defined by Section 4 of RFC 5891. 150func ValidateForRegistration() Option { 151 return func(o *options) { 152 o.mapping = validateRegistration 153 StrictDomainName(true)(o) 154 ValidateLabels(true)(o) 155 VerifyDNSLength(true)(o) 156 BidiRule()(o) 157 } 158} 159 160// MapForLookup sets validation and mapping options such that a given IDN is 161// transformed for domain name lookup according to the requirements set out in 162// Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 163// RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option 164// to add this check. 165// 166// The mappings include normalization and mapping case, width and other 167// compatibility mappings. 168func MapForLookup() Option { 169 return func(o *options) { 170 o.mapping = validateAndMap 171 StrictDomainName(true)(o) 172 ValidateLabels(true)(o) 173 } 174} 175 176type options struct { 177 transitional bool 178 useSTD3Rules bool 179 checkHyphens bool 180 checkJoiners bool 181 verifyDNSLength bool 182 removeLeadingDots bool 183 184 trie *idnaTrie 185 186 // fromPuny calls validation rules when converting A-labels to U-labels. 187 fromPuny func(p *Profile, s string) error 188 189 // mapping implements a validation and mapping step as defined in RFC 5895 190 // or UTS 46, tailored to, for example, domain registration or lookup. 191 mapping func(p *Profile, s string) (mapped string, isBidi bool, err error) 192 193 // bidirule, if specified, checks whether s conforms to the Bidi Rule 194 // defined in RFC 5893. 195 bidirule func(s string) bool 196} 197 198// A Profile defines the configuration of an IDNA mapper. 
199type Profile struct { 200 options 201} 202 203func apply(o *options, opts []Option) { 204 for _, f := range opts { 205 f(o) 206 } 207} 208 209// New creates a new Profile. 210// 211// With no options, the returned Profile is the most permissive and equals the 212// Punycode Profile. Options can be passed to further restrict the Profile. The 213// MapForLookup and ValidateForRegistration options set a collection of options, 214// for lookup and registration purposes respectively, which can be tailored by 215// adding more fine-grained options, where later options override earlier 216// options. 217func New(o ...Option) *Profile { 218 p := &Profile{} 219 apply(&p.options, o) 220 return p 221} 222 223// ToASCII converts a domain or domain label to its ASCII form. For example, 224// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 225// ToASCII("golang") is "golang". If an error is encountered it will return 226// an error and a (partially) processed result. 227func (p *Profile) ToASCII(s string) (string, error) { 228 return p.process(s, true) 229} 230 231// ToUnicode converts a domain or domain label to its Unicode form. For example, 232// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and 233// ToUnicode("golang") is "golang". If an error is encountered it will return 234// an error and a (partially) processed result. 235func (p *Profile) ToUnicode(s string) (string, error) { 236 pp := *p 237 pp.transitional = false 238 return pp.process(s, false) 239} 240 241// String reports a string with a description of the profile for debugging 242// purposes. The string format may change with different versions. 
243func (p *Profile) String() string { 244 s := "" 245 if p.transitional { 246 s = "Transitional" 247 } else { 248 s = "NonTransitional" 249 } 250 if p.useSTD3Rules { 251 s += ":UseSTD3Rules" 252 } 253 if p.checkHyphens { 254 s += ":CheckHyphens" 255 } 256 if p.checkJoiners { 257 s += ":CheckJoiners" 258 } 259 if p.verifyDNSLength { 260 s += ":VerifyDNSLength" 261 } 262 return s 263} 264 265var ( 266 // Punycode is a Profile that does raw punycode processing with a minimum 267 // of validation. 268 Punycode *Profile = punycode 269 270 // Lookup is the recommended profile for looking up domain names, according 271 // to Section 5 of RFC 5891. The exact configuration of this profile may 272 // change over time. 273 Lookup *Profile = lookup 274 275 // Display is the recommended profile for displaying domain names. 276 // The configuration of this profile may change over time. 277 Display *Profile = display 278 279 // Registration is the recommended profile for checking whether a given 280 // IDN is valid for registration, according to Section 4 of RFC 5891. 
281 Registration *Profile = registration 282 283 punycode = &Profile{} 284 lookup = &Profile{options{ 285 transitional: true, 286 useSTD3Rules: true, 287 checkHyphens: true, 288 checkJoiners: true, 289 trie: trie, 290 fromPuny: validateFromPunycode, 291 mapping: validateAndMap, 292 bidirule: bidirule.ValidString, 293 }} 294 display = &Profile{options{ 295 useSTD3Rules: true, 296 checkHyphens: true, 297 checkJoiners: true, 298 trie: trie, 299 fromPuny: validateFromPunycode, 300 mapping: validateAndMap, 301 bidirule: bidirule.ValidString, 302 }} 303 registration = &Profile{options{ 304 useSTD3Rules: true, 305 verifyDNSLength: true, 306 checkHyphens: true, 307 checkJoiners: true, 308 trie: trie, 309 fromPuny: validateFromPunycode, 310 mapping: validateRegistration, 311 bidirule: bidirule.ValidString, 312 }} 313 314 // TODO: profiles 315 // Register: recommended for approving domain names: don't do any mappings 316 // but rather reject on invalid input. Bundle or block deviation characters. 317) 318 319type labelError struct{ label, code_ string } 320 321func (e labelError) code() string { return e.code_ } 322func (e labelError) Error() string { 323 return fmt.Sprintf("idna: invalid label %q", e.label) 324} 325 326type runeError rune 327 328func (e runeError) code() string { return "P1" } 329func (e runeError) Error() string { 330 return fmt.Sprintf("idna: disallowed rune %U", e) 331} 332 333// process implements the algorithm described in section 4 of UTS #46, 334// see https://www.unicode.org/reports/tr46. 335func (p *Profile) process(s string, toASCII bool) (string, error) { 336 var err error 337 var isBidi bool 338 if p.mapping != nil { 339 s, isBidi, err = p.mapping(p, s) 340 } 341 // Remove leading empty labels. 342 if p.removeLeadingDots { 343 for ; len(s) > 0 && s[0] == '.'; s = s[1:] { 344 } 345 } 346 // TODO: allow for a quick check of the tables data. 
347 // It seems like we should only create this error on ToASCII, but the 348 // UTS 46 conformance tests suggests we should always check this. 349 if err == nil && p.verifyDNSLength && s == "" { 350 err = &labelError{s, "A4"} 351 } 352 labels := labelIter{orig: s} 353 for ; !labels.done(); labels.next() { 354 label := labels.label() 355 if label == "" { 356 // Empty labels are not okay. The label iterator skips the last 357 // label if it is empty. 358 if err == nil && p.verifyDNSLength { 359 err = &labelError{s, "A4"} 360 } 361 continue 362 } 363 if strings.HasPrefix(label, acePrefix) { 364 u, err2 := decode(label[len(acePrefix):]) 365 if err2 != nil { 366 if err == nil { 367 err = err2 368 } 369 // Spec says keep the old label. 370 continue 371 } 372 isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight 373 labels.set(u) 374 if err == nil && p.fromPuny != nil { 375 err = p.fromPuny(p, u) 376 } 377 if err == nil { 378 // This should be called on NonTransitional, according to the 379 // spec, but that currently does not have any effect. Use the 380 // original profile to preserve options. 381 err = p.validateLabel(u) 382 } 383 } else if err == nil { 384 err = p.validateLabel(label) 385 } 386 } 387 if isBidi && p.bidirule != nil && err == nil { 388 for labels.reset(); !labels.done(); labels.next() { 389 if !p.bidirule(labels.label()) { 390 err = &labelError{s, "B"} 391 break 392 } 393 } 394 } 395 if toASCII { 396 for labels.reset(); !labels.done(); labels.next() { 397 label := labels.label() 398 if !ascii(label) { 399 a, err2 := encode(acePrefix, label) 400 if err == nil { 401 err = err2 402 } 403 label = a 404 labels.set(a) 405 } 406 n := len(label) 407 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { 408 err = &labelError{label, "A4"} 409 } 410 } 411 } 412 s = labels.result() 413 if toASCII && p.verifyDNSLength && err == nil { 414 // Compute the length of the domain name minus the root label and its dot. 
415 n := len(s) 416 if n > 0 && s[n-1] == '.' { 417 n-- 418 } 419 if len(s) < 1 || n > 253 { 420 err = &labelError{s, "A4"} 421 } 422 } 423 return s, err 424} 425 426func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) { 427 // TODO: consider first doing a quick check to see if any of these checks 428 // need to be done. This will make it slower in the general case, but 429 // faster in the common case. 430 mapped = norm.NFC.String(s) 431 isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft 432 return mapped, isBidi, nil 433} 434 435func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) { 436 // TODO: filter need for normalization in loop below. 437 if !norm.NFC.IsNormalString(s) { 438 return s, false, &labelError{s, "V1"} 439 } 440 for i := 0; i < len(s); { 441 v, sz := trie.lookupString(s[i:]) 442 if sz == 0 { 443 return s, bidi, runeError(utf8.RuneError) 444 } 445 bidi = bidi || info(v).isBidi(s[i:]) 446 // Copy bytes not copied so far. 447 switch p.simplify(info(v).category()) { 448 // TODO: handle the NV8 defined in the Unicode idna data set to allow 449 // for strict conformance to IDNA2008. 450 case valid, deviation: 451 case disallowed, mapped, unknown, ignored: 452 r, _ := utf8.DecodeRuneInString(s[i:]) 453 return s, bidi, runeError(r) 454 } 455 i += sz 456 } 457 return s, bidi, nil 458} 459 460func (c info) isBidi(s string) bool { 461 if !c.isMapped() { 462 return c&attributesMask == rtl 463 } 464 // TODO: also store bidi info for mapped data. This is possible, but a bit 465 // cumbersome and not for the common case. 466 p, _ := bidi.LookupString(s) 467 switch p.Class() { 468 case bidi.R, bidi.AL, bidi.AN: 469 return true 470 } 471 return false 472} 473 474func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) { 475 var ( 476 b []byte 477 k int 478 ) 479 // combinedInfoBits contains the or-ed bits of all runes. We use this 480 // to derive the mayNeedNorm bit later. 
This may trigger normalization 481 // overeagerly, but it will not do so in the common case. The end result 482 // is another 10% saving on BenchmarkProfile for the common case. 483 var combinedInfoBits info 484 for i := 0; i < len(s); { 485 v, sz := trie.lookupString(s[i:]) 486 if sz == 0 { 487 b = append(b, s[k:i]...) 488 b = append(b, "\ufffd"...) 489 k = len(s) 490 if err == nil { 491 err = runeError(utf8.RuneError) 492 } 493 break 494 } 495 combinedInfoBits |= info(v) 496 bidi = bidi || info(v).isBidi(s[i:]) 497 start := i 498 i += sz 499 // Copy bytes not copied so far. 500 switch p.simplify(info(v).category()) { 501 case valid: 502 continue 503 case disallowed: 504 if err == nil { 505 r, _ := utf8.DecodeRuneInString(s[start:]) 506 err = runeError(r) 507 } 508 continue 509 case mapped, deviation: 510 b = append(b, s[k:start]...) 511 b = info(v).appendMapping(b, s[start:i]) 512 case ignored: 513 b = append(b, s[k:start]...) 514 // drop the rune 515 case unknown: 516 b = append(b, s[k:start]...) 517 b = append(b, "\ufffd"...) 518 } 519 k = i 520 } 521 if k == 0 { 522 // No changes so far. 523 if combinedInfoBits&mayNeedNorm != 0 { 524 s = norm.NFC.String(s) 525 } 526 } else { 527 b = append(b, s[k:]...) 528 if norm.NFC.QuickSpan(b) != len(b) { 529 b = norm.NFC.Bytes(b) 530 } 531 // TODO: the punycode converters require strings as input. 532 s = string(b) 533 } 534 return s, bidi, err 535} 536 537// A labelIter allows iterating over domain name labels. 
type labelIter struct {
	orig     string   // the domain name being iterated
	slice    []string // labels of orig split on '.', created by the first set
	curStart int      // start of the current label in orig; len(orig) once done
	curEnd   int      // end of the current label in orig, updated by label
	i        int      // index of the current label
}

// reset rewinds the iterator to the first label. Labels replaced through set
// are kept.
func (l *labelIter) reset() {
	l.curStart, l.curEnd, l.i = 0, 0, 0
}

// done reports whether the iterator has moved past the last label.
func (l *labelIter) done() bool {
	return len(l.orig) <= l.curStart
}

// result returns the domain name, with any labels replaced through set joined
// back together by dots.
func (l *labelIter) result() string {
	if l.slice == nil {
		return l.orig
	}
	return strings.Join(l.slice, ".")
}

// label returns the current label. While no label has been replaced it also
// records where the label ends in orig, which next relies on.
func (l *labelIter) label() string {
	if l.slice != nil {
		return l.slice[l.i]
	}
	if dot := strings.IndexByte(l.orig[l.curStart:], '.'); dot >= 0 {
		l.curEnd = l.curStart + dot
	} else {
		l.curEnd = len(l.orig)
	}
	return l.orig[l.curStart:l.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (l *labelIter) next() {
	l.i++
	if l.slice == nil {
		l.curStart = l.curEnd + 1
		if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' {
			l.curStart = len(l.orig)
		}
		return
	}
	last := len(l.slice) - 1
	if l.i > last || (l.i == last && l.slice[l.i] == "") {
		// Mark the iterator as done.
		l.curStart = len(l.orig)
	}
}

// set replaces the current label with s. The first call splits orig into
// labels; from then on the iterator works on that slice.
func (l *labelIter) set(s string) {
	if l.slice == nil {
		l.slice = strings.Split(l.orig, ".")
	}
	l.slice[l.i] = s
}

// acePrefix is the ASCII Compatible Encoding prefix.
598const acePrefix = "xn--" 599 600func (p *Profile) simplify(cat category) category { 601 switch cat { 602 case disallowedSTD3Mapped: 603 if p.useSTD3Rules { 604 cat = disallowed 605 } else { 606 cat = mapped 607 } 608 case disallowedSTD3Valid: 609 if p.useSTD3Rules { 610 cat = disallowed 611 } else { 612 cat = valid 613 } 614 case deviation: 615 if !p.transitional { 616 cat = valid 617 } 618 case validNV8, validXV8: 619 // TODO: handle V2008 620 cat = valid 621 } 622 return cat 623} 624 625func validateFromPunycode(p *Profile, s string) error { 626 if !norm.NFC.IsNormalString(s) { 627 return &labelError{s, "V1"} 628 } 629 // TODO: detect whether string may have to be normalized in the following 630 // loop. 631 for i := 0; i < len(s); { 632 v, sz := trie.lookupString(s[i:]) 633 if sz == 0 { 634 return runeError(utf8.RuneError) 635 } 636 if c := p.simplify(info(v).category()); c != valid && c != deviation { 637 return &labelError{s, "V6"} 638 } 639 i += sz 640 } 641 return nil 642} 643 644const ( 645 zwnj = "\u200c" 646 zwj = "\u200d" 647) 648 649type joinState int8 650 651const ( 652 stateStart joinState = iota 653 stateVirama 654 stateBefore 655 stateBeforeVirama 656 stateAfter 657 stateFAIL 658) 659 660var joinStates = [][numJoinTypes]joinState{ 661 stateStart: { 662 joiningL: stateBefore, 663 joiningD: stateBefore, 664 joinZWNJ: stateFAIL, 665 joinZWJ: stateFAIL, 666 joinVirama: stateVirama, 667 }, 668 stateVirama: { 669 joiningL: stateBefore, 670 joiningD: stateBefore, 671 }, 672 stateBefore: { 673 joiningL: stateBefore, 674 joiningD: stateBefore, 675 joiningT: stateBefore, 676 joinZWNJ: stateAfter, 677 joinZWJ: stateFAIL, 678 joinVirama: stateBeforeVirama, 679 }, 680 stateBeforeVirama: { 681 joiningL: stateBefore, 682 joiningD: stateBefore, 683 joiningT: stateBefore, 684 }, 685 stateAfter: { 686 joiningL: stateFAIL, 687 joiningD: stateBefore, 688 joiningT: stateAfter, 689 joiningR: stateStart, 690 joinZWNJ: stateFAIL, 691 joinZWJ: stateFAIL, 692 joinVirama: 
stateAfter, // no-op as we can't accept joiners here 693 }, 694 stateFAIL: { 695 0: stateFAIL, 696 joiningL: stateFAIL, 697 joiningD: stateFAIL, 698 joiningT: stateFAIL, 699 joiningR: stateFAIL, 700 joinZWNJ: stateFAIL, 701 joinZWJ: stateFAIL, 702 joinVirama: stateFAIL, 703 }, 704} 705 706// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 707// already implicitly satisfied by the overall implementation. 708func (p *Profile) validateLabel(s string) (err error) { 709 if s == "" { 710 if p.verifyDNSLength { 711 return &labelError{s, "A4"} 712 } 713 return nil 714 } 715 if p.checkHyphens { 716 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 717 return &labelError{s, "V2"} 718 } 719 if s[0] == '-' || s[len(s)-1] == '-' { 720 return &labelError{s, "V3"} 721 } 722 } 723 if !p.checkJoiners { 724 return nil 725 } 726 trie := p.trie // p.checkJoiners is only set if trie is set. 727 // TODO: merge the use of this in the trie. 728 v, sz := trie.lookupString(s) 729 x := info(v) 730 if x.isModifier() { 731 return &labelError{s, "V5"} 732 } 733 // Quickly return in the absence of zero-width (non) joiners. 734 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 735 return nil 736 } 737 st := stateStart 738 for i := 0; ; { 739 jt := x.joinType() 740 if s[i:i+sz] == zwj { 741 jt = joinZWJ 742 } else if s[i:i+sz] == zwnj { 743 jt = joinZWNJ 744 } 745 st = joinStates[st][jt] 746 if x.isViramaModifier() { 747 st = joinStates[st][joinVirama] 748 } 749 if i += sz; i == len(s) { 750 break 751 } 752 v, sz = trie.lookupString(s[i:]) 753 x = info(v) 754 } 755 if st == stateFAIL || st == stateAfter { 756 return &labelError{s, "C"} 757 } 758 return nil 759} 760 761func ascii(s string) bool { 762 for i := 0; i < len(s); i++ { 763 if s[i] >= utf8.RuneSelf { 764 return false 765 } 766 } 767 return true 768} 769