1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build !go1.10 6//go:generate go run gen.go gen_trieval.go gen_common.go 7 8// Package idna implements IDNA2008 using the compatibility processing 9// defined by UTS (Unicode Technical Standard) #46, which defines a standard to 10// deal with the transition from IDNA2003. 11// 12// IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC 13// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894. 14// UTS #46 is defined in https://www.unicode.org/reports/tr46. 15// See https://unicode.org/cldr/utility/idna.jsp for a visualization of the 16// differences between these two standards. 17package idna // import "golang.org/x/text/internal/export/idna" 18 19import ( 20 "fmt" 21 "strings" 22 "unicode/utf8" 23 24 "golang.org/x/text/secure/bidirule" 25 "golang.org/x/text/unicode/norm" 26) 27 28// NOTE: Unlike common practice in Go APIs, the functions will return a 29// sanitized domain name in case of errors. Browsers sometimes use a partially 30// evaluated string as lookup. 31// TODO: the current error handling is, in my opinion, the least opinionated. 32// Other strategies are also viable, though: 33// Option 1) Return an empty string in case of error, but allow the user to 34// specify explicitly which errors to ignore. 35// Option 2) Return the partially evaluated string if it is itself a valid 36// string, otherwise return the empty string in case of error. 37// Option 3) Option 1 and 2. 38// Option 4) Always return an empty string for now and implement Option 1 as 39// needed, and document that the return string may not be empty in case of 40// error in the future. 41// I think Option 1 is best, but it is quite opinionated. 42 43// ToASCII is a wrapper for Punycode.ToASCII. 
44func ToASCII(s string) (string, error) { 45 return Punycode.process(s, true) 46} 47 48// ToUnicode is a wrapper for Punycode.ToUnicode. 49func ToUnicode(s string) (string, error) { 50 return Punycode.process(s, false) 51} 52 53// An Option configures a Profile at creation time. 54type Option func(*options) 55 56// Transitional sets a Profile to use the Transitional mapping as defined in UTS 57// #46. This will cause, for example, "ß" to be mapped to "ss". Using the 58// transitional mapping provides a compromise between IDNA2003 and IDNA2008 59// compatibility. It is used by most browsers when resolving domain names. This 60// option is only meaningful if combined with MapForLookup. 61func Transitional(transitional bool) Option { 62 return func(o *options) { o.transitional = true } 63} 64 65// VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 66// are longer than allowed by the RFC. 67// 68// This option corresponds to the VerifyDnsLength flag in UTS #46. 69func VerifyDNSLength(verify bool) Option { 70 return func(o *options) { o.verifyDNSLength = verify } 71} 72 73// RemoveLeadingDots removes leading label separators. Leading runes that map to 74// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 75func RemoveLeadingDots(remove bool) Option { 76 return func(o *options) { o.removeLeadingDots = remove } 77} 78 79// ValidateLabels sets whether to check the mandatory label validation criteria 80// as defined in Section 5.4 of RFC 5891. This includes testing for correct use 81// of hyphens ('-'), normalization, validity of runes, and the context rules. 82// In particular, ValidateLabels also sets the CheckHyphens and CheckJoiners flags 83// in UTS #46. 84func ValidateLabels(enable bool) Option { 85 return func(o *options) { 86 // Don't override existing mappings, but set one that at least checks 87 // normalization if it is not set. 
88 if o.mapping == nil && enable { 89 o.mapping = normalize 90 } 91 o.trie = trie 92 o.checkJoiners = enable 93 o.checkHyphens = enable 94 if enable { 95 o.fromPuny = validateFromPunycode 96 } else { 97 o.fromPuny = nil 98 } 99 } 100} 101 102// CheckHyphens sets whether to check for correct use of hyphens ('-') in 103// labels. Most web browsers do not have this option set, since labels such as 104// "r3---sn-apo3qvuoxuxbt-j5pe" are in common use. 105// 106// This option corresponds to the CheckHyphens flag in UTS #46. 107func CheckHyphens(enable bool) Option { 108 return func(o *options) { o.checkHyphens = enable } 109} 110 111// CheckJoiners sets whether to check the ContextJ rules as defined in Appendix 112// A of RFC 5892, concerning the use of joiner runes. 113// 114// This option corresponds to the CheckJoiners flag in UTS #46. 115func CheckJoiners(enable bool) Option { 116 return func(o *options) { 117 o.trie = trie 118 o.checkJoiners = enable 119 } 120} 121 122// StrictDomainName limits the set of permissable ASCII characters to those 123// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 124// hyphen). This is set by default for MapForLookup and ValidateForRegistration, 125// but is only useful if ValidateLabels is set. 126// 127// This option is useful, for instance, for browsers that allow characters 128// outside this range, for example a '_' (U+005F LOW LINE). See 129// http://www.rfc-editor.org/std/std3.txt for more details. 130// 131// This option corresponds to the UseSTD3ASCIIRules flag in UTS #46. 132func StrictDomainName(use bool) Option { 133 return func(o *options) { o.useSTD3Rules = use } 134} 135 136// NOTE: the following options pull in tables. The tables should not be linked 137// in as long as the options are not used. 138 139// BidiRule enables the Bidi rule as defined in RFC 5893. Any application 140// that relies on proper validation of labels should include this rule. 
141// 142// This option corresponds to the CheckBidi flag in UTS #46. 143func BidiRule() Option { 144 return func(o *options) { o.bidirule = bidirule.ValidString } 145} 146 147// ValidateForRegistration sets validation options to verify that a given IDN is 148// properly formatted for registration as defined by Section 4 of RFC 5891. 149func ValidateForRegistration() Option { 150 return func(o *options) { 151 o.mapping = validateRegistration 152 StrictDomainName(true)(o) 153 ValidateLabels(true)(o) 154 VerifyDNSLength(true)(o) 155 BidiRule()(o) 156 } 157} 158 159// MapForLookup sets validation and mapping options such that a given IDN is 160// transformed for domain name lookup according to the requirements set out in 161// Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 162// RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option 163// to add this check. 164// 165// The mappings include normalization and mapping case, width and other 166// compatibility mappings. 167func MapForLookup() Option { 168 return func(o *options) { 169 o.mapping = validateAndMap 170 StrictDomainName(true)(o) 171 ValidateLabels(true)(o) 172 RemoveLeadingDots(true)(o) 173 } 174} 175 176type options struct { 177 transitional bool 178 useSTD3Rules bool 179 checkHyphens bool 180 checkJoiners bool 181 verifyDNSLength bool 182 removeLeadingDots bool 183 184 trie *idnaTrie 185 186 // fromPuny calls validation rules when converting A-labels to U-labels. 187 fromPuny func(p *Profile, s string) error 188 189 // mapping implements a validation and mapping step as defined in RFC 5895 190 // or UTS 46, tailored to, for example, domain registration or lookup. 191 mapping func(p *Profile, s string) (string, error) 192 193 // bidirule, if specified, checks whether s conforms to the Bidi Rule 194 // defined in RFC 5893. 195 bidirule func(s string) bool 196} 197 198// A Profile defines the configuration of a IDNA mapper. 
199type Profile struct { 200 options 201} 202 203func apply(o *options, opts []Option) { 204 for _, f := range opts { 205 f(o) 206 } 207} 208 209// New creates a new Profile. 210// 211// With no options, the returned Profile is the most permissive and equals the 212// Punycode Profile. Options can be passed to further restrict the Profile. The 213// MapForLookup and ValidateForRegistration options set a collection of options, 214// for lookup and registration purposes respectively, which can be tailored by 215// adding more fine-grained options, where later options override earlier 216// options. 217func New(o ...Option) *Profile { 218 p := &Profile{} 219 apply(&p.options, o) 220 return p 221} 222 223// ToASCII converts a domain or domain label to its ASCII form. For example, 224// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 225// ToASCII("golang") is "golang". If an error is encountered it will return 226// an error and a (partially) processed result. 227func (p *Profile) ToASCII(s string) (string, error) { 228 return p.process(s, true) 229} 230 231// ToUnicode converts a domain or domain label to its Unicode form. For example, 232// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and 233// ToUnicode("golang") is "golang". If an error is encountered it will return 234// an error and a (partially) processed result. 235func (p *Profile) ToUnicode(s string) (string, error) { 236 pp := *p 237 pp.transitional = false 238 return pp.process(s, false) 239} 240 241// String reports a string with a description of the profile for debugging 242// purposes. The string format may change with different versions. 
243func (p *Profile) String() string { 244 s := "" 245 if p.transitional { 246 s = "Transitional" 247 } else { 248 s = "NonTransitional" 249 } 250 if p.useSTD3Rules { 251 s += ":UseSTD3Rules" 252 } 253 if p.checkHyphens { 254 s += ":CheckHyphens" 255 } 256 if p.checkJoiners { 257 s += ":CheckJoiners" 258 } 259 if p.verifyDNSLength { 260 s += ":VerifyDNSLength" 261 } 262 return s 263} 264 265var ( 266 // Punycode is a Profile that does raw punycode processing with a minimum 267 // of validation. 268 Punycode *Profile = punycode 269 270 // Lookup is the recommended profile for looking up domain names, according 271 // to Section 5 of RFC 5891. The exact configuration of this profile may 272 // change over time. 273 Lookup *Profile = lookup 274 275 // Display is the recommended profile for displaying domain names. 276 // The configuration of this profile may change over time. 277 Display *Profile = display 278 279 // Registration is the recommended profile for checking whether a given 280 // IDN is valid for registration, according to Section 4 of RFC 5891. 
281 Registration *Profile = registration 282 283 punycode = &Profile{} 284 lookup = &Profile{options{ 285 transitional: true, 286 removeLeadingDots: true, 287 useSTD3Rules: true, 288 checkHyphens: true, 289 checkJoiners: true, 290 trie: trie, 291 fromPuny: validateFromPunycode, 292 mapping: validateAndMap, 293 bidirule: bidirule.ValidString, 294 }} 295 display = &Profile{options{ 296 useSTD3Rules: true, 297 removeLeadingDots: true, 298 checkHyphens: true, 299 checkJoiners: true, 300 trie: trie, 301 fromPuny: validateFromPunycode, 302 mapping: validateAndMap, 303 bidirule: bidirule.ValidString, 304 }} 305 registration = &Profile{options{ 306 useSTD3Rules: true, 307 verifyDNSLength: true, 308 checkHyphens: true, 309 checkJoiners: true, 310 trie: trie, 311 fromPuny: validateFromPunycode, 312 mapping: validateRegistration, 313 bidirule: bidirule.ValidString, 314 }} 315 316 // TODO: profiles 317 // Register: recommended for approving domain names: don't do any mappings 318 // but rather reject on invalid input. Bundle or block deviation characters. 319) 320 321type labelError struct{ label, code_ string } 322 323func (e labelError) code() string { return e.code_ } 324func (e labelError) Error() string { 325 return fmt.Sprintf("idna: invalid label %q", e.label) 326} 327 328type runeError rune 329 330func (e runeError) code() string { return "P1" } 331func (e runeError) Error() string { 332 return fmt.Sprintf("idna: disallowed rune %U", e) 333} 334 335// process implements the algorithm described in section 4 of UTS #46, 336// see https://www.unicode.org/reports/tr46. 337func (p *Profile) process(s string, toASCII bool) (string, error) { 338 var err error 339 if p.mapping != nil { 340 s, err = p.mapping(p, s) 341 } 342 // Remove leading empty labels. 
343 if p.removeLeadingDots { 344 for ; len(s) > 0 && s[0] == '.'; s = s[1:] { 345 } 346 } 347 // It seems like we should only create this error on ToASCII, but the 348 // UTS 46 conformance tests suggests we should always check this. 349 if err == nil && p.verifyDNSLength && s == "" { 350 err = &labelError{s, "A4"} 351 } 352 labels := labelIter{orig: s} 353 for ; !labels.done(); labels.next() { 354 label := labels.label() 355 if label == "" { 356 // Empty labels are not okay. The label iterator skips the last 357 // label if it is empty. 358 if err == nil && p.verifyDNSLength { 359 err = &labelError{s, "A4"} 360 } 361 continue 362 } 363 if strings.HasPrefix(label, acePrefix) { 364 u, err2 := decode(label[len(acePrefix):]) 365 if err2 != nil { 366 if err == nil { 367 err = err2 368 } 369 // Spec says keep the old label. 370 continue 371 } 372 labels.set(u) 373 if err == nil && p.fromPuny != nil { 374 err = p.fromPuny(p, u) 375 } 376 if err == nil { 377 // This should be called on NonTransitional, according to the 378 // spec, but that currently does not have any effect. Use the 379 // original profile to preserve options. 380 err = p.validateLabel(u) 381 } 382 } else if err == nil { 383 err = p.validateLabel(label) 384 } 385 } 386 if toASCII { 387 for labels.reset(); !labels.done(); labels.next() { 388 label := labels.label() 389 if !ascii(label) { 390 a, err2 := encode(acePrefix, label) 391 if err == nil { 392 err = err2 393 } 394 label = a 395 labels.set(a) 396 } 397 n := len(label) 398 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { 399 err = &labelError{label, "A4"} 400 } 401 } 402 } 403 s = labels.result() 404 if toASCII && p.verifyDNSLength && err == nil { 405 // Compute the length of the domain name minus the root label and its dot. 406 n := len(s) 407 if n > 0 && s[n-1] == '.' 
{ 408 n-- 409 } 410 if len(s) < 1 || n > 253 { 411 err = &labelError{s, "A4"} 412 } 413 } 414 return s, err 415} 416 417func normalize(p *Profile, s string) (string, error) { 418 return norm.NFC.String(s), nil 419} 420 421func validateRegistration(p *Profile, s string) (string, error) { 422 if !norm.NFC.IsNormalString(s) { 423 return s, &labelError{s, "V1"} 424 } 425 for i := 0; i < len(s); { 426 v, sz := trie.lookupString(s[i:]) 427 // Copy bytes not copied so far. 428 switch p.simplify(info(v).category()) { 429 // TODO: handle the NV8 defined in the Unicode idna data set to allow 430 // for strict conformance to IDNA2008. 431 case valid, deviation: 432 case disallowed, mapped, unknown, ignored: 433 r, _ := utf8.DecodeRuneInString(s[i:]) 434 return s, runeError(r) 435 } 436 i += sz 437 } 438 return s, nil 439} 440 441func validateAndMap(p *Profile, s string) (string, error) { 442 var ( 443 err error 444 b []byte 445 k int 446 ) 447 for i := 0; i < len(s); { 448 v, sz := trie.lookupString(s[i:]) 449 start := i 450 i += sz 451 // Copy bytes not copied so far. 452 switch p.simplify(info(v).category()) { 453 case valid: 454 continue 455 case disallowed: 456 if err == nil { 457 r, _ := utf8.DecodeRuneInString(s[start:]) 458 err = runeError(r) 459 } 460 continue 461 case mapped, deviation: 462 b = append(b, s[k:start]...) 463 b = info(v).appendMapping(b, s[start:i]) 464 case ignored: 465 b = append(b, s[k:start]...) 466 // drop the rune 467 case unknown: 468 b = append(b, s[k:start]...) 469 b = append(b, "\ufffd"...) 470 } 471 k = i 472 } 473 if k == 0 { 474 // No changes so far. 475 s = norm.NFC.String(s) 476 } else { 477 b = append(b, s[k:]...) 478 if norm.NFC.QuickSpan(b) != len(b) { 479 b = norm.NFC.Bytes(b) 480 } 481 // TODO: the punycode converters require strings as input. 482 s = string(b) 483 } 484 return s, err 485} 486 487// A labelIter allows iterating over domain name labels. 
type labelIter struct {
	orig     string   // the domain name being iterated over
	slice    []string // per-label copy of orig; nil until set is first called
	curStart int      // byte offset in orig where the current label starts
	curEnd   int      // byte offset in orig just past the current label
	i        int      // index of the current label
}

// reset rewinds the iterator to the first label. Labels replaced through set
// are kept.
func (l *labelIter) reset() {
	l.curStart, l.curEnd, l.i = 0, 0, 0
}

// done reports whether the iteration has moved past the last label.
func (l *labelIter) done() bool {
	return len(l.orig) <= l.curStart
}

// result returns the domain name, with any labels replaced through set joined
// back together with dots.
func (l *labelIter) result() string {
	if l.slice == nil {
		return l.orig
	}
	return strings.Join(l.slice, ".")
}

// label returns the current label. While no label has been replaced it is cut
// directly out of orig, and curEnd is updated as a side effect.
func (l *labelIter) label() string {
	if l.slice != nil {
		return l.slice[l.i]
	}
	if dot := strings.IndexByte(l.orig[l.curStart:], '.'); dot >= 0 {
		l.curEnd = l.curStart + dot
	} else {
		l.curEnd = len(l.orig)
	}
	return l.orig[l.curStart:l.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (l *labelIter) next() {
	l.i++
	if l.slice == nil {
		// Step over the dot that ended the previous label.
		l.curStart = l.curEnd + 1
		if last := len(l.orig) - 1; l.curStart == last && l.orig[last] == '.' {
			l.curStart = len(l.orig)
		}
		return
	}
	// In slice mode the end of the iteration is signalled through curStart,
	// because that is what done inspects.
	n := len(l.slice)
	if l.i >= n || (l.i == n-1 && l.slice[l.i] == "") {
		l.curStart = len(l.orig)
	}
}

// set replaces the current label with s. The first call splits orig into
// labels, after which the iterator works on that slice.
func (l *labelIter) set(s string) {
	if l.slice == nil {
		l.slice = strings.Split(l.orig, ".")
	}
	l.slice[l.i] = s
}

// acePrefix is the ASCII Compatible Encoding prefix.
548const acePrefix = "xn--" 549 550func (p *Profile) simplify(cat category) category { 551 switch cat { 552 case disallowedSTD3Mapped: 553 if p.useSTD3Rules { 554 cat = disallowed 555 } else { 556 cat = mapped 557 } 558 case disallowedSTD3Valid: 559 if p.useSTD3Rules { 560 cat = disallowed 561 } else { 562 cat = valid 563 } 564 case deviation: 565 if !p.transitional { 566 cat = valid 567 } 568 case validNV8, validXV8: 569 // TODO: handle V2008 570 cat = valid 571 } 572 return cat 573} 574 575func validateFromPunycode(p *Profile, s string) error { 576 if !norm.NFC.IsNormalString(s) { 577 return &labelError{s, "V1"} 578 } 579 for i := 0; i < len(s); { 580 v, sz := trie.lookupString(s[i:]) 581 if c := p.simplify(info(v).category()); c != valid && c != deviation { 582 return &labelError{s, "V6"} 583 } 584 i += sz 585 } 586 return nil 587} 588 589const ( 590 zwnj = "\u200c" 591 zwj = "\u200d" 592) 593 594type joinState int8 595 596const ( 597 stateStart joinState = iota 598 stateVirama 599 stateBefore 600 stateBeforeVirama 601 stateAfter 602 stateFAIL 603) 604 605var joinStates = [][numJoinTypes]joinState{ 606 stateStart: { 607 joiningL: stateBefore, 608 joiningD: stateBefore, 609 joinZWNJ: stateFAIL, 610 joinZWJ: stateFAIL, 611 joinVirama: stateVirama, 612 }, 613 stateVirama: { 614 joiningL: stateBefore, 615 joiningD: stateBefore, 616 }, 617 stateBefore: { 618 joiningL: stateBefore, 619 joiningD: stateBefore, 620 joiningT: stateBefore, 621 joinZWNJ: stateAfter, 622 joinZWJ: stateFAIL, 623 joinVirama: stateBeforeVirama, 624 }, 625 stateBeforeVirama: { 626 joiningL: stateBefore, 627 joiningD: stateBefore, 628 joiningT: stateBefore, 629 }, 630 stateAfter: { 631 joiningL: stateFAIL, 632 joiningD: stateBefore, 633 joiningT: stateAfter, 634 joiningR: stateStart, 635 joinZWNJ: stateFAIL, 636 joinZWJ: stateFAIL, 637 joinVirama: stateAfter, // no-op as we can't accept joiners here 638 }, 639 stateFAIL: { 640 0: stateFAIL, 641 joiningL: stateFAIL, 642 joiningD: stateFAIL, 643 
joiningT: stateFAIL, 644 joiningR: stateFAIL, 645 joinZWNJ: stateFAIL, 646 joinZWJ: stateFAIL, 647 joinVirama: stateFAIL, 648 }, 649} 650 651// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 652// already implicitly satisfied by the overall implementation. 653func (p *Profile) validateLabel(s string) error { 654 if s == "" { 655 if p.verifyDNSLength { 656 return &labelError{s, "A4"} 657 } 658 return nil 659 } 660 if p.bidirule != nil && !p.bidirule(s) { 661 return &labelError{s, "B"} 662 } 663 if p.checkHyphens { 664 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 665 return &labelError{s, "V2"} 666 } 667 if s[0] == '-' || s[len(s)-1] == '-' { 668 return &labelError{s, "V3"} 669 } 670 } 671 if !p.checkJoiners { 672 return nil 673 } 674 trie := p.trie // p.checkJoiners is only set if trie is set. 675 // TODO: merge the use of this in the trie. 676 v, sz := trie.lookupString(s) 677 x := info(v) 678 if x.isModifier() { 679 return &labelError{s, "V5"} 680 } 681 // Quickly return in the absence of zero-width (non) joiners. 682 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 683 return nil 684 } 685 st := stateStart 686 for i := 0; ; { 687 jt := x.joinType() 688 if s[i:i+sz] == zwj { 689 jt = joinZWJ 690 } else if s[i:i+sz] == zwnj { 691 jt = joinZWNJ 692 } 693 st = joinStates[st][jt] 694 if x.isViramaModifier() { 695 st = joinStates[st][joinVirama] 696 } 697 if i += sz; i == len(s) { 698 break 699 } 700 v, sz = trie.lookupString(s[i:]) 701 x = info(v) 702 } 703 if st == stateFAIL || st == stateAfter { 704 return &labelError{s, "C"} 705 } 706 return nil 707} 708 709func ascii(s string) bool { 710 for i := 0; i < len(s); i++ { 711 if s[i] >= utf8.RuneSelf { 712 return false 713 } 714 } 715 return true 716} 717