1// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
2
3// Copyright 2016 The Go Authors. All rights reserved.
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file.
6
7// Package idna implements IDNA2008 using the compatibility processing
8// defined by UTS (Unicode Technical Standard) #46, which defines a standard to
9// deal with the transition from IDNA2003.
10//
// IDNA2008 (Internationalized Domain Names for Applications) is defined in RFC
// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894.
13// UTS #46 is defined in http://www.unicode.org/reports/tr46.
14// See http://unicode.org/cldr/utility/idna.jsp for a visualization of the
15// differences between these two standards.
16package idna // import "golang.org/x/net/idna"
17
18import (
19	"fmt"
20	"strings"
21	"unicode/utf8"
22
23	"golang.org/x/text/secure/bidirule"
24	"golang.org/x/text/unicode/bidi"
25	"golang.org/x/text/unicode/norm"
26)
27
28// NOTE: Unlike common practice in Go APIs, the functions will return a
29// sanitized domain name in case of errors. Browsers sometimes use a partially
30// evaluated string as lookup.
31// TODO: the current error handling is, in my opinion, the least opinionated.
32// Other strategies are also viable, though:
33// Option 1) Return an empty string in case of error, but allow the user to
34//    specify explicitly which errors to ignore.
35// Option 2) Return the partially evaluated string if it is itself a valid
36//    string, otherwise return the empty string in case of error.
37// Option 3) Option 1 and 2.
38// Option 4) Always return an empty string for now and implement Option 1 as
39//    needed, and document that the return string may not be empty in case of
40//    error in the future.
41// I think Option 1 is best, but it is quite opinionated.
42
43// ToASCII is a wrapper for Punycode.ToASCII.
44func ToASCII(s string) (string, error) {
45	return Punycode.process(s, true)
46}
47
48// ToUnicode is a wrapper for Punycode.ToUnicode.
49func ToUnicode(s string) (string, error) {
50	return Punycode.process(s, false)
51}
52
// An Option configures a Profile at creation time. Options are applied in the
// order in which they are passed to New, each one mutating the options value
// embedded in the Profile under construction.
type Option func(*options)
55
56// Transitional sets a Profile to use the Transitional mapping as defined in UTS
57// #46. This will cause, for example, "ß" to be mapped to "ss". Using the
58// transitional mapping provides a compromise between IDNA2003 and IDNA2008
59// compatibility. It is used by most browsers when resolving domain names. This
60// option is only meaningful if combined with MapForLookup.
61func Transitional(transitional bool) Option {
62	return func(o *options) { o.transitional = true }
63}
64
65// VerifyDNSLength sets whether a Profile should fail if any of the IDN parts
66// are longer than allowed by the RFC.
67func VerifyDNSLength(verify bool) Option {
68	return func(o *options) { o.verifyDNSLength = verify }
69}
70
71// RemoveLeadingDots removes leading label separators. Leading runes that map to
72// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well.
73//
74// This is the behavior suggested by the UTS #46 and is adopted by some
75// browsers.
76func RemoveLeadingDots(remove bool) Option {
77	return func(o *options) { o.removeLeadingDots = remove }
78}
79
80// ValidateLabels sets whether to check the mandatory label validation criteria
81// as defined in Section 5.4 of RFC 5891. This includes testing for correct use
82// of hyphens ('-'), normalization, validity of runes, and the context rules.
83func ValidateLabels(enable bool) Option {
84	return func(o *options) {
85		// Don't override existing mappings, but set one that at least checks
86		// normalization if it is not set.
87		if o.mapping == nil && enable {
88			o.mapping = normalize
89		}
90		o.trie = trie
91		o.validateLabels = enable
92		o.fromPuny = validateFromPunycode
93	}
94}
95
96// StrictDomainName limits the set of permissible ASCII characters to those
97// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the
98// hyphen). This is set by default for MapForLookup and ValidateForRegistration.
99//
100// This option is useful, for instance, for browsers that allow characters
101// outside this range, for example a '_' (U+005F LOW LINE). See
102// http://www.rfc-editor.org/std/std3.txt for more details This option
103// corresponds to the UseSTD3ASCIIRules option in UTS #46.
104func StrictDomainName(use bool) Option {
105	return func(o *options) {
106		o.trie = trie
107		o.useSTD3Rules = use
108		o.fromPuny = validateFromPunycode
109	}
110}
111
112// NOTE: the following options pull in tables. The tables should not be linked
113// in as long as the options are not used.
114
115// BidiRule enables the Bidi rule as defined in RFC 5893. Any application
116// that relies on proper validation of labels should include this rule.
117func BidiRule() Option {
118	return func(o *options) { o.bidirule = bidirule.ValidString }
119}
120
121// ValidateForRegistration sets validation options to verify that a given IDN is
122// properly formatted for registration as defined by Section 4 of RFC 5891.
123func ValidateForRegistration() Option {
124	return func(o *options) {
125		o.mapping = validateRegistration
126		StrictDomainName(true)(o)
127		ValidateLabels(true)(o)
128		VerifyDNSLength(true)(o)
129		BidiRule()(o)
130	}
131}
132
133// MapForLookup sets validation and mapping options such that a given IDN is
134// transformed for domain name lookup according to the requirements set out in
135// Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894,
136// RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option
137// to add this check.
138//
139// The mappings include normalization and mapping case, width and other
140// compatibility mappings.
141func MapForLookup() Option {
142	return func(o *options) {
143		o.mapping = validateAndMap
144		StrictDomainName(true)(o)
145		ValidateLabels(true)(o)
146	}
147}
148
// options holds the settings of a Profile. The zero value performs raw
// punycode conversion: no mapping, no label validation and no length checks.
type options struct {
	// transitional selects the UTS #46 transitional handling of deviation
	// characters; when false, deviation characters are treated as valid.
	transitional      bool
	// useSTD3Rules makes the disallowedSTD3* categories count as disallowed.
	useSTD3Rules      bool
	// validateLabels enables the per-label checks in validateLabel and the
	// fromPuny check on decoded A-labels.
	validateLabels    bool
	// verifyDNSLength enables the checks for empty labels, labels longer than
	// 63 bytes and names longer than 253 bytes.
	verifyDNSLength   bool
	// removeLeadingDots strips leading '.' bytes after the mapping step.
	removeLeadingDots bool

	// trie is the lookup table used by validateLabel. It is set by the options
	// that enable validation.
	trie *idnaTrie

	// fromPuny calls validation rules when converting A-labels to U-labels.
	fromPuny func(p *Profile, s string) error

	// mapping implements a validation and mapping step as defined in RFC 5895
	// or UTS 46, tailored to, for example, domain registration or lookup.
	mapping func(p *Profile, s string) (mapped string, isBidi bool, err error)

	// bidirule, if specified, checks whether s conforms to the Bidi Rule
	// defined in RFC 5893.
	bidirule func(s string) bool
}
169
// A Profile defines the configuration of an IDNA mapper. It embeds the options
// that were selected when it was created; use New or one of the predefined
// profiles (Punycode, Lookup, Display, Registration) to obtain one.
type Profile struct {
	options
}
174
175func apply(o *options, opts []Option) {
176	for _, f := range opts {
177		f(o)
178	}
179}
180
181// New creates a new Profile.
182//
183// With no options, the returned Profile is the most permissive and equals the
184// Punycode Profile. Options can be passed to further restrict the Profile. The
185// MapForLookup and ValidateForRegistration options set a collection of options,
186// for lookup and registration purposes respectively, which can be tailored by
187// adding more fine-grained options, where later options override earlier
188// options.
189func New(o ...Option) *Profile {
190	p := &Profile{}
191	apply(&p.options, o)
192	return p
193}
194
195// ToASCII converts a domain or domain label to its ASCII form. For example,
196// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
197// ToASCII("golang") is "golang". If an error is encountered it will return
198// an error and a (partially) processed result.
199func (p *Profile) ToASCII(s string) (string, error) {
200	return p.process(s, true)
201}
202
203// ToUnicode converts a domain or domain label to its Unicode form. For example,
204// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
205// ToUnicode("golang") is "golang". If an error is encountered it will return
206// an error and a (partially) processed result.
207func (p *Profile) ToUnicode(s string) (string, error) {
208	pp := *p
209	pp.transitional = false
210	return pp.process(s, false)
211}
212
213// String reports a string with a description of the profile for debugging
214// purposes. The string format may change with different versions.
215func (p *Profile) String() string {
216	s := ""
217	if p.transitional {
218		s = "Transitional"
219	} else {
220		s = "NonTransitional"
221	}
222	if p.useSTD3Rules {
223		s += ":UseSTD3Rules"
224	}
225	if p.validateLabels {
226		s += ":ValidateLabels"
227	}
228	if p.verifyDNSLength {
229		s += ":VerifyDNSLength"
230	}
231	return s
232}
233
// Predefined profiles. The exported variables point at the unexported
// profile values declared further down in this block.
var (
	// Punycode is a Profile that does raw punycode processing with a minimum
	// of validation.
	Punycode *Profile = punycode

	// Lookup is the recommended profile for looking up domain names, according
	// to Section 5 of RFC 5891. The exact configuration of this profile may
	// change over time.
	Lookup *Profile = lookup

	// Display is the recommended profile for displaying domain names.
	// The configuration of this profile may change over time.
	Display *Profile = display

	// Registration is the recommended profile for checking whether a given
	// IDN is valid for registration, according to Section 4 of RFC 5891.
	Registration *Profile = registration

	// punycode has no options set: no mapping, validation or length checks.
	punycode = &Profile{}
	// lookup maps and validates with STD3 rules, the Bidi rule and the
	// transitional mapping enabled.
	lookup   = &Profile{options{
		transitional:   true,
		useSTD3Rules:   true,
		validateLabels: true,
		trie:           trie,
		fromPuny:       validateFromPunycode,
		mapping:        validateAndMap,
		bidirule:       bidirule.ValidString,
	}}
	// display is configured like lookup, but without the transitional mapping.
	display = &Profile{options{
		useSTD3Rules:   true,
		validateLabels: true,
		trie:           trie,
		fromPuny:       validateFromPunycode,
		mapping:        validateAndMap,
		bidirule:       bidirule.ValidString,
	}}
	// registration validates without mapping (validateRegistration) and
	// additionally verifies the DNS length limits.
	registration = &Profile{options{
		useSTD3Rules:    true,
		validateLabels:  true,
		verifyDNSLength: true,
		trie:            trie,
		fromPuny:        validateFromPunycode,
		mapping:         validateRegistration,
		bidirule:        bidirule.ValidString,
	}}

	// TODO: profiles
	// Register: recommended for approving domain names: don't do any mappings
	// but rather reject on invalid input. Bundle or block deviation characters.
)
284
// labelError reports a label (or whole name) that failed validation, together
// with a short code such as "A4" or "V1" identifying the failed check.
type labelError struct {
	label string
	code_ string
}

// code returns the short identifier of the check that failed.
func (e labelError) code() string {
	return e.code_
}

// Error implements the error interface. The message quotes the offending
// label but does not include the code.
func (e labelError) Error() string {
	msg := fmt.Sprintf("idna: invalid label %q", e.label)
	return msg
}
291
// runeError reports a rune that is not permitted in the input.
type runeError rune

// code returns the fixed identifier "P1" used for disallowed runes.
func (e runeError) code() string {
	return "P1"
}

// Error implements the error interface, printing the rune in U+XXXX form.
func (e runeError) Error() string {
	return fmt.Sprintf("idna: disallowed rune %U", rune(e))
}
298
// process implements the algorithm described in section 4 of UTS #46,
// see http://www.unicode.org/reports/tr46.
//
// It converts s to its ASCII form when toASCII is true and to its Unicode form
// otherwise. Processing does not stop at the first problem: only the first
// error encountered is retained, and the (partially) processed string is
// returned together with it.
func (p *Profile) process(s string, toASCII bool) (string, error) {
	var err error
	var isBidi bool
	// Step 1: apply the profile's mapping/validation function, if any.
	if p.mapping != nil {
		s, isBidi, err = p.mapping(p, s)
	}
	// Remove leading empty labels.
	if p.removeLeadingDots {
		for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
		}
	}
	// TODO: allow for a quick check of the tables data.
	// It seems like we should only create this error on ToASCII, but the
	// UTS 46 conformance tests suggests we should always check this.
	if err == nil && p.verifyDNSLength && s == "" {
		err = &labelError{s, "A4"}
	}
	// Step 2: walk the labels, decoding A-labels ("xn--" prefix) and
	// validating each label.
	labels := labelIter{orig: s}
	for ; !labels.done(); labels.next() {
		label := labels.label()
		if label == "" {
			// Empty labels are not okay. The label iterator skips the last
			// label if it is empty.
			if err == nil && p.verifyDNSLength {
				err = &labelError{s, "A4"}
			}
			continue
		}
		if strings.HasPrefix(label, acePrefix) {
			u, err2 := decode(label[len(acePrefix):])
			if err2 != nil {
				if err == nil {
					err = err2
				}
				// Spec says keep the old label.
				continue
			}
			// A decoded label that is not purely left-to-right makes the
			// whole name subject to the Bidi rule check below.
			isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight
			labels.set(u)
			if err == nil && p.validateLabels {
				err = p.fromPuny(p, u)
			}
			if err == nil {
				// This should be called on NonTransitional, according to the
				// spec, but that currently does not have any effect. Use the
				// original profile to preserve options.
				err = p.validateLabel(u)
			}
		} else if err == nil {
			err = p.validateLabel(label)
		}
	}
	// Step 3: apply the Bidi rule to every label, but only if the name
	// contains right-to-left content and no earlier error was recorded.
	if isBidi && p.bidirule != nil && err == nil {
		for labels.reset(); !labels.done(); labels.next() {
			if !p.bidirule(labels.label()) {
				err = &labelError{s, "B"}
				break
			}
		}
	}
	// Step 4: for ToASCII, punycode-encode every non-ASCII label and check
	// the per-label length limit.
	if toASCII {
		for labels.reset(); !labels.done(); labels.next() {
			label := labels.label()
			if !ascii(label) {
				a, err2 := encode(acePrefix, label)
				if err == nil {
					err = err2
				}
				label = a
				labels.set(a)
			}
			n := len(label)
			if p.verifyDNSLength && err == nil && (n == 0 || n > 63) {
				err = &labelError{label, "A4"}
			}
		}
	}
	s = labels.result()
	if toASCII && p.verifyDNSLength && err == nil {
		// Compute the length of the domain name minus the root label and its dot.
		n := len(s)
		if n > 0 && s[n-1] == '.' {
			n--
		}
		if len(s) < 1 || n > 253 {
			err = &labelError{s, "A4"}
		}
	}
	return s, err
}
391
392func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) {
393	// TODO: consider first doing a quick check to see if any of these checks
394	// need to be done. This will make it slower in the general case, but
395	// faster in the common case.
396	mapped = norm.NFC.String(s)
397	isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft
398	return mapped, isBidi, nil
399}
400
401func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) {
402	// TODO: filter need for normalization in loop below.
403	if !norm.NFC.IsNormalString(s) {
404		return s, false, &labelError{s, "V1"}
405	}
406	for i := 0; i < len(s); {
407		v, sz := trie.lookupString(s[i:])
408		if sz == 0 {
409			return s, bidi, runeError(utf8.RuneError)
410		}
411		bidi = bidi || info(v).isBidi(s[i:])
412		// Copy bytes not copied so far.
413		switch p.simplify(info(v).category()) {
414		// TODO: handle the NV8 defined in the Unicode idna data set to allow
415		// for strict conformance to IDNA2008.
416		case valid, deviation:
417		case disallowed, mapped, unknown, ignored:
418			r, _ := utf8.DecodeRuneInString(s[i:])
419			return s, bidi, runeError(r)
420		}
421		i += sz
422	}
423	return s, bidi, nil
424}
425
426func (c info) isBidi(s string) bool {
427	if !c.isMapped() {
428		return c&attributesMask == rtl
429	}
430	// TODO: also store bidi info for mapped data. This is possible, but a bit
431	// cumbersome and not for the common case.
432	p, _ := bidi.LookupString(s)
433	switch p.Class() {
434	case bidi.R, bidi.AL, bidi.AN:
435		return true
436	}
437	return false
438}
439
440func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) {
441	var (
442		b []byte
443		k int
444	)
445	// combinedInfoBits contains the or-ed bits of all runes. We use this
446	// to derive the mayNeedNorm bit later. This may trigger normalization
447	// overeagerly, but it will not do so in the common case. The end result
448	// is another 10% saving on BenchmarkProfile for the common case.
449	var combinedInfoBits info
450	for i := 0; i < len(s); {
451		v, sz := trie.lookupString(s[i:])
452		if sz == 0 {
453			b = append(b, s[k:i]...)
454			b = append(b, "\ufffd"...)
455			k = len(s)
456			if err == nil {
457				err = runeError(utf8.RuneError)
458			}
459			break
460		}
461		combinedInfoBits |= info(v)
462		bidi = bidi || info(v).isBidi(s[i:])
463		start := i
464		i += sz
465		// Copy bytes not copied so far.
466		switch p.simplify(info(v).category()) {
467		case valid:
468			continue
469		case disallowed:
470			if err == nil {
471				r, _ := utf8.DecodeRuneInString(s[start:])
472				err = runeError(r)
473			}
474			continue
475		case mapped, deviation:
476			b = append(b, s[k:start]...)
477			b = info(v).appendMapping(b, s[start:i])
478		case ignored:
479			b = append(b, s[k:start]...)
480			// drop the rune
481		case unknown:
482			b = append(b, s[k:start]...)
483			b = append(b, "\ufffd"...)
484		}
485		k = i
486	}
487	if k == 0 {
488		// No changes so far.
489		if combinedInfoBits&mayNeedNorm != 0 {
490			s = norm.NFC.String(s)
491		}
492	} else {
493		b = append(b, s[k:]...)
494		if norm.NFC.QuickSpan(b) != len(b) {
495			b = norm.NFC.Bytes(b)
496		}
497		// TODO: the punycode converters require strings as input.
498		s = string(b)
499	}
500	return s, bidi, err
501}
502
503// A labelIter allows iterating over domain name labels.
504type labelIter struct {
505	orig     string
506	slice    []string
507	curStart int
508	curEnd   int
509	i        int
510}
511
512func (l *labelIter) reset() {
513	l.curStart = 0
514	l.curEnd = 0
515	l.i = 0
516}
517
518func (l *labelIter) done() bool {
519	return l.curStart >= len(l.orig)
520}
521
522func (l *labelIter) result() string {
523	if l.slice != nil {
524		return strings.Join(l.slice, ".")
525	}
526	return l.orig
527}
528
529func (l *labelIter) label() string {
530	if l.slice != nil {
531		return l.slice[l.i]
532	}
533	p := strings.IndexByte(l.orig[l.curStart:], '.')
534	l.curEnd = l.curStart + p
535	if p == -1 {
536		l.curEnd = len(l.orig)
537	}
538	return l.orig[l.curStart:l.curEnd]
539}
540
541// next sets the value to the next label. It skips the last label if it is empty.
542func (l *labelIter) next() {
543	l.i++
544	if l.slice != nil {
545		if l.i >= len(l.slice) || l.i == len(l.slice)-1 && l.slice[l.i] == "" {
546			l.curStart = len(l.orig)
547		}
548	} else {
549		l.curStart = l.curEnd + 1
550		if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' {
551			l.curStart = len(l.orig)
552		}
553	}
554}
555
556func (l *labelIter) set(s string) {
557	if l.slice == nil {
558		l.slice = strings.Split(l.orig, ".")
559	}
560	l.slice[l.i] = s
561}
562
// acePrefix is the ASCII Compatible Encoding prefix. Labels starting with it
// are treated as punycode-encoded A-labels, and it is prepended to labels that
// are encoded for the ASCII form.
const acePrefix = "xn--"
565
566func (p *Profile) simplify(cat category) category {
567	switch cat {
568	case disallowedSTD3Mapped:
569		if p.useSTD3Rules {
570			cat = disallowed
571		} else {
572			cat = mapped
573		}
574	case disallowedSTD3Valid:
575		if p.useSTD3Rules {
576			cat = disallowed
577		} else {
578			cat = valid
579		}
580	case deviation:
581		if !p.transitional {
582			cat = valid
583		}
584	case validNV8, validXV8:
585		// TODO: handle V2008
586		cat = valid
587	}
588	return cat
589}
590
591func validateFromPunycode(p *Profile, s string) error {
592	if !norm.NFC.IsNormalString(s) {
593		return &labelError{s, "V1"}
594	}
595	// TODO: detect whether string may have to be normalized in the following
596	// loop.
597	for i := 0; i < len(s); {
598		v, sz := trie.lookupString(s[i:])
599		if sz == 0 {
600			return runeError(utf8.RuneError)
601		}
602		if c := p.simplify(info(v).category()); c != valid && c != deviation {
603			return &labelError{s, "V6"}
604		}
605		i += sz
606	}
607	return nil
608}
609
// UTF-8 encodings of the two joiner characters that validateLabel checks the
// context of.
const (
	zwnj = "\u200c" // U+200C ZERO WIDTH NON-JOINER
	zwj  = "\u200d" // U+200D ZERO WIDTH JOINER
)
614
// joinState is a state of the state machine that validateLabel runs over the
// runes of a label that contains a zero-width joiner or non-joiner.
type joinState int8

// The states double as the first index of joinStates, so their order must
// match the layout of that table. stateStart is the zero value.
const (
	stateStart joinState = iota
	stateVirama
	stateBefore
	stateBeforeVirama
	stateAfter
	stateFAIL
)
625
// joinStates is the transition table of the joiner state machine used by
// validateLabel: joinStates[state][joinType] is the state entered when a rune
// with the given join type is seen in the given state. Transitions that are
// not listed default to the zero value, stateStart. A label is rejected if the
// machine finishes in stateFAIL or stateAfter.
var joinStates = [][numJoinTypes]joinState{
	stateStart: {
		joiningL:   stateBefore,
		joiningD:   stateBefore,
		joinZWNJ:   stateFAIL,
		joinZWJ:    stateFAIL,
		joinVirama: stateVirama,
	},
	stateVirama: {
		joiningL: stateBefore,
		joiningD: stateBefore,
	},
	stateBefore: {
		joiningL:   stateBefore,
		joiningD:   stateBefore,
		joiningT:   stateBefore,
		joinZWNJ:   stateAfter,
		joinZWJ:    stateFAIL,
		joinVirama: stateBeforeVirama,
	},
	stateBeforeVirama: {
		joiningL: stateBefore,
		joiningD: stateBefore,
		joiningT: stateBefore,
	},
	stateAfter: {
		joiningL:   stateFAIL,
		joiningD:   stateBefore,
		joiningT:   stateAfter,
		joiningR:   stateStart,
		joinZWNJ:   stateFAIL,
		joinZWJ:    stateFAIL,
		joinVirama: stateAfter, // no-op as we can't accept joiners here
	},
	// stateFAIL is absorbing: every input keeps the machine in stateFAIL.
	stateFAIL: {
		0:          stateFAIL,
		joiningL:   stateFAIL,
		joiningD:   stateFAIL,
		joiningT:   stateFAIL,
		joiningR:   stateFAIL,
		joinZWNJ:   stateFAIL,
		joinZWJ:    stateFAIL,
		joinVirama: stateFAIL,
	},
}
671
672// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are
673// already implicitly satisfied by the overall implementation.
674func (p *Profile) validateLabel(s string) (err error) {
675	if s == "" {
676		if p.verifyDNSLength {
677			return &labelError{s, "A4"}
678		}
679		return nil
680	}
681	if !p.validateLabels {
682		return nil
683	}
684	trie := p.trie // p.validateLabels is only set if trie is set.
685	if len(s) > 4 && s[2] == '-' && s[3] == '-' {
686		return &labelError{s, "V2"}
687	}
688	if s[0] == '-' || s[len(s)-1] == '-' {
689		return &labelError{s, "V3"}
690	}
691	// TODO: merge the use of this in the trie.
692	v, sz := trie.lookupString(s)
693	x := info(v)
694	if x.isModifier() {
695		return &labelError{s, "V5"}
696	}
697	// Quickly return in the absence of zero-width (non) joiners.
698	if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 {
699		return nil
700	}
701	st := stateStart
702	for i := 0; ; {
703		jt := x.joinType()
704		if s[i:i+sz] == zwj {
705			jt = joinZWJ
706		} else if s[i:i+sz] == zwnj {
707			jt = joinZWNJ
708		}
709		st = joinStates[st][jt]
710		if x.isViramaModifier() {
711			st = joinStates[st][joinVirama]
712		}
713		if i += sz; i == len(s) {
714			break
715		}
716		v, sz = trie.lookupString(s[i:])
717		x = info(v)
718	}
719	if st == stateFAIL || st == stateAfter {
720		return &labelError{s, "C"}
721	}
722	return nil
723}
724
// ascii reports whether s consists solely of ASCII bytes, that is, bytes
// below utf8.RuneSelf. The empty string is ASCII.
func ascii(s string) bool {
	for _, b := range []byte(s) {
		if b >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
733