// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package language
6
7import (
8	"flag"
9	"testing"
10)
11
// verbose is a boolean command-line flag named "verbose" that defaults to
// false. Per its usage text, setting it to true prints the internal tables of
// matchers; presumably it is read by matcher tests elsewhere in the package
// (TODO confirm).
var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers")
13
14func TestAddLikelySubtags(t *testing.T) {
15	tests := []struct{ in, out string }{
16		{"aa", "aa-Latn-ET"},
17		{"aa-Latn", "aa-Latn-ET"},
18		{"aa-Arab", "aa-Arab-ET"},
19		{"aa-Arab-ER", "aa-Arab-ER"},
20		{"kk", "kk-Cyrl-KZ"},
21		{"kk-CN", "kk-Arab-CN"},
22		{"cmn", "cmn"},
23		{"zh-AU", "zh-Hant-AU"},
24		{"zh-VN", "zh-Hant-VN"},
25		{"zh-SG", "zh-Hans-SG"},
26		{"zh-Hant", "zh-Hant-TW"},
27		{"zh-Hani", "zh-Hani-CN"},
28		{"und-Hani", "zh-Hani-CN"},
29		{"und", "en-Latn-US"},
30		{"und-GB", "en-Latn-GB"},
31		{"und-CW", "pap-Latn-CW"},
32		{"und-YT", "fr-Latn-YT"},
33		{"und-Arab", "ar-Arab-EG"},
34		{"und-AM", "hy-Armn-AM"},
35		{"und-TW", "zh-Hant-TW"},
36		{"und-002", "en-Latn-NG"},
37		{"und-Latn-002", "en-Latn-NG"},
38		{"en-Latn-002", "en-Latn-NG"},
39		{"en-002", "en-Latn-NG"},
40		{"en-001", "en-Latn-US"},
41		{"und-003", "en-Latn-US"},
42		{"und-GB", "en-Latn-GB"},
43		{"Latn-001", "en-Latn-US"},
44		{"en-001", "en-Latn-US"},
45		{"es-419", "es-Latn-419"},
46		{"he-145", "he-Hebr-IL"},
47		{"ky-145", "ky-Latn-TR"},
48		{"kk", "kk-Cyrl-KZ"},
49		// Don't specialize duplicate and ambiguous matches.
50		{"kk-034", "kk-Arab-034"}, // Matches IR and AF. Both are Arab.
51		{"ku-145", "ku-Latn-TR"},  // Matches IQ, TR, and LB, but kk -> TR.
52		{"und-Arab-CC", "ms-Arab-CC"},
53		{"und-Arab-GB", "ks-Arab-GB"},
54		{"und-Hans-CC", "zh-Hans-CC"},
55		{"und-CC", "en-Latn-CC"},
56		{"sr", "sr-Cyrl-RS"},
57		{"sr-151", "sr-Latn-151"}, // Matches RO and RU.
58		// We would like addLikelySubtags to generate the same results if the input
59		// only changes by adding tags that would otherwise have been added
60		// by the expansion.
61		// In other words:
62		//     und-AA -> xx-Scrp-AA   implies und-Scrp-AA -> xx-Scrp-AA
63		//     und-AA -> xx-Scrp-AA   implies xx-AA -> xx-Scrp-AA
64		//     und-Scrp -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
65		//     und-Scrp -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
66		//     xx -> xx-Scrp-AA       implies xx-Scrp -> xx-Scrp-AA
67		//     xx -> xx-Scrp-AA       implies xx-AA -> xx-Scrp-AA
68		//
69		// The algorithm specified in
70		//   https://unicode.org/reports/tr35/tr35-9.html#Supplemental_Data,
71		// Section C.10, does not handle the first case. For example,
72		// the CLDR data contains an entry und-BJ -> fr-Latn-BJ, but not
73		// there is no rule for und-Latn-BJ.  According to spec, und-Latn-BJ
74		// would expand to en-Latn-BJ, violating the aforementioned principle.
75		// We deviate from the spec by letting und-Scrp-AA expand to xx-Scrp-AA
76		// if a rule of the form und-AA -> xx-Scrp-AA is defined.
77		// Note that as of version 23, CLDR has some explicitly specified
78		// entries that do not conform to these rules. The implementation
79		// will not correct these explicit inconsistencies. A later versions of CLDR
80		// is supposed to fix this.
81		{"und-Latn-BJ", "fr-Latn-BJ"},
82		{"und-Bugi-ID", "bug-Bugi-ID"},
83		// regions, scripts and languages without definitions
84		{"und-Arab-AA", "ar-Arab-AA"},
85		{"und-Afak-RE", "fr-Afak-RE"},
86		{"und-Arab-GB", "ks-Arab-GB"},
87		{"abp-Arab-GB", "abp-Arab-GB"},
88		// script has preference over region
89		{"und-Arab-NL", "ar-Arab-NL"},
90		{"zza", "zza-Latn-TR"},
91		// preserve variants and extensions
92		{"de-1901", "de-Latn-DE-1901"},
93		{"de-x-abc", "de-Latn-DE-x-abc"},
94		{"de-1901-x-abc", "de-Latn-DE-1901-x-abc"},
95		{"x-abc", "x-abc"}, // TODO: is this the desired behavior?
96	}
97	for i, tt := range tests {
98		in, _ := Parse(tt.in)
99		out, _ := Parse(tt.out)
100		in, _ = in.addLikelySubtags()
101		if in.String() != out.String() {
102			t.Errorf("%d: add(%s) was %s; want %s", i, tt.in, in, tt.out)
103		}
104	}
105}
106func TestMinimize(t *testing.T) {
107	tests := []struct{ in, out string }{
108		{"aa", "aa"},
109		{"aa-Latn", "aa"},
110		{"aa-Latn-ET", "aa"},
111		{"aa-ET", "aa"},
112		{"aa-Arab", "aa-Arab"},
113		{"aa-Arab-ER", "aa-Arab-ER"},
114		{"aa-Arab-ET", "aa-Arab"},
115		{"und", "und"},
116		{"und-Latn", "und"},
117		{"und-Latn-US", "und"},
118		{"en-Latn-US", "en"},
119		{"cmn", "cmn"},
120		{"cmn-Hans", "cmn-Hans"},
121		{"cmn-Hant", "cmn-Hant"},
122		{"zh-AU", "zh-AU"},
123		{"zh-VN", "zh-VN"},
124		{"zh-SG", "zh-SG"},
125		{"zh-Hant", "zh-Hant"},
126		{"zh-Hant-TW", "zh-TW"},
127		{"zh-Hans", "zh"},
128		{"zh-Hani", "zh-Hani"},
129		{"und-Hans", "und-Hans"},
130		{"und-Hani", "und-Hani"},
131
132		{"und-CW", "und-CW"},
133		{"und-YT", "und-YT"},
134		{"und-Arab", "und-Arab"},
135		{"und-AM", "und-AM"},
136		{"und-Arab-CC", "und-Arab-CC"},
137		{"und-CC", "und-CC"},
138		{"und-Latn-BJ", "und-BJ"},
139		{"und-Bugi-ID", "und-Bugi"},
140		{"bug-Bugi-ID", "bug-Bugi"},
141		// regions, scripts and languages without definitions
142		{"und-Arab-AA", "und-Arab-AA"},
143		// preserve variants and extensions
144		{"de-Latn-1901", "de-1901"},
145		{"de-Latn-x-abc", "de-x-abc"},
146		{"de-DE-1901-x-abc", "de-1901-x-abc"},
147		{"x-abc", "x-abc"}, // TODO: is this the desired behavior?
148	}
149	for i, tt := range tests {
150		in, _ := Parse(tt.in)
151		out, _ := Parse(tt.out)
152		min, _ := in.minimize()
153		if min.String() != out.String() {
154			t.Errorf("%d: min(%s) was %s; want %s", i, tt.in, min, tt.out)
155		}
156		max, _ := min.addLikelySubtags()
157		if x, _ := in.addLikelySubtags(); x.String() != max.String() {
158			t.Errorf("%d: max(min(%s)) = %s; want %s", i, tt.in, max, x)
159		}
160	}
161}
162