1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package collate
6
7import (
8	"archive/zip"
9	"bufio"
10	"bytes"
11	"flag"
12	"io"
13	"io/ioutil"
14	"log"
15	"path"
16	"regexp"
17	"strconv"
18	"strings"
19	"testing"
20	"unicode/utf8"
21
22	"golang.org/x/text/collate/build"
23	"golang.org/x/text/internal/gen"
24	"golang.org/x/text/language"
25)
26
// long gates tests that may need to download their data: TestCollation is
// skipped unless this flag is set or gen.IsLocal reports true.
var long = flag.Bool("long", false,
	"run time-consuming tests, such as tests that fetch data online")
29
30// This regression test runs tests for the test files in CollationTest.zip
31// (taken from https://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
32//
33// The test files have the following form:
34// # header
35// 0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
36// 0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
37// 000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
38// 000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
39//
40// The part before the semicolon is the hex representation of a sequence
41// of runes. After the hash mark is a comment. The strings
42// represented by rune sequence are in the file in sorted order, as
43// defined by the DUCET.
44
// Test holds the entries read from a single collation test file.
// str and comment are parallel slices: comment[i] is the comment taken from
// the same line as str[i].
type Test struct {
	name    string   // base name of the file inside the zip archive
	str     [][]byte // UTF-8 encoded strings, in the order they appear in the file
	comment []string // text following "# " on the corresponding line
}
50
var (
	// versionRe captures the version string from a "# UCA Version: ..." header
	// line of a test file.
	versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)

	// testRe matches a data line of a test file: group 1 is the run of
	// upper-case hex digits and spaces before the semicolon, group 2 is the
	// text following "# ".
	testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
)
53
54func TestCollation(t *testing.T) {
55	if !gen.IsLocal() && !*long {
56		t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source")
57	}
58	t.Skip("must first update to new file format to support test")
59	for _, test := range loadTestData() {
60		doTest(t, test)
61	}
62}
63
// Error terminates the program through log.Fatal when e is non-nil and does
// nothing otherwise.
func Error(e error) {
	if e == nil {
		return
	}
	log.Fatal(e)
}
69
70// parseUCA parses a Default Unicode Collation Element Table of the format
71// specified in https://www.unicode.org/reports/tr10/#File_Format.
72// It returns the variable top.
73func parseUCA(builder *build.Builder) {
74	r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
75	defer r.Close()
76	input := bufio.NewReader(r)
77	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
78	for i := 1; true; i++ {
79		l, prefix, err := input.ReadLine()
80		if err == io.EOF {
81			break
82		}
83		Error(err)
84		line := string(l)
85		if prefix {
86			log.Fatalf("%d: buffer overflow", i)
87		}
88		if len(line) == 0 || line[0] == '#' {
89			continue
90		}
91		if line[0] == '@' {
92			if strings.HasPrefix(line[1:], "version ") {
93				if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
94					log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
95				}
96			}
97		} else {
98			// parse entries
99			part := strings.Split(line, " ; ")
100			if len(part) != 2 {
101				log.Fatalf("%d: production rule without ';': %v", i, line)
102			}
103			lhs := []rune{}
104			for _, v := range strings.Split(part[0], " ") {
105				if v != "" {
106					lhs = append(lhs, rune(convHex(i, v)))
107				}
108			}
109			vars := []int{}
110			rhs := [][]int{}
111			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
112				if m[1] == "*" {
113					vars = append(vars, i)
114				}
115				elem := []int{}
116				for _, h := range strings.Split(m[2], ".") {
117					elem = append(elem, convHex(i, h))
118				}
119				rhs = append(rhs, elem)
120			}
121			builder.Add(lhs, rhs, vars)
122		}
123	}
124}
125
// convHex parses s as a base-16 integer that fits in 32 bits and returns it.
// If s cannot be parsed, the program is terminated through log.Fatalf with
// line used as the prefix of the message.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v)
}
133
134func loadTestData() []Test {
135	f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
136	buffer, err := ioutil.ReadAll(f)
137	f.Close()
138	Error(err)
139	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
140	Error(err)
141	tests := []Test{}
142	for _, f := range archive.File {
143		// Skip the short versions, which are simply duplicates of the long versions.
144		if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
145			continue
146		}
147		ff, err := f.Open()
148		Error(err)
149		defer ff.Close()
150		scanner := bufio.NewScanner(ff)
151		test := Test{name: path.Base(f.Name)}
152		for scanner.Scan() {
153			line := scanner.Text()
154			if len(line) <= 1 || line[0] == '#' {
155				if m := versionRe.FindStringSubmatch(line); m != nil {
156					if m[1] != gen.UnicodeVersion() {
157						log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
158					}
159				}
160				continue
161			}
162			m := testRe.FindStringSubmatch(line)
163			if m == nil || len(m) < 3 {
164				log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
165			}
166			str := []byte{}
167			// In the regression test data (unpaired) surrogates are assigned a weight
168			// corresponding to their code point value.  However, utf8.DecodeRune,
169			// which is used to compute the implicit weight, assigns FFFD to surrogates.
170			// We therefore skip tests with surrogates.  This skips about 35 entries
171			// per test.
172			valid := true
173			for _, split := range strings.Split(m[1], " ") {
174				r, err := strconv.ParseUint(split, 16, 64)
175				Error(err)
176				valid = valid && utf8.ValidRune(rune(r))
177				str = append(str, string(rune(r))...)
178			}
179			if valid {
180				test.str = append(test.str, str)
181				test.comment = append(test.comment, m[2])
182			}
183		}
184		if scanner.Err() != nil {
185			log.Fatal(scanner.Err())
186		}
187		tests = append(tests, test)
188	}
189	return tests
190}
191
// errorCount is not referenced by any code visible in this file.
// NOTE(review): possibly vestigial — confirm against the rest of the package
// before removing.
var errorCount int
193
// runes decodes the UTF-8 bytes in b into a slice of code points. Each byte
// that is not part of a valid encoding yields U+FFFD.
func runes(b []byte) []rune {
	return bytes.Runes(b)
}
197
// shifted is the tag doTest passes to OptionsFromTag for every test file
// whose name does not contain "NON_IGNOR".
var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
199
200func doTest(t *testing.T, tc Test) {
201	bld := build.NewBuilder()
202	parseUCA(bld)
203	w, err := bld.Build()
204	Error(err)
205	var tag language.Tag
206	if !strings.Contains(tc.name, "NON_IGNOR") {
207		tag = shifted
208	}
209	c := NewFromTable(w, OptionsFromTag(tag))
210	b := &Buffer{}
211	prev := tc.str[0]
212	for i := 1; i < len(tc.str); i++ {
213		b.Reset()
214		s := tc.str[i]
215		ka := c.Key(b, prev)
216		kb := c.Key(b, s)
217		if r := bytes.Compare(ka, kb); r == 1 {
218			t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
219			prev = s
220			continue
221		}
222		if r := c.Compare(prev, s); r == 1 {
223			t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r)
224		}
225		if r := c.Compare(s, prev); r == -1 {
226			t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r)
227		}
228		prev = s
229	}
230}
231