// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package collate

import (
	"archive/zip"
	"bufio"
	"bytes"
	"flag"
	"io"
	"io/ioutil"
	"log"
	"path"
	"regexp"
	"strconv"
	"strings"
	"testing"
	"unicode/utf8"

	"golang.org/x/text/collate/build"
	"golang.org/x/text/internal/gen"
	"golang.org/x/text/language"
)

var long = flag.Bool("long", false,
	"run time-consuming tests, such as tests that fetch data online")

// This regression test runs tests for the test files in CollationTest.zip
// (taken from https://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
//
// The test files have the following form:
//	# header
//	0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
//	0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
//	000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
//	000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
//
// The part before the semicolon is the hex representation of a sequence
// of runes. The text after the hash mark is a comment. The strings
// represented by the rune sequences appear in the file in sorted order,
// as defined by the DUCET (Default Unicode Collation Element Table).

44 45type Test struct { 46 name string 47 str [][]byte 48 comment []string 49} 50 51var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`) 52var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`) 53 54func TestCollation(t *testing.T) { 55 if !gen.IsLocal() && !*long { 56 t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source") 57 } 58 t.Skip("must first update to new file format to support test") 59 for _, test := range loadTestData() { 60 doTest(t, test) 61 } 62} 63 64func Error(e error) { 65 if e != nil { 66 log.Fatal(e) 67 } 68} 69 70// parseUCA parses a Default Unicode Collation Element Table of the format 71// specified in https://www.unicode.org/reports/tr10/#File_Format. 72// It returns the variable top. 73func parseUCA(builder *build.Builder) { 74 r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt") 75 defer r.Close() 76 input := bufio.NewReader(r) 77 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) 78 for i := 1; true; i++ { 79 l, prefix, err := input.ReadLine() 80 if err == io.EOF { 81 break 82 } 83 Error(err) 84 line := string(l) 85 if prefix { 86 log.Fatalf("%d: buffer overflow", i) 87 } 88 if len(line) == 0 || line[0] == '#' { 89 continue 90 } 91 if line[0] == '@' { 92 if strings.HasPrefix(line[1:], "version ") { 93 if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() { 94 log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion()) 95 } 96 } 97 } else { 98 // parse entries 99 part := strings.Split(line, " ; ") 100 if len(part) != 2 { 101 log.Fatalf("%d: production rule without ';': %v", i, line) 102 } 103 lhs := []rune{} 104 for _, v := range strings.Split(part[0], " ") { 105 if v != "" { 106 lhs = append(lhs, rune(convHex(i, v))) 107 } 108 } 109 vars := []int{} 110 rhs := [][]int{} 111 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { 112 if m[1] == "*" { 113 vars = append(vars, i) 114 } 115 elem := []int{} 116 for _, h := range strings.Split(m[2], 
".") { 117 elem = append(elem, convHex(i, h)) 118 } 119 rhs = append(rhs, elem) 120 } 121 builder.Add(lhs, rhs, vars) 122 } 123 } 124} 125 126func convHex(line int, s string) int { 127 r, e := strconv.ParseInt(s, 16, 32) 128 if e != nil { 129 log.Fatalf("%d: %v", line, e) 130 } 131 return int(r) 132} 133 134func loadTestData() []Test { 135 f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip") 136 buffer, err := ioutil.ReadAll(f) 137 f.Close() 138 Error(err) 139 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) 140 Error(err) 141 tests := []Test{} 142 for _, f := range archive.File { 143 // Skip the short versions, which are simply duplicates of the long versions. 144 if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() { 145 continue 146 } 147 ff, err := f.Open() 148 Error(err) 149 defer ff.Close() 150 scanner := bufio.NewScanner(ff) 151 test := Test{name: path.Base(f.Name)} 152 for scanner.Scan() { 153 line := scanner.Text() 154 if len(line) <= 1 || line[0] == '#' { 155 if m := versionRe.FindStringSubmatch(line); m != nil { 156 if m[1] != gen.UnicodeVersion() { 157 log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion()) 158 } 159 } 160 continue 161 } 162 m := testRe.FindStringSubmatch(line) 163 if m == nil || len(m) < 3 { 164 log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m) 165 } 166 str := []byte{} 167 // In the regression test data (unpaired) surrogates are assigned a weight 168 // corresponding to their code point value. However, utf8.DecodeRune, 169 // which is used to compute the implicit weight, assigns FFFD to surrogates. 170 // We therefore skip tests with surrogates. This skips about 35 entries 171 // per test. 172 valid := true 173 for _, split := range strings.Split(m[1], " ") { 174 r, err := strconv.ParseUint(split, 16, 64) 175 Error(err) 176 valid = valid && utf8.ValidRune(rune(r)) 177 str = append(str, string(rune(r))...) 
178 } 179 if valid { 180 test.str = append(test.str, str) 181 test.comment = append(test.comment, m[2]) 182 } 183 } 184 if scanner.Err() != nil { 185 log.Fatal(scanner.Err()) 186 } 187 tests = append(tests, test) 188 } 189 return tests 190} 191 192var errorCount int 193 194func runes(b []byte) []rune { 195 return []rune(string(b)) 196} 197 198var shifted = language.MustParse("und-u-ka-shifted-ks-level4") 199 200func doTest(t *testing.T, tc Test) { 201 bld := build.NewBuilder() 202 parseUCA(bld) 203 w, err := bld.Build() 204 Error(err) 205 var tag language.Tag 206 if !strings.Contains(tc.name, "NON_IGNOR") { 207 tag = shifted 208 } 209 c := NewFromTable(w, OptionsFromTag(tag)) 210 b := &Buffer{} 211 prev := tc.str[0] 212 for i := 1; i < len(tc.str); i++ { 213 b.Reset() 214 s := tc.str[i] 215 ka := c.Key(b, prev) 216 kb := c.Key(b, s) 217 if r := bytes.Compare(ka, kb); r == 1 { 218 t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r) 219 prev = s 220 continue 221 } 222 if r := c.Compare(prev, s); r == 1 { 223 t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r) 224 } 225 if r := c.Compare(s, prev); r == -1 { 226 t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r) 227 } 228 prev = s 229 } 230} 231