1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8// Collation table generator. 9// Data read from the web. 10 11package main 12 13import ( 14 "archive/zip" 15 "bufio" 16 "bytes" 17 "flag" 18 "fmt" 19 "io" 20 "io/ioutil" 21 "log" 22 "os" 23 "regexp" 24 "sort" 25 "strconv" 26 "strings" 27 "unicode/utf8" 28 29 "golang.org/x/text/collate" 30 "golang.org/x/text/collate/build" 31 "golang.org/x/text/internal/colltab" 32 "golang.org/x/text/internal/gen" 33 "golang.org/x/text/language" 34 "golang.org/x/text/unicode/cldr" 35) 36 37var ( 38 test = flag.Bool("test", false, 39 "test existing tables; can be used to compare web data with package data.") 40 short = flag.Bool("short", false, `Use "short" alternatives, when available.`) 41 draft = flag.Bool("draft", false, `Use draft versions, when available.`) 42 tags = flag.String("tags", "", "build tags to be included after +build directive") 43 pkg = flag.String("package", "collate", 44 "the name of the package in which the generated file is to be included") 45 46 tables = flagStringSetAllowAll("tables", "collate", "collate,chars", 47 "comma-spearated list of tables to generate.") 48 exclude = flagStringSet("exclude", "zh2", "", 49 "comma-separated list of languages to exclude.") 50 include = flagStringSet("include", "", "", 51 "comma-separated list of languages to include. Include trumps exclude.") 52 // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons) 53 // TODO: Not included: traditional (buggy for Bengali) 54 types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "", 55 "comma-separated list of types that should be included.") 56) 57 58// stringSet implements an ordered set based on a list. It implements flag.Value 59// to allow a set to be specified as a comma-separated list. 60type stringSet struct { 61 s []string 62 allowed *stringSet 63 dirty bool // needs compaction if true 64 all bool 65 allowAll bool 66} 67 68func flagStringSet(name, def, allowed, usage string) *stringSet { 69 ss := &stringSet{} 70 if allowed != "" { 71 usage += fmt.Sprintf(" (allowed values: any of %s)", allowed) 72 ss.allowed = &stringSet{} 73 failOnError(ss.allowed.Set(allowed)) 74 } 75 ss.Set(def) 76 flag.Var(ss, name, usage) 77 return ss 78} 79 80func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet { 81 ss := &stringSet{allowAll: true} 82 if allowed == "" { 83 flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`)) 84 } else { 85 ss.allowed = &stringSet{} 86 failOnError(ss.allowed.Set(allowed)) 87 flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed)) 88 } 89 ss.Set(def) 90 return ss 91} 92 93func (ss stringSet) Len() int { 94 return len(ss.s) 95} 96 97func (ss stringSet) String() string { 98 return strings.Join(ss.s, ",") 99} 100 101func (ss *stringSet) Set(s string) error { 102 if ss.allowAll && s == "all" { 103 ss.s = nil 104 ss.all = true 105 return nil 106 } 107 ss.s = ss.s[:0] 108 for _, s := range strings.Split(s, ",") { 109 if s := strings.TrimSpace(s); s != "" { 110 if ss.allowed != nil && !ss.allowed.contains(s) { 111 return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed) 112 } 113 ss.add(s) 114 } 115 } 116 ss.compact() 117 return nil 118} 119 120func (ss *stringSet) add(s string) { 121 ss.s = append(ss.s, s) 122 ss.dirty = true 123} 124 125func (ss *stringSet) values() []string { 126 ss.compact() 127 return ss.s 128} 129 130func (ss *stringSet) contains(s string) bool { 131 if ss.all { 132 return true 133 } 134 for _, v := range ss.s { 135 if v == s { 136 return true 137 } 138 } 139 return false 140} 141 142func (ss *stringSet) compact() { 143 if !ss.dirty { 144 return 145 } 146 a := ss.s 147 sort.Strings(a) 148 k := 0 149 for i := 1; i < len(a); i++ { 150 if a[k] != a[i] { 151 a[k+1] = a[i] 152 k++ 153 } 154 } 155 ss.s = a[:k+1] 156 ss.dirty = false 157} 158 159func skipLang(l string) bool { 160 if include.Len() > 0 { 161 return !include.contains(l) 162 } 163 return exclude.contains(l) 164} 165 166// altInclude returns a list of alternatives (for the LDML alt attribute) 167// in order of preference. An empty string in this list indicates the 168// default entry. 169func altInclude() []string { 170 l := []string{} 171 if *short { 172 l = append(l, "short") 173 } 174 l = append(l, "") 175 // TODO: handle draft using cldr.SetDraftLevel 176 if *draft { 177 l = append(l, "proposed") 178 } 179 return l 180} 181 182func failOnError(e error) { 183 if e != nil { 184 log.Panic(e) 185 } 186} 187 188func openArchive() *zip.Reader { 189 f := gen.OpenCLDRCoreZip() 190 buffer, err := ioutil.ReadAll(f) 191 f.Close() 192 failOnError(err) 193 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) 194 failOnError(err) 195 return archive 196} 197 198// parseUCA parses a Default Unicode Collation Element Table of the format 199// specified in https://www.unicode.org/reports/tr10/#File_Format. 200// It returns the variable top. 201func parseUCA(builder *build.Builder) { 202 var r io.ReadCloser 203 var err error 204 for _, f := range openArchive().File { 205 if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") { 206 r, err = f.Open() 207 } 208 } 209 if r == nil { 210 log.Fatal("File allkeys_CLDR.txt not found in archive.") 211 } 212 failOnError(err) 213 defer r.Close() 214 scanner := bufio.NewScanner(r) 215 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) 216 for i := 1; scanner.Scan(); i++ { 217 line := scanner.Text() 218 if len(line) == 0 || line[0] == '#' { 219 continue 220 } 221 if line[0] == '@' { 222 // parse properties 223 switch { 224 case strings.HasPrefix(line[1:], "version "): 225 a := strings.Split(line[1:], " ") 226 if a[1] != gen.UnicodeVersion() { 227 log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion()) 228 } 229 case strings.HasPrefix(line[1:], "backwards "): 230 log.Fatalf("%d: unsupported option backwards", i) 231 default: 232 log.Printf("%d: unknown option %s", i, line[1:]) 233 } 234 } else { 235 // parse entries 236 part := strings.Split(line, " ; ") 237 if len(part) != 2 { 238 log.Fatalf("%d: production rule without ';': %v", i, line) 239 } 240 lhs := []rune{} 241 for _, v := range strings.Split(part[0], " ") { 242 if v == "" { 243 continue 244 } 245 lhs = append(lhs, rune(convHex(i, v))) 246 } 247 var n int 248 var vars []int 249 rhs := [][]int{} 250 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { 251 n += len(m[0]) 252 elem := []int{} 253 for _, h := range strings.Split(m[2], ".") { 254 elem = append(elem, convHex(i, h)) 255 } 256 if m[1] == "*" { 257 vars = append(vars, i) 258 } 259 rhs = append(rhs, elem) 260 } 261 if len(part[1]) < n+3 || part[1][n+1] != '#' { 262 log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) 263 } 264 if *test { 265 testInput.add(string(lhs)) 266 } 267 failOnError(builder.Add(lhs, rhs, vars)) 268 } 269 } 270 if scanner.Err() != nil { 271 log.Fatal(scanner.Err()) 272 } 273} 274 275func convHex(line int, s string) int { 276 r, e := strconv.ParseInt(s, 16, 32) 277 if e != nil { 278 log.Fatalf("%d: %v", line, e) 279 } 280 return int(r) 281} 282 283var testInput = stringSet{} 284 285var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`) 286var tagRe = regexp.MustCompile(`<([a-z_]*) */>`) 287 288var mainLocales = []string{} 289 290// charsets holds a list of exemplar characters per category. 291type charSets map[string][]string 292 293func (p charSets) fprint(w io.Writer) { 294 fmt.Fprintln(w, "[exN]string{") 295 for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} { 296 if set := p[k]; len(set) != 0 { 297 fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " ")) 298 } 299 } 300 fmt.Fprintln(w, "\t},") 301} 302 303var localeChars = make(map[string]charSets) 304 305const exemplarHeader = ` 306type exemplarType int 307const ( 308 exCharacters exemplarType = iota 309 exContractions 310 exPunctuation 311 exAuxiliary 312 exCurrency 313 exIndex 314 exN 315) 316` 317 318func printExemplarCharacters(w io.Writer) { 319 fmt.Fprintln(w, exemplarHeader) 320 fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{") 321 for _, loc := range mainLocales { 322 fmt.Fprintf(w, "\t%q: ", loc) 323 localeChars[loc].fprint(w) 324 } 325 fmt.Fprintln(w, "}") 326} 327 328func decodeCLDR(d *cldr.Decoder) *cldr.CLDR { 329 r := gen.OpenCLDRCoreZip() 330 data, err := d.DecodeZip(r) 331 failOnError(err) 332 return data 333} 334 335// parseMain parses XML files in the main directory of the CLDR core.zip file. 336func parseMain() { 337 d := &cldr.Decoder{} 338 d.SetDirFilter("main") 339 d.SetSectionFilter("characters") 340 data := decodeCLDR(d) 341 for _, loc := range data.Locales() { 342 x := data.RawLDML(loc) 343 if skipLang(x.Identity.Language.Type) { 344 continue 345 } 346 if x.Characters != nil { 347 x, _ = data.LDML(loc) 348 loc = language.Make(loc).String() 349 for _, ec := range x.Characters.ExemplarCharacters { 350 if ec.Draft != "" { 351 continue 352 } 353 if _, ok := localeChars[loc]; !ok { 354 mainLocales = append(mainLocales, loc) 355 localeChars[loc] = make(charSets) 356 } 357 localeChars[loc][ec.Type] = parseCharacters(ec.Data()) 358 } 359 } 360 } 361} 362 363func parseCharacters(chars string) []string { 364 parseSingle := func(s string) (r rune, tail string, escaped bool) { 365 if s[0] == '\\' { 366 return rune(s[1]), s[2:], true 367 } 368 r, sz := utf8.DecodeRuneInString(s) 369 return r, s[sz:], false 370 } 371 chars = strings.TrimSpace(chars) 372 if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' { 373 chars = chars[1:n] 374 } 375 list := []string{} 376 var r, last, end rune 377 for len(chars) > 0 { 378 if chars[0] == '{' { // character sequence 379 buf := []rune{} 380 for chars = chars[1:]; len(chars) > 0; { 381 r, chars, _ = parseSingle(chars) 382 if r == '}' { 383 break 384 } 385 if r == ' ' { 386 log.Fatalf("space not supported in sequence %q", chars) 387 } 388 buf = append(buf, r) 389 } 390 list = append(list, string(buf)) 391 last = 0 392 } else { // single character 393 escaped := false 394 r, chars, escaped = parseSingle(chars) 395 if r != ' ' { 396 if r == '-' && !escaped { 397 if last == 0 { 398 log.Fatal("'-' should be preceded by a character") 399 } 400 end, chars, _ = parseSingle(chars) 401 for ; last <= end; last++ { 402 list = append(list, string(last)) 403 } 404 last = 0 405 } else { 406 list = append(list, string(r)) 407 last = r 408 } 409 } 410 } 411 } 412 return list 413} 414 415var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`) 416 417// typeMap translates legacy type keys to their BCP47 equivalent. 418var typeMap = map[string]string{ 419 "phonebook": "phonebk", 420 "traditional": "trad", 421} 422 423// parseCollation parses XML files in the collation directory of the CLDR core.zip file. 424func parseCollation(b *build.Builder) { 425 d := &cldr.Decoder{} 426 d.SetDirFilter("collation") 427 data := decodeCLDR(d) 428 for _, loc := range data.Locales() { 429 x, err := data.LDML(loc) 430 failOnError(err) 431 if skipLang(x.Identity.Language.Type) { 432 continue 433 } 434 cs := x.Collations.Collation 435 sl := cldr.MakeSlice(&cs) 436 if len(types.s) == 0 { 437 sl.SelectAnyOf("type", x.Collations.Default()) 438 } else if !types.all { 439 sl.SelectAnyOf("type", types.s...) 440 } 441 sl.SelectOnePerGroup("alt", altInclude()) 442 443 for _, c := range cs { 444 id, err := language.Parse(loc) 445 if err != nil { 446 fmt.Fprintf(os.Stderr, "invalid locale: %q", err) 447 continue 448 } 449 // Support both old- and new-style defaults. 450 d := c.Type 451 if x.Collations.DefaultCollation == nil { 452 d = x.Collations.Default() 453 } else { 454 d = x.Collations.DefaultCollation.Data() 455 } 456 // We assume tables are being built either for search or collation, 457 // but not both. For search the default is always "search". 458 if d != c.Type && c.Type != "search" { 459 typ := c.Type 460 if len(c.Type) > 8 { 461 typ = typeMap[c.Type] 462 } 463 id, err = id.SetTypeForKey("co", typ) 464 failOnError(err) 465 } 466 t := b.Tailoring(id) 467 c.Process(processor{t}) 468 } 469 } 470} 471 472type processor struct { 473 t *build.Tailoring 474} 475 476func (p processor) Reset(anchor string, before int) (err error) { 477 if before != 0 { 478 err = p.t.SetAnchorBefore(anchor) 479 } else { 480 err = p.t.SetAnchor(anchor) 481 } 482 failOnError(err) 483 return nil 484} 485 486func (p processor) Insert(level int, str, context, extend string) error { 487 str = context + str 488 if *test { 489 testInput.add(str) 490 } 491 // TODO: mimic bug in old maketables: remove. 492 err := p.t.Insert(colltab.Level(level-1), str, context+extend) 493 failOnError(err) 494 return nil 495} 496 497func (p processor) Index(id string) { 498} 499 500func testCollator(c *collate.Collator) { 501 c0 := collate.New(language.Und) 502 503 // iterator over all characters for all locales and check 504 // whether Key is equal. 505 buf := collate.Buffer{} 506 507 // Add all common and not too uncommon runes to the test set. 508 for i := rune(0); i < 0x30000; i++ { 509 testInput.add(string(i)) 510 } 511 for i := rune(0xE0000); i < 0xF0000; i++ { 512 testInput.add(string(i)) 513 } 514 for _, str := range testInput.values() { 515 k0 := c0.KeyFromString(&buf, str) 516 k := c.KeyFromString(&buf, str) 517 if !bytes.Equal(k0, k) { 518 failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) 519 } 520 buf.Reset() 521 } 522 fmt.Println("PASS") 523} 524 525func main() { 526 gen.Init() 527 b := build.NewBuilder() 528 parseUCA(b) 529 if tables.contains("chars") { 530 parseMain() 531 } 532 parseCollation(b) 533 534 c, err := b.Build() 535 failOnError(err) 536 537 if *test { 538 testCollator(collate.NewFromTable(c)) 539 } else { 540 w := &bytes.Buffer{} 541 542 gen.WriteUnicodeVersion(w) 543 gen.WriteCLDRVersion(w) 544 545 if tables.contains("collate") { 546 _, err = b.Print(w) 547 failOnError(err) 548 } 549 if tables.contains("chars") { 550 printExemplarCharacters(w) 551 } 552 gen.WriteGoFile("tables.go", *pkg, w.Bytes()) 553 } 554} 555