1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Collation table generator. 8// Data read from the web. 9 10package main 11 12import ( 13 "archive/zip" 14 "bufio" 15 "bytes" 16 "flag" 17 "fmt" 18 "io" 19 "io/ioutil" 20 "log" 21 "os" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode/utf8" 27 28 "golang.org/x/text/collate" 29 "golang.org/x/text/collate/build" 30 "golang.org/x/text/internal/colltab" 31 "golang.org/x/text/internal/gen" 32 "golang.org/x/text/language" 33 "golang.org/x/text/unicode/cldr" 34) 35 36var ( 37 test = flag.Bool("test", false, 38 "test existing tables; can be used to compare web data with package data.") 39 short = flag.Bool("short", false, `Use "short" alternatives, when available.`) 40 draft = flag.Bool("draft", false, `Use draft versions, when available.`) 41 tags = flag.String("tags", "", "build tags to be included after +build directive") 42 pkg = flag.String("package", "collate", 43 "the name of the package in which the generated file is to be included") 44 45 tables = flagStringSetAllowAll("tables", "collate", "collate,chars", 46 "comma-spearated list of tables to generate.") 47 exclude = flagStringSet("exclude", "zh2", "", 48 "comma-separated list of languages to exclude.") 49 include = flagStringSet("include", "", "", 50 "comma-separated list of languages to include. Include trumps exclude.") 51 // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons) 52 // TODO: Not included: traditional (buggy for Bengali) 53 types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "", 54 "comma-separated list of types that should be included.") 55) 56 57// stringSet implements an ordered set based on a list. It implements flag.Value 58// to allow a set to be specified as a comma-separated list. 59type stringSet struct { 60 s []string 61 allowed *stringSet 62 dirty bool // needs compaction if true 63 all bool 64 allowAll bool 65} 66 67func flagStringSet(name, def, allowed, usage string) *stringSet { 68 ss := &stringSet{} 69 if allowed != "" { 70 usage += fmt.Sprintf(" (allowed values: any of %s)", allowed) 71 ss.allowed = &stringSet{} 72 failOnError(ss.allowed.Set(allowed)) 73 } 74 ss.Set(def) 75 flag.Var(ss, name, usage) 76 return ss 77} 78 79func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet { 80 ss := &stringSet{allowAll: true} 81 if allowed == "" { 82 flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`)) 83 } else { 84 ss.allowed = &stringSet{} 85 failOnError(ss.allowed.Set(allowed)) 86 flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed)) 87 } 88 ss.Set(def) 89 return ss 90} 91 92func (ss stringSet) Len() int { 93 return len(ss.s) 94} 95 96func (ss stringSet) String() string { 97 return strings.Join(ss.s, ",") 98} 99 100func (ss *stringSet) Set(s string) error { 101 if ss.allowAll && s == "all" { 102 ss.s = nil 103 ss.all = true 104 return nil 105 } 106 ss.s = ss.s[:0] 107 for _, s := range strings.Split(s, ",") { 108 if s := strings.TrimSpace(s); s != "" { 109 if ss.allowed != nil && !ss.allowed.contains(s) { 110 return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed) 111 } 112 ss.add(s) 113 } 114 } 115 ss.compact() 116 return nil 117} 118 119func (ss *stringSet) add(s string) { 120 ss.s = append(ss.s, s) 121 ss.dirty = true 122} 123 124func (ss *stringSet) values() []string { 125 ss.compact() 126 return ss.s 127} 128 129func (ss *stringSet) contains(s string) bool { 130 if ss.all { 131 return true 132 } 133 for _, v := range ss.s { 134 if v == s { 135 return true 136 } 137 } 138 return false 139} 140 141func (ss *stringSet) compact() { 142 if !ss.dirty { 143 return 144 } 145 a := ss.s 146 sort.Strings(a) 147 k := 0 148 for i := 1; i < len(a); i++ { 149 if a[k] != a[i] { 150 a[k+1] = a[i] 151 k++ 152 } 153 } 154 ss.s = a[:k+1] 155 ss.dirty = false 156} 157 158func skipLang(l string) bool { 159 if include.Len() > 0 { 160 return !include.contains(l) 161 } 162 return exclude.contains(l) 163} 164 165// altInclude returns a list of alternatives (for the LDML alt attribute) 166// in order of preference. An empty string in this list indicates the 167// default entry. 168func altInclude() []string { 169 l := []string{} 170 if *short { 171 l = append(l, "short") 172 } 173 l = append(l, "") 174 // TODO: handle draft using cldr.SetDraftLevel 175 if *draft { 176 l = append(l, "proposed") 177 } 178 return l 179} 180 181func failOnError(e error) { 182 if e != nil { 183 log.Panic(e) 184 } 185} 186 187func openArchive() *zip.Reader { 188 f := gen.OpenCLDRCoreZip() 189 buffer, err := ioutil.ReadAll(f) 190 f.Close() 191 failOnError(err) 192 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) 193 failOnError(err) 194 return archive 195} 196 197// parseUCA parses a Default Unicode Collation Element Table of the format 198// specified in http://www.unicode.org/reports/tr10/#File_Format. 199// It returns the variable top. 200func parseUCA(builder *build.Builder) { 201 var r io.ReadCloser 202 var err error 203 for _, f := range openArchive().File { 204 if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") { 205 r, err = f.Open() 206 } 207 } 208 if r == nil { 209 log.Fatal("File allkeys_CLDR.txt not found in archive.") 210 } 211 failOnError(err) 212 defer r.Close() 213 scanner := bufio.NewScanner(r) 214 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) 215 for i := 1; scanner.Scan(); i++ { 216 line := scanner.Text() 217 if len(line) == 0 || line[0] == '#' { 218 continue 219 } 220 if line[0] == '@' { 221 // parse properties 222 switch { 223 case strings.HasPrefix(line[1:], "version "): 224 a := strings.Split(line[1:], " ") 225 if a[1] != gen.UnicodeVersion() { 226 log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion()) 227 } 228 case strings.HasPrefix(line[1:], "backwards "): 229 log.Fatalf("%d: unsupported option backwards", i) 230 default: 231 log.Printf("%d: unknown option %s", i, line[1:]) 232 } 233 } else { 234 // parse entries 235 part := strings.Split(line, " ; ") 236 if len(part) != 2 { 237 log.Fatalf("%d: production rule without ';': %v", i, line) 238 } 239 lhs := []rune{} 240 for _, v := range strings.Split(part[0], " ") { 241 if v == "" { 242 continue 243 } 244 lhs = append(lhs, rune(convHex(i, v))) 245 } 246 var n int 247 var vars []int 248 rhs := [][]int{} 249 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { 250 n += len(m[0]) 251 elem := []int{} 252 for _, h := range strings.Split(m[2], ".") { 253 elem = append(elem, convHex(i, h)) 254 } 255 if m[1] == "*" { 256 vars = append(vars, i) 257 } 258 rhs = append(rhs, elem) 259 } 260 if len(part[1]) < n+3 || part[1][n+1] != '#' { 261 log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) 262 } 263 if *test { 264 testInput.add(string(lhs)) 265 } 266 failOnError(builder.Add(lhs, rhs, vars)) 267 } 268 } 269 if scanner.Err() != nil { 270 log.Fatal(scanner.Err()) 271 } 272} 273 274func convHex(line int, s string) int { 275 r, e := strconv.ParseInt(s, 16, 32) 276 if e != nil { 277 log.Fatalf("%d: %v", line, e) 278 } 279 return int(r) 280} 281 282var testInput = stringSet{} 283 284var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`) 285var tagRe = regexp.MustCompile(`<([a-z_]*) */>`) 286 287var mainLocales = []string{} 288 289// charsets holds a list of exemplar characters per category. 290type charSets map[string][]string 291 292func (p charSets) fprint(w io.Writer) { 293 fmt.Fprintln(w, "[exN]string{") 294 for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} { 295 if set := p[k]; len(set) != 0 { 296 fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " ")) 297 } 298 } 299 fmt.Fprintln(w, "\t},") 300} 301 302var localeChars = make(map[string]charSets) 303 304const exemplarHeader = ` 305type exemplarType int 306const ( 307 exCharacters exemplarType = iota 308 exContractions 309 exPunctuation 310 exAuxiliary 311 exCurrency 312 exIndex 313 exN 314) 315` 316 317func printExemplarCharacters(w io.Writer) { 318 fmt.Fprintln(w, exemplarHeader) 319 fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{") 320 for _, loc := range mainLocales { 321 fmt.Fprintf(w, "\t%q: ", loc) 322 localeChars[loc].fprint(w) 323 } 324 fmt.Fprintln(w, "}") 325} 326 327func decodeCLDR(d *cldr.Decoder) *cldr.CLDR { 328 r := gen.OpenCLDRCoreZip() 329 data, err := d.DecodeZip(r) 330 failOnError(err) 331 return data 332} 333 334// parseMain parses XML files in the main directory of the CLDR core.zip file. 335func parseMain() { 336 d := &cldr.Decoder{} 337 d.SetDirFilter("main") 338 d.SetSectionFilter("characters") 339 data := decodeCLDR(d) 340 for _, loc := range data.Locales() { 341 x := data.RawLDML(loc) 342 if skipLang(x.Identity.Language.Type) { 343 continue 344 } 345 if x.Characters != nil { 346 x, _ = data.LDML(loc) 347 loc = language.Make(loc).String() 348 for _, ec := range x.Characters.ExemplarCharacters { 349 if ec.Draft != "" { 350 continue 351 } 352 if _, ok := localeChars[loc]; !ok { 353 mainLocales = append(mainLocales, loc) 354 localeChars[loc] = make(charSets) 355 } 356 localeChars[loc][ec.Type] = parseCharacters(ec.Data()) 357 } 358 } 359 } 360} 361 362func parseCharacters(chars string) []string { 363 parseSingle := func(s string) (r rune, tail string, escaped bool) { 364 if s[0] == '\\' { 365 return rune(s[1]), s[2:], true 366 } 367 r, sz := utf8.DecodeRuneInString(s) 368 return r, s[sz:], false 369 } 370 chars = strings.TrimSpace(chars) 371 if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' { 372 chars = chars[1:n] 373 } 374 list := []string{} 375 var r, last, end rune 376 for len(chars) > 0 { 377 if chars[0] == '{' { // character sequence 378 buf := []rune{} 379 for chars = chars[1:]; len(chars) > 0; { 380 r, chars, _ = parseSingle(chars) 381 if r == '}' { 382 break 383 } 384 if r == ' ' { 385 log.Fatalf("space not supported in sequence %q", chars) 386 } 387 buf = append(buf, r) 388 } 389 list = append(list, string(buf)) 390 last = 0 391 } else { // single character 392 escaped := false 393 r, chars, escaped = parseSingle(chars) 394 if r != ' ' { 395 if r == '-' && !escaped { 396 if last == 0 { 397 log.Fatal("'-' should be preceded by a character") 398 } 399 end, chars, _ = parseSingle(chars) 400 for ; last <= end; last++ { 401 list = append(list, string(last)) 402 } 403 last = 0 404 } else { 405 list = append(list, string(r)) 406 last = r 407 } 408 } 409 } 410 } 411 return list 412} 413 414var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`) 415 416// typeMap translates legacy type keys to their BCP47 equivalent. 417var typeMap = map[string]string{ 418 "phonebook": "phonebk", 419 "traditional": "trad", 420} 421 422// parseCollation parses XML files in the collation directory of the CLDR core.zip file. 423func parseCollation(b *build.Builder) { 424 d := &cldr.Decoder{} 425 d.SetDirFilter("collation") 426 data := decodeCLDR(d) 427 for _, loc := range data.Locales() { 428 x, err := data.LDML(loc) 429 failOnError(err) 430 if skipLang(x.Identity.Language.Type) { 431 continue 432 } 433 cs := x.Collations.Collation 434 sl := cldr.MakeSlice(&cs) 435 if len(types.s) == 0 { 436 sl.SelectAnyOf("type", x.Collations.Default()) 437 } else if !types.all { 438 sl.SelectAnyOf("type", types.s...) 439 } 440 sl.SelectOnePerGroup("alt", altInclude()) 441 442 for _, c := range cs { 443 id, err := language.Parse(loc) 444 if err != nil { 445 fmt.Fprintf(os.Stderr, "invalid locale: %q", err) 446 continue 447 } 448 // Support both old- and new-style defaults. 449 d := c.Type 450 if x.Collations.DefaultCollation == nil { 451 d = x.Collations.Default() 452 } else { 453 d = x.Collations.DefaultCollation.Data() 454 } 455 // We assume tables are being built either for search or collation, 456 // but not both. For search the default is always "search". 457 if d != c.Type && c.Type != "search" { 458 typ := c.Type 459 if len(c.Type) > 8 { 460 typ = typeMap[c.Type] 461 } 462 id, err = id.SetTypeForKey("co", typ) 463 failOnError(err) 464 } 465 t := b.Tailoring(id) 466 c.Process(processor{t}) 467 } 468 } 469} 470 471type processor struct { 472 t *build.Tailoring 473} 474 475func (p processor) Reset(anchor string, before int) (err error) { 476 if before != 0 { 477 err = p.t.SetAnchorBefore(anchor) 478 } else { 479 err = p.t.SetAnchor(anchor) 480 } 481 failOnError(err) 482 return nil 483} 484 485func (p processor) Insert(level int, str, context, extend string) error { 486 str = context + str 487 if *test { 488 testInput.add(str) 489 } 490 // TODO: mimic bug in old maketables: remove. 491 err := p.t.Insert(colltab.Level(level-1), str, context+extend) 492 failOnError(err) 493 return nil 494} 495 496func (p processor) Index(id string) { 497} 498 499func testCollator(c *collate.Collator) { 500 c0 := collate.New(language.Und) 501 502 // iterator over all characters for all locales and check 503 // whether Key is equal. 504 buf := collate.Buffer{} 505 506 // Add all common and not too uncommon runes to the test set. 507 for i := rune(0); i < 0x30000; i++ { 508 testInput.add(string(i)) 509 } 510 for i := rune(0xE0000); i < 0xF0000; i++ { 511 testInput.add(string(i)) 512 } 513 for _, str := range testInput.values() { 514 k0 := c0.KeyFromString(&buf, str) 515 k := c.KeyFromString(&buf, str) 516 if !bytes.Equal(k0, k) { 517 failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) 518 } 519 buf.Reset() 520 } 521 fmt.Println("PASS") 522} 523 524func main() { 525 gen.Init() 526 b := build.NewBuilder() 527 parseUCA(b) 528 if tables.contains("chars") { 529 parseMain() 530 } 531 parseCollation(b) 532 533 c, err := b.Build() 534 failOnError(err) 535 536 if *test { 537 testCollator(collate.NewFromTable(c)) 538 } else { 539 w := &bytes.Buffer{} 540 541 gen.WriteUnicodeVersion(w) 542 gen.WriteCLDRVersion(w) 543 544 if tables.contains("collate") { 545 _, err = b.Print(w) 546 failOnError(err) 547 } 548 if tables.contains("chars") { 549 printExemplarCharacters(w) 550 } 551 gen.WriteGoFile("tables.go", *pkg, w.Bytes()) 552 } 553} 554