1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8// Language tag table generator. 9// Data read from the web. 10 11package main 12 13import ( 14 "flag" 15 "fmt" 16 "io" 17 "log" 18 "sort" 19 "strconv" 20 "strings" 21 22 "golang.org/x/text/internal/gen" 23 "golang.org/x/text/internal/language" 24 "golang.org/x/text/unicode/cldr" 25) 26 27var ( 28 test = flag.Bool("test", 29 false, 30 "test existing tables; can be used to compare web data with package data.") 31 outputFile = flag.String("output", 32 "tables.go", 33 "output file for generated tables") 34) 35 36func main() { 37 gen.Init() 38 39 w := gen.NewCodeWriter() 40 defer w.WriteGoFile("tables.go", "language") 41 42 b := newBuilder(w) 43 gen.WriteCLDRVersion(w) 44 45 b.writeConstants() 46 b.writeMatchData() 47} 48 49type builder struct { 50 w *gen.CodeWriter 51 hw io.Writer // MultiWriter for w and w.Hash 52 data *cldr.CLDR 53 supp *cldr.SupplementalData 54} 55 56func (b *builder) langIndex(s string) uint16 { 57 return uint16(language.MustParseBase(s)) 58} 59 60func (b *builder) regionIndex(s string) int { 61 return int(language.MustParseRegion(s)) 62} 63 64func (b *builder) scriptIndex(s string) int { 65 return int(language.MustParseScript(s)) 66} 67 68func newBuilder(w *gen.CodeWriter) *builder { 69 r := gen.OpenCLDRCoreZip() 70 defer r.Close() 71 d := &cldr.Decoder{} 72 data, err := d.DecodeZip(r) 73 if err != nil { 74 log.Fatal(err) 75 } 76 b := builder{ 77 w: w, 78 hw: io.MultiWriter(w, w.Hash), 79 data: data, 80 supp: data.Supplemental(), 81 } 82 return &b 83} 84 85// writeConsts computes f(v) for all v in values and writes the results 86// as constants named _v to a single constant block. 87func (b *builder) writeConsts(f func(string) int, values ...string) { 88 fmt.Fprintln(b.w, "const (") 89 for _, v := range values { 90 fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v)) 91 } 92 fmt.Fprintln(b.w, ")") 93} 94 95// TODO: region inclusion data will probably not be use used in future matchers. 96 97var langConsts = []string{ 98 "de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und", 99} 100 101var scriptConsts = []string{ 102 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 103 "Zzzz", 104} 105 106var regionConsts = []string{ 107 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 108 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 109} 110 111func (b *builder) writeConstants() { 112 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 113 b.writeConsts(b.regionIndex, regionConsts...) 114 b.writeConsts(b.scriptIndex, scriptConsts...) 115} 116 117type mutualIntelligibility struct { 118 want, have uint16 119 distance uint8 120 oneway bool 121} 122 123type scriptIntelligibility struct { 124 wantLang, haveLang uint16 125 wantScript, haveScript uint8 126 distance uint8 127 // Always oneway 128} 129 130type regionIntelligibility struct { 131 lang uint16 // compact language id 132 script uint8 // 0 means any 133 group uint8 // 0 means any; if bit 7 is set it means inverse 134 distance uint8 135 // Always twoway. 136} 137 138// writeMatchData writes tables with languages and scripts for which there is 139// mutual intelligibility. The data is based on CLDR's languageMatching data. 140// Note that we use a different algorithm than the one defined by CLDR and that 141// we slightly modify the data. For example, we convert scores to confidence levels. 142// We also drop all region-related data as we use a different algorithm to 143// determine region equivalence. 144func (b *builder) writeMatchData() { 145 lm := b.supp.LanguageMatching.LanguageMatches 146 cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") 147 148 regionHierarchy := map[string][]string{} 149 for _, g := range b.supp.TerritoryContainment.Group { 150 regions := strings.Split(g.Contains, " ") 151 regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) 152 } 153 // Regions start at 1, so the slice must be one larger than the number of 154 // regions. 155 regionToGroups := make([]uint8, language.NumRegions+1) 156 157 idToIndex := map[string]uint8{} 158 for i, mv := range lm[0].MatchVariable { 159 if i > 6 { 160 log.Fatalf("Too many groups: %d", i) 161 } 162 idToIndex[mv.Id] = uint8(i + 1) 163 // TODO: also handle '-' 164 for _, r := range strings.Split(mv.Value, "+") { 165 todo := []string{r} 166 for k := 0; k < len(todo); k++ { 167 r := todo[k] 168 regionToGroups[b.regionIndex(r)] |= 1 << uint8(i) 169 todo = append(todo, regionHierarchy[r]...) 170 } 171 } 172 } 173 b.w.WriteVar("regionToGroups", regionToGroups) 174 175 // maps language id to in- and out-of-group region. 176 paradigmLocales := [][3]uint16{} 177 locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") 178 for i := 0; i < len(locales); i += 2 { 179 x := [3]uint16{} 180 for j := 0; j < 2; j++ { 181 pc := strings.SplitN(locales[i+j], "-", 2) 182 x[0] = b.langIndex(pc[0]) 183 if len(pc) == 2 { 184 x[1+j] = uint16(b.regionIndex(pc[1])) 185 } 186 } 187 paradigmLocales = append(paradigmLocales, x) 188 } 189 b.w.WriteVar("paradigmLocales", paradigmLocales) 190 191 b.w.WriteType(mutualIntelligibility{}) 192 b.w.WriteType(scriptIntelligibility{}) 193 b.w.WriteType(regionIntelligibility{}) 194 195 matchLang := []mutualIntelligibility{} 196 matchScript := []scriptIntelligibility{} 197 matchRegion := []regionIntelligibility{} 198 // Convert the languageMatch entries in lists keyed by desired language. 199 for _, m := range lm[0].LanguageMatch { 200 // Different versions of CLDR use different separators. 201 desired := strings.Replace(m.Desired, "-", "_", -1) 202 supported := strings.Replace(m.Supported, "-", "_", -1) 203 d := strings.Split(desired, "_") 204 s := strings.Split(supported, "_") 205 if len(d) != len(s) { 206 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 207 continue 208 } 209 distance, _ := strconv.ParseInt(m.Distance, 10, 8) 210 switch len(d) { 211 case 2: 212 if desired == supported && desired == "*_*" { 213 continue 214 } 215 // language-script pair. 216 matchScript = append(matchScript, scriptIntelligibility{ 217 wantLang: uint16(b.langIndex(d[0])), 218 haveLang: uint16(b.langIndex(s[0])), 219 wantScript: uint8(b.scriptIndex(d[1])), 220 haveScript: uint8(b.scriptIndex(s[1])), 221 distance: uint8(distance), 222 }) 223 if m.Oneway != "true" { 224 matchScript = append(matchScript, scriptIntelligibility{ 225 wantLang: uint16(b.langIndex(s[0])), 226 haveLang: uint16(b.langIndex(d[0])), 227 wantScript: uint8(b.scriptIndex(s[1])), 228 haveScript: uint8(b.scriptIndex(d[1])), 229 distance: uint8(distance), 230 }) 231 } 232 case 1: 233 if desired == supported && desired == "*" { 234 continue 235 } 236 if distance == 1 { 237 // nb == no is already handled by macro mapping. Check there 238 // really is only this case. 239 if d[0] != "no" || s[0] != "nb" { 240 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) 241 } 242 continue 243 } 244 // TODO: consider dropping oneway field and just doubling the entry. 245 matchLang = append(matchLang, mutualIntelligibility{ 246 want: uint16(b.langIndex(d[0])), 247 have: uint16(b.langIndex(s[0])), 248 distance: uint8(distance), 249 oneway: m.Oneway == "true", 250 }) 251 case 3: 252 if desired == supported && desired == "*_*_*" { 253 continue 254 } 255 if desired != supported { 256 // This is now supported by CLDR, but only one case, which 257 // should already be covered by paradigm locales. For instance, 258 // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in 259 // testdata/CLDRLocaleMatcherTest.txt tests this. 260 if supported != "en_*_GB" { 261 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 262 } 263 continue 264 } 265 ri := regionIntelligibility{ 266 lang: b.langIndex(d[0]), 267 distance: uint8(distance), 268 } 269 if d[1] != "*" { 270 ri.script = uint8(b.scriptIndex(d[1])) 271 } 272 switch { 273 case d[2] == "*": 274 ri.group = 0x80 // not contained in anything 275 case strings.HasPrefix(d[2], "$!"): 276 ri.group = 0x80 277 d[2] = "$" + d[2][len("$!"):] 278 fallthrough 279 case strings.HasPrefix(d[2], "$"): 280 ri.group |= idToIndex[d[2]] 281 } 282 matchRegion = append(matchRegion, ri) 283 default: 284 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 285 } 286 } 287 sort.SliceStable(matchLang, func(i, j int) bool { 288 return matchLang[i].distance < matchLang[j].distance 289 }) 290 b.w.WriteComment(` 291 matchLang holds pairs of langIDs of base languages that are typically 292 mutually intelligible. Each pair is associated with a confidence and 293 whether the intelligibility goes one or both ways.`) 294 b.w.WriteVar("matchLang", matchLang) 295 296 b.w.WriteComment(` 297 matchScript holds pairs of scriptIDs where readers of one script 298 can typically also read the other. Each is associated with a confidence.`) 299 sort.SliceStable(matchScript, func(i, j int) bool { 300 return matchScript[i].distance < matchScript[j].distance 301 }) 302 b.w.WriteVar("matchScript", matchScript) 303 304 sort.SliceStable(matchRegion, func(i, j int) bool { 305 return matchRegion[i].distance < matchRegion[j].distance 306 }) 307 b.w.WriteVar("matchRegion", matchRegion) 308} 309