1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Language tag table generator. 8// Data read from the web. 9 10package main 11 12import ( 13 "flag" 14 "fmt" 15 "io" 16 "log" 17 "sort" 18 "strconv" 19 "strings" 20 21 "golang.org/x/text/internal/gen" 22 "golang.org/x/text/internal/language" 23 "golang.org/x/text/unicode/cldr" 24) 25 26var ( 27 test = flag.Bool("test", 28 false, 29 "test existing tables; can be used to compare web data with package data.") 30 outputFile = flag.String("output", 31 "tables.go", 32 "output file for generated tables") 33) 34 35func main() { 36 gen.Init() 37 38 w := gen.NewCodeWriter() 39 defer w.WriteGoFile("tables.go", "language") 40 41 b := newBuilder(w) 42 gen.WriteCLDRVersion(w) 43 44 b.writeConstants() 45 b.writeMatchData() 46} 47 48type builder struct { 49 w *gen.CodeWriter 50 hw io.Writer // MultiWriter for w and w.Hash 51 data *cldr.CLDR 52 supp *cldr.SupplementalData 53} 54 55func (b *builder) langIndex(s string) uint16 { 56 return uint16(language.MustParseBase(s)) 57} 58 59func (b *builder) regionIndex(s string) int { 60 return int(language.MustParseRegion(s)) 61} 62 63func (b *builder) scriptIndex(s string) int { 64 return int(language.MustParseScript(s)) 65} 66 67func newBuilder(w *gen.CodeWriter) *builder { 68 r := gen.OpenCLDRCoreZip() 69 defer r.Close() 70 d := &cldr.Decoder{} 71 data, err := d.DecodeZip(r) 72 if err != nil { 73 log.Fatal(err) 74 } 75 b := builder{ 76 w: w, 77 hw: io.MultiWriter(w, w.Hash), 78 data: data, 79 supp: data.Supplemental(), 80 } 81 return &b 82} 83 84// writeConsts computes f(v) for all v in values and writes the results 85// as constants named _v to a single constant block. 86func (b *builder) writeConsts(f func(string) int, values ...string) { 87 fmt.Fprintln(b.w, "const (") 88 for _, v := range values { 89 fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v)) 90 } 91 fmt.Fprintln(b.w, ")") 92} 93 94// TODO: region inclusion data will probably not be use used in future matchers. 95 96var langConsts = []string{ 97 "de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und", 98} 99 100var scriptConsts = []string{ 101 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 102 "Zzzz", 103} 104 105var regionConsts = []string{ 106 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 107 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 108} 109 110func (b *builder) writeConstants() { 111 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 112 b.writeConsts(b.regionIndex, regionConsts...) 113 b.writeConsts(b.scriptIndex, scriptConsts...) 114} 115 116type mutualIntelligibility struct { 117 want, have uint16 118 distance uint8 119 oneway bool 120} 121 122type scriptIntelligibility struct { 123 wantLang, haveLang uint16 124 wantScript, haveScript uint8 125 distance uint8 126 // Always oneway 127} 128 129type regionIntelligibility struct { 130 lang uint16 // compact language id 131 script uint8 // 0 means any 132 group uint8 // 0 means any; if bit 7 is set it means inverse 133 distance uint8 134 // Always twoway. 135} 136 137// writeMatchData writes tables with languages and scripts for which there is 138// mutual intelligibility. The data is based on CLDR's languageMatching data. 139// Note that we use a different algorithm than the one defined by CLDR and that 140// we slightly modify the data. For example, we convert scores to confidence levels. 141// We also drop all region-related data as we use a different algorithm to 142// determine region equivalence. 143func (b *builder) writeMatchData() { 144 lm := b.supp.LanguageMatching.LanguageMatches 145 cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") 146 147 regionHierarchy := map[string][]string{} 148 for _, g := range b.supp.TerritoryContainment.Group { 149 regions := strings.Split(g.Contains, " ") 150 regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) 151 } 152 regionToGroups := make([]uint8, language.NumRegions) 153 154 idToIndex := map[string]uint8{} 155 for i, mv := range lm[0].MatchVariable { 156 if i > 6 { 157 log.Fatalf("Too many groups: %d", i) 158 } 159 idToIndex[mv.Id] = uint8(i + 1) 160 // TODO: also handle '-' 161 for _, r := range strings.Split(mv.Value, "+") { 162 todo := []string{r} 163 for k := 0; k < len(todo); k++ { 164 r := todo[k] 165 regionToGroups[b.regionIndex(r)] |= 1 << uint8(i) 166 todo = append(todo, regionHierarchy[r]...) 167 } 168 } 169 } 170 b.w.WriteVar("regionToGroups", regionToGroups) 171 172 // maps language id to in- and out-of-group region. 173 paradigmLocales := [][3]uint16{} 174 locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") 175 for i := 0; i < len(locales); i += 2 { 176 x := [3]uint16{} 177 for j := 0; j < 2; j++ { 178 pc := strings.SplitN(locales[i+j], "-", 2) 179 x[0] = b.langIndex(pc[0]) 180 if len(pc) == 2 { 181 x[1+j] = uint16(b.regionIndex(pc[1])) 182 } 183 } 184 paradigmLocales = append(paradigmLocales, x) 185 } 186 b.w.WriteVar("paradigmLocales", paradigmLocales) 187 188 b.w.WriteType(mutualIntelligibility{}) 189 b.w.WriteType(scriptIntelligibility{}) 190 b.w.WriteType(regionIntelligibility{}) 191 192 matchLang := []mutualIntelligibility{} 193 matchScript := []scriptIntelligibility{} 194 matchRegion := []regionIntelligibility{} 195 // Convert the languageMatch entries in lists keyed by desired language. 196 for _, m := range lm[0].LanguageMatch { 197 // Different versions of CLDR use different separators. 198 desired := strings.Replace(m.Desired, "-", "_", -1) 199 supported := strings.Replace(m.Supported, "-", "_", -1) 200 d := strings.Split(desired, "_") 201 s := strings.Split(supported, "_") 202 if len(d) != len(s) { 203 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 204 continue 205 } 206 distance, _ := strconv.ParseInt(m.Distance, 10, 8) 207 switch len(d) { 208 case 2: 209 if desired == supported && desired == "*_*" { 210 continue 211 } 212 // language-script pair. 213 matchScript = append(matchScript, scriptIntelligibility{ 214 wantLang: uint16(b.langIndex(d[0])), 215 haveLang: uint16(b.langIndex(s[0])), 216 wantScript: uint8(b.scriptIndex(d[1])), 217 haveScript: uint8(b.scriptIndex(s[1])), 218 distance: uint8(distance), 219 }) 220 if m.Oneway != "true" { 221 matchScript = append(matchScript, scriptIntelligibility{ 222 wantLang: uint16(b.langIndex(s[0])), 223 haveLang: uint16(b.langIndex(d[0])), 224 wantScript: uint8(b.scriptIndex(s[1])), 225 haveScript: uint8(b.scriptIndex(d[1])), 226 distance: uint8(distance), 227 }) 228 } 229 case 1: 230 if desired == supported && desired == "*" { 231 continue 232 } 233 if distance == 1 { 234 // nb == no is already handled by macro mapping. Check there 235 // really is only this case. 236 if d[0] != "no" || s[0] != "nb" { 237 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) 238 } 239 continue 240 } 241 // TODO: consider dropping oneway field and just doubling the entry. 242 matchLang = append(matchLang, mutualIntelligibility{ 243 want: uint16(b.langIndex(d[0])), 244 have: uint16(b.langIndex(s[0])), 245 distance: uint8(distance), 246 oneway: m.Oneway == "true", 247 }) 248 case 3: 249 if desired == supported && desired == "*_*_*" { 250 continue 251 } 252 if desired != supported { 253 // This is now supported by CLDR, but only one case, which 254 // should already be covered by paradigm locales. For instance, 255 // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in 256 // testdata/CLDRLocaleMatcherTest.txt tests this. 257 if supported != "en_*_GB" { 258 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 259 } 260 continue 261 } 262 ri := regionIntelligibility{ 263 lang: b.langIndex(d[0]), 264 distance: uint8(distance), 265 } 266 if d[1] != "*" { 267 ri.script = uint8(b.scriptIndex(d[1])) 268 } 269 switch { 270 case d[2] == "*": 271 ri.group = 0x80 // not contained in anything 272 case strings.HasPrefix(d[2], "$!"): 273 ri.group = 0x80 274 d[2] = "$" + d[2][len("$!"):] 275 fallthrough 276 case strings.HasPrefix(d[2], "$"): 277 ri.group |= idToIndex[d[2]] 278 } 279 matchRegion = append(matchRegion, ri) 280 default: 281 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 282 } 283 } 284 sort.SliceStable(matchLang, func(i, j int) bool { 285 return matchLang[i].distance < matchLang[j].distance 286 }) 287 b.w.WriteComment(` 288 matchLang holds pairs of langIDs of base languages that are typically 289 mutually intelligible. Each pair is associated with a confidence and 290 whether the intelligibility goes one or both ways.`) 291 b.w.WriteVar("matchLang", matchLang) 292 293 b.w.WriteComment(` 294 matchScript holds pairs of scriptIDs where readers of one script 295 can typically also read the other. Each is associated with a confidence.`) 296 sort.SliceStable(matchScript, func(i, j int) bool { 297 return matchScript[i].distance < matchScript[j].distance 298 }) 299 b.w.WriteVar("matchScript", matchScript) 300 301 sort.SliceStable(matchRegion, func(i, j int) bool { 302 return matchRegion[i].distance < matchRegion[j].distance 303 }) 304 b.w.WriteVar("matchRegion", matchRegion) 305} 306