1// Copyright (c) 2014 Couchbase, Inc. 2// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3// except in compliance with the License. You may obtain a copy of the License at 4// http://www.apache.org/licenses/LICENSE-2.0 5// Unless required by applicable law or agreed to in writing, software distributed under the 6// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7// either express or implied. See the License for the specific language governing permissions 8// and limitations under the License. 9 10// Modified by Martin Atkins to serve the needs of package textseg. 11 12// +build ignore 13 14package main 15 16import ( 17 "bufio" 18 "flag" 19 "fmt" 20 "io" 21 "log" 22 "net/http" 23 "os" 24 "os/exec" 25 "sort" 26 "strconv" 27 "strings" 28 "unicode" 29) 30 31var url = flag.String("url", 32 "http://www.unicode.org/Public/12.0.0/ucd/auxiliary/", 33 "URL of Unicode database directory") 34var verbose = flag.Bool("verbose", 35 false, 36 "write data to stdout as it is parsed") 37var localFiles = flag.Bool("local", 38 false, 39 "data files have been copied to the current directory; for debugging only") 40var outputFile = flag.String("output", 41 "", 42 "output file for generated tables; default stdout") 43 44var output *bufio.Writer 45 46func main() { 47 flag.Parse() 48 setupOutput() 49 50 graphemePropertyRanges := make(map[string]*unicode.RangeTable) 51 loadUnicodeData("GraphemeBreakProperty.txt", graphemePropertyRanges) 52 wordPropertyRanges := make(map[string]*unicode.RangeTable) 53 loadUnicodeData("WordBreakProperty.txt", wordPropertyRanges) 54 sentencePropertyRanges := make(map[string]*unicode.RangeTable) 55 loadUnicodeData("SentenceBreakProperty.txt", sentencePropertyRanges) 56 57 fmt.Fprintf(output, fileHeader, *url) 58 generateTables("Grapheme", graphemePropertyRanges) 59 generateTables("Word", wordPropertyRanges) 60 generateTables("Sentence", sentencePropertyRanges) 61 62 flushOutput() 63} 64 65// WordBreakProperty.txt has the form: 66// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD 67// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ 68func openReader(file string) (input io.ReadCloser) { 69 if *localFiles { 70 f, err := os.Open(file) 71 if err != nil { 72 log.Fatal(err) 73 } 74 input = f 75 } else { 76 path := *url + file 77 resp, err := http.Get(path) 78 if err != nil { 79 log.Fatal(err) 80 } 81 if resp.StatusCode != 200 { 82 log.Fatal("bad GET status for "+file, resp.Status) 83 } 84 input = resp.Body 85 } 86 return 87} 88 89func loadUnicodeData(filename string, propertyRanges map[string]*unicode.RangeTable) { 90 f := openReader(filename) 91 defer f.Close() 92 bufioReader := bufio.NewReader(f) 93 line, err := bufioReader.ReadString('\n') 94 for err == nil { 95 parseLine(line, propertyRanges) 96 line, err = bufioReader.ReadString('\n') 97 } 98 // if the err was EOF still need to process last value 99 if err == io.EOF { 100 parseLine(line, propertyRanges) 101 } 102} 103 104const comment = "#" 105const sep = ";" 106const rnge = ".." 107 108func parseLine(line string, propertyRanges map[string]*unicode.RangeTable) { 109 if strings.HasPrefix(line, comment) { 110 return 111 } 112 line = strings.TrimSpace(line) 113 if len(line) == 0 { 114 return 115 } 116 commentStart := strings.Index(line, comment) 117 if commentStart > 0 { 118 line = line[0:commentStart] 119 } 120 pieces := strings.Split(line, sep) 121 if len(pieces) != 2 { 122 log.Printf("unexpected %d pieces in %s", len(pieces), line) 123 return 124 } 125 126 propertyName := strings.TrimSpace(pieces[1]) 127 128 rangeTable, ok := propertyRanges[propertyName] 129 if !ok { 130 rangeTable = &unicode.RangeTable{ 131 LatinOffset: 0, 132 } 133 propertyRanges[propertyName] = rangeTable 134 } 135 136 codepointRange := strings.TrimSpace(pieces[0]) 137 rngeIndex := strings.Index(codepointRange, rnge) 138 139 if rngeIndex < 0 { 140 // single codepoint, not range 141 codepointInt, err := strconv.ParseUint(codepointRange, 16, 64) 142 if err != nil { 143 log.Printf("error parsing int: %v", err) 144 return 145 } 146 if codepointInt < 0x10000 { 147 r16 := unicode.Range16{ 148 Lo: uint16(codepointInt), 149 Hi: uint16(codepointInt), 150 Stride: 1, 151 } 152 addR16ToTable(rangeTable, r16) 153 } else { 154 r32 := unicode.Range32{ 155 Lo: uint32(codepointInt), 156 Hi: uint32(codepointInt), 157 Stride: 1, 158 } 159 addR32ToTable(rangeTable, r32) 160 } 161 } else { 162 rngeStart := codepointRange[0:rngeIndex] 163 rngeEnd := codepointRange[rngeIndex+2:] 164 rngeStartInt, err := strconv.ParseUint(rngeStart, 16, 64) 165 if err != nil { 166 log.Printf("error parsing int: %v", err) 167 return 168 } 169 rngeEndInt, err := strconv.ParseUint(rngeEnd, 16, 64) 170 if err != nil { 171 log.Printf("error parsing int: %v", err) 172 return 173 } 174 if rngeStartInt < 0x10000 && rngeEndInt < 0x10000 { 175 r16 := unicode.Range16{ 176 Lo: uint16(rngeStartInt), 177 Hi: uint16(rngeEndInt), 178 Stride: 1, 179 } 180 addR16ToTable(rangeTable, r16) 181 } else if rngeStartInt >= 0x10000 && rngeEndInt >= 0x10000 { 182 r32 := unicode.Range32{ 183 Lo: uint32(rngeStartInt), 184 Hi: uint32(rngeEndInt), 185 Stride: 1, 186 } 187 addR32ToTable(rangeTable, r32) 188 } else { 189 log.Printf("unexpected range") 190 } 191 } 192} 193 194func addR16ToTable(r *unicode.RangeTable, r16 unicode.Range16) { 195 if r.R16 == nil { 196 r.R16 = make([]unicode.Range16, 0, 1) 197 } 198 r.R16 = append(r.R16, r16) 199 if r16.Hi <= unicode.MaxLatin1 { 200 r.LatinOffset++ 201 } 202} 203 204func addR32ToTable(r *unicode.RangeTable, r32 unicode.Range32) { 205 if r.R32 == nil { 206 r.R32 = make([]unicode.Range32, 0, 1) 207 } 208 r.R32 = append(r.R32, r32) 209} 210 211func generateTables(prefix string, propertyRanges map[string]*unicode.RangeTable) { 212 prNames := make([]string, 0, len(propertyRanges)) 213 for k := range propertyRanges { 214 prNames = append(prNames, k) 215 } 216 sort.Strings(prNames) 217 for _, key := range prNames { 218 rt := propertyRanges[key] 219 fmt.Fprintf(output, "var _%s%s = %s\n", prefix, key, generateRangeTable(rt)) 220 } 221 fmt.Fprintf(output, "type _%sRuneRange unicode.RangeTable\n", prefix) 222 223 fmt.Fprintf(output, "func _%sRuneType(r rune) *_%sRuneRange {\n", prefix, prefix) 224 fmt.Fprintf(output, "\tswitch {\n") 225 for _, key := range prNames { 226 fmt.Fprintf(output, "\tcase unicode.Is(_%s%s, r):\n\t\treturn (*_%sRuneRange)(_%s%s)\n", prefix, key, prefix, prefix, key) 227 } 228 fmt.Fprintf(output, "\tdefault:\n\t\treturn nil\n") 229 fmt.Fprintf(output, "\t}\n") 230 fmt.Fprintf(output, "}\n") 231 232 fmt.Fprintf(output, "func (rng *_%sRuneRange) String() string {\n", prefix) 233 fmt.Fprintf(output, "\tswitch (*unicode.RangeTable)(rng) {\n") 234 for _, key := range prNames { 235 fmt.Fprintf(output, "\tcase _%s%s:\n\t\treturn %q\n", prefix, key, key) 236 } 237 fmt.Fprintf(output, "\tdefault:\n\t\treturn \"Other\"\n") 238 fmt.Fprintf(output, "\t}\n") 239 fmt.Fprintf(output, "}\n") 240} 241 242func generateRangeTable(rt *unicode.RangeTable) string { 243 rv := "&unicode.RangeTable{\n" 244 if rt.R16 != nil { 245 rv += "\tR16: []unicode.Range16{\n" 246 for _, r16 := range rt.R16 { 247 rv += fmt.Sprintf("\t\t%#v,\n", r16) 248 } 249 rv += "\t},\n" 250 } 251 if rt.R32 != nil { 252 rv += "\tR32: []unicode.Range32{\n" 253 for _, r32 := range rt.R32 { 254 rv += fmt.Sprintf("\t\t%#v,\n", r32) 255 } 256 rv += "\t},\n" 257 } 258 rv += fmt.Sprintf("\t\tLatinOffset: %d,\n", rt.LatinOffset) 259 rv += "}\n" 260 return rv 261} 262 263const fileHeader = `// Generated by running 264// maketables --url=%s 265// DO NOT EDIT 266 267package textseg 268 269import( 270 "unicode" 271) 272` 273 274func setupOutput() { 275 output = bufio.NewWriter(startGofmt()) 276} 277 278// startGofmt connects output to a gofmt process if -output is set. 279func startGofmt() io.Writer { 280 if *outputFile == "" { 281 return os.Stdout 282 } 283 stdout, err := os.Create(*outputFile) 284 if err != nil { 285 log.Fatal(err) 286 } 287 // Pipe output to gofmt. 288 gofmt := exec.Command("gofmt") 289 fd, err := gofmt.StdinPipe() 290 if err != nil { 291 log.Fatal(err) 292 } 293 gofmt.Stdout = stdout 294 gofmt.Stderr = os.Stderr 295 err = gofmt.Start() 296 if err != nil { 297 log.Fatal(err) 298 } 299 return fd 300} 301 302func flushOutput() { 303 err := output.Flush() 304 if err != nil { 305 log.Fatal(err) 306 } 307} 308