1//  Copyright (c) 2014 Couchbase, Inc.
2//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3//  except in compliance with the License. You may obtain a copy of the License at
4//    http://www.apache.org/licenses/LICENSE-2.0
5//  Unless required by applicable law or agreed to in writing, software distributed under the
6//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7//  either express or implied. See the License for the specific language governing permissions
8//  and limitations under the License.
9
10// Modified by Martin Atkins to serve the needs of package textseg.
11
12// +build ignore
13
14package main
15
16import (
17	"bufio"
18	"flag"
19	"fmt"
20	"io"
21	"log"
22	"net/http"
23	"os"
24	"os/exec"
25	"sort"
26	"strconv"
27	"strings"
28	"unicode"
29)
30
// Command-line flags. Each variable points at the parsed value once
// flag.Parse has run in main.
var (
	// url is the base location the data files are fetched from; the file
	// name is appended to it.
	url = flag.String(
		"url",
		"http://www.unicode.org/Public/12.0.0/ucd/auxiliary/",
		"URL of Unicode database directory",
	)
	// verbose is registered as a flag.
	// NOTE(review): nothing in the visible code reads it — confirm whether it is still needed.
	verbose = flag.Bool(
		"verbose",
		false,
		"write data to stdout as it is parsed",
	)
	// localFiles makes the program read the data files from the current
	// directory instead of fetching them over HTTP.
	localFiles = flag.Bool(
		"local",
		false,
		"data files have been copied to the current directory; for debugging only",
	)
	// outputFile names the file that receives the generated tables; when it
	// is empty the tables go to standard output.
	outputFile = flag.String(
		"output",
		"",
		"output file for generated tables; default stdout",
	)
)

// output is the buffered destination for all generated source text. It is
// assigned by setupOutput and flushed by flushOutput.
var output *bufio.Writer
45
46func main() {
47	flag.Parse()
48	setupOutput()
49
50	graphemePropertyRanges := make(map[string]*unicode.RangeTable)
51	loadUnicodeData("GraphemeBreakProperty.txt", graphemePropertyRanges)
52	wordPropertyRanges := make(map[string]*unicode.RangeTable)
53	loadUnicodeData("WordBreakProperty.txt", wordPropertyRanges)
54	sentencePropertyRanges := make(map[string]*unicode.RangeTable)
55	loadUnicodeData("SentenceBreakProperty.txt", sentencePropertyRanges)
56
57	fmt.Fprintf(output, fileHeader, *url)
58	generateTables("Grapheme", graphemePropertyRanges)
59	generateTables("Word", wordPropertyRanges)
60	generateTables("Sentence", sentencePropertyRanges)
61
62	flushOutput()
63}
64
65// WordBreakProperty.txt has the form:
66// 05F0..05F2    ; Hebrew_Letter # Lo   [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
67// FB1D          ; Hebrew_Letter # Lo       HEBREW LETTER YOD WITH HIRIQ
68func openReader(file string) (input io.ReadCloser) {
69	if *localFiles {
70		f, err := os.Open(file)
71		if err != nil {
72			log.Fatal(err)
73		}
74		input = f
75	} else {
76		path := *url + file
77		resp, err := http.Get(path)
78		if err != nil {
79			log.Fatal(err)
80		}
81		if resp.StatusCode != 200 {
82			log.Fatal("bad GET status for "+file, resp.Status)
83		}
84		input = resp.Body
85	}
86	return
87}
88
89func loadUnicodeData(filename string, propertyRanges map[string]*unicode.RangeTable) {
90	f := openReader(filename)
91	defer f.Close()
92	bufioReader := bufio.NewReader(f)
93	line, err := bufioReader.ReadString('\n')
94	for err == nil {
95		parseLine(line, propertyRanges)
96		line, err = bufioReader.ReadString('\n')
97	}
98	// if the err was EOF still need to process last value
99	if err == io.EOF {
100		parseLine(line, propertyRanges)
101	}
102}
103
// Syntax elements of the property data files.
const (
	comment = "#"  // starts a comment that runs to the end of the line
	sep     = ";"  // separates the code point field from the property name
	rnge    = ".." // separates the first and last code point of a range
)

// parseLine parses one line of a property data file and records the code
// points it names in the range table for its property, creating the table in
// propertyRanges on first use.
//
// Blank lines and comment-only lines (indented or not) are ignored. A line
// must otherwise consist of a code point field and a property name separated
// by ";", where the code point field is a single hexadecimal code point or a
// "first..last" range. Malformed lines are logged and skipped without
// modifying propertyRanges.
//
// Code points up to U+FFFF are stored in the table's R16 slice and higher
// ones in R32; a range that crosses that boundary is split into one entry of
// each kind.
func parseLine(line string, propertyRanges map[string]*unicode.RangeTable) {
	// Strip the comment first so that a comment-only line, indented or not,
	// reduces to an empty line.
	if i := strings.Index(line, comment); i >= 0 {
		line = line[:i]
	}
	line = strings.TrimSpace(line)
	if line == "" {
		return
	}

	pieces := strings.Split(line, sep)
	if len(pieces) != 2 {
		log.Printf("unexpected %d pieces in %s", len(pieces), line)
		return
	}
	codepointRange := strings.TrimSpace(pieces[0])
	propertyName := strings.TrimSpace(pieces[1])

	lo, hi, err := parseCodepointRange(codepointRange)
	if err != nil {
		log.Printf("error parsing int: %v", err)
		return
	}
	if lo > hi || hi > unicode.MaxRune {
		log.Printf("unexpected range %s", codepointRange)
		return
	}

	// Only register the property once the line is known to be valid, so a
	// malformed line cannot leave an empty table behind.
	rangeTable, ok := propertyRanges[propertyName]
	if !ok {
		rangeTable = &unicode.RangeTable{}
		propertyRanges[propertyName] = rangeTable
	}

	// lastR16 is the highest code point a unicode.Range16 can represent.
	const lastR16 = 0xFFFF
	if lo <= lastR16 {
		hi16 := hi
		if hi16 > lastR16 {
			hi16 = lastR16
		}
		addR16ToTable(rangeTable, unicode.Range16{
			Lo:     uint16(lo),
			Hi:     uint16(hi16),
			Stride: 1,
		})
	}
	if hi > lastR16 {
		lo32 := lo
		if lo32 <= lastR16 {
			lo32 = lastR16 + 1
		}
		addR32ToTable(rangeTable, unicode.Range32{
			Lo:     lo32,
			Hi:     hi,
			Stride: 1,
		})
	}
}

// parseCodepointRange parses a code point field, either a single hexadecimal
// code point or a "first..last" range, and returns its inclusive bounds. For
// a single code point both bounds are equal. Values that do not fit in 32
// bits are reported as an error rather than truncated.
func parseCodepointRange(field string) (lo, hi uint32, err error) {
	first, last := field, field
	if i := strings.Index(field, rnge); i >= 0 {
		first, last = field[:i], field[i+len(rnge):]
	}
	lo64, err := strconv.ParseUint(first, 16, 32)
	if err != nil {
		return 0, 0, err
	}
	hi64, err := strconv.ParseUint(last, 16, 32)
	if err != nil {
		return 0, 0, err
	}
	return uint32(lo64), uint32(hi64), nil
}

// addR16ToTable appends r16 to the table's R16 slice and counts it in
// LatinOffset when the whole range lies at or below unicode.MaxLatin1.
// Ranges are appended in call order; no sorting is performed here.
func addR16ToTable(r *unicode.RangeTable, r16 unicode.Range16) {
	r.R16 = append(r.R16, r16)
	if r16.Hi <= unicode.MaxLatin1 {
		r.LatinOffset++
	}
}

// addR32ToTable appends r32 to the table's R32 slice. Ranges are appended in
// call order; no sorting is performed here.
func addR32ToTable(r *unicode.RangeTable, r32 unicode.Range32) {
	r.R32 = append(r.R32, r32)
}
210
211func generateTables(prefix string, propertyRanges map[string]*unicode.RangeTable) {
212	prNames := make([]string, 0, len(propertyRanges))
213	for k := range propertyRanges {
214		prNames = append(prNames, k)
215	}
216	sort.Strings(prNames)
217	for _, key := range prNames {
218		rt := propertyRanges[key]
219		fmt.Fprintf(output, "var _%s%s = %s\n", prefix, key, generateRangeTable(rt))
220	}
221	fmt.Fprintf(output, "type _%sRuneRange unicode.RangeTable\n", prefix)
222
223	fmt.Fprintf(output, "func _%sRuneType(r rune) *_%sRuneRange {\n", prefix, prefix)
224	fmt.Fprintf(output, "\tswitch {\n")
225	for _, key := range prNames {
226		fmt.Fprintf(output, "\tcase unicode.Is(_%s%s, r):\n\t\treturn (*_%sRuneRange)(_%s%s)\n", prefix, key, prefix, prefix, key)
227	}
228	fmt.Fprintf(output, "\tdefault:\n\t\treturn nil\n")
229	fmt.Fprintf(output, "\t}\n")
230	fmt.Fprintf(output, "}\n")
231
232	fmt.Fprintf(output, "func (rng *_%sRuneRange) String() string {\n", prefix)
233	fmt.Fprintf(output, "\tswitch (*unicode.RangeTable)(rng) {\n")
234	for _, key := range prNames {
235		fmt.Fprintf(output, "\tcase _%s%s:\n\t\treturn %q\n", prefix, key, key)
236	}
237	fmt.Fprintf(output, "\tdefault:\n\t\treturn \"Other\"\n")
238	fmt.Fprintf(output, "\t}\n")
239	fmt.Fprintf(output, "}\n")
240}
241
// generateRangeTable renders rt as the Go source of a *unicode.RangeTable
// composite literal. The R16 and R32 fields are only emitted when the
// corresponding slice is non-nil; LatinOffset is always emitted. The result
// ends with a newline.
func generateRangeTable(rt *unicode.RangeTable) string {
	buf := []byte("&unicode.RangeTable{\n")

	if rt.R16 != nil {
		buf = append(buf, "\tR16: []unicode.Range16{\n"...)
		for i := range rt.R16 {
			// %#v renders the element as a Go composite literal.
			buf = append(buf, fmt.Sprintf("\t\t%#v,\n", rt.R16[i])...)
		}
		buf = append(buf, "\t},\n"...)
	}

	if rt.R32 != nil {
		buf = append(buf, "\tR32: []unicode.Range32{\n"...)
		for i := range rt.R32 {
			buf = append(buf, fmt.Sprintf("\t\t%#v,\n", rt.R32[i])...)
		}
		buf = append(buf, "\t},\n"...)
	}

	buf = append(buf, fmt.Sprintf("\t\tLatinOffset: %d,\n", rt.LatinOffset)...)
	buf = append(buf, "}\n"...)
	return string(buf)
}
262
263const fileHeader = `// Generated by running
264//      maketables --url=%s
265// DO NOT EDIT
266
267package textseg
268
269import(
270	"unicode"
271)
272`
273
274func setupOutput() {
275	output = bufio.NewWriter(startGofmt())
276}
277
278// startGofmt connects output to a gofmt process if -output is set.
279func startGofmt() io.Writer {
280	if *outputFile == "" {
281		return os.Stdout
282	}
283	stdout, err := os.Create(*outputFile)
284	if err != nil {
285		log.Fatal(err)
286	}
287	// Pipe output to gofmt.
288	gofmt := exec.Command("gofmt")
289	fd, err := gofmt.StdinPipe()
290	if err != nil {
291		log.Fatal(err)
292	}
293	gofmt.Stdout = stdout
294	gofmt.Stderr = os.Stderr
295	err = gofmt.Start()
296	if err != nil {
297		log.Fatal(err)
298	}
299	return fd
300}
301
302func flushOutput() {
303	err := output.Flush()
304	if err != nil {
305		log.Fatal(err)
306	}
307}
308