1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7package main
8
9import (
10	"bufio"
11	"fmt"
12	"log"
13	"net/http"
14	"sort"
15	"strings"
16	"unicode/utf8"
17
18	"golang.org/x/text/encoding"
19	"golang.org/x/text/internal/gen"
20)
21
// ascii holds the 128 bytes 0x00 through 0x7f in ascending order, laid out
// below as eight rows of sixteen. A mapping that begins with this string is
// treated as an ASCII superset by main.
const ascii = "" +
	"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
	" !\"#$%&'()*+,-./" +
	"0123456789:;<=>?" +
	"@ABCDEFGHIJKLMNO" +
	"PQRSTUVWXYZ[\\]^_" +
	"`abcdefghijklmno" +
	"pqrstuvwxyz{|}~\x7f"
27
28var encodings = []struct {
29	name        string
30	mib         string
31	comment     string
32	varName     string
33	replacement byte
34	mapping     string
35}{
36	{
37		"IBM Code Page 037",
38		"IBM037",
39		"",
40		"CodePage037",
41		0x3f,
42		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
43	},
44	{
45		"IBM Code Page 437",
46		"PC8CodePage437",
47		"",
48		"CodePage437",
49		encoding.ASCIISub,
50		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
51	},
52	{
53		"IBM Code Page 850",
54		"PC850Multilingual",
55		"",
56		"CodePage850",
57		encoding.ASCIISub,
58		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
59	},
60	{
61		"IBM Code Page 852",
62		"PCp852",
63		"",
64		"CodePage852",
65		encoding.ASCIISub,
66		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
67	},
68	{
69		"IBM Code Page 855",
70		"IBM855",
71		"",
72		"CodePage855",
73		encoding.ASCIISub,
74		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
75	},
76	{
77		"Windows Code Page 858", // PC latin1 with Euro
78		"IBM00858",
79		"",
80		"CodePage858",
81		encoding.ASCIISub,
82		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
83	},
84	{
85		"IBM Code Page 860",
86		"IBM860",
87		"",
88		"CodePage860",
89		encoding.ASCIISub,
90		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
91	},
92	{
93		"IBM Code Page 862",
94		"PC862LatinHebrew",
95		"",
96		"CodePage862",
97		encoding.ASCIISub,
98		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
99	},
100	{
101		"IBM Code Page 863",
102		"IBM863",
103		"",
104		"CodePage863",
105		encoding.ASCIISub,
106		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
107	},
108	{
109		"IBM Code Page 865",
110		"IBM865",
111		"",
112		"CodePage865",
113		encoding.ASCIISub,
114		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
115	},
116	{
117		"IBM Code Page 866",
118		"IBM866",
119		"",
120		"CodePage866",
121		encoding.ASCIISub,
122		"http://encoding.spec.whatwg.org/index-ibm866.txt",
123	},
124	{
125		"IBM Code Page 1047",
126		"IBM1047",
127		"",
128		"CodePage1047",
129		0x3f,
130		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
131	},
132	{
133		"IBM Code Page 1140",
134		"IBM01140",
135		"",
136		"CodePage1140",
137		0x3f,
138		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
139	},
140	{
141		"ISO 8859-1",
142		"ISOLatin1",
143		"",
144		"ISO8859_1",
145		encoding.ASCIISub,
146		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
147	},
148	{
149		"ISO 8859-2",
150		"ISOLatin2",
151		"",
152		"ISO8859_2",
153		encoding.ASCIISub,
154		"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
155	},
156	{
157		"ISO 8859-3",
158		"ISOLatin3",
159		"",
160		"ISO8859_3",
161		encoding.ASCIISub,
162		"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
163	},
164	{
165		"ISO 8859-4",
166		"ISOLatin4",
167		"",
168		"ISO8859_4",
169		encoding.ASCIISub,
170		"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
171	},
172	{
173		"ISO 8859-5",
174		"ISOLatinCyrillic",
175		"",
176		"ISO8859_5",
177		encoding.ASCIISub,
178		"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
179	},
180	{
181		"ISO 8859-6",
182		"ISOLatinArabic",
183		"",
184		"ISO8859_6,ISO8859_6E,ISO8859_6I",
185		encoding.ASCIISub,
186		"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
187	},
188	{
189		"ISO 8859-7",
190		"ISOLatinGreek",
191		"",
192		"ISO8859_7",
193		encoding.ASCIISub,
194		"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
195	},
196	{
197		"ISO 8859-8",
198		"ISOLatinHebrew",
199		"",
200		"ISO8859_8,ISO8859_8E,ISO8859_8I",
201		encoding.ASCIISub,
202		"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
203	},
204	{
205		"ISO 8859-9",
206		"ISOLatin5",
207		"",
208		"ISO8859_9",
209		encoding.ASCIISub,
210		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
211	},
212	{
213		"ISO 8859-10",
214		"ISOLatin6",
215		"",
216		"ISO8859_10",
217		encoding.ASCIISub,
218		"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
219	},
220	{
221		"ISO 8859-13",
222		"ISO885913",
223		"",
224		"ISO8859_13",
225		encoding.ASCIISub,
226		"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
227	},
228	{
229		"ISO 8859-14",
230		"ISO885914",
231		"",
232		"ISO8859_14",
233		encoding.ASCIISub,
234		"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
235	},
236	{
237		"ISO 8859-15",
238		"ISO885915",
239		"",
240		"ISO8859_15",
241		encoding.ASCIISub,
242		"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
243	},
244	{
245		"ISO 8859-16",
246		"ISO885916",
247		"",
248		"ISO8859_16",
249		encoding.ASCIISub,
250		"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
251	},
252	{
253		"KOI8-R",
254		"KOI8R",
255		"",
256		"KOI8R",
257		encoding.ASCIISub,
258		"http://encoding.spec.whatwg.org/index-koi8-r.txt",
259	},
260	{
261		"KOI8-U",
262		"KOI8U",
263		"",
264		"KOI8U",
265		encoding.ASCIISub,
266		"http://encoding.spec.whatwg.org/index-koi8-u.txt",
267	},
268	{
269		"Macintosh",
270		"Macintosh",
271		"",
272		"Macintosh",
273		encoding.ASCIISub,
274		"http://encoding.spec.whatwg.org/index-macintosh.txt",
275	},
276	{
277		"Macintosh Cyrillic",
278		"MacintoshCyrillic",
279		"",
280		"MacintoshCyrillic",
281		encoding.ASCIISub,
282		"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
283	},
284	{
285		"Windows 874",
286		"Windows874",
287		"",
288		"Windows874",
289		encoding.ASCIISub,
290		"http://encoding.spec.whatwg.org/index-windows-874.txt",
291	},
292	{
293		"Windows 1250",
294		"Windows1250",
295		"",
296		"Windows1250",
297		encoding.ASCIISub,
298		"http://encoding.spec.whatwg.org/index-windows-1250.txt",
299	},
300	{
301		"Windows 1251",
302		"Windows1251",
303		"",
304		"Windows1251",
305		encoding.ASCIISub,
306		"http://encoding.spec.whatwg.org/index-windows-1251.txt",
307	},
308	{
309		"Windows 1252",
310		"Windows1252",
311		"",
312		"Windows1252",
313		encoding.ASCIISub,
314		"http://encoding.spec.whatwg.org/index-windows-1252.txt",
315	},
316	{
317		"Windows 1253",
318		"Windows1253",
319		"",
320		"Windows1253",
321		encoding.ASCIISub,
322		"http://encoding.spec.whatwg.org/index-windows-1253.txt",
323	},
324	{
325		"Windows 1254",
326		"Windows1254",
327		"",
328		"Windows1254",
329		encoding.ASCIISub,
330		"http://encoding.spec.whatwg.org/index-windows-1254.txt",
331	},
332	{
333		"Windows 1255",
334		"Windows1255",
335		"",
336		"Windows1255",
337		encoding.ASCIISub,
338		"http://encoding.spec.whatwg.org/index-windows-1255.txt",
339	},
340	{
341		"Windows 1256",
342		"Windows1256",
343		"",
344		"Windows1256",
345		encoding.ASCIISub,
346		"http://encoding.spec.whatwg.org/index-windows-1256.txt",
347	},
348	{
349		"Windows 1257",
350		"Windows1257",
351		"",
352		"Windows1257",
353		encoding.ASCIISub,
354		"http://encoding.spec.whatwg.org/index-windows-1257.txt",
355	},
356	{
357		"Windows 1258",
358		"Windows1258",
359		"",
360		"Windows1258",
361		encoding.ASCIISub,
362		"http://encoding.spec.whatwg.org/index-windows-1258.txt",
363	},
364	{
365		"X-User-Defined",
366		"XUserDefined",
367		"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
368		"XUserDefined",
369		encoding.ASCIISub,
370		ascii +
371			"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
372			"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
373			"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
374			"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
375			"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
376			"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
377			"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
378			"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
379			"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
380			"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
381			"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
382			"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
383			"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
384			"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
385			"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
386			"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
387	},
388}
389
390func getWHATWG(url string) string {
391	res, err := http.Get(url)
392	if err != nil {
393		log.Fatalf("%q: Get: %v", url, err)
394	}
395	defer res.Body.Close()
396
397	mapping := make([]rune, 128)
398	for i := range mapping {
399		mapping[i] = '\ufffd'
400	}
401
402	scanner := bufio.NewScanner(res.Body)
403	for scanner.Scan() {
404		s := strings.TrimSpace(scanner.Text())
405		if s == "" || s[0] == '#' {
406			continue
407		}
408		x, y := 0, 0
409		if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
410			log.Fatalf("could not parse %q", s)
411		}
412		if x < 0 || 128 <= x {
413			log.Fatalf("code %d is out of range", x)
414		}
415		if 0x80 <= y && y < 0xa0 {
416			// We diverge from the WHATWG spec by mapping control characters
417			// in the range [0x80, 0xa0) to U+FFFD.
418			continue
419		}
420		mapping[x] = rune(y)
421	}
422	return ascii + string(mapping)
423}
424
// getUCM downloads the UCM file at url and returns the 256-rune mapping it
// describes, indexed by byte value.
//
// Only lines of the form `<Uxxxx> \xNN |0` contribute; every other line
// (comments, headers, entries with a different trailing flag) is ignored.
// Bytes the file does not mention map to U+FFFD.
//
// The program is terminated via log.Fatalf if the fetch fails, the server
// responds with a non-200 status, reading the body fails, or fewer than 200
// mapping lines are found.
func getUCM(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("%q: Get: %v", url, err)
	}
	defer res.Body.Close()
	// http.Get reports no error for non-2xx responses; fail with the real
	// cause instead of the "wrong page format?" guess further down.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("%q: Get: unexpected status %s", url, res.Status)
	}

	mapping := make([]rune, 256)
	for i := range mapping {
		mapping[i] = '\ufffd'
	}

	charsFound := 0
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		var c byte
		var r rune
		// Lines that do not match are deliberately skipped rather than
		// treated as errors: the file contains non-mapping lines too.
		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
			continue
		}
		mapping[c] = r
		charsFound++
	}
	// Scan returns false on a read error as well as on EOF.
	if err := scanner.Err(); err != nil {
		log.Fatalf("%q: read: %v", url, err)
	}

	if charsFound < 200 {
		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
	}

	return string(mapping)
}
459
460func main() {
461	mibs := map[string]bool{}
462	all := []string{}
463
464	w := gen.NewCodeWriter()
465	defer w.WriteGoFile("tables.go", "charmap")
466
467	printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
468
469	printf("import (\n")
470	printf("\t\"golang.org/x/text/encoding\"\n")
471	printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
472	printf(")\n\n")
473	for _, e := range encodings {
474		varNames := strings.Split(e.varName, ",")
475		all = append(all, varNames...)
476		varName := varNames[0]
477		switch {
478		case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
479			e.mapping = getWHATWG(e.mapping)
480		case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
481			e.mapping = getUCM(e.mapping)
482		}
483
484		asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
485		if asciiSuperset {
486			low = 0x80
487		}
488		lvn := 1
489		if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
490			lvn = 3
491		}
492		lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
493		printf("// %s is the %s encoding.\n", varName, e.name)
494		if e.comment != "" {
495			printf("//\n// %s\n", e.comment)
496		}
497		printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
498			varName, lowerVarName, lowerVarName, e.name)
499		if mibs[e.mib] {
500			log.Fatalf("MIB type %q declared multiple times.", e.mib)
501		}
502		printf("mib: identifier.%s,\n", e.mib)
503		printf("asciiSuperset: %t,\n", asciiSuperset)
504		printf("low: 0x%02x,\n", low)
505		printf("replacement: 0x%02x,\n", e.replacement)
506
507		printf("decode: [256]utf8Enc{\n")
508		i, backMapping := 0, map[rune]byte{}
509		for _, c := range e.mapping {
510			if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
511				backMapping[c] = byte(i)
512			}
513			var buf [8]byte
514			n := utf8.EncodeRune(buf[:], c)
515			if n > 3 {
516				panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
517			}
518			printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
519			if i%2 == 1 {
520				printf("\n")
521			}
522			i++
523		}
524		printf("},\n")
525
526		printf("encode: [256]uint32{\n")
527		encode := make([]uint32, 0, 256)
528		for c, i := range backMapping {
529			encode = append(encode, uint32(i)<<24|uint32(c))
530		}
531		sort.Sort(byRune(encode))
532		for len(encode) < cap(encode) {
533			encode = append(encode, encode[len(encode)-1])
534		}
535		for i, enc := range encode {
536			printf("0x%08x,", enc)
537			if i%8 == 7 {
538				printf("\n")
539			}
540		}
541		printf("},\n}\n")
542
543		// Add an estimate of the size of a single Charmap{} struct value, which
544		// includes two 256 elem arrays of 4 bytes and some extra fields, which
545		// align to 3 uint64s on 64-bit architectures.
546		w.Size += 2*4*256 + 3*8
547	}
548	// TODO: add proper line breaking.
549	printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
550}
551
// byRune orders packed encode entries by the value in their low 24 bits,
// ignoring the top 8 bits. It satisfies sort.Interface.
type byRune []uint32

// Len reports the number of entries.
func (r byRune) Len() int {
	return len(r)
}

// Less compares entries x and y on their low 24 bits only.
func (r byRune) Less(x, y int) bool {
	const low24 = 0xffffff
	lhs, rhs := r[x]&low24, r[y]&low24
	return lhs < rhs
}

// Swap exchanges entries x and y.
func (r byRune) Swap(x, y int) {
	r[x], r[y] = r[y], r[x]
}
557