1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7//go:generate go run gen.go
8//go:generate go run gen.go -test
9
10package main
11
12import (
13	"bytes"
14	"flag"
15	"fmt"
16	"go/format"
17	"io/ioutil"
18	"math/rand"
19	"os"
20	"sort"
21	"strings"
22)
23
24// identifier converts s to a Go exported identifier.
25// It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
26func identifier(s string) string {
27	b := make([]byte, 0, len(s))
28	cap := true
29	for _, c := range s {
30		if c == '-' {
31			cap = true
32			continue
33		}
34		if cap && 'a' <= c && c <= 'z' {
35			c -= 'a' - 'A'
36		}
37		cap = false
38		b = append(b, byte(c))
39	}
40	return string(b)
41}
42
43var test = flag.Bool("test", false, "generate table_test.go")
44
45func genFile(name string, buf *bytes.Buffer) {
46	b, err := format.Source(buf.Bytes())
47	if err != nil {
48		fmt.Fprintln(os.Stderr, err)
49		os.Exit(1)
50	}
51	if err := ioutil.WriteFile(name, b, 0644); err != nil {
52		fmt.Fprintln(os.Stderr, err)
53		os.Exit(1)
54	}
55}
56
57func main() {
58	flag.Parse()
59
60	var all []string
61	all = append(all, elements...)
62	all = append(all, attributes...)
63	all = append(all, eventHandlers...)
64	all = append(all, extra...)
65	sort.Strings(all)
66
67	// uniq - lists have dups
68	w := 0
69	for _, s := range all {
70		if w == 0 || all[w-1] != s {
71			all[w] = s
72			w++
73		}
74	}
75	all = all[:w]
76
77	if *test {
78		var buf bytes.Buffer
79		fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
80		fmt.Fprintln(&buf, "//go:generate go run gen.go -test\n")
81		fmt.Fprintln(&buf, "package atom\n")
82		fmt.Fprintln(&buf, "var testAtomList = []string{")
83		for _, s := range all {
84			fmt.Fprintf(&buf, "\t%q,\n", s)
85		}
86		fmt.Fprintln(&buf, "}")
87
88		genFile("table_test.go", &buf)
89		return
90	}
91
92	// Find hash that minimizes table size.
93	var best *table
94	for i := 0; i < 1000000; i++ {
95		if best != nil && 1<<(best.k-1) < len(all) {
96			break
97		}
98		h := rand.Uint32()
99		for k := uint(0); k <= 16; k++ {
100			if best != nil && k >= best.k {
101				break
102			}
103			var t table
104			if t.init(h, k, all) {
105				best = &t
106				break
107			}
108		}
109	}
110	if best == nil {
111		fmt.Fprintf(os.Stderr, "failed to construct string table\n")
112		os.Exit(1)
113	}
114
115	// Lay out strings, using overlaps when possible.
116	layout := append([]string{}, all...)
117
118	// Remove strings that are substrings of other strings
119	for changed := true; changed; {
120		changed = false
121		for i, s := range layout {
122			if s == "" {
123				continue
124			}
125			for j, t := range layout {
126				if i != j && t != "" && strings.Contains(s, t) {
127					changed = true
128					layout[j] = ""
129				}
130			}
131		}
132	}
133
134	// Join strings where one suffix matches another prefix.
135	for {
136		// Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
137		// maximizing overlap length k.
138		besti := -1
139		bestj := -1
140		bestk := 0
141		for i, s := range layout {
142			if s == "" {
143				continue
144			}
145			for j, t := range layout {
146				if i == j {
147					continue
148				}
149				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
150					if s[len(s)-k:] == t[:k] {
151						besti = i
152						bestj = j
153						bestk = k
154					}
155				}
156			}
157		}
158		if bestk > 0 {
159			layout[besti] += layout[bestj][bestk:]
160			layout[bestj] = ""
161			continue
162		}
163		break
164	}
165
166	text := strings.Join(layout, "")
167
168	atom := map[string]uint32{}
169	for _, s := range all {
170		off := strings.Index(text, s)
171		if off < 0 {
172			panic("lost string " + s)
173		}
174		atom[s] = uint32(off<<8 | len(s))
175	}
176
177	var buf bytes.Buffer
178	// Generate the Go code.
179	fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
180	fmt.Fprintln(&buf, "//go:generate go run gen.go\n")
181	fmt.Fprintln(&buf, "package atom\n\nconst (")
182
183	// compute max len
184	maxLen := 0
185	for _, s := range all {
186		if maxLen < len(s) {
187			maxLen = len(s)
188		}
189		fmt.Fprintf(&buf, "\t%s Atom = %#x\n", identifier(s), atom[s])
190	}
191	fmt.Fprintln(&buf, ")\n")
192
193	fmt.Fprintf(&buf, "const hash0 = %#x\n\n", best.h0)
194	fmt.Fprintf(&buf, "const maxAtomLen = %d\n\n", maxLen)
195
196	fmt.Fprintf(&buf, "var table = [1<<%d]Atom{\n", best.k)
197	for i, s := range best.tab {
198		if s == "" {
199			continue
200		}
201		fmt.Fprintf(&buf, "\t%#x: %#x, // %s\n", i, atom[s], s)
202	}
203	fmt.Fprintf(&buf, "}\n")
204	datasize := (1 << best.k) * 4
205
206	fmt.Fprintln(&buf, "const atomText =")
207	textsize := len(text)
208	for len(text) > 60 {
209		fmt.Fprintf(&buf, "\t%q +\n", text[:60])
210		text = text[60:]
211	}
212	fmt.Fprintf(&buf, "\t%q\n\n", text)
213
214	genFile("table.go", &buf)
215
216	fmt.Fprintf(os.Stdout, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
217}
218
219type byLen []string
220
221func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) }
222func (x byLen) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
223func (x byLen) Len() int           { return len(x) }
224
225// fnv computes the FNV hash with an arbitrary starting value h.
226func fnv(h uint32, s string) uint32 {
227	for i := 0; i < len(s); i++ {
228		h ^= uint32(s[i])
229		h *= 16777619
230	}
231	return h
232}
233
234// A table represents an attempt at constructing the lookup table.
235// The lookup table uses cuckoo hashing, meaning that each string
236// can be found in one of two positions.
237type table struct {
238	h0   uint32
239	k    uint
240	mask uint32
241	tab  []string
242}
243
244// hash returns the two hashes for s.
245func (t *table) hash(s string) (h1, h2 uint32) {
246	h := fnv(t.h0, s)
247	h1 = h & t.mask
248	h2 = (h >> 16) & t.mask
249	return
250}
251
252// init initializes the table with the given parameters.
253// h0 is the initial hash value,
254// k is the number of bits of hash value to use, and
255// x is the list of strings to store in the table.
256// init returns false if the table cannot be constructed.
257func (t *table) init(h0 uint32, k uint, x []string) bool {
258	t.h0 = h0
259	t.k = k
260	t.tab = make([]string, 1<<k)
261	t.mask = 1<<k - 1
262	for _, s := range x {
263		if !t.insert(s) {
264			return false
265		}
266	}
267	return true
268}
269
270// insert inserts s in the table.
271func (t *table) insert(s string) bool {
272	h1, h2 := t.hash(s)
273	if t.tab[h1] == "" {
274		t.tab[h1] = s
275		return true
276	}
277	if t.tab[h2] == "" {
278		t.tab[h2] = s
279		return true
280	}
281	if t.push(h1, 0) {
282		t.tab[h1] = s
283		return true
284	}
285	if t.push(h2, 0) {
286		t.tab[h2] = s
287		return true
288	}
289	return false
290}
291
292// push attempts to push aside the entry in slot i.
293func (t *table) push(i uint32, depth int) bool {
294	if depth > len(t.tab) {
295		return false
296	}
297	s := t.tab[i]
298	h1, h2 := t.hash(s)
299	j := h1 + h2 - i
300	if t.tab[j] != "" && !t.push(j, depth+1) {
301		return false
302	}
303	t.tab[j] = s
304	return true
305}
306
307// The lists of element names and attribute keys were taken from
308// https://html.spec.whatwg.org/multipage/indices.html#index
309// as of the "HTML Living Standard - Last Updated 16 April 2018" version.
310
311// "command", "keygen" and "menuitem" have been removed from the spec,
312// but are kept here for backwards compatibility.
313var elements = []string{
314	"a",
315	"abbr",
316	"address",
317	"area",
318	"article",
319	"aside",
320	"audio",
321	"b",
322	"base",
323	"bdi",
324	"bdo",
325	"blockquote",
326	"body",
327	"br",
328	"button",
329	"canvas",
330	"caption",
331	"cite",
332	"code",
333	"col",
334	"colgroup",
335	"command",
336	"data",
337	"datalist",
338	"dd",
339	"del",
340	"details",
341	"dfn",
342	"dialog",
343	"div",
344	"dl",
345	"dt",
346	"em",
347	"embed",
348	"fieldset",
349	"figcaption",
350	"figure",
351	"footer",
352	"form",
353	"h1",
354	"h2",
355	"h3",
356	"h4",
357	"h5",
358	"h6",
359	"head",
360	"header",
361	"hgroup",
362	"hr",
363	"html",
364	"i",
365	"iframe",
366	"img",
367	"input",
368	"ins",
369	"kbd",
370	"keygen",
371	"label",
372	"legend",
373	"li",
374	"link",
375	"main",
376	"map",
377	"mark",
378	"menu",
379	"menuitem",
380	"meta",
381	"meter",
382	"nav",
383	"noscript",
384	"object",
385	"ol",
386	"optgroup",
387	"option",
388	"output",
389	"p",
390	"param",
391	"picture",
392	"pre",
393	"progress",
394	"q",
395	"rp",
396	"rt",
397	"ruby",
398	"s",
399	"samp",
400	"script",
401	"section",
402	"select",
403	"slot",
404	"small",
405	"source",
406	"span",
407	"strong",
408	"style",
409	"sub",
410	"summary",
411	"sup",
412	"table",
413	"tbody",
414	"td",
415	"template",
416	"textarea",
417	"tfoot",
418	"th",
419	"thead",
420	"time",
421	"title",
422	"tr",
423	"track",
424	"u",
425	"ul",
426	"var",
427	"video",
428	"wbr",
429}
430
431// https://html.spec.whatwg.org/multipage/indices.html#attributes-3
432//
433// "challenge", "command", "contextmenu", "dropzone", "icon", "keytype", "mediagroup",
434// "radiogroup", "spellcheck", "scoped", "seamless", "sortable" and "sorted" have been removed from the spec,
435// but are kept here for backwards compatibility.
436var attributes = []string{
437	"abbr",
438	"accept",
439	"accept-charset",
440	"accesskey",
441	"action",
442	"allowfullscreen",
443	"allowpaymentrequest",
444	"allowusermedia",
445	"alt",
446	"as",
447	"async",
448	"autocomplete",
449	"autofocus",
450	"autoplay",
451	"challenge",
452	"charset",
453	"checked",
454	"cite",
455	"class",
456	"color",
457	"cols",
458	"colspan",
459	"command",
460	"content",
461	"contenteditable",
462	"contextmenu",
463	"controls",
464	"coords",
465	"crossorigin",
466	"data",
467	"datetime",
468	"default",
469	"defer",
470	"dir",
471	"dirname",
472	"disabled",
473	"download",
474	"draggable",
475	"dropzone",
476	"enctype",
477	"for",
478	"form",
479	"formaction",
480	"formenctype",
481	"formmethod",
482	"formnovalidate",
483	"formtarget",
484	"headers",
485	"height",
486	"hidden",
487	"high",
488	"href",
489	"hreflang",
490	"http-equiv",
491	"icon",
492	"id",
493	"inputmode",
494	"integrity",
495	"is",
496	"ismap",
497	"itemid",
498	"itemprop",
499	"itemref",
500	"itemscope",
501	"itemtype",
502	"keytype",
503	"kind",
504	"label",
505	"lang",
506	"list",
507	"loop",
508	"low",
509	"manifest",
510	"max",
511	"maxlength",
512	"media",
513	"mediagroup",
514	"method",
515	"min",
516	"minlength",
517	"multiple",
518	"muted",
519	"name",
520	"nomodule",
521	"nonce",
522	"novalidate",
523	"open",
524	"optimum",
525	"pattern",
526	"ping",
527	"placeholder",
528	"playsinline",
529	"poster",
530	"preload",
531	"radiogroup",
532	"readonly",
533	"referrerpolicy",
534	"rel",
535	"required",
536	"reversed",
537	"rows",
538	"rowspan",
539	"sandbox",
540	"spellcheck",
541	"scope",
542	"scoped",
543	"seamless",
544	"selected",
545	"shape",
546	"size",
547	"sizes",
548	"sortable",
549	"sorted",
550	"slot",
551	"span",
552	"spellcheck",
553	"src",
554	"srcdoc",
555	"srclang",
556	"srcset",
557	"start",
558	"step",
559	"style",
560	"tabindex",
561	"target",
562	"title",
563	"translate",
564	"type",
565	"typemustmatch",
566	"updateviacache",
567	"usemap",
568	"value",
569	"width",
570	"workertype",
571	"wrap",
572}
573
574// "onautocomplete", "onautocompleteerror", "onmousewheel",
575// "onshow" and "onsort" have been removed from the spec,
576// but are kept here for backwards compatibility.
577var eventHandlers = []string{
578	"onabort",
579	"onautocomplete",
580	"onautocompleteerror",
581	"onauxclick",
582	"onafterprint",
583	"onbeforeprint",
584	"onbeforeunload",
585	"onblur",
586	"oncancel",
587	"oncanplay",
588	"oncanplaythrough",
589	"onchange",
590	"onclick",
591	"onclose",
592	"oncontextmenu",
593	"oncopy",
594	"oncuechange",
595	"oncut",
596	"ondblclick",
597	"ondrag",
598	"ondragend",
599	"ondragenter",
600	"ondragexit",
601	"ondragleave",
602	"ondragover",
603	"ondragstart",
604	"ondrop",
605	"ondurationchange",
606	"onemptied",
607	"onended",
608	"onerror",
609	"onfocus",
610	"onhashchange",
611	"oninput",
612	"oninvalid",
613	"onkeydown",
614	"onkeypress",
615	"onkeyup",
616	"onlanguagechange",
617	"onload",
618	"onloadeddata",
619	"onloadedmetadata",
620	"onloadend",
621	"onloadstart",
622	"onmessage",
623	"onmessageerror",
624	"onmousedown",
625	"onmouseenter",
626	"onmouseleave",
627	"onmousemove",
628	"onmouseout",
629	"onmouseover",
630	"onmouseup",
631	"onmousewheel",
632	"onwheel",
633	"onoffline",
634	"ononline",
635	"onpagehide",
636	"onpageshow",
637	"onpaste",
638	"onpause",
639	"onplay",
640	"onplaying",
641	"onpopstate",
642	"onprogress",
643	"onratechange",
644	"onreset",
645	"onresize",
646	"onrejectionhandled",
647	"onscroll",
648	"onsecuritypolicyviolation",
649	"onseeked",
650	"onseeking",
651	"onselect",
652	"onshow",
653	"onsort",
654	"onstalled",
655	"onstorage",
656	"onsubmit",
657	"onsuspend",
658	"ontimeupdate",
659	"ontoggle",
660	"onunhandledrejection",
661	"onunload",
662	"onvolumechange",
663	"onwaiting",
664}
665
666// extra are ad-hoc values not covered by any of the lists above.
667var extra = []string{
668	"acronym",
669	"align",
670	"annotation",
671	"annotation-xml",
672	"applet",
673	"basefont",
674	"bgsound",
675	"big",
676	"blink",
677	"center",
678	"color",
679	"desc",
680	"face",
681	"font",
682	"foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive.
683	"foreignobject",
684	"frame",
685	"frameset",
686	"image",
687	"isindex",
688	"listing",
689	"malignmark",
690	"marquee",
691	"math",
692	"mglyph",
693	"mi",
694	"mn",
695	"mo",
696	"ms",
697	"mtext",
698	"nobr",
699	"noembed",
700	"noframes",
701	"plaintext",
702	"prompt",
703	"public",
704	"rb",
705	"rtc",
706	"spacer",
707	"strike",
708	"svg",
709	"system",
710	"tt",
711	"xmp",
712}
713