1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package html
6
7import (
8	"strings"
9)
10
11// parseDoctype parses the data from a DoctypeToken into a name,
12// public identifier, and system identifier. It returns a Node whose Type
13// is DoctypeNode, whose Data is the name, and which has attributes
14// named "system" and "public" for the two identifiers if they were present.
15// quirks is whether the document should be parsed in "quirks mode".
16func parseDoctype(s string) (n *Node, quirks bool) {
17	n = &Node{Type: DoctypeNode}
18
19	// Find the name.
20	space := strings.IndexAny(s, whitespace)
21	if space == -1 {
22		space = len(s)
23	}
24	n.Data = s[:space]
25	// The comparison to "html" is case-sensitive.
26	if n.Data != "html" {
27		quirks = true
28	}
29	n.Data = strings.ToLower(n.Data)
30	s = strings.TrimLeft(s[space:], whitespace)
31
32	if len(s) < 6 {
33		// It can't start with "PUBLIC" or "SYSTEM".
34		// Ignore the rest of the string.
35		return n, quirks || s != ""
36	}
37
38	key := strings.ToLower(s[:6])
39	s = s[6:]
40	for key == "public" || key == "system" {
41		s = strings.TrimLeft(s, whitespace)
42		if s == "" {
43			break
44		}
45		quote := s[0]
46		if quote != '"' && quote != '\'' {
47			break
48		}
49		s = s[1:]
50		q := strings.IndexRune(s, rune(quote))
51		var id string
52		if q == -1 {
53			id = s
54			s = ""
55		} else {
56			id = s[:q]
57			s = s[q+1:]
58		}
59		n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
60		if key == "public" {
61			key = "system"
62		} else {
63			key = ""
64		}
65	}
66
67	if key != "" || s != "" {
68		quirks = true
69	} else if len(n.Attr) > 0 {
70		if n.Attr[0].Key == "public" {
71			public := strings.ToLower(n.Attr[0].Val)
72			switch public {
73			case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html":
74				quirks = true
75			default:
76				for _, q := range quirkyIDs {
77					if strings.HasPrefix(public, q) {
78						quirks = true
79						break
80					}
81				}
82			}
83			// The following two public IDs only cause quirks mode if there is no system ID.
84			if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") ||
85				strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) {
86				quirks = true
87			}
88		}
89		if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" &&
90			strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" {
91			quirks = true
92		}
93	}
94
95	return n, quirks
96}
97
// quirkyIDs is a list of public doctype identifiers that cause a document
// to be interpreted in quirks mode.
//
// parseDoctype lower-cases a doctype's public identifier and tests it against
// each entry with strings.HasPrefix, so every entry is matched as a prefix
// (not as a whole identifier) and must be written in lower case.
var quirkyIDs = []string{
	"+//silmaril//dtd html pro v0r11 19970101//",
	"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
	"-//as//dtd html 3.0 aswedit + extensions//",
	"-//ietf//dtd html 2.0 level 1//",
	"-//ietf//dtd html 2.0 level 2//",
	"-//ietf//dtd html 2.0 strict level 1//",
	"-//ietf//dtd html 2.0 strict level 2//",
	"-//ietf//dtd html 2.0 strict//",
	"-//ietf//dtd html 2.0//",
	"-//ietf//dtd html 2.1e//",
	"-//ietf//dtd html 3.0//",
	"-//ietf//dtd html 3.2 final//",
	"-//ietf//dtd html 3.2//",
	"-//ietf//dtd html 3//",
	"-//ietf//dtd html level 0//",
	"-//ietf//dtd html level 1//",
	"-//ietf//dtd html level 2//",
	"-//ietf//dtd html level 3//",
	"-//ietf//dtd html strict level 0//",
	"-//ietf//dtd html strict level 1//",
	"-//ietf//dtd html strict level 2//",
	"-//ietf//dtd html strict level 3//",
	"-//ietf//dtd html strict//",
	"-//ietf//dtd html//",
	"-//metrius//dtd metrius presentational//",
	"-//microsoft//dtd internet explorer 2.0 html strict//",
	"-//microsoft//dtd internet explorer 2.0 html//",
	"-//microsoft//dtd internet explorer 2.0 tables//",
	"-//microsoft//dtd internet explorer 3.0 html strict//",
	"-//microsoft//dtd internet explorer 3.0 html//",
	"-//microsoft//dtd internet explorer 3.0 tables//",
	"-//netscape comm. corp.//dtd html//",
	"-//netscape comm. corp.//dtd strict html//",
	"-//o'reilly and associates//dtd html 2.0//",
	"-//o'reilly and associates//dtd html extended 1.0//",
	"-//o'reilly and associates//dtd html extended relaxed 1.0//",
	"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
	"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
	"-//spyglass//dtd html 2.0 extended//",
	"-//sq//dtd html 2.0 hotmetal + extensions//",
	"-//sun microsystems corp.//dtd hotjava html//",
	"-//sun microsystems corp.//dtd hotjava strict html//",
	"-//w3c//dtd html 3 1995-03-24//",
	"-//w3c//dtd html 3.2 draft//",
	"-//w3c//dtd html 3.2 final//",
	"-//w3c//dtd html 3.2//",
	"-//w3c//dtd html 3.2s draft//",
	"-//w3c//dtd html 4.0 frameset//",
	"-//w3c//dtd html 4.0 transitional//",
	"-//w3c//dtd html experimental 19960712//",
	"-//w3c//dtd html experimental 970421//",
	"-//w3c//dtd w3 html//",
	"-//w3o//dtd w3 html 3.0//",
	"-//webtechs//dtd mozilla html 2.0//",
	"-//webtechs//dtd mozilla html//",
}
157