1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package html provides functions for escaping and unescaping HTML text.
6package html
7
8import (
9	"strings"
10	"unicode/utf8"
11)
12
// writer is satisfied by any type that provides a WriteString method with
// this signature.
type writer interface {
	WriteString(s string) (n int, err error)
}
16
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they are replaced with. It is indexed by (value - 0x80).
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two further cases are not stored here:
// 0x00->'\uFFFD' is handled programmatically.
// 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	// 0x80 - 0x87
	'\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
	// 0x88 - 0x8F
	'\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
	// 0x90 - 0x97
	'\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
	// 0x98 - 0x9F
	'\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
}
56
57// unescapeEntity reads an entity like "<" from b[src:] and writes the
58// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
59// Precondition: b[src] == '&' && dst <= src.
60func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
61	const attribute = false
62
63	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
64
65	// i starts at 1 because we already know that s[0] == '&'.
66	i, s := 1, b[src:]
67
68	if len(s) <= 1 {
69		b[dst] = b[src]
70		return dst + 1, src + 1
71	}
72
73	if s[i] == '#' {
74		if len(s) <= 3 { // We need to have at least "&#.".
75			b[dst] = b[src]
76			return dst + 1, src + 1
77		}
78		i++
79		c := s[i]
80		hex := false
81		if c == 'x' || c == 'X' {
82			hex = true
83			i++
84		}
85
86		x := '\x00'
87		for i < len(s) {
88			c = s[i]
89			i++
90			if hex {
91				if '0' <= c && c <= '9' {
92					x = 16*x + rune(c) - '0'
93					continue
94				} else if 'a' <= c && c <= 'f' {
95					x = 16*x + rune(c) - 'a' + 10
96					continue
97				} else if 'A' <= c && c <= 'F' {
98					x = 16*x + rune(c) - 'A' + 10
99					continue
100				}
101			} else if '0' <= c && c <= '9' {
102				x = 10*x + rune(c) - '0'
103				continue
104			}
105			if c != ';' {
106				i--
107			}
108			break
109		}
110
111		if i <= 3 { // No characters matched.
112			b[dst] = b[src]
113			return dst + 1, src + 1
114		}
115
116		if 0x80 <= x && x <= 0x9F {
117			// Replace characters from Windows-1252 with UTF-8 equivalents.
118			x = replacementTable[x-0x80]
119		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
120			// Replace invalid characters with the replacement character.
121			x = '\uFFFD'
122		}
123
124		return dst + utf8.EncodeRune(b[dst:], x), src + i
125	}
126
127	// Consume the maximum number of characters possible, with the
128	// consumed characters matching one of the named references.
129
130	for i < len(s) {
131		c := s[i]
132		i++
133		// Lower-cased characters are more common in entities, so we check for them first.
134		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
135			continue
136		}
137		if c != ';' {
138			i--
139		}
140		break
141	}
142
143	entityName := s[1:i]
144	if len(entityName) == 0 {
145		// No-op.
146	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
147		// No-op.
148	} else if x := entity[string(entityName)]; x != 0 {
149		return dst + utf8.EncodeRune(b[dst:], x), src + i
150	} else if x := entity2[string(entityName)]; x[0] != 0 {
151		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
152		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
153	} else if !attribute {
154		maxLen := len(entityName) - 1
155		if maxLen > longestEntityWithoutSemicolon {
156			maxLen = longestEntityWithoutSemicolon
157		}
158		for j := maxLen; j > 1; j-- {
159			if x := entity[string(entityName[:j])]; x != 0 {
160				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
161			}
162		}
163	}
164
165	dst1, src1 = dst+i, src+i
166	copy(b[dst:dst1], b[src:src1])
167	return dst1, src1
168}
169
// htmlEscaper performs the five substitutions made by EscapeString. The
// numeric forms "&#34;" and "&#39;" are used because they are shorter than
// "&quot;" and "&apos;", and apos was not in HTML until HTML5.
var htmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	"\"", "&#34;",
	"'", "&#39;",
)

// EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
	escaped := htmlEscaper.Replace(s)
	return escaped
}
185
186// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
187// larger range of entities than EscapeString escapes. For example, "&aacute;"
188// unescapes to "á", as does "&#225;" and "&xE1;".
189// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
190// always true.
191func UnescapeString(s string) string {
192	i := strings.IndexByte(s, '&')
193
194	if i < 0 {
195		return s
196	}
197
198	b := []byte(s)
199	dst, src := unescapeEntity(b, i, i)
200	for len(s[src:]) > 0 {
201		if s[src] == '&' {
202			i = 0
203		} else {
204			i = strings.IndexByte(s[src:], '&')
205		}
206		if i < 0 {
207			dst += copy(b[dst:], s[src:])
208			break
209		}
210
211		if i > 0 {
212			copy(b[dst:], s[src:src+i])
213		}
214		dst, src = unescapeEntity(b, dst+i, src+i)
215	}
216	return string(b[:dst])
217}
218