1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package html provides functions for escaping and unescaping HTML text.
6package html
7
8import (
9	"strings"
10	"unicode/utf8"
11)
12
// replacementTable holds the code points substituted for the numeric
// character references 0x80 through 0x9F, which older documents wrote
// assuming the Windows-1252 encoding. The entry at index i replaces the
// reference 0x80+i.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
var replacementTable = [...]rune{
	// 0x80 through 0x87.
	'\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
	// 0x88 through 0x8F.
	'\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
	// 0x90 through 0x97.
	'\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
	// 0x98 through 0x9F.
	'\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
52
53// unescapeEntity reads an entity like "<" from b[src:] and writes the
54// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55// Precondition: b[src] == '&' && dst <= src.
56func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
57	const attribute = false
58
59	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
60
61	// i starts at 1 because we already know that s[0] == '&'.
62	i, s := 1, b[src:]
63
64	if len(s) <= 1 {
65		b[dst] = b[src]
66		return dst + 1, src + 1
67	}
68
69	if s[i] == '#' {
70		if len(s) <= 3 { // We need to have at least "&#.".
71			b[dst] = b[src]
72			return dst + 1, src + 1
73		}
74		i++
75		c := s[i]
76		hex := false
77		if c == 'x' || c == 'X' {
78			hex = true
79			i++
80		}
81
82		x := '\x00'
83		for i < len(s) {
84			c = s[i]
85			i++
86			if hex {
87				if '0' <= c && c <= '9' {
88					x = 16*x + rune(c) - '0'
89					continue
90				} else if 'a' <= c && c <= 'f' {
91					x = 16*x + rune(c) - 'a' + 10
92					continue
93				} else if 'A' <= c && c <= 'F' {
94					x = 16*x + rune(c) - 'A' + 10
95					continue
96				}
97			} else if '0' <= c && c <= '9' {
98				x = 10*x + rune(c) - '0'
99				continue
100			}
101			if c != ';' {
102				i--
103			}
104			break
105		}
106
107		if i <= 3 { // No characters matched.
108			b[dst] = b[src]
109			return dst + 1, src + 1
110		}
111
112		if 0x80 <= x && x <= 0x9F {
113			// Replace characters from Windows-1252 with UTF-8 equivalents.
114			x = replacementTable[x-0x80]
115		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
116			// Replace invalid characters with the replacement character.
117			x = '\uFFFD'
118		}
119
120		return dst + utf8.EncodeRune(b[dst:], x), src + i
121	}
122
123	// Consume the maximum number of characters possible, with the
124	// consumed characters matching one of the named references.
125
126	for i < len(s) {
127		c := s[i]
128		i++
129		// Lower-cased characters are more common in entities, so we check for them first.
130		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
131			continue
132		}
133		if c != ';' {
134			i--
135		}
136		break
137	}
138
139	entityName := s[1:i]
140	if len(entityName) == 0 {
141		// No-op.
142	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
143		// No-op.
144	} else if x := entity[string(entityName)]; x != 0 {
145		return dst + utf8.EncodeRune(b[dst:], x), src + i
146	} else if x := entity2[string(entityName)]; x[0] != 0 {
147		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
148		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
149	} else if !attribute {
150		maxLen := len(entityName) - 1
151		if maxLen > longestEntityWithoutSemicolon {
152			maxLen = longestEntityWithoutSemicolon
153		}
154		for j := maxLen; j > 1; j-- {
155			if x := entity[string(entityName[:j])]; x != 0 {
156				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
157			}
158		}
159	}
160
161	dst1, src1 = dst+i, src+i
162	copy(b[dst:dst1], b[src:src1])
163	return dst1, src1
164}
165
// htmlEscaper maps each of the five characters escaped by EscapeString to
// its entity form. The numeric forms "&#39;" and "&#34;" are used because
// they are shorter than "&apos;" and "&quot;", and apos was not in HTML
// until HTML5.
var htmlEscaper = strings.NewReplacer(
	"<", "&lt;",
	">", "&gt;",
	"&", "&amp;",
	"'", "&#39;",
	"\"", "&#34;",
)

// EscapeString returns s with the characters <, >, &, ' and " replaced by
// HTML entities, so that "<" becomes "&lt;" and so on. No other characters
// are touched.
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
	escaped := htmlEscaper.Replace(s)
	return escaped
}
181
182// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
183// larger range of entities than EscapeString escapes. For example, "&aacute;"
184// unescapes to "á", as does "&#225;" and "&#xE1;".
185// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
186// always true.
187func UnescapeString(s string) string {
188	populateMapsOnce.Do(populateMaps)
189	i := strings.IndexByte(s, '&')
190
191	if i < 0 {
192		return s
193	}
194
195	b := []byte(s)
196	dst, src := unescapeEntity(b, i, i)
197	for len(s[src:]) > 0 {
198		if s[src] == '&' {
199			i = 0
200		} else {
201			i = strings.IndexByte(s[src:], '&')
202		}
203		if i < 0 {
204			dst += copy(b[dst:], s[src:])
205			break
206		}
207
208		if i > 0 {
209			copy(b[dst:], s[src:src+i])
210		}
211		dst, src = unescapeEntity(b, dst+i, src+i)
212	}
213	return string(b[:dst])
214}
215