1// Copyright 2017 Frédéric Guillot. All rights reserved.
2// Use of this source code is governed by the Apache 2.0
3// license that can be found in the LICENSE file.
4
5package sanitizer
6
7import (
8	"bytes"
9	"fmt"
10	"io"
11	"regexp"
12	"strconv"
13	"strings"
14
15	"github.com/nkanaev/yarr/src/content/htmlutil"
16	"golang.org/x/net/html"
17)
18
// splitSrcsetRegex matches the separator between two srcset candidates: a
// comma followed by at least one whitespace character. sanitizeSrcsetAttr
// uses it to split a srcset value, so a comma that is not followed by
// whitespace does not split the value.
var splitSrcsetRegex = regexp.MustCompile(`,\s+`)
20
21// Sanitize returns safe HTML.
22func Sanitize(baseURL, input string) string {
23	var buffer bytes.Buffer
24	var tagStack []string
25	var parentTag string
26	blacklistedTagDepth := 0
27
28	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
29	for {
30		if tokenizer.Next() == html.ErrorToken {
31			err := tokenizer.Err()
32			if err == io.EOF {
33				return buffer.String()
34			}
35
36			return ""
37		}
38
39		token := tokenizer.Token()
40		switch token.Type {
41		case html.TextToken:
42			if blacklistedTagDepth > 0 {
43				continue
44			}
45
46			// An iframe element never has fallback content.
47			// See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
48			if parentTag == "iframe" {
49				continue
50			}
51
52			buffer.WriteString(html.EscapeString(token.Data))
53		case html.StartTagToken:
54			tagName := token.Data
55			parentTag = tagName
56
57			if isValidTag(tagName) {
58				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
59
60				if hasRequiredAttributes(tagName, attrNames) {
61					wrap := isVideoIframe(token)
62					if wrap {
63						buffer.WriteString(`<div class="video-wrapper">`)
64					}
65
66					if len(attrNames) > 0 {
67						buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
68					} else {
69						buffer.WriteString("<" + tagName + ">")
70					}
71
72					if tagName == "iframe" {
73						// autoclose iframes
74						buffer.WriteString("</iframe>")
75						if wrap {
76							buffer.WriteString("</div>")
77						}
78					} else {
79						tagStack = append(tagStack, tagName)
80					}
81				}
82			} else if isBlockedTag(tagName) {
83				blacklistedTagDepth++
84			}
85		case html.EndTagToken:
86			tagName := token.Data
87			// iframes are autoclosed. see above
88			if tagName == "iframe" {
89				continue
90			}
91			if isValidTag(tagName) && inList(tagName, tagStack) {
92				buffer.WriteString(fmt.Sprintf("</%s>", tagName))
93			} else if isBlockedTag(tagName) {
94				blacklistedTagDepth--
95			}
96		case html.SelfClosingTagToken:
97			tagName := token.Data
98			if isValidTag(tagName) {
99				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
100
101				if hasRequiredAttributes(tagName, attrNames) {
102					if len(attrNames) > 0 {
103						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
104					} else {
105						buffer.WriteString("<" + tagName + "/>")
106					}
107				}
108			}
109		}
110	}
111}
112
113func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
114	var htmlAttrs, attrNames []string
115
116	for _, attribute := range attributes {
117		value := attribute.Val
118
119		if !isValidAttribute(tagName, attribute.Key) {
120			continue
121		}
122
123		if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
124			value = sanitizeSrcsetAttr(baseURL, value)
125		}
126
127		if isExternalResourceAttribute(attribute.Key) {
128			if tagName == "iframe" {
129				if isValidIframeSource(baseURL, attribute.Val) {
130					value = attribute.Val
131				} else {
132					continue
133				}
134			} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
135				value = attribute.Val
136			} else {
137				value = htmlutil.AbsoluteUrl(value, baseURL)
138				if value == "" {
139					continue
140				}
141
142				if !hasValidURIScheme(value) || isBlockedResource(value) {
143					continue
144				}
145			}
146		}
147
148		attrNames = append(attrNames, attribute.Key)
149		htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
150	}
151
152	extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
153	if len(extraAttrNames) > 0 {
154		attrNames = append(attrNames, extraAttrNames...)
155		htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
156	}
157
158	return attrNames, strings.Join(htmlAttrs, " ")
159}
160
// getExtraAttributes returns the attributes that are always added to a tag.
// The first slice holds the attribute names and the second the matching
// attribute markup, in the same order. Both are nil for tags that get no
// extra attributes.
func getExtraAttributes(tagName string) ([]string, []string) {
	var names, markup []string

	switch tagName {
	case "a":
		names = []string{"rel", "target", "referrerpolicy"}
		markup = []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
	case "video", "audio":
		names = []string{"controls"}
		markup = []string{"controls"}
	case "iframe":
		names = []string{"sandbox", "loading"}
		markup = []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
	case "img":
		names = []string{"loading"}
		markup = []string{`loading="lazy"`}
	}

	return names, markup
}
175
176func isValidTag(tagName string) bool {
177	x := allowedTags.has(tagName) || allowedSvgTags.has(tagName) || allowedSvgFilters.has(tagName)
178	//fmt.Println(tagName, x)
179	return x
180}
181
182func isValidAttribute(tagName, attributeName string) bool {
183	if attrs, ok := allowedAttrs[tagName]; ok {
184		return attrs.has(attributeName)
185	}
186	if allowedSvgTags.has(tagName) {
187		return allowedSvgAttrs.has(attributeName)
188	}
189	return false
190}
191
// isExternalResourceAttribute reports whether the attribute holds a URL to an
// external resource, i.e. whether it is one of src, href, poster or cite.
func isExternalResourceAttribute(attribute string) bool {
	return attribute == "src" ||
		attribute == "href" ||
		attribute == "poster" ||
		attribute == "cite"
}
200
// hasRequiredAttributes reports whether a tag still carries an attribute
// without which it is useless.
//
// The tags a, iframe, img and source must have at least one of their required
// attributes (href; src; src; src or srcset) among attributes. Every other
// tag has no requirement and always passes.
func hasRequiredAttributes(tagName string, attributes []string) bool {
	// A switch avoids building a map on every call only to scan it for a
	// single key.
	var required []string
	switch tagName {
	case "a":
		required = []string{"href"}
	case "iframe", "img":
		required = []string{"src"}
	case "source":
		required = []string{"src", "srcset"}
	default:
		return true
	}

	for _, attribute := range attributes {
		for _, name := range required {
			if attribute == name {
				return true
			}
		}
	}

	return false
}
224
225// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
226func hasValidURIScheme(src string) bool {
227	scheme := strings.SplitN(src, ":", 2)[0]
228	return allowedURISchemes.has(scheme)
229}
230
// isBlockedResource reports whether src contains one of the blocked
// host or path fragments anywhere in the string.
func isBlockedResource(src string) bool {
	blocked := [...]string{
		"feedsportal.com",
		"api.flattr.com",
		"stats.wordpress.com",
		"plus.google.com/share",
		"twitter.com/share",
		"feeds.feedburner.com",
	}

	for i := range blocked {
		if strings.Contains(src, blocked[i]) {
			return true
		}
	}

	return false
}
249
250func isValidIframeSource(baseURL, src string) bool {
251	whitelist := []string{
252		"bandcamp.com",
253		"cdn.embedly.com",
254		"invidio.us",
255		"player.bilibili.com",
256		"player.vimeo.com",
257		"soundcloud.com",
258		"vk.com",
259		"w.soundcloud.com",
260		"www.dailymotion.com",
261		"www.youtube-nocookie.com",
262		"www.youtube.com",
263	}
264
265	domain := htmlutil.URLDomain(src)
266	// allow iframe from same origin
267	if htmlutil.URLDomain(baseURL) == domain {
268		return true
269	}
270
271	for _, safeDomain := range whitelist {
272		if safeDomain == domain {
273			return true
274		}
275	}
276
277	return false
278}
279
// getTagAllowList returns a freshly built map from each allowed tag name to
// the attribute names allowed on that tag. Tags that accept no attribute map
// to an empty, non-nil slice.
//
// The map is written as a single literal so that a duplicated key (as "del"
// used to be) is rejected at compile time.
func getTagAllowList() map[string][]string {
	return map[string][]string{
		"img":        {"alt", "title", "src", "srcset", "sizes"},
		"picture":    {},
		"audio":      {"src"},
		"video":      {"poster", "height", "width", "src"},
		"source":     {"src", "type", "srcset", "sizes", "media"},
		"dt":         {},
		"dd":         {},
		"dl":         {},
		"table":      {},
		"caption":    {},
		"thead":      {},
		"tfoot":      {},
		"tfooter":    {}, // historical misspelling of "tfoot", kept for backward compatibility
		"tr":         {},
		"td":         {"rowspan", "colspan"},
		"th":         {"rowspan", "colspan"},
		"h1":         {},
		"h2":         {},
		"h3":         {},
		"h4":         {},
		"h5":         {},
		"h6":         {},
		"strong":     {},
		"em":         {},
		"code":       {},
		"pre":        {},
		"blockquote": {},
		"q":          {"cite"},
		"p":          {},
		"ul":         {},
		"li":         {},
		"ol":         {},
		"br":         {},
		"del":        {},
		"a":          {"href", "title"},
		"figure":     {},
		"figcaption": {},
		"cite":       {},
		"time":       {"datetime"},
		"abbr":       {"title"},
		"acronym":    {"title"},
		"wbr":        {},
		"dfn":        {},
		"sub":        {},
		"sup":        {},
		"var":        {},
		"samp":       {},
		"s":          {},
		"ins":        {},
		"kbd":        {},
		"rp":         {},
		"rt":         {},
		"rtc":        {},
		"ruby":       {},
		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
	}
}
339
// inList reports whether needle is equal to one of the elements of haystack.
func inList(needle string, haystack []string) bool {
	for i := 0; i < len(haystack); i++ {
		if haystack[i] == needle {
			return true
		}
	}

	return false
}
349
// isBlockedTag reports whether tagName is one of the tags whose text content
// is discarded: noscript, script or style.
func isBlockedTag(tagName string) bool {
	switch tagName {
	case "noscript", "script", "style":
		return true
	}
	return false
}
365
366/*
367
368One or more strings separated by commas, indicating possible image sources for the user agent to use.
369
370Each string is composed of:
371- A URL to an image
372- Optionally, whitespace followed by one of:
373- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
374- A pixel density descriptor (a positive floating point number directly followed by x).
375
376*/
377func sanitizeSrcsetAttr(baseURL, value string) string {
378	var sanitizedSources []string
379	rawSources := splitSrcsetRegex.Split(value, -1)
380	for _, rawSource := range rawSources {
381		parts := strings.Split(strings.TrimSpace(rawSource), " ")
382		nbParts := len(parts)
383
384		if nbParts > 0 {
385			sanitizedSource := parts[0]
386			if !strings.HasPrefix(parts[0], "data:") {
387				sanitizedSource = htmlutil.AbsoluteUrl(parts[0], baseURL)
388				if sanitizedSource == "" {
389					continue
390				}
391			}
392
393			if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) {
394				sanitizedSource += " " + parts[1]
395			}
396
397			sanitizedSources = append(sanitizedSources, sanitizedSource)
398		}
399	}
400	return strings.Join(sanitizedSources, ", ")
401}
402
// isValidWidthOrDensityDescriptor reports whether value is a srcset
// descriptor: a number directly followed by "w" (width) or "x" (pixel
// density).
//
// The number must parse as a 32-bit float and be finite and strictly
// positive, so "-1x", "0w", "NaNx" and "Infx" are rejected.
func isValidWidthOrDensityDescriptor(value string) bool {
	// Largest finite float32 value (same as math.MaxFloat32); anything above
	// it can only be +Inf here.
	const maxFloat32 = 3.40282346638528859811704183484516925440e+38

	// At least one digit plus the unit.
	if len(value) < 2 {
		return false
	}

	unit := value[len(value)-1]
	if unit != 'w' && unit != 'x' {
		return false
	}

	number, err := strconv.ParseFloat(value[:len(value)-1], 32)
	if err != nil {
		return false
	}

	// NaN fails the first comparison, +Inf the second one.
	return number > 0 && number <= maxFloat32
}
416
// isValidDataAttribute reports whether value starts with one of the accepted
// "data:image/..." prefixes. The comparison is case-sensitive.
func isValidDataAttribute(value string) bool {
	allowedPrefixes := [...]string{
		"data:image/avif",
		"data:image/apng",
		"data:image/png",
		"data:image/svg",
		"data:image/svg+xml",
		"data:image/jpg",
		"data:image/jpeg",
		"data:image/gif",
		"data:image/webp",
	}

	matched := false
	for i := 0; i < len(allowedPrefixes) && !matched; i++ {
		matched = strings.HasPrefix(value, allowedPrefixes[i])
	}
	return matched
}
437
438func isVideoIframe(token html.Token) bool {
439	videoWhitelist := map[string]bool{
440		"player.bilibili.com":      true,
441		"player.vimeo.com":         true,
442		"www.dailymotion.com":      true,
443		"www.youtube-nocookie.com": true,
444		"www.youtube.com":          true,
445	}
446	if token.Data == "iframe" {
447		for _, attr := range token.Attr {
448			if attr.Key == "src" {
449				domain := htmlutil.URLDomain(attr.Val)
450				return videoWhitelist[domain]
451			}
452		}
453	}
454	return false
455}
456