1// Copyright 2017 The Gitea Authors. All rights reserved.
2// Copyright 2017 The Gogs Authors. All rights reserved.
3// Use of this source code is governed by a MIT-style
4// license that can be found in the LICENSE file.
5
6package markup
7
8import (
9	"io"
10	"regexp"
11	"sync"
12
13	"code.gitea.io/gitea/modules/setting"
14
15	"github.com/microcosm-cc/bluemonday"
16)
17
// Sanitizer is a protection wrapper of *bluemonday.Policy which does not allow
// any modification to the underlying policies once it's been created.
//
// defaultPolicy is the policy applied when no renderer-specific policy exists;
// rendererPolicies maps a renderer name to the policy built from that
// renderer's own sanitizer rules; init ensures that NewSanitizer performs the
// initialization only once. Note that InitializeSanitizer replaces both
// policy fields wholesale whenever it is called again.
type Sanitizer struct {
	defaultPolicy    *bluemonday.Policy
	rendererPolicies map[string]*bluemonday.Policy
	init             sync.Once
}
25
// sanitizer is the package-level Sanitizer instance used by Sanitize and
// SanitizeReader; its policies are populated by InitializeSanitizer.
var sanitizer = &Sanitizer{}
27
28// NewSanitizer initializes sanitizer with allowed attributes based on settings.
29// Multiple calls to this function will only create one instance of Sanitizer during
30// entire application lifecycle.
31func NewSanitizer() {
32	sanitizer.init.Do(func() {
33		InitializeSanitizer()
34	})
35}
36
37// InitializeSanitizer (re)initializes the current sanitizer to account for changes in settings
38func InitializeSanitizer() {
39	sanitizer.rendererPolicies = map[string]*bluemonday.Policy{}
40	sanitizer.defaultPolicy = createDefaultPolicy()
41
42	for name, renderer := range renderers {
43		sanitizerRules := renderer.SanitizerRules()
44		if len(sanitizerRules) > 0 {
45			policy := createDefaultPolicy()
46			addSanitizerRules(policy, sanitizerRules)
47			sanitizer.rendererPolicies[name] = policy
48		}
49	}
50}
51
52func createDefaultPolicy() *bluemonday.Policy {
53	policy := bluemonday.UGCPolicy()
54
55	// For JS code copy and Mermaid loading state
56	policy.AllowAttrs("class").Matching(regexp.MustCompile(`^code-block( is-loading)?$`)).OnElements("pre")
57
58	// For Chroma markdown plugin
59	policy.AllowAttrs("class").Matching(regexp.MustCompile(`^(chroma )?language-[\w-]+$`)).OnElements("code")
60
61	// Checkboxes
62	policy.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input")
63	policy.AllowAttrs("checked", "disabled", "data-source-position").OnElements("input")
64
65	// Custom URL-Schemes
66	if len(setting.Markdown.CustomURLSchemes) > 0 {
67		policy.AllowURLSchemes(setting.Markdown.CustomURLSchemes...)
68	}
69
70	// Allow classes for anchors
71	policy.AllowAttrs("class").Matching(regexp.MustCompile(`ref-issue( ref-external-issue)?`)).OnElements("a")
72
73	// Allow classes for task lists
74	policy.AllowAttrs("class").Matching(regexp.MustCompile(`task-list-item`)).OnElements("li")
75
76	// Allow icons
77	policy.AllowAttrs("class").Matching(regexp.MustCompile(`^icon(\s+[\p{L}\p{N}_-]+)+$`)).OnElements("i")
78
79	// Allow unlabelled labels
80	policy.AllowNoAttrs().OnElements("label")
81
82	// Allow classes for emojis
83	policy.AllowAttrs("class").Matching(regexp.MustCompile(`emoji`)).OnElements("img")
84
85	// Allow icons, emojis, chroma syntax and keyword markup on span
86	policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$|^` + keywordClass + `$`)).OnElements("span")
87
88	// Allow generally safe attributes
89	generalSafeAttrs := []string{"abbr", "accept", "accept-charset",
90		"accesskey", "action", "align", "alt",
91		"aria-describedby", "aria-hidden", "aria-label", "aria-labelledby",
92		"axis", "border", "cellpadding", "cellspacing", "char",
93		"charoff", "charset", "checked",
94		"clear", "cols", "colspan", "color",
95		"compact", "coords", "datetime", "dir",
96		"disabled", "enctype", "for", "frame",
97		"headers", "height", "hreflang",
98		"hspace", "ismap", "label", "lang",
99		"maxlength", "media", "method",
100		"multiple", "name", "nohref", "noshade",
101		"nowrap", "open", "prompt", "readonly", "rel", "rev",
102		"rows", "rowspan", "rules", "scope",
103		"selected", "shape", "size", "span",
104		"start", "summary", "tabindex", "target",
105		"title", "type", "usemap", "valign", "value",
106		"vspace", "width", "itemprop",
107	}
108
109	generalSafeElements := []string{
110		"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "br", "b", "i", "strong", "em", "a", "pre", "code", "img", "tt",
111		"div", "ins", "del", "sup", "sub", "p", "ol", "ul", "table", "thead", "tbody", "tfoot", "blockquote",
112		"dl", "dt", "dd", "kbd", "q", "samp", "var", "hr", "ruby", "rt", "rp", "li", "tr", "td", "th", "s", "strike", "summary",
113		"details", "caption", "figure", "figcaption",
114		"abbr", "bdo", "cite", "dfn", "mark", "small", "span", "time", "wbr",
115	}
116
117	policy.AllowAttrs(generalSafeAttrs...).OnElements(generalSafeElements...)
118
119	policy.AllowAttrs("itemscope", "itemtype").OnElements("div")
120
121	// FIXME: Need to handle longdesc in img but there is no easy way to do it
122
123	// Custom keyword markup
124	addSanitizerRules(policy, setting.ExternalSanitizerRules)
125
126	return policy
127}
128
129func addSanitizerRules(policy *bluemonday.Policy, rules []setting.MarkupSanitizerRule) {
130	for _, rule := range rules {
131		if rule.AllowDataURIImages {
132			policy.AllowDataURIImages()
133		}
134		if rule.Element != "" {
135			if rule.Regexp != nil {
136				policy.AllowAttrs(rule.AllowAttr).Matching(rule.Regexp).OnElements(rule.Element)
137			} else {
138				policy.AllowAttrs(rule.AllowAttr).OnElements(rule.Element)
139			}
140		}
141	}
142}
143
144// Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist.
145func Sanitize(s string) string {
146	NewSanitizer()
147	return sanitizer.defaultPolicy.Sanitize(s)
148}
149
150// SanitizeReader sanitizes a Reader
151func SanitizeReader(r io.Reader, renderer string, w io.Writer) error {
152	NewSanitizer()
153	policy, exist := sanitizer.rendererPolicies[renderer]
154	if !exist {
155		policy = sanitizer.defaultPolicy
156	}
157	return policy.SanitizeReaderToWriter(r, w)
158}
159