1// Copyright 2017 Frédéric Guillot. All rights reserved. 2// Use of this source code is governed by the Apache 2.0 3// license that can be found in the LICENSE file. 4 5package sanitizer 6 7import ( 8 "bytes" 9 "fmt" 10 "io" 11 "regexp" 12 "strconv" 13 "strings" 14 15 "github.com/nkanaev/yarr/src/content/htmlutil" 16 "golang.org/x/net/html" 17) 18 19var splitSrcsetRegex = regexp.MustCompile(`,\s+`) 20 21// Sanitize returns safe HTML. 22func Sanitize(baseURL, input string) string { 23 var buffer bytes.Buffer 24 var tagStack []string 25 var parentTag string 26 blacklistedTagDepth := 0 27 28 tokenizer := html.NewTokenizer(bytes.NewBufferString(input)) 29 for { 30 if tokenizer.Next() == html.ErrorToken { 31 err := tokenizer.Err() 32 if err == io.EOF { 33 return buffer.String() 34 } 35 36 return "" 37 } 38 39 token := tokenizer.Token() 40 switch token.Type { 41 case html.TextToken: 42 if blacklistedTagDepth > 0 { 43 continue 44 } 45 46 // An iframe element never has fallback content. 47 // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element 48 if parentTag == "iframe" { 49 continue 50 } 51 52 buffer.WriteString(html.EscapeString(token.Data)) 53 case html.StartTagToken: 54 tagName := token.Data 55 parentTag = tagName 56 57 if isValidTag(tagName) { 58 attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) 59 60 if hasRequiredAttributes(tagName, attrNames) { 61 wrap := isVideoIframe(token) 62 if wrap { 63 buffer.WriteString(`<div class="video-wrapper">`) 64 } 65 66 if len(attrNames) > 0 { 67 buffer.WriteString("<" + tagName + " " + htmlAttributes + ">") 68 } else { 69 buffer.WriteString("<" + tagName + ">") 70 } 71 72 if tagName == "iframe" { 73 // autoclose iframes 74 buffer.WriteString("</iframe>") 75 if wrap { 76 buffer.WriteString("</div>") 77 } 78 } else { 79 tagStack = append(tagStack, tagName) 80 } 81 } 82 } else if isBlockedTag(tagName) { 83 blacklistedTagDepth++ 84 } 85 case html.EndTagToken: 86 tagName := token.Data 87 // iframes are autoclosed. see above 88 if tagName == "iframe" { 89 continue 90 } 91 if isValidTag(tagName) && inList(tagName, tagStack) { 92 buffer.WriteString(fmt.Sprintf("</%s>", tagName)) 93 } else if isBlockedTag(tagName) { 94 blacklistedTagDepth-- 95 } 96 case html.SelfClosingTagToken: 97 tagName := token.Data 98 if isValidTag(tagName) { 99 attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) 100 101 if hasRequiredAttributes(tagName, attrNames) { 102 if len(attrNames) > 0 { 103 buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>") 104 } else { 105 buffer.WriteString("<" + tagName + "/>") 106 } 107 } 108 } 109 } 110 } 111} 112 113func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) { 114 var htmlAttrs, attrNames []string 115 116 for _, attribute := range attributes { 117 value := attribute.Val 118 119 if !isValidAttribute(tagName, attribute.Key) { 120 continue 121 } 122 123 if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" { 124 value = sanitizeSrcsetAttr(baseURL, value) 125 } 126 127 if isExternalResourceAttribute(attribute.Key) { 128 if tagName == "iframe" { 129 if isValidIframeSource(baseURL, attribute.Val) { 130 value = attribute.Val 131 } else { 132 continue 133 } 134 } else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) { 135 value = attribute.Val 136 } else { 137 value = htmlutil.AbsoluteUrl(value, baseURL) 138 if value == "" { 139 continue 140 } 141 142 if !hasValidURIScheme(value) || isBlockedResource(value) { 143 continue 144 } 145 } 146 } 147 148 attrNames = append(attrNames, attribute.Key) 149 htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value))) 150 } 151 152 extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName) 153 if len(extraAttrNames) > 0 { 154 attrNames = append(attrNames, extraAttrNames...) 155 htmlAttrs = append(htmlAttrs, extraHTMLAttributes...) 156 } 157 158 return attrNames, strings.Join(htmlAttrs, " ") 159} 160 161func getExtraAttributes(tagName string) ([]string, []string) { 162 switch tagName { 163 case "a": 164 return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`} 165 case "video", "audio": 166 return []string{"controls"}, []string{"controls"} 167 case "iframe": 168 return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`} 169 case "img": 170 return []string{"loading"}, []string{`loading="lazy"`} 171 default: 172 return nil, nil 173 } 174} 175 176func isValidTag(tagName string) bool { 177 x := allowedTags.has(tagName) || allowedSvgTags.has(tagName) || allowedSvgFilters.has(tagName) 178 //fmt.Println(tagName, x) 179 return x 180} 181 182func isValidAttribute(tagName, attributeName string) bool { 183 if attrs, ok := allowedAttrs[tagName]; ok { 184 return attrs.has(attributeName) 185 } 186 if allowedSvgTags.has(tagName) { 187 return allowedSvgAttrs.has(attributeName) 188 } 189 return false 190} 191 192func isExternalResourceAttribute(attribute string) bool { 193 switch attribute { 194 case "src", "href", "poster", "cite": 195 return true 196 default: 197 return false 198 } 199} 200 201func hasRequiredAttributes(tagName string, attributes []string) bool { 202 elements := make(map[string][]string) 203 elements["a"] = []string{"href"} 204 elements["iframe"] = []string{"src"} 205 elements["img"] = []string{"src"} 206 elements["source"] = []string{"src", "srcset"} 207 208 for element, attrs := range elements { 209 if tagName == element { 210 for _, attribute := range attributes { 211 for _, attr := range attrs { 212 if attr == attribute { 213 return true 214 } 215 } 216 } 217 218 return false 219 } 220 } 221 222 return true 223} 224 225// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml 226func hasValidURIScheme(src string) bool { 227 scheme := strings.SplitN(src, ":", 2)[0] 228 return allowedURISchemes.has(scheme) 229} 230 231func isBlockedResource(src string) bool { 232 blacklist := []string{ 233 "feedsportal.com", 234 "api.flattr.com", 235 "stats.wordpress.com", 236 "plus.google.com/share", 237 "twitter.com/share", 238 "feeds.feedburner.com", 239 } 240 241 for _, element := range blacklist { 242 if strings.Contains(src, element) { 243 return true 244 } 245 } 246 247 return false 248} 249 250func isValidIframeSource(baseURL, src string) bool { 251 whitelist := []string{ 252 "bandcamp.com", 253 "cdn.embedly.com", 254 "invidio.us", 255 "player.bilibili.com", 256 "player.vimeo.com", 257 "soundcloud.com", 258 "vk.com", 259 "w.soundcloud.com", 260 "www.dailymotion.com", 261 "www.youtube-nocookie.com", 262 "www.youtube.com", 263 } 264 265 domain := htmlutil.URLDomain(src) 266 // allow iframe from same origin 267 if htmlutil.URLDomain(baseURL) == domain { 268 return true 269 } 270 271 for _, safeDomain := range whitelist { 272 if safeDomain == domain { 273 return true 274 } 275 } 276 277 return false 278} 279 280func getTagAllowList() map[string][]string { 281 whitelist := make(map[string][]string) 282 whitelist["img"] = []string{"alt", "title", "src", "srcset", "sizes"} 283 whitelist["picture"] = []string{} 284 whitelist["audio"] = []string{"src"} 285 whitelist["video"] = []string{"poster", "height", "width", "src"} 286 whitelist["source"] = []string{"src", "type", "srcset", "sizes", "media"} 287 whitelist["dt"] = []string{} 288 whitelist["dd"] = []string{} 289 whitelist["dl"] = []string{} 290 whitelist["table"] = []string{} 291 whitelist["caption"] = []string{} 292 whitelist["thead"] = []string{} 293 whitelist["tfooter"] = []string{} 294 whitelist["tr"] = []string{} 295 whitelist["td"] = []string{"rowspan", "colspan"} 296 whitelist["th"] = []string{"rowspan", "colspan"} 297 whitelist["h1"] = []string{} 298 whitelist["h2"] = []string{} 299 whitelist["h3"] = []string{} 300 whitelist["h4"] = []string{} 301 whitelist["h5"] = []string{} 302 whitelist["h6"] = []string{} 303 whitelist["strong"] = []string{} 304 whitelist["em"] = []string{} 305 whitelist["code"] = []string{} 306 whitelist["pre"] = []string{} 307 whitelist["blockquote"] = []string{} 308 whitelist["q"] = []string{"cite"} 309 whitelist["p"] = []string{} 310 whitelist["ul"] = []string{} 311 whitelist["li"] = []string{} 312 whitelist["ol"] = []string{} 313 whitelist["br"] = []string{} 314 whitelist["del"] = []string{} 315 whitelist["a"] = []string{"href", "title"} 316 whitelist["figure"] = []string{} 317 whitelist["figcaption"] = []string{} 318 whitelist["cite"] = []string{} 319 whitelist["time"] = []string{"datetime"} 320 whitelist["abbr"] = []string{"title"} 321 whitelist["acronym"] = []string{"title"} 322 whitelist["wbr"] = []string{} 323 whitelist["dfn"] = []string{} 324 whitelist["sub"] = []string{} 325 whitelist["sup"] = []string{} 326 whitelist["var"] = []string{} 327 whitelist["samp"] = []string{} 328 whitelist["s"] = []string{} 329 whitelist["del"] = []string{} 330 whitelist["ins"] = []string{} 331 whitelist["kbd"] = []string{} 332 whitelist["rp"] = []string{} 333 whitelist["rt"] = []string{} 334 whitelist["rtc"] = []string{} 335 whitelist["ruby"] = []string{} 336 whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"} 337 return whitelist 338} 339 340func inList(needle string, haystack []string) bool { 341 for _, element := range haystack { 342 if element == needle { 343 return true 344 } 345 } 346 347 return false 348} 349 350func isBlockedTag(tagName string) bool { 351 blacklist := []string{ 352 "noscript", 353 "script", 354 "style", 355 } 356 357 for _, element := range blacklist { 358 if element == tagName { 359 return true 360 } 361 } 362 363 return false 364} 365 366/* 367 368One or more strings separated by commas, indicating possible image sources for the user agent to use. 369 370Each string is composed of: 371- A URL to an image 372- Optionally, whitespace followed by one of: 373- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density. 374- A pixel density descriptor (a positive floating point number directly followed by x). 375 376*/ 377func sanitizeSrcsetAttr(baseURL, value string) string { 378 var sanitizedSources []string 379 rawSources := splitSrcsetRegex.Split(value, -1) 380 for _, rawSource := range rawSources { 381 parts := strings.Split(strings.TrimSpace(rawSource), " ") 382 nbParts := len(parts) 383 384 if nbParts > 0 { 385 sanitizedSource := parts[0] 386 if !strings.HasPrefix(parts[0], "data:") { 387 sanitizedSource = htmlutil.AbsoluteUrl(parts[0], baseURL) 388 if sanitizedSource == "" { 389 continue 390 } 391 } 392 393 if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) { 394 sanitizedSource += " " + parts[1] 395 } 396 397 sanitizedSources = append(sanitizedSources, sanitizedSource) 398 } 399 } 400 return strings.Join(sanitizedSources, ", ") 401} 402 403func isValidWidthOrDensityDescriptor(value string) bool { 404 if value == "" { 405 return false 406 } 407 408 lastChar := value[len(value)-1:] 409 if lastChar != "w" && lastChar != "x" { 410 return false 411 } 412 413 _, err := strconv.ParseFloat(value[0:len(value)-1], 32) 414 return err == nil 415} 416 417func isValidDataAttribute(value string) bool { 418 var dataAttributeAllowList = []string{ 419 "data:image/avif", 420 "data:image/apng", 421 "data:image/png", 422 "data:image/svg", 423 "data:image/svg+xml", 424 "data:image/jpg", 425 "data:image/jpeg", 426 "data:image/gif", 427 "data:image/webp", 428 } 429 430 for _, prefix := range dataAttributeAllowList { 431 if strings.HasPrefix(value, prefix) { 432 return true 433 } 434 } 435 return false 436} 437 438func isVideoIframe(token html.Token) bool { 439 videoWhitelist := map[string]bool{ 440 "player.bilibili.com": true, 441 "player.vimeo.com": true, 442 "www.dailymotion.com": true, 443 "www.youtube-nocookie.com": true, 444 "www.youtube.com": true, 445 } 446 if token.Data == "iframe" { 447 for _, attr := range token.Attr { 448 if attr.Key == "src" { 449 domain := htmlutil.URLDomain(attr.Val) 450 return videoWhitelist[domain] 451 } 452 } 453 } 454 return false 455} 456