// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"encoding/base64"
	"net/url"
	"regexp"
)

// A selection of regular expressions that can be used as .Matching() rules on
// HTML attributes.
40var ( 41 // CellAlign handles the `align` attribute 42 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td#attr-align 43 CellAlign = regexp.MustCompile(`(?i)^(center|justify|left|right|char)$`) 44 45 // CellVerticalAlign handles the `valign` attribute 46 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td#attr-valign 47 CellVerticalAlign = regexp.MustCompile(`(?i)^(baseline|bottom|middle|top)$`) 48 49 // Direction handles the `dir` attribute 50 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/bdo#attr-dir 51 Direction = regexp.MustCompile(`(?i)^(rtl|ltr)$`) 52 53 // ImageAlign handles the `align` attribute on the `image` tag 54 // http://www.w3.org/MarkUp/Test/Img/imgtest.html 55 ImageAlign = regexp.MustCompile( 56 `(?i)^(left|right|top|texttop|middle|absmiddle|baseline|bottom|absbottom)$`, 57 ) 58 59 // Integer describes whole positive integers (including 0) used in places 60 // like td.colspan 61 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td#attr-colspan 62 Integer = regexp.MustCompile(`^[0-9]+$`) 63 64 // ISO8601 according to the W3 group is only a subset of the ISO8601 65 // standard: http://www.w3.org/TR/NOTE-datetime 66 // 67 // Used in places like time.datetime 68 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/time#attr-datetime 69 // 70 // Matches patterns: 71 // Year: 72 // YYYY (eg 1997) 73 // Year and month: 74 // YYYY-MM (eg 1997-07) 75 // Complete date: 76 // YYYY-MM-DD (eg 1997-07-16) 77 // Complete date plus hours and minutes: 78 // YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00) 79 // Complete date plus hours, minutes and seconds: 80 // YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00) 81 // Complete date plus hours, minutes, seconds and a decimal fraction of a 82 // second 83 // YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00) 84 ISO8601 = regexp.MustCompile( 85 `^[0-9]{4}(-[0-9]{2}(-[0-9]{2}([ T][0-9]{2}(:[0-9]{2}){1,2}(.[0-9]{1,6})` + 86 
`?Z?([\+-][0-9]{2}:[0-9]{2})?)?)?)?$`, 87 ) 88 89 // ListType encapsulates the common value as well as the latest spec 90 // values for lists 91 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ol#attr-type 92 ListType = regexp.MustCompile(`(?i)^(circle|disc|square|a|A|i|I|1)$`) 93 94 // SpaceSeparatedTokens is used in places like `a.rel` and the common attribute 95 // `class` which both contain space delimited lists of data tokens 96 // http://www.w3.org/TR/html-markup/datatypes.html#common.data.tokens-def 97 // Regexp: \p{L} matches unicode letters, \p{N} matches unicode numbers 98 SpaceSeparatedTokens = regexp.MustCompile(`^([\s\p{L}\p{N}_-]+)$`) 99 100 // Number is a double value used on HTML5 meter and progress elements 101 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-button-element.html#the-meter-element 102 Number = regexp.MustCompile(`^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$`) 103 104 // NumberOrPercent is used predominantly as units of measurement in width 105 // and height attributes 106 // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#attr-height 107 NumberOrPercent = regexp.MustCompile(`^[0-9]+[%]?$`) 108 109 // Paragraph of text in an attribute such as *.'title', img.alt, etc 110 // https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes#attr-title 111 // Note that we are not allowing chars that could close tags like '>' 112 Paragraph = regexp.MustCompile(`^[\p{L}\p{N}\s\-_',\[\]!\./\\\(\)]*$`) 113 114 // dataURIImagePrefix is used by AllowDataURIImages to define the acceptable 115 // prefix of data URIs that contain common web image formats. 
116 // 117 // This is not exported as it's not useful by itself, and only has value 118 // within the AllowDataURIImages func 119 dataURIImagePrefix = regexp.MustCompile( 120 `^image/(gif|jpeg|png|webp);base64,`, 121 ) 122) 123 124// AllowStandardURLs is a convenience function that will enable rel="nofollow" 125// on "a", "area" and "link" (if you have allowed those elements) and will 126// ensure that the URL values are parseable and either relative or belong to the 127// "mailto", "http", or "https" schemes 128func (p *Policy) AllowStandardURLs() { 129 // URLs must be parseable by net/url.Parse() 130 p.RequireParseableURLs(true) 131 132 // !url.IsAbs() is permitted 133 p.AllowRelativeURLs(true) 134 135 // Most common URL schemes only 136 p.AllowURLSchemes("mailto", "http", "https") 137 138 // For linking elements we will add rel="nofollow" if it does not already exist 139 // This applies to "a" "area" "link" 140 p.RequireNoFollowOnLinks(true) 141} 142 143// AllowStandardAttributes will enable "id", "title" and the language specific 144// attributes "dir" and "lang" on all elements that are whitelisted 145func (p *Policy) AllowStandardAttributes() { 146 // "dir" "lang" are permitted as both language attributes affect charsets 147 // and direction of text. 148 p.AllowAttrs("dir").Matching(Direction).Globally() 149 p.AllowAttrs( 150 "lang", 151 ).Matching(regexp.MustCompile(`[a-zA-Z]{2,20}`)).Globally() 152 153 // "id" is permitted. This is pretty much as some HTML elements require this 154 // to work well ("dfn" is an example of a "id" being value) 155 // This does create a risk that JavaScript and CSS within your web page 156 // might identify the wrong elements. Ensure that you select things 157 // accurately 158 p.AllowAttrs("id").Matching( 159 regexp.MustCompile(`[a-zA-Z0-9\:\-_\.]+`), 160 ).Globally() 161 162 // "title" is permitted as it improves accessibility. 
163 p.AllowAttrs("title").Matching(Paragraph).Globally() 164} 165 166// AllowStyling presently enables the class attribute globally. 167// 168// Note: When bluemonday ships a CSS parser and we can safely sanitise that, 169// this will also allow sanitized styling of elements via the style attribute. 170func (p *Policy) AllowStyling() { 171 172 // "class" is permitted globally 173 p.AllowAttrs("class").Matching(SpaceSeparatedTokens).Globally() 174} 175 176// AllowImages enables the img element and some popular attributes. It will also 177// ensure that URL values are parseable. This helper does not enable data URI 178// images, for that you should also use the AllowDataURIImages() helper. 179func (p *Policy) AllowImages() { 180 181 // "img" is permitted 182 p.AllowAttrs("align").Matching(ImageAlign).OnElements("img") 183 p.AllowAttrs("alt").Matching(Paragraph).OnElements("img") 184 p.AllowAttrs("height", "width").Matching(NumberOrPercent).OnElements("img") 185 186 // Standard URLs enabled 187 p.AllowStandardURLs() 188 p.AllowAttrs("src").OnElements("img") 189} 190 191// AllowDataURIImages permits the use of inline images defined in RFC2397 192// http://tools.ietf.org/html/rfc2397 193// http://en.wikipedia.org/wiki/Data_URI_scheme 194// 195// Images must have a mimetype matching: 196// image/gif 197// image/jpeg 198// image/png 199// image/webp 200// 201// NOTE: There is a potential security risk to allowing data URIs and you should 202// only permit them on content you already trust. 
203// http://palizine.plynt.com/issues/2010Oct/bypass-xss-filters/ 204// https://capec.mitre.org/data/definitions/244.html 205func (p *Policy) AllowDataURIImages() { 206 207 // URLs must be parseable by net/url.Parse() 208 p.RequireParseableURLs(true) 209 210 // Supply a function to validate images contained within data URI 211 p.AllowURLSchemeWithCustomPolicy( 212 "data", 213 func(url *url.URL) (allowUrl bool) { 214 if url.RawQuery != "" || url.Fragment != "" { 215 return false 216 } 217 218 matched := dataURIImagePrefix.FindString(url.Opaque) 219 if matched == "" { 220 return false 221 } 222 223 _, err := base64.StdEncoding.DecodeString(url.Opaque[len(matched):]) 224 if err != nil { 225 return false 226 } 227 228 return true 229 }, 230 ) 231} 232 233// AllowLists will enabled ordered and unordered lists, as well as definition 234// lists 235func (p *Policy) AllowLists() { 236 // "ol" "ul" are permitted 237 p.AllowAttrs("type").Matching(ListType).OnElements("ol", "ul") 238 239 // "li" is permitted 240 p.AllowAttrs("type").Matching(ListType).OnElements("li") 241 p.AllowAttrs("value").Matching(Integer).OnElements("li") 242 243 // "dl" "dt" "dd" are permitted 244 p.AllowElements("dl", "dt", "dd") 245} 246 247// AllowTables will enable a rich set of elements and attributes to describe 248// HTML tables 249func (p *Policy) AllowTables() { 250 251 // "table" is permitted 252 p.AllowAttrs("height", "width").Matching(NumberOrPercent).OnElements("table") 253 p.AllowAttrs("summary").Matching(Paragraph).OnElements("table") 254 255 // "caption" is permitted 256 p.AllowElements("caption") 257 258 // "col" "colgroup" are permitted 259 p.AllowAttrs("align").Matching(CellAlign).OnElements("col", "colgroup") 260 p.AllowAttrs("height", "width").Matching( 261 NumberOrPercent, 262 ).OnElements("col", "colgroup") 263 p.AllowAttrs("span").Matching(Integer).OnElements("colgroup", "col") 264 p.AllowAttrs("valign").Matching( 265 CellVerticalAlign, 266 ).OnElements("col", "colgroup") 267 
268 // "thead" "tr" are permitted 269 p.AllowAttrs("align").Matching(CellAlign).OnElements("thead", "tr") 270 p.AllowAttrs("valign").Matching(CellVerticalAlign).OnElements("thead", "tr") 271 272 // "td" "th" are permitted 273 p.AllowAttrs("abbr").Matching(Paragraph).OnElements("td", "th") 274 p.AllowAttrs("align").Matching(CellAlign).OnElements("td", "th") 275 p.AllowAttrs("colspan", "rowspan").Matching(Integer).OnElements("td", "th") 276 p.AllowAttrs("headers").Matching( 277 SpaceSeparatedTokens, 278 ).OnElements("td", "th") 279 p.AllowAttrs("height", "width").Matching( 280 NumberOrPercent, 281 ).OnElements("td", "th") 282 p.AllowAttrs( 283 "scope", 284 ).Matching( 285 regexp.MustCompile(`(?i)(?:row|col)(?:group)?`), 286 ).OnElements("td", "th") 287 p.AllowAttrs("valign").Matching(CellVerticalAlign).OnElements("td", "th") 288 p.AllowAttrs("nowrap").Matching( 289 regexp.MustCompile(`(?i)|nowrap`), 290 ).OnElements("td", "th") 291 292 // "tbody" "tfoot" 293 p.AllowAttrs("align").Matching(CellAlign).OnElements("tbody", "tfoot") 294 p.AllowAttrs("valign").Matching( 295 CellVerticalAlign, 296 ).OnElements("tbody", "tfoot") 297} 298