1{-# LANGUAGE OverloadedStrings #-}
2-- | Sanatize HTML to prevent XSS attacks.
3--
4-- See README.md <http://github.com/gregwebs/haskell-xss-sanitize> for more details.
5module Text.HTML.SanitizeXSS
6    (
7    -- * Sanitize
8      sanitize
9    , sanitizeBalance
10    , sanitizeXSS
11
12    -- * Custom filtering
13    , filterTags
14    , safeTags
15    , safeTagsCustom
16    , balanceTags
17
18    -- * Utilities
19    , safeTagName
20    , sanitizeAttribute
21    , sanitaryURI
22    ) where
23
24import Text.HTML.SanitizeXSS.Css
25
26import Text.HTML.TagSoup
27
28import Data.Set (Set(), member, notMember, (\\), fromList, fromAscList)
29import Data.Char ( toLower )
30import Data.Text (Text)
31import qualified Data.Text as T
32
33import Network.URI ( parseURIReference, URI (..),
34                     isAllowedInURI, escapeURIString, uriScheme )
35import Codec.Binary.UTF8.String ( encodeString )
36
37import Data.Maybe (mapMaybe)
38
39
40-- | Sanitize HTML to prevent XSS attacks.  This is equivalent to @filterTags safeTags@.
41sanitize :: Text -> Text
42sanitize = sanitizeXSS
43
44-- | alias of sanitize function
45sanitizeXSS :: Text -> Text
46sanitizeXSS = filterTags safeTags
47
48-- | Sanitize HTML to prevent XSS attacks and also make sure the tags are balanced.
49--   This is equivalent to @filterTags (balanceTags . safeTags)@.
50sanitizeBalance :: Text -> Text
51sanitizeBalance = filterTags (balanceTags . safeTags)
52
53-- | Filter which makes sure the tags are balanced.  Use with 'filterTags' and 'safeTags' to create a custom filter.
54balanceTags :: [Tag Text] -> [Tag Text]
55balanceTags = balance []
56
57-- | Parse the given text to a list of tags, apply the given filtering
58-- function, and render back to HTML. You can insert your own custom
59-- filtering, but make sure you compose your filtering function with
60-- 'safeTags' or 'safeTagsCustom'.
61filterTags :: ([Tag Text] -> [Tag Text]) -> Text -> Text
62filterTags f = renderTagsOptions renderOptions {
63    optMinimize = \x -> x `member` voidElems -- <img><img> converts to <img />, <a/> converts to <a></a>
64  } .  f . canonicalizeTags . parseTags
65
66voidElems :: Set T.Text
67voidElems = fromAscList $ T.words $ T.pack "area base br col command embed hr img input keygen link meta param source track wbr"
68
69balance :: [Text] -- ^ unclosed tags
70        -> [Tag Text] -> [Tag Text]
71balance unclosed [] = map TagClose $ filter (`notMember` voidElems) unclosed
72balance (x:xs) tags'@(TagClose name:tags)
73    | x == name = TagClose name : balance xs tags
74    | x `member` voidElems = balance xs tags'
75    | otherwise = TagOpen name [] : TagClose name : balance (x:xs) tags
76balance unclosed (TagOpen name as : tags) =
77    TagOpen name as : balance (name : unclosed) tags
78balance unclosed (t:ts) = t : balance unclosed ts
79
80-- | Filters out unsafe tags and sanitizes attributes. Use with
81-- filterTags to create a custom filter.
82safeTags :: [Tag Text] -> [Tag Text]
83safeTags = safeTagsCustom safeTagName sanitizeAttribute
84
85-- | Filters out unsafe tags and sanitizes attributes, like
86-- 'safeTags', but uses custom functions for determining which tags
87-- are safe and for sanitizing attributes. This allows you to add or
88-- remove specific tags or attributes on the white list, or to use
89-- your own white list.
90--
91-- @safeTagsCustom safeTagName sanitizeAttribute@ is equivalent to
92-- 'safeTags'.
93--
94-- @since 0.3.6
95safeTagsCustom ::
96     (Text -> Bool)                       -- ^ Select safe tags, like
97                                          -- 'safeTagName'
98  -> ((Text, Text) -> Maybe (Text, Text)) -- ^ Sanitize attributes,
99                                          -- like 'sanitizeAttribute'
100  -> [Tag Text] -> [Tag Text]
101safeTagsCustom _ _ [] = []
102safeTagsCustom safeName sanitizeAttr (t@(TagClose name):tags)
103    | safeName name = t : safeTagsCustom safeName sanitizeAttr tags
104    | otherwise = safeTagsCustom safeName sanitizeAttr tags
105safeTagsCustom safeName sanitizeAttr (TagOpen name attributes:tags)
106  | safeName name = TagOpen name (mapMaybe sanitizeAttr attributes) :
107      safeTagsCustom safeName sanitizeAttr tags
108  | otherwise = safeTagsCustom safeName sanitizeAttr tags
109safeTagsCustom n a (t:tags) = t : safeTagsCustom n a tags
110
111safeTagName :: Text -> Bool
112safeTagName tagname = tagname `member` sanitaryTags
113
114safeAttribute :: (Text, Text) -> Bool
115safeAttribute (name, value) = name `member` sanitaryAttributes &&
116  (name `notMember` uri_attributes || sanitaryURI value)
117
118-- | low-level API if you have your own HTML parser. Used by safeTags.
119sanitizeAttribute :: (Text, Text) -> Maybe (Text, Text)
120sanitizeAttribute ("style", value) =
121    let css = sanitizeCSS value
122    in  if T.null css then Nothing else Just ("style", css)
123sanitizeAttribute attr | safeAttribute attr = Just attr
124                       | otherwise = Nothing
125
126
127-- | Returns @True@ if the specified URI is not a potential security risk.
128sanitaryURI :: Text -> Bool
129sanitaryURI u =
130  case parseURIReference (escapeURI $ T.unpack u) of
131     Just p  -> (null (uriScheme p)) ||
132                ((map toLower $ init $ uriScheme p) `member` safeURISchemes)
133     Nothing -> False
134
135
136-- | Escape unicode characters in a URI.  Characters that are
137-- already valid in a URI, including % and ?, are left alone.
138escapeURI :: String -> String
139escapeURI = escapeURIString isAllowedInURI . encodeString
140
141safeURISchemes :: Set String
142safeURISchemes = fromList acceptable_protocols
143
144sanitaryTags :: Set Text
145sanitaryTags = fromList (acceptable_elements ++ mathml_elements ++ svg_elements)
146  \\ (fromList svg_allow_local_href) -- extra filtering not implemented
147
148sanitaryAttributes :: Set Text
149sanitaryAttributes = fromList (allowed_html_uri_attributes ++ acceptable_attributes ++ mathml_attributes ++ svg_attributes)
150  \\ (fromList svg_attr_val_allows_ref) -- extra unescaping not implemented
151
152allowed_html_uri_attributes :: [Text]
153allowed_html_uri_attributes = ["href", "src", "cite", "action", "longdesc"]
154
155uri_attributes :: Set Text
156uri_attributes = fromList $ allowed_html_uri_attributes ++ ["xlink:href", "xml:base"]
157
158acceptable_elements :: [Text]
159acceptable_elements = ["a", "abbr", "acronym", "address", "area",
160    "article", "aside", "audio", "b", "big", "blockquote", "br", "button",
161    "canvas", "caption", "center", "cite", "code", "col", "colgroup",
162    "command", "datagrid", "datalist", "dd", "del", "details", "dfn",
163    "dialog", "dir", "div", "dl", "dt", "em", "event-source", "fieldset",
164    "figcaption", "figure", "footer", "font", "form", "header", "h1", "h2",
165    "h3", "h4", "h5", "h6", "hr", "i", "img", "input", "ins", "keygen",
166    "kbd", "label", "legend", "li", "m", "main", "map", "menu", "meter", "multicol",
167    "nav", "nextid", "ol", "output", "optgroup", "option", "p", "pre",
168    "progress", "q", "s", "samp", "section", "select", "small", "sound",
169    "source", "spacer", "span", "strike", "strong", "sub", "sup", "table",
170    "tbody", "td", "textarea", "time", "tfoot", "th", "thead", "tr", "tt",
171    "u", "ul", "var", "video"]
172
173mathml_elements :: [Text]
174mathml_elements = ["maction", "math", "merror", "mfrac", "mi",
175    "mmultiscripts", "mn", "mo", "mover", "mpadded", "mphantom",
176    "mprescripts", "mroot", "mrow", "mspace", "msqrt", "mstyle", "msub",
177    "msubsup", "msup", "mtable", "mtd", "mtext", "mtr", "munder",
178    "munderover", "none"]
179
180-- this should include altGlyph I think
181svg_elements :: [Text]
182svg_elements = ["a", "animate", "animateColor", "animateMotion",
183    "animateTransform", "clipPath", "circle", "defs", "desc", "ellipse",
184    "font-face", "font-face-name", "font-face-src", "g", "glyph", "hkern",
185    "linearGradient", "line", "marker", "metadata", "missing-glyph",
186    "mpath", "path", "polygon", "polyline", "radialGradient", "rect",
187    "set", "stop", "svg", "switch", "text", "title", "tspan", "use"]
188
189acceptable_attributes :: [Text]
190acceptable_attributes = ["abbr", "accept", "accept-charset", "accesskey",
191    "align", "alt", "autocomplete", "autofocus", "axis",
192    "background", "balance", "bgcolor", "bgproperties", "border",
193    "bordercolor", "bordercolordark", "bordercolorlight", "bottompadding",
194    "cellpadding", "cellspacing", "ch", "challenge", "char", "charoff",
195    "choff", "charset", "checked", "class", "clear", "color",
196    "cols", "colspan", "compact", "contenteditable", "controls", "coords",
197    -- "data", TODO: allow this with further filtering
198    "datafld", "datapagesize", "datasrc", "datetime", "default",
199    "delay", "dir", "disabled", "draggable", "dynsrc", "enctype", "end",
200    "face", "for", "form", "frame", "galleryimg", "gutter", "headers",
201    "height", "hidefocus", "hidden", "high", "hreflang", "hspace",
202    "icon", "id", "inputmode", "ismap", "keytype", "label", "leftspacing",
203    "lang", "list", "loop", "loopcount", "loopend",
204    "loopstart", "low", "lowsrc", "max", "maxlength", "media", "method",
205    "min", "multiple", "name", "nohref", "noshade", "nowrap", "open",
206    "optimum", "pattern", "ping", "point-size", "prompt", "pqg",
207    "radiogroup", "readonly", "rel", "repeat-max", "repeat-min",
208    "replace", "required", "rev", "rightspacing", "rows", "rowspan",
209    "rules", "scope", "selected", "shape", "size", "span", "start",
210    "step",
211    "style", -- gets further filtering
212    "summary", "suppress", "tabindex", "target",
213    "template", "title", "toppadding", "type", "unselectable", "usemap",
214    "urn", "valign", "value", "variable", "volume", "vspace", "vrml",
215    "width", "wrap", "xml:lang"]
216
217acceptable_protocols :: [String]
218acceptable_protocols = [ "ed2k", "ftp", "http", "https", "irc",
219    "mailto", "news", "gopher", "nntp", "telnet", "webcal",
220    "xmpp", "callto", "feed", "urn", "aim", "rsync", "tag",
221    "ssh", "sftp", "rtsp", "afs" ]
222
223mathml_attributes :: [Text]
224mathml_attributes = ["actiontype", "align", "columnalign", "columnalign",
225    "columnalign", "columnlines", "columnspacing", "columnspan", "depth",
226    "display", "displaystyle", "equalcolumns", "equalrows", "fence",
227    "fontstyle", "fontweight", "frame", "height", "linethickness", "lspace",
228    "mathbackground", "mathcolor", "mathvariant", "mathvariant", "maxsize",
229    "minsize", "other", "rowalign", "rowalign", "rowalign", "rowlines",
230    "rowspacing", "rowspan", "rspace", "scriptlevel", "selection",
231    "separator", "stretchy", "width", "width", "xlink:href", "xlink:show",
232    "xlink:type", "xmlns", "xmlns:xlink"]
233
234svg_attributes :: [Text]
235svg_attributes = ["accent-height", "accumulate", "additive", "alphabetic",
236    "arabic-form", "ascent", "attributeName", "attributeType",
237    "baseProfile", "bbox", "begin", "by", "calcMode", "cap-height",
238    "class", "clip-path", "color", "color-rendering", "content", "cx",
239    "cy", "d", "dx", "dy", "descent", "display", "dur", "end", "fill",
240    "fill-opacity", "fill-rule", "font-family", "font-size",
241    "font-stretch", "font-style", "font-variant", "font-weight", "from",
242    "fx", "fy", "g1", "g2", "glyph-name", "gradientUnits", "hanging",
243    "height", "horiz-adv-x", "horiz-origin-x", "id", "ideographic", "k",
244    "keyPoints", "keySplines", "keyTimes", "lang", "marker-end",
245    "marker-mid", "marker-start", "markerHeight", "markerUnits",
246    "markerWidth", "mathematical", "max", "min", "name", "offset",
247    "opacity", "orient", "origin", "overline-position",
248    "overline-thickness", "panose-1", "path", "pathLength", "points",
249    "preserveAspectRatio", "r", "refX", "refY", "repeatCount",
250    "repeatDur", "requiredExtensions", "requiredFeatures", "restart",
251    "rotate", "rx", "ry", "slope", "stemh", "stemv", "stop-color",
252    "stop-opacity", "strikethrough-position", "strikethrough-thickness",
253    "stroke", "stroke-dasharray", "stroke-dashoffset", "stroke-linecap",
254    "stroke-linejoin", "stroke-miterlimit", "stroke-opacity",
255    "stroke-width", "systemLanguage", "target", "text-anchor", "to",
256    "transform", "type", "u1", "u2", "underline-position",
257    "underline-thickness", "unicode", "unicode-range", "units-per-em",
258    "values", "version", "viewBox", "visibility", "width", "widths", "x",
259    "x-height", "x1", "x2", "xlink:actuate", "xlink:arcrole",
260    "xlink:href", "xlink:role", "xlink:show", "xlink:title", "xlink:type",
261    "xml:base", "xml:lang", "xml:space", "xmlns", "xmlns:xlink", "y",
262    "y1", "y2", "zoomAndPan"]
263
264-- the values for these need to be escaped
265svg_attr_val_allows_ref :: [Text]
266svg_attr_val_allows_ref = ["clip-path", "color-profile", "cursor", "fill",
267    "filter", "marker", "marker-start", "marker-mid", "marker-end",
268    "mask", "stroke"]
269
270svg_allow_local_href :: [Text]
271svg_allow_local_href = ["altGlyph", "animate", "animateColor",
272    "animateMotion", "animateTransform", "cursor", "feImage", "filter",
273    "linearGradient", "pattern", "radialGradient", "textpath", "tref",
274    "set", "use"]
275
276