defmodule HtmlSanitizeEx.Scrubber.Meta do
  @moduledoc """
  This module contains some meta-programming magic to define your own rules
  for scrubbers.

  The StripTags scrubber is a good starting point:

      defmodule HtmlSanitizeEx.Scrubber.StripTags do
        require HtmlSanitizeEx.Scrubber.Meta
        alias HtmlSanitizeEx.Scrubber.Meta

        # Removes any CDATA tags before the traverser/scrubber runs.
        Meta.remove_cdata_sections_before_scrub

        Meta.strip_comments

        Meta.strip_everything_not_covered
      end

  You can use the `allow_tag_with_uri_attributes/3` and
  `allow_tag_with_these_attributes/2` macros to define what is allowed:

      defmodule HtmlSanitizeEx.Scrubber.StripTags do
        require HtmlSanitizeEx.Scrubber.Meta
        alias HtmlSanitizeEx.Scrubber.Meta

        # Removes any CDATA tags before the traverser/scrubber runs.
        Meta.remove_cdata_sections_before_scrub

        Meta.strip_comments

        Meta.allow_tag_with_uri_attributes   "img", ["src"], ["http", "https"]
        Meta.allow_tag_with_these_attributes "img", ["width", "height"]

        Meta.strip_everything_not_covered
      end

  You can stack these if convenient:

      Meta.allow_tag_with_uri_attributes   "img", ["src"], ["http", "https"]
      Meta.allow_tag_with_these_attributes "img", ["width", "height"]
      Meta.allow_tag_with_these_attributes "img", ["title", "alt"]

  All macros expand to `scrub/1` and `scrub_attribute/2` clauses in the calling
  module. Clauses are tried in definition order, so
  `strip_everything_not_covered/0` (which defines the catch-all clauses) has to
  come last.
  """

  @doc """
  Allow these tags and use the regular `scrub_attribute/2` function to scrub
  the attributes.

      Meta.allow_tags_and_scrub_their_attributes ["p", "div"]

  `list` has to be a literal list of tag names, because it is traversed while
  the macro is expanded.
  """
  defmacro allow_tags_and_scrub_their_attributes(list) do
    Enum.map(list, &allow_this_tag_and_scrub_its_attributes/1)
  end

  @doc """
  Allow the given `list` of attributes for the specified `tag_name`.

      Meta.allow_tag_with_these_attributes "a", ["name", "title"]

      Meta.allow_tag_with_these_attributes "img", ["title", "alt"]

  `list` has to be a literal list of attribute names. When it is omitted, the
  tag itself is allowed and its attributes are run through `scrub_attribute/2`.
  """
  defmacro allow_tag_with_these_attributes(tag_name, list \\ []) do
    attribute_clauses =
      Enum.map(list, fn attr_name ->
        allow_this_tag_with_this_attribute(tag_name, attr_name)
      end)

    # The attribute clauses come first, followed by the clause allowing the tag.
    attribute_clauses ++ [allow_this_tag_and_scrub_its_attributes(tag_name)]
  end

  @doc """
  Allow the given list of `values` for the given `attribute` on the
  specified `tag_name`.

      Meta.allow_tag_with_this_attribute_values "a", "target", ["_blank"]

  `values` ends up in a guard (`when value in values`), so it has to be
  something `in/2` accepts inside guards, e.g. a literal list.
  """
  defmacro allow_tag_with_this_attribute_values(tag_name, attribute, values) do
    quote do
      def scrub_attribute(unquote(tag_name), {unquote(attribute), value})
          when value in unquote(values) do
        {unquote(attribute), value}
      end
    end
  end

  @doc """
  Allow the given `list` of attributes to contain URI information for the
  specified `tag`.

      # Only allow SSL-enabled and mailto links
      Meta.allow_tag_with_uri_attributes "a", ["href"], ["https", "mailto"]

      # Only allow non-SSL images
      Meta.allow_tag_with_uri_attributes "img", ["src"], ["http"]

  Values without a protocol separator (e.g. relative paths) are kept. Values
  starting with `&` or using a scheme that is not in `valid_schemes` are
  scrubbed. `list` has to be a literal list of attribute names.
  """
  defmacro allow_tag_with_uri_attributes(tag, list, valid_schemes) do
    Enum.map(list, fn name -> allow_tag_with_uri_attribute(tag, name, valid_schemes) end)
  end

  @doc """
  Allow the `style` attribute on each tag in the given `list`.

      Meta.allow_tags_with_style_attributes ["p", "span"]

  The attribute value is passed through `scrub_css/1`, which is called
  unqualified in the generated code and therefore has to be defined in or
  imported into the scrubber module. `list` has to be a literal list of tag
  names.
  """
  defmacro allow_tags_with_style_attributes(list) do
    Enum.map(list, &allow_this_tag_with_style_attribute/1)
  end

  @doc """
  Removes any CDATA tags before the traverser/scrubber runs.

  Defines `before_scrub/1`, which deletes every opening `<![CDATA[` marker from
  the given HTML. The closing `]]>` marker is left in place.
  """
  defmacro remove_cdata_sections_before_scrub do
    quote do
      def before_scrub(html), do: String.replace(html, "<![CDATA[", "")
    end
  end

  @doc """
  Strips all comments.

  Defines a `scrub/1` clause that replaces any `{:comment, _}` node with an
  empty string.
  """
  defmacro strip_comments do
    quote do
      def scrub({:comment, _children}), do: ""
    end
  end

  @doc """
  Ensures any tags/attributes not explicitly whitelisted until this
  statement are stripped.

  Defines the catch-all clauses: unknown attributes are dropped, unknown tags
  are replaced by their children, the parser's whitespace placeholders are
  resolved and plain text is passed through. Since these clauses match
  everything, this macro has to be called after all other rules.
  """
  defmacro strip_everything_not_covered do
    # Placeholders the parser puts in for whitespace; resolved at expansion time.
    replacement_linebreak = to_string(HtmlSanitizeEx.Parser.replacement_for_linebreak())
    replacement_space = to_string(HtmlSanitizeEx.Parser.replacement_for_space())
    replacement_tab = to_string(HtmlSanitizeEx.Parser.replacement_for_tab())

    quote do
      # If we haven't covered the attribute until here, we just scrub it.
      def scrub_attribute(_tag, _attribute), do: nil

      # If we haven't covered the tag until here, we strip it but keep its
      # children.
      def scrub({_tag, _attributes, children}), do: children

      def scrub({_tag, children}), do: children

      # Whitespace placeholders: linebreaks and tabs are dropped, spaces are
      # turned back into a single space.
      def scrub(unquote(" " <> replacement_linebreak <> " ") <> text), do: text
      def scrub(unquote(" " <> replacement_space <> " ") <> text), do: " " <> text
      def scrub(unquote(" " <> replacement_tab <> " ") <> text), do: text

      # Text is left alone
      def scrub("" <> text), do: text
    end
  end

  # Builds the clauses which keep `tag_name` and run each of its attributes
  # through `scrub_attribute/2`, dropping every attribute that comes back as nil.
  defp allow_this_tag_and_scrub_its_attributes(tag_name) do
    quote do
      def scrub({unquote(tag_name), attributes, children}) do
        {unquote(tag_name), scrub_attributes(unquote(tag_name), attributes), children}
      end

      defp scrub_attributes(unquote(tag_name), attributes) do
        attributes
        |> Enum.map(fn attr -> scrub_attribute(unquote(tag_name), attr) end)
        |> Enum.reject(&is_nil/1)
      end
    end
  end

  # Builds the clause which keeps `attr_name` on `tag_name` with any value.
  defp allow_this_tag_with_this_attribute(tag_name, attr_name) do
    quote do
      def scrub_attribute(unquote(tag_name), {unquote(attr_name), value}) do
        {unquote(attr_name), value}
      end
    end
  end

  # Builds the clause which keeps the "style" attribute on `tag_name` after
  # passing its value through the caller's `scrub_css/1`.
  defp allow_this_tag_with_style_attribute(tag_name) do
    quote do
      def scrub_attribute(unquote(tag_name), {"style", value}) do
        {"style", scrub_css(value)}
      end
    end
  end

  # Builds the clauses which keep the URI attribute `attr_name` on `tag_name`
  # only if its scheme (when it has one) is listed in `valid_schemes`.
  defp allow_tag_with_uri_attribute(tag_name, attr_name, valid_schemes) do
    quote do
      # Values starting with an entity are rejected outright.
      def scrub_attribute(unquote(tag_name), {unquote(attr_name), "&" <> _value}) do
        nil
      end

      def scrub_attribute(unquote(tag_name), {unquote(attr_name), uri}) do
        # The regexes are inlined (instead of being stored in module
        # attributes) so expanding this macro never overwrites attributes of
        # the calling module.
        valid_schema =
          if String.match?(uri, ~r/:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/mi) do
            case Regex.run(~r/(.+?)(:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A)/mi, uri) do
              # An encoded separator fills additional capture groups, so the
              # result can have more than three elements; only the scheme
              # (first group) matters.
              [_, scheme | _] -> scheme in unquote(valid_schemes)
              nil -> false
            end
          else
            # No protocol separator at all, e.g. a relative path.
            true
          end

        if valid_schema, do: {unquote(attr_name), uri}
      end
    end
  end
end
206