1 use aho_corasick::{AhoCorasickBuilder, MatchKind};
2 use lazy_static::lazy_static;
3 
4 #[cfg(feature = "js-esbuild")]
5 use {
6     crate::minify::css::MINIFY_CSS_TRANSFORM_OPTIONS, crate::minify::esbuild::minify_using_esbuild,
7 };
8 
9 use crate::common::gen::attrs::ATTRS;
10 use crate::common::gen::codepoints::DIGIT;
11 use crate::common::pattern::Replacer;
12 use crate::common::spec::script::JAVASCRIPT_MIME_TYPES;
13 use crate::common::spec::tag::ns::Namespace;
14 use crate::common::whitespace::{
15     collapse_whitespace, left_trim, remove_all_whitespace, right_trim,
16 };
17 use crate::entity::encode::encode_entities;
18 use crate::Cfg;
19 
build_double_quoted_replacer() -> Replacer20 fn build_double_quoted_replacer() -> Replacer {
21     let mut patterns = Vec::<Vec<u8>>::new();
22     let mut replacements = Vec::<Vec<u8>>::new();
23 
24     // Replace all `"` with `&#34`, unless the quote is followed by a digit or semicolon,
25     // in which case add a semicolon to the encoded entity.
26     for c in "0123456789;".bytes() {
27         patterns.push(vec![b'"', c]);
28         replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
29     }
30     patterns.push(b"\"".to_vec());
31     replacements.push(b"&#34".to_vec());
32 
33     Replacer::new(
34         AhoCorasickBuilder::new()
35             .dfa(true)
36             .match_kind(MatchKind::LeftmostLongest)
37             .build(patterns),
38         replacements,
39     )
40 }
41 
build_single_quoted_replacer() -> Replacer42 fn build_single_quoted_replacer() -> Replacer {
43     let mut patterns = Vec::<Vec<u8>>::new();
44     let mut replacements = Vec::<Vec<u8>>::new();
45 
46     // Replace all `'` with `&#39`, unless the quote is followed by a digit or semicolon,
47     // in which case add a semicolon to the encoded entity.
48     for c in "0123456789;".bytes() {
49         patterns.push(vec![b'\'', c]);
50         replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
51     }
52     patterns.push(b"'".to_vec());
53     replacements.push(b"&#39".to_vec());
54 
55     Replacer::new(
56         AhoCorasickBuilder::new()
57             .dfa(true)
58             .match_kind(MatchKind::LeftmostLongest)
59             .build(patterns),
60         replacements,
61     )
62 }
63 
// TODO Sync with WHITESPACE definition.
// Each entry pairs a whitespace byte with its decimal character reference, written without
// the trailing semicolon.
static WS: &[(u8, &[u8])] = &[
    (b'\t', b"&#9"),
    (b'\n', b"&#10"),
    (b'\x0c', b"&#12"),
    (b'\r', b"&#13"),
    (b' ', b"&#32"),
];
72 
build_unquoted_replacer() -> Replacer73 fn build_unquoted_replacer() -> Replacer {
74     let mut patterns = Vec::<Vec<u8>>::new();
75     let mut replacements = Vec::<Vec<u8>>::new();
76 
77     // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
78     // in which case add a semicolon to the encoded entity.
79     for c in "0123456789;".bytes() {
80         for &(ws, rep) in WS {
81             patterns.push(vec![ws, c]);
82             replacements.push({
83                 let mut ent = rep.to_vec();
84                 ent.push(b';');
85                 ent.push(c);
86                 ent
87             });
88         }
89     }
90     for &(ws, rep) in WS {
91         patterns.push(vec![ws]);
92         replacements.push(rep.to_vec());
93     }
94 
95     // Replace all `>` with `&GT`, unless the chevron is followed by a semicolon,
96     // in which case add a semicolon to the encoded entity.
97     // Use `&GT` instead of `&gt` as `&gt` has more conflicting entities e.g. `&gtcc;`, `&gtdot;`.
98     patterns.push(b">;".to_vec());
99     replacements.push(b"&GT;;".to_vec());
100     patterns.push(b">".to_vec());
101     replacements.push(b"&GT".to_vec());
102 
103     Replacer::new(
104         AhoCorasickBuilder::new()
105             .dfa(true)
106             .match_kind(MatchKind::LeftmostLongest)
107             .build(patterns),
108         replacements,
109     )
110 }
111 
// If spec compliance is required, these characters must also be encoded in an unquoted attr value,
// as well as whitespace, `<`, and `>`.
// Each entry pairs a raw byte with its decimal character reference, written without the
// trailing semicolon; the number must equal the byte's code point.
static WHATWG_UNQUOTED: &[(u8, &[u8])] = &[
    (b'"', b"&#34"),
    (b'\'', b"&#39"),
    (b'=', b"&#61"),
    // U+0060 GRAVE ACCENT is decimal 96 (`&#6` would decode to U+0006 instead).
    (b'`', b"&#96"),
];
120 
build_whatwg_unquoted_replacer() -> Replacer121 fn build_whatwg_unquoted_replacer() -> Replacer {
122     let mut patterns = Vec::<Vec<u8>>::new();
123     let mut replacements = Vec::<Vec<u8>>::new();
124 
125     // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
126     // in which case add a semicolon to the encoded entity.
127     for c in "0123456789;".bytes() {
128         for &(ws, rep) in WS {
129             patterns.push(vec![ws, c]);
130             replacements.push({
131                 let mut ent = rep.to_vec();
132                 ent.push(b';');
133                 ent.push(c);
134                 ent
135             });
136         }
137     }
138     for &(ws, rep) in WS {
139         patterns.push(vec![ws]);
140         replacements.push(rep.to_vec());
141     }
142 
143     // Replace WHATWG-disallowed characters with a numeric entity, unless they're followed by a digit or semicolon,
144     // in which case add a semicolon to the encoded entity.
145     for c in "0123456789;".bytes() {
146         for &(ws, rep) in WHATWG_UNQUOTED {
147             patterns.push(vec![ws, c]);
148             replacements.push({
149                 let mut ent = rep.to_vec();
150                 ent.push(b';');
151                 ent.push(c);
152                 ent
153             });
154         }
155     }
156     for &(ws, rep) in WHATWG_UNQUOTED {
157         patterns.push(vec![ws]);
158         replacements.push(rep.to_vec());
159     }
160 
161     // Replace all `<` with `&LT`, unless the chevron is followed by a semicolon,
162     // in which case add a semicolon to the encoded entity.
163     // Use `&GT` instead of `&lt` as `&lt` has more conflicting entities e.g. `&ltcc;`, `&ltdot;`.
164     patterns.push(b"<;".to_vec());
165     replacements.push(b"&LT;;".to_vec());
166     patterns.push(b"<".to_vec());
167     replacements.push(b"&LT".to_vec());
168 
169     // Replace all `>` with `&GT`, unless the chevron is followed by a semicolon,
170     // in which case add a semicolon to the encoded entity.
171     // Use `&GT` instead of `&gt` as `&gt` has more conflicting entities e.g. `&gtcc;`, `&gtdot;`.
172     patterns.push(b">;".to_vec());
173     replacements.push(b"&GT;;".to_vec());
174     patterns.push(b">".to_vec());
175     replacements.push(b"&GT".to_vec());
176 
177     Replacer::new(
178         AhoCorasickBuilder::new()
179             .dfa(true)
180             .match_kind(MatchKind::LeftmostLongest)
181             .build(patterns),
182         replacements,
183     )
184 }
185 
// Lazily-initialised replacers, one per output form. Each is constructed by its `build_*`
// function and then reused by the `encode_*` functions in this file.
lazy_static! {
    // Escapes `"` for double-quoted output.
    static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
    // Escapes `'` for single-quoted output.
    static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
    // Escapes whitespace and `>` for unquoted output.
    static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer();
    // Escapes whitespace, chevrons and the `WHATWG_UNQUOTED` bytes for spec-compliant unquoted output.
    static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
}
192 
/// An encoded attribute value together with the delimiters needed to emit it.
///
/// The emitted bytes are `prefix`, followed by `data[start..]`, followed by `suffix`.
pub struct AttrMinifiedValue {
    /// Whether this form is a quoted one (set by the `encode_*` constructors).
    quoted: bool,
    /// Bytes written before the value body.
    prefix: &'static [u8],
    /// Encoded value; only the part from `start` onwards is emitted.
    data: Vec<u8>,
    /// Offset into `data` at which the emitted body begins.
    start: usize,
    /// Bytes written after the value body.
    suffix: &'static [u8],
}

impl AttrMinifiedValue {
    /// Returns whether this value is in a quoted form.
    pub fn quoted(&self) -> bool {
        self.quoted
    }

    /// Returns the number of bytes that [`out`](Self::out) will append.
    pub fn len(&self) -> usize {
        let body_len = self.data.len() - self.start;
        self.prefix.len() + body_len + self.suffix.len()
    }

    /// Appends the prefix, the value body and the suffix, in that order, to `out`.
    pub fn out(&self, out: &mut Vec<u8>) {
        out.extend(self.prefix);
        out.extend(&self.data[self.start..]);
        out.extend(self.suffix);
    }

    /// Test helper: renders the value as a `String`, panicking if it is not valid UTF-8.
    #[cfg(test)]
    pub fn str(&self) -> String {
        let mut rendered = Vec::with_capacity(self.len());
        self.out(&mut rendered);
        String::from_utf8(rendered).unwrap()
    }
}
223 
encode_using_double_quotes(val: &[u8]) -> AttrMinifiedValue224 pub fn encode_using_double_quotes(val: &[u8]) -> AttrMinifiedValue {
225     AttrMinifiedValue {
226         quoted: true,
227         prefix: b"\"",
228         data: DOUBLE_QUOTED_REPLACER.replace_all(val),
229         start: 0,
230         suffix: b"\"",
231     }
232 }
233 
encode_using_single_quotes(val: &[u8]) -> AttrMinifiedValue234 pub fn encode_using_single_quotes(val: &[u8]) -> AttrMinifiedValue {
235     AttrMinifiedValue {
236         quoted: true,
237         prefix: b"'",
238         data: SINGLE_QUOTED_REPLACER.replace_all(val),
239         start: 0,
240         suffix: b"'",
241     }
242 }
243 
encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue244 pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue {
245     if whatwg {
246         AttrMinifiedValue {
247             quoted: false,
248             prefix: b"",
249             data: WHATWG_UNQUOTED_REPLACER.replace_all(val),
250             start: 0,
251             suffix: b"",
252         }
253     } else {
254         let data = UNQUOTED_REPLACER.replace_all(val);
255         let prefix: &'static [u8] = match data.get(0) {
256             Some(b'"') => match data.get(1) {
257                 Some(&c2) if DIGIT[c2] || c2 == b';' => b"&#34;",
258                 _ => b"&#34",
259             },
260             Some(b'\'') => match data.get(1) {
261                 Some(&c2) if DIGIT[c2] || c2 == b';' => b"&#39;",
262                 _ => b"&#39",
263             },
264             _ => b"",
265         };
266         let start = if !prefix.is_empty() { 1 } else { 0 };
267         AttrMinifiedValue {
268             quoted: false,
269             prefix,
270             data,
271             start,
272             suffix: b"",
273         }
274     }
275 }
276 
/// Result of minifying a single attribute, as returned by `minify_attr`.
pub enum AttrMinified {
    /// The value is empty on an attribute that is redundant when empty, equals the
    /// attribute's default value, or is a JavaScript MIME type on `<script type>`.
    Redundant,
    /// The attribute is boolean or its minified value is empty, so no value is produced.
    NoValue,
    /// The shortest encoded form of the attribute's value.
    Value(AttrMinifiedValue),
}
282 
minify_attr( cfg: &Cfg, ns: Namespace, tag: &[u8], is_meta_viewport: bool, name: &[u8], mut value_raw: Vec<u8>, ) -> AttrMinified283 pub fn minify_attr(
284     cfg: &Cfg,
285     ns: Namespace,
286     tag: &[u8],
287     // True if element is <meta> and has an attribute `name` equal to `viewport`.
288     is_meta_viewport: bool,
289     name: &[u8],
290     mut value_raw: Vec<u8>,
291 ) -> AttrMinified {
292     let attr_cfg = ATTRS.get(ns, tag, name);
293 
294     let should_collapse = attr_cfg.filter(|attr| attr.collapse).is_some();
295     let should_trim = attr_cfg.filter(|attr| attr.trim).is_some();
296     let should_lowercase = attr_cfg.filter(|attr| attr.case_insensitive).is_some();
297     let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
298     // An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
299     let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
300     let default_value = attr_cfg.and_then(|attr| attr.default_value);
301 
302     if is_meta_viewport {
303         remove_all_whitespace(&mut value_raw);
304     } else {
305         // Trim before checking is_boolean as the entire attribute could be redundant post-minification.
306         if should_trim {
307             right_trim(&mut value_raw);
308             left_trim(&mut value_raw);
309         };
310         if should_collapse {
311             collapse_whitespace(&mut value_raw);
312         };
313     };
314 
315     #[cfg(feature = "js-esbuild")]
316     if name == b"style" && cfg.minify_css {
317         let mut value_raw_wrapped = Vec::with_capacity(value_raw.len() + 3);
318         // TODO This isn't safe for invalid input e.g. `a}/*`.
319         value_raw_wrapped.extend_from_slice(b"x{");
320         value_raw_wrapped.extend_from_slice(&value_raw);
321         value_raw_wrapped.push(b'}');
322         let mut value_raw_wrapped_min = Vec::with_capacity(value_raw_wrapped.len());
323         minify_using_esbuild(
324             &mut value_raw_wrapped_min,
325             &value_raw_wrapped,
326             &MINIFY_CSS_TRANSFORM_OPTIONS.clone(),
327         );
328         // TODO If input was invalid, wrapper syntax may not exist anymore.
329         if value_raw_wrapped_min.starts_with(b"x{") {
330             value_raw_wrapped_min.drain(0..2);
331         };
332         if value_raw_wrapped_min.ends_with(b"}") {
333             value_raw_wrapped_min.pop();
334         };
335         value_raw = value_raw_wrapped_min;
336     }
337 
338     // Make lowercase before checking against default value or JAVASCRIPT_MIME_TYPES.
339     if should_lowercase {
340         value_raw.make_ascii_lowercase();
341     };
342 
343     if (value_raw.is_empty() && redundant_if_empty)
344         || default_value.filter(|dv| dv == &value_raw).is_some()
345         || (tag == b"script"
346             && name == b"type"
347             && JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice())
348             && value_raw.as_slice() != b"module")
349     {
350         return AttrMinified::Redundant;
351     };
352 
353     if is_boolean || value_raw.is_empty() {
354         return AttrMinified::NoValue;
355     };
356 
357     let encoded = encode_entities(&value_raw, true);
358 
359     // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
360     let mut min = encode_using_double_quotes(&encoded);
361     let sq = encode_using_single_quotes(&encoded);
362     if sq.len() < min.len() {
363         min = sq;
364     };
365     let uq = encode_unquoted(
366         &encoded,
367         cfg.ensure_spec_compliant_unquoted_attribute_values,
368     );
369     if uq.len() < min.len() {
370         min = uq;
371     };
372     AttrMinified::Value(min)
373 }
374