1 use aho_corasick::{AhoCorasickBuilder, MatchKind};
2 use lazy_static::lazy_static;
3
4 #[cfg(feature = "js-esbuild")]
5 use {
6 crate::minify::css::MINIFY_CSS_TRANSFORM_OPTIONS, crate::minify::esbuild::minify_using_esbuild,
7 };
8
9 use crate::common::gen::attrs::ATTRS;
10 use crate::common::gen::codepoints::DIGIT;
11 use crate::common::pattern::Replacer;
12 use crate::common::spec::script::JAVASCRIPT_MIME_TYPES;
13 use crate::common::spec::tag::ns::Namespace;
14 use crate::common::whitespace::{
15 collapse_whitespace, left_trim, remove_all_whitespace, right_trim,
16 };
17 use crate::entity::encode::encode_entities;
18 use crate::Cfg;
19
build_double_quoted_replacer() -> Replacer20 fn build_double_quoted_replacer() -> Replacer {
21 let mut patterns = Vec::<Vec<u8>>::new();
22 let mut replacements = Vec::<Vec<u8>>::new();
23
24 // Replace all `"` with `"`, unless the quote is followed by a digit or semicolon,
25 // in which case add a semicolon to the encoded entity.
26 for c in "0123456789;".bytes() {
27 patterns.push(vec![b'"', c]);
28 replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
29 }
30 patterns.push(b"\"".to_vec());
31 replacements.push(b""".to_vec());
32
33 Replacer::new(
34 AhoCorasickBuilder::new()
35 .dfa(true)
36 .match_kind(MatchKind::LeftmostLongest)
37 .build(patterns),
38 replacements,
39 )
40 }
41
build_single_quoted_replacer() -> Replacer42 fn build_single_quoted_replacer() -> Replacer {
43 let mut patterns = Vec::<Vec<u8>>::new();
44 let mut replacements = Vec::<Vec<u8>>::new();
45
46 // Replace all `'` with `'`, unless the quote is followed by a digit or semicolon,
47 // in which case add a semicolon to the encoded entity.
48 for c in "0123456789;".bytes() {
49 patterns.push(vec![b'\'', c]);
50 replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
51 }
52 patterns.push(b"'".to_vec());
53 replacements.push(b"'".to_vec());
54
55 Replacer::new(
56 AhoCorasickBuilder::new()
57 .dfa(true)
58 .match_kind(MatchKind::LeftmostLongest)
59 .build(patterns),
60 replacements,
61 )
62 }
63
// TODO Sync with WHITESPACE definition.
/// Whitespace bytes paired with their unterminated numeric character reference
/// (no trailing `;`). The unquoted replacers append the `;` themselves when the
/// next byte is a digit or semicolon.
static WS: &[(u8, &[u8])] = &[
    (b'\x09', b"&#9"),
    (b'\x0a', b"&#10"),
    (b'\x0c', b"&#12"),
    (b'\x0d', b"&#13"),
    (b'\x20', b"&#32"),
];
72
build_unquoted_replacer() -> Replacer73 fn build_unquoted_replacer() -> Replacer {
74 let mut patterns = Vec::<Vec<u8>>::new();
75 let mut replacements = Vec::<Vec<u8>>::new();
76
77 // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
78 // in which case add a semicolon to the encoded entity.
79 for c in "0123456789;".bytes() {
80 for &(ws, rep) in WS {
81 patterns.push(vec![ws, c]);
82 replacements.push({
83 let mut ent = rep.to_vec();
84 ent.push(b';');
85 ent.push(c);
86 ent
87 });
88 }
89 }
90 for &(ws, rep) in WS {
91 patterns.push(vec![ws]);
92 replacements.push(rep.to_vec());
93 }
94
95 // Replace all `>` with `>`, unless the chevron is followed by a semicolon,
96 // in which case add a semicolon to the encoded entity.
97 // Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`.
98 patterns.push(b">;".to_vec());
99 replacements.push(b">;".to_vec());
100 patterns.push(b">".to_vec());
101 replacements.push(b">".to_vec());
102
103 Replacer::new(
104 AhoCorasickBuilder::new()
105 .dfa(true)
106 .match_kind(MatchKind::LeftmostLongest)
107 .build(patterns),
108 replacements,
109 )
110 }
111
// If spec compliance is required, these characters must also be encoded in an unquoted attr value,
// as well as whitespace, `<`, and `>`.
/// Each byte is paired with its unterminated numeric character reference (no
/// trailing `;`); the WHATWG replacer appends the `;` when the next byte is a
/// digit or semicolon.
static WHATWG_UNQUOTED: &[(u8, &[u8])] = &[
    (b'"', b"&#34"),
    (b'\'', b"&#39"),
    (b'=', b"&#61"),
    (b'`', b"&#96"),
];
120
build_whatwg_unquoted_replacer() -> Replacer121 fn build_whatwg_unquoted_replacer() -> Replacer {
122 let mut patterns = Vec::<Vec<u8>>::new();
123 let mut replacements = Vec::<Vec<u8>>::new();
124
125 // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
126 // in which case add a semicolon to the encoded entity.
127 for c in "0123456789;".bytes() {
128 for &(ws, rep) in WS {
129 patterns.push(vec![ws, c]);
130 replacements.push({
131 let mut ent = rep.to_vec();
132 ent.push(b';');
133 ent.push(c);
134 ent
135 });
136 }
137 }
138 for &(ws, rep) in WS {
139 patterns.push(vec![ws]);
140 replacements.push(rep.to_vec());
141 }
142
143 // Replace WHATWG-disallowed characters with a numeric entity, unless they're followed by a digit or semicolon,
144 // in which case add a semicolon to the encoded entity.
145 for c in "0123456789;".bytes() {
146 for &(ws, rep) in WHATWG_UNQUOTED {
147 patterns.push(vec![ws, c]);
148 replacements.push({
149 let mut ent = rep.to_vec();
150 ent.push(b';');
151 ent.push(c);
152 ent
153 });
154 }
155 }
156 for &(ws, rep) in WHATWG_UNQUOTED {
157 patterns.push(vec![ws]);
158 replacements.push(rep.to_vec());
159 }
160
161 // Replace all `<` with `<`, unless the chevron is followed by a semicolon,
162 // in which case add a semicolon to the encoded entity.
163 // Use `>` instead of `<` as `<` has more conflicting entities e.g. `⪦`, `⋖`.
164 patterns.push(b"<;".to_vec());
165 replacements.push(b"<;".to_vec());
166 patterns.push(b"<".to_vec());
167 replacements.push(b"<".to_vec());
168
169 // Replace all `>` with `>`, unless the chevron is followed by a semicolon,
170 // in which case add a semicolon to the encoded entity.
171 // Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`.
172 patterns.push(b">;".to_vec());
173 replacements.push(b">;".to_vec());
174 patterns.push(b">".to_vec());
175 replacements.push(b">".to_vec());
176
177 Replacer::new(
178 AhoCorasickBuilder::new()
179 .dfa(true)
180 .match_kind(MatchKind::LeftmostLongest)
181 .build(patterns),
182 replacements,
183 )
184 }
185
// Lazily-initialised replacers, one per attribute quoting strategy. Each is
// produced by the matching `build_*_replacer` function above.
lazy_static! {
    // Escapes `"` for double-quoted values.
    static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
    // Escapes `'` for single-quoted values.
    static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
    // Escapes whitespace and `>` for unquoted values.
    static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer();
    // Escapes whitespace, `<`, `>` and the `WHATWG_UNQUOTED` characters for unquoted values.
    static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
}
192
/// An encoded attribute value, stored as `prefix` + `data[start..]` + `suffix`.
pub struct AttrMinifiedValue {
    /// Whether the value is emitted inside quote characters.
    quoted: bool,
    /// Bytes written before the payload.
    prefix: &'static [u8],
    /// Encoded payload; only the bytes from `start` onwards are emitted.
    data: Vec<u8>,
    /// Offset into `data` at which the emitted payload begins.
    start: usize,
    /// Bytes written after the payload.
    suffix: &'static [u8],
}

impl AttrMinifiedValue {
    /// Returns whether this value is wrapped in quotes.
    pub fn quoted(&self) -> bool {
        self.quoted
    }

    /// Returns the number of bytes that `out` will write.
    pub fn len(&self) -> usize {
        let payload_len = self.data.len() - self.start;
        self.prefix.len() + payload_len + self.suffix.len()
    }

    /// Appends the prefix, the payload from `start` onwards, and the suffix to `out`.
    pub fn out(&self, out: &mut Vec<u8>) {
        for part in &[self.prefix, &self.data[self.start..], self.suffix] {
            out.extend_from_slice(part);
        }
    }

    /// Renders the value as a `String`; panics if the bytes are not valid UTF-8.
    #[cfg(test)]
    pub fn str(&self) -> String {
        let mut rendered = Vec::with_capacity(self.len());
        self.out(&mut rendered);
        String::from_utf8(rendered).unwrap()
    }
}
223
encode_using_double_quotes(val: &[u8]) -> AttrMinifiedValue224 pub fn encode_using_double_quotes(val: &[u8]) -> AttrMinifiedValue {
225 AttrMinifiedValue {
226 quoted: true,
227 prefix: b"\"",
228 data: DOUBLE_QUOTED_REPLACER.replace_all(val),
229 start: 0,
230 suffix: b"\"",
231 }
232 }
233
encode_using_single_quotes(val: &[u8]) -> AttrMinifiedValue234 pub fn encode_using_single_quotes(val: &[u8]) -> AttrMinifiedValue {
235 AttrMinifiedValue {
236 quoted: true,
237 prefix: b"'",
238 data: SINGLE_QUOTED_REPLACER.replace_all(val),
239 start: 0,
240 suffix: b"'",
241 }
242 }
243
encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue244 pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue {
245 if whatwg {
246 AttrMinifiedValue {
247 quoted: false,
248 prefix: b"",
249 data: WHATWG_UNQUOTED_REPLACER.replace_all(val),
250 start: 0,
251 suffix: b"",
252 }
253 } else {
254 let data = UNQUOTED_REPLACER.replace_all(val);
255 let prefix: &'static [u8] = match data.get(0) {
256 Some(b'"') => match data.get(1) {
257 Some(&c2) if DIGIT[c2] || c2 == b';' => b""",
258 _ => b""",
259 },
260 Some(b'\'') => match data.get(1) {
261 Some(&c2) if DIGIT[c2] || c2 == b';' => b"'",
262 _ => b"'",
263 },
264 _ => b"",
265 };
266 let start = if !prefix.is_empty() { 1 } else { 0 };
267 AttrMinifiedValue {
268 quoted: false,
269 prefix,
270 data,
271 start,
272 suffix: b"",
273 }
274 }
275 }
276
/// Outcome of minifying a single attribute, as returned by `minify_attr`.
pub enum AttrMinified {
    /// The value is empty on a redundant-if-empty attribute, equals the
    /// attribute's default, or is a JavaScript MIME type (other than `module`)
    /// on `<script type>`.
    Redundant,
    /// The attribute is boolean or its value is empty.
    NoValue,
    /// The attribute has a value, encoded in the shortest of the quoting styles tried.
    Value(AttrMinifiedValue),
}
282
minify_attr( cfg: &Cfg, ns: Namespace, tag: &[u8], is_meta_viewport: bool, name: &[u8], mut value_raw: Vec<u8>, ) -> AttrMinified283 pub fn minify_attr(
284 cfg: &Cfg,
285 ns: Namespace,
286 tag: &[u8],
287 // True if element is <meta> and has an attribute `name` equal to `viewport`.
288 is_meta_viewport: bool,
289 name: &[u8],
290 mut value_raw: Vec<u8>,
291 ) -> AttrMinified {
292 let attr_cfg = ATTRS.get(ns, tag, name);
293
294 let should_collapse = attr_cfg.filter(|attr| attr.collapse).is_some();
295 let should_trim = attr_cfg.filter(|attr| attr.trim).is_some();
296 let should_lowercase = attr_cfg.filter(|attr| attr.case_insensitive).is_some();
297 let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
298 // An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
299 let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
300 let default_value = attr_cfg.and_then(|attr| attr.default_value);
301
302 if is_meta_viewport {
303 remove_all_whitespace(&mut value_raw);
304 } else {
305 // Trim before checking is_boolean as the entire attribute could be redundant post-minification.
306 if should_trim {
307 right_trim(&mut value_raw);
308 left_trim(&mut value_raw);
309 };
310 if should_collapse {
311 collapse_whitespace(&mut value_raw);
312 };
313 };
314
315 #[cfg(feature = "js-esbuild")]
316 if name == b"style" && cfg.minify_css {
317 let mut value_raw_wrapped = Vec::with_capacity(value_raw.len() + 3);
318 // TODO This isn't safe for invalid input e.g. `a}/*`.
319 value_raw_wrapped.extend_from_slice(b"x{");
320 value_raw_wrapped.extend_from_slice(&value_raw);
321 value_raw_wrapped.push(b'}');
322 let mut value_raw_wrapped_min = Vec::with_capacity(value_raw_wrapped.len());
323 minify_using_esbuild(
324 &mut value_raw_wrapped_min,
325 &value_raw_wrapped,
326 &MINIFY_CSS_TRANSFORM_OPTIONS.clone(),
327 );
328 // TODO If input was invalid, wrapper syntax may not exist anymore.
329 if value_raw_wrapped_min.starts_with(b"x{") {
330 value_raw_wrapped_min.drain(0..2);
331 };
332 if value_raw_wrapped_min.ends_with(b"}") {
333 value_raw_wrapped_min.pop();
334 };
335 value_raw = value_raw_wrapped_min;
336 }
337
338 // Make lowercase before checking against default value or JAVASCRIPT_MIME_TYPES.
339 if should_lowercase {
340 value_raw.make_ascii_lowercase();
341 };
342
343 if (value_raw.is_empty() && redundant_if_empty)
344 || default_value.filter(|dv| dv == &value_raw).is_some()
345 || (tag == b"script"
346 && name == b"type"
347 && JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice())
348 && value_raw.as_slice() != b"module")
349 {
350 return AttrMinified::Redundant;
351 };
352
353 if is_boolean || value_raw.is_empty() {
354 return AttrMinified::NoValue;
355 };
356
357 let encoded = encode_entities(&value_raw, true);
358
359 // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
360 let mut min = encode_using_double_quotes(&encoded);
361 let sq = encode_using_single_quotes(&encoded);
362 if sq.len() < min.len() {
363 min = sq;
364 };
365 let uq = encode_unquoted(
366 &encoded,
367 cfg.ensure_spec_compliant_unquoted_attribute_values,
368 );
369 if uq.len() < min.len() {
370 min = uq;
371 };
372 AttrMinified::Value(min)
373 }
374