1 // filepatterns.rs
2 //
3 // Copyright 2019 Raphaël Gomès <rgomes@octobus.net>
4 //
5 // This software may be used and distributed according to the terms of the
6 // GNU General Public License version 2 or any later version.
7
8 //! Handling of Mercurial-specific patterns.
9
10 use crate::{
11 utils::{
12 files::{canonical_path, get_bytes_from_path, get_path_from_bytes},
13 hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError},
14 SliceExt,
15 },
16 FastHashMap, PatternError,
17 };
18 use lazy_static::lazy_static;
19 use regex::bytes::{NoExpand, Regex};
20 use std::ops::Deref;
21 use std::path::{Path, PathBuf};
22 use std::vec::Vec;
23
24 lazy_static! {
25 static ref RE_ESCAPE: Vec<Vec<u8>> = {
26 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
27 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
28 for byte in to_escape {
29 v[*byte as usize].insert(0, b'\\');
30 }
31 v
32 };
33 }
34
/// Regex translations for a glob `*`, keyed by the bytes that follow it.
///
/// Entries are tried in order and the first key that prefixes the remaining
/// input wins; the empty key always matches, so it has to stay last.
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = &[
    (b"*/", b"(?:.*/)?"),
    (b"*", b".*"),
    (b"", b"[^/]*"),
];

/// Tail added to the regex generated for a glob.
const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
41
42 #[derive(Debug, Clone, PartialEq, Eq)]
43 pub enum PatternSyntax {
44 /// A regular expression
45 Regexp,
46 /// Glob that matches at the front of the path
47 RootGlob,
48 /// Glob that matches at any suffix of the path (still anchored at
49 /// slashes)
50 Glob,
51 /// a path relative to repository root, which is matched recursively
52 Path,
53 /// A path relative to cwd
54 RelPath,
55 /// an unrooted glob (*.rs matches Rust files in all dirs)
56 RelGlob,
57 /// A regexp that needn't match the start of a name
58 RelRegexp,
59 /// A path relative to repository root, which is matched non-recursively
60 /// (will not match subdirectories)
61 RootFiles,
62 /// A file of patterns to read and include
63 Include,
64 /// A file of patterns to match against files under the same directory
65 SubInclude,
66 /// SubInclude with the result of parsing the included file
67 ///
68 /// Note: there is no ExpandedInclude because that expansion can be done
69 /// in place by replacing the Include pattern by the included patterns.
70 /// SubInclude requires more handling.
71 ///
72 /// Note: `Box` is used to minimize size impact on other enum variants
73 ExpandedSubInclude(Box<SubInclude>),
74 }
75
76 /// Transforms a glob pattern into a regex
glob_to_re(pat: &[u8]) -> Vec<u8>77 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
78 let mut input = pat;
79 let mut res: Vec<u8> = vec![];
80 let mut group_depth = 0;
81
82 while let Some((c, rest)) = input.split_first() {
83 input = rest;
84
85 match c {
86 b'*' => {
87 for (source, repl) in GLOB_REPLACEMENTS {
88 if let Some(rest) = input.drop_prefix(source) {
89 input = rest;
90 res.extend(*repl);
91 break;
92 }
93 }
94 }
95 b'?' => res.extend(b"."),
96 b'[' => {
97 match input.iter().skip(1).position(|b| *b == b']') {
98 None => res.extend(b"\\["),
99 Some(end) => {
100 // Account for the one we skipped
101 let end = end + 1;
102
103 res.extend(b"[");
104
105 for (i, b) in input[..end].iter().enumerate() {
106 if *b == b'!' && i == 0 {
107 res.extend(b"^")
108 } else if *b == b'^' && i == 0 {
109 res.extend(b"\\^")
110 } else if *b == b'\\' {
111 res.extend(b"\\\\")
112 } else {
113 res.push(*b)
114 }
115 }
116 res.extend(b"]");
117 input = &input[end + 1..];
118 }
119 }
120 }
121 b'{' => {
122 group_depth += 1;
123 res.extend(b"(?:")
124 }
125 b'}' if group_depth > 0 => {
126 group_depth -= 1;
127 res.extend(b")");
128 }
129 b',' if group_depth > 0 => res.extend(b"|"),
130 b'\\' => {
131 let c = {
132 if let Some((c, rest)) = input.split_first() {
133 input = rest;
134 c
135 } else {
136 c
137 }
138 };
139 res.extend(&RE_ESCAPE[*c as usize])
140 }
141 _ => res.extend(&RE_ESCAPE[*c as usize]),
142 }
143 }
144 res
145 }
146
147 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
148 pattern
149 .iter()
150 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
151 .collect()
152 }
153
154 pub fn parse_pattern_syntax(
155 kind: &[u8],
156 ) -> Result<PatternSyntax, PatternError> {
157 match kind {
158 b"re:" => Ok(PatternSyntax::Regexp),
159 b"path:" => Ok(PatternSyntax::Path),
160 b"relpath:" => Ok(PatternSyntax::RelPath),
161 b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
162 b"relglob:" => Ok(PatternSyntax::RelGlob),
163 b"relre:" => Ok(PatternSyntax::RelRegexp),
164 b"glob:" => Ok(PatternSyntax::Glob),
165 b"rootglob:" => Ok(PatternSyntax::RootGlob),
166 b"include:" => Ok(PatternSyntax::Include),
167 b"subinclude:" => Ok(PatternSyntax::SubInclude),
168 _ => Err(PatternError::UnsupportedSyntax(
169 String::from_utf8_lossy(kind).to_string(),
170 )),
171 }
172 }
173
174 /// Builds the regex that corresponds to the given pattern.
175 /// If within a `syntax: regexp` context, returns the pattern,
176 /// otherwise, returns the corresponding regex.
177 fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
178 let IgnorePattern {
179 syntax, pattern, ..
180 } = entry;
181 if pattern.is_empty() {
182 return vec![];
183 }
184 match syntax {
185 PatternSyntax::Regexp => pattern.to_owned(),
186 PatternSyntax::RelRegexp => {
187 // The `regex` crate accepts `**` while `re2` and Python's `re`
188 // do not. Checking for `*` correctly triggers the same error all
189 // engines.
190 if pattern[0] == b'^'
191 || pattern[0] == b'*'
192 || pattern.starts_with(b".*")
193 {
194 return pattern.to_owned();
195 }
196 [&b".*"[..], pattern].concat()
197 }
198 PatternSyntax::Path | PatternSyntax::RelPath => {
199 if pattern == b"." {
200 return vec![];
201 }
202 [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat()
203 }
204 PatternSyntax::RootFiles => {
205 let mut res = if pattern == b"." {
206 vec![]
207 } else {
208 // Pattern is a directory name.
209 [escape_pattern(pattern).as_slice(), b"/"].concat()
210 };
211
212 // Anything after the pattern must be a non-directory.
213 res.extend(b"[^/]+$");
214 res
215 }
216 PatternSyntax::RelGlob => {
217 let glob_re = glob_to_re(pattern);
218 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
219 [b".*", rest, GLOB_SUFFIX].concat()
220 } else {
221 [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
222 }
223 }
224 PatternSyntax::Glob | PatternSyntax::RootGlob => {
225 [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
226 }
227 PatternSyntax::Include
228 | PatternSyntax::SubInclude
229 | PatternSyntax::ExpandedSubInclude(_) => unreachable!(),
230 }
231 }
232
/// Bytes whose presence means a glob cannot be treated as a plain path.
const GLOB_SPECIAL_CHARACTERS: [u8; 7] = *b"*?[]{}\\";
235
/// Normalizes a `/`-separated path given as bytes: empty and `.` components
/// are dropped and `..` components are resolved against their parent where
/// possible. An empty result is returned as `.`.
///
/// TODO support other platforms
#[cfg(unix)]
pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
    const SEP: u8 = b'/';
    const CUR_DIR: &[u8] = b".";
    const PARENT_DIR: &[u8] = b"..";

    if bytes.is_empty() {
        return CUR_DIR.to_vec();
    }

    // POSIX allows one or two initial slashes, but treats three or more
    // as single slash.
    let leading_slashes = match bytes.iter().take_while(|&&b| b == SEP).count()
    {
        n if n > 2 => 1,
        n => n,
    };

    let mut kept: Vec<&[u8]> = Vec::new();
    for component in bytes.split(|&b| b == SEP) {
        if component.is_empty() || component == CUR_DIR {
            continue;
        }
        let parent_is_dotdot =
            kept.last().map_or(false, |last| *last == PARENT_DIR);
        // A `..` is kept only when there is nothing it could cancel: at the
        // start of a relative path, or right after another kept `..`.
        let keep = component != PARENT_DIR
            || parent_is_dotdot
            || (leading_slashes == 0 && kept.is_empty());
        if keep {
            kept.push(component);
        } else {
            // No-op when `kept` is empty (`..` at the root of an absolute
            // path).
            kept.pop();
        }
    }

    let mut normalized = vec![SEP; leading_slashes];
    normalized.extend(kept.join(&SEP));

    if normalized.is_empty() {
        CUR_DIR.to_vec()
    } else {
        normalized
    }
}
277
278 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
279 /// that don't need to be transformed into a regex.
280 pub fn build_single_regex(
281 entry: &IgnorePattern,
282 ) -> Result<Option<Vec<u8>>, PatternError> {
283 let IgnorePattern {
284 pattern, syntax, ..
285 } = entry;
286 let pattern = match syntax {
287 PatternSyntax::RootGlob
288 | PatternSyntax::Path
289 | PatternSyntax::RelGlob
290 | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
291 PatternSyntax::Include | PatternSyntax::SubInclude => {
292 return Err(PatternError::NonRegexPattern(entry.clone()))
293 }
294 _ => pattern.to_owned(),
295 };
296 if *syntax == PatternSyntax::RootGlob
297 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
298 {
299 Ok(None)
300 } else {
301 let mut entry = entry.clone();
302 entry.pattern = pattern;
303 Ok(Some(_build_single_regex(&entry)))
304 }
305 }
306
307 lazy_static! {
308 static ref SYNTAXES: FastHashMap<&'static [u8], &'static [u8]> = {
309 let mut m = FastHashMap::default();
310
311 m.insert(b"re".as_ref(), b"relre:".as_ref());
312 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
313 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
314 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
315 m.insert(b"include".as_ref(), b"include:".as_ref());
316 m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
317 m
318 };
319 }
320
/// Non-fatal problems found while reading a pattern file.
#[derive(Debug)]
pub enum PatternFileWarning {
    /// An unknown syntax was requested: (file path, syntax bytes)
    InvalidSyntax(PathBuf, Vec<u8>),
    /// The pattern file at this path does not exist
    NoSuchFile(PathBuf),
}
328
329 pub fn parse_pattern_file_contents(
330 lines: &[u8],
331 file_path: &Path,
332 warn: bool,
333 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
334 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
335
336 #[allow(clippy::trivial_regex)]
337 let comment_escape_regex = Regex::new(r"\\#").unwrap();
338 let mut inputs: Vec<IgnorePattern> = vec![];
339 let mut warnings: Vec<PatternFileWarning> = vec![];
340
341 let mut current_syntax = b"relre:".as_ref();
342
343 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
344 let line_number = line_number + 1;
345
346 let line_buf;
347 if line.contains(&b'#') {
348 if let Some(cap) = comment_regex.captures(line) {
349 line = &line[..cap.get(1).unwrap().end()]
350 }
351 line_buf = comment_escape_regex.replace_all(line, NoExpand(b"#"));
352 line = &line_buf;
353 }
354
355 let mut line = line.trim_end();
356
357 if line.is_empty() {
358 continue;
359 }
360
361 if let Some(syntax) = line.drop_prefix(b"syntax:") {
362 let syntax = syntax.trim();
363
364 if let Some(rel_syntax) = SYNTAXES.get(syntax) {
365 current_syntax = rel_syntax;
366 } else if warn {
367 warnings.push(PatternFileWarning::InvalidSyntax(
368 file_path.to_owned(),
369 syntax.to_owned(),
370 ));
371 }
372 continue;
373 }
374
375 let mut line_syntax: &[u8] = ¤t_syntax;
376
377 for (s, rels) in SYNTAXES.iter() {
378 if let Some(rest) = line.drop_prefix(rels) {
379 line_syntax = rels;
380 line = rest;
381 break;
382 }
383 if let Some(rest) = line.drop_prefix(&[s, &b":"[..]].concat()) {
384 line_syntax = rels;
385 line = rest;
386 break;
387 }
388 }
389
390 inputs.push(IgnorePattern::new(
391 parse_pattern_syntax(&line_syntax).map_err(|e| match e {
392 PatternError::UnsupportedSyntax(syntax) => {
393 PatternError::UnsupportedSyntaxInFile(
394 syntax,
395 file_path.to_string_lossy().into(),
396 line_number,
397 )
398 }
399 _ => e,
400 })?,
401 &line,
402 file_path,
403 ));
404 }
405 Ok((inputs, warnings))
406 }
407
408 pub fn read_pattern_file(
409 file_path: &Path,
410 warn: bool,
411 inspect_pattern_bytes: &mut impl FnMut(&[u8]),
412 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
413 match std::fs::read(file_path) {
414 Ok(contents) => {
415 inspect_pattern_bytes(&contents);
416 parse_pattern_file_contents(&contents, file_path, warn)
417 }
418 Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok((
419 vec![],
420 vec![PatternFileWarning::NoSuchFile(file_path.to_owned())],
421 )),
422 Err(e) => Err(e.into()),
423 }
424 }
425
426 /// Represents an entry in an "ignore" file.
427 #[derive(Debug, Eq, PartialEq, Clone)]
428 pub struct IgnorePattern {
429 pub syntax: PatternSyntax,
430 pub pattern: Vec<u8>,
431 pub source: PathBuf,
432 }
433
434 impl IgnorePattern {
435 pub fn new(syntax: PatternSyntax, pattern: &[u8], source: &Path) -> Self {
436 Self {
437 syntax,
438 pattern: pattern.to_owned(),
439 source: source.to_owned(),
440 }
441 }
442 }
443
444 pub type PatternResult<T> = Result<T, PatternError>;
445
446 /// Wrapper for `read_pattern_file` that also recursively expands `include:`
447 /// and `subinclude:` patterns.
448 ///
449 /// The former are expanded in place, while `PatternSyntax::ExpandedSubInclude`
450 /// is used for the latter to form a tree of patterns.
451 pub fn get_patterns_from_file(
452 pattern_file: &Path,
453 root_dir: &Path,
454 inspect_pattern_bytes: &mut impl FnMut(&[u8]),
455 ) -> PatternResult<(Vec<IgnorePattern>, Vec<PatternFileWarning>)> {
456 let (patterns, mut warnings) =
457 read_pattern_file(pattern_file, true, inspect_pattern_bytes)?;
458 let patterns = patterns
459 .into_iter()
460 .flat_map(|entry| -> PatternResult<_> {
461 Ok(match &entry.syntax {
462 PatternSyntax::Include => {
463 let inner_include =
464 root_dir.join(get_path_from_bytes(&entry.pattern));
465 let (inner_pats, inner_warnings) = get_patterns_from_file(
466 &inner_include,
467 root_dir,
468 inspect_pattern_bytes,
469 )?;
470 warnings.extend(inner_warnings);
471 inner_pats
472 }
473 PatternSyntax::SubInclude => {
474 let mut sub_include = SubInclude::new(
475 &root_dir,
476 &entry.pattern,
477 &entry.source,
478 )?;
479 let (inner_patterns, inner_warnings) =
480 get_patterns_from_file(
481 &sub_include.path,
482 &sub_include.root,
483 inspect_pattern_bytes,
484 )?;
485 sub_include.included_patterns = inner_patterns;
486 warnings.extend(inner_warnings);
487 vec![IgnorePattern {
488 syntax: PatternSyntax::ExpandedSubInclude(Box::new(
489 sub_include,
490 )),
491 ..entry
492 }]
493 }
494 _ => vec![entry],
495 })
496 })
497 .flatten()
498 .collect();
499
500 Ok((patterns, warnings))
501 }
502
503 /// Holds all the information needed to handle a `subinclude:` pattern.
504 #[derive(Debug, PartialEq, Eq, Clone)]
505 pub struct SubInclude {
506 /// Will be used for repository (hg) paths that start with this prefix.
507 /// It is relative to the current working directory, so comparing against
508 /// repository paths is painless.
509 pub prefix: HgPathBuf,
510 /// The file itself, containing the patterns
511 pub path: PathBuf,
512 /// Folder in the filesystem where this it applies
513 pub root: PathBuf,
514
515 pub included_patterns: Vec<IgnorePattern>,
516 }
517
518 impl SubInclude {
519 pub fn new(
520 root_dir: &Path,
521 pattern: &[u8],
522 source: &Path,
523 ) -> Result<SubInclude, HgPathError> {
524 let normalized_source =
525 normalize_path_bytes(&get_bytes_from_path(source));
526
527 let source_root = get_path_from_bytes(&normalized_source);
528 let source_root =
529 source_root.parent().unwrap_or_else(|| source_root.deref());
530
531 let path = source_root.join(get_path_from_bytes(pattern));
532 let new_root = path.parent().unwrap_or_else(|| path.deref());
533
534 let prefix = canonical_path(root_dir, root_dir, new_root)?;
535
536 Ok(Self {
537 prefix: path_to_hg_path_buf(prefix).and_then(|mut p| {
538 if !p.is_empty() {
539 p.push_byte(b'/');
540 }
541 Ok(p)
542 })?,
543 path: path.to_owned(),
544 root: new_root.to_owned(),
545 included_patterns: Vec::new(),
546 })
547 }
548 }
549
550 /// Separate and pre-process subincludes from other patterns for the "ignore"
551 /// phase.
552 pub fn filter_subincludes(
553 ignore_patterns: Vec<IgnorePattern>,
554 ) -> Result<(Vec<Box<SubInclude>>, Vec<IgnorePattern>), HgPathError> {
555 let mut subincludes = vec![];
556 let mut others = vec![];
557
558 for pattern in ignore_patterns {
559 if let PatternSyntax::ExpandedSubInclude(sub_include) = pattern.syntax
560 {
561 subincludes.push(sub_include);
562 } else {
563 others.push(pattern)
564 }
565 }
566 Ok((subincludes, others))
567 }
568
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Parses `lines` as the contents of `file_path`, warnings disabled, and
    /// returns the patterns only.
    fn parsed_patterns(lines: &[u8]) -> Vec<IgnorePattern> {
        let (patterns, _warnings) =
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap();
        patterns
    }

    /// Runs `build_single_regex` on a pattern with an empty source path.
    fn regex_for(syntax: PatternSyntax, pattern: &[u8]) -> Option<Vec<u8>> {
        let entry = IgnorePattern::new(syntax, pattern, Path::new(""));
        build_single_regex(&entry).unwrap()
    }

    #[test]
    fn escape_pattern_test() {
        let untouched =
            br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
        assert_eq!(escape_pattern(untouched), untouched.to_vec());

        // All escape codes
        let special = br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#;
        let escaped =
            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#;
        assert_eq!(escape_pattern(special), escaped.to_vec());
    }

    #[test]
    fn glob_test() {
        let cases: &[(&[u8], &[u8])] = &[
            (br#"?"#, br#"."#),
            (br#"*"#, br#"[^/]*"#),
            (br#"**"#, br#".*"#),
            (br#"**/a"#, br#"(?:.*/)?a"#),
            (br#"a/**/b"#, br#"a/(?:.*/)?b"#),
            (br#"[a*?!^][^b][!c]"#, br#"[a*?!^][\^b][^c]"#),
            (br#"{a,b}"#, br#"(?:a|b)"#),
            (br#".\*\?"#, br#"\.\*\?"#),
        ];

        for (glob, expected) in cases {
            assert_eq!(glob_to_re(glob), expected.to_vec());
        }
    }

    #[test]
    fn test_parse_pattern_file_contents() {
        let rel_glob = |pattern: &[u8]| {
            IgnorePattern::new(
                PatternSyntax::RelGlob,
                pattern,
                Path::new("file_path"),
            )
        };

        assert_eq!(
            parsed_patterns(b"syntax: glob\n*.elc"),
            vec![rel_glob(b"*.elc")],
        );

        assert_eq!(parsed_patterns(b"syntax: include\nsyntax: glob"), vec![]);

        assert_eq!(parsed_patterns(b"glob:**.o"), vec![rel_glob(b"**.o")]);
    }

    #[test]
    fn test_build_single_regex() {
        assert_eq!(
            regex_for(PatternSyntax::RelGlob, b"rust/target/"),
            Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()),
        );
        assert_eq!(
            regex_for(PatternSyntax::Regexp, br"rust/target/\d+"),
            Some(br"rust/target/\d+".to_vec()),
        );
    }

    #[test]
    fn test_build_single_regex_shortcut() {
        assert_eq!(regex_for(PatternSyntax::RootGlob, b""), None);
        assert_eq!(regex_for(PatternSyntax::RootGlob, b"whatever"), None);
        assert_eq!(
            regex_for(PatternSyntax::RootGlob, b"*.o"),
            Some(br"[^/]*\.o(?:/|$)".to_vec()),
        );
    }
}
688