1 // filepatterns.rs
2 //
3 // Copyright 2019 Raphaël Gomès <rgomes@octobus.net>
4 //
5 // This software may be used and distributed according to the terms of the
6 // GNU General Public License version 2 or any later version.
7 
8 //! Handling of Mercurial-specific patterns.
9 
10 use crate::{
11     utils::{
12         files::{canonical_path, get_bytes_from_path, get_path_from_bytes},
13         hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError},
14         SliceExt,
15     },
16     FastHashMap, PatternError,
17 };
18 use lazy_static::lazy_static;
19 use regex::bytes::{NoExpand, Regex};
20 use std::ops::Deref;
21 use std::path::{Path, PathBuf};
22 use std::vec::Vec;
23 
24 lazy_static! {
25     static ref RE_ESCAPE: Vec<Vec<u8>> = {
26         let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
27         let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
28         for byte in to_escape {
29             v[*byte as usize].insert(0, b'\\');
30         }
31         v
32     };
33 }
34 
/// Regex replacements for a glob `*`, keyed by the bytes that follow it.
///
/// These are matched in order: `glob_to_re` uses the first entry whose key
/// is a prefix of the remaining input, so `**/` is tried before `**`, and
/// the empty key is the fallback for a lone `*`.
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
    &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
38 
/// Appended to the regexp of globs: requires the match to be followed by a
/// `/` or by the end of the input.
const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
41 
/// The kind of a pattern, i.e. how its bytes must be interpreted.
///
/// The prefix given for each variant is the one recognized by
/// `parse_pattern_syntax`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PatternSyntax {
    /// A regular expression (`re:`)
    Regexp,
    /// Glob that matches at the front of the path (`rootglob:`)
    RootGlob,
    /// Glob that matches at any suffix of the path (still anchored at
    /// slashes) (`glob:`)
    Glob,
    /// A path relative to repository root, which is matched recursively
    /// (`path:`)
    Path,
    /// A path relative to cwd (`relpath:`)
    RelPath,
    /// An unrooted glob (*.rs matches Rust files in all dirs) (`relglob:`)
    RelGlob,
    /// A regexp that needn't match the start of a name (`relre:`)
    RelRegexp,
    /// A path relative to repository root, which is matched non-recursively
    /// (will not match subdirectories) (`rootfilesin:`)
    RootFiles,
    /// A file of patterns to read and include (`include:`)
    Include,
    /// A file of patterns to match against files under the same directory
    /// (`subinclude:`)
    SubInclude,
    /// SubInclude with the result of parsing the included file
    ///
    /// Note: there is no ExpandedInclude because that expansion can be done
    /// in place by replacing the Include pattern by the included patterns.
    /// SubInclude requires more handling.
    ///
    /// Note: `Box` is used to minimize size impact on other enum variants
    ExpandedSubInclude(Box<SubInclude>),
}
75 
76 /// Transforms a glob pattern into a regex
glob_to_re(pat: &[u8]) -> Vec<u8>77 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
78     let mut input = pat;
79     let mut res: Vec<u8> = vec![];
80     let mut group_depth = 0;
81 
82     while let Some((c, rest)) = input.split_first() {
83         input = rest;
84 
85         match c {
86             b'*' => {
87                 for (source, repl) in GLOB_REPLACEMENTS {
88                     if let Some(rest) = input.drop_prefix(source) {
89                         input = rest;
90                         res.extend(*repl);
91                         break;
92                     }
93                 }
94             }
95             b'?' => res.extend(b"."),
96             b'[' => {
97                 match input.iter().skip(1).position(|b| *b == b']') {
98                     None => res.extend(b"\\["),
99                     Some(end) => {
100                         // Account for the one we skipped
101                         let end = end + 1;
102 
103                         res.extend(b"[");
104 
105                         for (i, b) in input[..end].iter().enumerate() {
106                             if *b == b'!' && i == 0 {
107                                 res.extend(b"^")
108                             } else if *b == b'^' && i == 0 {
109                                 res.extend(b"\\^")
110                             } else if *b == b'\\' {
111                                 res.extend(b"\\\\")
112                             } else {
113                                 res.push(*b)
114                             }
115                         }
116                         res.extend(b"]");
117                         input = &input[end + 1..];
118                     }
119                 }
120             }
121             b'{' => {
122                 group_depth += 1;
123                 res.extend(b"(?:")
124             }
125             b'}' if group_depth > 0 => {
126                 group_depth -= 1;
127                 res.extend(b")");
128             }
129             b',' if group_depth > 0 => res.extend(b"|"),
130             b'\\' => {
131                 let c = {
132                     if let Some((c, rest)) = input.split_first() {
133                         input = rest;
134                         c
135                     } else {
136                         c
137                     }
138                 };
139                 res.extend(&RE_ESCAPE[*c as usize])
140             }
141             _ => res.extend(&RE_ESCAPE[*c as usize]),
142         }
143     }
144     res
145 }
146 
147 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
148     pattern
149         .iter()
150         .flat_map(|c| RE_ESCAPE[*c as usize].clone())
151         .collect()
152 }
153 
154 pub fn parse_pattern_syntax(
155     kind: &[u8],
156 ) -> Result<PatternSyntax, PatternError> {
157     match kind {
158         b"re:" => Ok(PatternSyntax::Regexp),
159         b"path:" => Ok(PatternSyntax::Path),
160         b"relpath:" => Ok(PatternSyntax::RelPath),
161         b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
162         b"relglob:" => Ok(PatternSyntax::RelGlob),
163         b"relre:" => Ok(PatternSyntax::RelRegexp),
164         b"glob:" => Ok(PatternSyntax::Glob),
165         b"rootglob:" => Ok(PatternSyntax::RootGlob),
166         b"include:" => Ok(PatternSyntax::Include),
167         b"subinclude:" => Ok(PatternSyntax::SubInclude),
168         _ => Err(PatternError::UnsupportedSyntax(
169             String::from_utf8_lossy(kind).to_string(),
170         )),
171     }
172 }
173 
174 /// Builds the regex that corresponds to the given pattern.
175 /// If within a `syntax: regexp` context, returns the pattern,
176 /// otherwise, returns the corresponding regex.
177 fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
178     let IgnorePattern {
179         syntax, pattern, ..
180     } = entry;
181     if pattern.is_empty() {
182         return vec![];
183     }
184     match syntax {
185         PatternSyntax::Regexp => pattern.to_owned(),
186         PatternSyntax::RelRegexp => {
187             // The `regex` crate accepts `**` while `re2` and Python's `re`
188             // do not. Checking for `*` correctly triggers the same error all
189             // engines.
190             if pattern[0] == b'^'
191                 || pattern[0] == b'*'
192                 || pattern.starts_with(b".*")
193             {
194                 return pattern.to_owned();
195             }
196             [&b".*"[..], pattern].concat()
197         }
198         PatternSyntax::Path | PatternSyntax::RelPath => {
199             if pattern == b"." {
200                 return vec![];
201             }
202             [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat()
203         }
204         PatternSyntax::RootFiles => {
205             let mut res = if pattern == b"." {
206                 vec![]
207             } else {
208                 // Pattern is a directory name.
209                 [escape_pattern(pattern).as_slice(), b"/"].concat()
210             };
211 
212             // Anything after the pattern must be a non-directory.
213             res.extend(b"[^/]+$");
214             res
215         }
216         PatternSyntax::RelGlob => {
217             let glob_re = glob_to_re(pattern);
218             if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
219                 [b".*", rest, GLOB_SUFFIX].concat()
220             } else {
221                 [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
222             }
223         }
224         PatternSyntax::Glob | PatternSyntax::RootGlob => {
225             [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
226         }
227         PatternSyntax::Include
228         | PatternSyntax::SubInclude
229         | PatternSyntax::ExpandedSubInclude(_) => unreachable!(),
230     }
231 }
232 
/// Bytes whose presence means a glob cannot be treated as an exact path:
/// `build_single_regex` only short-circuits a `RootGlob` that contains none
/// of them.
const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
    [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
235 
/// Normalizes a `/`-separated path given as raw bytes, without touching the
/// filesystem.
///
/// Empty components and `.` components are removed, and `..` cancels the
/// component before it when there is one to cancel. One or two leading
/// slashes are preserved, three or more collapse into one. An empty input
/// or an empty result yields `.`.
///
/// TODO support other platforms
#[cfg(unix)]
pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
    const SEP: u8 = b'/';
    const CURRENT_DIR: &[u8] = b".";
    const PARENT_DIR: &[u8] = b"..";

    if bytes.is_empty() {
        return CURRENT_DIR.to_vec();
    }

    // POSIX allows one or two initial slashes, but treats three or more
    // as single slash.
    let leading_slashes =
        match bytes.iter().take_while(|b| **b == SEP).count() {
            count if count > 2 => 1,
            count => count,
        };

    let mut kept: Vec<&[u8]> = Vec::new();
    for component in bytes.split(|b| *b == SEP) {
        if component.is_empty() || component == CURRENT_DIR {
            continue;
        }
        let ends_with_parent =
            kept.last().map_or(false, |last| *last == PARENT_DIR);
        // A `..` is kept only when there is nothing it can cancel: at the
        // start of a relative path, or right after another kept `..`.
        let must_keep = component != PARENT_DIR
            || (leading_slashes == 0 && kept.is_empty())
            || ends_with_parent;
        if must_keep {
            kept.push(component);
        } else {
            // No-op when `kept` is empty (`..` at the root of an absolute
            // path).
            kept.pop();
        }
    }

    let mut normalized = vec![SEP; leading_slashes];
    normalized.extend(kept.join(&SEP));

    if normalized.is_empty() {
        CURRENT_DIR.to_vec()
    } else {
        normalized
    }
}
277 
278 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
279 /// that don't need to be transformed into a regex.
280 pub fn build_single_regex(
281     entry: &IgnorePattern,
282 ) -> Result<Option<Vec<u8>>, PatternError> {
283     let IgnorePattern {
284         pattern, syntax, ..
285     } = entry;
286     let pattern = match syntax {
287         PatternSyntax::RootGlob
288         | PatternSyntax::Path
289         | PatternSyntax::RelGlob
290         | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
291         PatternSyntax::Include | PatternSyntax::SubInclude => {
292             return Err(PatternError::NonRegexPattern(entry.clone()))
293         }
294         _ => pattern.to_owned(),
295     };
296     if *syntax == PatternSyntax::RootGlob
297         && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
298     {
299         Ok(None)
300     } else {
301         let mut entry = entry.clone();
302         entry.pattern = pattern;
303         Ok(Some(_build_single_regex(&entry)))
304     }
305 }
306 
307 lazy_static! {
308     static ref SYNTAXES: FastHashMap<&'static [u8], &'static [u8]> = {
309         let mut m = FastHashMap::default();
310 
311         m.insert(b"re".as_ref(), b"relre:".as_ref());
312         m.insert(b"regexp".as_ref(), b"relre:".as_ref());
313         m.insert(b"glob".as_ref(), b"relglob:".as_ref());
314         m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
315         m.insert(b"include".as_ref(), b"include:".as_ref());
316         m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
317         m
318     };
319 }
320 
/// Non-fatal problems found while reading a pattern file.
#[derive(Debug)]
pub enum PatternFileWarning {
    /// A `syntax:` line named an unknown syntax: (file path, syntax bytes)
    InvalidSyntax(PathBuf, Vec<u8>),
    /// The pattern file does not exist: file path
    NoSuchFile(PathBuf),
}
328 
329 pub fn parse_pattern_file_contents(
330     lines: &[u8],
331     file_path: &Path,
332     warn: bool,
333 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
334     let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
335 
336     #[allow(clippy::trivial_regex)]
337     let comment_escape_regex = Regex::new(r"\\#").unwrap();
338     let mut inputs: Vec<IgnorePattern> = vec![];
339     let mut warnings: Vec<PatternFileWarning> = vec![];
340 
341     let mut current_syntax = b"relre:".as_ref();
342 
343     for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
344         let line_number = line_number + 1;
345 
346         let line_buf;
347         if line.contains(&b'#') {
348             if let Some(cap) = comment_regex.captures(line) {
349                 line = &line[..cap.get(1).unwrap().end()]
350             }
351             line_buf = comment_escape_regex.replace_all(line, NoExpand(b"#"));
352             line = &line_buf;
353         }
354 
355         let mut line = line.trim_end();
356 
357         if line.is_empty() {
358             continue;
359         }
360 
361         if let Some(syntax) = line.drop_prefix(b"syntax:") {
362             let syntax = syntax.trim();
363 
364             if let Some(rel_syntax) = SYNTAXES.get(syntax) {
365                 current_syntax = rel_syntax;
366             } else if warn {
367                 warnings.push(PatternFileWarning::InvalidSyntax(
368                     file_path.to_owned(),
369                     syntax.to_owned(),
370                 ));
371             }
372             continue;
373         }
374 
375         let mut line_syntax: &[u8] = &current_syntax;
376 
377         for (s, rels) in SYNTAXES.iter() {
378             if let Some(rest) = line.drop_prefix(rels) {
379                 line_syntax = rels;
380                 line = rest;
381                 break;
382             }
383             if let Some(rest) = line.drop_prefix(&[s, &b":"[..]].concat()) {
384                 line_syntax = rels;
385                 line = rest;
386                 break;
387             }
388         }
389 
390         inputs.push(IgnorePattern::new(
391             parse_pattern_syntax(&line_syntax).map_err(|e| match e {
392                 PatternError::UnsupportedSyntax(syntax) => {
393                     PatternError::UnsupportedSyntaxInFile(
394                         syntax,
395                         file_path.to_string_lossy().into(),
396                         line_number,
397                     )
398                 }
399                 _ => e,
400             })?,
401             &line,
402             file_path,
403         ));
404     }
405     Ok((inputs, warnings))
406 }
407 
408 pub fn read_pattern_file(
409     file_path: &Path,
410     warn: bool,
411     inspect_pattern_bytes: &mut impl FnMut(&[u8]),
412 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
413     match std::fs::read(file_path) {
414         Ok(contents) => {
415             inspect_pattern_bytes(&contents);
416             parse_pattern_file_contents(&contents, file_path, warn)
417         }
418         Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok((
419             vec![],
420             vec![PatternFileWarning::NoSuchFile(file_path.to_owned())],
421         )),
422         Err(e) => Err(e.into()),
423     }
424 }
425 
/// Represents an entry in an "ignore" file.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct IgnorePattern {
    /// How `pattern` must be interpreted
    pub syntax: PatternSyntax,
    /// The bytes of the pattern itself; entries produced by
    /// `parse_pattern_file_contents` have their syntax prefix stripped
    pub pattern: Vec<u8>,
    /// Where the pattern comes from; `parse_pattern_file_contents` sets it
    /// to the path of the pattern file
    pub source: PathBuf,
}
433 
434 impl IgnorePattern {
435     pub fn new(syntax: PatternSyntax, pattern: &[u8], source: &Path) -> Self {
436         Self {
437             syntax,
438             pattern: pattern.to_owned(),
439             source: source.to_owned(),
440         }
441     }
442 }
443 
/// Shorthand for a `Result` whose error type is `PatternError`.
pub type PatternResult<T> = Result<T, PatternError>;
445 
446 /// Wrapper for `read_pattern_file` that also recursively expands `include:`
447 /// and `subinclude:` patterns.
448 ///
449 /// The former are expanded in place, while `PatternSyntax::ExpandedSubInclude`
450 /// is used for the latter to form a tree of patterns.
451 pub fn get_patterns_from_file(
452     pattern_file: &Path,
453     root_dir: &Path,
454     inspect_pattern_bytes: &mut impl FnMut(&[u8]),
455 ) -> PatternResult<(Vec<IgnorePattern>, Vec<PatternFileWarning>)> {
456     let (patterns, mut warnings) =
457         read_pattern_file(pattern_file, true, inspect_pattern_bytes)?;
458     let patterns = patterns
459         .into_iter()
460         .flat_map(|entry| -> PatternResult<_> {
461             Ok(match &entry.syntax {
462                 PatternSyntax::Include => {
463                     let inner_include =
464                         root_dir.join(get_path_from_bytes(&entry.pattern));
465                     let (inner_pats, inner_warnings) = get_patterns_from_file(
466                         &inner_include,
467                         root_dir,
468                         inspect_pattern_bytes,
469                     )?;
470                     warnings.extend(inner_warnings);
471                     inner_pats
472                 }
473                 PatternSyntax::SubInclude => {
474                     let mut sub_include = SubInclude::new(
475                         &root_dir,
476                         &entry.pattern,
477                         &entry.source,
478                     )?;
479                     let (inner_patterns, inner_warnings) =
480                         get_patterns_from_file(
481                             &sub_include.path,
482                             &sub_include.root,
483                             inspect_pattern_bytes,
484                         )?;
485                     sub_include.included_patterns = inner_patterns;
486                     warnings.extend(inner_warnings);
487                     vec![IgnorePattern {
488                         syntax: PatternSyntax::ExpandedSubInclude(Box::new(
489                             sub_include,
490                         )),
491                         ..entry
492                     }]
493                 }
494                 _ => vec![entry],
495             })
496         })
497         .flatten()
498         .collect();
499 
500     Ok((patterns, warnings))
501 }
502 
/// Holds all the information needed to handle a `subinclude:` pattern.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct SubInclude {
    /// Will be used for repository (hg) paths that start with this prefix.
    /// It is relative to the current working directory, so comparing against
    /// repository paths is painless.
    /// `SubInclude::new` makes it end with a `/` unless it is empty.
    pub prefix: HgPathBuf,
    /// The file itself, containing the patterns
    pub path: PathBuf,
    /// Folder in the filesystem where it applies
    pub root: PathBuf,

    /// Patterns read from `path`; left empty by `SubInclude::new` and
    /// filled in by `get_patterns_from_file`
    pub included_patterns: Vec<IgnorePattern>,
}
517 
518 impl SubInclude {
519     pub fn new(
520         root_dir: &Path,
521         pattern: &[u8],
522         source: &Path,
523     ) -> Result<SubInclude, HgPathError> {
524         let normalized_source =
525             normalize_path_bytes(&get_bytes_from_path(source));
526 
527         let source_root = get_path_from_bytes(&normalized_source);
528         let source_root =
529             source_root.parent().unwrap_or_else(|| source_root.deref());
530 
531         let path = source_root.join(get_path_from_bytes(pattern));
532         let new_root = path.parent().unwrap_or_else(|| path.deref());
533 
534         let prefix = canonical_path(root_dir, root_dir, new_root)?;
535 
536         Ok(Self {
537             prefix: path_to_hg_path_buf(prefix).and_then(|mut p| {
538                 if !p.is_empty() {
539                     p.push_byte(b'/');
540                 }
541                 Ok(p)
542             })?,
543             path: path.to_owned(),
544             root: new_root.to_owned(),
545             included_patterns: Vec::new(),
546         })
547     }
548 }
549 
550 /// Separate and pre-process subincludes from other patterns for the "ignore"
551 /// phase.
552 pub fn filter_subincludes(
553     ignore_patterns: Vec<IgnorePattern>,
554 ) -> Result<(Vec<Box<SubInclude>>, Vec<IgnorePattern>), HgPathError> {
555     let mut subincludes = vec![];
556     let mut others = vec![];
557 
558     for pattern in ignore_patterns {
559         if let PatternSyntax::ExpandedSubInclude(sub_include) = pattern.syntax
560         {
561             subincludes.push(sub_include);
562         } else {
563             others.push(pattern)
564         }
565     }
566     Ok((subincludes, others))
567 }
568 
// Unit tests for pattern escaping, glob translation, pattern file parsing
// and regex building.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Bytes outside of the escape table are left alone, the others get a
    /// leading backslash.
    #[test]
    fn escape_pattern_test() {
        let untouched =
            br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
        assert_eq!(escape_pattern(untouched), untouched.to_vec());
        // All escape codes
        // Note: these are raw strings, so `\t` and friends are a literal
        // backslash followed by a letter, not control characters.
        assert_eq!(
            escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
                .to_vec()
        );
    }

    /// One assertion per glob construct handled by `glob_to_re`.
    #[test]
    fn glob_test() {
        assert_eq!(glob_to_re(br#"?"#), br#"."#);
        assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#);
        assert_eq!(glob_to_re(br#"**"#), br#".*"#);
        assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#);
        assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#);
        assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#);
        assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#);
        assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
    }

    #[test]
    fn test_parse_pattern_file_contents() {
        // A `syntax:` line applies to the lines that follow it
        let lines = b"syntax: glob\n*.elc";

        assert_eq!(
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap()
                .0,
            vec![IgnorePattern::new(
                PatternSyntax::RelGlob,
                b"*.elc",
                Path::new("file_path")
            )],
        );

        // `syntax:` lines do not produce patterns by themselves
        let lines = b"syntax: include\nsyntax: glob";

        assert_eq!(
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap()
                .0,
            vec![]
        );
        // A per-line prefix is recognized and stripped from the pattern
        let lines = b"glob:**.o";
        assert_eq!(
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap()
                .0,
            vec![IgnorePattern::new(
                PatternSyntax::RelGlob,
                b"**.o",
                Path::new("file_path")
            )]
        );
    }

    #[test]
    fn test_build_single_regex() {
        // The trailing slash is removed by the normalization of the pattern
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RelGlob,
                b"rust/target/",
                Path::new("")
            ))
            .unwrap(),
            Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()),
        );
        // Regular expressions are returned as they are
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::Regexp,
                br"rust/target/\d+",
                Path::new("")
            ))
            .unwrap(),
            Some(br"rust/target/\d+".to_vec()),
        );
    }

    /// Root globs without any special character do not need a regex.
    #[test]
    fn test_build_single_regex_shortcut() {
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RootGlob,
                b"",
                Path::new("")
            ))
            .unwrap(),
            None,
        );
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RootGlob,
                b"whatever",
                Path::new("")
            ))
            .unwrap(),
            None,
        );
        // A special character disables the shortcut
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RootGlob,
                b"*.o",
                Path::new("")
            ))
            .unwrap(),
            Some(br"[^/]*\.o(?:/|$)".to_vec()),
        );
    }
}
688