1 use std::cell::RefCell;
2 use std::cmp;
3 use std::fmt;
4 use std::fs::File;
5 use std::io::{self, Read};
6 use std::path::Path;
7 
8 use crate::line_buffer::{
9     self, alloc_error, BufferAllocation, LineBuffer, LineBufferBuilder,
10     LineBufferReader, DEFAULT_BUFFER_CAPACITY,
11 };
12 use crate::searcher::glue::{MultiLine, ReadByLine, SliceByLine};
13 use crate::sink::{Sink, SinkError};
14 use encoding_rs;
15 use encoding_rs_io::DecodeReaderBytesBuilder;
16 use grep_matcher::{LineTerminator, Match, Matcher};
17 
18 pub use self::mmap::MmapChoice;
19 
20 mod core;
21 mod glue;
22 mod mmap;
23 
/// An internal alias for the matcher's `Match` type.
///
/// We want the ergonomics of a matcher's `Match` type, but in practice we
/// use it for arbitrary ranges rather than only for matches, so it is given
/// a more accurate name. This is only used in the searcher's internals.
type Range = Match;
28 
/// The behavior of binary detection while searching.
///
/// Binary detection is the process of _heuristically_ identifying whether a
/// given chunk of data is binary or not, and then taking an action based on
/// the result of that heuristic. The motivation behind detecting binary data
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
///
/// Unfortunately, binary detection works differently depending on the type of
/// search being executed:
///
/// 1. When performing a search using a fixed size buffer, binary detection is
///    applied to the buffer's contents as it is filled. Binary detection must
///    be applied to the buffer directly because binary files may not contain
///    line terminators, which could result in exorbitant memory usage.
/// 2. When performing a search using memory maps or by reading data off the
///    heap, then binary detection is only guaranteed to be applied to the
///    parts corresponding to a match. When `Quit` is enabled, then the first
///    few KB of the data are searched for binary data.
///
/// This type is a thin public wrapper around the line buffer's own
/// `BinaryDetection` type; use the `none`, `quit` and `convert` constructors
/// to create a value.
#[derive(Clone, Debug, Default)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
51 
52 impl BinaryDetection {
53     /// No binary detection is performed. Data reported by the searcher may
54     /// contain arbitrary bytes.
55     ///
56     /// This is the default.
none() -> BinaryDetection57     pub fn none() -> BinaryDetection {
58         BinaryDetection(line_buffer::BinaryDetection::None)
59     }
60 
61     /// Binary detection is performed by looking for the given byte.
62     ///
63     /// When searching is performed using a fixed size buffer, then the
64     /// contents of that buffer are always searched for the presence of this
65     /// byte. If it is found, then the underlying data is considered binary
66     /// and the search stops as if it reached EOF.
67     ///
68     /// When searching is performed with the entire contents mapped into
69     /// memory, then binary detection is more conservative. Namely, only a
70     /// fixed sized region at the beginning of the contents are detected for
71     /// binary data. As a compromise, any subsequent matching (or context)
72     /// lines are also searched for binary data. If binary data is detected at
73     /// any point, then the search stops as if it reached EOF.
quit(binary_byte: u8) -> BinaryDetection74     pub fn quit(binary_byte: u8) -> BinaryDetection {
75         BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
76     }
77 
78     /// Binary detection is performed by looking for the given byte, and
79     /// replacing it with the line terminator configured on the searcher.
80     /// (If the searcher is configured to use `CRLF` as the line terminator,
81     /// then this byte is replaced by just `LF`.)
82     ///
83     /// When searching is performed using a fixed size buffer, then the
84     /// contents of that buffer are always searched for the presence of this
85     /// byte and replaced with the line terminator. In effect, the caller is
86     /// guaranteed to never observe this byte while searching.
87     ///
88     /// When searching is performed with the entire contents mapped into
89     /// memory, then this setting has no effect and is ignored.
convert(binary_byte: u8) -> BinaryDetection90     pub fn convert(binary_byte: u8) -> BinaryDetection {
91         BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
92     }
93 
94     /// If this binary detection uses the "quit" strategy, then this returns
95     /// the byte that will cause a search to quit. In any other case, this
96     /// returns `None`.
quit_byte(&self) -> Option<u8>97     pub fn quit_byte(&self) -> Option<u8> {
98         match self.0 {
99             line_buffer::BinaryDetection::Quit(b) => Some(b),
100             _ => None,
101         }
102     }
103 
104     /// If this binary detection uses the "convert" strategy, then this returns
105     /// the byte that will be replaced by the line terminator. In any other
106     /// case, this returns `None`.
convert_byte(&self) -> Option<u8>107     pub fn convert_byte(&self) -> Option<u8> {
108         match self.0 {
109             line_buffer::BinaryDetection::Convert(b) => Some(b),
110             _ => None,
111         }
112     }
113 }
114 
/// An encoding to use when searching.
///
/// An encoding can be used to configure a
/// [`SearcherBuilder`](struct.SearcherBuilder.html)
/// to transcode source data from an encoding to UTF-8 before searching.
///
/// An `Encoding` will always be cheap to clone: it only wraps a `'static`
/// reference to an `encoding_rs::Encoding`.
#[derive(Clone, Debug)]
pub struct Encoding(&'static encoding_rs::Encoding);
124 
125 impl Encoding {
126     /// Create a new encoding for the specified label.
127     ///
128     /// The encoding label provided is mapped to an encoding via the set of
129     /// available choices specified in the
130     /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
131     /// If the given label does not correspond to a valid encoding, then this
132     /// returns an error.
new(label: &str) -> Result<Encoding, ConfigError>133     pub fn new(label: &str) -> Result<Encoding, ConfigError> {
134         let label = label.as_bytes();
135         match encoding_rs::Encoding::for_label_no_replacement(label) {
136             Some(encoding) => Ok(Encoding(encoding)),
137             None => {
138                 Err(ConfigError::UnknownEncoding { label: label.to_vec() })
139             }
140         }
141     }
142 }
143 
/// The internal configuration of a searcher. This is shared among several
/// search related types, but is only ever written to by the SearcherBuilder.
///
/// The values used when nothing is configured are given by the `Default`
/// implementation for this type.
#[derive(Clone, Debug)]
pub struct Config {
    /// The line terminator to use.
    line_term: LineTerminator,
    /// Whether to invert matching.
    invert_match: bool,
    /// The number of lines after a match to include.
    after_context: usize,
    /// The number of lines before a match to include.
    before_context: usize,
    /// Whether to enable unbounded context or not.
    ///
    /// When this is enabled, `SearcherBuilder::build` resets both
    /// `before_context` and `after_context` to `0`.
    passthru: bool,
    /// Whether to count line numbers.
    line_number: bool,
    /// The maximum amount of heap memory to use.
    ///
    /// When not given, no explicit limit is enforced. When set to `0`, then
    /// only the memory map search strategy is available.
    heap_limit: Option<usize>,
    /// The memory map strategy.
    mmap: MmapChoice,
    /// The binary data detection strategy.
    binary: BinaryDetection,
    /// Whether to enable matching across multiple lines.
    multi_line: bool,
    /// An encoding that, when present, causes the searcher to transcode all
    /// input from the encoding to UTF-8.
    encoding: Option<Encoding>,
    /// Whether to do automatic transcoding based on a BOM or not.
    bom_sniffing: bool,
}
177 
178 impl Default for Config {
default() -> Config179     fn default() -> Config {
180         Config {
181             line_term: LineTerminator::default(),
182             invert_match: false,
183             after_context: 0,
184             before_context: 0,
185             passthru: false,
186             line_number: true,
187             heap_limit: None,
188             mmap: MmapChoice::default(),
189             binary: BinaryDetection::default(),
190             multi_line: false,
191             encoding: None,
192             bom_sniffing: true,
193         }
194     }
195 }
196 
197 impl Config {
198     /// Return the maximal amount of lines needed to fulfill this
199     /// configuration's context.
200     ///
201     /// If this returns `0`, then no context is ever needed.
max_context(&self) -> usize202     fn max_context(&self) -> usize {
203         cmp::max(self.before_context, self.after_context)
204     }
205 
206     /// Build a line buffer from this configuration.
line_buffer(&self) -> LineBuffer207     fn line_buffer(&self) -> LineBuffer {
208         let mut builder = LineBufferBuilder::new();
209         builder
210             .line_terminator(self.line_term.as_byte())
211             .binary_detection(self.binary.0);
212 
213         if let Some(limit) = self.heap_limit {
214             let (capacity, additional) = if limit <= DEFAULT_BUFFER_CAPACITY {
215                 (limit, 0)
216             } else {
217                 (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
218             };
219             builder
220                 .capacity(capacity)
221                 .buffer_alloc(BufferAllocation::Error(additional));
222         }
223         builder.build()
224     }
225 }
226 
/// An error that describes a nonsensical or unusable searcher configuration.
///
/// `Encoding::new` returns this error when it is given an unknown label. The
/// search methods on `Searcher` also check the configuration against the
/// matcher they are given (via `check_config`) and convert any resulting
/// error into the sink's error type.
///
/// NOTE(review): `SearcherBuilder::build` itself is infallible, so none of
/// these errors are reported at the time a `Searcher` is built.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ConfigError {
    /// Indicates that the heap limit configuration prevents all possible
    /// search strategies from being used. For example, if the heap limit is
    /// set to 0 and memory map searching is disabled or unavailable.
    SearchUnavailable,
    /// Occurs when a matcher reports a line terminator that is different than
    /// the one configured in the searcher.
    MismatchedLineTerminators {
        /// The matcher's line terminator.
        matcher: LineTerminator,
        /// The searcher's line terminator.
        searcher: LineTerminator,
    },
    /// Occurs when no encoding could be found for a particular label.
    UnknownEncoding {
        /// The provided encoding label that could not be found.
        label: Vec<u8>,
    },
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    ///
    /// This variant is never meant to be constructed; formatting it with
    /// `Display` panics.
    #[doc(hidden)]
    __Nonexhaustive,
}
258 
impl ::std::error::Error for ConfigError {
    // This only yields a fixed, generic summary that is the same for every
    // variant. The variant specific message comes from the `Display`
    // implementation.
    //
    // NOTE(review): `Error::description` is deprecated in the standard
    // library in favor of `Display`; this override is kept as-is so that
    // existing callers continue to observe the same string.
    fn description(&self) -> &str {
        "grep-searcher configuration error"
    }
}
264 
265 impl fmt::Display for ConfigError {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result266     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
267         match *self {
268             ConfigError::SearchUnavailable => {
269                 write!(f, "grep config error: no available searchers")
270             }
271             ConfigError::MismatchedLineTerminators { matcher, searcher } => {
272                 write!(
273                     f,
274                     "grep config error: mismatched line terminators, \
275                      matcher has {:?} but searcher has {:?}",
276                     matcher, searcher
277                 )
278             }
279             ConfigError::UnknownEncoding { ref label } => write!(
280                 f,
281                 "grep config error: unknown encoding: {}",
282                 String::from_utf8_lossy(label),
283             ),
284             _ => panic!("BUG: unexpected variant found"),
285         }
286     }
287 }
288 
/// A builder for configuring a searcher.
///
/// A search builder permits specifying the configuration of a searcher,
/// including options like whether to invert the search or to enable multi
/// line search.
///
/// Once a searcher has been built, it is beneficial to reuse that searcher
/// for multiple searches, if possible.
#[derive(Clone, Debug)]
pub struct SearcherBuilder {
    /// The configuration accumulated so far. `build` clones it into the
    /// searcher it creates, so a builder can be reused after building.
    config: Config,
}
301 
302 impl Default for SearcherBuilder {
default() -> SearcherBuilder303     fn default() -> SearcherBuilder {
304         SearcherBuilder::new()
305     }
306 }
307 
308 impl SearcherBuilder {
309     /// Create a new searcher builder with a default configuration.
new() -> SearcherBuilder310     pub fn new() -> SearcherBuilder {
311         SearcherBuilder { config: Config::default() }
312     }
313 
314     /// Build a searcher with the given matcher.
build(&self) -> Searcher315     pub fn build(&self) -> Searcher {
316         let mut config = self.config.clone();
317         if config.passthru {
318             config.before_context = 0;
319             config.after_context = 0;
320         }
321 
322         let mut decode_builder = DecodeReaderBytesBuilder::new();
323         decode_builder
324             .encoding(self.config.encoding.as_ref().map(|e| e.0))
325             .utf8_passthru(true)
326             .strip_bom(self.config.bom_sniffing)
327             .bom_override(true)
328             .bom_sniffing(self.config.bom_sniffing);
329 
330         Searcher {
331             config: config,
332             decode_builder: decode_builder,
333             decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]),
334             line_buffer: RefCell::new(self.config.line_buffer()),
335             multi_line_buffer: RefCell::new(vec![]),
336         }
337     }
338 
339     /// Set the line terminator that is used by the searcher.
340     ///
341     /// When using a searcher, if the matcher provided has a line terminator
342     /// set, then it must be the same as this one. If they aren't, building
343     /// a searcher will return an error.
344     ///
345     /// By default, this is set to `b'\n'`.
line_terminator( &mut self, line_term: LineTerminator, ) -> &mut SearcherBuilder346     pub fn line_terminator(
347         &mut self,
348         line_term: LineTerminator,
349     ) -> &mut SearcherBuilder {
350         self.config.line_term = line_term;
351         self
352     }
353 
354     /// Whether to invert matching, whereby lines that don't match are reported
355     /// instead of reporting lines that do match.
356     ///
357     /// By default, this is disabled.
invert_match(&mut self, yes: bool) -> &mut SearcherBuilder358     pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
359         self.config.invert_match = yes;
360         self
361     }
362 
363     /// Whether to count and include line numbers with matching lines.
364     ///
365     /// This is enabled by default. There is a small performance penalty
366     /// associated with computing line numbers, so this can be disabled when
367     /// this isn't desirable.
line_number(&mut self, yes: bool) -> &mut SearcherBuilder368     pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
369         self.config.line_number = yes;
370         self
371     }
372 
373     /// Whether to enable multi line search or not.
374     ///
375     /// When multi line search is enabled, matches *may* match across multiple
376     /// lines. Conversely, when multi line search is disabled, it is impossible
377     /// for any match to span more than one line.
378     ///
379     /// **Warning:** multi line search requires having the entire contents to
380     /// search mapped in memory at once. When searching files, memory maps
381     /// will be used if possible and if they are enabled, which avoids using
382     /// your program's heap. However, if memory maps cannot be used (e.g.,
383     /// for searching streams like `stdin` or if transcoding is necessary),
384     /// then the entire contents of the stream are read on to the heap before
385     /// starting the search.
386     ///
387     /// This is disabled by default.
multi_line(&mut self, yes: bool) -> &mut SearcherBuilder388     pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
389         self.config.multi_line = yes;
390         self
391     }
392 
393     /// Whether to include a fixed number of lines after every match.
394     ///
395     /// When this is set to a non-zero number, then the searcher will report
396     /// `line_count` contextual lines after every match.
397     ///
398     /// This is set to `0` by default.
after_context( &mut self, line_count: usize, ) -> &mut SearcherBuilder399     pub fn after_context(
400         &mut self,
401         line_count: usize,
402     ) -> &mut SearcherBuilder {
403         self.config.after_context = line_count;
404         self
405     }
406 
407     /// Whether to include a fixed number of lines before every match.
408     ///
409     /// When this is set to a non-zero number, then the searcher will report
410     /// `line_count` contextual lines before every match.
411     ///
412     /// This is set to `0` by default.
before_context( &mut self, line_count: usize, ) -> &mut SearcherBuilder413     pub fn before_context(
414         &mut self,
415         line_count: usize,
416     ) -> &mut SearcherBuilder {
417         self.config.before_context = line_count;
418         self
419     }
420 
421     /// Whether to enable the "passthru" feature or not.
422     ///
423     /// When passthru is enabled, it effectively treats all non-matching lines
424     /// as contextual lines. In other words, enabling this is akin to
425     /// requesting an unbounded number of before and after contextual lines.
426     ///
427     /// When passthru mode is enabled, any `before_context` or `after_context`
428     /// settings are ignored by setting them to `0`.
429     ///
430     /// This is disabled by default.
passthru(&mut self, yes: bool) -> &mut SearcherBuilder431     pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
432         self.config.passthru = yes;
433         self
434     }
435 
436     /// Set an approximate limit on the amount of heap space used by a
437     /// searcher.
438     ///
439     /// The heap limit is enforced in two scenarios:
440     ///
441     /// * When searching using a fixed size buffer, the heap limit controls
442     ///   how big this buffer is allowed to be. Assuming contexts are disabled,
443     ///   the minimum size of this buffer is the length (in bytes) of the
444     ///   largest single line in the contents being searched. If any line
445     ///   exceeds the heap limit, then an error will be returned.
446     /// * When performing a multi line search, a fixed size buffer cannot be
447     ///   used. Thus, the only choices are to read the entire contents on to
448     ///   the heap, or use memory maps. In the former case, the heap limit set
449     ///   here is enforced.
450     ///
451     /// If a heap limit is set to `0`, then no heap space is used. If there are
452     /// no alternative strategies available for searching without heap space
453     /// (e.g., memory maps are disabled), then the searcher wil return an error
454     /// immediately.
455     ///
456     /// By default, no limit is set.
heap_limit( &mut self, bytes: Option<usize>, ) -> &mut SearcherBuilder457     pub fn heap_limit(
458         &mut self,
459         bytes: Option<usize>,
460     ) -> &mut SearcherBuilder {
461         self.config.heap_limit = bytes;
462         self
463     }
464 
465     /// Set the strategy to employ use of memory maps.
466     ///
467     /// Currently, there are only two strategies that can be employed:
468     ///
469     /// * **Automatic** - A searcher will use heuristics, including but not
470     ///   limited to file size and platform, to determine whether to use memory
471     ///   maps or not.
472     /// * **Never** - Memory maps will never be used. If multi line search is
473     ///   enabled, then the entire contents will be read on to the heap before
474     ///   searching begins.
475     ///
476     /// The default behavior is **never**. Generally speaking, and perhaps
477     /// against conventional wisdom, memory maps don't necessarily enable
478     /// faster searching. For example, depending on the platform, using memory
479     /// maps while searching a large directory can actually be quite a bit
480     /// slower than using normal read calls because of the overhead of managing
481     /// the memory maps.
482     ///
483     /// Memory maps can be faster in some cases however. On some platforms,
484     /// when searching a very large file that *is already in memory*, it can
485     /// be slightly faster to search it as a memory map instead of using
486     /// normal read calls.
487     ///
488     /// Finally, memory maps have a somewhat complicated safety story in Rust.
489     /// If you aren't sure whether enabling memory maps is worth it, then just
490     /// don't bother with it.
491     ///
492     /// **WARNING**: If your process is searching a file backed memory map
493     /// at the same time that file is truncated, then it's possible for the
494     /// process to terminate with a bus error.
memory_map( &mut self, strategy: MmapChoice, ) -> &mut SearcherBuilder495     pub fn memory_map(
496         &mut self,
497         strategy: MmapChoice,
498     ) -> &mut SearcherBuilder {
499         self.config.mmap = strategy;
500         self
501     }
502 
503     /// Set the binary detection strategy.
504     ///
505     /// The binary detection strategy determines not only how the searcher
506     /// detects binary data, but how it responds to the presence of binary
507     /// data. See the [`BinaryDetection`](struct.BinaryDetection.html) type
508     /// for more information.
509     ///
510     /// By default, binary detection is disabled.
binary_detection( &mut self, detection: BinaryDetection, ) -> &mut SearcherBuilder511     pub fn binary_detection(
512         &mut self,
513         detection: BinaryDetection,
514     ) -> &mut SearcherBuilder {
515         self.config.binary = detection;
516         self
517     }
518 
519     /// Set the encoding used to read the source data before searching.
520     ///
521     /// When an encoding is provided, then the source data is _unconditionally_
522     /// transcoded using the encoding, unless a BOM is present. If a BOM is
523     /// present, then the encoding indicated by the BOM is used instead. If the
524     /// transcoding process encounters an error, then bytes are replaced with
525     /// the Unicode replacement codepoint.
526     ///
527     /// When no encoding is specified (the default), then BOM sniffing is
528     /// used (if it's enabled, which it is, by default) to determine whether
529     /// the source data is UTF-8 or UTF-16, and transcoding will be performed
530     /// automatically. If no BOM could be found, then the source data is
531     /// searched _as if_ it were UTF-8. However, so long as the source data is
532     /// at least ASCII compatible, then it is possible for a search to produce
533     /// useful results.
encoding( &mut self, encoding: Option<Encoding>, ) -> &mut SearcherBuilder534     pub fn encoding(
535         &mut self,
536         encoding: Option<Encoding>,
537     ) -> &mut SearcherBuilder {
538         self.config.encoding = encoding;
539         self
540     }
541 
542     /// Enable automatic transcoding based on BOM sniffing.
543     ///
544     /// When this is enabled and an explicit encoding is not set, then this
545     /// searcher will try to detect the encoding of the bytes being searched
546     /// by sniffing its byte-order mark (BOM). In particular, when this is
547     /// enabled, UTF-16 encoded files will be searched seamlessly.
548     ///
549     /// When this is disabled and if an explicit encoding is not set, then
550     /// the bytes from the source stream will be passed through unchanged,
551     /// including its BOM, if one is present.
552     ///
553     /// This is enabled by default.
bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder554     pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
555         self.config.bom_sniffing = yes;
556         self
557     }
558 }
559 
/// A searcher executes searches over a haystack and writes results to a caller
/// provided sink.
///
/// Matches are detected via implementations of the `Matcher` trait, which must
/// be provided by the caller when executing a search.
///
/// When possible, a searcher should be reused. A searcher owns several
/// buffers (see its fields) that are kept between searches.
#[derive(Clone, Debug)]
pub struct Searcher {
    /// The configuration for this searcher.
    ///
    /// We make most of these settings available to users of `Searcher` via
    /// public API methods, which can be queried in implementations of `Sink`
    /// if necessary.
    config: Config,
    /// A builder for constructing a streaming reader that transcodes source
    /// data according to either an explicitly specified encoding or via an
    /// automatically detected encoding via BOM sniffing.
    ///
    /// When no transcoding is needed, then the transcoder built will pass
    /// through the underlying bytes with no additional overhead.
    decode_builder: DecodeReaderBytesBuilder,
    /// A buffer that is used for transcoding scratch space.
    ///
    /// It is handed to `decode_builder` each time a reader is searched.
    decode_buffer: RefCell<Vec<u8>>,
    /// A line buffer for use in line oriented searching.
    ///
    /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
    /// to sinks. We still require a mutable borrow to execute a search, so
    /// we statically prevent callers from causing RefCell to panic at runtime
    /// due to a borrowing violation.
    line_buffer: RefCell<LineBuffer>,
    /// A buffer in which to store the contents of a reader when performing a
    /// multi line search. In particular, multi line searches cannot be
    /// performed incrementally, and need the entire haystack in memory at
    /// once.
    multi_line_buffer: RefCell<Vec<u8>>,
}
597 
598 impl Searcher {
599     /// Create a new searcher with a default configuration.
600     ///
601     /// To configure the searcher (e.g., invert matching, enable memory maps,
602     /// enable contexts, etc.), use the
603     /// [`SearcherBuilder`](struct.SearcherBuilder.html).
new() -> Searcher604     pub fn new() -> Searcher {
605         SearcherBuilder::new().build()
606     }
607 
608     /// Execute a search over the file with the given path and write the
609     /// results to the given sink.
610     ///
611     /// If memory maps are enabled and the searcher heuristically believes
612     /// memory maps will help the search run faster, then this will use
613     /// memory maps. For this reason, callers should prefer using this method
614     /// or `search_file` over the more generic `search_reader` when possible.
search_path<P, M, S>( &mut self, matcher: M, path: P, write_to: S, ) -> Result<(), S::Error> where P: AsRef<Path>, M: Matcher, S: Sink,615     pub fn search_path<P, M, S>(
616         &mut self,
617         matcher: M,
618         path: P,
619         write_to: S,
620     ) -> Result<(), S::Error>
621     where
622         P: AsRef<Path>,
623         M: Matcher,
624         S: Sink,
625     {
626         let path = path.as_ref();
627         let file = File::open(path).map_err(S::Error::error_io)?;
628         self.search_file_maybe_path(matcher, Some(path), &file, write_to)
629     }
630 
631     /// Execute a search over a file and write the results to the given sink.
632     ///
633     /// If memory maps are enabled and the searcher heuristically believes
634     /// memory maps will help the search run faster, then this will use
635     /// memory maps. For this reason, callers should prefer using this method
636     /// or `search_path` over the more generic `search_reader` when possible.
search_file<M, S>( &mut self, matcher: M, file: &File, write_to: S, ) -> Result<(), S::Error> where M: Matcher, S: Sink,637     pub fn search_file<M, S>(
638         &mut self,
639         matcher: M,
640         file: &File,
641         write_to: S,
642     ) -> Result<(), S::Error>
643     where
644         M: Matcher,
645         S: Sink,
646     {
647         self.search_file_maybe_path(matcher, None, file, write_to)
648     }
649 
search_file_maybe_path<M, S>( &mut self, matcher: M, path: Option<&Path>, file: &File, write_to: S, ) -> Result<(), S::Error> where M: Matcher, S: Sink,650     fn search_file_maybe_path<M, S>(
651         &mut self,
652         matcher: M,
653         path: Option<&Path>,
654         file: &File,
655         write_to: S,
656     ) -> Result<(), S::Error>
657     where
658         M: Matcher,
659         S: Sink,
660     {
661         if let Some(mmap) = self.config.mmap.open(file, path) {
662             log::trace!("{:?}: searching via memory map", path);
663             return self.search_slice(matcher, &mmap, write_to);
664         }
665         // Fast path for multi-line searches of files when memory maps are
666         // not enabled. This pre-allocates a buffer roughly the size of the
667         // file, which isn't possible when searching an arbitrary io::Read.
668         if self.multi_line_with_matcher(&matcher) {
669             log::trace!(
670                 "{:?}: reading entire file on to heap for mulitline",
671                 path
672             );
673             self.fill_multi_line_buffer_from_file::<S>(file)?;
674             log::trace!("{:?}: searching via multiline strategy", path);
675             MultiLine::new(
676                 self,
677                 matcher,
678                 &*self.multi_line_buffer.borrow(),
679                 write_to,
680             )
681             .run()
682         } else {
683             log::trace!("{:?}: searching using generic reader", path);
684             self.search_reader(matcher, file, write_to)
685         }
686     }
687 
688     /// Execute a search over any implementation of `io::Read` and write the
689     /// results to the given sink.
690     ///
691     /// When possible, this implementation will search the reader incrementally
692     /// without reading it into memory. In some cases---for example, if multi
693     /// line search is enabled---an incremental search isn't possible and the
694     /// given reader is consumed completely and placed on the heap before
695     /// searching begins. For this reason, when multi line search is enabled,
696     /// one should try to use higher level APIs (e.g., searching by file or
697     /// file path) so that memory maps can be used if they are available and
698     /// enabled.
search_reader<M, R, S>( &mut self, matcher: M, read_from: R, write_to: S, ) -> Result<(), S::Error> where M: Matcher, R: io::Read, S: Sink,699     pub fn search_reader<M, R, S>(
700         &mut self,
701         matcher: M,
702         read_from: R,
703         write_to: S,
704     ) -> Result<(), S::Error>
705     where
706         M: Matcher,
707         R: io::Read,
708         S: Sink,
709     {
710         self.check_config(&matcher).map_err(S::Error::error_config)?;
711 
712         let mut decode_buffer = self.decode_buffer.borrow_mut();
713         let decoder = self
714             .decode_builder
715             .build_with_buffer(read_from, &mut *decode_buffer)
716             .map_err(S::Error::error_io)?;
717 
718         if self.multi_line_with_matcher(&matcher) {
719             log::trace!(
720                 "generic reader: reading everything to heap for multiline"
721             );
722             self.fill_multi_line_buffer_from_reader::<_, S>(decoder)?;
723             log::trace!("generic reader: searching via multiline strategy");
724             MultiLine::new(
725                 self,
726                 matcher,
727                 &*self.multi_line_buffer.borrow(),
728                 write_to,
729             )
730             .run()
731         } else {
732             let mut line_buffer = self.line_buffer.borrow_mut();
733             let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
734             log::trace!("generic reader: searching via roll buffer strategy");
735             ReadByLine::new(self, matcher, rdr, write_to).run()
736         }
737     }
738 
739     /// Execute a search over the given slice and write the results to the
740     /// given sink.
search_slice<M, S>( &mut self, matcher: M, slice: &[u8], write_to: S, ) -> Result<(), S::Error> where M: Matcher, S: Sink,741     pub fn search_slice<M, S>(
742         &mut self,
743         matcher: M,
744         slice: &[u8],
745         write_to: S,
746     ) -> Result<(), S::Error>
747     where
748         M: Matcher,
749         S: Sink,
750     {
751         self.check_config(&matcher).map_err(S::Error::error_config)?;
752 
753         // We can search the slice directly, unless we need to do transcoding.
754         if self.slice_needs_transcoding(slice) {
755             log::trace!(
756                 "slice reader: needs transcoding, using generic reader"
757             );
758             return self.search_reader(matcher, slice, write_to);
759         }
760         if self.multi_line_with_matcher(&matcher) {
761             log::trace!("slice reader: searching via multiline strategy");
762             MultiLine::new(self, matcher, slice, write_to).run()
763         } else {
764             log::trace!("slice reader: searching via slice-by-line strategy");
765             SliceByLine::new(self, matcher, slice, write_to).run()
766         }
767     }
768 
769     /// Set the binary detection method used on this searcher.
set_binary_detection(&mut self, detection: BinaryDetection)770     pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
771         self.config.binary = detection.clone();
772         self.line_buffer.borrow_mut().set_binary_detection(detection.0);
773     }
774 
775     /// Check that the searcher's configuration and the matcher are consistent
776     /// with each other.
check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError>777     fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
778         if self.config.heap_limit == Some(0) && !self.config.mmap.is_enabled()
779         {
780             return Err(ConfigError::SearchUnavailable);
781         }
782         let matcher_line_term = match matcher.line_terminator() {
783             None => return Ok(()),
784             Some(line_term) => line_term,
785         };
786         if matcher_line_term != self.config.line_term {
787             return Err(ConfigError::MismatchedLineTerminators {
788                 matcher: matcher_line_term,
789                 searcher: self.config.line_term,
790             });
791         }
792         Ok(())
793     }
794 
795     /// Returns true if and only if the given slice needs to be transcoded.
slice_needs_transcoding(&self, slice: &[u8]) -> bool796     fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
797         self.config.encoding.is_some()
798             || (self.config.bom_sniffing && slice_has_bom(slice))
799     }
800 }
801 
802 /// The following methods permit querying the configuration of a searcher.
803 /// These can be useful in generic implementations of
804 /// [`Sink`](trait.Sink.html),
805 /// where the output may be tailored based on how the searcher is configured.
806 impl Searcher {
807     /// Returns the line terminator used by this searcher.
808     #[inline]
line_terminator(&self) -> LineTerminator809     pub fn line_terminator(&self) -> LineTerminator {
810         self.config.line_term
811     }
812 
813     /// Returns the type of binary detection configured on this searcher.
814     #[inline]
binary_detection(&self) -> &BinaryDetection815     pub fn binary_detection(&self) -> &BinaryDetection {
816         &self.config.binary
817     }
818 
819     /// Returns true if and only if this searcher is configured to invert its
820     /// search results. That is, matching lines are lines that do **not** match
821     /// the searcher's matcher.
822     #[inline]
invert_match(&self) -> bool823     pub fn invert_match(&self) -> bool {
824         self.config.invert_match
825     }
826 
827     /// Returns true if and only if this searcher is configured to count line
828     /// numbers.
829     #[inline]
line_number(&self) -> bool830     pub fn line_number(&self) -> bool {
831         self.config.line_number
832     }
833 
834     /// Returns true if and only if this searcher is configured to perform
835     /// multi line search.
836     #[inline]
multi_line(&self) -> bool837     pub fn multi_line(&self) -> bool {
838         self.config.multi_line
839     }
840 
841     /// Returns true if and only if this searcher will choose a multi-line
842     /// strategy given the provided matcher.
843     ///
844     /// This may diverge from the result of `multi_line` in cases where the
845     /// searcher has been configured to execute a search that can report
846     /// matches over multiple lines, but where the matcher guarantees that it
847     /// will never produce a match over multiple lines.
multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool848     pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
849         if !self.multi_line() {
850             return false;
851         }
852         if let Some(line_term) = matcher.line_terminator() {
853             if line_term == self.line_terminator() {
854                 return false;
855             }
856         }
857         if let Some(non_matching) = matcher.non_matching_bytes() {
858             // If the line terminator is CRLF, we don't actually need to care
859             // whether the regex can match `\r` or not. Namely, a `\r` is
860             // neither necessary nor sufficient to terminate a line. A `\n` is
861             // always required.
862             if non_matching.contains(self.line_terminator().as_byte()) {
863                 return false;
864             }
865         }
866         true
867     }
868 
869     /// Returns the number of "after" context lines to report. When context
870     /// reporting is not enabled, this returns `0`.
871     #[inline]
after_context(&self) -> usize872     pub fn after_context(&self) -> usize {
873         self.config.after_context
874     }
875 
876     /// Returns the number of "before" context lines to report. When context
877     /// reporting is not enabled, this returns `0`.
878     #[inline]
before_context(&self) -> usize879     pub fn before_context(&self) -> usize {
880         self.config.before_context
881     }
882 
883     /// Returns true if and only if the searcher has "passthru" mode enabled.
884     #[inline]
passthru(&self) -> bool885     pub fn passthru(&self) -> bool {
886         self.config.passthru
887     }
888 
889     /// Fill the buffer for use with multi-line searching from the given file.
890     /// This reads from the file until EOF or until an error occurs. If the
891     /// contents exceed the configured heap limit, then an error is returned.
fill_multi_line_buffer_from_file<S: Sink>( &self, file: &File, ) -> Result<(), S::Error>892     fn fill_multi_line_buffer_from_file<S: Sink>(
893         &self,
894         file: &File,
895     ) -> Result<(), S::Error> {
896         assert!(self.config.multi_line);
897 
898         let mut decode_buffer = self.decode_buffer.borrow_mut();
899         let mut read_from = self
900             .decode_builder
901             .build_with_buffer(file, &mut *decode_buffer)
902             .map_err(S::Error::error_io)?;
903 
904         // If we don't have a heap limit, then we can defer to std's
905         // read_to_end implementation. fill_multi_line_buffer_from_reader will
906         // do this too, but since we have a File, we can be a bit smarter about
907         // pre-allocating here.
908         //
909         // If we're transcoding, then our pre-allocation might not be exact,
910         // but is probably still better than nothing.
911         if self.config.heap_limit.is_none() {
912             let mut buf = self.multi_line_buffer.borrow_mut();
913             buf.clear();
914             let cap =
915                 file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0);
916             buf.reserve(cap);
917             read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
918             return Ok(());
919         }
920         self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
921     }
922 
923     /// Fill the buffer for use with multi-line searching from the given
924     /// reader. This reads from the reader until EOF or until an error occurs.
925     /// If the contents exceed the configured heap limit, then an error is
926     /// returned.
fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>( &self, mut read_from: R, ) -> Result<(), S::Error>927     fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
928         &self,
929         mut read_from: R,
930     ) -> Result<(), S::Error> {
931         assert!(self.config.multi_line);
932 
933         let mut buf = self.multi_line_buffer.borrow_mut();
934         buf.clear();
935 
936         // If we don't have a heap limit, then we can defer to std's
937         // read_to_end implementation...
938         let heap_limit = match self.config.heap_limit {
939             Some(heap_limit) => heap_limit,
940             None => {
941                 read_from
942                     .read_to_end(&mut *buf)
943                     .map_err(S::Error::error_io)?;
944                 return Ok(());
945             }
946         };
947         if heap_limit == 0 {
948             return Err(S::Error::error_io(alloc_error(heap_limit)));
949         }
950 
951         // ... otherwise we need to roll our own. This is likely quite a bit
952         // slower than what is optimal, but we avoid worry about memory safety
953         // until there's a compelling reason to speed this up.
954         buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
955         let mut pos = 0;
956         loop {
957             let nread = match read_from.read(&mut buf[pos..]) {
958                 Ok(nread) => nread,
959                 Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
960                     continue;
961                 }
962                 Err(err) => return Err(S::Error::error_io(err)),
963             };
964             if nread == 0 {
965                 buf.resize(pos, 0);
966                 return Ok(());
967             }
968 
969             pos += nread;
970             if buf[pos..].is_empty() {
971                 let additional = heap_limit - buf.len();
972                 if additional == 0 {
973                     return Err(S::Error::error_io(alloc_error(heap_limit)));
974                 }
975                 let limit = buf.len() + additional;
976                 let doubled = 2 * buf.len();
977                 buf.resize(cmp::min(doubled, limit), 0);
978             }
979         }
980     }
981 }
982 
983 /// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
984 /// BOM.
985 ///
986 /// This is used by the searcher to determine if a transcoder is necessary.
987 /// Otherwise, it is advantageous to search the slice directly.
slice_has_bom(slice: &[u8]) -> bool988 fn slice_has_bom(slice: &[u8]) -> bool {
989     let enc = match encoding_rs::Encoding::for_bom(slice) {
990         None => return false,
991         Some((enc, _)) => enc,
992     };
993     [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
994         .contains(&enc)
995 }
996 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testutil::{KitchenSink, RegexMatcher};

    /// A searcher built with a heap limit of zero (and nothing else changed
    /// from the builder's defaults) must refuse to search.
    #[test]
    fn config_error_heap_limit() {
        let matcher = RegexMatcher::new("");
        let sink = KitchenSink::new();
        let mut searcher = SearcherBuilder::new().heap_limit(Some(0)).build();
        let res = searcher.search_slice(matcher, &[], sink);
        assert!(res.is_err());
    }

    /// A matcher whose line terminator (`z`) does not agree with the one
    /// used by a default searcher must be rejected.
    #[test]
    fn config_error_line_terminator() {
        let mut matcher = RegexMatcher::new("");
        matcher.set_line_term(Some(LineTerminator::byte(b'z')));

        let sink = KitchenSink::new();
        let mut searcher = Searcher::new();
        let res = searcher.search_slice(matcher, &[], sink);
        assert!(res.is_err());
    }

    /// A leading UTF-8 BOM must not show up in the searched data: the match
    /// is reported at offset 0 and only the three bytes of `foo` are
    /// counted.
    #[test]
    fn utf8_bom_sniffing() {
        // See: https://github.com/BurntSushi/ripgrep/issues/1638
        // ripgrep must sniff utf-8 BOM, just like it does with utf-16
        let matcher = RegexMatcher::new("foo");
        // The UTF-8 BOM (EF BB BF) followed by the ASCII bytes of "foo".
        let haystack: &[u8] = &[0xef, 0xbb, 0xbf, 0x66, 0x6f, 0x6f];

        let mut sink = KitchenSink::new();
        let mut searcher = SearcherBuilder::new().build();

        let res = searcher.search_slice(matcher, haystack, &mut sink);
        assert!(res.is_ok());

        let sink_output = String::from_utf8(sink.as_bytes().to_vec()).unwrap();
        assert_eq!(sink_output, "1:0:foo\nbyte count:3\n");
    }
}
1039