1 use std::iter::repeat;
2 
3 /// Create a sequence of tests that should be run by memchr implementations.
memchr_tests() -> Vec<MemchrTest>4 pub fn memchr_tests() -> Vec<MemchrTest> {
5     let mut tests = Vec::new();
6     for statict in MEMCHR_TESTS {
7         assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
8         assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
9         assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
10         assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
11 
12         let t = MemchrTest {
13             corpus: statict.corpus.to_string(),
14             needles: statict.needles.to_vec(),
15             positions: statict.positions.to_vec(),
16         };
17         tests.push(t.clone());
18         tests.extend(t.expand());
19     }
20     tests
21 }
22 
23 /// A set of tests for memchr-like functions.
24 ///
25 /// These tests mostly try to cover the short string cases. We cover the longer
26 /// string cases via the benchmarks (which are tests themselves), via
27 /// quickcheck tests and via automatic expansion of each test case (by
28 /// increasing the corpus size). Finally, we cover different alignment cases
29 /// in the tests by varying the starting point of the slice.
30 const MEMCHR_TESTS: &[MemchrTestStatic] = &[
31     // one needle (applied to memchr + memchr2 + memchr3)
32     MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] },
33     MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] },
34     MemchrTestStatic {
35         corpus: "aaa",
36         needles: &[b'a'],
37         positions: &[0, 1, 2],
38     },
39     MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] },
40     MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] },
41     MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] },
42     MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] },
43     MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] },
44     MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] },
45     MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] },
46     MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] },
47     MemchrTestStatic {
48         corpus: "\x00\x00",
49         needles: &[b'\x00'],
50         positions: &[0, 1],
51     },
52     MemchrTestStatic {
53         corpus: "\x00a\x00",
54         needles: &[b'\x00'],
55         positions: &[0, 2],
56     },
57     MemchrTestStatic {
58         corpus: "zzzzzzzzzzzzzzzza",
59         needles: &[b'a'],
60         positions: &[16],
61     },
62     MemchrTestStatic {
63         corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
64         needles: &[b'a'],
65         positions: &[32],
66     },
67     // two needles (applied to memchr2 + memchr3)
68     MemchrTestStatic {
69         corpus: "az",
70         needles: &[b'a', b'z'],
71         positions: &[0, 1],
72     },
73     MemchrTestStatic {
74         corpus: "az",
75         needles: &[b'a', b'z'],
76         positions: &[0, 1],
77     },
78     MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] },
79     MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] },
80     MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] },
81     MemchrTestStatic {
82         corpus: "yyyyaz",
83         needles: &[b'a', b'z'],
84         positions: &[4, 5],
85     },
86     MemchrTestStatic {
87         corpus: "yyyyaz",
88         needles: &[b'z', b'a'],
89         positions: &[4, 5],
90     },
91     // three needles (applied to memchr3)
92     MemchrTestStatic {
93         corpus: "xyz",
94         needles: &[b'x', b'y', b'z'],
95         positions: &[0, 1, 2],
96     },
97     MemchrTestStatic {
98         corpus: "zxy",
99         needles: &[b'x', b'y', b'z'],
100         positions: &[0, 1, 2],
101     },
102     MemchrTestStatic {
103         corpus: "zxy",
104         needles: &[b'x', b'a', b'z'],
105         positions: &[0, 1],
106     },
107     MemchrTestStatic {
108         corpus: "zxy",
109         needles: &[b't', b'a', b'z'],
110         positions: &[0],
111     },
112     MemchrTestStatic {
113         corpus: "yxz",
114         needles: &[b't', b'a', b'z'],
115         positions: &[2],
116     },
117 ];
118 
119 /// A description of a test on a memchr like function.
120 #[derive(Clone, Debug)]
121 pub struct MemchrTest {
122     /// The thing to search. We use `&str` instead of `&[u8]` because they
123     /// are nicer to write in tests, and we don't miss much since memchr
124     /// doesn't care about UTF-8.
125     ///
126     /// Corpora cannot contain either '%' or '#'. We use these bytes when
127     /// expanding test cases into many test cases, and we assume they are not
128     /// used. If they are used, `memchr_tests` will panic.
129     corpus: String,
130     /// The needles to search for. This is intended to be an "alternation" of
131     /// needles. The number of needles may cause this test to be skipped for
132     /// some memchr variants. For example, a test with 2 needles cannot be used
133     /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
134     /// However, a test with only 1 needle can be used to test all of `memchr`,
135     /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
136     /// bytes that we never used in the corpus (such as '#').
137     needles: Vec<u8>,
138     /// The positions expected to match for all of the needles.
139     positions: Vec<usize>,
140 }
141 
142 /// Like MemchrTest, but easier to define as a constant.
143 #[derive(Clone, Debug)]
144 pub struct MemchrTestStatic {
145     corpus: &'static str,
146     needles: &'static [u8],
147     positions: &'static [usize],
148 }
149 
150 impl MemchrTest {
one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F)151     pub fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
152         let needles = match self.needles(1) {
153             None => return,
154             Some(needles) => needles,
155         };
156         // We test different alignments here. Since some implementations use
157         // AVX2, which can read 32 bytes at a time, we test at least that.
158         // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
159         // (avx) bytes at a time, so we include that in our offsets as well.
160         //
161         // You might think this would cause most needles to not be found, but
162         // we actually expand our tests to include corpus sizes all the way up
163         // to >500 bytes, so we should exercise most branches.
164         for align in 0..130 {
165             let corpus = self.corpus(align);
166             assert_eq!(
167                 self.positions(align, reverse).get(0).cloned(),
168                 f(needles[0], corpus.as_bytes()),
169                 "search for {:?} failed in: {:?} (len: {}, alignment: {})",
170                 needles[0] as char,
171                 corpus,
172                 corpus.len(),
173                 align
174             );
175         }
176     }
177 
two<F: Fn(u8, u8, &[u8]) -> Option<usize>>( &self, reverse: bool, f: F, )178     pub fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(
179         &self,
180         reverse: bool,
181         f: F,
182     ) {
183         let needles = match self.needles(2) {
184             None => return,
185             Some(needles) => needles,
186         };
187         for align in 0..130 {
188             let corpus = self.corpus(align);
189             assert_eq!(
190                 self.positions(align, reverse).get(0).cloned(),
191                 f(needles[0], needles[1], corpus.as_bytes()),
192                 "search for {:?}|{:?} failed in: {:?} \
193                  (len: {}, alignment: {})",
194                 needles[0] as char,
195                 needles[1] as char,
196                 corpus,
197                 corpus.len(),
198                 align
199             );
200         }
201     }
202 
three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>( &self, reverse: bool, f: F, )203     pub fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
204         &self,
205         reverse: bool,
206         f: F,
207     ) {
208         let needles = match self.needles(3) {
209             None => return,
210             Some(needles) => needles,
211         };
212         for align in 0..130 {
213             let corpus = self.corpus(align);
214             assert_eq!(
215                 self.positions(align, reverse).get(0).cloned(),
216                 f(needles[0], needles[1], needles[2], corpus.as_bytes()),
217                 "search for {:?}|{:?}|{:?} failed in: {:?} \
218                  (len: {}, alignment: {})",
219                 needles[0] as char,
220                 needles[1] as char,
221                 needles[2] as char,
222                 corpus,
223                 corpus.len(),
224                 align
225             );
226         }
227     }
228 
iter_one<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, &'a [u8]) -> I, I: Iterator<Item = usize>,229     pub fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
230     where
231         F: FnOnce(u8, &'a [u8]) -> I,
232         I: Iterator<Item = usize>,
233     {
234         if let Some(ns) = self.needles(1) {
235             self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
236         }
237     }
238 
iter_two<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,239     pub fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
240     where
241         F: FnOnce(u8, u8, &'a [u8]) -> I,
242         I: Iterator<Item = usize>,
243     {
244         if let Some(ns) = self.needles(2) {
245             self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
246         }
247     }
248 
iter_three<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,249     pub fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
250     where
251         F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
252         I: Iterator<Item = usize>,
253     {
254         if let Some(ns) = self.needles(3) {
255             self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
256         }
257     }
258 
259     /// Test that the positions yielded by the given iterator match the
260     /// positions in this test. If reverse is true, then reverse the positions
261     /// before comparing them.
iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I)262     fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) {
263         assert_eq!(
264             self.positions(0, reverse),
265             it.collect::<Vec<usize>>(),
266             r"search for {:?} failed in: {:?}",
267             self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
268             self.corpus
269         );
270     }
271 
272     /// Expand this test into many variations of the same test.
273     ///
274     /// In particular, this will generate more tests with larger corpus sizes.
275     /// The expected positions are updated to maintain the integrity of the
276     /// test.
277     ///
278     /// This is important in testing a memchr implementation, because there are
279     /// often different cases depending on the length of the corpus.
280     ///
281     /// Note that we extend the corpus by adding `%` bytes, which we
282     /// don't otherwise use as a needle.
expand(&self) -> Vec<MemchrTest>283     fn expand(&self) -> Vec<MemchrTest> {
284         let mut more = Vec::new();
285 
286         // Add bytes to the start of the corpus.
287         for i in 1..515 {
288             let mut t = self.clone();
289             let mut new_corpus: String = repeat('%').take(i).collect();
290             new_corpus.push_str(&t.corpus);
291             t.corpus = new_corpus;
292             t.positions = t.positions.into_iter().map(|p| p + i).collect();
293             more.push(t);
294         }
295         // Add bytes to the end of the corpus.
296         for i in 1..515 {
297             let mut t = self.clone();
298             let padding: String = repeat('%').take(i).collect();
299             t.corpus.push_str(&padding);
300             more.push(t);
301         }
302 
303         more
304     }
305 
306     /// Return the corpus at the given alignment.
307     ///
308     /// If the alignment exceeds the length of the corpus, then this returns
309     /// an empty slice.
corpus(&self, align: usize) -> &str310     fn corpus(&self, align: usize) -> &str {
311         self.corpus.get(align..).unwrap_or("")
312     }
313 
314     /// Return exactly `count` needles from this test. If this test has less
315     /// than `count` needles, then add `#` until the number of needles
316     /// matches `count`. If this test has more than `count` needles, then
317     /// return `None` (because there is no way to use this test data for a
318     /// search using fewer needles).
needles(&self, count: usize) -> Option<Vec<u8>>319     fn needles(&self, count: usize) -> Option<Vec<u8>> {
320         if self.needles.len() > count {
321             return None;
322         }
323 
324         let mut needles = self.needles.to_vec();
325         for _ in needles.len()..count {
326             // we assume # is never used in tests.
327             needles.push(b'#');
328         }
329         Some(needles)
330     }
331 
332     /// Return the positions in this test, reversed if `reverse` is true.
333     ///
334     /// If alignment is given, then all positions greater than or equal to that
335     /// alignment are offset by the alignment. Positions less than the
336     /// alignment are dropped.
positions(&self, align: usize, reverse: bool) -> Vec<usize>337     fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
338         let positions = if reverse {
339             let mut positions = self.positions.to_vec();
340             positions.reverse();
341             positions
342         } else {
343             self.positions.to_vec()
344         };
345         positions
346             .into_iter()
347             .filter(|&p| p >= align)
348             .map(|p| p - align)
349             .collect()
350     }
351 }
352