1 use std::collections::hash_map::Entry;
2 use std::collections::HashMap;
3 use std::fmt::Debug;
4 use std::hash::{Hash, Hasher};
5 use std::ops::{Add, Index, Range};
6 
7 /// Utility function to check if a range is empty that works on older rust versions
8 #[inline(always)]
9 #[allow(clippy::neg_cmp_op_on_partial_ord)]
is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool10 pub fn is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool {
11     !(range.start < range.end)
12 }
13 
14 /// Represents an item in the vector returend by [`unique`].
15 ///
16 /// It compares like the underlying item does it was created from but
17 /// carries the index it was originally created from.
18 pub struct UniqueItem<'a, Idx: ?Sized> {
19     lookup: &'a Idx,
20     index: usize,
21 }
22 
23 impl<'a, Idx: ?Sized> UniqueItem<'a, Idx>
24 where
25     Idx: Index<usize>,
26 {
27     /// Returns the value.
28     #[inline(always)]
value(&self) -> &Idx::Output29     pub fn value(&self) -> &Idx::Output {
30         &self.lookup[self.index]
31     }
32 
33     /// Returns the original index.
34     #[inline(always)]
original_index(&self) -> usize35     pub fn original_index(&self) -> usize {
36         self.index
37     }
38 }
39 
40 impl<'a, Idx: Index<usize> + 'a> Debug for UniqueItem<'a, Idx>
41 where
42     Idx::Output: Debug,
43 {
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result44     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
45         f.debug_struct("UniqueItem")
46             .field("value", &self.value())
47             .field("original_index", &self.original_index())
48             .finish()
49     }
50 }
51 
52 impl<'a, 'b, A, B> PartialEq<UniqueItem<'a, A>> for UniqueItem<'b, B>
53 where
54     A: Index<usize> + 'b + ?Sized,
55     B: Index<usize> + 'b + ?Sized,
56     B::Output: PartialEq<A::Output>,
57 {
58     #[inline(always)]
eq(&self, other: &UniqueItem<'a, A>) -> bool59     fn eq(&self, other: &UniqueItem<'a, A>) -> bool {
60         self.value() == other.value()
61     }
62 }
63 
64 /// Returns only unique items in the sequence as vector.
65 ///
66 /// Each item is wrapped in a [`UniqueItem`] so that both the value and the
67 /// index can be extracted.
unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>> where Idx: Index<usize> + ?Sized, Idx::Output: Hash + Eq,68 pub fn unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>>
69 where
70     Idx: Index<usize> + ?Sized,
71     Idx::Output: Hash + Eq,
72 {
73     let mut by_item = HashMap::new();
74     for index in range {
75         match by_item.entry(&lookup[index]) {
76             Entry::Vacant(entry) => {
77                 entry.insert(Some(index));
78             }
79             Entry::Occupied(mut entry) => {
80                 let entry = entry.get_mut();
81                 if entry.is_some() {
82                     *entry = None
83                 }
84             }
85         }
86     }
87     let mut rv = by_item
88         .into_iter()
89         .filter_map(|(_, x)| x)
90         .map(|index| UniqueItem { lookup, index })
91         .collect::<Vec<_>>();
92     rv.sort_by_key(|a| a.original_index());
93     rv
94 }
95 
96 /// Given two lookups and ranges calculates the length of the common prefix.
common_prefix_len<Old, New>( old: &Old, old_range: Range<usize>, new: &New, new_range: Range<usize>, ) -> usize where Old: Index<usize> + ?Sized, New: Index<usize> + ?Sized, New::Output: PartialEq<Old::Output>,97 pub fn common_prefix_len<Old, New>(
98     old: &Old,
99     old_range: Range<usize>,
100     new: &New,
101     new_range: Range<usize>,
102 ) -> usize
103 where
104     Old: Index<usize> + ?Sized,
105     New: Index<usize> + ?Sized,
106     New::Output: PartialEq<Old::Output>,
107 {
108     if is_empty_range(&old_range) || is_empty_range(&new_range) {
109         return 0;
110     }
111     new_range
112         .zip(old_range)
113         .take_while(
114             #[inline(always)]
115             |x| new[x.0] == old[x.1],
116         )
117         .count()
118 }
119 
120 /// Given two lookups and ranges calculates the length of common suffix.
common_suffix_len<Old, New>( old: &Old, old_range: Range<usize>, new: &New, new_range: Range<usize>, ) -> usize where Old: Index<usize> + ?Sized, New: Index<usize> + ?Sized, New::Output: PartialEq<Old::Output>,121 pub fn common_suffix_len<Old, New>(
122     old: &Old,
123     old_range: Range<usize>,
124     new: &New,
125     new_range: Range<usize>,
126 ) -> usize
127 where
128     Old: Index<usize> + ?Sized,
129     New: Index<usize> + ?Sized,
130     New::Output: PartialEq<Old::Output>,
131 {
132     if is_empty_range(&old_range) || is_empty_range(&new_range) {
133         return 0;
134     }
135     new_range
136         .rev()
137         .zip(old_range.rev())
138         .take_while(
139             #[inline(always)]
140             |x| new[x.0] == old[x.1],
141         )
142         .count()
143 }
144 
145 struct OffsetLookup<Int> {
146     offset: usize,
147     vec: Vec<Int>,
148 }
149 
150 impl<Int> Index<usize> for OffsetLookup<Int> {
151     type Output = Int;
152 
153     #[inline(always)]
index(&self, index: usize) -> &Self::Output154     fn index(&self, index: usize) -> &Self::Output {
155         &self.vec[index - self.offset]
156     }
157 }
158 
159 /// A utility struct to convert distinct items to unique integers.
160 ///
161 /// This can be helpful on larger inputs to speed up the comparisons
162 /// performed by doing a first pass where the data set gets reduced
163 /// to (small) integers.
164 ///
165 /// The idea is that instead of passing two sequences to a diffling algorithm
166 /// you first pass it via [`IdentifyDistinct`]:
167 ///
168 /// ```rust
169 /// use similar::capture_diff;
170 /// use similar::algorithms::{Algorithm, IdentifyDistinct};
171 ///
172 /// let old = &["foo", "bar", "baz"][..];
173 /// let new = &["foo", "blah", "baz"][..];
174 /// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
175 /// let ops = capture_diff(
176 ///     Algorithm::Myers,
177 ///     h.old_lookup(),
178 ///     h.old_range(),
179 ///     h.new_lookup(),
180 ///     h.new_range(),
181 /// );
182 /// ```
183 ///
184 /// The indexes are the same as with the passed source ranges.
185 pub struct IdentifyDistinct<Int> {
186     old: OffsetLookup<Int>,
187     new: OffsetLookup<Int>,
188 }
189 
190 impl<Int> IdentifyDistinct<Int>
191 where
192     Int: Add<Output = Int> + From<u8> + Default + Copy,
193 {
194     /// Creates an int hasher for two sequences.
new<Old, New>( old: &Old, old_range: Range<usize>, new: &New, new_range: Range<usize>, ) -> Self where Old: Index<usize> + ?Sized, Old::Output: Eq + Hash, New: Index<usize> + ?Sized, New::Output: Eq + Hash + PartialEq<Old::Output>,195     pub fn new<Old, New>(
196         old: &Old,
197         old_range: Range<usize>,
198         new: &New,
199         new_range: Range<usize>,
200     ) -> Self
201     where
202         Old: Index<usize> + ?Sized,
203         Old::Output: Eq + Hash,
204         New: Index<usize> + ?Sized,
205         New::Output: Eq + Hash + PartialEq<Old::Output>,
206     {
207         enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
208             Old(&'old Old),
209             New(&'new New),
210         }
211 
212         impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New>
213         where
214             Old: Hash + ?Sized,
215             New: Hash + ?Sized,
216         {
217             fn hash<H: Hasher>(&self, state: &mut H) {
218                 match *self {
219                     Key::Old(val) => val.hash(state),
220                     Key::New(val) => val.hash(state),
221                 }
222             }
223         }
224 
225         impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New>
226         where
227             Old: Eq + ?Sized,
228             New: Eq + PartialEq<Old> + ?Sized,
229         {
230             #[inline(always)]
231             fn eq(&self, other: &Self) -> bool {
232                 match (self, other) {
233                     (Key::Old(a), Key::Old(b)) => a == b,
234                     (Key::New(a), Key::New(b)) => a == b,
235                     (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
236                 }
237             }
238         }
239 
240         impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New>
241         where
242             Old: Eq + ?Sized,
243             New: Eq + PartialEq<Old> + ?Sized,
244         {
245         }
246 
247         let mut map = HashMap::new();
248         let mut old_seq = Vec::new();
249         let mut new_seq = Vec::new();
250         let mut next_id = Int::default();
251         let step = Int::from(1);
252         let old_start = old_range.start;
253         let new_start = new_range.start;
254 
255         for idx in old_range {
256             let item = Key::Old(&old[idx]);
257             let id = match map.entry(item) {
258                 Entry::Occupied(o) => *o.get(),
259                 Entry::Vacant(v) => {
260                     let id = next_id;
261                     next_id = next_id + step;
262                     *v.insert(id)
263                 }
264             };
265             old_seq.push(id);
266         }
267 
268         for idx in new_range {
269             let item = Key::New(&new[idx]);
270             let id = match map.entry(item) {
271                 Entry::Occupied(o) => *o.get(),
272                 Entry::Vacant(v) => {
273                     let id = next_id;
274                     next_id = next_id + step;
275                     *v.insert(id)
276                 }
277             };
278             new_seq.push(id);
279         }
280 
281         IdentifyDistinct {
282             old: OffsetLookup {
283                 offset: old_start,
284                 vec: old_seq,
285             },
286             new: OffsetLookup {
287                 offset: new_start,
288                 vec: new_seq,
289             },
290         }
291     }
292 
293     /// Returns a lookup for the old side.
old_lookup(&self) -> &impl Index<usize, Output = Int>294     pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
295         &self.old
296     }
297 
298     /// Returns a lookup for the new side.
new_lookup(&self) -> &impl Index<usize, Output = Int>299     pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
300         &self.new
301     }
302 
303     /// Convenience method to get back the old range.
old_range(&self) -> Range<usize>304     pub fn old_range(&self) -> Range<usize> {
305         self.old.offset..self.old.offset + self.old.vec.len()
306     }
307 
308     /// Convenience method to get back the new range.
new_range(&self) -> Range<usize>309     pub fn new_range(&self) -> Range<usize> {
310         self.new.offset..self.new.offset + self.new.vec.len()
311     }
312 }
313 
314 #[test]
test_unique()315 fn test_unique() {
316     let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6)
317         .into_iter()
318         .map(|x| (*x.value(), x.original_index()))
319         .collect::<Vec<_>>();
320     assert_eq!(u, vec![('a', 0), ('c', 2)]);
321 }
322 
323 #[test]
test_int_hasher()324 fn test_int_hasher() {
325     let ih = IdentifyDistinct::<u8>::new(
326         &["", "foo", "bar", "baz"][..],
327         1..4,
328         &["", "foo", "blah", "baz"][..],
329         1..4,
330     );
331     assert_eq!(ih.old_lookup()[1], 0);
332     assert_eq!(ih.old_lookup()[2], 1);
333     assert_eq!(ih.old_lookup()[3], 2);
334     assert_eq!(ih.new_lookup()[1], 0);
335     assert_eq!(ih.new_lookup()[2], 3);
336     assert_eq!(ih.new_lookup()[3], 2);
337     assert_eq!(ih.old_range(), 1..4);
338     assert_eq!(ih.new_range(), 1..4);
339 }
340 
341 #[test]
test_common_prefix_len()342 fn test_common_prefix_len() {
343     assert_eq!(
344         common_prefix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
345         0
346     );
347     assert_eq!(
348         common_prefix_len("foobarbaz".as_bytes(), 0..9, "foobarblah".as_bytes(), 0..10),
349         7
350     );
351     assert_eq!(
352         common_prefix_len("foobarbaz".as_bytes(), 0..9, "blablabla".as_bytes(), 0..9),
353         0
354     );
355     assert_eq!(
356         common_prefix_len("foobarbaz".as_bytes(), 3..9, "foobarblah".as_bytes(), 3..10),
357         4
358     );
359 }
360 
361 #[test]
test_common_suffix_len()362 fn test_common_suffix_len() {
363     assert_eq!(
364         common_suffix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
365         0
366     );
367     assert_eq!(
368         common_suffix_len("1234".as_bytes(), 0..4, "X0001234".as_bytes(), 0..8),
369         4
370     );
371     assert_eq!(
372         common_suffix_len("1234".as_bytes(), 0..4, "Xxxx".as_bytes(), 0..4),
373         0
374     );
375     assert_eq!(
376         common_suffix_len("1234".as_bytes(), 2..4, "01234".as_bytes(), 2..5),
377         2
378     );
379 }
380