1 use std::collections::hash_map::Entry;
2 use std::collections::HashMap;
3 use std::fmt::Debug;
4 use std::hash::{Hash, Hasher};
5 use std::ops::{Add, Index, Range};
6
7 /// Utility function to check if a range is empty that works on older rust versions
8 #[inline(always)]
9 #[allow(clippy::neg_cmp_op_on_partial_ord)]
is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool10 pub fn is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool {
11 !(range.start < range.end)
12 }
13
14 /// Represents an item in the vector returend by [`unique`].
15 ///
16 /// It compares like the underlying item does it was created from but
17 /// carries the index it was originally created from.
18 pub struct UniqueItem<'a, Idx: ?Sized> {
19 lookup: &'a Idx,
20 index: usize,
21 }
22
23 impl<'a, Idx: ?Sized> UniqueItem<'a, Idx>
24 where
25 Idx: Index<usize>,
26 {
27 /// Returns the value.
28 #[inline(always)]
value(&self) -> &Idx::Output29 pub fn value(&self) -> &Idx::Output {
30 &self.lookup[self.index]
31 }
32
33 /// Returns the original index.
34 #[inline(always)]
original_index(&self) -> usize35 pub fn original_index(&self) -> usize {
36 self.index
37 }
38 }
39
40 impl<'a, Idx: Index<usize> + 'a> Debug for UniqueItem<'a, Idx>
41 where
42 Idx::Output: Debug,
43 {
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result44 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
45 f.debug_struct("UniqueItem")
46 .field("value", &self.value())
47 .field("original_index", &self.original_index())
48 .finish()
49 }
50 }
51
52 impl<'a, 'b, A, B> PartialEq<UniqueItem<'a, A>> for UniqueItem<'b, B>
53 where
54 A: Index<usize> + 'b + ?Sized,
55 B: Index<usize> + 'b + ?Sized,
56 B::Output: PartialEq<A::Output>,
57 {
58 #[inline(always)]
eq(&self, other: &UniqueItem<'a, A>) -> bool59 fn eq(&self, other: &UniqueItem<'a, A>) -> bool {
60 self.value() == other.value()
61 }
62 }
63
64 /// Returns only unique items in the sequence as vector.
65 ///
66 /// Each item is wrapped in a [`UniqueItem`] so that both the value and the
67 /// index can be extracted.
unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>> where Idx: Index<usize> + ?Sized, Idx::Output: Hash + Eq,68 pub fn unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>>
69 where
70 Idx: Index<usize> + ?Sized,
71 Idx::Output: Hash + Eq,
72 {
73 let mut by_item = HashMap::new();
74 for index in range {
75 match by_item.entry(&lookup[index]) {
76 Entry::Vacant(entry) => {
77 entry.insert(Some(index));
78 }
79 Entry::Occupied(mut entry) => {
80 let entry = entry.get_mut();
81 if entry.is_some() {
82 *entry = None
83 }
84 }
85 }
86 }
87 let mut rv = by_item
88 .into_iter()
89 .filter_map(|(_, x)| x)
90 .map(|index| UniqueItem { lookup, index })
91 .collect::<Vec<_>>();
92 rv.sort_by_key(|a| a.original_index());
93 rv
94 }
95
96 /// Given two lookups and ranges calculates the length of the common prefix.
common_prefix_len<Old, New>( old: &Old, old_range: Range<usize>, new: &New, new_range: Range<usize>, ) -> usize where Old: Index<usize> + ?Sized, New: Index<usize> + ?Sized, New::Output: PartialEq<Old::Output>,97 pub fn common_prefix_len<Old, New>(
98 old: &Old,
99 old_range: Range<usize>,
100 new: &New,
101 new_range: Range<usize>,
102 ) -> usize
103 where
104 Old: Index<usize> + ?Sized,
105 New: Index<usize> + ?Sized,
106 New::Output: PartialEq<Old::Output>,
107 {
108 if is_empty_range(&old_range) || is_empty_range(&new_range) {
109 return 0;
110 }
111 new_range
112 .zip(old_range)
113 .take_while(
114 #[inline(always)]
115 |x| new[x.0] == old[x.1],
116 )
117 .count()
118 }
119
120 /// Given two lookups and ranges calculates the length of common suffix.
common_suffix_len<Old, New>( old: &Old, old_range: Range<usize>, new: &New, new_range: Range<usize>, ) -> usize where Old: Index<usize> + ?Sized, New: Index<usize> + ?Sized, New::Output: PartialEq<Old::Output>,121 pub fn common_suffix_len<Old, New>(
122 old: &Old,
123 old_range: Range<usize>,
124 new: &New,
125 new_range: Range<usize>,
126 ) -> usize
127 where
128 Old: Index<usize> + ?Sized,
129 New: Index<usize> + ?Sized,
130 New::Output: PartialEq<Old::Output>,
131 {
132 if is_empty_range(&old_range) || is_empty_range(&new_range) {
133 return 0;
134 }
135 new_range
136 .rev()
137 .zip(old_range.rev())
138 .take_while(
139 #[inline(always)]
140 |x| new[x.0] == old[x.1],
141 )
142 .count()
143 }
144
145 struct OffsetLookup<Int> {
146 offset: usize,
147 vec: Vec<Int>,
148 }
149
150 impl<Int> Index<usize> for OffsetLookup<Int> {
151 type Output = Int;
152
153 #[inline(always)]
index(&self, index: usize) -> &Self::Output154 fn index(&self, index: usize) -> &Self::Output {
155 &self.vec[index - self.offset]
156 }
157 }
158
159 /// A utility struct to convert distinct items to unique integers.
160 ///
161 /// This can be helpful on larger inputs to speed up the comparisons
162 /// performed by doing a first pass where the data set gets reduced
163 /// to (small) integers.
164 ///
165 /// The idea is that instead of passing two sequences to a diffling algorithm
166 /// you first pass it via [`IdentifyDistinct`]:
167 ///
168 /// ```rust
169 /// use similar::capture_diff;
170 /// use similar::algorithms::{Algorithm, IdentifyDistinct};
171 ///
172 /// let old = &["foo", "bar", "baz"][..];
173 /// let new = &["foo", "blah", "baz"][..];
174 /// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
175 /// let ops = capture_diff(
176 /// Algorithm::Myers,
177 /// h.old_lookup(),
178 /// h.old_range(),
179 /// h.new_lookup(),
180 /// h.new_range(),
181 /// );
182 /// ```
183 ///
184 /// The indexes are the same as with the passed source ranges.
185 pub struct IdentifyDistinct<Int> {
186 old: OffsetLookup<Int>,
187 new: OffsetLookup<Int>,
188 }
189
190 impl<Int> IdentifyDistinct<Int>
191 where
192 Int: Add<Output = Int> + From<u8> + Default + Copy,
193 {
194 /// Creates an int hasher for two sequences.
new<Old, New>( old: &Old, old_range: Range<usize>, new: &New, new_range: Range<usize>, ) -> Self where Old: Index<usize> + ?Sized, Old::Output: Eq + Hash, New: Index<usize> + ?Sized, New::Output: Eq + Hash + PartialEq<Old::Output>,195 pub fn new<Old, New>(
196 old: &Old,
197 old_range: Range<usize>,
198 new: &New,
199 new_range: Range<usize>,
200 ) -> Self
201 where
202 Old: Index<usize> + ?Sized,
203 Old::Output: Eq + Hash,
204 New: Index<usize> + ?Sized,
205 New::Output: Eq + Hash + PartialEq<Old::Output>,
206 {
207 enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
208 Old(&'old Old),
209 New(&'new New),
210 }
211
212 impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New>
213 where
214 Old: Hash + ?Sized,
215 New: Hash + ?Sized,
216 {
217 fn hash<H: Hasher>(&self, state: &mut H) {
218 match *self {
219 Key::Old(val) => val.hash(state),
220 Key::New(val) => val.hash(state),
221 }
222 }
223 }
224
225 impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New>
226 where
227 Old: Eq + ?Sized,
228 New: Eq + PartialEq<Old> + ?Sized,
229 {
230 #[inline(always)]
231 fn eq(&self, other: &Self) -> bool {
232 match (self, other) {
233 (Key::Old(a), Key::Old(b)) => a == b,
234 (Key::New(a), Key::New(b)) => a == b,
235 (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
236 }
237 }
238 }
239
240 impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New>
241 where
242 Old: Eq + ?Sized,
243 New: Eq + PartialEq<Old> + ?Sized,
244 {
245 }
246
247 let mut map = HashMap::new();
248 let mut old_seq = Vec::new();
249 let mut new_seq = Vec::new();
250 let mut next_id = Int::default();
251 let step = Int::from(1);
252 let old_start = old_range.start;
253 let new_start = new_range.start;
254
255 for idx in old_range {
256 let item = Key::Old(&old[idx]);
257 let id = match map.entry(item) {
258 Entry::Occupied(o) => *o.get(),
259 Entry::Vacant(v) => {
260 let id = next_id;
261 next_id = next_id + step;
262 *v.insert(id)
263 }
264 };
265 old_seq.push(id);
266 }
267
268 for idx in new_range {
269 let item = Key::New(&new[idx]);
270 let id = match map.entry(item) {
271 Entry::Occupied(o) => *o.get(),
272 Entry::Vacant(v) => {
273 let id = next_id;
274 next_id = next_id + step;
275 *v.insert(id)
276 }
277 };
278 new_seq.push(id);
279 }
280
281 IdentifyDistinct {
282 old: OffsetLookup {
283 offset: old_start,
284 vec: old_seq,
285 },
286 new: OffsetLookup {
287 offset: new_start,
288 vec: new_seq,
289 },
290 }
291 }
292
293 /// Returns a lookup for the old side.
old_lookup(&self) -> &impl Index<usize, Output = Int>294 pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
295 &self.old
296 }
297
298 /// Returns a lookup for the new side.
new_lookup(&self) -> &impl Index<usize, Output = Int>299 pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
300 &self.new
301 }
302
303 /// Convenience method to get back the old range.
old_range(&self) -> Range<usize>304 pub fn old_range(&self) -> Range<usize> {
305 self.old.offset..self.old.offset + self.old.vec.len()
306 }
307
308 /// Convenience method to get back the new range.
new_range(&self) -> Range<usize>309 pub fn new_range(&self) -> Range<usize> {
310 self.new.offset..self.new.offset + self.new.vec.len()
311 }
312 }
313
314 #[test]
test_unique()315 fn test_unique() {
316 let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6)
317 .into_iter()
318 .map(|x| (*x.value(), x.original_index()))
319 .collect::<Vec<_>>();
320 assert_eq!(u, vec![('a', 0), ('c', 2)]);
321 }
322
323 #[test]
test_int_hasher()324 fn test_int_hasher() {
325 let ih = IdentifyDistinct::<u8>::new(
326 &["", "foo", "bar", "baz"][..],
327 1..4,
328 &["", "foo", "blah", "baz"][..],
329 1..4,
330 );
331 assert_eq!(ih.old_lookup()[1], 0);
332 assert_eq!(ih.old_lookup()[2], 1);
333 assert_eq!(ih.old_lookup()[3], 2);
334 assert_eq!(ih.new_lookup()[1], 0);
335 assert_eq!(ih.new_lookup()[2], 3);
336 assert_eq!(ih.new_lookup()[3], 2);
337 assert_eq!(ih.old_range(), 1..4);
338 assert_eq!(ih.new_range(), 1..4);
339 }
340
341 #[test]
test_common_prefix_len()342 fn test_common_prefix_len() {
343 assert_eq!(
344 common_prefix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
345 0
346 );
347 assert_eq!(
348 common_prefix_len("foobarbaz".as_bytes(), 0..9, "foobarblah".as_bytes(), 0..10),
349 7
350 );
351 assert_eq!(
352 common_prefix_len("foobarbaz".as_bytes(), 0..9, "blablabla".as_bytes(), 0..9),
353 0
354 );
355 assert_eq!(
356 common_prefix_len("foobarbaz".as_bytes(), 3..9, "foobarblah".as_bytes(), 3..10),
357 4
358 );
359 }
360
361 #[test]
test_common_suffix_len()362 fn test_common_suffix_len() {
363 assert_eq!(
364 common_suffix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
365 0
366 );
367 assert_eq!(
368 common_suffix_len("1234".as_bytes(), 0..4, "X0001234".as_bytes(), 0..8),
369 4
370 );
371 assert_eq!(
372 common_suffix_len("1234".as_bytes(), 0..4, "Xxxx".as_bytes(), 0..4),
373 0
374 );
375 assert_eq!(
376 common_suffix_len("1234".as_bytes(), 2..4, "01234".as_bytes(), 2..5),
377 2
378 );
379 }
380