1 //! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
2 //!
3 //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4 //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5 //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logoColor=white&logo=
6 //!
7 //! <br>
8 //!
9 //! ## Diff library with semantic cleanup, based on Google's diff-match-patch
10 //!
11 //! This library is a port of the Diff component of [Diff Match Patch] to Rust.
12 //! The diff implementation is based on [Myers' diff algorithm] but includes
13 //! some [semantic cleanups] to increase human readability by factoring out
14 //! commonalities which are likely to be coincidental.
15 //!
16 //! Diff Match Patch was originally built in 2006 to power Google Docs.
17 //!
18 //! # Interface
19 //!
20 //! Here is the entire API of the Rust implementation. It operates on borrowed
21 //! strings and the return value of the diff algorithm is a vector of chunks
22 //! pointing into slices of those input strings.
23 //!
24 //! ```
25 //! pub enum Chunk<'a> {
26 //!     Equal(&'a str),
27 //!     Delete(&'a str),
28 //!     Insert(&'a str),
29 //! }
30 //!
31 //! # const IGNORE: &str = stringify! {
32 //! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
33 //! # };
34 //! ```
35 //!
36 //! [Diff Match Patch]: https://github.com/google/diff-match-patch
37 //! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
38 //! [semantic cleanups]: https://neil.fraser.name/writing/diff/
39 
40 #![doc(html_root_url = "https://docs.rs/dissimilar/1.0.3")]
41 #![allow(
42     clippy::blocks_in_if_conditions,
43     clippy::cast_possible_wrap,
44     clippy::cast_sign_loss,
45     clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
46     clippy::collapsible_else_if,
47     clippy::comparison_chain,
48     clippy::match_same_arms,
49     clippy::module_name_repetitions,
50     clippy::must_use_candidate,
51     clippy::new_without_default,
52     clippy::shadow_unrelated,
53     clippy::similar_names,
54     clippy::too_many_lines,
55     clippy::unseparated_literal_suffix
56 )]
57 
58 mod find;
59 mod range;
60 
61 #[cfg(test)]
62 mod tests;
63 
64 use crate::range::{bytes, str, Range};
65 use std::cmp;
66 use std::collections::VecDeque;
67 use std::fmt::{self, Debug};
68 
69 #[derive(Copy, Clone, PartialEq, Eq)]
70 pub enum Chunk<'a> {
71     Equal(&'a str),
72     Delete(&'a str),
73     Insert(&'a str),
74 }
75 
76 #[derive(Copy, Clone)]
77 enum Diff<'a, 'b> {
78     Equal(Range<'a>, Range<'b>),
79     Delete(Range<'a>),
80     Insert(Range<'b>),
81 }
82 
83 impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
text(&self) -> Range<'tmp>84     fn text(&self) -> Range<'tmp> {
85         match *self {
86             Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
87         }
88     }
89 
grow_left(&mut self, increment: usize)90     fn grow_left(&mut self, increment: usize) {
91         self.for_each(|range| {
92             range.offset -= increment;
93             range.len += increment;
94         });
95     }
96 
grow_right(&mut self, increment: usize)97     fn grow_right(&mut self, increment: usize) {
98         self.for_each(|range| range.len += increment);
99     }
100 
shift_left(&mut self, increment: usize)101     fn shift_left(&mut self, increment: usize) {
102         self.for_each(|range| range.offset -= increment);
103     }
104 
shift_right(&mut self, increment: usize)105     fn shift_right(&mut self, increment: usize) {
106         self.for_each(|range| range.offset += increment);
107     }
108 
for_each(&mut self, f: impl Fn(&mut Range))109     fn for_each(&mut self, f: impl Fn(&mut Range)) {
110         match self {
111             Diff::Equal(range1, range2) => {
112                 f(range1);
113                 f(range2);
114             }
115             Diff::Delete(range) => f(range),
116             Diff::Insert(range) => f(range),
117         }
118     }
119 }
120 
diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>>121 pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
122     let text1 = Range::new(text1, ..);
123     let text2 = Range::new(text2, ..);
124     let mut solution = main(text1, text2);
125     cleanup_char_boundary(&mut solution);
126     cleanup_semantic(&mut solution);
127     cleanup_merge(&mut solution);
128     solution.diffs.into_iter().map(Chunk::from).collect()
129 }
130 
131 struct Solution<'a, 'b> {
132     text1: Range<'a>,
133     text2: Range<'b>,
134     diffs: Vec<Diff<'a, 'b>>,
135     utf8: bool,
136 }
137 
main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b>138 fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
139     let whole1 = text1;
140     let whole2 = text2;
141 
142     // Trim off common prefix.
143     let common_prefix_len = common_prefix_bytes(text1, text2);
144     let common_prefix = Diff::Equal(
145         text1.substring(..common_prefix_len),
146         text2.substring(..common_prefix_len),
147     );
148     text1 = text1.substring(common_prefix_len..);
149     text2 = text2.substring(common_prefix_len..);
150 
151     // Trim off common suffix.
152     let common_suffix_len = common_suffix_bytes(text1, text2);
153     let common_suffix = Diff::Equal(
154         text1.substring(text1.len - common_suffix_len..),
155         text2.substring(text2.len - common_suffix_len..),
156     );
157     text1 = text1.substring(..text1.len - common_suffix_len);
158     text2 = text2.substring(..text2.len - common_suffix_len);
159 
160     // Compute the diff on the middle block.
161     let mut solution = Solution {
162         text1: whole1,
163         text2: whole2,
164         diffs: compute(text1, text2),
165         utf8: false,
166     };
167 
168     // Restore the prefix and suffix.
169     if common_prefix_len > 0 {
170         solution.diffs.insert(0, common_prefix);
171     }
172     if common_suffix_len > 0 {
173         solution.diffs.push(common_suffix);
174     }
175 
176     cleanup_merge(&mut solution);
177 
178     solution
179 }
180 
181 // Find the differences between two texts. Assumes that the texts do not have
182 // any common prefix or suffix.
compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>>183 fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
184     match (text1.is_empty(), text2.is_empty()) {
185         (true, true) => return Vec::new(),
186         (true, false) => return vec![Diff::Insert(text2)],
187         (false, true) => return vec![Diff::Delete(text1)],
188         (false, false) => {}
189     }
190 
191     // Check for entire shorter text inside the longer text.
192     if text1.len > text2.len {
193         if let Some(i) = text1.find(text2) {
194             return vec![
195                 Diff::Delete(text1.substring(..i)),
196                 Diff::Equal(text1.substring(i..i + text2.len), text2),
197                 Diff::Delete(text1.substring(i + text2.len..)),
198             ];
199         }
200     } else {
201         if let Some(i) = text2.find(text1) {
202             return vec![
203                 Diff::Insert(text2.substring(..i)),
204                 Diff::Equal(text1, text2.substring(i..i + text1.len)),
205                 Diff::Insert(text2.substring(i + text1.len..)),
206             ];
207         }
208     }
209 
210     if text1.len == 1 || text2.len == 1 {
211         // Single character string.
212         // After the previous check, the character can't be an equality.
213         return vec![Diff::Delete(text1), Diff::Insert(text2)];
214     }
215 
216     bisect(text1, text2)
217 }
218 
219 // Find the 'middle snake' of a diff, split the problem in two and return the
220 // recursively constructed diff.
221 //
222 // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>>223 fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
224     let max_d = (text1.len + text2.len + 1) / 2;
225     let v_offset = max_d;
226     let v_len = 2 * max_d;
227     let mut v1 = vec![-1isize; v_len];
228     let mut v2 = vec![-1isize; v_len];
229     v1[v_offset + 1] = 0;
230     v2[v_offset + 1] = 0;
231     let delta = text1.len as isize - text2.len as isize;
232     // If the total number of characters is odd, then the front path will
233     // collide with the reverse path.
234     let front = delta % 2 != 0;
235     // Offsets for start and end of k loop.
236     // Prevents mapping of space beyond the grid.
237     let mut k1start = 0;
238     let mut k1end = 0;
239     let mut k2start = 0;
240     let mut k2end = 0;
241     for d in 0..max_d as isize {
242         // Walk the front path one step.
243         let mut k1 = -d + k1start;
244         while k1 <= d - k1end {
245             let k1_offset = (v_offset as isize + k1) as usize;
246             let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
247                 v1[k1_offset + 1]
248             } else {
249                 v1[k1_offset - 1] + 1
250             } as usize;
251             let mut y1 = (x1 as isize - k1) as usize;
252             if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
253                 let advance = common_prefix_bytes(s1, s2);
254                 x1 += advance;
255                 y1 += advance;
256             }
257             v1[k1_offset] = x1 as isize;
258             if x1 > text1.len {
259                 // Ran off the right of the graph.
260                 k1end += 2;
261             } else if y1 > text2.len {
262                 // Ran off the bottom of the graph.
263                 k1start += 2;
264             } else if front {
265                 let k2_offset = v_offset as isize + delta - k1;
266                 if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
267                     // Mirror x2 onto top-left coordinate system.
268                     let x2 = text1.len as isize - v2[k2_offset as usize];
269                     if x1 as isize >= x2 {
270                         // Overlap detected.
271                         return bisect_split(text1, text2, x1, y1);
272                     }
273                 }
274             }
275             k1 += 2;
276         }
277 
278         // Walk the reverse path one step.
279         let mut k2 = -d + k2start;
280         while k2 <= d - k2end {
281             let k2_offset = (v_offset as isize + k2) as usize;
282             let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
283                 v2[k2_offset + 1]
284             } else {
285                 v2[k2_offset - 1] + 1
286             } as usize;
287             let mut y2 = (x2 as isize - k2) as usize;
288             if x2 < text1.len && y2 < text2.len {
289                 let advance = common_suffix_bytes(
290                     text1.substring(..text1.len - x2),
291                     text2.substring(..text2.len - y2),
292                 );
293                 x2 += advance;
294                 y2 += advance;
295             }
296             v2[k2_offset] = x2 as isize;
297             if x2 > text1.len {
298                 // Ran off the left of the graph.
299                 k2end += 2;
300             } else if y2 > text2.len {
301                 // Ran off the top of the graph.
302                 k2start += 2;
303             } else if !front {
304                 let k1_offset = v_offset as isize + delta - k2;
305                 if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
306                     let x1 = v1[k1_offset as usize] as usize;
307                     let y1 = v_offset + x1 - k1_offset as usize;
308                     // Mirror x2 onto top-left coordinate system.
309                     x2 = text1.len - x2;
310                     if x1 >= x2 {
311                         // Overlap detected.
312                         return bisect_split(text1, text2, x1, y1);
313                     }
314                 }
315             }
316             k2 += 2;
317         }
318     }
319     // Number of diffs equals number of characters, no commonality at all.
320     vec![Diff::Delete(text1), Diff::Insert(text2)]
321 }
322 
323 // Given the location of the 'middle snake', split the diff in two parts and
324 // recurse.
bisect_split<'a, 'b>( text1: Range<'a>, text2: Range<'b>, x: usize, y: usize, ) -> Vec<Diff<'a, 'b>>325 fn bisect_split<'a, 'b>(
326     text1: Range<'a>,
327     text2: Range<'b>,
328     x: usize,
329     y: usize,
330 ) -> Vec<Diff<'a, 'b>> {
331     let (text1a, text1b) = text1.split_at(x);
332     let (text2a, text2b) = text2.split_at(y);
333 
334     // Compute both diffs serially.
335     let mut diffs = main(text1a, text2a).diffs;
336     diffs.extend(main(text1b, text2b).diffs);
337 
338     diffs
339 }
340 
341 // Determine the length of the common prefix of two strings.
common_prefix(text1: Range, text2: Range) -> usize342 fn common_prefix(text1: Range, text2: Range) -> usize {
343     for ((i, ch1), ch2) in text1.char_indices().zip(text2.chars()) {
344         if ch1 != ch2 {
345             return i;
346         }
347     }
348     cmp::min(text1.len, text2.len)
349 }
350 
351 // Determine the length of the common suffix of two strings.
common_suffix(text1: Range, text2: Range) -> usize352 fn common_suffix(text1: Range, text2: Range) -> usize {
353     for ((i, ch1), ch2) in text1.char_indices().rev().zip(text2.chars().rev()) {
354         if ch1 != ch2 {
355             return text1.len - i - ch1.len_utf8();
356         }
357     }
358     cmp::min(text1.len, text2.len)
359 }
360 
common_prefix_bytes(text1: Range, text2: Range) -> usize361 fn common_prefix_bytes(text1: Range, text2: Range) -> usize {
362     for (i, (b1, b2)) in text1.bytes().zip(text2.bytes()).enumerate() {
363         if b1 != b2 {
364             return i;
365         }
366     }
367     cmp::min(text1.len, text2.len)
368 }
369 
common_suffix_bytes(text1: Range, text2: Range) -> usize370 fn common_suffix_bytes(text1: Range, text2: Range) -> usize {
371     for (i, (b1, b2)) in text1.bytes().rev().zip(text2.bytes().rev()).enumerate() {
372         if b1 != b2 {
373             return i;
374         }
375     }
376     cmp::min(text1.len, text2.len)
377 }
378 
379 // Determine if the suffix of one string is the prefix of another.
380 //
381 // Returns the number of characters common to the end of the first string and
382 // the start of the second string.
common_overlap(mut text1: Range, mut text2: Range) -> usize383 fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
384     // Eliminate the null case.
385     if text1.is_empty() || text2.is_empty() {
386         return 0;
387     }
388     // Truncate the longer string.
389     if text1.len > text2.len {
390         text1 = text1.substring(text1.len - text2.len..);
391     } else if text1.len < text2.len {
392         text2 = text2.substring(..text1.len);
393     }
394     // Quick check for the worst case.
395     if bytes(text1) == bytes(text2) {
396         return text1.len;
397     }
398 
399     // Start by looking for a single character match
400     // and increase length until no match is found.
401     // Performance analysis: https://neil.fraser.name/news/2010/11/04/
402     let mut best = 0;
403     let mut length = 1;
404     loop {
405         let pattern = text1.substring(text1.len - length..);
406         let found = match text2.find(pattern) {
407             Some(found) => found,
408             None => return best,
409         };
410         length += found;
411         if found == 0
412             || bytes(text1.substring(text1.len - length..)) == bytes(text2.substring(..length))
413         {
414             best = length;
415             length += 1;
416         }
417     }
418 }
419 
cleanup_char_boundary(solution: &mut Solution)420 fn cleanup_char_boundary(solution: &mut Solution) {
421     fn boundary_down(doc: &str, pos: usize) -> usize {
422         let mut adjust = 0;
423         while !doc.is_char_boundary(pos - adjust) {
424             adjust += 1;
425         }
426         adjust
427     }
428 
429     fn boundary_up(doc: &str, pos: usize) -> usize {
430         let mut adjust = 0;
431         while !doc.is_char_boundary(pos + adjust) {
432             adjust += 1;
433         }
434         adjust
435     }
436 
437     fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
438         let prev_end = prev.offset + prev.len;
439         if prev_end > range.offset {
440             let delta = cmp::min(prev_end - range.offset, range.len);
441             range.offset += delta;
442             range.len -= delta;
443         }
444     }
445 
446     let mut read = 0;
447     let mut retain = 0;
448     let mut last_delete = Range::empty();
449     let mut last_insert = Range::empty();
450     while let Some(&(mut diff)) = solution.diffs.get(read) {
451         read += 1;
452         match &mut diff {
453             Diff::Equal(range1, range2) => {
454                 let adjust = boundary_up(range1.doc, range1.offset);
455                 // If the whole range is sub-character, skip it.
456                 if range1.len <= adjust {
457                     continue;
458                 }
459                 range1.offset += adjust;
460                 range1.len -= adjust;
461                 range2.offset += adjust;
462                 range2.len -= adjust;
463                 let adjust = boundary_down(range1.doc, range1.offset + range1.len);
464                 range1.len -= adjust;
465                 range2.len -= adjust;
466                 last_delete = Range::empty();
467                 last_insert = Range::empty();
468             }
469             Diff::Delete(range) => {
470                 skip_overlap(&last_delete, range);
471                 if range.len == 0 {
472                     continue;
473                 }
474                 let adjust = boundary_down(range.doc, range.offset);
475                 range.offset -= adjust;
476                 range.len += adjust;
477                 let adjust = boundary_up(range.doc, range.offset + range.len);
478                 range.len += adjust;
479                 last_delete = *range;
480             }
481             Diff::Insert(range) => {
482                 skip_overlap(&last_insert, range);
483                 if range.len == 0 {
484                     continue;
485                 }
486                 let adjust = boundary_down(range.doc, range.offset);
487                 range.offset -= adjust;
488                 range.len += adjust;
489                 let adjust = boundary_up(range.doc, range.offset + range.len);
490                 range.len += adjust;
491                 last_insert = *range;
492             }
493         }
494         solution.diffs[retain] = diff;
495         retain += 1;
496     }
497 
498     solution.diffs.truncate(retain);
499     solution.utf8 = true;
500 }
501 
502 // Reduce the number of edits by eliminating semantically trivial equalities.
cleanup_semantic(solution: &mut Solution)503 fn cleanup_semantic(solution: &mut Solution) {
504     let mut diffs = &mut solution.diffs;
505     if diffs.is_empty() {
506         return;
507     }
508 
509     let mut changes = false;
510     let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
511     let mut last_equality = None; // Always equal to equalities.peek().text
512     let mut pointer = 0;
513     // Number of characters that changed prior to the equality.
514     let mut len_insertions1 = 0;
515     let mut len_deletions1 = 0;
516     // Number of characters that changed after the equality.
517     let mut len_insertions2 = 0;
518     let mut len_deletions2 = 0;
519     while let Some(&this_diff) = diffs.get(pointer) {
520         match this_diff {
521             Diff::Equal(text1, text2) => {
522                 equalities.push_back(pointer);
523                 len_insertions1 = len_insertions2;
524                 len_deletions1 = len_deletions2;
525                 len_insertions2 = 0;
526                 len_deletions2 = 0;
527                 last_equality = Some((text1, text2));
528                 pointer += 1;
529                 continue;
530             }
531             Diff::Delete(text) => len_deletions2 += text.len,
532             Diff::Insert(text) => len_insertions2 += text.len,
533         }
534         // Eliminate an equality that is smaller or equal to the edits on both
535         // sides of it.
536         if last_equality.map_or(false, |(last_equality, _)| {
537             last_equality.len <= cmp::max(len_insertions1, len_deletions1)
538                 && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
539         }) {
540             // Jump back to offending equality.
541             pointer = equalities.pop_back().unwrap();
542 
543             // Replace equality with a delete.
544             diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
545             // Insert a corresponding insert.
546             diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
547 
548             len_insertions1 = 0; // Reset the counters.
549             len_insertions2 = 0;
550             len_deletions1 = 0;
551             len_deletions2 = 0;
552             last_equality = None;
553             changes = true;
554 
555             // Throw away the previous equality (it needs to be reevaluated).
556             equalities.pop_back();
557             if let Some(back) = equalities.back() {
558                 // There is a safe equality we can fall back to.
559                 pointer = *back;
560             } else {
561                 // There are no previous equalities, jump back to the start.
562                 pointer = 0;
563                 continue;
564             }
565         }
566         pointer += 1;
567     }
568 
569     // Normalize the diff.
570     if changes {
571         cleanup_merge(solution);
572     }
573     cleanup_semantic_lossless(solution);
574     diffs = &mut solution.diffs;
575 
576     // Find any overlaps between deletions and insertions.
577     // e.g: <del>abcxxx</del><ins>xxxdef</ins>
578     //   -> <del>abc</del>xxx<ins>def</ins>
579     // e.g: <del>xxxabc</del><ins>defxxx</ins>
580     //   -> <ins>def</ins>xxx<del>abc</del>
581     // Only extract an overlap if it is as big as the edit ahead or behind it.
582     let mut pointer = 1;
583     while let Some(&this_diff) = diffs.get(pointer) {
584         let prev_diff = diffs[pointer - 1];
585         if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
586             let overlap_len1 = common_overlap(deletion, insertion);
587             let overlap_len2 = common_overlap(insertion, deletion);
588             let overlap_min = cmp::min(deletion.len, insertion.len);
589             if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
590                 // Overlap found. Insert an equality and trim the surrounding edits.
591                 diffs.insert(
592                     pointer,
593                     Diff::Equal(
594                         deletion.substring(deletion.len - overlap_len1..deletion.len),
595                         insertion.substring(..overlap_len1),
596                     ),
597                 );
598                 diffs[pointer - 1] =
599                     Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
600                 diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
601             } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
602                 // Reverse overlap found.
603                 // Insert an equality and swap and trim the surrounding edits.
604                 diffs.insert(
605                     pointer,
606                     Diff::Equal(
607                         deletion.substring(..overlap_len2),
608                         insertion.substring(insertion.len - overlap_len2..insertion.len),
609                     ),
610                 );
611                 diffs[pointer - 1] =
612                     Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
613                 diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
614             }
615             pointer += 1;
616         }
617         pointer += 1;
618     }
619 }
620 
621 // Look for single edits surrounded on both sides by equalities which can be
622 // shifted sideways to align the edit to a word boundary.
623 //
624 // e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
cleanup_semantic_lossless(solution: &mut Solution)625 fn cleanup_semantic_lossless(solution: &mut Solution) {
626     let diffs = &mut solution.diffs;
627     let mut pointer = 1;
628     while let Some(&next_diff) = diffs.get(pointer + 1) {
629         let prev_diff = diffs[pointer - 1];
630         if let (
631             Diff::Equal(mut prev_equal1, mut prev_equal2),
632             Diff::Equal(mut next_equal1, mut next_equal2),
633         ) = (prev_diff, next_diff)
634         {
635             // This is a single edit surrounded by equalities.
636             let mut edit = diffs[pointer];
637 
638             // First, shift the edit as far left as possible.
639             let common_offset = common_suffix(prev_equal1, edit.text());
640             let original_prev_len = prev_equal1.len;
641             prev_equal1.len -= common_offset;
642             prev_equal2.len -= common_offset;
643             edit.shift_left(common_offset);
644             next_equal1.offset -= common_offset;
645             next_equal1.len += common_offset;
646             next_equal2.offset -= common_offset;
647             next_equal2.len += common_offset;
648 
649             // Second, step character by character right, looking for the best fit.
650             let mut best_prev_equal = (prev_equal1, prev_equal2);
651             let mut best_edit = edit;
652             let mut best_next_equal = (next_equal1, next_equal2);
653             let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
654                 + cleanup_semantic_score(edit.text(), next_equal1);
655             while !edit.text().is_empty()
656                 && !next_equal1.is_empty()
657                 && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
658             {
659                 let increment = edit.text().chars().next().unwrap().len_utf8();
660                 prev_equal1.len += increment;
661                 prev_equal2.len += increment;
662                 edit.shift_right(increment);
663                 next_equal1.offset += increment;
664                 next_equal1.len -= increment;
665                 next_equal2.offset += increment;
666                 next_equal2.len -= increment;
667                 let score = cleanup_semantic_score(prev_equal1, edit.text())
668                     + cleanup_semantic_score(edit.text(), next_equal1);
669                 // The >= encourages trailing rather than leading whitespace on edits.
670                 if score >= best_score {
671                     best_score = score;
672                     best_prev_equal = (prev_equal1, prev_equal2);
673                     best_edit = edit;
674                     best_next_equal = (next_equal1, next_equal2);
675                 }
676             }
677 
678             if original_prev_len != best_prev_equal.0.len {
679                 // We have an improvement, save it back to the diff.
680                 if best_next_equal.0.is_empty() {
681                     diffs.remove(pointer + 1);
682                 } else {
683                     diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
684                 }
685                 diffs[pointer] = best_edit;
686                 if best_prev_equal.0.is_empty() {
687                     diffs.remove(pointer - 1);
688                     pointer -= 1;
689                 } else {
690                     diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
691                 }
692             }
693         }
694         pointer += 1;
695     }
696 }
697 
698 // Given two strings, compute a score representing whether the internal boundary
699 // falls on logical boundaries.
700 //
701 // Scores range from 6 (best) to 0 (worst).
cleanup_semantic_score(one: Range, two: Range) -> usize702 fn cleanup_semantic_score(one: Range, two: Range) -> usize {
703     if one.is_empty() || two.is_empty() {
704         // Edges are the best.
705         return 6;
706     }
707 
708     // Each port of this function behaves slightly differently due to subtle
709     // differences in each language's definition of things like 'whitespace'.
710     // Since this function's purpose is largely cosmetic, the choice has been
711     // made to use each language's native features rather than force total
712     // conformity.
713     let char1 = one.chars().next_back().unwrap();
714     let char2 = two.chars().next().unwrap();
715     let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
716     let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
717     let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
718     let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
719     let line_break1 = whitespace1 && char1.is_control();
720     let line_break2 = whitespace2 && char2.is_control();
721     let blank_line1 = line_break1 && (one.ends_with("\n\n") || one.ends_with("\n\r\n"));
722     let blank_line2 = line_break2 && (two.starts_with("\n\n") || two.starts_with("\r\n\r\n"));
723 
724     if blank_line1 || blank_line2 {
725         // Five points for blank lines.
726         5
727     } else if line_break1 || line_break2 {
728         // Four points for line breaks.
729         4
730     } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
731         // Three points for end of sentences.
732         3
733     } else if whitespace1 || whitespace2 {
734         // Two points for whitespace.
735         2
736     } else if non_alphanumeric1 || non_alphanumeric2 {
737         // One point for non-alphanumeric.
738         1
739     } else {
740         0
741     }
742 }
743 
744 // Reorder and merge like edit sections. Merge equalities. Any edit section can
745 // move as long as it doesn't cross an equality.
cleanup_merge(solution: &mut Solution)746 fn cleanup_merge(solution: &mut Solution) {
747     let diffs = &mut solution.diffs;
748     let common_prefix = if solution.utf8 {
749         common_prefix
750     } else {
751         common_prefix_bytes
752     };
753     let common_suffix = if solution.utf8 {
754         common_suffix
755     } else {
756         common_suffix_bytes
757     };
758 
759     loop {
760         if diffs.is_empty() {
761             return;
762         }
763 
764         diffs.push(Diff::Equal(
765             solution.text1.substring(solution.text1.len..),
766             solution.text2.substring(solution.text2.len..),
767         )); // Add a dummy entry at the end.
768         let mut pointer = 0;
769         let mut count_delete = 0;
770         let mut count_insert = 0;
771         let mut text_delete = Range::empty();
772         let mut text_insert = Range::empty();
773         while let Some(&this_diff) = diffs.get(pointer) {
774             match this_diff {
775                 Diff::Insert(text) => {
776                     count_insert += 1;
777                     if text_insert.is_empty() {
778                         text_insert = text;
779                     } else {
780                         text_insert.len += text.len;
781                     }
782                 }
783                 Diff::Delete(text) => {
784                     count_delete += 1;
785                     if text_delete.is_empty() {
786                         text_delete = text;
787                     } else {
788                         text_delete.len += text.len;
789                     }
790                 }
791                 Diff::Equal(text, _) => {
792                     let count_both = count_delete + count_insert;
793                     if count_both > 1 {
794                         let both_types = count_delete != 0 && count_insert != 0;
795                         // Delete the offending records.
796                         diffs.splice(pointer - count_both..pointer, None);
797                         pointer -= count_both;
798                         if both_types {
799                             // Factor out any common prefix.
800                             let common_length = common_prefix(text_insert, text_delete);
801                             if common_length != 0 {
802                                 if pointer > 0 {
803                                     match &mut diffs[pointer - 1] {
804                                         Diff::Equal(this_diff1, this_diff2) => {
805                                             this_diff1.len += common_length;
806                                             this_diff2.len += common_length;
807                                         }
808                                         _ => unreachable!(
809                                             "previous diff should have been an equality"
810                                         ),
811                                     }
812                                 } else {
813                                     diffs.insert(
814                                         pointer,
815                                         Diff::Equal(
816                                             text_delete.substring(..common_length),
817                                             text_insert.substring(..common_length),
818                                         ),
819                                     );
820                                     pointer += 1;
821                                 }
822                                 text_insert = text_insert.substring(common_length..);
823                                 text_delete = text_delete.substring(common_length..);
824                             }
825                             // Factor out any common suffix.
826                             let common_length = common_suffix(text_insert, text_delete);
827                             if common_length != 0 {
828                                 diffs[pointer].grow_left(common_length);
829                                 text_insert.len -= common_length;
830                                 text_delete.len -= common_length;
831                             }
832                         }
833                         // Insert the merged records.
834                         if !text_delete.is_empty() {
835                             diffs.insert(pointer, Diff::Delete(text_delete));
836                             pointer += 1;
837                         }
838                         if !text_insert.is_empty() {
839                             diffs.insert(pointer, Diff::Insert(text_insert));
840                             pointer += 1;
841                         }
842                     } else if pointer > 0 {
843                         if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
844                             diffs.get_mut(pointer - 1)
845                         {
846                             // Merge this equality with the previous one.
847                             prev_equal1.len += text.len;
848                             prev_equal2.len += text.len;
849                             diffs.remove(pointer);
850                             pointer -= 1;
851                         }
852                     }
853                     count_insert = 0;
854                     count_delete = 0;
855                     text_delete = Range::empty();
856                     text_insert = Range::empty();
857                 }
858             }
859             pointer += 1;
860         }
861         if diffs.last().unwrap().text().is_empty() {
862             diffs.pop(); // Remove the dummy entry at the end.
863         }
864 
865         // Second pass: look for single edits surrounded on both sides by equalities
866         // which can be shifted sideways to eliminate an equality.
867         // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
868         let mut changes = false;
869         let mut pointer = 1;
870         // Intentionally ignore the first and last element (don't need checking).
871         while let Some(&next_diff) = diffs.get(pointer + 1) {
872             let prev_diff = diffs[pointer - 1];
873             let this_diff = diffs[pointer];
874             if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
875                 // This is a single edit surrounded by equalities.
876                 if this_diff.text().ends_with(prev_diff) {
877                     // Shift the edit over the previous equality.
878                     diffs[pointer].shift_left(prev_diff.len);
879                     diffs[pointer + 1].grow_left(prev_diff.len);
880                     diffs.remove(pointer - 1); // Delete prev_diff.
881                     changes = true;
882                 } else if this_diff.text().starts_with(next_diff) {
883                     // Shift the edit over the next equality.
884                     diffs[pointer - 1].grow_right(next_diff.len);
885                     diffs[pointer].shift_right(next_diff.len);
886                     diffs.remove(pointer + 1); // Delete next_diff.
887                     changes = true;
888                 }
889             }
890             pointer += 1;
891         }
892         // If shifts were made, the diff needs reordering and another shift sweep.
893         if !changes {
894             return;
895         }
896     }
897 }
898 
899 impl Debug for Chunk<'_> {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result900     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
901         let (name, text) = match *self {
902             Chunk::Equal(text) => ("Equal", text),
903             Chunk::Delete(text) => ("Delete", text),
904             Chunk::Insert(text) => ("Insert", text),
905         };
906         write!(formatter, "{}({:?})", name, text)
907     }
908 }
909 
910 impl Debug for Diff<'_, '_> {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result911     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
912         let (name, bytes) = match *self {
913             Diff::Equal(range, _) => ("Equal", bytes(range)),
914             Diff::Delete(range) => ("Delete", bytes(range)),
915             Diff::Insert(range) => ("Insert", bytes(range)),
916         };
917         let text = String::from_utf8_lossy(bytes);
918         write!(formatter, "{}({:?})", name, text)
919     }
920 }
921 
922 impl<'a> From<Diff<'a, 'a>> for Chunk<'a> {
from(diff: Diff<'a, 'a>) -> Self923     fn from(diff: Diff<'a, 'a>) -> Self {
924         match diff {
925             Diff::Equal(range, _) => Chunk::Equal(str(range)),
926             Diff::Delete(range) => Chunk::Delete(str(range)),
927             Diff::Insert(range) => Chunk::Insert(str(range)),
928         }
929     }
930 }
931