1 //! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
2 //!
3 //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4 //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5 //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logoColor=white&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K
6 //!
7 //! <br>
8 //!
9 //! ## Diff library with semantic cleanup, based on Google's diff-match-patch
10 //!
11 //! This library is a port of the Diff component of [Diff Match Patch] to Rust.
12 //! The diff implementation is based on [Myers' diff algorithm] but includes
13 //! some [semantic cleanups] to increase human readability by factoring out
14 //! commonalities which are likely to be coincidental.
15 //!
16 //! Diff Match Patch was originally built in 2006 to power Google Docs.
17 //!
18 //! # Interface
19 //!
20 //! Here is the entire API of the Rust implementation. It operates on borrowed
21 //! strings and the return value of the diff algorithm is a vector of chunks
22 //! pointing into slices of those input strings.
23 //!
24 //! ```
25 //! pub enum Chunk<'a> {
26 //! Equal(&'a str),
27 //! Delete(&'a str),
28 //! Insert(&'a str),
29 //! }
30 //!
31 //! # const IGNORE: &str = stringify! {
32 //! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
33 //! # };
34 //! ```
35 //!
36 //! [Diff Match Patch]: https://github.com/google/diff-match-patch
37 //! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
38 //! [semantic cleanups]: https://neil.fraser.name/writing/diff/
39
40 #![doc(html_root_url = "https://docs.rs/dissimilar/1.0.3")]
41 #![allow(
42 clippy::blocks_in_if_conditions,
43 clippy::cast_possible_wrap,
44 clippy::cast_sign_loss,
45 clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
46 clippy::collapsible_else_if,
47 clippy::comparison_chain,
48 clippy::match_same_arms,
49 clippy::module_name_repetitions,
50 clippy::must_use_candidate,
51 clippy::new_without_default,
52 clippy::shadow_unrelated,
53 clippy::similar_names,
54 clippy::too_many_lines,
55 clippy::unseparated_literal_suffix
56 )]
57
58 mod find;
59 mod range;
60
61 #[cfg(test)]
62 mod tests;
63
64 use crate::range::{bytes, str, Range};
65 use std::cmp;
66 use std::collections::VecDeque;
67 use std::fmt::{self, Debug};
68
69 #[derive(Copy, Clone, PartialEq, Eq)]
70 pub enum Chunk<'a> {
71 Equal(&'a str),
72 Delete(&'a str),
73 Insert(&'a str),
74 }
75
76 #[derive(Copy, Clone)]
77 enum Diff<'a, 'b> {
78 Equal(Range<'a>, Range<'b>),
79 Delete(Range<'a>),
80 Insert(Range<'b>),
81 }
82
83 impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
text(&self) -> Range<'tmp>84 fn text(&self) -> Range<'tmp> {
85 match *self {
86 Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
87 }
88 }
89
grow_left(&mut self, increment: usize)90 fn grow_left(&mut self, increment: usize) {
91 self.for_each(|range| {
92 range.offset -= increment;
93 range.len += increment;
94 });
95 }
96
grow_right(&mut self, increment: usize)97 fn grow_right(&mut self, increment: usize) {
98 self.for_each(|range| range.len += increment);
99 }
100
shift_left(&mut self, increment: usize)101 fn shift_left(&mut self, increment: usize) {
102 self.for_each(|range| range.offset -= increment);
103 }
104
shift_right(&mut self, increment: usize)105 fn shift_right(&mut self, increment: usize) {
106 self.for_each(|range| range.offset += increment);
107 }
108
for_each(&mut self, f: impl Fn(&mut Range))109 fn for_each(&mut self, f: impl Fn(&mut Range)) {
110 match self {
111 Diff::Equal(range1, range2) => {
112 f(range1);
113 f(range2);
114 }
115 Diff::Delete(range) => f(range),
116 Diff::Insert(range) => f(range),
117 }
118 }
119 }
120
diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>>121 pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
122 let text1 = Range::new(text1, ..);
123 let text2 = Range::new(text2, ..);
124 let mut solution = main(text1, text2);
125 cleanup_char_boundary(&mut solution);
126 cleanup_semantic(&mut solution);
127 cleanup_merge(&mut solution);
128 solution.diffs.into_iter().map(Chunk::from).collect()
129 }
130
131 struct Solution<'a, 'b> {
132 text1: Range<'a>,
133 text2: Range<'b>,
134 diffs: Vec<Diff<'a, 'b>>,
135 utf8: bool,
136 }
137
main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b>138 fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
139 let whole1 = text1;
140 let whole2 = text2;
141
142 // Trim off common prefix.
143 let common_prefix_len = common_prefix_bytes(text1, text2);
144 let common_prefix = Diff::Equal(
145 text1.substring(..common_prefix_len),
146 text2.substring(..common_prefix_len),
147 );
148 text1 = text1.substring(common_prefix_len..);
149 text2 = text2.substring(common_prefix_len..);
150
151 // Trim off common suffix.
152 let common_suffix_len = common_suffix_bytes(text1, text2);
153 let common_suffix = Diff::Equal(
154 text1.substring(text1.len - common_suffix_len..),
155 text2.substring(text2.len - common_suffix_len..),
156 );
157 text1 = text1.substring(..text1.len - common_suffix_len);
158 text2 = text2.substring(..text2.len - common_suffix_len);
159
160 // Compute the diff on the middle block.
161 let mut solution = Solution {
162 text1: whole1,
163 text2: whole2,
164 diffs: compute(text1, text2),
165 utf8: false,
166 };
167
168 // Restore the prefix and suffix.
169 if common_prefix_len > 0 {
170 solution.diffs.insert(0, common_prefix);
171 }
172 if common_suffix_len > 0 {
173 solution.diffs.push(common_suffix);
174 }
175
176 cleanup_merge(&mut solution);
177
178 solution
179 }
180
181 // Find the differences between two texts. Assumes that the texts do not have
182 // any common prefix or suffix.
compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>>183 fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
184 match (text1.is_empty(), text2.is_empty()) {
185 (true, true) => return Vec::new(),
186 (true, false) => return vec![Diff::Insert(text2)],
187 (false, true) => return vec![Diff::Delete(text1)],
188 (false, false) => {}
189 }
190
191 // Check for entire shorter text inside the longer text.
192 if text1.len > text2.len {
193 if let Some(i) = text1.find(text2) {
194 return vec![
195 Diff::Delete(text1.substring(..i)),
196 Diff::Equal(text1.substring(i..i + text2.len), text2),
197 Diff::Delete(text1.substring(i + text2.len..)),
198 ];
199 }
200 } else {
201 if let Some(i) = text2.find(text1) {
202 return vec![
203 Diff::Insert(text2.substring(..i)),
204 Diff::Equal(text1, text2.substring(i..i + text1.len)),
205 Diff::Insert(text2.substring(i + text1.len..)),
206 ];
207 }
208 }
209
210 if text1.len == 1 || text2.len == 1 {
211 // Single character string.
212 // After the previous check, the character can't be an equality.
213 return vec![Diff::Delete(text1), Diff::Insert(text2)];
214 }
215
216 bisect(text1, text2)
217 }
218
219 // Find the 'middle snake' of a diff, split the problem in two and return the
220 // recursively constructed diff.
221 //
222 // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>>223 fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
224 let max_d = (text1.len + text2.len + 1) / 2;
225 let v_offset = max_d;
226 let v_len = 2 * max_d;
227 let mut v1 = vec![-1isize; v_len];
228 let mut v2 = vec![-1isize; v_len];
229 v1[v_offset + 1] = 0;
230 v2[v_offset + 1] = 0;
231 let delta = text1.len as isize - text2.len as isize;
232 // If the total number of characters is odd, then the front path will
233 // collide with the reverse path.
234 let front = delta % 2 != 0;
235 // Offsets for start and end of k loop.
236 // Prevents mapping of space beyond the grid.
237 let mut k1start = 0;
238 let mut k1end = 0;
239 let mut k2start = 0;
240 let mut k2end = 0;
241 for d in 0..max_d as isize {
242 // Walk the front path one step.
243 let mut k1 = -d + k1start;
244 while k1 <= d - k1end {
245 let k1_offset = (v_offset as isize + k1) as usize;
246 let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
247 v1[k1_offset + 1]
248 } else {
249 v1[k1_offset - 1] + 1
250 } as usize;
251 let mut y1 = (x1 as isize - k1) as usize;
252 if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
253 let advance = common_prefix_bytes(s1, s2);
254 x1 += advance;
255 y1 += advance;
256 }
257 v1[k1_offset] = x1 as isize;
258 if x1 > text1.len {
259 // Ran off the right of the graph.
260 k1end += 2;
261 } else if y1 > text2.len {
262 // Ran off the bottom of the graph.
263 k1start += 2;
264 } else if front {
265 let k2_offset = v_offset as isize + delta - k1;
266 if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
267 // Mirror x2 onto top-left coordinate system.
268 let x2 = text1.len as isize - v2[k2_offset as usize];
269 if x1 as isize >= x2 {
270 // Overlap detected.
271 return bisect_split(text1, text2, x1, y1);
272 }
273 }
274 }
275 k1 += 2;
276 }
277
278 // Walk the reverse path one step.
279 let mut k2 = -d + k2start;
280 while k2 <= d - k2end {
281 let k2_offset = (v_offset as isize + k2) as usize;
282 let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
283 v2[k2_offset + 1]
284 } else {
285 v2[k2_offset - 1] + 1
286 } as usize;
287 let mut y2 = (x2 as isize - k2) as usize;
288 if x2 < text1.len && y2 < text2.len {
289 let advance = common_suffix_bytes(
290 text1.substring(..text1.len - x2),
291 text2.substring(..text2.len - y2),
292 );
293 x2 += advance;
294 y2 += advance;
295 }
296 v2[k2_offset] = x2 as isize;
297 if x2 > text1.len {
298 // Ran off the left of the graph.
299 k2end += 2;
300 } else if y2 > text2.len {
301 // Ran off the top of the graph.
302 k2start += 2;
303 } else if !front {
304 let k1_offset = v_offset as isize + delta - k2;
305 if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
306 let x1 = v1[k1_offset as usize] as usize;
307 let y1 = v_offset + x1 - k1_offset as usize;
308 // Mirror x2 onto top-left coordinate system.
309 x2 = text1.len - x2;
310 if x1 >= x2 {
311 // Overlap detected.
312 return bisect_split(text1, text2, x1, y1);
313 }
314 }
315 }
316 k2 += 2;
317 }
318 }
319 // Number of diffs equals number of characters, no commonality at all.
320 vec![Diff::Delete(text1), Diff::Insert(text2)]
321 }
322
323 // Given the location of the 'middle snake', split the diff in two parts and
324 // recurse.
bisect_split<'a, 'b>( text1: Range<'a>, text2: Range<'b>, x: usize, y: usize, ) -> Vec<Diff<'a, 'b>>325 fn bisect_split<'a, 'b>(
326 text1: Range<'a>,
327 text2: Range<'b>,
328 x: usize,
329 y: usize,
330 ) -> Vec<Diff<'a, 'b>> {
331 let (text1a, text1b) = text1.split_at(x);
332 let (text2a, text2b) = text2.split_at(y);
333
334 // Compute both diffs serially.
335 let mut diffs = main(text1a, text2a).diffs;
336 diffs.extend(main(text1b, text2b).diffs);
337
338 diffs
339 }
340
341 // Determine the length of the common prefix of two strings.
common_prefix(text1: Range, text2: Range) -> usize342 fn common_prefix(text1: Range, text2: Range) -> usize {
343 for ((i, ch1), ch2) in text1.char_indices().zip(text2.chars()) {
344 if ch1 != ch2 {
345 return i;
346 }
347 }
348 cmp::min(text1.len, text2.len)
349 }
350
351 // Determine the length of the common suffix of two strings.
common_suffix(text1: Range, text2: Range) -> usize352 fn common_suffix(text1: Range, text2: Range) -> usize {
353 for ((i, ch1), ch2) in text1.char_indices().rev().zip(text2.chars().rev()) {
354 if ch1 != ch2 {
355 return text1.len - i - ch1.len_utf8();
356 }
357 }
358 cmp::min(text1.len, text2.len)
359 }
360
common_prefix_bytes(text1: Range, text2: Range) -> usize361 fn common_prefix_bytes(text1: Range, text2: Range) -> usize {
362 for (i, (b1, b2)) in text1.bytes().zip(text2.bytes()).enumerate() {
363 if b1 != b2 {
364 return i;
365 }
366 }
367 cmp::min(text1.len, text2.len)
368 }
369
common_suffix_bytes(text1: Range, text2: Range) -> usize370 fn common_suffix_bytes(text1: Range, text2: Range) -> usize {
371 for (i, (b1, b2)) in text1.bytes().rev().zip(text2.bytes().rev()).enumerate() {
372 if b1 != b2 {
373 return i;
374 }
375 }
376 cmp::min(text1.len, text2.len)
377 }
378
379 // Determine if the suffix of one string is the prefix of another.
380 //
381 // Returns the number of characters common to the end of the first string and
382 // the start of the second string.
common_overlap(mut text1: Range, mut text2: Range) -> usize383 fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
384 // Eliminate the null case.
385 if text1.is_empty() || text2.is_empty() {
386 return 0;
387 }
388 // Truncate the longer string.
389 if text1.len > text2.len {
390 text1 = text1.substring(text1.len - text2.len..);
391 } else if text1.len < text2.len {
392 text2 = text2.substring(..text1.len);
393 }
394 // Quick check for the worst case.
395 if bytes(text1) == bytes(text2) {
396 return text1.len;
397 }
398
399 // Start by looking for a single character match
400 // and increase length until no match is found.
401 // Performance analysis: https://neil.fraser.name/news/2010/11/04/
402 let mut best = 0;
403 let mut length = 1;
404 loop {
405 let pattern = text1.substring(text1.len - length..);
406 let found = match text2.find(pattern) {
407 Some(found) => found,
408 None => return best,
409 };
410 length += found;
411 if found == 0
412 || bytes(text1.substring(text1.len - length..)) == bytes(text2.substring(..length))
413 {
414 best = length;
415 length += 1;
416 }
417 }
418 }
419
cleanup_char_boundary(solution: &mut Solution)420 fn cleanup_char_boundary(solution: &mut Solution) {
421 fn boundary_down(doc: &str, pos: usize) -> usize {
422 let mut adjust = 0;
423 while !doc.is_char_boundary(pos - adjust) {
424 adjust += 1;
425 }
426 adjust
427 }
428
429 fn boundary_up(doc: &str, pos: usize) -> usize {
430 let mut adjust = 0;
431 while !doc.is_char_boundary(pos + adjust) {
432 adjust += 1;
433 }
434 adjust
435 }
436
437 fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
438 let prev_end = prev.offset + prev.len;
439 if prev_end > range.offset {
440 let delta = cmp::min(prev_end - range.offset, range.len);
441 range.offset += delta;
442 range.len -= delta;
443 }
444 }
445
446 let mut read = 0;
447 let mut retain = 0;
448 let mut last_delete = Range::empty();
449 let mut last_insert = Range::empty();
450 while let Some(&(mut diff)) = solution.diffs.get(read) {
451 read += 1;
452 match &mut diff {
453 Diff::Equal(range1, range2) => {
454 let adjust = boundary_up(range1.doc, range1.offset);
455 // If the whole range is sub-character, skip it.
456 if range1.len <= adjust {
457 continue;
458 }
459 range1.offset += adjust;
460 range1.len -= adjust;
461 range2.offset += adjust;
462 range2.len -= adjust;
463 let adjust = boundary_down(range1.doc, range1.offset + range1.len);
464 range1.len -= adjust;
465 range2.len -= adjust;
466 last_delete = Range::empty();
467 last_insert = Range::empty();
468 }
469 Diff::Delete(range) => {
470 skip_overlap(&last_delete, range);
471 if range.len == 0 {
472 continue;
473 }
474 let adjust = boundary_down(range.doc, range.offset);
475 range.offset -= adjust;
476 range.len += adjust;
477 let adjust = boundary_up(range.doc, range.offset + range.len);
478 range.len += adjust;
479 last_delete = *range;
480 }
481 Diff::Insert(range) => {
482 skip_overlap(&last_insert, range);
483 if range.len == 0 {
484 continue;
485 }
486 let adjust = boundary_down(range.doc, range.offset);
487 range.offset -= adjust;
488 range.len += adjust;
489 let adjust = boundary_up(range.doc, range.offset + range.len);
490 range.len += adjust;
491 last_insert = *range;
492 }
493 }
494 solution.diffs[retain] = diff;
495 retain += 1;
496 }
497
498 solution.diffs.truncate(retain);
499 solution.utf8 = true;
500 }
501
502 // Reduce the number of edits by eliminating semantically trivial equalities.
cleanup_semantic(solution: &mut Solution)503 fn cleanup_semantic(solution: &mut Solution) {
504 let mut diffs = &mut solution.diffs;
505 if diffs.is_empty() {
506 return;
507 }
508
509 let mut changes = false;
510 let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
511 let mut last_equality = None; // Always equal to equalities.peek().text
512 let mut pointer = 0;
513 // Number of characters that changed prior to the equality.
514 let mut len_insertions1 = 0;
515 let mut len_deletions1 = 0;
516 // Number of characters that changed after the equality.
517 let mut len_insertions2 = 0;
518 let mut len_deletions2 = 0;
519 while let Some(&this_diff) = diffs.get(pointer) {
520 match this_diff {
521 Diff::Equal(text1, text2) => {
522 equalities.push_back(pointer);
523 len_insertions1 = len_insertions2;
524 len_deletions1 = len_deletions2;
525 len_insertions2 = 0;
526 len_deletions2 = 0;
527 last_equality = Some((text1, text2));
528 pointer += 1;
529 continue;
530 }
531 Diff::Delete(text) => len_deletions2 += text.len,
532 Diff::Insert(text) => len_insertions2 += text.len,
533 }
534 // Eliminate an equality that is smaller or equal to the edits on both
535 // sides of it.
536 if last_equality.map_or(false, |(last_equality, _)| {
537 last_equality.len <= cmp::max(len_insertions1, len_deletions1)
538 && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
539 }) {
540 // Jump back to offending equality.
541 pointer = equalities.pop_back().unwrap();
542
543 // Replace equality with a delete.
544 diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
545 // Insert a corresponding insert.
546 diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
547
548 len_insertions1 = 0; // Reset the counters.
549 len_insertions2 = 0;
550 len_deletions1 = 0;
551 len_deletions2 = 0;
552 last_equality = None;
553 changes = true;
554
555 // Throw away the previous equality (it needs to be reevaluated).
556 equalities.pop_back();
557 if let Some(back) = equalities.back() {
558 // There is a safe equality we can fall back to.
559 pointer = *back;
560 } else {
561 // There are no previous equalities, jump back to the start.
562 pointer = 0;
563 continue;
564 }
565 }
566 pointer += 1;
567 }
568
569 // Normalize the diff.
570 if changes {
571 cleanup_merge(solution);
572 }
573 cleanup_semantic_lossless(solution);
574 diffs = &mut solution.diffs;
575
576 // Find any overlaps between deletions and insertions.
577 // e.g: <del>abcxxx</del><ins>xxxdef</ins>
578 // -> <del>abc</del>xxx<ins>def</ins>
579 // e.g: <del>xxxabc</del><ins>defxxx</ins>
580 // -> <ins>def</ins>xxx<del>abc</del>
581 // Only extract an overlap if it is as big as the edit ahead or behind it.
582 let mut pointer = 1;
583 while let Some(&this_diff) = diffs.get(pointer) {
584 let prev_diff = diffs[pointer - 1];
585 if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
586 let overlap_len1 = common_overlap(deletion, insertion);
587 let overlap_len2 = common_overlap(insertion, deletion);
588 let overlap_min = cmp::min(deletion.len, insertion.len);
589 if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
590 // Overlap found. Insert an equality and trim the surrounding edits.
591 diffs.insert(
592 pointer,
593 Diff::Equal(
594 deletion.substring(deletion.len - overlap_len1..deletion.len),
595 insertion.substring(..overlap_len1),
596 ),
597 );
598 diffs[pointer - 1] =
599 Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
600 diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
601 } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
602 // Reverse overlap found.
603 // Insert an equality and swap and trim the surrounding edits.
604 diffs.insert(
605 pointer,
606 Diff::Equal(
607 deletion.substring(..overlap_len2),
608 insertion.substring(insertion.len - overlap_len2..insertion.len),
609 ),
610 );
611 diffs[pointer - 1] =
612 Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
613 diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
614 }
615 pointer += 1;
616 }
617 pointer += 1;
618 }
619 }
620
621 // Look for single edits surrounded on both sides by equalities which can be
622 // shifted sideways to align the edit to a word boundary.
623 //
624 // e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
cleanup_semantic_lossless(solution: &mut Solution)625 fn cleanup_semantic_lossless(solution: &mut Solution) {
626 let diffs = &mut solution.diffs;
627 let mut pointer = 1;
628 while let Some(&next_diff) = diffs.get(pointer + 1) {
629 let prev_diff = diffs[pointer - 1];
630 if let (
631 Diff::Equal(mut prev_equal1, mut prev_equal2),
632 Diff::Equal(mut next_equal1, mut next_equal2),
633 ) = (prev_diff, next_diff)
634 {
635 // This is a single edit surrounded by equalities.
636 let mut edit = diffs[pointer];
637
638 // First, shift the edit as far left as possible.
639 let common_offset = common_suffix(prev_equal1, edit.text());
640 let original_prev_len = prev_equal1.len;
641 prev_equal1.len -= common_offset;
642 prev_equal2.len -= common_offset;
643 edit.shift_left(common_offset);
644 next_equal1.offset -= common_offset;
645 next_equal1.len += common_offset;
646 next_equal2.offset -= common_offset;
647 next_equal2.len += common_offset;
648
649 // Second, step character by character right, looking for the best fit.
650 let mut best_prev_equal = (prev_equal1, prev_equal2);
651 let mut best_edit = edit;
652 let mut best_next_equal = (next_equal1, next_equal2);
653 let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
654 + cleanup_semantic_score(edit.text(), next_equal1);
655 while !edit.text().is_empty()
656 && !next_equal1.is_empty()
657 && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
658 {
659 let increment = edit.text().chars().next().unwrap().len_utf8();
660 prev_equal1.len += increment;
661 prev_equal2.len += increment;
662 edit.shift_right(increment);
663 next_equal1.offset += increment;
664 next_equal1.len -= increment;
665 next_equal2.offset += increment;
666 next_equal2.len -= increment;
667 let score = cleanup_semantic_score(prev_equal1, edit.text())
668 + cleanup_semantic_score(edit.text(), next_equal1);
669 // The >= encourages trailing rather than leading whitespace on edits.
670 if score >= best_score {
671 best_score = score;
672 best_prev_equal = (prev_equal1, prev_equal2);
673 best_edit = edit;
674 best_next_equal = (next_equal1, next_equal2);
675 }
676 }
677
678 if original_prev_len != best_prev_equal.0.len {
679 // We have an improvement, save it back to the diff.
680 if best_next_equal.0.is_empty() {
681 diffs.remove(pointer + 1);
682 } else {
683 diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
684 }
685 diffs[pointer] = best_edit;
686 if best_prev_equal.0.is_empty() {
687 diffs.remove(pointer - 1);
688 pointer -= 1;
689 } else {
690 diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
691 }
692 }
693 }
694 pointer += 1;
695 }
696 }
697
698 // Given two strings, compute a score representing whether the internal boundary
699 // falls on logical boundaries.
700 //
701 // Scores range from 6 (best) to 0 (worst).
cleanup_semantic_score(one: Range, two: Range) -> usize702 fn cleanup_semantic_score(one: Range, two: Range) -> usize {
703 if one.is_empty() || two.is_empty() {
704 // Edges are the best.
705 return 6;
706 }
707
708 // Each port of this function behaves slightly differently due to subtle
709 // differences in each language's definition of things like 'whitespace'.
710 // Since this function's purpose is largely cosmetic, the choice has been
711 // made to use each language's native features rather than force total
712 // conformity.
713 let char1 = one.chars().next_back().unwrap();
714 let char2 = two.chars().next().unwrap();
715 let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
716 let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
717 let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
718 let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
719 let line_break1 = whitespace1 && char1.is_control();
720 let line_break2 = whitespace2 && char2.is_control();
721 let blank_line1 = line_break1 && (one.ends_with("\n\n") || one.ends_with("\n\r\n"));
722 let blank_line2 = line_break2 && (two.starts_with("\n\n") || two.starts_with("\r\n\r\n"));
723
724 if blank_line1 || blank_line2 {
725 // Five points for blank lines.
726 5
727 } else if line_break1 || line_break2 {
728 // Four points for line breaks.
729 4
730 } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
731 // Three points for end of sentences.
732 3
733 } else if whitespace1 || whitespace2 {
734 // Two points for whitespace.
735 2
736 } else if non_alphanumeric1 || non_alphanumeric2 {
737 // One point for non-alphanumeric.
738 1
739 } else {
740 0
741 }
742 }
743
744 // Reorder and merge like edit sections. Merge equalities. Any edit section can
745 // move as long as it doesn't cross an equality.
cleanup_merge(solution: &mut Solution)746 fn cleanup_merge(solution: &mut Solution) {
747 let diffs = &mut solution.diffs;
748 let common_prefix = if solution.utf8 {
749 common_prefix
750 } else {
751 common_prefix_bytes
752 };
753 let common_suffix = if solution.utf8 {
754 common_suffix
755 } else {
756 common_suffix_bytes
757 };
758
759 loop {
760 if diffs.is_empty() {
761 return;
762 }
763
764 diffs.push(Diff::Equal(
765 solution.text1.substring(solution.text1.len..),
766 solution.text2.substring(solution.text2.len..),
767 )); // Add a dummy entry at the end.
768 let mut pointer = 0;
769 let mut count_delete = 0;
770 let mut count_insert = 0;
771 let mut text_delete = Range::empty();
772 let mut text_insert = Range::empty();
773 while let Some(&this_diff) = diffs.get(pointer) {
774 match this_diff {
775 Diff::Insert(text) => {
776 count_insert += 1;
777 if text_insert.is_empty() {
778 text_insert = text;
779 } else {
780 text_insert.len += text.len;
781 }
782 }
783 Diff::Delete(text) => {
784 count_delete += 1;
785 if text_delete.is_empty() {
786 text_delete = text;
787 } else {
788 text_delete.len += text.len;
789 }
790 }
791 Diff::Equal(text, _) => {
792 let count_both = count_delete + count_insert;
793 if count_both > 1 {
794 let both_types = count_delete != 0 && count_insert != 0;
795 // Delete the offending records.
796 diffs.splice(pointer - count_both..pointer, None);
797 pointer -= count_both;
798 if both_types {
799 // Factor out any common prefix.
800 let common_length = common_prefix(text_insert, text_delete);
801 if common_length != 0 {
802 if pointer > 0 {
803 match &mut diffs[pointer - 1] {
804 Diff::Equal(this_diff1, this_diff2) => {
805 this_diff1.len += common_length;
806 this_diff2.len += common_length;
807 }
808 _ => unreachable!(
809 "previous diff should have been an equality"
810 ),
811 }
812 } else {
813 diffs.insert(
814 pointer,
815 Diff::Equal(
816 text_delete.substring(..common_length),
817 text_insert.substring(..common_length),
818 ),
819 );
820 pointer += 1;
821 }
822 text_insert = text_insert.substring(common_length..);
823 text_delete = text_delete.substring(common_length..);
824 }
825 // Factor out any common suffix.
826 let common_length = common_suffix(text_insert, text_delete);
827 if common_length != 0 {
828 diffs[pointer].grow_left(common_length);
829 text_insert.len -= common_length;
830 text_delete.len -= common_length;
831 }
832 }
833 // Insert the merged records.
834 if !text_delete.is_empty() {
835 diffs.insert(pointer, Diff::Delete(text_delete));
836 pointer += 1;
837 }
838 if !text_insert.is_empty() {
839 diffs.insert(pointer, Diff::Insert(text_insert));
840 pointer += 1;
841 }
842 } else if pointer > 0 {
843 if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
844 diffs.get_mut(pointer - 1)
845 {
846 // Merge this equality with the previous one.
847 prev_equal1.len += text.len;
848 prev_equal2.len += text.len;
849 diffs.remove(pointer);
850 pointer -= 1;
851 }
852 }
853 count_insert = 0;
854 count_delete = 0;
855 text_delete = Range::empty();
856 text_insert = Range::empty();
857 }
858 }
859 pointer += 1;
860 }
861 if diffs.last().unwrap().text().is_empty() {
862 diffs.pop(); // Remove the dummy entry at the end.
863 }
864
865 // Second pass: look for single edits surrounded on both sides by equalities
866 // which can be shifted sideways to eliminate an equality.
867 // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
868 let mut changes = false;
869 let mut pointer = 1;
870 // Intentionally ignore the first and last element (don't need checking).
871 while let Some(&next_diff) = diffs.get(pointer + 1) {
872 let prev_diff = diffs[pointer - 1];
873 let this_diff = diffs[pointer];
874 if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
875 // This is a single edit surrounded by equalities.
876 if this_diff.text().ends_with(prev_diff) {
877 // Shift the edit over the previous equality.
878 diffs[pointer].shift_left(prev_diff.len);
879 diffs[pointer + 1].grow_left(prev_diff.len);
880 diffs.remove(pointer - 1); // Delete prev_diff.
881 changes = true;
882 } else if this_diff.text().starts_with(next_diff) {
883 // Shift the edit over the next equality.
884 diffs[pointer - 1].grow_right(next_diff.len);
885 diffs[pointer].shift_right(next_diff.len);
886 diffs.remove(pointer + 1); // Delete next_diff.
887 changes = true;
888 }
889 }
890 pointer += 1;
891 }
892 // If shifts were made, the diff needs reordering and another shift sweep.
893 if !changes {
894 return;
895 }
896 }
897 }
898
899 impl Debug for Chunk<'_> {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result900 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
901 let (name, text) = match *self {
902 Chunk::Equal(text) => ("Equal", text),
903 Chunk::Delete(text) => ("Delete", text),
904 Chunk::Insert(text) => ("Insert", text),
905 };
906 write!(formatter, "{}({:?})", name, text)
907 }
908 }
909
910 impl Debug for Diff<'_, '_> {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result911 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
912 let (name, bytes) = match *self {
913 Diff::Equal(range, _) => ("Equal", bytes(range)),
914 Diff::Delete(range) => ("Delete", bytes(range)),
915 Diff::Insert(range) => ("Insert", bytes(range)),
916 };
917 let text = String::from_utf8_lossy(bytes);
918 write!(formatter, "{}({:?})", name, text)
919 }
920 }
921
922 impl<'a> From<Diff<'a, 'a>> for Chunk<'a> {
from(diff: Diff<'a, 'a>) -> Self923 fn from(diff: Diff<'a, 'a>) -> Self {
924 match diff {
925 Diff::Equal(range, _) => Chunk::Equal(str(range)),
926 Diff::Delete(range) => Chunk::Delete(str(range)),
927 Diff::Insert(range) => Chunk::Insert(str(range)),
928 }
929 }
930 }
931