1 use std::cmp::{max, min};
2 use std::collections::HashMap;
3 use std::hash::Hash;
4 use utils::calculate_ratio;
5 
6 #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
7 pub struct Match {
8     pub first_start: usize,
9     pub second_start: usize,
10     pub size: usize,
11 }
12 
13 impl Match {
new(first_start: usize, second_start: usize, size: usize) -> Match14     fn new(first_start: usize, second_start: usize, size: usize) -> Match {
15         Match {
16             first_start,
17             second_start,
18             size,
19         }
20     }
21 }
22 
23 #[derive(Debug, Clone, PartialEq)]
24 pub struct Opcode {
25     pub tag: String,
26     pub first_start: usize,
27     pub first_end: usize,
28     pub second_start: usize,
29     pub second_end: usize,
30 }
31 
32 impl Opcode {
new( tag: String, first_start: usize, first_end: usize, second_start: usize, second_end: usize, ) -> Opcode33     fn new(
34         tag: String,
35         first_start: usize,
36         first_end: usize,
37         second_start: usize,
38         second_end: usize,
39     ) -> Opcode {
40         Opcode {
41             tag,
42             first_start,
43             first_end,
44             second_start,
45             second_end,
46         }
47     }
48 }
49 
50 pub trait Sequence: Eq + Hash {}
51 impl<T: Eq + Hash> Sequence for T {}
52 
53 pub struct SequenceMatcher<'a, T: 'a + Sequence> {
54     first_sequence: &'a [T],
55     second_sequence: &'a [T],
56     matching_blocks: Option<Vec<Match>>,
57     opcodes: Option<Vec<Opcode>>,
58     is_junk: Option<fn(&T) -> bool>,
59     second_sequence_elements: HashMap<&'a T, Vec<usize>>,
60 }
61 
62 impl<'a, T: Sequence> SequenceMatcher<'a, T> {
new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T> where S: AsRef<[T]> + ?Sized,63     pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T>
64     where
65         S: AsRef<[T]> + ?Sized,
66     {
67         let mut matcher = SequenceMatcher {
68             first_sequence: first_sequence.as_ref(),
69             second_sequence: second_sequence.as_ref(),
70             matching_blocks: None,
71             opcodes: None,
72             is_junk: None,
73             second_sequence_elements: HashMap::new(),
74         };
75         matcher.set_seqs(first_sequence, second_sequence);
76         matcher
77     }
78 
set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>)79     pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) {
80         self.is_junk = is_junk;
81         self.matching_blocks = None;
82         self.opcodes = None;
83         self.chain_second_seq();
84     }
85 
set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S) where S: AsRef<[T]> + ?Sized,86     pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S)
87     where
88         S: AsRef<[T]> + ?Sized,
89     {
90         self.set_first_seq(first_sequence);
91         self.set_second_seq(second_sequence);
92     }
93 
set_first_seq<S>(&mut self, sequence: &'a S) where S: AsRef<[T]> + ?Sized,94     pub fn set_first_seq<S>(&mut self, sequence: &'a S)
95     where
96         S: AsRef<[T]> + ?Sized,
97     {
98         self.first_sequence = sequence.as_ref();
99         self.matching_blocks = None;
100         self.opcodes = None;
101     }
102 
set_second_seq<S>(&mut self, sequence: &'a S) where S: AsRef<[T]> + ?Sized,103     pub fn set_second_seq<S>(&mut self, sequence: &'a S)
104     where
105         S: AsRef<[T]> + ?Sized,
106     {
107         self.second_sequence = sequence.as_ref();
108         self.matching_blocks = None;
109         self.opcodes = None;
110         self.chain_second_seq();
111     }
112 
chain_second_seq(&mut self)113     fn chain_second_seq(&mut self) {
114         let second_sequence = self.second_sequence;
115         let mut second_sequence_elements = HashMap::new();
116         for (i, item) in second_sequence.iter().enumerate() {
117             let mut counter = second_sequence_elements
118                 .entry(item)
119                 .or_insert_with(Vec::new);
120             counter.push(i);
121         }
122         if let Some(junk_func) = self.is_junk {
123             second_sequence_elements = second_sequence_elements
124                 .into_iter()
125                 .filter(|&(element, _)| !junk_func(element))
126                 .collect();
127         }
128         // Filter out popular elements
129         let len = second_sequence.len();
130         if len >= 200 {
131             let test_len = (len as f32 / 100.0).floor() as usize + 1;
132             second_sequence_elements = second_sequence_elements
133                 .into_iter()
134                 .filter(|&(_, ref indexes)| indexes.len() > test_len)
135                 .collect();
136         }
137         self.second_sequence_elements = second_sequence_elements;
138     }
139 
find_longest_match( &self, first_start: usize, first_end: usize, second_start: usize, second_end: usize, ) -> Match140     pub fn find_longest_match(
141         &self,
142         first_start: usize,
143         first_end: usize,
144         second_start: usize,
145         second_end: usize,
146     ) -> Match {
147         let first_sequence = &self.first_sequence;
148         let second_sequence = &self.second_sequence;
149         let second_sequence_elements = &self.second_sequence_elements;
150         let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0);
151         let mut j2len: HashMap<usize, usize> = HashMap::new();
152         for (i, item) in first_sequence
153             .iter()
154             .enumerate()
155             .take(first_end)
156             .skip(first_start)
157         {
158             let mut new_j2len: HashMap<usize, usize> = HashMap::new();
159             if let Some(indexes) = second_sequence_elements.get(item) {
160                 for j in indexes {
161                     let j = *j;
162                     if j < second_start {
163                         continue;
164                     };
165                     if j >= second_end {
166                         break;
167                     };
168                     let mut size = 0;
169                     if j > 0 {
170                         if let Some(k) = j2len.get(&(j - 1)) {
171                             size = *k;
172                         }
173                     }
174                     size += 1;
175                     new_j2len.insert(j, size);
176                     if size > best_size {
177                         best_i = i + 1 - size;
178                         best_j = j + 1 - size;
179                         best_size = size;
180                     }
181                 }
182             }
183             j2len = new_j2len;
184         }
185         for _ in 0..2 {
186             while best_i > first_start
187                 && best_j > second_start
188                 && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1)
189             {
190                 best_i -= 1;
191                 best_j -= 1;
192                 best_size += 1;
193             }
194             while best_i + best_size < first_end
195                 && best_j + best_size < second_end
196                 && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size)
197             {
198                 best_size += 1;
199             }
200         }
201         Match::new(best_i, best_j, best_size)
202     }
203 
get_matching_blocks(&mut self) -> Vec<Match>204     pub fn get_matching_blocks(&mut self) -> Vec<Match> {
205         if self.matching_blocks.as_ref().is_some() {
206             return self.matching_blocks.as_ref().unwrap().clone();
207         }
208         let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len());
209         let mut matches = Vec::new();
210         let mut queue = vec![(0, first_length, 0, second_length)];
211         while !queue.is_empty() {
212             let (first_start, first_end, second_start, second_end) = queue.pop().unwrap();
213             let m = self.find_longest_match(first_start, first_end, second_start, second_end);
214             match m.size {
215                 0 => {}
216                 _ => {
217                     if first_start < m.first_start && second_start < m.second_start {
218                         queue.push((first_start, m.first_start, second_start, m.second_start));
219                     }
220                     if m.first_start + m.size < first_end && m.second_start + m.size < second_end {
221                         queue.push((
222                             m.first_start + m.size,
223                             first_end,
224                             m.second_start + m.size,
225                             second_end,
226                         ));
227                     }
228                     matches.push(m);
229                 }
230             }
231         }
232         matches.sort_by(|a, b| a.cmp(b));
233         let (mut first_start, mut second_start, mut size) = (0, 0, 0);
234         let mut non_adjacent = Vec::new();
235         for m in &matches {
236             if first_start + size == m.first_start && second_start + size == m.second_start {
237                 size += m.size
238             } else {
239                 if size != 0 {
240                     non_adjacent.push(Match::new(first_start, second_start, size));
241                 }
242                 first_start = m.first_start;
243                 second_start = m.second_start;
244                 size = m.size;
245             }
246         }
247         if size != 0 {
248             non_adjacent.push(Match::new(first_start, second_start, size));
249         }
250         non_adjacent.push(Match::new(first_length, second_length, 0));
251         self.matching_blocks = Some(non_adjacent);
252         self.matching_blocks.as_ref().unwrap().clone()
253     }
254 
get_opcodes(&mut self) -> Vec<Opcode>255     pub fn get_opcodes(&mut self) -> Vec<Opcode> {
256         if self.opcodes.as_ref().is_some() {
257             return self.opcodes.as_ref().unwrap().clone();
258         }
259         let mut opcodes = Vec::new();
260         let (mut i, mut j) = (0, 0);
261         for m in self.get_matching_blocks() {
262             let mut tag = String::new();
263             if i < m.first_start && j < m.second_start {
264                 tag = String::from("replace");
265             } else if i < m.first_start {
266                 tag = String::from("delete");
267             } else if j < m.second_start {
268                 tag = String::from("insert");
269             }
270             if !tag.is_empty() {
271                 opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start));
272             }
273             i = m.first_start + m.size;
274             j = m.second_start + m.size;
275             if m.size != 0 {
276                 opcodes.push(Opcode::new(
277                     String::from("equal"),
278                     m.first_start,
279                     i,
280                     m.second_start,
281                     j,
282                 ));
283             }
284         }
285         self.opcodes = Some(opcodes);
286         self.opcodes.as_ref().unwrap().clone()
287     }
288 
get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>>289     pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> {
290         let mut res = Vec::new();
291         let mut codes = self.get_opcodes();
292         if codes.is_empty() {
293             codes.push(Opcode::new("equal".to_string(), 0, 1, 0, 1));
294         }
295 
296         if codes.first().unwrap().tag == "equal" {
297             let opcode = codes.first_mut().unwrap();
298             opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n));
299             opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n));
300         }
301         if codes.last().unwrap().tag == "equal" {
302             let opcode = codes.last_mut().unwrap();
303             opcode.first_end = min(opcode.first_start + n, opcode.first_end);
304             opcode.second_end = min(opcode.second_start + n, opcode.second_end);
305         }
306         let nn = n + n;
307         let mut group = Vec::new();
308         for code in &codes {
309             let (mut first_start, mut second_start) = (code.first_start, code.second_start);
310             if code.tag == "equal" && code.first_end - code.first_start > nn {
311                 group.push(Opcode::new(
312                     code.tag.clone(),
313                     code.first_start,
314                     min(code.first_end, code.first_start + n),
315                     code.second_start,
316                     min(code.second_end, code.second_start + n),
317                 ));
318                 res.push(group.clone());
319                 group.clear();
320                 first_start = max(first_start, code.first_end.saturating_sub(n));
321                 second_start = max(second_start, code.second_end.saturating_sub(n));
322             }
323             group.push(Opcode::new(
324                 code.tag.clone(),
325                 first_start,
326                 code.first_end,
327                 second_start,
328                 code.second_end,
329             ));
330         }
331         if !(group.len() == 1 && group.first().unwrap().tag == "equal") || group.is_empty() {
332             res.push(group.clone());
333         }
334         res
335     }
336 
ratio(&mut self) -> f32337     pub fn ratio(&mut self) -> f32 {
338         let matches = self.get_matching_blocks()
339             .iter()
340             .fold(0, |res, &m| res + m.size);
341         calculate_ratio(
342             matches,
343             self.first_sequence.len() + self.second_sequence.len(),
344         )
345     }
346 }
347