1 use std::cmp::{max, min}; 2 use std::collections::HashMap; 3 use std::hash::Hash; 4 use utils::calculate_ratio; 5 6 #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)] 7 pub struct Match { 8 pub first_start: usize, 9 pub second_start: usize, 10 pub size: usize, 11 } 12 13 impl Match { new(first_start: usize, second_start: usize, size: usize) -> Match14 fn new(first_start: usize, second_start: usize, size: usize) -> Match { 15 Match { 16 first_start, 17 second_start, 18 size, 19 } 20 } 21 } 22 23 #[derive(Debug, Clone, PartialEq)] 24 pub struct Opcode { 25 pub tag: String, 26 pub first_start: usize, 27 pub first_end: usize, 28 pub second_start: usize, 29 pub second_end: usize, 30 } 31 32 impl Opcode { new( tag: String, first_start: usize, first_end: usize, second_start: usize, second_end: usize, ) -> Opcode33 fn new( 34 tag: String, 35 first_start: usize, 36 first_end: usize, 37 second_start: usize, 38 second_end: usize, 39 ) -> Opcode { 40 Opcode { 41 tag, 42 first_start, 43 first_end, 44 second_start, 45 second_end, 46 } 47 } 48 } 49 50 pub trait Sequence: Eq + Hash {} 51 impl<T: Eq + Hash> Sequence for T {} 52 53 pub struct SequenceMatcher<'a, T: 'a + Sequence> { 54 first_sequence: &'a [T], 55 second_sequence: &'a [T], 56 matching_blocks: Option<Vec<Match>>, 57 opcodes: Option<Vec<Opcode>>, 58 is_junk: Option<fn(&T) -> bool>, 59 second_sequence_elements: HashMap<&'a T, Vec<usize>>, 60 } 61 62 impl<'a, T: Sequence> SequenceMatcher<'a, T> { new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T> where S: AsRef<[T]> + ?Sized,63 pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T> 64 where 65 S: AsRef<[T]> + ?Sized, 66 { 67 let mut matcher = SequenceMatcher { 68 first_sequence: first_sequence.as_ref(), 69 second_sequence: second_sequence.as_ref(), 70 matching_blocks: None, 71 opcodes: None, 72 is_junk: None, 73 second_sequence_elements: HashMap::new(), 74 }; 75 matcher.set_seqs(first_sequence, second_sequence); 76 matcher 77 } 78 set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>)79 pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) { 80 self.is_junk = is_junk; 81 self.matching_blocks = None; 82 self.opcodes = None; 83 self.chain_second_seq(); 84 } 85 set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S) where S: AsRef<[T]> + ?Sized,86 pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S) 87 where 88 S: AsRef<[T]> + ?Sized, 89 { 90 self.set_first_seq(first_sequence); 91 self.set_second_seq(second_sequence); 92 } 93 set_first_seq<S>(&mut self, sequence: &'a S) where S: AsRef<[T]> + ?Sized,94 pub fn set_first_seq<S>(&mut self, sequence: &'a S) 95 where 96 S: AsRef<[T]> + ?Sized, 97 { 98 self.first_sequence = sequence.as_ref(); 99 self.matching_blocks = None; 100 self.opcodes = None; 101 } 102 set_second_seq<S>(&mut self, sequence: &'a S) where S: AsRef<[T]> + ?Sized,103 pub fn set_second_seq<S>(&mut self, sequence: &'a S) 104 where 105 S: AsRef<[T]> + ?Sized, 106 { 107 self.second_sequence = sequence.as_ref(); 108 self.matching_blocks = None; 109 self.opcodes = None; 110 self.chain_second_seq(); 111 } 112 chain_second_seq(&mut self)113 fn chain_second_seq(&mut self) { 114 let second_sequence = self.second_sequence; 115 let mut second_sequence_elements = HashMap::new(); 116 for (i, item) in second_sequence.iter().enumerate() { 117 let mut counter = second_sequence_elements 118 .entry(item) 119 .or_insert_with(Vec::new); 120 counter.push(i); 121 } 122 if let Some(junk_func) = self.is_junk { 123 second_sequence_elements = second_sequence_elements 124 .into_iter() 125 .filter(|&(element, _)| !junk_func(element)) 126 .collect(); 127 } 128 // Filter out popular elements 129 let len = second_sequence.len(); 130 if len >= 200 { 131 let test_len = (len as f32 / 100.0).floor() as usize + 1; 132 second_sequence_elements = second_sequence_elements 133 .into_iter() 134 .filter(|&(_, ref indexes)| indexes.len() > test_len) 135 .collect(); 136 } 137 self.second_sequence_elements = second_sequence_elements; 138 } 139 find_longest_match( &self, first_start: usize, first_end: usize, second_start: usize, second_end: usize, ) -> Match140 pub fn find_longest_match( 141 &self, 142 first_start: usize, 143 first_end: usize, 144 second_start: usize, 145 second_end: usize, 146 ) -> Match { 147 let first_sequence = &self.first_sequence; 148 let second_sequence = &self.second_sequence; 149 let second_sequence_elements = &self.second_sequence_elements; 150 let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0); 151 let mut j2len: HashMap<usize, usize> = HashMap::new(); 152 for (i, item) in first_sequence 153 .iter() 154 .enumerate() 155 .take(first_end) 156 .skip(first_start) 157 { 158 let mut new_j2len: HashMap<usize, usize> = HashMap::new(); 159 if let Some(indexes) = second_sequence_elements.get(item) { 160 for j in indexes { 161 let j = *j; 162 if j < second_start { 163 continue; 164 }; 165 if j >= second_end { 166 break; 167 }; 168 let mut size = 0; 169 if j > 0 { 170 if let Some(k) = j2len.get(&(j - 1)) { 171 size = *k; 172 } 173 } 174 size += 1; 175 new_j2len.insert(j, size); 176 if size > best_size { 177 best_i = i + 1 - size; 178 best_j = j + 1 - size; 179 best_size = size; 180 } 181 } 182 } 183 j2len = new_j2len; 184 } 185 for _ in 0..2 { 186 while best_i > first_start 187 && best_j > second_start 188 && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1) 189 { 190 best_i -= 1; 191 best_j -= 1; 192 best_size += 1; 193 } 194 while best_i + best_size < first_end 195 && best_j + best_size < second_end 196 && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size) 197 { 198 best_size += 1; 199 } 200 } 201 Match::new(best_i, best_j, best_size) 202 } 203 get_matching_blocks(&mut self) -> Vec<Match>204 pub fn get_matching_blocks(&mut self) -> Vec<Match> { 205 if self.matching_blocks.as_ref().is_some() { 206 return self.matching_blocks.as_ref().unwrap().clone(); 207 } 208 let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len()); 209 let mut matches = Vec::new(); 210 let mut queue = vec![(0, first_length, 0, second_length)]; 211 while !queue.is_empty() { 212 let (first_start, first_end, second_start, second_end) = queue.pop().unwrap(); 213 let m = self.find_longest_match(first_start, first_end, second_start, second_end); 214 match m.size { 215 0 => {} 216 _ => { 217 if first_start < m.first_start && second_start < m.second_start { 218 queue.push((first_start, m.first_start, second_start, m.second_start)); 219 } 220 if m.first_start + m.size < first_end && m.second_start + m.size < second_end { 221 queue.push(( 222 m.first_start + m.size, 223 first_end, 224 m.second_start + m.size, 225 second_end, 226 )); 227 } 228 matches.push(m); 229 } 230 } 231 } 232 matches.sort_by(|a, b| a.cmp(b)); 233 let (mut first_start, mut second_start, mut size) = (0, 0, 0); 234 let mut non_adjacent = Vec::new(); 235 for m in &matches { 236 if first_start + size == m.first_start && second_start + size == m.second_start { 237 size += m.size 238 } else { 239 if size != 0 { 240 non_adjacent.push(Match::new(first_start, second_start, size)); 241 } 242 first_start = m.first_start; 243 second_start = m.second_start; 244 size = m.size; 245 } 246 } 247 if size != 0 { 248 non_adjacent.push(Match::new(first_start, second_start, size)); 249 } 250 non_adjacent.push(Match::new(first_length, second_length, 0)); 251 self.matching_blocks = Some(non_adjacent); 252 self.matching_blocks.as_ref().unwrap().clone() 253 } 254 get_opcodes(&mut self) -> Vec<Opcode>255 pub fn get_opcodes(&mut self) -> Vec<Opcode> { 256 if self.opcodes.as_ref().is_some() { 257 return self.opcodes.as_ref().unwrap().clone(); 258 } 259 let mut opcodes = Vec::new(); 260 let (mut i, mut j) = (0, 0); 261 for m in self.get_matching_blocks() { 262 let mut tag = String::new(); 263 if i < m.first_start && j < m.second_start { 264 tag = String::from("replace"); 265 } else if i < m.first_start { 266 tag = String::from("delete"); 267 } else if j < m.second_start { 268 tag = String::from("insert"); 269 } 270 if !tag.is_empty() { 271 opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start)); 272 } 273 i = m.first_start + m.size; 274 j = m.second_start + m.size; 275 if m.size != 0 { 276 opcodes.push(Opcode::new( 277 String::from("equal"), 278 m.first_start, 279 i, 280 m.second_start, 281 j, 282 )); 283 } 284 } 285 self.opcodes = Some(opcodes); 286 self.opcodes.as_ref().unwrap().clone() 287 } 288 get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>>289 pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> { 290 let mut res = Vec::new(); 291 let mut codes = self.get_opcodes(); 292 if codes.is_empty() { 293 codes.push(Opcode::new("equal".to_string(), 0, 1, 0, 1)); 294 } 295 296 if codes.first().unwrap().tag == "equal" { 297 let opcode = codes.first_mut().unwrap(); 298 opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n)); 299 opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n)); 300 } 301 if codes.last().unwrap().tag == "equal" { 302 let opcode = codes.last_mut().unwrap(); 303 opcode.first_end = min(opcode.first_start + n, opcode.first_end); 304 opcode.second_end = min(opcode.second_start + n, opcode.second_end); 305 } 306 let nn = n + n; 307 let mut group = Vec::new(); 308 for code in &codes { 309 let (mut first_start, mut second_start) = (code.first_start, code.second_start); 310 if code.tag == "equal" && code.first_end - code.first_start > nn { 311 group.push(Opcode::new( 312 code.tag.clone(), 313 code.first_start, 314 min(code.first_end, code.first_start + n), 315 code.second_start, 316 min(code.second_end, code.second_start + n), 317 )); 318 res.push(group.clone()); 319 group.clear(); 320 first_start = max(first_start, code.first_end.saturating_sub(n)); 321 second_start = max(second_start, code.second_end.saturating_sub(n)); 322 } 323 group.push(Opcode::new( 324 code.tag.clone(), 325 first_start, 326 code.first_end, 327 second_start, 328 code.second_end, 329 )); 330 } 331 if !(group.len() == 1 && group.first().unwrap().tag == "equal") || group.is_empty() { 332 res.push(group.clone()); 333 } 334 res 335 } 336 ratio(&mut self) -> f32337 pub fn ratio(&mut self) -> f32 { 338 let matches = self.get_matching_blocks() 339 .iter() 340 .fold(0, |res, &m| res + m.size); 341 calculate_ratio( 342 matches, 343 self.first_sequence.len() + self.second_sequence.len(), 344 ) 345 } 346 } 347