use crossbeam_channel::Receiver;
use humansize::{file_size_opts as options, FileSize};
use std::collections::BTreeMap;
#[cfg(target_family = "unix")]
use std::collections::HashSet;
use std::fs::{File, Metadata, OpenOptions};
use std::io::prelude::*;
use std::io::{self, Error, ErrorKind};
#[cfg(target_family = "unix")]
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use std::{fs, mem, thread};

use crate::common::Common;
use crate::common_directory::Directories;
use crate::common_extensions::Extensions;
use crate::common_items::ExcludedItems;
use crate::common_messages::Messages;
use crate::common_traits::*;
use directories_next::ProjectDirs;
use rayon::prelude::*;
use std::hash::Hasher;
use std::io::{BufReader, BufWriter};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread::sleep;

const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // 1MB

const CACHE_FILE_NAME: &str = "cache_duplicates.txt";

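/// Snapshot of scan progress sent periodically from worker threads to the GUI.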
#[derive(Debug)]
pub struct ProgressData {
    pub checking_method: CheckingMethod,
    pub current_stage: u8,
    pub max_stage: u8,
    pub files_checked: usize,
    pub files_to_check: usize,
}

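/// How duplicates are detected: by file name, by file size, by full content
/// hash, or by a hash of only the first megabyte of each file (`HashMb`).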
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum CheckingMethod {
    None,
    Name,
    Size,
    Hash,
    HashMb,
}

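// The implementations below adapt the three supported hash backends to the
// common `MyHasher` streaming interface (`update` + `finalize`).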
impl MyHasher for blake3::Hasher {
    fn update(&mut self, bytes: &[u8]) {
        self.update(bytes);
    }
    fn finalize(&self) -> String {
        self.finalize().to_hex().to_string()
    }
}

impl MyHasher for crc32fast::Hasher {
    fn update(&mut self, bytes: &[u8]) {
        self.write(bytes);
    }
    fn finalize(&self) -> String {
        self.finish().to_string()
    }
}

impl MyHasher for xxhash_rust::xxh3::Xxh3 {
    fn update(&mut self, bytes: &[u8]) {
        self.write(bytes);
    }
    fn finalize(&self) -> String {
        self.finish().to_string()
    }
}

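/// Hash algorithm used to compare file contents; Blake3 is cryptographic,
/// while Crc32 and Xxh3 are fast non-cryptographic checksums.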
#[derive(PartialEq, Eq, Clone, Debug, Copy)]
pub enum HashType {
    Blake3,
    Crc32,
    Xxh3,
}

impl HashType {
    fn hasher(self: &HashType) -> Box<dyn MyHasher> {
        match self {
            HashType::Blake3 => Box::new(blake3::Hasher::new()),
            HashType::Crc32 => Box::new(crc32fast::Hasher::new()),
            HashType::Xxh3 => Box::new(xxhash_rust::xxh3::Xxh3::new()),
        }
    }
}

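/// What to do with each group of duplicates: keep everything (`None`), keep
/// only the newest or oldest file, delete a single file, or replace
/// duplicates with hard links to one of them (`HardLink`).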
#[derive(Eq, PartialEq, Clone, Debug)]
pub enum DeleteMethod {
    None,
    AllExceptNewest,
    AllExceptOldest,
    OneOldest,
    OneNewest,
    HardLink,
}

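/// A single scanned file: its path, size in bytes, modification time in
/// seconds since the Unix epoch and, once computed, its content hash.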
#[derive(Clone, Debug, PartialEq, Default)]
pub struct FileEntry {
    pub path: PathBuf,
    pub size: u64,
    pub modified_date: u64,
    pub hash: String,
}

/// Info struct with helpful information about the results
#[derive(Default)]
pub struct Info {
    pub number_of_groups_by_size: usize,
    pub number_of_duplicated_files_by_size: usize,
    pub number_of_groups_by_hash: usize,
    pub number_of_duplicated_files_by_hash: usize,
    pub number_of_groups_by_name: usize,
    pub number_of_duplicated_files_by_name: usize,
    pub lost_space_by_size: u64,
    pub lost_space_by_hash: u64,
    pub bytes_read_when_hashing: u64,
    pub number_of_removed_files: usize,
    pub number_of_failed_to_remove_files: usize,
    pub gained_space: u64,
}

impl Info {
    pub fn new() -> Self {
        Default::default()
    }
}

/// Struct with the information required to perform the search
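///
/// Typical usage, as a minimal sketch (progress and stop channels are omitted,
/// and the example is marked `ignore`, so it is not compiled as a doctest):
/// ```ignore
/// let mut finder = DuplicateFinder::new();
/// finder.set_included_directory(vec![PathBuf::from("/home/user/Pictures")]);
/// finder.set_check_method(CheckingMethod::Hash);
/// finder.find_duplicates(None, None);
/// for (size, groups) in finder.get_files_sorted_by_hash() {
///     println!("{} bytes - {} duplicate groups", size, groups.len());
/// }
/// ```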
pub struct DuplicateFinder {
    text_messages: Messages,
    information: Info,
    files_with_identical_names: BTreeMap<String, Vec<FileEntry>>, // File name, File Entries
    files_with_identical_size: BTreeMap<u64, Vec<FileEntry>>,     // File size, File Entries
    files_with_identical_hashes: BTreeMap<u64, Vec<Vec<FileEntry>>>, // File size, groups of File Entries with identical hashes
    directories: Directories,
    allowed_extensions: Extensions,
    excluded_items: ExcludedItems,
    recursive_search: bool,
    minimal_file_size: u64,
    maximal_file_size: u64,
    check_method: CheckingMethod,
    delete_method: DeleteMethod,
    hash_type: HashType,
    ignore_hard_links: bool,
    dryrun: bool,
    stopped_search: bool,
    use_cache: bool,
    minimal_cache_file_size: u64,
}

impl DuplicateFinder {
    pub fn new() -> Self {
        Self {
            text_messages: Messages::new(),
            information: Info::new(),
            files_with_identical_names: Default::default(),
            files_with_identical_size: Default::default(),
            files_with_identical_hashes: Default::default(),
            recursive_search: true,
            allowed_extensions: Extensions::new(),
            check_method: CheckingMethod::None,
            delete_method: DeleteMethod::None,
            minimal_file_size: 8192,
            maximal_file_size: u64::MAX,
            directories: Directories::new(),
            excluded_items: ExcludedItems::new(),
            stopped_search: false,
            ignore_hard_links: true,
            hash_type: HashType::Blake3,
            dryrun: false,
            use_cache: true,
            minimal_cache_file_size: 2 * 1024 * 1024, // By default cache only files >= 2MB
        }
    }

    pub fn find_duplicates(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) {
        self.directories.optimize_directories(self.recursive_search, &mut self.text_messages);

        match self.check_method {
            CheckingMethod::Name => {
                if !self.check_files_name(stop_receiver, progress_sender) {
                    self.stopped_search = true;
                    return;
                }
            }
            CheckingMethod::Size => {
                if !self.check_files_size(stop_receiver, progress_sender) {
                    self.stopped_search = true;
                    return;
                }
            }
            CheckingMethod::HashMb | CheckingMethod::Hash => {
                if !self.check_files_size(stop_receiver, progress_sender) {
                    self.stopped_search = true;
                    return;
                }
                if !self.check_files_hash(stop_receiver, progress_sender) {
                    self.stopped_search = true;
                    return;
                }
            }
            CheckingMethod::None => {
                panic!();
            }
        }
        self.delete_files();
        self.debug_print();
    }

    pub const fn get_check_method(&self) -> &CheckingMethod {
        &self.check_method
    }

    pub fn get_stopped_search(&self) -> bool {
        self.stopped_search
    }

    pub fn set_minimal_cache_file_size(&mut self, minimal_cache_file_size: u64) {
        self.minimal_cache_file_size = minimal_cache_file_size;
    }

    pub const fn get_files_sorted_by_names(&self) -> &BTreeMap<String, Vec<FileEntry>> {
        &self.files_with_identical_names
    }

    pub fn set_use_cache(&mut self, use_cache: bool) {
        self.use_cache = use_cache;
    }

    pub const fn get_files_sorted_by_size(&self) -> &BTreeMap<u64, Vec<FileEntry>> {
        &self.files_with_identical_size
    }

    pub const fn get_files_sorted_by_hash(&self) -> &BTreeMap<u64, Vec<Vec<FileEntry>>> {
        &self.files_with_identical_hashes
    }

    pub fn set_maximal_file_size(&mut self, maximal_file_size: u64) {
        self.maximal_file_size = match maximal_file_size {
            0 => 1,
            t => t,
        };
    }

    pub const fn get_text_messages(&self) -> &Messages {
        &self.text_messages
    }

    pub const fn get_information(&self) -> &Info {
        &self.information
    }

    pub fn set_hash_type(&mut self, hash_type: HashType) {
        self.hash_type = hash_type;
    }

    pub fn set_ignore_hard_links(&mut self, ignore_hard_links: bool) {
        self.ignore_hard_links = ignore_hard_links;
    }

    pub fn set_dryrun(&mut self, dryrun: bool) {
        self.dryrun = dryrun;
    }

    pub fn set_check_method(&mut self, check_method: CheckingMethod) {
        self.check_method = check_method;
    }

    pub fn set_delete_method(&mut self, delete_method: DeleteMethod) {
        self.delete_method = delete_method;
    }

    pub fn set_minimal_file_size(&mut self, minimal_file_size: u64) {
        self.minimal_file_size = match minimal_file_size {
            0 => 1,
            t => t,
        };
    }

    pub fn set_recursive_search(&mut self, recursive_search: bool) {
        self.recursive_search = recursive_search;
    }

    pub fn set_included_directory(&mut self, included_directory: Vec<PathBuf>) -> bool {
        self.directories.set_included_directory(included_directory, &mut self.text_messages)
    }

    pub fn set_excluded_directory(&mut self, excluded_directory: Vec<PathBuf>) {
        self.directories.set_excluded_directory(excluded_directory, &mut self.text_messages);
    }

    pub fn set_allowed_extensions(&mut self, allowed_extensions: String) {
        self.allowed_extensions.set_allowed_extensions(allowed_extensions, &mut self.text_messages);
    }

    pub fn set_excluded_items(&mut self, excluded_items: Vec<String>) {
        self.excluded_items.set_excluded_items(excluded_items, &mut self.text_messages);
    }

    fn check_files_name(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
        let start_time: SystemTime = SystemTime::now();
        let mut folders_to_check: Vec<PathBuf> = Vec::with_capacity(1024 * 2); // Big enough to store most paths without resizing, yet small enough not to waste memory

        // Add root folders for finding
        for id in &self.directories.included_directories {
            folders_to_check.push(id.clone());
        }

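        // A helper thread wakes up every LOOP_DURATION ms, reads the shared
        // atomic file counter and pushes a ProgressData snapshot to the GUI
        // channel until `progress_thread_run` is cleared.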
        //// PROGRESS THREAD START
        const LOOP_DURATION: u32 = 200; // in ms
        let progress_thread_run = Arc::new(AtomicBool::new(true));

        let atomic_file_counter = Arc::new(AtomicUsize::new(0));

        let progress_thread_handle;
        if let Some(progress_sender) = progress_sender {
            let progress_send = progress_sender.clone();
            let progress_thread_run = progress_thread_run.clone();
            let atomic_file_counter = atomic_file_counter.clone();
            progress_thread_handle = thread::spawn(move || loop {
                progress_send
                    .unbounded_send(ProgressData {
                        checking_method: CheckingMethod::Name,
                        current_stage: 0,
                        max_stage: 0,
                        files_checked: atomic_file_counter.load(Ordering::Relaxed) as usize,
                        files_to_check: 0,
                    })
                    .unwrap();
                if !progress_thread_run.load(Ordering::Relaxed) {
                    break;
                }
                sleep(Duration::from_millis(LOOP_DURATION as u64));
            });
        } else {
            progress_thread_handle = thread::spawn(|| {});
        }

        //// PROGRESS THREAD END

        while !folders_to_check.is_empty() {
            if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
                // End the thread which sends info to the GUI
                progress_thread_run.store(false, Ordering::Relaxed);
                progress_thread_handle.join().unwrap();
                return false;
            }

            let current_folder = folders_to_check.pop().unwrap();

            // Read current dir; if permission is denied, just go to the next one
            let read_dir = match fs::read_dir(&current_folder) {
                Ok(t) => t,
                Err(e) => {
                    self.text_messages.warnings.push(format!("Cannot open dir {}, reason {}", current_folder.display(), e));
                    continue;
                } // Permissions denied
            };

            // Check every sub folder/file/link etc.
            'dir: for entry in read_dir {
                let entry_data = match entry {
                    Ok(t) => t,
                    Err(e) => {
                        self.text_messages.warnings.push(format!("Cannot read entry in dir {}, reason {}", current_folder.display(), e));
                        continue 'dir;
                    } // Permissions denied
                };
                let metadata: Metadata = match entry_data.metadata() {
                    Ok(t) => t,
                    Err(e) => {
                        self.text_messages.warnings.push(format!("Cannot read metadata in dir {}, reason {}", current_folder.display(), e));
                        continue 'dir;
                    } // Permissions denied
                };
                if metadata.is_dir() {
                    if !self.recursive_search {
                        continue 'dir;
                    }

                    let next_folder = current_folder.join(entry_data.file_name());
                    if self.directories.is_excluded(&next_folder) {
                        continue 'dir;
                    }

                    if self.excluded_items.is_excluded(&next_folder) {
                        continue 'dir;
                    }

                    folders_to_check.push(next_folder);
                } else if metadata.is_file() {
                    atomic_file_counter.fetch_add(1, Ordering::Relaxed);
                    let file_name_lowercase: String = match entry_data.file_name().into_string() {
                        Ok(t) => t,
                        Err(_inspected) => {
                            println!("File {:?} has an invalid UTF-8 name", entry_data);
                            continue 'dir;
                        }
                    }
                    .to_lowercase();

                    // Checking allowed extensions
                    if !self.allowed_extensions.file_extensions.is_empty() {
                        let allowed = self.allowed_extensions.file_extensions.iter().any(|e| file_name_lowercase.ends_with((".".to_string() + e.to_lowercase().as_str()).as_str()));
                        if !allowed {
                            // Not an allowed extension, ignore it.
                            continue 'dir;
                        }
                    }
                    // Checking files
                    if (self.minimal_file_size..=self.maximal_file_size).contains(&metadata.len()) {
                        let current_file_name = current_folder.join(entry_data.file_name());
                        if self.excluded_items.is_excluded(&current_file_name) {
                            continue 'dir;
                        }

                        // Creating new file entry
                        let fe: FileEntry = FileEntry {
                            path: current_file_name.clone(),
                            size: metadata.len(),
                            modified_date: match metadata.modified() {
                                Ok(t) => match t.duration_since(UNIX_EPOCH) {
                                    Ok(d) => d.as_secs(),
                                    Err(_inspected) => {
                                        self.text_messages.warnings.push(format!("File {} seems to be modified before the Unix Epoch.", current_file_name.display()));
                                        0
                                    }
                                },
                                Err(e) => {
                                    self.text_messages.warnings.push(format!("Unable to get modification date from file {}, reason {}", current_file_name.display(), e));
                                    0
                                } // Permissions denied
                            },
                            hash: "".to_string(),
                        };

                        // Adding files to BTreeMap
                        self.files_with_identical_names.entry(entry_data.file_name().to_string_lossy().to_string()).or_insert_with(Vec::new);
                        self.files_with_identical_names.get_mut(&entry_data.file_name().to_string_lossy().to_string()).unwrap().push(fe);
                    }
                }
            }
        }

        // End the thread which sends info to the GUI
        progress_thread_run.store(false, Ordering::Relaxed);
        progress_thread_handle.join().unwrap();

        // Create a new BTreeMap without single-entry groups (files without duplicates)
        let mut new_map: BTreeMap<String, Vec<FileEntry>> = Default::default();

        for (name, vector) in &self.files_with_identical_names {
            if vector.len() > 1 {
                self.information.number_of_duplicated_files_by_name += vector.len() - 1;
                self.information.number_of_groups_by_name += 1;
                new_map.insert(name.clone(), vector.clone());
            }
        }
        self.files_with_identical_names = new_map;

        Common::print_time(start_time, SystemTime::now(), "check_files_name".to_string());
        true
    }

    /// Reads each file's length and puts it into buckets, one per size
    /// Buckets that contain only one file are then removed
    fn check_files_size(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
        let start_time: SystemTime = SystemTime::now();
        let mut folders_to_check: Vec<PathBuf> = Vec::with_capacity(1024 * 2); // Big enough to store most paths without resizing, yet small enough not to waste memory

        // Add root folders for finding
        for id in &self.directories.included_directories {
            folders_to_check.push(id.clone());
        }

        //// PROGRESS THREAD START
        const LOOP_DURATION: u32 = 200; // in ms
        let progress_thread_run = Arc::new(AtomicBool::new(true));

        let atomic_file_counter = Arc::new(AtomicUsize::new(0));

        let progress_thread_handle;
        if let Some(progress_sender) = progress_sender {
            let progress_send = progress_sender.clone();
            let progress_thread_run = progress_thread_run.clone();
            let atomic_file_counter = atomic_file_counter.clone();
            let checking_method = self.check_method.clone();
            let max_stage = match self.check_method {
                CheckingMethod::Size => 0,
                CheckingMethod::HashMb | CheckingMethod::Hash => 2,
                _ => 255,
            };
            progress_thread_handle = thread::spawn(move || loop {
                progress_send
                    .unbounded_send(ProgressData {
                        checking_method: checking_method.clone(),
                        current_stage: 0,
                        max_stage,
                        files_checked: atomic_file_counter.load(Ordering::Relaxed) as usize,
                        files_to_check: 0,
                    })
                    .unwrap();
                if !progress_thread_run.load(Ordering::Relaxed) {
                    break;
                }
                sleep(Duration::from_millis(LOOP_DURATION as u64));
            });
        } else {
            progress_thread_handle = thread::spawn(|| {});
        }

        //// PROGRESS THREAD END

        while !folders_to_check.is_empty() {
            if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
                // End the thread which sends info to the GUI
                progress_thread_run.store(false, Ordering::Relaxed);
                progress_thread_handle.join().unwrap();
                return false;
            }

            let current_folder = folders_to_check.pop().unwrap();

            // Read current dir; if permission is denied, just go to the next one
            let read_dir = match fs::read_dir(&current_folder) {
                Ok(t) => t,
                Err(e) => {
                    self.text_messages.warnings.push(format!("Cannot open dir {}, reason {}", current_folder.display(), e));
                    continue;
                } // Permissions denied
            };

            // Check every sub folder/file/link etc.
            'dir: for entry in read_dir {
                let entry_data = match entry {
                    Ok(t) => t,
                    Err(e) => {
                        self.text_messages.warnings.push(format!("Cannot read entry in dir {}, reason {}", current_folder.display(), e));
                        continue 'dir;
                    } // Permissions denied
                };
                let metadata: Metadata = match entry_data.metadata() {
                    Ok(t) => t,
                    Err(e) => {
                        self.text_messages.warnings.push(format!("Cannot read metadata in dir {}, reason {}", current_folder.display(), e));
                        continue 'dir;
                    } // Permissions denied
                };
                if metadata.is_dir() {
                    if !self.recursive_search {
                        continue 'dir;
                    }

                    let next_folder = current_folder.join(entry_data.file_name());
                    if self.directories.is_excluded(&next_folder) {
                        continue 'dir;
                    }

                    if self.excluded_items.is_excluded(&next_folder) {
                        continue 'dir;
                    }

                    folders_to_check.push(next_folder);
                } else if metadata.is_file() {
                    atomic_file_counter.fetch_add(1, Ordering::Relaxed);
                    let file_name_lowercase: String = match entry_data.file_name().into_string() {
                        Ok(t) => t,
                        Err(_inspected) => {
                            println!("File {:?} has an invalid UTF-8 name", entry_data);
                            continue 'dir;
                        }
                    }
                    .to_lowercase();

                    // Checking allowed extensions
                    if !self.allowed_extensions.file_extensions.is_empty() {
                        let allowed = self.allowed_extensions.file_extensions.iter().any(|e| file_name_lowercase.ends_with((".".to_string() + e.to_lowercase().as_str()).as_str()));
                        if !allowed {
                            // Not an allowed extension, ignore it.
                            continue 'dir;
                        }
                    }
                    // Checking files
                    if (self.minimal_file_size..=self.maximal_file_size).contains(&metadata.len()) {
                        let current_file_name = current_folder.join(entry_data.file_name());
                        if self.excluded_items.is_excluded(&current_file_name) {
                            continue 'dir;
                        }

                        // Creating new file entry
                        let fe: FileEntry = FileEntry {
                            path: current_file_name.clone(),
                            size: metadata.len(),
                            modified_date: match metadata.modified() {
                                Ok(t) => match t.duration_since(UNIX_EPOCH) {
                                    Ok(d) => d.as_secs(),
                                    Err(_inspected) => {
                                        self.text_messages.warnings.push(format!("File {} seems to be modified before the Unix Epoch.", current_file_name.display()));
                                        0
                                    }
                                },
                                Err(e) => {
                                    self.text_messages.warnings.push(format!("Unable to get modification date from file {}, reason {}", current_file_name.display(), e));
                                    0
                                } // Permissions denied
                            },
                            hash: "".to_string(),
                        };

                        // Adding files to BTreeMap
                        self.files_with_identical_size.entry(metadata.len()).or_insert_with(Vec::new);
                        self.files_with_identical_size.get_mut(&metadata.len()).unwrap().push(fe);
                    }
                }
            }
        }
        // End the thread which sends info to the GUI
        progress_thread_run.store(false, Ordering::Relaxed);
        progress_thread_handle.join().unwrap();

        // Create a new BTreeMap without single-entry groups (files without duplicates)
        let mut new_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();

        for (size, vec) in &self.files_with_identical_size {
            if vec.len() <= 1 {
                continue;
            }

            let vector;
            if self.ignore_hard_links {
                vector = filter_hard_links(vec);
            } else {
                vector = vec.clone();
            }

            if vector.len() > 1 {
                self.information.number_of_duplicated_files_by_size += vector.len() - 1;
                self.information.number_of_groups_by_size += 1;
                self.information.lost_space_by_size += (vector.len() as u64 - 1) * size;
                new_map.insert(*size, vector);
            }
        }
        self.files_with_identical_size = new_map;

        Common::print_time(start_time, SystemTime::now(), "check_files_size".to_string());
        true
    }

    /// The slowest checking type, which must be applied after checking by size
    fn check_files_hash(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
        let check_type = Arc::new(self.hash_type);

        let start_time: SystemTime = SystemTime::now();
        let check_was_breaked = AtomicBool::new(false); // Used to abort the check from the GUI and end the hashing threads
        let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();

        //// PROGRESS THREAD START
        const LOOP_DURATION: u32 = 200; // in ms
        let progress_thread_run = Arc::new(AtomicBool::new(true));

        let atomic_file_counter = Arc::new(AtomicUsize::new(0));

        let progress_thread_handle;
        if let Some(progress_sender) = progress_sender {
            let progress_send = progress_sender.clone();
            let progress_thread_run = progress_thread_run.clone();
            let atomic_file_counter = atomic_file_counter.clone();
            let files_to_check = self.files_with_identical_size.iter().map(|e| e.1.len()).sum();
            let checking_method = self.check_method.clone();
            progress_thread_handle = thread::spawn(move || loop {
                progress_send
                    .unbounded_send(ProgressData {
                        checking_method: checking_method.clone(),
                        current_stage: 1,
                        max_stage: 2,
                        files_checked: atomic_file_counter.load(Ordering::Relaxed) as usize,
                        files_to_check,
                    })
                    .unwrap();
                if !progress_thread_run.load(Ordering::Relaxed) {
                    break;
                }
                sleep(Duration::from_millis(LOOP_DURATION as u64));
            });
        } else {
            progress_thread_handle = thread::spawn(|| {});
        }

        //// PROGRESS THREAD END

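        // Pre-hash stage: hash only the first 2 KB of each file (a single
        // buffer read), which cheaply splits same-size groups before the
        // expensive full-content hashing below.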
        #[allow(clippy::type_complexity)]
        let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>, u64)> = self
            .files_with_identical_size
            .par_iter()
            .map(|(size, vec_file_entry)| {
                let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
                let mut errors: Vec<String> = Vec::new();
                let mut bytes_read: u64 = 0;
                let mut buffer = [0u8; 1024 * 2];

                atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
                for file_entry in vec_file_entry {
                    if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
                        check_was_breaked.store(true, Ordering::Relaxed);
                        return None;
                    }
                    match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
                        Ok((hash_string, bytes)) => {
                            bytes_read += bytes;
                            hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
                            hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone());
                        }
                        Err(s) => errors.push(s),
                    }
                }
                Some((*size, hashmap_with_hash, errors, bytes_read))
            })
            .while_some()
            .collect();

        // End the thread which sends info to the GUI
        progress_thread_run.store(false, Ordering::Relaxed);
        progress_thread_handle.join().unwrap();

        // Check if the user aborted the search (only possible from the GUI)
        if check_was_breaked.load(Ordering::Relaxed) {
            return false;
        }

        // Check results
        for (size, hash_map, mut errors, bytes_read) in pre_hash_results {
            self.information.bytes_read_when_hashing += bytes_read;
            self.text_messages.warnings.append(&mut errors);
            for (_hash, mut vec_file_entry) in hash_map {
                if vec_file_entry.len() > 1 {
                    pre_checked_map.entry(size).or_insert_with(Vec::new);
                    pre_checked_map.get_mut(&size).unwrap().append(&mut vec_file_entry);
                }
            }
        }

        Common::print_time(start_time, SystemTime::now(), "check_files_hash - prehash".to_string());
        let start_time: SystemTime = SystemTime::now();

        /////////////////////////

        //// PROGRESS THREAD START
        // const LOOP_DURATION: u32 = 200; // in ms
        let progress_thread_run = Arc::new(AtomicBool::new(true));

        let atomic_file_counter = Arc::new(AtomicUsize::new(0));

        let progress_thread_handle;
        if let Some(progress_sender) = progress_sender {
            let progress_send = progress_sender.clone();
            let progress_thread_run = progress_thread_run.clone();
            let atomic_file_counter = atomic_file_counter.clone();
            let files_to_check = pre_checked_map.iter().map(|e| e.1.len()).sum();
            let checking_method = self.check_method.clone();
            progress_thread_handle = thread::spawn(move || loop {
                progress_send
                    .unbounded_send(ProgressData {
                        checking_method: checking_method.clone(),
                        current_stage: 2,
                        max_stage: 2,
                        files_checked: atomic_file_counter.load(Ordering::Relaxed) as usize,
                        files_to_check,
                    })
                    .unwrap();
                if !progress_thread_run.load(Ordering::Relaxed) {
                    break;
                }
                sleep(Duration::from_millis(LOOP_DURATION as u64));
            });
        } else {
            progress_thread_handle = thread::spawn(|| {});
        }

        //// PROGRESS THREAD END

        #[allow(clippy::type_complexity)]
        let mut full_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>, u64)>;

        match self.check_method {
            CheckingMethod::HashMb => {
                full_hash_results = pre_checked_map
                    .par_iter()
                    .map(|(size, vec_file_entry)| {
                        let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
                        let mut errors: Vec<String> = Vec::new();
                        let mut bytes_read: u64 = 0;
                        let mut buffer = [0u8; 1024 * 128];
                        atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
                        for file_entry in vec_file_entry {
                            if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
                                check_was_breaked.store(true, Ordering::Relaxed);
                                return None;
                            }

                            match hash_calculation(&mut buffer, file_entry, &check_type, HASH_MB_LIMIT_BYTES) {
                                Ok((hash_string, bytes)) => {
                                    bytes_read += bytes;
                                    hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
                                    hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
                                }
                                Err(s) => errors.push(s),
                            }
                        }
                        Some((*size, hashmap_with_hash, errors, bytes_read))
                    })
                    .while_some()
                    .collect();
            }
            CheckingMethod::Hash => {
                let loaded_hash_map;

                let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
                let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();

                if self.use_cache {
                    loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, &self.hash_type) {
                        Some(t) => t,
                        None => Default::default(),
                    };

                    for (size, vec_file_entry) in pre_checked_map {
                        #[allow(clippy::collapsible_if)]
                        if !loaded_hash_map.contains_key(&size) {
                            // If the loaded data doesn't contain the current info
                            non_cached_files_to_check.insert(size, vec_file_entry);
                        } else {
                            let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap();

                            for file_entry in vec_file_entry {
                                let mut found: bool = false;
                                for loaded_file_entry in loaded_vec_file_entry {
                                    if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date {
                                        records_already_cached.entry(file_entry.size).or_insert_with(Vec::new);
                                        records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone());
                                        found = true;
                                        break;
                                    }
                                }

                                if !found {
                                    non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new);
                                    non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry);
                                }
                            }
                        }
                    }
                } else {
                    loaded_hash_map = Default::default();
                    mem::swap(&mut pre_checked_map, &mut non_cached_files_to_check);
                }

                full_hash_results = non_cached_files_to_check
                    .par_iter()
                    .map(|(size, vec_file_entry)| {
                        let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
                        let mut errors: Vec<String> = Vec::new();
                        let mut bytes_read: u64 = 0;
                        let mut buffer = [0u8; 1024 * 128];

                        atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
                        for file_entry in vec_file_entry {
                            if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
                                check_was_breaked.store(true, Ordering::Relaxed);
                                return None;
                            }

                            match hash_calculation(&mut buffer, file_entry, &check_type, u64::MAX) {
                                Ok((hash_string, bytes)) => {
                                    bytes_read += bytes;
                                    let mut file_entry = file_entry.clone();
                                    file_entry.hash = hash_string.clone();
                                    hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
                                    hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry);
                                }
                                Err(s) => errors.push(s),
                            }
                        }
                        Some((*size, hashmap_with_hash, errors, bytes_read))
                    })
                    .while_some()
                    .collect();

                if self.use_cache {
                    'main: for (size, vec_file_entry) in records_already_cached {
                        // If the size already exists in the results, append there; we cannot hold
                        // a mutable and an immutable reference to full_hash_results at the same time
                        for (full_size, full_hashmap, _errors, _bytes_read) in &mut full_hash_results {
                            if size == *full_size {
                                for file_entry in vec_file_entry {
                                    full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
                                    full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
                                }
                                continue 'main;
                            }
                        }
                        // The size doesn't exist yet, so add the cached records as a new entry
                        let mut temp_hashmap: BTreeMap<String, Vec<FileEntry>> = Default::default();
                        for file_entry in vec_file_entry {
                            temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
                            temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
                        }
                        full_hash_results.push((size, temp_hashmap, Vec::new(), 0));
                    }

                    // Save everything to the cache file: the entries previously loaded from it
                    // merged with all freshly computed results
                    let mut all_results: BTreeMap<String, FileEntry> = Default::default();
                    for (_size, vec_file_entry) in loaded_hash_map {
                        for file_entry in vec_file_entry {
                            all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
                        }
                    }
                    for (_size, hashmap, _errors, _bytes_read) in &full_hash_results {
                        for vec_file_entry in hashmap.values() {
                            for file_entry in vec_file_entry {
                                all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
                            }
                        }
                    }
                    save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type, self.minimal_cache_file_size);
                }
            }
            _ => panic!("What"),
        }

        // End the thread which sends info to the GUI
        progress_thread_run.store(false, Ordering::Relaxed);
        progress_thread_handle.join().unwrap();

        // Check if the user aborted the search (only possible from the GUI)
        if check_was_breaked.load(Ordering::Relaxed) {
            return false;
        }

        for (size, hash_map, mut errors, bytes_read) in full_hash_results {
            self.information.bytes_read_when_hashing += bytes_read;
            self.text_messages.warnings.append(&mut errors);
            for (_hash, vec_file_entry) in hash_map {
                if vec_file_entry.len() > 1 {
                    self.files_with_identical_hashes.entry(size).or_insert_with(Vec::new);
                    self.files_with_identical_hashes.get_mut(&size).unwrap().push(vec_file_entry);
                }
            }
        }

        /////////////////////////

        for (size, vector_vectors) in &self.files_with_identical_hashes {
            for vector in vector_vectors {
                self.information.number_of_duplicated_files_by_hash += vector.len() - 1;
                self.information.number_of_groups_by_hash += 1;
                self.information.lost_space_by_hash += (vector.len() as u64 - 1) * size;
            }
        }

        Common::print_time(start_time, SystemTime::now(), "check_files_hash - full hash".to_string());

        // Clean up data that is no longer needed
        self.files_with_identical_size = Default::default();

        true
    }

    /// Deletes files from the BTreeMaps filled in the previous steps
    /// Delegates to a helper function so the deletion logic is not duplicated
    fn delete_files(&mut self) {
        let start_time: SystemTime = SystemTime::now();
        if self.delete_method == DeleteMethod::None {
            return;
        }

        match self.check_method {
            CheckingMethod::Name => {
                for vector in self.files_with_identical_names.values() {
                    let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun);
                    self.information.gained_space += tuple.0;
                    self.information.number_of_removed_files += tuple.1;
                    self.information.number_of_failed_to_remove_files += tuple.2;
                }
            }
            CheckingMethod::Hash | CheckingMethod::HashMb => {
                for vector_vectors in self.files_with_identical_hashes.values() {
                    for vector in vector_vectors.iter() {
                        let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun);
                        self.information.gained_space += tuple.0;
                        self.information.number_of_removed_files += tuple.1;
                        self.information.number_of_failed_to_remove_files += tuple.2;
                    }
                }
            }
            CheckingMethod::Size => {
                for vector in self.files_with_identical_size.values() {
                    let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun);
                    self.information.gained_space += tuple.0;
                    self.information.number_of_removed_files += tuple.1;
                    self.information.number_of_failed_to_remove_files += tuple.2;
                }
            }
            CheckingMethod::None => {
                panic!("Checking method should never be None here.");
            }
        }

        Common::print_time(start_time, SystemTime::now(), "delete_files".to_string());
    }
}
impl Default for DuplicateFinder {
    fn default() -> Self {
        Self::new()
    }
}

impl DebugPrint for DuplicateFinder {
    #[allow(dead_code)]
    #[allow(unreachable_code)]
    /// Debug printing - only available in debug builds
    fn debug_print(&self) {
        #[cfg(not(debug_assertions))]
        {
            return;
        }
        println!("---------------DEBUG PRINT---------------");
        println!("### Information");

        println!("Errors size - {}", self.text_messages.errors.len());
        println!("Warnings size - {}", self.text_messages.warnings.len());
        println!("Messages size - {}", self.text_messages.messages.len());
        println!(
            "Number of duplicated files by size (in groups) - {} ({})",
            self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size
        );
        println!(
            "Number of duplicated files by hash (in groups) - {} ({})",
            self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash
        );
        println!(
            "Number of duplicated files by name (in groups) - {} ({})",
            self.information.number_of_duplicated_files_by_name, self.information.number_of_groups_by_name
        );
        println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size);
        println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash);
        println!(
            "Gained space by removing duplicated entries - {} ({} bytes)",
            self.information.gained_space.file_size(options::BINARY).unwrap(),
            self.information.gained_space
        );
        println!(
            "Bytes read when hashing - {} ({} bytes)",
            self.information.bytes_read_when_hashing.file_size(options::BINARY).unwrap(),
            self.information.bytes_read_when_hashing
        );
        println!("Number of removed files - {}", self.information.number_of_removed_files);
        println!("Number of failed to remove files - {}", self.information.number_of_failed_to_remove_files);

        println!("### Other");

        println!("Files list size - {}", self.files_with_identical_size.len());
        println!("Hashed Files list size - {}", self.files_with_identical_hashes.len());
        println!("Allowed extensions - {:?}", self.allowed_extensions.file_extensions);
        println!("Excluded items - {:?}", self.excluded_items.items);
        println!("Included directories - {:?}", self.directories.included_directories);
        println!("Excluded directories - {:?}", self.directories.excluded_directories);
        println!("Recursive search - {}", self.recursive_search);
        println!("Minimum file size - {:?}", self.minimal_file_size);
        println!("Checking Method - {:?}", self.check_method);
        println!("Delete Method - {:?}", self.delete_method);
        println!("-----------------------------------------");
    }
}
impl SaveResults for DuplicateFinder {
    fn save_results_to_file(&mut self, file_name: &str) -> bool {
        let start_time: SystemTime = SystemTime::now();
        let file_name: String = match file_name {
            "" => "results.txt".to_string(),
            k => k.to_string(),
        };

        let file_handler = match File::create(&file_name) {
            Ok(t) => t,
            Err(e) => {
                self.text_messages.errors.push(format!("Failed to create file {}, reason {}", file_name, e));
                return false;
            }
        };
        let mut writer = BufWriter::new(file_handler);

        if let Err(e) = writeln!(
            writer,
            "Results of searching {:?} with excluded directories {:?} and excluded items {:?}",
            self.directories.included_directories, self.directories.excluded_directories, self.excluded_items.items
        ) {
            self.text_messages.errors.push(format!("Failed to save results to file {}, reason {}", file_name, e));
            return false;
        }
        match self.check_method {
            CheckingMethod::Name => {
                if !self.files_with_identical_names.is_empty() {
                    writeln!(writer, "-------------------------------------------------Files with same names-------------------------------------------------").unwrap();
                    writeln!(
                        writer,
                        "Found {} files in {} groups with the same name (they may have different content)",
                        self.information.number_of_duplicated_files_by_name, self.information.number_of_groups_by_name,
                    )
                    .unwrap();
                    for (name, vector) in self.files_with_identical_names.iter().rev() {
                        writeln!(writer, "Name - {} - {} files ", name, vector.len()).unwrap();
                        for j in vector {
                            writeln!(writer, "{}", j.path.display()).unwrap();
                        }
                        writeln!(writer).unwrap();
                    }
                } else {
                    write!(writer, "Did not find any files with the same name.").unwrap();
                }
            }
            CheckingMethod::Size => {
                if !self.files_with_identical_size.is_empty() {
                    writeln!(writer, "-------------------------------------------------Files with same size-------------------------------------------------").unwrap();
                    writeln!(
                        writer,
                        "Found {} duplicated files in {} groups which take {}.",
                        self.information.number_of_duplicated_files_by_size,
                        self.information.number_of_groups_by_size,
                        self.information.lost_space_by_size.file_size(options::BINARY).unwrap()
                    )
                    .unwrap();
                    for (size, vector) in self.files_with_identical_size.iter().rev() {
                        write!(writer, "\n---- Size {} ({}) - {} files \n", size.file_size(options::BINARY).unwrap(), size, vector.len()).unwrap();
                        for file_entry in vector {
                            writeln!(writer, "{}", file_entry.path.display()).unwrap();
                        }
                    }
                } else {
                    write!(writer, "Did not find any duplicates.").unwrap();
                }
            }
            CheckingMethod::Hash | CheckingMethod::HashMb => {
                if !self.files_with_identical_hashes.is_empty() {
                    writeln!(writer, "-------------------------------------------------Files with same hashes-------------------------------------------------").unwrap();
                    writeln!(
                        writer,
                        "Found {} duplicated files in {} groups which take {}.",
                        self.information.number_of_duplicated_files_by_hash,
                        self.information.number_of_groups_by_hash,
                        self.information.lost_space_by_hash.file_size(options::BINARY).unwrap()
                    )
                    .unwrap();
                    for (size, vectors_vector) in self.files_with_identical_hashes.iter().rev() {
                        for vector in vectors_vector {
                            writeln!(writer, "\n---- Size {} ({}) - {} files", size.file_size(options::BINARY).unwrap(), size, vector.len()).unwrap();
                            for file_entry in vector {
                                writeln!(writer, "{}", file_entry.path.display()).unwrap();
                            }
                        }
                    }
                } else {
                    write!(writer, "Did not find any duplicates.").unwrap();
                }
            }
            CheckingMethod::None => {
                panic!();
            }
        }
        Common::print_time(start_time, SystemTime::now(), "save_results_to_file".to_string());
        true
    }
}
impl PrintResults for DuplicateFinder {
    /// Prints information about duplicated entries
    /// Only needed for the CLI
    fn print_results(&self) {
        let start_time: SystemTime = SystemTime::now();
        let mut number_of_files: u64 = 0;
        let mut number_of_groups: u64 = 0;

        match self.check_method {
            CheckingMethod::Name => {
                for i in &self.files_with_identical_names {
                    number_of_files += i.1.len() as u64;
                    number_of_groups += 1;
                }
                println!("Found {} files in {} groups with the same name (they may have different content)", number_of_files, number_of_groups);
                for (name, vector) in &self.files_with_identical_names {
                    println!("Name - {} - {} files ", name, vector.len());
                    for j in vector {
                        println!("{}", j.path.display());
                    }
                    println!();
                }
            }
            CheckingMethod::Hash | CheckingMethod::HashMb => {
                for (_size, vector) in self.files_with_identical_hashes.iter() {
                    for j in vector {
                        number_of_files += j.len() as u64;
                        number_of_groups += 1;
                    }
                }
                println!(
                    "Found {} duplicated files in {} groups with the same content which take {}:",
                    number_of_files,
                    number_of_groups,
                    self.information.lost_space_by_size.file_size(options::BINARY).unwrap()
                );
                for (size, vector) in self.files_with_identical_hashes.iter().rev() {
                    for j in vector {
                        println!("Size - {} ({}) - {} files ", size.file_size(options::BINARY).unwrap(), size, j.len());
                        for k in j {
                            println!("{}", k.path.display());
                        }
                        println!("----");
                    }
                    println!();
                }
            }
            CheckingMethod::Size => {
                for i in &self.files_with_identical_size {
                    number_of_files += i.1.len() as u64;
                    number_of_groups += 1;
                }
                println!(
                    "Found {} files in {} groups with the same size (they may have different content) which take {}:",
                    number_of_files,
                    number_of_groups,
                    self.information.lost_space_by_size.file_size(options::BINARY).unwrap()
                );
                for (size, vector) in &self.files_with_identical_size {
                    println!("Size - {} ({}) - {} files ", size.file_size(options::BINARY).unwrap(), size, vector.len());
                    for j in vector {
                        println!("{}", j.path.display());
                    }
                    println!();
                }
            }
            CheckingMethod::None => {
                panic!("Checking Method shouldn't ever be set to None");
            }
        }
        Common::print_time(start_time, SystemTime::now(), "print_entries".to_string());
    }
}

/// Removes a slice (vector) of files using the provided method
/// Returns the total size of removed elements and the numbers of deleted and failed-to-delete files; warnings are appended to the message list
fn delete_files(vector: &[FileEntry], delete_method: &DeleteMethod, text_messages: &mut Messages, dryrun: bool) -> (u64, usize, usize) {
    assert!(vector.len() > 1, "Vector length must be bigger than 1 (this should be ensured in previous steps).");
    let mut gained_space: u64 = 0;
    let mut removed_files: usize = 0;
    let mut failed_to_remove_files: usize = 0;
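    // `q_index` is the entry that is never touched (the newest file for
    // OneOldest/AllExceptNewest, the oldest one for the other methods), and
    // `n` caps how many of the remaining entries the loop may process.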
    let mut values = vector.iter().enumerate();
    let q_index = match delete_method {
        DeleteMethod::OneOldest | DeleteMethod::AllExceptNewest => values.max_by(|(_, l), (_, r)| l.modified_date.cmp(&r.modified_date)),
        DeleteMethod::OneNewest | DeleteMethod::AllExceptOldest | DeleteMethod::HardLink => values.min_by(|(_, l), (_, r)| l.modified_date.cmp(&r.modified_date)),
        DeleteMethod::None => values.next(),
    };
    let q_index = q_index.map(|t| t.0).unwrap_or(0);
    let n = match delete_method {
        DeleteMethod::OneNewest | DeleteMethod::OneOldest => 1,
        DeleteMethod::AllExceptNewest | DeleteMethod::AllExceptOldest | DeleteMethod::None | DeleteMethod::HardLink => usize::MAX,
    };
    for (index, file) in vector.iter().enumerate() {
        if q_index == index {
            continue;
        } else if removed_files + failed_to_remove_files >= n {
            break;
        }

        let r = match delete_method {
            DeleteMethod::OneOldest | DeleteMethod::OneNewest | DeleteMethod::AllExceptOldest | DeleteMethod::AllExceptNewest => {
                if dryrun {
                    Ok(Some(format!("Delete {}", file.path.display())))
                } else {
                    fs::remove_file(&file.path).map(|_| None)
                }
            }
            DeleteMethod::HardLink => {
                let src = &vector[q_index].path;
                if dryrun {
                    Ok(Some(format!("Replace file {} with hard link to {}", file.path.display(), src.display())))
                } else {
                    make_hard_link(src, &file.path).map(|_| None)
                }
            }
            DeleteMethod::None => Ok(None),
        };

        match r {
            Err(e) => {
                failed_to_remove_files += 1;
                text_messages.warnings.push(format!("Failed to remove {} ({})", file.path.display(), e));
            }
            Ok(Some(msg)) => {
                text_messages.messages.push(msg);
                removed_files += 1;
                gained_space += file.size;
            }
            Ok(None) => {
                removed_files += 1;
                gained_space += file.size;
            }
        }
    }
    (gained_space, removed_files, failed_to_remove_files)
}

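// On Windows all entries are kept as-is (no inode information is inspected
// here); on Unix, files sharing an inode are hard links to the same data, so
// only the first entry per inode is kept.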
#[cfg(target_family = "windows")]
fn filter_hard_links(vec_file_entry: &[FileEntry]) -> Vec<FileEntry> {
    vec_file_entry.to_vec()
}

#[cfg(target_family = "unix")]
fn filter_hard_links(vec_file_entry: &[FileEntry]) -> Vec<FileEntry> {
    let mut inodes: HashSet<u64> = HashSet::with_capacity(vec_file_entry.len());
    let mut identical: Vec<FileEntry> = Vec::with_capacity(vec_file_entry.len());
    for f in vec_file_entry {
        if let Ok(meta) = fs::metadata(&f.path) {
            if !inodes.insert(meta.ino()) {
                continue;
            }
        }
        identical.push(f.clone());
    }
    identical
}

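/// Replaces `dst` with a hard link to `src`. The old `dst` is first renamed to
/// a temporary file in the same directory and restored if creating the link
/// fails, so a failed call leaves `dst` unchanged.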
pub fn make_hard_link(src: &Path, dst: &Path) -> io::Result<()> {
    let dst_dir = dst.parent().ok_or_else(|| Error::new(ErrorKind::Other, "No parent"))?;
    let temp = tempfile::Builder::new().tempfile_in(dst_dir)?;
    fs::rename(dst, temp.path())?;
    let result = fs::hard_link(src, dst);
    if result.is_err() {
        fs::rename(temp.path(), dst)?;
    }
    result
}

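/// Writes one `path//size//modified_date//hash` line per entry to the cache
/// file; entries smaller than `minimal_cache_file_size` are skipped.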
fn save_hashes_to_file(hashmap: &BTreeMap<String, FileEntry>, text_messages: &mut Messages, type_of_hash: &HashType, minimal_cache_file_size: u64) {
    if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
        let cache_dir = PathBuf::from(proj_dirs.cache_dir());
        if cache_dir.exists() {
            if !cache_dir.is_dir() {
                text_messages.messages.push(format!("Cache dir {} is a file!", cache_dir.display()));
                return;
            }
        } else if let Err(e) = fs::create_dir_all(&cache_dir) {
            text_messages.messages.push(format!("Cannot create cache dir {}, reason {}", cache_dir.display(), e));
            return;
        }
        let cache_file = cache_dir.join(CACHE_FILE_NAME.replace(".", format!("_{:?}.", type_of_hash).as_str()));
        let file_handler = match OpenOptions::new().truncate(true).write(true).create(true).open(&cache_file) {
            Ok(t) => t,
            Err(e) => {
                text_messages.messages.push(format!("Cannot create or open cache file {}, reason {}", cache_file.display(), e));
                return;
            }
        };
        let mut writer = BufWriter::new(file_handler);

        for file_entry in hashmap.values() {
            // Only cache files at least as big as minimal_cache_file_size
            if file_entry.size >= minimal_cache_file_size {
                let string: String = format!("{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.modified_date, file_entry.hash);

                if let Err(e) = writeln!(writer, "{}", string) {
                    text_messages.messages.push(format!("Failed to save some data to cache file {}, reason {}", cache_file.display(), e));
                    return;
                };
            }
        }
    }
}

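/// Common streaming-hash interface, implemented above for the blake3,
/// crc32fast and xxh3 hashers so callers can work with a `Box<dyn MyHasher>`.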
pub trait MyHasher {
    fn update(&mut self, bytes: &[u8]);
    fn finalize(&self) -> String;
}

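/// Hashes a file chunk by chunk until EOF or until at least `limit` bytes have
/// been read (so a limit of 0 still hashes one buffer's worth of data).
/// Returns the hash string and the number of bytes actually read.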
fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashType, limit: u64) -> Result<(String, u64), String> {
    let mut file_handler = match File::open(&file_entry.path) {
        Ok(t) => t,
        Err(e) => return Err(format!("Unable to check hash of file {}, reason {}", file_entry.path.display(), e)),
    };
    let hasher = &mut *hash_type.hasher();
    let mut current_file_read_bytes: u64 = 0;
    loop {
        let n = match file_handler.read(buffer) {
            Ok(0) => break,
            Ok(t) => t,
            Err(e) => return Err(format!("Error happened when checking hash of file {}, reason {}", file_entry.path.display(), e)),
        };

        current_file_read_bytes += n as u64;
        hasher.update(&buffer[..n]);

        if current_file_read_bytes >= limit {
            break;
        }
    }
    Ok((hasher.finalize(), current_file_read_bytes))
}

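/// Loads cached hashes written by `save_hashes_to_file`, skipping malformed
/// lines and entries whose file no longer exists, grouped by file size.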
fn load_hashes_from_file(text_messages: &mut Messages, type_of_hash: &HashType) -> Option<BTreeMap<u64, Vec<FileEntry>>> {
    if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
        let cache_dir = PathBuf::from(proj_dirs.cache_dir());
        let cache_file = cache_dir.join(CACHE_FILE_NAME.replace(".", format!("_{:?}.", type_of_hash).as_str()));
        let file_handler = match OpenOptions::new().read(true).open(&cache_file) {
            Ok(t) => t,
            Err(_inspected) => {
                // A missing cache file is normal on the first run, so no message is pushed here
                return None;
            }
        };

        let reader = BufReader::new(file_handler);

        let mut hashmap_loaded_entries: BTreeMap<u64, Vec<FileEntry>> = Default::default();

        // Read the file line by line using the lines() iterator from std::io::BufRead.
        for (index, line) in reader.lines().enumerate() {
            let line = match line {
                Ok(t) => t,
                Err(e) => {
                    text_messages.warnings.push(format!("Failed to load line number {} from cache file {}, reason {}", index + 1, cache_file.display(), e));
                    return None;
                }
            };
            let uuu = line.split("//").collect::<Vec<&str>>();
            if uuu.len() != 4 {
                text_messages
                    .warnings
                    .push(format!("Found invalid data (wrong number of fields) in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
                continue;
            }
            // Don't load cache data if the file no longer exists
            if Path::new(uuu[0]).exists() {
                let file_entry = FileEntry {
                    path: PathBuf::from(uuu[0]),
                    size: match uuu[1].parse::<u64>() {
                        Ok(t) => t,
                        Err(e) => {
                            text_messages
                                .warnings
                                .push(format!("Found invalid size value in line {} - ({}) in cache file {}, reason {}", index + 1, line, cache_file.display(), e));
                            continue;
                        }
                    },
                    modified_date: match uuu[2].parse::<u64>() {
                        Ok(t) => t,
                        Err(e) => {
                            text_messages
                                .warnings
                                .push(format!("Found invalid modified date value in line {} - ({}) in cache file {}, reason {}", index + 1, line, cache_file.display(), e));
                            continue;
                        }
                    },
                    hash: uuu[3].to_string(),
                };
                hashmap_loaded_entries.entry(file_entry.size).or_insert_with(Vec::new);
                hashmap_loaded_entries.get_mut(&file_entry.size).unwrap().push(file_entry);
            }
        }

        return Some(hashmap_loaded_entries);
    }

    text_messages.messages.push("Cannot find or open system config dir to save cache file".to_string());
    None
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{read_dir, File};
    use std::io;
    #[cfg(target_family = "windows")]
    use std::os::windows::fs::MetadataExt;
    #[cfg(target_family = "unix")]
    use std::os::unix::fs::MetadataExt;

    #[cfg(target_family = "unix")]
    fn assert_inode(before: &Metadata, after: &Metadata) {
        assert_eq!(before.ino(), after.ino());
    }
    #[cfg(target_family = "windows")]
    fn assert_inode(_: &Metadata, _: &Metadata) {}

    #[test]
    fn test_make_hard_link() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, dst) = (dir.path().join("a"), dir.path().join("b"));
        File::create(&src)?;
        let metadata = fs::metadata(&src)?;
        File::create(&dst)?;

        make_hard_link(&src, &dst)?;

        assert_inode(&metadata, &fs::metadata(&dst)?);
        assert_eq!(metadata.permissions(), fs::metadata(&dst)?.permissions());
        assert_eq!(metadata.modified()?, fs::metadata(&dst)?.modified()?);
        assert_inode(&metadata, &fs::metadata(&src)?);
        assert_eq!(metadata.permissions(), fs::metadata(&src)?.permissions());
        assert_eq!(metadata.modified()?, fs::metadata(&src)?.modified()?);

        let mut actual = read_dir(&dir)?.map(|e| e.unwrap().path()).collect::<Vec<PathBuf>>();
        actual.sort();
        assert_eq!(vec![src, dst], actual);
        Ok(())
    }

    #[test]
    fn test_make_hard_link_fails() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, dst) = (dir.path().join("a"), dir.path().join("b"));
        File::create(&dst)?;
        let metadata = fs::metadata(&dst)?;

        assert!(make_hard_link(&src, &dst).is_err());

        assert_inode(&metadata, &fs::metadata(&dst)?);
        assert_eq!(metadata.permissions(), fs::metadata(&dst)?.permissions());
        assert_eq!(metadata.modified()?, fs::metadata(&dst)?.modified()?);

        assert_eq!(vec![dst], read_dir(&dir)?.map(|e| e.unwrap().path()).collect::<Vec<PathBuf>>());
        Ok(())
    }

    #[test]
    fn test_filter_hard_links_empty() {
        let expected: Vec<FileEntry> = Default::default();
        assert_eq!(expected, filter_hard_links(&[]));
    }

    #[cfg(target_family = "unix")]
    #[test]
    fn test_filter_hard_links() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, dst) = (dir.path().join("a"), dir.path().join("b"));
        File::create(&src)?;
        fs::hard_link(src.clone(), dst.clone())?;
        let e1 = FileEntry { path: src, ..Default::default() };
        let e2 = FileEntry { path: dst, ..Default::default() };
        let actual = filter_hard_links(&[e1.clone(), e2]);
        assert_eq!(vec![e1], actual);
        Ok(())
    }

    #[test]
    fn test_filter_hard_links_regular_files() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, dst) = (dir.path().join("a"), dir.path().join("b"));
        File::create(&src)?;
        File::create(&dst)?;
        let e1 = FileEntry { path: src, ..Default::default() };
        let e2 = FileEntry { path: dst, ..Default::default() };
        let actual = filter_hard_links(&[e1.clone(), e2.clone()]);
        assert_eq!(vec![e1, e2], actual);
        Ok(())
    }

    #[test]
    fn test_hash_calculation() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let mut buf = [0u8; 1 << 10];
        let src = dir.path().join("a");
        let mut file = File::create(&src)?;
        file.write_all(b"aa")?;
        let e = FileEntry { path: src, ..Default::default() };
        let r = hash_calculation(&mut buf, &e, &HashType::Blake3, 0).unwrap();
        assert_eq!(2, r.1);
        assert!(!r.0.is_empty());
        Ok(())
    }

    #[test]
    fn test_hash_calculation_limit() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let mut buf = [0u8; 1];
        let src = dir.path().join("a");
        let mut file = File::create(&src)?;
        file.write_all(b"aa")?;
        let e = FileEntry { path: src, ..Default::default() };
        let r1 = hash_calculation(&mut buf, &e, &HashType::Blake3, 1).unwrap();
        let r2 = hash_calculation(&mut buf, &e, &HashType::Blake3, 2).unwrap();
        let r3 = hash_calculation(&mut buf, &e, &HashType::Blake3, u64::MAX).unwrap();
        assert_ne!(r1, r2);
        assert_eq!(r2, r3);
        Ok(())
    }

    #[test]
    fn test_hash_calculation_invalid_file() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let mut buf = [0u8; 1 << 10];
        let src = dir.path().join("a");
        let e = FileEntry { path: src, ..Default::default() };
        let r = hash_calculation(&mut buf, &e, &HashType::Blake3, 0).unwrap_err();
        assert!(!r.is_empty());
        Ok(())
    }
}