1 /* Dupmerge - Reclaim disk space by linking identical files together
2
3 This is a utility that scans a UNIX directory tree looking for pairs of
4 distinct files with identical content. When it finds such files, it
5 deletes one file to reclaim its disk space and then recreates its path
6 name as a link to the other copy.
7 My first version of this program circa 1993 worked by computing MD5
8 hashes of every file, sorting the hashes and then looking for duplicates.
9 This worked, but it was unnecessarily slow. The comparison function I use
10 now stops comparing two files as soon as it determines their lengths are
11 different, which is a win when you have many large files with unique lengths.
12
13 * This program reads from standard input a list of files (such
14 * as that generated by "find . -print") and discovers which files are
15 * identical. Dupmerge unlinks one file of each identical pair and
16 * recreates its path name as a link to the other.
17 *
18 * Non-plain files in the input (directories, pipes, devices, etc)
19 * are ignored. Identical files must be on the same file system to be linked.
20 *
21 * Dupmerge prefers to keep the older of two identical files, as the older
22 * timestamp is more likely to be the correct one given that many
23 * copy utilities (e.g., 'cp') do not by default preserve modification
24 * times.
25 *
26 * Dupmerge works by quicksorting a list of path names, with the
27 * actual unlinking and relinking steps performed as side effects of
28 * the comparison function. The results of the sort are discarded.
29 *
30 * Command line arguments:
31 * -n Suppress the actual unlinking and relinking
32 * -q Operate in quiet mode (otherwise, relinks are displayed on stdout)
33 *
34 * 12 February 1998 Phil Karn, karn@ka9q.ampr.org
35 * Copyright Phil Karn. May be used under the terms of the GNU Public License.
36
37 --------------------------------------------------------------------------------
38
39 Added swap macro, invers mode, no linking of zero length files or files with no disk usage (sparse files),
40 void casts for semantic checkers, version number, sorting of equal size files due to dev and ino and name,
41 switched from newline terminated file names to zero terminated file names ...
42 Switched to C99.
43 Tested under SuSE 9.2 and Debian (both Kernel 2.6), tested with 4 and 7 Gigabyte files, coreutils sources,
44 md5 collisions, kernel sources, my archive with CD and DVD collections in an encrypted partition (200 GB,
45 1/2 Million files, 5 hours run time, about 30 GB reclaimed by 100,000 hard links).
46 Changed to reading/writing/comparing 8 byte blocks (instead of 1 byte blocks): 2 times faster now.
47 Changed inverse mode from O(n*ln(n)) to O(n).
48 Added sparse and desparse mode and combo flag.
49 Dr. Rolf Freitag a.k.a. Dr. No, 2004.
50
51 For files bigger than 2 GiB, > 2^31 Byte, you should use the compiler option
52 "-D_FILE_OFFSET_BITS=64".
53
54 Example for compilation (and striping + prelinking) on i686 ("PC" with Pentium III or higher or Athlon)
55 with shell function c (in ~/.bashrc):
56 function c {
57 gcc -Wall -I. -O3 -D_GNU_SOURCE -D__SMP__ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT \
58 -pthread -mcpu=i686 -fexpensive-optimizations -DCPU=686 -ffast-math -m486 -lm -lz -o $1 $1.c && strip $1 \
59 && prelink -fmRv ./$1 && chmod +t $1
60 }
61
62 Example usage of this shell function (in ~/.bashrc): c dupmerge
63
64 But a simple "gcc -D_GNU_SOURCE -O2 -o dupmerge dupmerge.c" also does the job.
65
66 The inverse mode expands the hard links to simple files if used without the option -n.
67 With option -n hard links are only reported.
68 The reverse mode can be used e. g. for expanding backups, which have been shrinked with
69 with the default mode.
70
71 Caution: If there is not enough space for expanding the hard links (in inverse mode) hard links
72 will be lost! It's also a bad idea to kill dupmerge with signal 9 before it finishes because
73 files can be lost or damaged or new temporary and maybe big files can remain.
74
75 example for usage:
76 find ./ -type f -print0 | dupmerge 2>&1 | tee /tmp/user0/dupmerge.out
77
78 Todo:
79 - progress meter with approx. percent
80 - change from comparing signed to unsigned numbers for clearer ordering
81 - inverse mode optimization: expand with the same file attributes (copy most of struct stat)
82 - option m <number> for minimum block size; (not file size because of sparse files).
83 smaller files will not be hard linked
84 - ANSI-C (C99) comppatible error handling of the comparison functions
85 - for option n (nodo): normal mode: show the number of blocks which can be saved,
86 - refactorisation: separate help function and bit fiels +macros instead of legacy variables
87 - error message +advice when wrong parameters are used
88 - option f for using a soft link as second try when a hard link could not be made and the reverse for
89 inverse mode (inverse mode: expand hard and soft links)
90 - option z for not excluding zero length files from linking/expanding/erasing
91 - more user friendly output with e. g. freed block + saved space in kB instead of kiB or blocks (see IEC 60027-2),
92 extra explanation in output when -n is used,
93 - tests with ntfs partitions for MS-Win-users
94 - correkt block counting (sparse files are yet not counted correct)
95 - man page, info page
96 - autoconf/automake + rpm and deb package
97 - An option for scheduling in parallel with 2..N pthreads e. g. for SMP boxes with ramdisk.
98 - other features like a gziped cvs file with the file attributes from lstat of the files which were
99 linked
100 - an extra option for creating the tmpfiles on a specified partition for speedup with a ramdisk
101 - an option for calling dupmerge with find to shorten the command line the user has to do
102 - check if the user has enough permissions to replace files (in not nodo mode)
103 - self-calling, "user-friendly", mode with extra option which does calls dupmerge with find via the system call
104 - test with files which can be deleted only via the inode (http://www.cyberciti.biz/tips/delete-remove-files-with-inode-number.html)
105 - An additional option for doing deletion irreversible via shred or wipe or ... instead of unlink or rm.
106 - Checking for and reporting of filesystems where there are no links possible; e. g. no hard links with FAT.
107 - GUI
108
109 Version 1.73, 2008-03-01
110
111 */
112
113 #include <stdio.h>
114 #include <stdlib.h>
115 #include <string.h> // strndup,
116 #include <sys/types.h>
117 #include <sys/stat.h>
118 #include <unistd.h> // Nessisary for unlink, gtopt, link, fcmp.
119 #include <iso646.h> // for and, bitand, or, bitor ...
120 #include <stdbool.h> // _Bool
121 #include <signal.h> // signals
122 #include <limits.h> // UCHAR_MAX
123 #include <sys/mman.h> // mlockall
124 #include <sys/types.h> // vfork
125 #include <sys/wait.h> // vfork
126 #include <time.h> // localtime
127
128
129 #define mc_MIN(a, b) ((a) < (b) ? (a) : (b))
130 #define mc_MAX(a, b) ((a) > (b) ? (a) : (b))
131
132
133 // Swap two items a, b of size i_size, use (static) swap variable c for performance.
134 #define mc_SWAP_ITEMS(a, b, c, i_size) { memcpy (&(c), &(a), (i_size));\
135 memcpy (&(a), &(b), (i_size));\
136 memcpy (&(b), &(c), (i_size)); }
137
138 // easter egg
139 #define mc_ADVICE if (argc > 1 && 0 == strncmp (argv[1], "-advice", 8) ) \
140 { \
141 (void)printf ("Don't Panic!\n"); \
142 exit (42); \
143 } /* 42: The meaning of life, the universe, and everything ;-) */
144
145
146 // For registering the signal handler void sig_handler (int sig). This macro saves 32 bytes ;-)
147 #define mc_register_sig_handler \
148 {\
149 int i;\
150 for(i=0; i<=UCHAR_MAX; i++)\
151 signal (i, sig_handler);\
152 }
153
154
155 // union for copy and compare
156 union u_tag
157 {
158 int64_t i64;
159 char c[8];
160 };
161
162
163 int cmp (const void *, const void *); // function for qsort
164 int cmp0 (const void *, const void *); // function for qsort
165 int fcmp (const void *, const void *); // function for qsort
166 int fcmp0 (const void *, const void *); // function for qsort
167 int fcmp1 (const void *);
168 int fcmp2 (const void *, const _Bool);
169 _Bool different_files (const void *, const void *);
170
171 const double d_version = 1.73; // very simple version number of %1.2f format
172 struct stat s_swap; // swap variable
173 int Nodo = 0; // for nodo mode (only report)
174 int Quiet = 0; // for quiet mode
175 int i_soft = 0; // for using always soft links instead of the default hard links
176 _Bool b_inv = false; // invers mode indicator
177 _Bool b_sparse = false; // flag for sparseing
178 _Bool b_comb = false; // combo flag for normal + sparse mode
179 int i_minlinks = 1; // minimum of found hard links
180 int i_maxlinks = 1; // maximum of found hard links
181 long int li_minblocks = LONG_MAX; // minimum of saved blocks by sparse copying
182 long int li_maxblocks = 0; // maximum of saved blocks by sparse copying
183 int Files_deleted = 0;
184 int Blocks_reclaimed = 0;
185 int i_files_expanded = 0;
186 int i_blocks_declaimed = 0;
187 volatile int i_exit_flag = 0; // flag for exiting soon (not immediately) and controlled after a signal to terminate
188 char **a_names = NULL; // array of file names
189 char *a_flags = NULL; // Array with the flags of the files in a_names. 1 = marked for deletion because another file the the same content was found,
190 int nfiles = 0; // number of files
191
192
193 void
sig_handler(const int sig)194 sig_handler (const int sig) // signal handler
195 {
196 if ((SIGINT == sig) or (SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig) or (SIGTERM == sig))
197 {
198 i_exit_flag = 1; // set the flag for a graceful exit
199 // if ((SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig))
200 // {
201 // (void) printf ("\a\a\a\a\a\a\a Signal %d, program exiting... \r\n\a\a\a", sig);
202 // exit (0);
203 // }
204 }
205 return;
206 }
207
208
209 // initialisation for main
210 inline void
main_init1(void)211 main_init1 (void)
212 {
213 Files_deleted = 0;
214 Blocks_reclaimed = 0;
215 i_files_expanded = 0;
216 i_blocks_declaimed = 0;
217 i_minlinks = 1;
218 i_maxlinks = 1;
219 li_minblocks = LONG_MAX;
220 li_maxblocks = 0;
221 }
222
223
224 // delete the files (of name a_name[0 ... i_files] which are marked for deletion
225 void
delete_marked(const int i_files)226 delete_marked (const int i_files)
227 {
228 struct stat sa;
229 int i;
230
231 for (i = 0; i < i_files; i++)
232 {
233 if (i_exit_flag)
234 exit (0);
235 if (a_flags[i] bitand 1) // delete
236 {
237 if (-1 == lstat (a_names[i], &sa)) // or ! S_ISREG (sa.st_mode))
238 {
239 fprintf (stderr, "lstat(%s) failed\n", a_names[i]);
240 perror ("lstat");
241 return;
242 }
243 if (0 == Nodo)
244 {
245 printf ("deleting %s, freeing %lld blocks\n", a_names[i], ((1 == sa.st_nlink) ? sa.st_blocks : 0));
246 unlink (a_names[i]);
247 }
248 Files_deleted++;
249 if (1 == sa.st_nlink) // file a is no (hard) link
250 {
251 Blocks_reclaimed += sa.st_blocks;
252 }
253 }
254 }
255 return;
256 }
257
258
259 int
main(const int argc,const char * const argv[])260 main (const int argc, const char *const argv[])
261 {
262 char **names, buf[BUFSIZ]; //, *cp; // BUFSIZE is the default buffer size in stdio.h, usually 8192
263 int i, j, i1, i_mode = 0, i_ferror = 0, i_ret = 0; // integers, mode: 0: common, 1: deletion, i_ferror: for ferror
264 FILE *tmp; // file pointer for tmpfile
265 const char c_zero = '\0'; // for terminating strings
266 time_t now; // for actual time
267 struct tm *ptr_tm = (time (&now), localtime (&now)); // set actual time, convert to localtime
268
269 mc_ADVICE;
270 // read parameters for quiet mode and suppressing the actual unlinking and relinking. Rolf Freitag
271 while ((i = getopt (argc, (char *const *) argv, "cdhinqsSV")) != EOF)
272 {
273 switch (i)
274 {
275 case 'c':
276 b_comb = 1;
277 break;
278 case 'd':
279 i_mode = 1; // deletion mode
280 break;
281 case 'n':
282 Nodo = 1;
283 break;
284 case 'q':
285 Quiet = 1;
286 break;
287 case 'h':
288 fprintf (stdout, "This program can reclaim disk space by linking identical files together.\n");
289 fprintf (stdout, "It can also expand all hard links and reads the list of files from standard input.\n");
290 fprintf (stdout, "Example usage: find ./ -type f -print0 | dupmerge 2>&1 | tee ../dupmerge_log.txt\n");
291 fprintf (stdout, "Options:\n-h \t Show this help and exit.\n-V \t Show version number and exit.\n");
292 fprintf (stdout, "-d \t delete multiple files and hard links. Default: Preserve the alphabetically first file name.\n");
293 //fprintf (stdout, " \t Switch for ascessing/decessing order in deletion mode.\n");
294 fprintf (stdout, "-q \t Quiet mode.\n-n \t Nodo mode / read-only mode.\n");
295 fprintf (stdout, "-i \t Inverse switch: Expand all hard links in normal mode, replace files by their desparsed version if it is bigger.\n");
296 fprintf (stdout, "-s \t Flag for soft linking (default is hard linking). This option is beta because for linking of all ");
297 fprintf (stdout, "equal files more than one run of dupmerge is necessary and the inverse (expanding of soft links) is untested.\n");
298 fprintf (stdout, "-S \t Flag for Sparse mode: Replace files by their sparse version if it is smaller.\n");
299 fprintf (stdout, "-c \t Combo mode: Default mode +sparse mode. With -i it means inverse mode with unlinking and desparsing.\n");
300 fflush (stdout);
301 exit (0);
302 break;
303 case 's':
304 i_soft = 1;
305 break;
306 case 'V':
307 fprintf (stdout, "dupmerge version %1.2f\n", d_version);
308 fflush (stdout);
309 exit (0);
310 break;
311 case 'i':
312 b_inv = true;
313 break;
314 case 'S':
315 b_sparse = true;
316 break;
317 default:
318 break;
319 }
320 }
321 if (not Quiet)
322 {
323 (void) printf ("%s started at %d-%s%d-%s%d %s%d:%s%d:%s%d\n", argv[0],
324 ptr_tm->tm_year + 1900, ptr_tm->tm_mon + 1 > 9 ? "" : "0", ptr_tm->tm_mon + 1,
325 ptr_tm->tm_mday > 9 ? "" : "0", ptr_tm->tm_mday,
326 ptr_tm->tm_hour > 9 ? "" : "0", ptr_tm->tm_hour,
327 ptr_tm->tm_min > 9 ? "" : "0", ptr_tm->tm_min, ptr_tm->tm_sec > 9 ? "" : "0", ptr_tm->tm_sec);
328 fflush (stdout);
329 }
330 #ifndef __CYGWIN__
331 mlockall (MCL_CURRENT bitor MCL_FUTURE); // be cached
332 #endif
333 // Read list of file names into tmp file and count
334 tmp = tmpfile ();
335 if (NULL == tmp)
336 {
337 (void) fprintf (stderr, "could not open temporary file, exiting\n");
338 exit (-1);
339 }
340 nfiles = 0;
341 if (not Quiet)
342 {
343 (void) printf ("tmpfile (pointer %p) created, processing ...\n", tmp);
344 fflush (stdout);
345 }
346 while (!feof (stdin)) // while not end of input
347 {
348 buf[0] = '\0'; // set strnlen to 0 so that strnlen (buf, BUFSIZ) is 0 at EOF
349 for (i = 0; i < BUFSIZ; i++) // read till the sequence operator \0 is found
350 {
351 fread (&buf[i], 1, 1, stdin);
352 i_ferror = ferror (stdin);
353 if (i_ferror)
354 {
355 (void) fprintf (stderr, "stdin: error %d.\n", i_ferror);
356 i_ret = -1;
357 }
358 if ('\0' == buf[i]) // termination found
359 break;
360 }
361 if ('\0' not_eq buf[sizeof (buf) - 1]) // no termination at buffer end
362 buf[sizeof (buf) - 1] = '\0';
363 if (strnlen (buf, BUFSIZ))
364 nfiles++; // new file: increment file counter
365 if ((EOF == fputs (buf, tmp)) or (not fwrite (&c_zero, 1, 1, tmp))) // store new file name in tmp file
366 {
367 (void) fprintf (stderr, "could not write to temporary file, exiting\n");
368 exit (-1);
369 }
370 }
371 // Now that we know how many there are, printf, allocate space and re-read.
372 if (not Quiet)
373 {
374 (void) printf ("Input: %d files, processing ...\n", nfiles);
375 fflush (stdout);
376 }
377 if (0 == nfiles) // no input, nothing to do
378 exit (0);
379 rewind (tmp);
380 names = (char **) calloc (nfiles, sizeof (char *));
381 a_names = (char **) calloc (nfiles, sizeof (char *));
382 a_flags = (char *) calloc (nfiles, sizeof (char));
383 if ((NULL == names) or (NULL == a_flags) or (NULL == a_names))
384 {
385 (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
386 exit (-1);
387 }
388 for (i = 0; i < nfiles; i++) // get the file names from the tmp file
389 {
390 for (j = 0; j < BUFSIZ; j++) // read till the sequence operator \0 is found
391 {
392 fread (&buf[j], 1, 1, tmp);
393 i_ferror = ferror (stdin);
394 if (i_ferror)
395 {
396 (void) fprintf (stderr, "tmpfile: error %d.\n", i_ferror);
397 i_ret = -1;
398 }
399 if ('\0' == buf[j]) // termination found
400 break;
401 }
402 if ((NULL == (names[i] = strndup (buf, BUFSIZ))) or (NULL == (a_names[i] = strndup (buf, BUFSIZ))))
403 { // The command "gcc -o dupmerge dupmerge.c" prduces the Warning "dupmerge.c:191: warning: assignment
404 // makes pointer from integer without a cast" but this is a warning bug; in the if statement above there is no int!
405 // To get rid of that warning you can cast strndup to (char *) or equivalent (typeof(names[i])) or use -O3. [gcc version 3.3.4]
406 // For later versions of gcc you should use the command line option -D_GNU_SOURCE, but that's not necessary under cygwin.
407 (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
408 exit (1);
409 }
410 }
411 (void) fclose (tmp);
412 mc_register_sig_handler; // register signal handler before critical sections
413 qsort (a_names, nfiles, sizeof (char *), cmp0); // sort only the (pointers to the) file names in a_names
414 switch (i_mode)
415 {
416 case 0: // no deletion
417 switch (b_comb and b_inv) // For desparse mode after nomal inverted mode. Otherwise a second run would be nessisary.
418 {
419 case 0: // not combo mode and inverse switch
420 if (b_sparse) // sparse/desparse files first because sparsing/desparsing expands hard links (changes inode)
421 {
422 // no sorted list of files at this point
423 i1 = Quiet;
424 if (b_comb and Nodo) // combo mode and read-only (Nodo): do nodo sparsing/desparsing during sorting with output here
425 Quiet = 0;
426 else
427 Quiet = 1;
428 i = Nodo;
429 Nodo = 1; // for only sorting the names
430 qsort (names, nfiles, sizeof (char *), fcmp); // sort only the (pointers to the) file names
431 Nodo = i; // restore Nodo
432 Quiet = i1; // restore Quiet
433 foo:
434 main_init1 (); // start with new init
435 // sparse or desparse
436 for (i = 0; i < nfiles; i++)
437 {
438 // only different files, call fcmp2/3 only for the last file in a block of equal hard links
439 if ((nfiles - 1 == i) or different_files (names + i, names + i + 1))
440 (void) fcmp2 (names + i, not b_inv);
441 }
442 if (not Quiet)
443 {
444 (void)
445 printf
446 ("Files%s %ssparsed: %d of %d, Disk blocks%s %sclaimed: %d\n",
447 (Nodo ? " which can be" : ""), (b_inv ? "de" : ""),
448 (b_inv ? i_files_expanded : Files_deleted), nfiles,
449 (Nodo ? " can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
450 (void)
451 printf
452 ("Minimum of disk blocks which could be %s by %s copying: %ld, Maximum: %ld.\n",
453 (b_inv ? "declaimed" : "reclaimed"), (b_inv ? "desparse" : "sparse"), li_minblocks, li_maxblocks);
454 fflush (stdout);
455 }
456 if ((b_comb and Nodo) or (b_inv)) // all work done
457 exit (0);
458 } // if (b_sparse), no break
459 case 1:
460 main_init1 (); // start with new init
461 if ((b_sparse and b_comb) or (not b_sparse)) // sparse mode +combo flag or no sparse mode: normal mode, maybe with inversion
462 { // Without the combo flag and with sparse mode there is no normal (maybe inverted) mode!
463 if (not b_inv)
464 {
465 qsort (names, nfiles, sizeof (char *), fcmp);
466 if (not Quiet)
467 {
468 (void) printf ("Scanning for more dups ...\n");
469 fflush (stdout);
470 }
471 for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
472 (void) fcmp (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
473 }
474 else
475 {
476 if (not Quiet)
477 {
478 (void) printf ("Scanning for hard links ...\n");
479 fflush (stdout);
480 }
481 for (i = 0; i < nfiles; i++)
482 (void) fcmp1 (names + i);
483 }
484 if (not Quiet)
485 {
486 (void)
487 printf
488 ("Files%s %s: %d of %d, Disk blocks%s %sclaimed: %d\n",
489 (Nodo ? " which can be" : ""),
490 (b_inv ? "expanded" : "linked"),
491 (b_inv ? i_files_expanded : Files_deleted), nfiles,
492 (Nodo ? " which can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
493 (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
494 fflush (stdout);
495 }
496 } // if ((b_sparse and b_comb) or (not b_sparse))
497 if (b_comb and b_inv) // do desparsing in inverse mode
498 goto foo;
499 break;
500 default:
501 (void) printf ("Unexpected switch error ...\n");
502 exit (-1);
503 break;
504 } // switch
505 break;
506 case 1: // deletion
507 // no sorted list of files at this point
508 qsort (names, nfiles, sizeof (char *), fcmp0); // sort only the file names
509 // second run
510 for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
511 (void) fcmp0 (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
512 delete_marked (nfiles);
513 if (not Quiet)
514 {
515 (void)
516 printf
517 ("Duplicate files (%s deleted): %d of %d, Disk blocks%s reclaimed: %d\n",
518 (Nodo ? "which can be" : "which have been"), (b_inv ? i_files_expanded : Files_deleted), nfiles, (Nodo ? " which can be" : ""), Blocks_reclaimed);
519 (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
520 fflush (stdout);
521 }
522 break;
523 default:
524 (void) printf ("Unexpected switch error ...\n");
525 exit (-1);
526 break;
527 } // switch
528 exit (i_ret);
529 } // main
530
531
532 // For alphabetic ordering/searching.
533 int
cmp(const void * p0,const void * p1)534 cmp (const void *p0, const void *p1)
535 {
536 int i = 0;
537 char **p2 = (char **) p0;
538 char **p3 = (char **) p1;
539 i = strncmp (*p2, *p3, BUFSIZ); // compare the two strings
540 return (i);
541 }
542
543
544 // For alphabetic ordering/searching. With global inversion flag for inverse mode.
545 int
cmp0(const void * p0,const void * p1)546 cmp0 (const void *p0, const void *p1)
547 {
548 int i = 0;
549 char **p2 = (char **) p0;
550 char **p3 = (char **) p1;
551 i = strncmp (*p2, *p3, BUFSIZ); // compare the two strings
552 if (b_inv)
553 return (-i);
554 return (i);
555 }
556
557
558 // This is the comparison function called by qsort, where the real work
559 // is done as a side effect. Due to ANSI-C the current version is not
560 // completely correct because if the same objects are passed more than once to the
561 // comparison function the results must be consistent with another; see 7.20.5 in C99.
562 // If errors like lstat failed do occur, this is not fullfilled yet. With fcmp1 it's the same.
563 // sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
564 int
fcmp(const void * a,const void * b)565 fcmp (const void *a, const void *b)
566 {
567 struct stat sa, sb;
568 FILE *fa, *fb;
569 int i1, i2, i_nlinka, i_nlinkb, i_fa, i_fb;
570 int rval = 0;
571 const char *filea, *fileb;
572 union u_tag u_buffer1 = {
573 0
574 }, u_buffer2 =
575 {
576 0};
577 void *buffer1 = (void *) &u_buffer1;
578 void *buffer2 = (void *) &u_buffer2;
579 _Bool b_swap = false;
580
581 if (i_exit_flag)
582 exit (0);
583 // Nonexistent or non-plain files are less than any other file
584 if (NULL == a)
585 return (-1);
586 filea = *(const char **) a;
587 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
588 {
589 fprintf (stderr, "lstat(%s) failed\n", filea);
590 perror ("lstat");
591 return (-1);
592 }
593 if (NULL == b)
594 return 1;
595 fileb = *(const char **) b;
596 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
597 {
598 fprintf (stderr, "lstat(%s) failed\n", fileb);
599 perror ("lstat");
600 return (1);
601 }
602 i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
603 i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
604 // Smaller files are "less"
605 if (sa.st_size < sb.st_size)
606 return (-1);
607 if (sa.st_size > sb.st_size)
608 return (1);
609 // We now know both files exist, are plain files, are the same size
610 // if both files are hard linked or zero length: sort by name
611 if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) or (0 == sa.st_size))
612 return (strncmp (filea, fileb, BUFSIZ));
613 // We now know both files exist, are plain files, are the same size > 0,
614 // and are not already linked, so compare them alphabetical, if they are on the same device
615 if (sa.st_dev != sb.st_dev)
616 return ((sa.st_dev < sb.st_dev) ? -1 : 1);
617 if (NULL == (fa = fopen (filea, "r")))
618 return (-1); // Unreadable files are "less than"
619 if (NULL == (fb = fopen (fileb, "r")))
620 {
621 fclose (fa);
622 return (1);
623 }
624 // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
625 // On big endian machines it's alphabetic ordering, on little andian machines it's
626 // alphabetic ordering with 64 bit 'characters'.
627 // Du to C99, 7.19.8.1 read must be done with chars.
628 while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
629 {
630 // Mask the unsused 1..7 bytes, because the standard says nothing about them.
631 memset (&(u_buffer1.c[i1]), 0, 8 - i1); // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
632 memset (&(u_buffer2.c[i2]), 0, 8 - i2);
633 // check for file errors
634 i_fa = ferror (fa);
635 i_fb = ferror (fb);
636 if (i_fa or i_fb) // check for file errors
637 {
638 if (i_fa)
639 {
640 (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
641 rval = -1; // error file: smaller
642 break;
643 }
644 if (i_fb)
645 {
646 (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
647 rval = 1;
648 break;
649 }
650 }
651 if (u_buffer1.i64 != u_buffer2.i64) // compare
652 {
653 rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
654 break;
655 }
656 }
657 (void) fclose (fa);
658 (void) fclose (fb);
659 if (rval) // unequal files of same size
660 return rval;
661
662 if ((sa.st_blocks) or (sb.st_blocks)) // avoid linking sparse files with no blocks allocated
663 {
664 // The two files have same content, size > 0, blocks allocated > 0 and are on the same device, so link them.
665 // We prefer to keep the one with less blocks, or if they're the
666 // same the older one, or if they're the same, the one with more (hard) links.
667 if ((sb.st_blocks < sa.st_blocks) or (sb.st_mtime > sa.st_mtime) or (sb.st_nlink < sa.st_nlink))
668 {
669 mc_SWAP_ITEMS (sa, sb, s_swap, sizeof (struct stat)); // swap items to keep original sb
670 mc_SWAP_ITEMS (filea, fileb, s_swap, sizeof (char *)); // swap file name pointers
671 b_swap = true;
672 }
673 // before linking: store number of hard links of each file
674 i_nlinka = sa.st_nlink;
675 i_nlinkb = sb.st_nlink;
676 if (1 == sa.st_nlink) // file a is no (hard) link
677 {
678 Files_deleted++;
679 Blocks_reclaimed += sa.st_blocks;
680 }
681 if (!Nodo and (unlink (filea)))
682 {
683 (void) fprintf (stderr, "unlink(%s) failed\n", filea);
684 perror ("unlink");
685 return (-1);
686 }
687 if (!i_soft)
688 {
689 if (!Nodo and (-1 == link (fileb, filea)))
690 {
691 (void) fprintf (stderr, "link(%s,%s) failed\n", fileb, filea);
692 perror ("link");
693 return (-1);
694 }
695 }
696 else
697 {
698 if (!Nodo and (-1 == symlink (fileb, filea)))
699 {
700 (void) fprintf (stderr, "symlink(%s,%s) failed\n", filea, fileb);
701 perror ("symlink");
702 return (-1);
703 }
704 }
705 if (!Quiet)
706 {
707 (void) printf ("ln %s %s %s: %d, %d -> %d, freed +%llu blocks\n", i_soft ? "-s" : "", fileb, filea, i_nlinkb, i_nlinka, i_nlinka + i_nlinkb,
708 sb.st_blocks);
709 fflush (stdout);
710 }
711 }
712 return ((b_swap) ? strncmp (fileb, filea, BUFSIZ) : strncmp (filea, fileb, BUFSIZ)); //
713 } // fcmp
714
715
716 // return the index of file with name a_name
717 int
nindex(const char * a_name)718 nindex (const char *a_name)
719 {
720 void *p_v = bsearch (&a_name, a_names, nfiles, sizeof (char *), cmp0);
721 int i_ret = (int) (((((void *) p_v) - ((void *) a_names))) / (sizeof (char *)));
722 if (NULL == p_v)
723 {
724 printf ("Error: bsearch(%s, %p, %d, %d, %p) returned NULL.\n", a_name, a_names, nfiles, sizeof (char *), cmp);
725 return (-1); // error
726 }
727 return (i_ret);
728 }
729
730
731 // This is the comparison function called by qsort, for marking all duplicate files with that flag
732 // Due to ANSI-C the current version is not
733 // completely correct because if the same objects are passed more than one to the
734 // comparison function the results must be consistent with another, 7.20.5 C99.
735 // If errors like lstat failed do occur, this is not fullfilled yet.
736 // sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
737 int
fcmp0(const void * a,const void * b)738 fcmp0 (const void *a, const void *b)
739 {
740 struct stat sa, sb;
741 FILE *fa, *fb;
742 int i1, i2;
743 int rval = 0, ja = 0, jb = 0, i_fa, i_fb;
744 const char *filea, *fileb;
745 union u_tag u_buffer1 = {
746 0
747 }, u_buffer2 =
748 {
749 0};
750 void *buffer1 = (void *) &u_buffer1;
751 void *buffer2 = (void *) &u_buffer2;
752
753 if (i_exit_flag)
754 exit (0);
755 // Nonexistent or non-plain files are less than any other file
756 if (NULL == a)
757 return (-1);
758 filea = *(const char **) a;
759 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
760 {
761 fprintf (stderr, "lstat(%s) failed\n", filea);
762 perror ("lstat");
763 return (-1);
764 }
765 if (NULL == b)
766 return 1;
767 fileb = *(const char **) b;
768 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
769 {
770 fprintf (stderr, "lstat(%s) failed\n", fileb);
771 perror ("lstat");
772 return (1);
773 }
774 i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
775 i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
776 // Smaller files are "less"
777 if (sa.st_size < sb.st_size)
778 return (-1);
779 if (sa.st_size > sb.st_size)
780 return (1);
781 // We now know both files exist, are plain files, are the same size
782 // if both files are hard linked: sort by name
783 if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)))
784 {
785 out_equal:
786 // set ja, jb to the index of a, b
787 ja = nindex (filea);
788 jb = nindex (fileb);
789 if ((ja >= 0) and (a_flags[ja] bitand 1))
790 return (-1); // marked file first
791 if ((jb >= 0) and (a_flags[jb] bitand 1))
792 return (1); // marked file first
793 // now both files are not marked
794 if ((ja >= 0) and (jb >= 0))
795 {
796 if ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ) <= 0) // alphabetic ordering
797 {
798 a_flags[ja] = 1;
799 if (!Quiet)
800 {
801 (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", filea, fileb);
802 fflush (stdout);
803 }
804 return (-1); // marked file first
805 }
806 else
807 {
808 a_flags[jb] = 1;
809 if (!Quiet)
810 {
811 (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", fileb, filea);
812 fflush (stdout);
813 }
814 return (1); // marked file first
815 }
816 }
817 return ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ));
818 }
819 // We now know both files exist, are plain files, are the same size > 0,
820 // and are not already linked, so compare them alphabetical
821 if (NULL == (fa = fopen (filea, "r")))
822 return (-1); // Unreadable files are "less than"
823 if (NULL == (fb = fopen (fileb, "r")))
824 {
825 fclose (fa);
826 return 1;
827 }
828 // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
829 // On big endian machines it's alphabetic ordering, on little andian machines it's
830 // alphabetic ordering with 64 bit 'characters'.
831 while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
832 {
833 // Mask the unsused 1..7 bytes, because the standard says nothing about them.
834 memset (&(u_buffer1.c[i1]), 0, 8 - i1); // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
835 memset (&(u_buffer2.c[i2]), 0, 8 - i2);
836 // check for file errors
837 i_fa = ferror (fa);
838 i_fb = ferror (fb);
839 if (i_fa or i_fb) // check for file errors
840 {
841 if (i_fa)
842 {
843 (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
844 rval = -1; // error file: smaller
845 break;
846 }
847 if (i_fb)
848 {
849 (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
850 rval = 1;
851 break;
852 }
853 }
854 if (u_buffer1.i64 != u_buffer2.i64) // compare
855 {
856 rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
857 break;
858 }
859 }
860 (void) fclose (fa);
861 (void) fclose (fb);
862 if (rval) // unequal files of same size
863 return rval;
864 // else
865 goto out_equal;
866 } // fcmp0
867
868
869 // This is the hard link expansion function for the inverse mode. Needs only one file name as argument.
870 int
fcmp1(const void * a)871 fcmp1 (const void *a)
872 {
873 struct stat sa;
874 FILE *fa, *fb;
875 int i1, i, i_b, i_fa, i_fb, i_ret = 0;
876 char a_cd[BUFSIZ] = { '\0' }; // for directory
877 char a_cf[BUFSIZ] = { '\0' }; // for tempfile
878 const char *filea, *fileb = a_cf; // file names, fileb after mkstemp
879 union u_tag u_buffer1 = {
880 0
881 };
882 void *buffer1 = (void *) &u_buffer1;
883
884 if (i_exit_flag)
885 exit (0);
886 // Nonexistent or non-plain files are less than any other file
887 if (NULL == a)
888 return (-1);
889 filea = *(const char **) a;
890 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
891 {
892 fprintf (stderr, "lstat(%s) failed\n", filea);
893 perror ("lstat");
894 return (-1);
895 }
896 i_minlinks = mc_MIN (i_minlinks, sa.st_nlink);
897 i_maxlinks = mc_MAX (i_maxlinks, sa.st_nlink);
898 // check for hard links
899 if (sa.st_nlink > 1) // hard link found
900 {
901 i_files_expanded++;
902 i_blocks_declaimed += sa.st_blocks;
903 if (!Nodo) // expand the hard link
904 {
905 strncpy (a_cd, filea, BUFSIZ);
906 // extract the directory from the file name: delete from end to last "/"
907 for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
908 {
909 if (a_cd[i] != '/')
910 a_cd[i] = '\0';
911 else
912 break;
913 }
914 // now the actual directory is in a_c, create temporary file there
915 strncpy (a_cf, a_cd, BUFSIZ);
916 strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2); // template for mkstemp
917 i_b = mkstemp (a_cf);
918 if (-1 == i_b)
919 {
920 (void) fprintf (stderr, "Could not create temporary file.\n");
921 perror ("file_create");
922 return (-1);
923 }
924 if ((NULL == (fa = fopen (filea, "r"))) or (NULL == (fb = fdopen (i_b, "w")))) // open file and tmpfile
925 {
926 (void) fprintf (stderr, "Expanding the hard link(%s) failed.\n", filea);
927 perror ("expand");
928 return (-1);
929 }
930 // expand to tmpfile
931 while ((i1 = fread (buffer1, 1, 8, fa)) != 0) // copy fa to fb in 8 byte blocks
932 {
933 (void) fwrite (buffer1, 1, i1, fb);
934 }
935 i_fa = ferror (fa);
936 i_fb = ferror (fb);
937 if (i_fa or i_fb) // check for file errors
938 {
939 if (i_fa)
940 (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
941 if (i_fb)
942 (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
943 i_ret = -1;
944 }
945 (void) fclose (fa);
946 (void) fclose (fb);
947 // unlink
948 if (unlink (filea))
949 {
950 (void) fprintf (stderr, "unlink(%s) failed, tempfile %s remains\n", filea, fileb);
951 perror ("unlink");
952 return (-1);
953 }
954 // rename to original name
955 if (rename (fileb, filea))
956 {
957 (void) fprintf (stderr, "rename(%s, %s) failed, tempfile %s remains\n", fileb, filea, fileb);
958 perror ("unlink");
959 return (-1);
960 }
961 } // if (!Nodo)
962 if (!Quiet)
963 {
964 (void) printf ("expand %s: %d -> %d, unfreed %llu blocks\n", filea, sa.st_nlink, sa.st_nlink - 1, sa.st_blocks); // sa has the old cached value
965 fflush (stdout);
966 }
967 } // if (sa.st_ino > 1)
968 return (i_ret);
969 } // fcmp1
970
971
972 // This is the (de)sparsing function.
973 // Copying sparse/desparse can reduce/increase st_blocks while st_size remains constant.
974 int
fcmp2(const void * a,const _Bool b_S)975 fcmp2 (const void *a, const _Bool b_S)
976 {
977 FILE *fb;
978 struct stat sa, sb;
979 int i, i_b, status = 0, i_pid, i_ret = 0;
980 long int li;
981 char a_cd[BUFSIZ] = { '\0' }; // for directory, temporary string
982 char a_cf[BUFSIZ] = { '\0' }; // for tempfile
983 char a_ca[BUFSIZ] = { '\0' }; // for tempfile
984 const char *filea = a_ca, *fileb = a_cf; // file names
985
986 if (i_exit_flag)
987 exit (0);
988 // Nonexistent or non-plain files are less than any other file
989 if (NULL == a)
990 return (-1);
991 strncpy (a_ca, *(const char **) a, BUFSIZ - 1);
992 // filea = *(const char **) a;
993 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
994 {
995 fprintf (stderr, "lstat(%s) failed\n", filea);
996 perror ("lstat");
997 return (-1);
998 }
999 // get tmpname: mkstemp, unlink
1000 strncpy (a_cd, filea, BUFSIZ);
1001 // extract the directory from the file name: delete from end to last "/"
1002 for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
1003 {
1004 if (a_cd[i] != '/')
1005 a_cd[i] = '\0';
1006 else
1007 break;
1008 }
1009 // now the actual directory is in a_c, create temporary file there
1010 strncpy (a_cf, a_cd, BUFSIZ);
1011 strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2); // template for mkstemp
1012 i_b = mkstemp (a_cf);
1013 if (-1 == i_b)
1014 {
1015 (void) fprintf (stderr, "Could not create temporary file.\n");
1016 perror ("mkstemp");
1017 return (-1);
1018 }
1019 // dummy open and close to close the tmpfile
1020 if (NULL == (fb = fdopen (i_b, "w"))) // open tmpfile
1021 {
1022 (void) fprintf (stderr, "fdopen (%d) failed.\n", i_b);
1023 perror ("fdopen");
1024 return (-1);
1025 }
1026 fclose (fb);
1027 if (unlink (fileb)) // delete file because we do need only the name
1028 {
1029 (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
1030 perror ("unlink");
1031 return (-1);
1032 }
1033 switch (i_pid = vfork ())
1034 {
1035 case -1: // parent with fork error
1036 perror ("vfork");
1037 fprintf (stderr, "vfork failed.\n");
1038 break;
1039 case 0: // child
1040 memset (a_cd, '\0', sizeof (a_cd));
1041 strncpy (a_cd, (b_S ? "--sparse=always" : "--sparse=never"), BUFSIZ);
1042 i = execl ("/bin/cp", "cp", a_cd, filea, fileb, NULL);
1043 if (i)
1044 {
1045 fprintf (stderr, "execlp(cp %s %s %s) failed, return value %d\n", a_cd, filea, fileb, i);
1046 perror ("fork");
1047 }
1048 _exit (i);
1049 break;
1050 default: // parent with no error: do proceed
1051 break;
1052 }
1053 i = waitpid (i_pid, &status, WUNTRACED); // wait till child terminates
1054 if (status)
1055 {
1056 fprintf (stderr, "copying failed; child with pid %d returned %d\n", i, status);
1057 i_ret = -1;
1058 goto end;
1059 }
1060 // lstat and check copy
1061 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
1062 {
1063 fprintf (stderr, "lstat(%s) failed\n", fileb);
1064 perror ("lstat");
1065 return (-1);
1066 }
1067 li = sa.st_blocks - sb.st_blocks;
1068 if (not b_S)
1069 li = -li;
1070 li_minblocks = mc_MIN (li_minblocks, li);
1071 li_maxblocks = mc_MAX (li_maxblocks, li);
1072 // if shrinked in sparse mode or expanded in desparse mode: unlink original and rename tmpfile to original name
1073 if ((b_S ? (sa.st_blocks > sb.st_blocks) : (sa.st_blocks < sb.st_blocks)))
1074 {
1075 b_S ? Files_deleted++ : i_files_expanded++;
1076 b_S ? (Blocks_reclaimed += sa.st_blocks - sb.st_blocks) : (i_blocks_declaimed += sb.st_blocks - sa.st_blocks);
1077 if (!Quiet)
1078 {
1079 (void) printf ("%s: %lld -> %lld blocks\n", filea, sa.st_blocks, sb.st_blocks);
1080 fflush (stdout);
1081 }
1082 if (not Nodo)
1083 {
1084 if (unlink (filea)) // delete original
1085 {
1086 (void) fprintf (stderr, "unlink(%s) failed\n", filea);
1087 perror ("unlink");
1088 return (-1);
1089 }
1090 // rename to original name
1091 if (rename (fileb, filea))
1092 {
1093 (void) fprintf (stderr, "rename(%s, %s) failed\n", fileb, filea);
1094 perror ("unlink");
1095 i_ret = -1;
1096 goto end;
1097 }
1098 return (0);
1099 }
1100 }
1101 end:
1102 if (unlink (fileb)) // delete tmpfile
1103 {
1104 (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
1105 perror ("unlink");
1106 return (-1);
1107 }
1108 return (i_ret);
1109 } // fcmp2
1110
1111
1112 // return true if the two files are not hard linkded
1113 _Bool
different_files(const void * a,const void * b)1114 different_files (const void *a, const void *b)
1115 {
1116 struct stat sa, sb;
1117 const char *filea, *fileb;
1118
1119 // Nonexistent or non-plain files are less than any other file
1120 if (NULL == a)
1121 return (true);
1122 filea = *(const char **) a;
1123 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
1124 {
1125 fprintf (stderr, "lstat(%s) failed\n", filea);
1126 perror ("lstat");
1127 return (true);
1128 }
1129 if (NULL == b)
1130 return (true);
1131 fileb = *(const char **) b;
1132 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
1133 {
1134 fprintf (stderr, "lstat(%s) failed\n", fileb);
1135 perror ("lstat");
1136 return (true);
1137 }
1138 if ((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) // same device and same inode?
1139 return (false); // yes: equal files
1140 return (true);
1141 } // different_files
1142