1 /* Dupmerge - Reclaim disk space by linking identical files together
2 
3 This is a utility that scans a UNIX directory tree looking for pairs of
4 distinct files with identical content. When it finds such files, it
5 deletes one file to reclaim its disk space and then recreates its path
6 name as a link to the other copy.
7 My first version of this program circa 1993 worked by computing MD5
8 hashes of every file, sorting the hashes and then looking for duplicates.
9 This worked, but it was unnecessarily slow. The comparison function I use
10 now stops comparing two files as soon as it determines their lengths are
11 different, which is a win when you have many large files with unique lengths.
12 
13  * This program reads from standard input a list of files (such
14  * as that generated by "find . -print") and discovers which files are
15  * identical. Dupmerge unlinks one file of each identical pair and
16  * recreates its path name as a link to the other.
17  *
18  * Non-plain files in the input (directories, pipes, devices, etc)
19  * are ignored.  Identical files must be on the same file system to be linked.
20  *
21  * Dupmerge prefers to keep the older of two identical files, as the older
22  * timestamp is more likely to be the correct one given that many
23  * copy utilities (e.g., 'cp') do not by default preserve modification
24  * times.
25  *
26  * Dupmerge works by quicksorting a list of path names, with the
27  * actual unlinking and relinking steps performed as side effects of
28  * the comparison function. The results of the sort are discarded.
29  *
30  * Command line arguments:
31  * -n Suppress the actual unlinking and relinking
32  * -q Operate in quiet mode (otherwise, relinks are displayed on stdout)
33  *
34  * 12 February 1998 Phil Karn, karn@ka9q.ampr.org
35  * Copyright Phil Karn. May be used under the terms of the GNU Public License.
36 
37 --------------------------------------------------------------------------------
38 
39 Added swap macro, invers mode, no linking of zero length files or files with no disk usage (sparse files),
40 void casts for semantic checkers, version number, sorting of equal size files due to dev and ino and name,
41 switched from newline terminated file names to zero terminated file names ...
42 Switched to C99.
43 Tested under SuSE 9.2 and Debian (both Kernel 2.6), tested with 4 and 7 Gigabyte files, coreutils sources,
44 md5 collisions, kernel sources, my archive with CD and DVD collections in an encrypted partition (200 GB,
45 1/2 Million files, 5 hours run time, about 30 GB reclaimed by 100,000 hard links).
46 Changed to reading/writing/comparing 8 byte blocks (instead of 1 byte blocks): 2 times faster now.
47 Changed inverse mode from O(n*ln(n)) to O(n).
48 Added sparse and desparse mode and combo flag.
49 Dr. Rolf Freitag a.k.a. Dr. No, 2004.
50 
51 For files bigger than 2 GiB, > 2^31 Byte, you should use the compiler option
52 "-D_FILE_OFFSET_BITS=64".
53 
54 Example for compilation (and striping + prelinking) on i686 ("PC" with Pentium III or higher or Athlon)
55 with shell function c (in ~/.bashrc):
56 function c {
57   gcc -Wall -I. -O3 -D_GNU_SOURCE -D__SMP__ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT \
58  -pthread -mcpu=i686 -fexpensive-optimizations -DCPU=686 -ffast-math -m486 -lm -lz -o $1 $1.c && strip $1 \
59  && prelink -fmRv ./$1 && chmod +t $1
60 }
61 
62 Example usage of this shell function (in ~/.bashrc): c dupmerge
63 
64 But a simple "gcc -D_GNU_SOURCE -O2 -o dupmerge dupmerge.c" also does the job.
65 
66 The inverse mode expands the hard links to simple files if used without the option -n.
67 With option -n hard links are only reported.
68 The reverse mode can be used e. g. for expanding backups, which have been shrinked with
69 with the default mode.
70 
71 Caution: If there is not enough space for expanding the hard links (in inverse mode) hard links
72 will be lost! It's also a bad idea to kill dupmerge with signal 9 before it finishes because
73 files can be lost or damaged or new temporary and maybe big files can remain.
74 
75 example for usage:
76 find ./ -type f -print0 | dupmerge 2>&1 | tee /tmp/user0/dupmerge.out
77 
78 Todo:
79       - progress meter with approx. percent
80       - change from comparing signed to unsigned numbers for clearer ordering
81       - inverse mode optimization: expand with the same file attributes (copy most of struct stat)
82       - option m <number> for minimum block size; (not file size because of sparse files).
83         smaller files will not be hard linked
84       - ANSI-C (C99) comppatible error handling of the comparison functions
85       - for option n (nodo): normal mode: show the number of blocks which can be saved,
86       - refactorisation: separate help function and bit fiels +macros instead of legacy variables
87       - error message +advice when wrong parameters are used
88       - option f for using a soft link as second try when a hard link could not be made and the reverse for
89         inverse mode (inverse mode: expand hard and soft links)
90       - option z for not excluding zero length files from linking/expanding/erasing
91       - more user friendly output with e. g. freed block + saved space in kB instead of kiB or blocks (see IEC 60027-2),
92         extra explanation in output when -n is used,
93       - tests with ntfs partitions for MS-Win-users
94       - correkt block counting (sparse files are yet not counted correct)
95       - man page, info page
96       - autoconf/automake + rpm and deb package
97       - An option for scheduling in parallel with 2..N pthreads e. g. for SMP boxes with ramdisk.
98       - other features like a gziped cvs file with the file attributes from lstat of the files which were
99         linked
100       - an extra option for creating the tmpfiles on a specified partition for speedup with a ramdisk
101       - an option for calling dupmerge with find to shorten the command line the user has to do
102       - check if the user has enough permissions to replace files (in not nodo mode)
103       - self-calling, "user-friendly", mode with extra option which does calls dupmerge with find via the system call
104       - test with files which can be deleted only via the inode (http://www.cyberciti.biz/tips/delete-remove-files-with-inode-number.html)
105       - An additional option for doing deletion irreversible via shred or wipe or ... instead of unlink or rm.
106         - Checking for and reporting of filesystems where there are no links possible;  e. g. no hard links with FAT.
107       - GUI
108 
109     Version 1.73, 2008-03-01
110 
111  */
112 
113 #include <stdio.h>
114 #include <stdlib.h>
115 #include <string.h>             // strndup,
116 #include <sys/types.h>
117 #include <sys/stat.h>
118 #include <unistd.h>             // Nessisary for unlink, gtopt, link, fcmp.
119 #include <iso646.h>             // for and, bitand, or, bitor ...
120 #include <stdbool.h>            // _Bool
121 #include <signal.h>             // signals
122 #include <limits.h>             // UCHAR_MAX
123 #include <sys/mman.h>           // mlockall
124 #include <sys/types.h>          // vfork
125 #include <sys/wait.h>           // vfork
126 #include <time.h>               // localtime
127 
128 
129 #define mc_MIN(a, b) ((a) < (b) ? (a) : (b))
130 #define mc_MAX(a, b) ((a) > (b) ? (a) : (b))
131 
132 
133 // Swap two items a, b of size i_size, use (static) swap variable c for performance.
134 #define mc_SWAP_ITEMS(a, b, c, i_size) { memcpy (&(c), &(a), (i_size));\
135                                             memcpy (&(a), &(b), (i_size));\
136                                             memcpy (&(b), &(c), (i_size)); }
137 
138 // easter egg
139 #define mc_ADVICE if (argc > 1 && 0 == strncmp (argv[1], "-advice", 8) ) \
140 { \
141   (void)printf ("Don't Panic!\n"); \
142   exit (42); \
143 }                               /* 42: The meaning of life, the universe, and everything ;-) */
144 
145 
146 // For registering the signal handler void sig_handler (int sig). This macro saves 32 bytes ;-)
147 #define mc_register_sig_handler \
148 {\
149     int i;\
150     for(i=0; i<=UCHAR_MAX; i++)\
151       signal (i, sig_handler);\
152 }
153 
154 
155 // union for copy and compare
156 union u_tag
157 {
158   int64_t i64;
159   char c[8];
160 };
161 
162 
163 int cmp (const void *, const void *);   // function for qsort
164 int cmp0 (const void *, const void *);  // function for qsort
165 int fcmp (const void *, const void *);  // function for qsort
166 int fcmp0 (const void *, const void *); // function for qsort
167 int fcmp1 (const void *);
168 int fcmp2 (const void *, const _Bool);
169 _Bool different_files (const void *, const void *);
170 
171 const double d_version = 1.73;  // very simple version number of %1.2f format
172 struct stat s_swap;             // swap variable
173 int Nodo = 0;                   // for nodo mode (only report)
174 int Quiet = 0;                  // for quiet mode
175 int i_soft = 0;                 // for using always soft links instead of the default hard links
176 _Bool b_inv = false;            // invers mode indicator
177 _Bool b_sparse = false;         // flag for sparseing
178 _Bool b_comb = false;           // combo flag for normal + sparse mode
179 int i_minlinks = 1;             // minimum of found hard links
180 int i_maxlinks = 1;             // maximum of found hard links
181 long int li_minblocks = LONG_MAX;       // minimum of saved blocks by sparse copying
182 long int li_maxblocks = 0;      // maximum of saved blocks by sparse copying
183 int Files_deleted = 0;
184 int Blocks_reclaimed = 0;
185 int i_files_expanded = 0;
186 int i_blocks_declaimed = 0;
187 volatile int i_exit_flag = 0;   // flag for exiting soon (not immediately) and controlled after a signal to terminate
188 char **a_names = NULL;          // array of file names
189 char *a_flags = NULL;           // Array with the flags of the files in a_names. 1 = marked for deletion because another file the the same content was found,
190 int nfiles = 0;                 // number of files
191 
192 
193 void
sig_handler(const int sig)194 sig_handler (const int sig)     // signal handler
195 {
196   if ((SIGINT == sig) or (SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig) or (SIGTERM == sig))
197   {
198     i_exit_flag = 1;            // set the flag for a graceful exit
199 //    if ((SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig))
200 //    {
201 //      (void) printf ("\a\a\a\a\a\a\a Signal %d, program exiting... \r\n\a\a\a", sig);
202 //      exit (0);
203 //    }
204   }
205   return;
206 }
207 
208 
209 // initialisation for main
210 inline void
main_init1(void)211 main_init1 (void)
212 {
213   Files_deleted = 0;
214   Blocks_reclaimed = 0;
215   i_files_expanded = 0;
216   i_blocks_declaimed = 0;
217   i_minlinks = 1;
218   i_maxlinks = 1;
219   li_minblocks = LONG_MAX;
220   li_maxblocks = 0;
221 }
222 
223 
224 // delete the files (of name a_name[0 ... i_files] which are marked for deletion
225 void
delete_marked(const int i_files)226 delete_marked (const int i_files)
227 {
228   struct stat sa;
229   int i;
230 
231   for (i = 0; i < i_files; i++)
232   {
233     if (i_exit_flag)
234       exit (0);
235     if (a_flags[i] bitand 1)    // delete
236     {
237       if (-1 == lstat (a_names[i], &sa))        // or ! S_ISREG (sa.st_mode))
238       {
239         fprintf (stderr, "lstat(%s) failed\n", a_names[i]);
240         perror ("lstat");
241         return;
242       }
243       if (0 == Nodo)
244       {
245         printf ("deleting %s, freeing %lld blocks\n", a_names[i], ((1 == sa.st_nlink) ? sa.st_blocks : 0));
246         unlink (a_names[i]);
247       }
248       Files_deleted++;
249       if (1 == sa.st_nlink)     // file a is no (hard) link
250       {
251         Blocks_reclaimed += sa.st_blocks;
252       }
253     }
254   }
255   return;
256 }
257 
258 
259 int
main(const int argc,const char * const argv[])260 main (const int argc, const char *const argv[])
261 {
262   char **names, buf[BUFSIZ];    //, *cp;       // BUFSIZE is the default buffer size in stdio.h, usually 8192
263   int i, j, i1, i_mode = 0, i_ferror = 0, i_ret = 0;    // integers, mode: 0: common, 1: deletion, i_ferror: for ferror
264   FILE *tmp;                    // file pointer for tmpfile
265   const char c_zero = '\0';     // for terminating strings
266   time_t now;                   // for actual time
267   struct tm *ptr_tm = (time (&now), localtime (&now));  // set actual time, convert to localtime
268 
269   mc_ADVICE;
270   // read parameters for quiet mode and suppressing the actual unlinking and relinking. Rolf Freitag
271   while ((i = getopt (argc, (char *const *) argv, "cdhinqsSV")) != EOF)
272   {
273     switch (i)
274     {
275       case 'c':
276         b_comb = 1;
277         break;
278       case 'd':
279         i_mode = 1;             // deletion mode
280         break;
281       case 'n':
282         Nodo = 1;
283         break;
284       case 'q':
285         Quiet = 1;
286         break;
287       case 'h':
288         fprintf (stdout, "This program can reclaim disk space by linking identical files together.\n");
289         fprintf (stdout, "It can also expand all hard links and reads the list of files from standard input.\n");
290         fprintf (stdout, "Example usage: find ./ -type f -print0 | dupmerge 2>&1 | tee ../dupmerge_log.txt\n");
291         fprintf (stdout, "Options:\n-h \t Show this help and exit.\n-V \t Show version number and exit.\n");
292         fprintf (stdout, "-d \t delete multiple files and hard links. Default: Preserve the alphabetically first file name.\n");
293         //fprintf (stdout, "   \t Switch for ascessing/decessing order in deletion mode.\n");
294         fprintf (stdout, "-q \t Quiet mode.\n-n \t Nodo mode / read-only mode.\n");
295         fprintf (stdout, "-i \t Inverse switch: Expand all hard links in normal mode, replace files by their desparsed version if it is bigger.\n");
296         fprintf (stdout, "-s \t Flag for soft linking (default is hard linking). This option is beta because for linking of all ");
297         fprintf (stdout, "equal files more than one run of dupmerge is necessary and the inverse (expanding of soft links) is untested.\n");
298         fprintf (stdout, "-S \t Flag for Sparse mode: Replace files by their sparse version if it is smaller.\n");
299         fprintf (stdout, "-c \t Combo mode: Default mode +sparse mode. With -i it means inverse mode with unlinking and desparsing.\n");
300         fflush (stdout);
301         exit (0);
302         break;
303       case 's':
304         i_soft = 1;
305         break;
306       case 'V':
307         fprintf (stdout, "dupmerge version %1.2f\n", d_version);
308         fflush (stdout);
309         exit (0);
310         break;
311       case 'i':
312         b_inv = true;
313         break;
314       case 'S':
315         b_sparse = true;
316         break;
317       default:
318         break;
319     }
320   }
321   if (not Quiet)
322   {
323     (void) printf ("%s started at %d-%s%d-%s%d %s%d:%s%d:%s%d\n", argv[0],
324                    ptr_tm->tm_year + 1900, ptr_tm->tm_mon + 1 > 9 ? "" : "0", ptr_tm->tm_mon + 1,
325                    ptr_tm->tm_mday > 9 ? "" : "0", ptr_tm->tm_mday,
326                    ptr_tm->tm_hour > 9 ? "" : "0", ptr_tm->tm_hour,
327                    ptr_tm->tm_min > 9 ? "" : "0", ptr_tm->tm_min, ptr_tm->tm_sec > 9 ? "" : "0", ptr_tm->tm_sec);
328     fflush (stdout);
329   }
330 #ifndef __CYGWIN__
331   mlockall (MCL_CURRENT bitor MCL_FUTURE);      // be cached
332 #endif
333   // Read list of file names into tmp file and count
334   tmp = tmpfile ();
335   if (NULL == tmp)
336   {
337     (void) fprintf (stderr, "could not open temporary file, exiting\n");
338     exit (-1);
339   }
340   nfiles = 0;
341   if (not Quiet)
342   {
343     (void) printf ("tmpfile (pointer %p) created, processing ...\n", tmp);
344     fflush (stdout);
345   }
346   while (!feof (stdin))         // while not end of input
347   {
348     buf[0] = '\0';              // set strnlen to 0 so that strnlen (buf, BUFSIZ) is 0 at EOF
349     for (i = 0; i < BUFSIZ; i++)        // read till the sequence operator \0 is found
350     {
351       fread (&buf[i], 1, 1, stdin);
352       i_ferror = ferror (stdin);
353       if (i_ferror)
354       {
355         (void) fprintf (stderr, "stdin: error %d.\n", i_ferror);
356         i_ret = -1;
357       }
358       if ('\0' == buf[i])       // termination found
359         break;
360     }
361     if ('\0' not_eq buf[sizeof (buf) - 1])      // no termination at buffer end
362       buf[sizeof (buf) - 1] = '\0';
363     if (strnlen (buf, BUFSIZ))
364       nfiles++;                 // new file: increment file counter
365     if ((EOF == fputs (buf, tmp)) or (not fwrite (&c_zero, 1, 1, tmp))) // store new file name in tmp file
366     {
367       (void) fprintf (stderr, "could not write to temporary file, exiting\n");
368       exit (-1);
369     }
370   }
371   // Now that we know how many there are, printf, allocate space and re-read.
372   if (not Quiet)
373   {
374     (void) printf ("Input: %d files, processing ...\n", nfiles);
375     fflush (stdout);
376   }
377   if (0 == nfiles)              // no input, nothing to do
378     exit (0);
379   rewind (tmp);
380   names = (char **) calloc (nfiles, sizeof (char *));
381   a_names = (char **) calloc (nfiles, sizeof (char *));
382   a_flags = (char *) calloc (nfiles, sizeof (char));
383   if ((NULL == names) or (NULL == a_flags) or (NULL == a_names))
384   {
385     (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
386     exit (-1);
387   }
388   for (i = 0; i < nfiles; i++)  // get the file names from the tmp file
389   {
390     for (j = 0; j < BUFSIZ; j++)        // read till the sequence operator \0 is found
391     {
392       fread (&buf[j], 1, 1, tmp);
393       i_ferror = ferror (stdin);
394       if (i_ferror)
395       {
396         (void) fprintf (stderr, "tmpfile: error %d.\n", i_ferror);
397         i_ret = -1;
398       }
399       if ('\0' == buf[j])       // termination found
400         break;
401     }
402     if ((NULL == (names[i] = strndup (buf, BUFSIZ))) or (NULL == (a_names[i] = strndup (buf, BUFSIZ))))
403     {                           // The command "gcc -o dupmerge dupmerge.c" prduces the Warning "dupmerge.c:191: warning: assignment
404       // makes pointer from integer without a cast" but this is a warning bug; in the if statement above there is no int!
405       // To get rid of that warning you can cast strndup to (char *) or equivalent (typeof(names[i])) or use -O3. [gcc version 3.3.4]
406       // For later versions of gcc you should use the command line option -D_GNU_SOURCE, but that's not necessary under cygwin.
407       (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
408       exit (1);
409     }
410   }
411   (void) fclose (tmp);
412   mc_register_sig_handler;      // register signal handler before critical sections
413   qsort (a_names, nfiles, sizeof (char *), cmp0);       // sort only the (pointers to the) file names in a_names
414   switch (i_mode)
415   {
416     case 0:                    // no deletion
417       switch (b_comb and b_inv) // For desparse mode after nomal inverted mode. Otherwise a second run would be nessisary.
418       {
419         case 0:                // not combo mode and inverse switch
420           if (b_sparse)         // sparse/desparse files first because sparsing/desparsing expands hard links (changes inode)
421           {
422             // no sorted list of files at this point
423             i1 = Quiet;
424             if (b_comb and Nodo)        // combo mode and read-only (Nodo): do nodo sparsing/desparsing during sorting with output here
425               Quiet = 0;
426             else
427               Quiet = 1;
428             i = Nodo;
429             Nodo = 1;           // for only sorting the names
430             qsort (names, nfiles, sizeof (char *), fcmp);       // sort only the (pointers to the) file names
431             Nodo = i;           // restore Nodo
432             Quiet = i1;         // restore Quiet
433           foo:
434             main_init1 ();      // start with new init
435             // sparse or desparse
436             for (i = 0; i < nfiles; i++)
437             {
438               // only different files, call fcmp2/3 only for the last file in a block of equal hard links
439               if ((nfiles - 1 == i) or different_files (names + i, names + i + 1))
440                 (void) fcmp2 (names + i, not b_inv);
441             }
442             if (not Quiet)
443             {
444               (void)
445                 printf
446                 ("Files%s %ssparsed: %d of %d, Disk blocks%s %sclaimed: %d\n",
447                  (Nodo ? " which can be" : ""), (b_inv ? "de" : ""),
448                  (b_inv ? i_files_expanded : Files_deleted), nfiles,
449                  (Nodo ? " can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
450               (void)
451                 printf
452                 ("Minimum of disk blocks which could be %s by %s copying: %ld, Maximum: %ld.\n",
453                  (b_inv ? "declaimed" : "reclaimed"), (b_inv ? "desparse" : "sparse"), li_minblocks, li_maxblocks);
454               fflush (stdout);
455             }
456             if ((b_comb and Nodo) or (b_inv))   // all work done
457               exit (0);
458           }                     // if (b_sparse), no break
459         case 1:
460           main_init1 ();        // start with new init
461           if ((b_sparse and b_comb) or (not b_sparse))  // sparse mode +combo flag or no sparse mode: normal mode, maybe with inversion
462           {                     // Without the combo flag and with sparse mode there is no normal (maybe inverted) mode!
463             if (not b_inv)
464             {
465               qsort (names, nfiles, sizeof (char *), fcmp);
466               if (not Quiet)
467               {
468                 (void) printf ("Scanning for more dups ...\n");
469                 fflush (stdout);
470               }
471               for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
472                 (void) fcmp (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
473             }
474             else
475             {
476               if (not Quiet)
477               {
478                 (void) printf ("Scanning for hard links ...\n");
479                 fflush (stdout);
480               }
481               for (i = 0; i < nfiles; i++)
482                 (void) fcmp1 (names + i);
483             }
484             if (not Quiet)
485             {
486               (void)
487                 printf
488                 ("Files%s %s: %d of %d, Disk blocks%s %sclaimed: %d\n",
489                  (Nodo ? " which can be" : ""),
490                  (b_inv ? "expanded" : "linked"),
491                  (b_inv ? i_files_expanded : Files_deleted), nfiles,
492                  (Nodo ? " which can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
493               (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
494               fflush (stdout);
495             }
496           }                     // if ((b_sparse and b_comb) or (not b_sparse))
497           if (b_comb and b_inv) // do desparsing in inverse mode
498             goto foo;
499           break;
500         default:
501           (void) printf ("Unexpected switch error ...\n");
502           exit (-1);
503           break;
504       }                         // switch
505       break;
506     case 1:                    // deletion
507       // no sorted list of files at this point
508       qsort (names, nfiles, sizeof (char *), fcmp0);    // sort only the file names
509       // second run
510       for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
511         (void) fcmp0 (names + i, names + i + 1);        // e. g. with gcc qsort at coreutil sources.
512       delete_marked (nfiles);
513       if (not Quiet)
514       {
515         (void)
516           printf
517           ("Duplicate files (%s deleted): %d of %d, Disk blocks%s reclaimed: %d\n",
518            (Nodo ? "which can be" : "which have been"), (b_inv ? i_files_expanded : Files_deleted), nfiles, (Nodo ? " which can be" : ""), Blocks_reclaimed);
519         (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
520         fflush (stdout);
521       }
522       break;
523     default:
524       (void) printf ("Unexpected switch error ...\n");
525       exit (-1);
526       break;
527   }                             // switch
528   exit (i_ret);
529 }                               // main
530 
531 
532 // For alphabetic ordering/searching.
533 int
cmp(const void * p0,const void * p1)534 cmp (const void *p0, const void *p1)
535 {
536   int i = 0;
537   char **p2 = (char **) p0;
538   char **p3 = (char **) p1;
539   i = strncmp (*p2, *p3, BUFSIZ);       // compare the two strings
540   return (i);
541 }
542 
543 
544 // For alphabetic ordering/searching. With global inversion flag for inverse mode.
545 int
cmp0(const void * p0,const void * p1)546 cmp0 (const void *p0, const void *p1)
547 {
548   int i = 0;
549   char **p2 = (char **) p0;
550   char **p3 = (char **) p1;
551   i = strncmp (*p2, *p3, BUFSIZ);       // compare the two strings
552   if (b_inv)
553     return (-i);
554   return (i);
555 }
556 
557 
558 // This is the comparison function called by qsort, where the real work
559 // is done as a side effect. Due to ANSI-C the current version is not
560 // completely correct because if the same objects are passed more than once to the
561 // comparison function the results must be consistent with another; see 7.20.5  in C99.
562 // If errors like lstat failed do occur, this is not fullfilled yet. With fcmp1 it's the same.
563 // sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
564 int
fcmp(const void * a,const void * b)565 fcmp (const void *a, const void *b)
566 {
567   struct stat sa, sb;
568   FILE *fa, *fb;
569   int i1, i2, i_nlinka, i_nlinkb, i_fa, i_fb;
570   int rval = 0;
571   const char *filea, *fileb;
572   union u_tag u_buffer1 = {
573     0
574   }, u_buffer2 =
575   {
576   0};
577   void *buffer1 = (void *) &u_buffer1;
578   void *buffer2 = (void *) &u_buffer2;
579   _Bool b_swap = false;
580 
581   if (i_exit_flag)
582     exit (0);
583   // Nonexistent or non-plain files are less than any other file
584   if (NULL == a)
585     return (-1);
586   filea = *(const char **) a;
587   if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
588   {
589     fprintf (stderr, "lstat(%s) failed\n", filea);
590     perror ("lstat");
591     return (-1);
592   }
593   if (NULL == b)
594     return 1;
595   fileb = *(const char **) b;
596   if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
597   {
598     fprintf (stderr, "lstat(%s) failed\n", fileb);
599     perror ("lstat");
600     return (1);
601   }
602   i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
603   i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
604   // Smaller files are "less"
605   if (sa.st_size < sb.st_size)
606     return (-1);
607   if (sa.st_size > sb.st_size)
608     return (1);
609   // We now know both files exist, are plain files, are the same size
610   // if both files are hard linked or zero length: sort by name
611   if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) or (0 == sa.st_size))
612     return (strncmp (filea, fileb, BUFSIZ));
613   // We now know both files exist, are plain files, are the same size > 0,
614   // and are not already linked, so compare them alphabetical, if they are on the same device
615   if (sa.st_dev != sb.st_dev)
616     return ((sa.st_dev < sb.st_dev) ? -1 : 1);
617   if (NULL == (fa = fopen (filea, "r")))
618     return (-1);                // Unreadable files are "less than"
619   if (NULL == (fb = fopen (fileb, "r")))
620   {
621     fclose (fa);
622     return (1);
623   }
624   // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
625   // On big endian machines it's alphabetic ordering, on little andian machines it's
626   // alphabetic ordering with 64 bit 'characters'.
627   // Du to C99, 7.19.8.1 read must be done with chars.
628   while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
629   {
630     // Mask the unsused 1..7 bytes, because the standard says nothing about them.
631     memset (&(u_buffer1.c[i1]), 0, 8 - i1);     // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
632     memset (&(u_buffer2.c[i2]), 0, 8 - i2);
633     // check for file errors
634     i_fa = ferror (fa);
635     i_fb = ferror (fb);
636     if (i_fa or i_fb)           // check for file errors
637     {
638       if (i_fa)
639       {
640         (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
641         rval = -1;              // error file: smaller
642         break;
643       }
644       if (i_fb)
645       {
646         (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
647         rval = 1;
648         break;
649       }
650     }
651     if (u_buffer1.i64 != u_buffer2.i64) // compare
652     {
653       rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
654       break;
655     }
656   }
657   (void) fclose (fa);
658   (void) fclose (fb);
659   if (rval)                     // unequal files of same size
660     return rval;
661 
662   if ((sa.st_blocks) or (sb.st_blocks)) // avoid linking sparse files with no blocks allocated
663   {
664     // The two files have same content, size > 0, blocks allocated > 0 and are on the same device, so link them.
665     // We prefer to keep the one with less blocks, or if they're the
666     // same the older one, or if they're the same, the one with more (hard) links.
667     if ((sb.st_blocks < sa.st_blocks) or (sb.st_mtime > sa.st_mtime) or (sb.st_nlink < sa.st_nlink))
668     {
669       mc_SWAP_ITEMS (sa, sb, s_swap, sizeof (struct stat));     // swap items to keep original sb
670       mc_SWAP_ITEMS (filea, fileb, s_swap, sizeof (char *));    // swap file name pointers
671       b_swap = true;
672     }
673     // before linking: store number of hard links of each file
674     i_nlinka = sa.st_nlink;
675     i_nlinkb = sb.st_nlink;
676     if (1 == sa.st_nlink)       // file a is no (hard) link
677     {
678       Files_deleted++;
679       Blocks_reclaimed += sa.st_blocks;
680     }
681     if (!Nodo and (unlink (filea)))
682     {
683       (void) fprintf (stderr, "unlink(%s) failed\n", filea);
684       perror ("unlink");
685       return (-1);
686     }
687     if (!i_soft)
688     {
689       if (!Nodo and (-1 == link (fileb, filea)))
690       {
691         (void) fprintf (stderr, "link(%s,%s) failed\n", fileb, filea);
692         perror ("link");
693         return (-1);
694       }
695     }
696     else
697     {
698       if (!Nodo and (-1 == symlink (fileb, filea)))
699       {
700         (void) fprintf (stderr, "symlink(%s,%s) failed\n", filea, fileb);
701         perror ("symlink");
702         return (-1);
703       }
704     }
705     if (!Quiet)
706     {
707       (void) printf ("ln %s %s %s: %d, %d -> %d, freed +%llu blocks\n", i_soft ? "-s" : "", fileb, filea, i_nlinkb, i_nlinka, i_nlinka + i_nlinkb,
708                      sb.st_blocks);
709       fflush (stdout);
710     }
711   }
712   return ((b_swap) ? strncmp (fileb, filea, BUFSIZ) : strncmp (filea, fileb, BUFSIZ));  //
713 }                               // fcmp
714 
715 
716 // return the index of file with name a_name
717 int
nindex(const char * a_name)718 nindex (const char *a_name)
719 {
720   void *p_v = bsearch (&a_name, a_names, nfiles, sizeof (char *), cmp0);
721   int i_ret = (int) (((((void *) p_v) - ((void *) a_names))) / (sizeof (char *)));
722   if (NULL == p_v)
723   {
724     printf ("Error: bsearch(%s, %p, %d, %d, %p) returned NULL.\n", a_name, a_names, nfiles, sizeof (char *), cmp);
725     return (-1);                // error
726   }
727   return (i_ret);
728 }
729 
730 
731 // This is the comparison function called by qsort, for marking all duplicate files with that flag
732 // Due to ANSI-C the current version is not
733 // completely correct because if the same objects are passed more than one to the
734 // comparison function the results must be consistent with another, 7.20.5 C99.
735 // If errors like lstat failed do occur, this is not fullfilled yet.
736 // sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
737 int
fcmp0(const void * a,const void * b)738 fcmp0 (const void *a, const void *b)
739 {
740   struct stat sa, sb;
741   FILE *fa, *fb;
742   int i1, i2;
743   int rval = 0, ja = 0, jb = 0, i_fa, i_fb;
744   const char *filea, *fileb;
745   union u_tag u_buffer1 = {
746     0
747   }, u_buffer2 =
748   {
749   0};
750   void *buffer1 = (void *) &u_buffer1;
751   void *buffer2 = (void *) &u_buffer2;
752 
753   if (i_exit_flag)
754     exit (0);
755   // Nonexistent or non-plain files are less than any other file
756   if (NULL == a)
757     return (-1);
758   filea = *(const char **) a;
759   if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
760   {
761     fprintf (stderr, "lstat(%s) failed\n", filea);
762     perror ("lstat");
763     return (-1);
764   }
765   if (NULL == b)
766     return 1;
767   fileb = *(const char **) b;
768   if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
769   {
770     fprintf (stderr, "lstat(%s) failed\n", fileb);
771     perror ("lstat");
772     return (1);
773   }
774   i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
775   i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
776   // Smaller files are "less"
777   if (sa.st_size < sb.st_size)
778     return (-1);
779   if (sa.st_size > sb.st_size)
780     return (1);
781   // We now know both files exist, are plain files, are the same size
782   // if both files are hard linked: sort by name
783   if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)))
784   {
785   out_equal:
786     // set ja, jb to the index of a, b
787     ja = nindex (filea);
788     jb = nindex (fileb);
789     if ((ja >= 0) and (a_flags[ja] bitand 1))
790       return (-1);              // marked file first
791     if ((jb >= 0) and (a_flags[jb] bitand 1))
792       return (1);               // marked file first
793     // now both files are not marked
794     if ((ja >= 0) and (jb >= 0))
795     {
796       if ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ) <= 0)       // alphabetic ordering
797       {
798         a_flags[ja] = 1;
799         if (!Quiet)
800         {
801           (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", filea, fileb);
802           fflush (stdout);
803         }
804         return (-1);            // marked file first
805       }
806       else
807       {
808         a_flags[jb] = 1;
809         if (!Quiet)
810         {
811           (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", fileb, filea);
812           fflush (stdout);
813         }
814         return (1);             // marked file first
815       }
816     }
817     return ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ));
818   }
819   // We now know both files exist, are plain files, are the same size > 0,
820   // and are not already linked, so compare them alphabetical
821   if (NULL == (fa = fopen (filea, "r")))
822     return (-1);                // Unreadable files are "less than"
823   if (NULL == (fb = fopen (fileb, "r")))
824   {
825     fclose (fa);
826     return 1;
827   }
828   // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
829   // On big endian machines it's alphabetic ordering, on little andian machines it's
830   // alphabetic ordering with 64 bit 'characters'.
831   while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
832   {
833     // Mask the unsused 1..7 bytes, because the standard says nothing about them.
834     memset (&(u_buffer1.c[i1]), 0, 8 - i1);     // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
835     memset (&(u_buffer2.c[i2]), 0, 8 - i2);
836     // check for file errors
837     i_fa = ferror (fa);
838     i_fb = ferror (fb);
839     if (i_fa or i_fb)           // check for file errors
840     {
841       if (i_fa)
842       {
843         (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
844         rval = -1;              // error file: smaller
845         break;
846       }
847       if (i_fb)
848       {
849         (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
850         rval = 1;
851         break;
852       }
853     }
854     if (u_buffer1.i64 != u_buffer2.i64) // compare
855     {
856       rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
857       break;
858     }
859   }
860   (void) fclose (fa);
861   (void) fclose (fb);
862   if (rval)                     // unequal files of same size
863     return rval;
864   // else
865   goto out_equal;
866 }                               // fcmp0
867 
868 
869 // This is the hard link expansion function for the inverse mode. Needs only one file name as argument.
870 int
fcmp1(const void * a)871 fcmp1 (const void *a)
872 {
873   struct stat sa;
874   FILE *fa, *fb;
875   int i1, i, i_b, i_fa, i_fb, i_ret = 0;
876   char a_cd[BUFSIZ] = { '\0' }; // for directory
877   char a_cf[BUFSIZ] = { '\0' }; // for tempfile
878   const char *filea, *fileb = a_cf;     // file names, fileb after mkstemp
879   union u_tag u_buffer1 = {
880     0
881   };
882   void *buffer1 = (void *) &u_buffer1;
883 
884   if (i_exit_flag)
885     exit (0);
886   // Nonexistent or non-plain files are less than any other file
887   if (NULL == a)
888     return (-1);
889   filea = *(const char **) a;
890   if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
891   {
892     fprintf (stderr, "lstat(%s) failed\n", filea);
893     perror ("lstat");
894     return (-1);
895   }
896   i_minlinks = mc_MIN (i_minlinks, sa.st_nlink);
897   i_maxlinks = mc_MAX (i_maxlinks, sa.st_nlink);
898   // check for hard links
899   if (sa.st_nlink > 1)          // hard link found
900   {
901     i_files_expanded++;
902     i_blocks_declaimed += sa.st_blocks;
903     if (!Nodo)                  // expand the hard link
904     {
905       strncpy (a_cd, filea, BUFSIZ);
906       // extract the directory from the file name: delete from end to last "/"
907       for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
908       {
909         if (a_cd[i] != '/')
910           a_cd[i] = '\0';
911         else
912           break;
913       }
914       // now the actual directory is in a_c, create temporary file there
915       strncpy (a_cf, a_cd, BUFSIZ);
916       strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2);       // template for mkstemp
917       i_b = mkstemp (a_cf);
918       if (-1 == i_b)
919       {
920         (void) fprintf (stderr, "Could not create temporary file.\n");
921         perror ("file_create");
922         return (-1);
923       }
924       if ((NULL == (fa = fopen (filea, "r"))) or (NULL == (fb = fdopen (i_b, "w"))))    // open file and tmpfile
925       {
926         (void) fprintf (stderr, "Expanding the hard link(%s) failed.\n", filea);
927         perror ("expand");
928         return (-1);
929       }
930       // expand to tmpfile
931       while ((i1 = fread (buffer1, 1, 8, fa)) != 0)     // copy fa to fb in 8 byte blocks
932       {
933         (void) fwrite (buffer1, 1, i1, fb);
934       }
935       i_fa = ferror (fa);
936       i_fb = ferror (fb);
937       if (i_fa or i_fb)         // check for file errors
938       {
939         if (i_fa)
940           (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
941         if (i_fb)
942           (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
943         i_ret = -1;
944       }
945       (void) fclose (fa);
946       (void) fclose (fb);
947       // unlink
948       if (unlink (filea))
949       {
950         (void) fprintf (stderr, "unlink(%s) failed, tempfile %s remains\n", filea, fileb);
951         perror ("unlink");
952         return (-1);
953       }
954       // rename to original name
955       if (rename (fileb, filea))
956       {
957         (void) fprintf (stderr, "rename(%s, %s) failed, tempfile %s remains\n", fileb, filea, fileb);
958         perror ("unlink");
959         return (-1);
960       }
961     }                           // if (!Nodo)
962     if (!Quiet)
963     {
964       (void) printf ("expand %s: %d -> %d, unfreed %llu blocks\n", filea, sa.st_nlink, sa.st_nlink - 1, sa.st_blocks);  // sa has the old cached value
965       fflush (stdout);
966     }
967   }                             // if (sa.st_ino > 1)
968   return (i_ret);
969 }                               // fcmp1
970 
971 
972 // This is the (de)sparsing function.
973 // Copying sparse/desparse can reduce/increase st_blocks while st_size remains constant.
974 int
fcmp2(const void * a,const _Bool b_S)975 fcmp2 (const void *a, const _Bool b_S)
976 {
977   FILE *fb;
978   struct stat sa, sb;
979   int i, i_b, status = 0, i_pid, i_ret = 0;
980   long int li;
981   char a_cd[BUFSIZ] = { '\0' }; // for directory, temporary string
982   char a_cf[BUFSIZ] = { '\0' }; // for tempfile
983   char a_ca[BUFSIZ] = { '\0' }; // for tempfile
984   const char *filea = a_ca, *fileb = a_cf;      // file names
985 
986   if (i_exit_flag)
987     exit (0);
988   // Nonexistent or non-plain files are less than any other file
989   if (NULL == a)
990     return (-1);
991   strncpy (a_ca, *(const char **) a, BUFSIZ - 1);
992   // filea = *(const char **) a;
993   if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
994   {
995     fprintf (stderr, "lstat(%s) failed\n", filea);
996     perror ("lstat");
997     return (-1);
998   }
999   // get tmpname: mkstemp, unlink
1000   strncpy (a_cd, filea, BUFSIZ);
1001   // extract the directory from the file name: delete from end to last "/"
1002   for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
1003   {
1004     if (a_cd[i] != '/')
1005       a_cd[i] = '\0';
1006     else
1007       break;
1008   }
1009   // now the actual directory is in a_c, create temporary file there
1010   strncpy (a_cf, a_cd, BUFSIZ);
1011   strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2);   // template for mkstemp
1012   i_b = mkstemp (a_cf);
1013   if (-1 == i_b)
1014   {
1015     (void) fprintf (stderr, "Could not create temporary file.\n");
1016     perror ("mkstemp");
1017     return (-1);
1018   }
1019   // dummy open and close to close the tmpfile
1020   if (NULL == (fb = fdopen (i_b, "w"))) // open tmpfile
1021   {
1022     (void) fprintf (stderr, "fdopen (%d) failed.\n", i_b);
1023     perror ("fdopen");
1024     return (-1);
1025   }
1026   fclose (fb);
1027   if (unlink (fileb))           // delete file because we do need only the name
1028   {
1029     (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
1030     perror ("unlink");
1031     return (-1);
1032   }
1033   switch (i_pid = vfork ())
1034   {
1035     case -1:                   // parent with fork error
1036       perror ("vfork");
1037       fprintf (stderr, "vfork failed.\n");
1038       break;
1039     case 0:                    // child
1040       memset (a_cd, '\0', sizeof (a_cd));
1041       strncpy (a_cd, (b_S ? "--sparse=always" : "--sparse=never"), BUFSIZ);
1042       i = execl ("/bin/cp", "cp", a_cd, filea, fileb, NULL);
1043       if (i)
1044       {
1045         fprintf (stderr, "execlp(cp %s %s %s) failed, return value %d\n", a_cd, filea, fileb, i);
1046         perror ("fork");
1047       }
1048       _exit (i);
1049       break;
1050     default:                   // parent with no error: do proceed
1051       break;
1052   }
1053   i = waitpid (i_pid, &status, WUNTRACED);      // wait till child terminates
1054   if (status)
1055   {
1056     fprintf (stderr, "copying failed; child with pid %d returned %d\n", i, status);
1057     i_ret = -1;
1058     goto end;
1059   }
1060   // lstat and check copy
1061   if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
1062   {
1063     fprintf (stderr, "lstat(%s) failed\n", fileb);
1064     perror ("lstat");
1065     return (-1);
1066   }
1067   li = sa.st_blocks - sb.st_blocks;
1068   if (not b_S)
1069     li = -li;
1070   li_minblocks = mc_MIN (li_minblocks, li);
1071   li_maxblocks = mc_MAX (li_maxblocks, li);
1072   // if shrinked in sparse mode or expanded in desparse mode: unlink original and rename tmpfile to original name
1073   if ((b_S ? (sa.st_blocks > sb.st_blocks) : (sa.st_blocks < sb.st_blocks)))
1074   {
1075     b_S ? Files_deleted++ : i_files_expanded++;
1076     b_S ? (Blocks_reclaimed += sa.st_blocks - sb.st_blocks) : (i_blocks_declaimed += sb.st_blocks - sa.st_blocks);
1077     if (!Quiet)
1078     {
1079       (void) printf ("%s: %lld -> %lld blocks\n", filea, sa.st_blocks, sb.st_blocks);
1080       fflush (stdout);
1081     }
1082     if (not Nodo)
1083     {
1084       if (unlink (filea))       // delete original
1085       {
1086         (void) fprintf (stderr, "unlink(%s) failed\n", filea);
1087         perror ("unlink");
1088         return (-1);
1089       }
1090       // rename to original name
1091       if (rename (fileb, filea))
1092       {
1093         (void) fprintf (stderr, "rename(%s, %s) failed\n", fileb, filea);
1094         perror ("unlink");
1095         i_ret = -1;
1096         goto end;
1097       }
1098       return (0);
1099     }
1100   }
1101 end:
1102   if (unlink (fileb))           // delete tmpfile
1103   {
1104     (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
1105     perror ("unlink");
1106     return (-1);
1107   }
1108   return (i_ret);
1109 }                               // fcmp2
1110 
1111 
1112 // return true if the two files are not hard linkded
1113 _Bool
different_files(const void * a,const void * b)1114 different_files (const void *a, const void *b)
1115 {
1116   struct stat sa, sb;
1117   const char *filea, *fileb;
1118 
1119   // Nonexistent or non-plain files are less than any other file
1120   if (NULL == a)
1121     return (true);
1122   filea = *(const char **) a;
1123   if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
1124   {
1125     fprintf (stderr, "lstat(%s) failed\n", filea);
1126     perror ("lstat");
1127     return (true);
1128   }
1129   if (NULL == b)
1130     return (true);
1131   fileb = *(const char **) b;
1132   if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
1133   {
1134     fprintf (stderr, "lstat(%s) failed\n", fileb);
1135     perror ("lstat");
1136     return (true);
1137   }
1138   if ((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino))    // same device and same inode?
1139     return (false);             // yes: equal files
1140   return (true);
1141 }                               // different_files
1142