1#!/bin/bash
2
3# stupid_dupes: find duplicates like jdupes but more slowly with a shell script
4# Copyright (C) 2020 by Jody Bruchon <jody@jodybruchon.com>
5#
6# The MIT License (MIT)
7#
8# Permission is hereby granted, free of charge, to any person obtaining a copy
9# of this software and associated documentation files (the "Software"), to deal
10# in the Software without restriction, including without limitation the rights
11# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12# copies of the Software, and to permit persons to whom the Software is
13# furnished to do so, subject to the following conditions:
14#
15# The above copyright notice and this permission notice shall be included in all
16# copies or substantial portions of the Software.
17#
18# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24# SOFTWARE.
25
26#########################################
27#             HOW IT WORKS              #
28#########################################
29#
30# This script loads each file into an array of file paths, then compares every
31# file against every other file, using various tricks to discard candidates as
32# quickly as possible without reading and comparing entire files. These include
33# skipping pairs with mismatched file sizes and hashing only the first 4K block
34# of each file and comparing the partial hashes.
35#
36# Every file is referred to by its index number. Since GNU Bash can use arrays
37# but doesn't have anything remotely like a C structure to conveniently pack
38# a bunch of related variables together within, C structures are simulated with
39# the array index number used as a "pointer." For example, a doubly-linked list
40# in C is pretty easy to declare:
41#
42# struct match { struct match *left; struct match *right; }
43#
44# And then an array of these matches: struct match matchlist[MAX_MATCHES];
45#
46# Using arrays, we simulate this (e.g. with file index 15, match 2):
47#
48# MLEFT[2]  = index number of "left" file in match chain
49# MRIGHT[2] = index number of "right" file in match chain
50#
51# FILES[15] = file path for file #15, referenced by one of the above items
52# SIZES[15] = file size for file #15
53# PHASH[15] = the 4K partial file hash for file #15
54# FHASH[15] = the full file hash for file #15
55#
56# The basic algorithm is: verify size match, verify partial hash match, verify
57# full hash match, verify files match byte-for-byte to be sure.
58#
59# There is some extra code to check for match pairing that is doubled up, and
60# a "processed" flag to prevent double processing of files.
61
62
63PROGNAME=stupid_dupes.sh
64VER=1.0
65VERDATE=2020-02-18
66
67V=1		# Verbosity
68AC=0		# Argument count
69PHS=4096	# Partial hash size
70FQUICK=0	# Quick (no final compare) mode
71FICNT=0		# File index counter
72MSCNT=0		# Match set counter
73STATUS=0	# Exit status
74
75# A hash command that outputs a plain file hash (no file names)
76test -z "$HASHCMD" && HASHCMD=jodyhash
77
78# 'find' defaults to no-recurse
79FRECURSE="-maxdepth 1"
80
81# sort option (cat = none)
82test -z "$SORTCMD" && SORTCMD="cat"
83
84### Function definitions
85
86# $1: file path to add
87add_file () {
88	((V > 1)) && echo "add_file: '$1'" >&2
89	SZ="$(stat -c '%s' "$1" || echo FAIL)"
90	if [ "$SZ" = "FAIL" ]
91		then echo "error: add_file: can't stat '$1'"  >&2
92		STATUS=1
93		return
94	fi
95	((FICNT += 1))
96	FILES[FICNT]="$1"
97	SIZES[FICNT]="$SZ"
98	PHASH[FICNT]="NULL"
99	FHASH[FICNT]="NULL"
100	((V > 1)) && echo "add_file: added as file number $FICNT" >&2
101}
102
103# $1: hash to get (partial/full); $2: file # to hash
104get_filehash () {
105	((V > 1)) && echo "get_filehash: $1:$2 '${FILES[$2]}'" >&2
106	test -z "${FILES[$2]}" && \
107		echo "internal error: get_filehash: bad file number passed" >&2 && exit 1
108	case "$1" in
109		partial)
110			PHASH[$2]="$(dd if="${FILES[$2]}" bs=4096 count=1 2>/dev/null | $HASHCMD || echo "FAIL")"
111			test "${PHASH[$2]}" = "FAIL" && \
112				echo "get_filehash: hashing failed: '${FILES[$2]}'" >&2 && STATUS=1
113			;;
114		full)
115			FHASH[$2]="$($HASHCMD "${FILES[$2]}" || echo "FAIL")"
116			test "${FHASH[$2]}" = "FAIL" && \
117				echo "get_filehash: hashing failed: '${FILES[$2]}'" >&2 && STATUS=1
118			;;
119		*)
120			echo "internal error: get_filehash: invalid hash type '$1'" >&2
121			exit 1;
122			;;
123	esac
124	((V > 1)) && echo "get_filehash: PHASH=${PHASH[$2]}" >&2
125	return 0
126}
127
128# $1/$2: file numbers to check for a match
129check_match () {
130	((V > 1)) && echo "check_match: checking: $1:'${FILES[$1]}', $2:'${FILES[$2]}'" >&2
131	# Sizes must match
132	if [ ${SIZES[$1]} != ${SIZES[$2]} ]
133		then ((V > 1)) && \
134			echo "check_match: sizes differ: ${SIZES[$1]} != ${SIZES[$2]}" >&2
135		return 1
136	fi
137
138	# Check partial hashes
139	test "${PHASH[$1]}" = "NULL" && get_filehash partial "$1"
140	test "${PHASH[$1]}" = "FAIL" && STATUS=1 && return 1
141	test "${PHASH[$2]}" = "NULL" && get_filehash partial "$2"
142	test "${PHASH[$2]}" = "FAIL" && STATUS=1 && return 1
143	if [ "${PHASH[$1]}" != "${PHASH[$2]}" ]
144		then ((V > 1)) && echo "check_match: partial hashes don't match" >&2
145		return 1
146		else ((V > 1)) && echo "check_match: partial hashes match" >&2
147	fi
148
149	# Check full hashes
150	test "{$FHASH[$1]}" = "NULL" && get_filehash full "$1"
151	test "{$FHASH[$1]}" = "FAIL" && STATUS=1 && return 1
152	test "{$FHASH[$2]}" = "NULL" && get_filehash full "$2"
153	test "{$FHASH[$2]}" = "FAIL" && STATUS=1 && return 1
154	if [ "${FHASH[$1]}" != "${FHASH[$2]}" ]
155		then ((V > 1)) && echo "check_match: full hashes don't match" >&2
156		return 1
157		else ((V > 1)) && echo "check_match: full hashes match" >&2
158	fi
159
160	# Byte-for-byte compare the files
161	if ((FQUICK == 1)) || cmp -s "${FILES[$1]}" "${FILES[$2]}"
162		then ((V > 1)) && echo "check_match: files are identical" >&2
163		return 0
164		else ((V > 1)) && echo "check_match: files are not identical" >&2
165		return 1
166	fi
167	return 1  # should never be reached
168}
169
170# Link a pair of matched file numbers
171add_to_matches () {
172	((V > 1)) && echo "add_to_matches: adding: '${FILES[$1]}','${FILES[$2]}'" >&2
173	MSCNT=$((MSCNT + 1))
174	MLEFT[$MSCNT]=$1
175	MRIGHT[$MSCNT]=$2
176	MPROC[$MSCNT]=0		# Flips to 1 during final processing
177	((V > 1)) && echo "add_to_matches: set $MSCNT = $1:$2" >&2
178	return 0
179}
180
181# Print all matched files
182print_matches () {
183	((V > 1)) && echo "print_matches: running" >&2
184	FIRST=1
185	CURFILE=0
186	# Outer loop: find a match pair to start with
187	for ((PRINTCNT = 1; PRINTCNT <= MSCNT; PRINTCNT++))
188		do
189		((V > 1)) && echo "               outer loop: print count $PRINTCNT, match count $MSCNT" >&2
190		# Don't reprint already-printed match pairings
191		if (( MPROC[PRINTCNT] != 0))
192			then
193			((V > 1)) && echo "               skipping processed pair $PRINTCNT" >&2
194			continue
195		fi
196		CURFILE=${MLEFT[PRINTCNT]}
197		# Print a newline before each new set EXCEPT the first set
198		if ((FIRST == 1)); then FIRST=0; else echo; fi
199		echo "${FILES[CURFILE]}"
200		# Inner loop: find match pairs to print
201		CURCNT=$PRINTCNT; PREVCNT=1; unset PREV; PREV[1]=$CURFILE
202		for ((; CURCNT < MSCNT; CURCNT++))
203			do
204			((V > 1)) && echo "                 inner loop: CC $CURCNT" >&2
205			((V > 1)) && echo "                 files: ${MLEFT[CURCNT]}:'${FILES[${MLEFT[CURCNT]}]}', ${MRIGHT[CURCNT]}:'${FILES[${MRIGHT[CURCNT]}]}'" >&2
206			if (( MPROC[PRINTCNT] != 0))
207				then
208				((V > 1)) && echo "                 skipping processed pair $CURCNT" >&2
209				continue
210			fi
211			CURMATCH_L=0; CURMATCH_R=0; PCCNT=0
212			# For each pair, check both sides for any known match number
213			while ((PCCNT < PREVCNT))
214				do
215				PCCNT=$((PCCNT + 1))
216				((V > 1)) && echo -n "                   deep loop: $PCCNT <= $PREVCNT" >&2
217				(( MLEFT[CURCNT] == PREV[PCCNT] )) && CURMATCH_L=${MRIGHT[CURCNT]}
218				(( MRIGHT[CURCNT] == PREV[PCCNT])) && CURMATCH_R=${MLEFT[CURCNT]}
219				((V > 1)) && echo ", curmatch: $CURMATCH = ${MLEFT[CURCNT]} < ${PREV[PCCNT]} > ${MRIGHT[CURCNT]}" >&2
220				# If both sides of this pair have been previously seen,
221				# just flag the pair and print nothing.
222				if (( CURMATCH_L != 0 && CURMATCH_R != 0 ))
223					then
224					MPROC[$CURCNT]=1
225					((V > 1)) && echo "                   Flagging: pair $CURCNT (${MLEFT[CURCNT]}:${MRIGHT[CURCNT]}) (R)" >&2
226					break
227				fi
228			done
229
230			# If L or R match exists, we have a printable match
231			CURMATCH=0
232			(( CURMATCH_L != 0 && CURMATCH_R == 0)) && CURMATCH=$CURMATCH_L
233			(( CURMATCH_R != 0 && CURMATCH_L == 0)) && CURMATCH=$CURMATCH_R
234			if ((CURMATCH != 0))
235				then echo "${FILES[CURMATCH]}"
236				MPROC[$CURCNT]=1
237				((V > 1)) && echo "                   Flagging: pair $CURCNT (${MLEFT[CURCNT]}:${MRIGHT[CURCNT]})" >&2
238				PREVCNT=$((PREVCNT + 1))
239				PREV[$PREVCNT]=$CURMATCH
240			fi
241		done
242	done
243	((V > 1)) && echo "print_matches: complete" >&2
244	return 0
245}
246
247show_help () {
248	COPYTEXT="Copyright (C) 2020 by Jody Bruchon <jody@jodybruchon.com>\n"
249	echo "$PROGNAME $VER ($VERDATE)"
250	if [ "$2" = "full" ]
251		then
252		echo -e "$COPYTEXT"
253		echo -e "\nUsage: $PROGNAME [options] file_or_dir1 [more_files ...]\n"
254		echo -e "Options:\n"
255		echo "-r|--recurse     Recurse into any subdirectories"
256		echo "-q|--quiet       Only show final output and errors"
257		echo "-Q|--quick       Skip the full file byte-for-byte comparison"
258		echo "-D|--debug       Show lots of extra debugging text"
259		echo "-v|-V|--version  Display program version and exit"
260		echo "-h|--help        Show this help text and exit"
261		echo "--license        Show the full program license text"
262		echo -e "\njdupes is better than me. Get it at github.com/jbruchon/jdupes\n"
263	fi
264	if [ "$2" = "license" ]
265		then
266		echo -e "$COPYTEXT"
267		echo -e "\nThe MIT License (MIT)\n"
268		echo "Permission is hereby granted, free of charge, to any person obtaining a copy of"
269		echo "this software and associated documentation files (the \"Software\"), to deal in"
270		echo "the Software without restriction, including without limitation the rights to"
271		echo "use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of"
272		echo "the Software, and to permit persons to whom the Software is furnished to do so,"
273		echo -e "subject to the following conditions:\n"
274		echo "The above copyright notice and this permission notice shall be included in all"
275		echo -e "copies or substantial portions of the Software.\n"
276		echo "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR"
277		echo "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS"
278		echo "FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR"
279		echo "COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER"
280		echo "IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
281		echo "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."
282	fi
283	exit $1
284}
285
286### End function definitions
287
288### Begin main program
289
290# Process arguments
291[[ "$@" = "" ]] && show_help 1 full
292for X in "$@"
293	do
294	case "$X" in
295		-q|--quiet) V=0 ;;
296		-D|--debug) V=2 ;;
297		-r|--recurse) FRECURSE="" ;;
298		-Q|--quick) FQUICK=1 ;;
299		-v|-V|--version) show_help 0 version ;;
300		-h|--help) show_help 0 full ;;
301		--license) show_help 0 license ;;
302		*) AC=$((AC + 1)); ARGS[AC]="$X" ;;
303	esac
304done
305
306((V > 1)) && echo "Command line: $(printf %q "$0" "$@")" >&2
307
308# Main loop
309for ((ARGNUM=1; ARGNUM < AC; ARGNUM++))
310	do
311	((V > 1)) && echo -e "Processing argument $ARGNUM: '${ARGS[ARGNUM]}'" >&2
312	if [[ ! -f "${ARGS[ARGNUM]}" && ! -d "${ARGS[ARGNUM]}" || -h "${ARGS[ARGNUM]}" ]]
313		then echo "warning: not a regular file or directory: '${ARGS[ARGNUM]}'" >&2
314		STATUS=1
315		continue
316	fi
317
318	# Add files/dirs to the list, recursing as needed
319	while IFS= read -r X
320		do add_file "$X"
321	done < <(find "${ARGS[ARGNUM]}" $FRECURSE -type f -size +0 | $SORTCMD)
322done
323
324# If there are not enough files, just exit with no matches
325((FICNT < 2)) && echo "No matches found." && exit $STATUS
326
327# Check every file pair for matches
328for ((CNT=1; CNT < FICNT; CNT++))
329do
330	for ((SCAN=CNT; SCAN < FICNT;))
331	do
332		((SCAN++))
333		check_match $CNT $SCAN && add_to_matches $CNT $SCAN
334	done
335done
336
337print_matches
338
339exit $STATUS
340