1#!/bin/bash 2 3# stupid_dupes: find duplicates like jdupes but more slowly with a shell script 4# Copyright (C) 2020 by Jody Bruchon <jody@jodybruchon.com> 5# 6# The MIT License (MIT) 7# 8# Permission is hereby granted, free of charge, to any person obtaining a copy 9# of this software and associated documentation files (the "Software"), to deal 10# in the Software without restriction, including without limitation the rights 11# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12# copies of the Software, and to permit persons to whom the Software is 13# furnished to do so, subject to the following conditions: 14# 15# The above copyright notice and this permission notice shall be included in all 16# copies or substantial portions of the Software. 17# 18# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24# SOFTWARE. 25 26######################################### 27# HOW IT WORKS # 28######################################### 29# 30# This script loads each file into an array of file paths, then compares every 31# file against every other file, using various tricks to discard candidates as 32# quickly as possible without reading and comparing entire files. These include 33# skipping pairs with mismatched file sizes and hashing only the first 4K block 34# of each file and comparing the partial hashes. 35# 36# Every file is referred to by its index number. Since GNU Bash can use arrays 37# but doesn't have anything remotely like a C structure to conveniently pack 38# a bunch of related variables together within, C structures are simulated with 39# the array index number used as a "pointer." For example, a doubly-linked list 40# in C is pretty easy to declare: 41# 42# struct match { struct match *left; struct match *right; } 43# 44# And then an array of these matches: struct match matchlist[MAX_MATCHES]; 45# 46# Using arrays, we simulate this (e.g. with file index 15, match 2): 47# 48# MLEFT[2] = index number of "left" file in match chain 49# MRIGHT[2] = index number of "right" file in match chain 50# 51# FILES[15] = file path for file #15, referenced by one of the above items 52# SIZES[15] = file size for file #15 53# PHASH[15] = the 4K partial file hash for file #15 54# FHASH[15] = the full file hash for file #15 55# 56# The basic algorithm is: verify size match, verify partial hash match, verify 57# full hash match, verify files match byte-for-byte to be sure. 58# 59# There is some extra code to check for match pairing that is doubled up, and 60# a "processed" flag to prevent double processing of files. 61 62 63PROGNAME=stupid_dupes.sh 64VER=1.0 65VERDATE=2020-02-18 66 67V=1 # Verbosity 68AC=0 # Argument count 69PHS=4096 # Partial hash size 70FQUICK=0 # Quick (no final compare) mode 71FICNT=0 # File index counter 72MSCNT=0 # Match set counter 73STATUS=0 # Exit status 74 75# A hash command that outputs a plain file hash (no file names) 76test -z "$HASHCMD" && HASHCMD=jodyhash 77 78# 'find' defaults to no-recurse 79FRECURSE="-maxdepth 1" 80 81# sort option (cat = none) 82test -z "$SORTCMD" && SORTCMD="cat" 83 84### Function definitions 85 86# $1: file path to add 87add_file () { 88 ((V > 1)) && echo "add_file: '$1'" >&2 89 SZ="$(stat -c '%s' "$1" || echo FAIL)" 90 if [ "$SZ" = "FAIL" ] 91 then echo "error: add_file: can't stat '$1'" >&2 92 STATUS=1 93 return 94 fi 95 ((FICNT += 1)) 96 FILES[FICNT]="$1" 97 SIZES[FICNT]="$SZ" 98 PHASH[FICNT]="NULL" 99 FHASH[FICNT]="NULL" 100 ((V > 1)) && echo "add_file: added as file number $FICNT" >&2 101} 102 103# $1: hash to get (partial/full); $2: file # to hash 104get_filehash () { 105 ((V > 1)) && echo "get_filehash: $1:$2 '${FILES[$2]}'" >&2 106 test -z "${FILES[$2]}" && \ 107 echo "internal error: get_filehash: bad file number passed" >&2 && exit 1 108 case "$1" in 109 partial) 110 PHASH[$2]="$(dd if="${FILES[$2]}" bs=4096 count=1 2>/dev/null | $HASHCMD || echo "FAIL")" 111 test "${PHASH[$2]}" = "FAIL" && \ 112 echo "get_filehash: hashing failed: '${FILES[$2]}'" >&2 && STATUS=1 113 ;; 114 full) 115 FHASH[$2]="$($HASHCMD "${FILES[$2]}" || echo "FAIL")" 116 test "${FHASH[$2]}" = "FAIL" && \ 117 echo "get_filehash: hashing failed: '${FILES[$2]}'" >&2 && STATUS=1 118 ;; 119 *) 120 echo "internal error: get_filehash: invalid hash type '$1'" >&2 121 exit 1; 122 ;; 123 esac 124 ((V > 1)) && echo "get_filehash: PHASH=${PHASH[$2]}" >&2 125 return 0 126} 127 128# $1/$2: file numbers to check for a match 129check_match () { 130 ((V > 1)) && echo "check_match: checking: $1:'${FILES[$1]}', $2:'${FILES[$2]}'" >&2 131 # Sizes must match 132 if [ ${SIZES[$1]} != ${SIZES[$2]} ] 133 then ((V > 1)) && \ 134 echo "check_match: sizes differ: ${SIZES[$1]} != ${SIZES[$2]}" >&2 135 return 1 136 fi 137 138 # Check partial hashes 139 test "${PHASH[$1]}" = "NULL" && get_filehash partial "$1" 140 test "${PHASH[$1]}" = "FAIL" && STATUS=1 && return 1 141 test "${PHASH[$2]}" = "NULL" && get_filehash partial "$2" 142 test "${PHASH[$2]}" = "FAIL" && STATUS=1 && return 1 143 if [ "${PHASH[$1]}" != "${PHASH[$2]}" ] 144 then ((V > 1)) && echo "check_match: partial hashes don't match" >&2 145 return 1 146 else ((V > 1)) && echo "check_match: partial hashes match" >&2 147 fi 148 149 # Check full hashes 150 test "{$FHASH[$1]}" = "NULL" && get_filehash full "$1" 151 test "{$FHASH[$1]}" = "FAIL" && STATUS=1 && return 1 152 test "{$FHASH[$2]}" = "NULL" && get_filehash full "$2" 153 test "{$FHASH[$2]}" = "FAIL" && STATUS=1 && return 1 154 if [ "${FHASH[$1]}" != "${FHASH[$2]}" ] 155 then ((V > 1)) && echo "check_match: full hashes don't match" >&2 156 return 1 157 else ((V > 1)) && echo "check_match: full hashes match" >&2 158 fi 159 160 # Byte-for-byte compare the files 161 if ((FQUICK == 1)) || cmp -s "${FILES[$1]}" "${FILES[$2]}" 162 then ((V > 1)) && echo "check_match: files are identical" >&2 163 return 0 164 else ((V > 1)) && echo "check_match: files are not identical" >&2 165 return 1 166 fi 167 return 1 # should never be reached 168} 169 170# Link a pair of matched file numbers 171add_to_matches () { 172 ((V > 1)) && echo "add_to_matches: adding: '${FILES[$1]}','${FILES[$2]}'" >&2 173 MSCNT=$((MSCNT + 1)) 174 MLEFT[$MSCNT]=$1 175 MRIGHT[$MSCNT]=$2 176 MPROC[$MSCNT]=0 # Flips to 1 during final processing 177 ((V > 1)) && echo "add_to_matches: set $MSCNT = $1:$2" >&2 178 return 0 179} 180 181# Print all matched files 182print_matches () { 183 ((V > 1)) && echo "print_matches: running" >&2 184 FIRST=1 185 CURFILE=0 186 # Outer loop: find a match pair to start with 187 for ((PRINTCNT = 1; PRINTCNT <= MSCNT; PRINTCNT++)) 188 do 189 ((V > 1)) && echo " outer loop: print count $PRINTCNT, match count $MSCNT" >&2 190 # Don't reprint already-printed match pairings 191 if (( MPROC[PRINTCNT] != 0)) 192 then 193 ((V > 1)) && echo " skipping processed pair $PRINTCNT" >&2 194 continue 195 fi 196 CURFILE=${MLEFT[PRINTCNT]} 197 # Print a newline before each new set EXCEPT the first set 198 if ((FIRST == 1)); then FIRST=0; else echo; fi 199 echo "${FILES[CURFILE]}" 200 # Inner loop: find match pairs to print 201 CURCNT=$PRINTCNT; PREVCNT=1; unset PREV; PREV[1]=$CURFILE 202 for ((; CURCNT < MSCNT; CURCNT++)) 203 do 204 ((V > 1)) && echo " inner loop: CC $CURCNT" >&2 205 ((V > 1)) && echo " files: ${MLEFT[CURCNT]}:'${FILES[${MLEFT[CURCNT]}]}', ${MRIGHT[CURCNT]}:'${FILES[${MRIGHT[CURCNT]}]}'" >&2 206 if (( MPROC[PRINTCNT] != 0)) 207 then 208 ((V > 1)) && echo " skipping processed pair $CURCNT" >&2 209 continue 210 fi 211 CURMATCH_L=0; CURMATCH_R=0; PCCNT=0 212 # For each pair, check both sides for any known match number 213 while ((PCCNT < PREVCNT)) 214 do 215 PCCNT=$((PCCNT + 1)) 216 ((V > 1)) && echo -n " deep loop: $PCCNT <= $PREVCNT" >&2 217 (( MLEFT[CURCNT] == PREV[PCCNT] )) && CURMATCH_L=${MRIGHT[CURCNT]} 218 (( MRIGHT[CURCNT] == PREV[PCCNT])) && CURMATCH_R=${MLEFT[CURCNT]} 219 ((V > 1)) && echo ", curmatch: $CURMATCH = ${MLEFT[CURCNT]} < ${PREV[PCCNT]} > ${MRIGHT[CURCNT]}" >&2 220 # If both sides of this pair have been previously seen, 221 # just flag the pair and print nothing. 222 if (( CURMATCH_L != 0 && CURMATCH_R != 0 )) 223 then 224 MPROC[$CURCNT]=1 225 ((V > 1)) && echo " Flagging: pair $CURCNT (${MLEFT[CURCNT]}:${MRIGHT[CURCNT]}) (R)" >&2 226 break 227 fi 228 done 229 230 # If L or R match exists, we have a printable match 231 CURMATCH=0 232 (( CURMATCH_L != 0 && CURMATCH_R == 0)) && CURMATCH=$CURMATCH_L 233 (( CURMATCH_R != 0 && CURMATCH_L == 0)) && CURMATCH=$CURMATCH_R 234 if ((CURMATCH != 0)) 235 then echo "${FILES[CURMATCH]}" 236 MPROC[$CURCNT]=1 237 ((V > 1)) && echo " Flagging: pair $CURCNT (${MLEFT[CURCNT]}:${MRIGHT[CURCNT]})" >&2 238 PREVCNT=$((PREVCNT + 1)) 239 PREV[$PREVCNT]=$CURMATCH 240 fi 241 done 242 done 243 ((V > 1)) && echo "print_matches: complete" >&2 244 return 0 245} 246 247show_help () { 248 COPYTEXT="Copyright (C) 2020 by Jody Bruchon <jody@jodybruchon.com>\n" 249 echo "$PROGNAME $VER ($VERDATE)" 250 if [ "$2" = "full" ] 251 then 252 echo -e "$COPYTEXT" 253 echo -e "\nUsage: $PROGNAME [options] file_or_dir1 [more_files ...]\n" 254 echo -e "Options:\n" 255 echo "-r|--recurse Recurse into any subdirectories" 256 echo "-q|--quiet Only show final output and errors" 257 echo "-Q|--quick Skip the full file byte-for-byte comparison" 258 echo "-D|--debug Show lots of extra debugging text" 259 echo "-v|-V|--version Display program version and exit" 260 echo "-h|--help Show this help text and exit" 261 echo "--license Show the full program license text" 262 echo -e "\njdupes is better than me. Get it at github.com/jbruchon/jdupes\n" 263 fi 264 if [ "$2" = "license" ] 265 then 266 echo -e "$COPYTEXT" 267 echo -e "\nThe MIT License (MIT)\n" 268 echo "Permission is hereby granted, free of charge, to any person obtaining a copy of" 269 echo "this software and associated documentation files (the \"Software\"), to deal in" 270 echo "the Software without restriction, including without limitation the rights to" 271 echo "use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of" 272 echo "the Software, and to permit persons to whom the Software is furnished to do so," 273 echo -e "subject to the following conditions:\n" 274 echo "The above copyright notice and this permission notice shall be included in all" 275 echo -e "copies or substantial portions of the Software.\n" 276 echo "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR" 277 echo "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS" 278 echo "FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR" 279 echo "COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER" 280 echo "IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN" 281 echo "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE." 282 fi 283 exit $1 284} 285 286### End function definitions 287 288### Begin main program 289 290# Process arguments 291[[ "$@" = "" ]] && show_help 1 full 292for X in "$@" 293 do 294 case "$X" in 295 -q|--quiet) V=0 ;; 296 -D|--debug) V=2 ;; 297 -r|--recurse) FRECURSE="" ;; 298 -Q|--quick) FQUICK=1 ;; 299 -v|-V|--version) show_help 0 version ;; 300 -h|--help) show_help 0 full ;; 301 --license) show_help 0 license ;; 302 *) AC=$((AC + 1)); ARGS[AC]="$X" ;; 303 esac 304done 305 306((V > 1)) && echo "Command line: $(printf %q "$0" "$@")" >&2 307 308# Main loop 309for ((ARGNUM=1; ARGNUM < AC; ARGNUM++)) 310 do 311 ((V > 1)) && echo -e "Processing argument $ARGNUM: '${ARGS[ARGNUM]}'" >&2 312 if [[ ! -f "${ARGS[ARGNUM]}" && ! -d "${ARGS[ARGNUM]}" || -h "${ARGS[ARGNUM]}" ]] 313 then echo "warning: not a regular file or directory: '${ARGS[ARGNUM]}'" >&2 314 STATUS=1 315 continue 316 fi 317 318 # Add files/dirs to the list, recursing as needed 319 while IFS= read -r X 320 do add_file "$X" 321 done < <(find "${ARGS[ARGNUM]}" $FRECURSE -type f -size +0 | $SORTCMD) 322done 323 324# If there are not enough files, just exit with no matches 325((FICNT < 2)) && echo "No matches found." && exit $STATUS 326 327# Check every file pair for matches 328for ((CNT=1; CNT < FICNT; CNT++)) 329do 330 for ((SCAN=CNT; SCAN < FICNT;)) 331 do 332 ((SCAN++)) 333 check_match $CNT $SCAN && add_to_matches $CNT $SCAN 334 done 335done 336 337print_matches 338 339exit $STATUS 340