#!/bin/bash
#Copyright 2001, William Stearns
#Released under the GPL.
#This script will free up space by hardlinking identical files together.
#From the yet-another-program-that-should-have-been-written-in-a-real-language series.

#Presettable parameters (place on command line before freedups):
#DEBUG=YES		#Debugging output if YES (currently also the default when unset).
#ACTUALLYLINK=YES	#Just reports on potential savings if anything but YES.
#VERBOSE=YES		#Show directory listing and wait before linking if YES.
#CHECKDATE=YES		#Modified date and time must be equal to be considered for linking if YES.
#FILENAMESEQUAL=YES	#Files must have the same name (in different directories) to be considered for linking if YES.
#MINSIZE=size		#Files must be larger than this size (in bytes) to be considered for linking.
#EXCLUDE=regex		#Egrep regex of files to ignore - not implemented yet.
#Command line holds just the dirs to look under.

#NOTE(review): this file was recovered from a copy whose line breaks had been
#collapsed and in which two spans of text were lost (the tail of linkfiles
#together with the header of filesshouldbelinked, and the code following the
#main loop).  Every reconstructed part is marked NOTE(review) below - please
#compare against a pristine copy before relying on it.

#FIXME - debugging should default to off before shipping.
#A value preset by the caller is honoured; YES is only the fallback.
DEBUG="${DEBUG:-YES}"

#Running totals, in bytes, updated by linkfiles.
SPACESAVED=0
SPACEWOULDHAVESAVED=0

#debug: print all arguments as one line on stderr when DEBUG=YES, else do nothing.
if [ "$DEBUG" = "YES" ]; then
	debug () {
		printf '%s\n' "$*" >&2
	}
else
	debug () {
		:
	}
fi

#nodebug: always a no-op; swap a debug call to nodebug to silence it.
nodebug () {
	:
}

linkfiles () {
#Parameters: the 2 files that need to be hardlinked together.
#Globals:    reads ACTUALLYLINK and VERBOSE; adds the file size to SPACESAVED
#            (when linking) or SPACEWOULDHAVESAVED (report-only mode).
#Returns:    1 if fewer than 2 parameters were given or a step failed.
	local filesize keep replace links1 links2 tmplink junk

	nodebug '++++' lf "$@"
	if [ $# -lt 2 ]; then
		return 1
	fi
	#No "is it a regular file" check here; the "find -type f" in the main
	#code already guarantees it.

	#stat instead of parsing "ls -al": ls columns shift when an owner or
	#group name contains blanks.
	filesize=$(stat -c '%s' -- "$1") || return 1

	if [ "$ACTUALLYLINK" != "YES" ]; then
		#NOTE(review): report-only branch reconstructed; the original text
		#was lost.  Inferred from SPACEWOULDHAVESAVED and the ACTUALLYLINK
		#description in the header.
		SPACEWOULDHAVESAVED=$(( SPACEWOULDHAVESAVED + filesize ))
		return 0
	fi

	#Keep the older file; on equal timestamps keep the one with more links.
	#To support more than 2 file parameters this choice would have to move
	#into a loop over the remaining parameters.
	if [ "$1" -nt "$2" ]; then
		#$1 is newer than $2; link to the older ($2) file
		keep="$2"
		replace="$1"
	elif [ "$2" -nt "$1" ]; then
		keep="$1"
		replace="$2"
	else
		links1=$(stat -c '%h' -- "$1") || return 1
		links2=$(stat -c '%h' -- "$2") || return 1
		if [ "$links2" -gt "$links1" ]; then
			nodebug "$2" has more links
			keep="$2"
			replace="$1"
		else
			nodebug "$1" has more links or equal
			keep="$1"
			replace="$2"
		fi
	fi

	if [ "$VERBOSE" = "YES" ]; then
		ls -ali "$keep" "$replace"
		#NOTE(review): the redirection that followed "read JUNK" was lost.
		#The main loop reads its file list on descriptor 3, so this prompt
		#can safely wait on the script's own stdin.
		read -r junk
	fi

	#NOTE(review): the linking step itself was lost and is reconstructed.
	#The link is created under a temporary name and renamed over the
	#duplicate, so a failing link (e.g. across filesystems) can never leave
	#the duplicate deleted without a replacement.
	tmplink="$replace.freedups.$$"
	if ! ln -- "$keep" "$tmplink" ; then
		debug Could not link "$keep" to "$tmplink"
		return 1
	fi
	if ! mv -f -- "$tmplink" "$replace" ; then
		rm -f -- "$tmplink"
		debug Could not replace "$replace"
		return 1
	fi
	SPACESAVED=$(( SPACESAVED + filesize ))
	return 0
}

filesshouldbelinked () {
#NOTE(review): the original header of this function was lost; the name and
#parameter order are taken from the call in processsamesignaturefiles.
#Parameters: $1 $2 - the two files to compare
#            $3 $4 - their basenames (only used when FILENAMESEQUAL=YES)
#Returns 0 (True) if the files should be hardlinked, >0 (False) otherwise:
#  4 - both names already share an inode
#  5 - one of the files can't be read
#  6 - CHECKDATE=YES and the modification times differ
#  8 - FILENAMESEQUAL=YES and the basenames differ
#  9 - the contents differ
#Same-path, non-file and zero-length cases are handled by upper levels; size,
#rights and ownership are already equal because they form the signature the
#files were grouped by.
	if [ "$1" -ef "$2" ]; then
		#nodebug "$1" and "$2" already share an inode.
		return 4
	elif [ ! -r "$1" ] || [ ! -r "$2" ]; then
		debug "$1" or "$2" can\'t be read.
		return 5
	elif [ "$CHECKDATE" = "YES" ] && { [ "$1" -nt "$2" ] || [ "$1" -ot "$2" ]; }; then
		#nodebug "$1" is newer or older than "$2"
		return 6
	elif [ "$FILENAMESEQUAL" = "YES" ] && [ "$3" != "$4" ]; then
		#nodebug "$1" and "$2" have different filenames.
		return 8
	elif ! cmp -s -- "$1" "$2" ; then
		#nodebug "$1" and "$2" have different contents.
		return 9
	fi
	#nodebug Identical.
	return 0
}

processsamesignaturefiles () {
#Parameters: two or more files that share one signature.
#Compares each file to each other file once, never a file to itself, and
#calls linkfiles on every pair that filesshouldbelinked accepts.
	local -a files=("$@")
	local i j onebase twobase

	#nodebug '----' pssf "$@"
	for (( i = 0; i < ${#files[@]}; i++ )); do
		#Names come from "find -type f" and never end in "/", so stripping
		#up to the last slash yields the basename without a fork per file.
		onebase="${files[i]##*/}"
		for (( j = i + 1; j < ${#files[@]}; j++ )); do
			twobase="${files[j]##*/}"
			if filesshouldbelinked "${files[i]}" "${files[j]}" "$onebase" "$twobase" ; then
				linkfiles "${files[i]}" "${files[j]}"
			fi
		done
	done
}

#FIXME - parse for parameters
if [ $# -eq 0 ]; then
	printf '%s\n' 'Usage:' >&2
	printf '%s dirs\n' "$0" >&2
	printf '%s\n' 'Example:' >&2
	printf '%s %s\n' "$0" '/usr/src/linux* /usr/src/pcmcia-cs*' >&2
	exit 1
fi
printf 'About to check for links in %s\n' "$*" >&2

#Each candidate file is listed as "signature filename", for example:
#1184/644/0/0 /tmp/bkwrap
#Files with equal size/mode/uid/gid end up on adjacent lines after the sort.
#A signature that also carries the file name (for FILENAMESEQUAL=YES) was
#planned but never enabled:
#	EQUIVPRINTF='%s/%m/%U/%G/"%f" %p\n'
EQUIVPRINTF='%s/%m/%U/%G %p\n'

#Optional find tests, kept in an array so no word splitting is needed.
FINDARGS=(-xdev -type f)
if [ -n "$MINSIZE" ]; then
	FINDARGS+=(-a -size "+${MINSIZE}c")
fi

#FIXME - grep out exclude list if $EXCLUDE set
#The list is fed to the loop through process substitution on descriptor 3.
#Unlike "find | while read" the loop runs in this shell, so the last group
#and the byte counters survive it, and no temporary file is needed.
#Directory arguments are passed as given ("$@"), so names containing blanks
#work; one quoted argument is one directory.
#Each line is split at its first blank only and read with -r, so blanks and
#backslashes in file names are preserved.  Names containing newlines are
#still not supported.
OLDSIGNATURE=''
SAMESIGNATUREFILES=()
while IFS= read -r -u 3 LINE ; do
	SIGNATURE="${LINE%% *}"
	FILENAME="${LINE#* }"
	#nodebug Z $SIGNATURE Z $FILENAME Z
	if [ "$SIGNATURE" = "$OLDSIGNATURE" ]; then
		SAMESIGNATUREFILES+=("$FILENAME")
	else
		if [ "${#SAMESIGNATUREFILES[@]}" -gt 1 ]; then
			#nodebug "${SAMESIGNATUREFILES[@]}" have the same signature.
			processsamesignaturefiles "${SAMESIGNATUREFILES[@]}"
		fi
		SAMESIGNATUREFILES=("$FILENAME")
	fi
	OLDSIGNATURE="$SIGNATURE"
done 3< <(find "$@" "${FINDARGS[@]}" -printf "$EQUIVPRINTF" \
	| grep -v '^0/' | LC_ALL=C sort -nr | LC_ALL=C uniq)

#NOTE(review): everything after the loop was lost in the recovered text and
#is reconstructed.  The last group has no following signature change to
#trigger it, so it is processed here.
if [ "${#SAMESIGNATUREFILES[@]}" -gt 1 ]; then
	processsamesignaturefiles "${SAMESIGNATUREFILES[@]}"
fi

#NOTE(review): wording and destination of the summary are assumptions.  In
#report-only mode a set of three or more identical files is counted once per
#pair, so the estimate can exceed the real saving.
if [ "$ACTUALLYLINK" = "YES" ]; then
	echo "Space saved: $SPACESAVED bytes"
else
	echo "Space that would have been saved: $SPACEWOULDHAVESAVED bytes"
fi