#!/bin/bash # Freedups will free up space by hardlinking identical files together. # Copyright 2001, William Stearns # # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation; either version 2 of the License, or any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along with # this program; if not, write to the Free Software Foundation, Inc., 59 Temple # Place - Suite 330, Boston, MA 02111-1307, USA. #From the yet-another-program-that-should-have-been-written-in-a-real-language series. #FIXME - keep counts of why links not made? Only catches first reason... #FIXME - NOTE Presettable parameters (place on command line before freedups): no longer available. SPACESAVED=0 SPACEWOULDHAVESAVED=0 LinkFiles () { #Parameters: the 2 files that need to be hardlinked together. #To support more than 2 file parameters, revert the "link to older file" logic to this loop. #FIRSTFILE="$1" #shift #for OTHERFILE in "$@" ; do # ... #done NoDebug '++++' lf "$@" if [ $# -lt 2 ]; then return 1 fi #The "find -type f" above already does this check. # for AFILE in "$@" ; do # if [ ! -f "$AFILE" ]; then # Debug $AFILE is not a file, aborting link. # return 1 # fi # done FILESIZE=`ls -al "$1" | awk '{print $5}'` if [ "$1" -nt "$2" ]; then #$1 is newer than $2; link to the older ($2) file FIRSTFILE="$2" OTHERFILE="$1" elif [ "$2" -nt "$1" ]; then FIRSTFILE="$1" OTHERFILE="$2" elif [ `ls -al "$2" | awk '{print $2}'` -gt `ls -al "$1" | awk '{print $2}'` ]; then #If $2 has more links than $1 NoDebug "$2" has more links FIRSTFILE="$2" OTHERFILE="$1" else NoDebug $1 has more links or equal FIRSTFILE="$1" OTHERFILE="$2" fi #FIXME - use least number of fragments as an additional way to decide which is firstfile. if [ "$ACTUALLYLINK" = "YES" ]; then if [ "$PARANOID" = "YES" ]; then ls -ali "$FIRSTFILE" "$OTHERFILE" >/dev/stderr echo "Press enter to link or Ctrl-C to abort" >/dev/stderr read JUNK 0 (False) #if [ "$1" = "$2" ]; then #Caught by shared inode test below # NoDebug $1 and $2 are the same file. ; return 1 #Commented sections have been handled by upper levels. #elif [ ! -f "$1" ]; then # Debug "$1" is not a file. ; return 2 #elif [ ! -f "$2" ]; then # Debug "$2" is not a file. ; return 2 #elif [ ! -s "$1" ]; then # Debug "$1" has zero length. ; return 3 #elif [ ! -s "$2" ]; then # Debug "$2" has zero length. ; return 3 if [ "$1" -ef "$2" ]; then #NoDebug "$1" and "$2" already share an inode. return 4 elif [ ! -r "$1" ] || [ ! -r "$2" ]; then #NoDebug "$1" or "$2" can\'t be read. return 5 elif [ "$DATESEQUAL" = "YES" ] && [ "$1" -nt "$2" ]; then #NoDebug "$1" is newer than "$2" return 6 elif [ "$DATESEQUAL" = "YES" ] && [ "$1" -ot "$2" ]; then #NoDebug "$1" is older than "$2" return 6 #elif [ `ls -al "$1" | awk '{print $5 "/" $1 "/" $3 "/" $4 }'` != `ls -al "$2" | awk '{print $5 "/" $1 "/" $3 "/" $4 }'` ]; then # NoDebug "$1" and "$2" differ in size, rights, or ownership. ; return 7 elif [ "$FILENAMESEQUAL" = "YES" ] && [ "$3" != "$4" ]; then #NoDebug "$1" and "$2" have different filenames. return 8 elif ! diff -q "$1" "$2" >/dev/null ; then #NoDebug "$1" and "$2" have different contents. return 9 else #NoDebug Identical. return 0 fi } if type -path basename >/dev/null 2>/dev/null ; then THISSCRIPT=`basename $0` else THISSCRIPT="$0" fi #FIXME - update ShowHelpAndExit () { echo Usage: >/dev/stderr echo $THISSCRIPT '[options] DirectoriesToSearch' >/dev/stderr echo Options: >/dev/stderr echo '-a|--actuallylink=yes' >/dev/stderr echo " Actually link the files. Otherwise, just report on potential savings." >/dev/stderr echo '-d|--datesequal=yes' >/dev/stderr echo " Require that the modification dates and times be equal before linking." >/dev/stderr # --debug is hidden. # echo '-e FilespecToExclude' >/dev/stderr # echo '--exclude FilespecToExclude' >/dev/stderr # echo " Exclude files matching this regular expression." >/dev/stderr echo '-f|--filenamesequal=yes' >/dev/stderr echo " Require that the two (pathless) filenames be equal before linking." >/dev/stderr echo '-h|--help' >/dev/stderr echo " Show this help." >/dev/stderr echo '-m MinimumFileSize' >/dev/stderr echo '--minsize=MinimumFileSize' >/dev/stderr echo " Only consider files larger than this number of bytes." >/dev/stderr echo '-p|--paranoid=yes' >/dev/stderr echo " Show directory listing of candidates and wait before linking." >/dev/stderr echo " This only has an effect if actually linking files." >/dev/stderr echo Examples: >/dev/stderr echo $THISSCRIPT '/usr/src/linux-*' >/dev/stderr echo ' Report on what files could be linked under any kernel source trees,' >/dev/stderr echo ' but do not link them.' >/dev/stderr echo $THISSCRIPT '-a /usr/src/linux-*' >/dev/stderr echo ' Link identical files in those trees.' >/dev/stderr echo $THISSCRIPT '-a --datesequal=yes -f /usr/doc /usr/share/doc' >/dev/stderr echo ' Be more strict; the modification time and filename need to be equal' >/dev/stderr echo ' before two files can be linked.' >/dev/stderr echo $THISSCRIPT '--actuallylink=yes -m 1000 /usr/src/linux-* /usr/src/pcmcia-*' >/dev/stderr echo ' Only link files with 1001 or more bytes.' >/dev/stderr exit 1 } #End of ShowHelpAndExit #Carry over VERBOSE -> PARANOID, CHECKDATE -> DATESEQUAL, in case anyone's still using v0.2.1 method #of setting environment variables. if [ "$VERBOSE" = "YES" ]; then PARANOID="YES" fi if [ "$CHECKDATE" = "YES" ]; then DATESEQUAL="YES" fi #Parse for parameters until [ -z "$*" ]; do case `echo $1 | tr a-z A-Z` in -A|--ACTUALLYLINK|--ACTUALLYLINK=YES) ACTUALLYLINK=YES shift ;; --DEBUG|--DEBUG=YES) DEBUG=YES shift ;; -D|--DATESEQUAL|--DATESEQUAL=YES) DATESEQUAL=YES shift ;; -F|--FILENAMESEQUAL|--FILENAMESEQUAL=YES) FILENAMESEQUAL=YES shift ;; -H|--HELP) ShowHelpAndExit shift ;; -M) if [ "z$2" != "z" ] && [ $[ $2 ] -gt 0 ]; then MINSIZE=$[ $2 ] shift else echo Positive number argument required for "$1". Exiting. >/dev/stderr exit 1 fi shift ;; --MINSIZE=*) #MINSIZE=$[ `echo $1 | sed -e s/.*=//` ] #Let's not use sed; one more utility to require. MINSIZE=$[ ${1##*=} ] #Stuff following "=" if [ $MINSIZE -le 0 ]; then echo Positive number argument required for "$1". Exiting. >/dev/stderr exit 1 fi shift ;; -P|--PARANOID|--PARANOID=YES) PARANOID=YES shift ;; # -E) # #FIXME - implement # shift ;; # --EXCLUDE=*) # #FIXME - implement # shift ;; *) if [ -e "$1" ]; then #Dirs and files are all valid. Device nodes, links, and pipes will get stripped by find. DIRS="$DIRS $1" else echo "$1" is not recognized as a valid command line option or >/dev/stderr echo directory. Please use \"$THISSCRIPT --help\" for a summary. >/dev/stderr exit 1 fi shift ;; esac done #The choice of which version of this function to use depends on #whether FILENAMESEQUAL=YES or not. DO NOT move the following #function declaration above the preceding block which actually #sets FILENAMESEQUAL. if [ "$FILENAMESEQUAL" = "YES" ]; then ProcessSameSignatureFiles () { #NoDebug '----' pssf "$@" for ONEFILE in "$@" ; do ONEBASE="`basename "$ONEFILE"`" #The following shift compares each file to each other once without comparing a file to itself. shift for TWOFILE in "$@" ; do if FilesShouldBeLinked "$ONEFILE" "$TWOFILE" "$ONEBASE" "`basename "$TWOFILE"`" ; then LinkFiles "$ONEFILE" "$TWOFILE" fi done done } else ProcessSameSignatureFiles () { #NoDebug '----' pssf "$@" for ONEFILE in "$@" ; do #The following shift compares each file to each other once without comparing a file to itself. shift for TWOFILE in "$@" ; do if FilesShouldBeLinked "$ONEFILE" "$TWOFILE" "" "" ; then LinkFiles "$ONEFILE" "$TWOFILE" fi done done } fi if [ "$DEBUG" = "YES" ]; then Debug () { echo $* >/dev/stderr } else Debug () { : } fi NoDebug () { : } if [ -z "$DIRS" ]; then ShowHelpAndExit fi echo -n "Options chosen: " >/dev/stderr if [ "$ACTUALLYLINK" = "YES" ]; then echo -n "ActuallyLink " >/dev/stderr ; fi if [ "$DEBUG" = "YES" ]; then echo -n "Debug " >/dev/stderr ; fi if [ "$DATESEQUAL" = "YES" ]; then echo -n "DatesEqual " >/dev/stderr ; fi if [ -n "$EXCLUDE" ]; then echo -n "Exclude=$EXCLUDE " >/dev/stderr ; fi if [ "$FILENAMESEQUAL" = "YES" ]; then echo -n "FileNamesEqual " >/dev/stderr ; fi if [ -n "$MINSIZE" ]; then echo -n "MinSize=$MINSIZE " >/dev/stderr ; fi if [ "$PARANOID" = "YES" ]; then echo -n "Paranoid " >/dev/stderr ; fi if [ -z "$ACTUALLYLINK$DEBUG$DATESEQUAL$EXCLUDE$FILENAMESEQUAL$MINSIZE$PARANOID" ]; then echo -n "None " >/dev/stderr fi echo >/dev/stderr echo Checking for files to link in $DIRS >/dev/stderr #/tmpsizes will hold lines like: #1184/644/0/0 /tmp/bkwrap # or #1184/644/0/0/"bkwrap" /tmp/bkwrap #if FILENAMESEQUAL="YES" - once I fix it so spaces in the filenames aren't a problem. #Pipe find directly into while read; use exec kludge or manual looping; straight piping causes last file size block to be skipped from no GT1 var. #FIXME - I can't find the quoting to handle files with spaces in the names. #update: The read will work correctly if the filename spacs are backslash escaped. #if [ "$FILENAMESEQUAL" = "YES" ]; then # EQUIVPRINTF='%s/%m/%U/%G/"%f" %p\n' #else EQUIVPRINTF='%s/%m/%U/%G %p\n' #fi trap 'rm -f $SIGFILE' 0 NoTempFile () { #Handle the case where the temp file can't be created for some reason. echo Couldn\'t create $1 temp file, exiting. exit 1 } if type -path mktemp >/dev/null 2>/dev/null ; then SIGFILE=`mktemp -q /tmp/signatures.XXXXXX` || NoTempFile /tmp/signatures else echo 'Warning! This system does not have the mktemp utility. Please' >/dev/stderr echo 'install it. ' >/dev/stderr SIGFILE=/tmp/signatures.$$.$RANDOM if [ -e "$SIGFILE" ]; then NoTempFile $SIGFILE else touch $SIGFILE fi fi if ! type -path md5sum >/dev/null 2>/dev/null ; then echo Missing md5sum, please install. exit 1 fi find $DIRS -xdev -type f `if [ -n "$MINSIZE" ]; then echo "-a -size +${MINSIZE}c" ; fi` -printf "$EQUIVPRINTF" \ | grep -v '^0/' | sort -nr | (while read SIGNATURE FILENAME ; do echo $SIGNATURE/`md5sum "$FILENAME"` ; done ) \ | sort -nr | uniq >$SIGFILE #Can't use: sed -e "s@\(.*\)/MD5SUM \(.*\)@\1/`md5sum \1` \2@" #because the \1 is only replaced by sed itself, not the shell. #FIXME - grep out exclude list if $EXCLUDE set while read SIGNATURE FILENAME ; do #NoDebug Z $SIGNATURE Z $FILENAME Z if [ "$SIGNATURE" = "$OLDSIGNATURE" ]; then SAMESIGNATUREFILES="$SAMESIGNATUREFILES \"$FILENAME\"" NUMFILES="GT1" else if [ "$NUMFILES" = "GT1" ]; then #NoDebug $SAMESIGNATUREFILES have the same signature. eval ProcessSameSignatureFiles $SAMESIGNATUREFILES #ZZZZ - in progress, not functional. #( for SOMEFILE in $SAMESIGNATUREFILES ; do # md5sum "$SAMESIGNATUREFILES" #done ) | sort -nr | ( # OLDCHECKSUM="" # OLDSIGGEDFILE="" # while read CHECKSUM SIGGEDFILE # #ZZZZ # done #) fi SAMESIGNATUREFILES="\"$FILENAME\"" NUMFILES="1" fi OLDSIGNATURE="$SIGNATURE" OLDFILENAME="$FILENAME" done <$SIGFILE if [ "$NUMFILES" = "GT1" ]; then #Handle last file signature. #NoDebug $SAMESIGNATUREFILES have the same signature. eval ProcessSameSignatureFiles $SAMESIGNATUREFILES fi if [ $SPACESAVED -gt 0 ]; then echo Total space saved: $SPACESAVED \(Small risk of overcounting space saved if linked files have different times.\) fi if [ $SPACEWOULDHAVESAVED -gt 0 ]; then echo Total space would have saved: $SPACEWOULDHAVESAVED \(An overestimate if more than two files would have been linked together.\) fi exit #Read kludge: #Redirect stdin to work around an f^@&ing annoying limitation in the read command. #CKPTSORTRULEFILE=" sortrulefile: split into equivalent sections." ; #ckpt $CKPTSORTRULEFILE #exec 5<&0 < "$ONEFILE" #while read ONELINE ; do # NEWRULETAG="`ruletag $ONELINE`" # if [ "$NEWRULETAG" != "$LASTRULETAG" ]; then # FILECOUNT=$[ $FILECOUNT + 1 ] # if [ -f $ONEFILE.$FILECOUNT ]; then rm -f $ONEFILE.$FILECOUNT || logfail $LINENO masonlib: YYYY 0118 ; fi # fi # echo "$ONELINE" >>$ONEFILE.$FILECOUNT # LASTRULETAG="$NEWRULETAG" #done #exec 0<&5 5<&-