#!/usr/bin/perl #Copyright 2002, William Stearns #Released under the GPL. #Code overview: # # This program is given a set of directories to scan. It uses the #perl equivalent of find to stat all files in those directories; these #files are loaded into the %InodesOfSize, %InodeOfFile, and #%FilesOfInode hashes. # # We now need to find Inodes that might be linkable with each #other. For a given size, if we only know of one inode that large, we #can immediately forget about it since there's nothing we could possibly #link it to (solitary inodes). We then walk through the remaining #inodes, starting with the largest. For every pair of same-sized inodes, #we check to see if that pair is linkable; if so, we pick one inode to #stay as is (the more sparse, older, or already more heavily linked #inode), and hardlink all filenames associated with the other inode to #it. #FIXME - race where file replaced long after stat. #FIXME - on ctrl-C perhaps write out cache. #FIXME - Progress headers #FIXME - support minsize and maxsize range #FIXME - careful walk through bash version to compare. #FIXME - print stats on which criteria used to decide who links to who #FIXME - check all variable uses to make sure we're not printing packed data #FIXME - This app currently requires full paths - make it more gracefully handle relative? use strict; use File::Find (); use File::Compare; use File::Basename; use File::stat; use Digest::MD5; use IO::File; use Getopt::Long; use vars qw/*name *dir *prune/; *name = *File::Find::name; *dir = *File::Find::dir; *prune = *File::Find::prune; my $FreedupsVer="0.6.2"; my %KnownMd5sums = ( ); #Key is packed device, inode, crucial characteristics, value is md5sum of that inode. On-disk cache is loaded into this and saved to from this. my %NewMd5sums = ( ); #known holds the sums loaded from the cache; these obviously don't need to be rewritten back out. New holds new ones that need to be written out at the end. #The following hashes store info about files in the requested directory trees only; there could be others in the filesystem that freedups wasn't asked to index. my %InodesOfSize; #All the inodes of the size key. This is a hash whose values are arrays. my %InodeOfFile; #Provides the InodeSpec of the Filename key (this is loaded very late now; it only holds the files and InodeSpecs for the files of the size currently being processed). my %FilesOfInode; #The filenames associated with the inode key. This is a hash whose values are arrays. my $NumSpecs = 0; #Number of command line directory/file specs in which to search for candidate files. my $CachedSums = 0; #How many checksums we were able to pull from cache. my $FromDiskSums = 0; #How many checksums we had to pull from media. my $SpaceSaved = 0; #How many bytes saved. Takes into account whether we're removing the last link to the file or other links exist. my $EstimatedSpaceSaved = 0; #How much space we would have saved if -a had been on. my $SolitaryInodeSizes = 0; #Sizes for which there was only one inode. my $MultipleInodeSizes = 0; #Sizes for which there was more than one inode. my $UniqueFilesScanned = 0; #How many unique filenames we inspected. my $UniqueInodesScanned = 0; #How many unique inodes encountered during initial scan. my $LastSizeLinked = 0; #For progress, what's the last linked inode size. my $InternalChecks = 0; #Used during development for additional internal checks. #User options: #1 (true) or 0 (false) my $ActuallyLink = 0; #Actually link the files. Otherwise, just report on potential savings and preload the md5sum cache. my $CacheFile = "$ENV{HOME}/md5sum-v1.cache"; #Private file that holds the inode=md5sum cache between runs. Must be created before program runs. my $DatesEqual = 0; #Only link files if their modification times are identical my $FileNamesEqual = 0; #Require that the two (pathless) filenames be equal before linking. my $Help = 0; #Show help my $MinSize = 0; #Files _larger_ than this many bytes are considered for linking. 0 byte files are _never_ considered. #I've found at least one program bug (don't use == with alphanumeric md5sums) with Paranoid. I recommend leaving it on for now. my $Paranoid = 1; #Set to 1 to force a strict compare just before linking. my $Verbosity = 1; #0 = Just intro and stats, 1 = Normal, 2 = some debugging, 3 = Show me everything! #FIXME - prompts have not been strictly checked. sub Debug { my $DebugLevel = shift; if ($Verbosity >= $DebugLevel) { my $DebugString = shift; print "$DebugString\n"; } } #End sub Debug #FIXME - new storage format sub LoadSums { #Load %KnownMd5sums from cache file my $cache_filename = shift; undef $!; if (my $cache_read_fh = IO::File->new($cache_filename, O_RDONLY)) { # |O_CREAT not used, security risk my $loaded_pairs = 0; undef $!; while (defined(my $cache_line = <$cache_read_fh>)) { #process one cache entry from local file. chomp $cache_line; my ($cache_inodespec, $cache_md5sum) = split(/=/, $cache_line, 2); #Debug 4, "Read \"$cache_inodespec,$cache_md5sum\"."; if ($cache_md5sum ne '') { my ($Tdev, $Tino, $Tmode, $Tuid, $Tgid, $Tsize, $Tmtime, $Tctime) = split(/\//, $cache_inodespec); $KnownMd5sums{pack("SLSSSLLL",$Tdev,$Tino,$Tmode,$Tuid,$Tgid,$Tsize,$Tmtime,$Tctime)} = pack("H32", $cache_md5sum); $loaded_pairs++; } else { Debug 3, "Blank md5sum read for $cache_inodespec."; } } close cache_read_fh; Debug 2, "Initial load: loaded $loaded_pairs cached KnownMd5sums from $cache_filename."; } else { #Warn about missing or unreadable cache file. Debug 1, "Local cache file $cache_filename unavailable or unreadable (create it if it's not there and check permissions, please): $!"; } #End of load cache file entries } #End sub LoadSums #FIXME - new storage format sub SaveSums { #Save %NewMd5sums to cache file my $cache_filename = shift; if (my $cache_write_fh = IO::File->new("$cache_filename", O_WRONLY|O_APPEND)) { # |O_CREAT not used, security risk. foreach my $key (keys %NewMd5sums) { if ($NewMd5sums{$key} ne '') { my ($Tdev, $Tino, $Tmode, $Tuid, $Tgid, $Tsize, $Tmtime, $Tctime) = unpack("SLSSSLLL", $key); print $cache_write_fh "$Tdev/$Tino/$Tmode/$Tuid/$Tgid/$Tsize/$Tmtime/$Tctime=", unpack("H32", $NewMd5sums{$key}), "\n"; } else { Debug 3, "Blank md5sum not written for $key."; } } close $cache_write_fh; } else { #Warn about missing or unwritable cache file. Debug 1, "Local cache file $cache_filename unavailable or unwritable for storing new entries (create it if it's not there and check permissions, please): $!."; } } #End sub SaveSums sub Md5sumOf { #This returns the md5sum of a given file. stat(file) returns the Inode #info we need to pull the cached md5sum from %KnownMd5sums or we pull #the checksum from disk and save it in %KnownMd5sums and %NewMd5sums (and hence, in the #md5sum cache file) for future use. my $SumFile = shift or die "No file specified in Md5sumOf: $!"; my $InodeSpec; #Shouldn't ever need this, we should have stat'd all at the beginning, right after find. if (! defined($InodeOfFile{$SumFile})) { my $sb = stat($SumFile); $InodeOfFile{$SumFile}=pack("SLSSSLLL",$sb->dev, $sb->ino, $sb->mode, $sb->uid, $sb->gid, $sb->size, $sb->mtime, $sb->ctime); if ($InternalChecks) { my ($Tdev, $Tino, $Tmode, $Tuid, $Tgid, $Tsize, $Tmtime, $Tctime) = unpack("SLSSSLLL", $InodeOfFile{$SumFile}); die $sb->dev . " ne " . $Tdev . ", exiting" if ($sb->dev ne $Tdev); die $sb->ino . " ne " . $Tino . ", exiting" if ($sb->ino ne $Tino); die $sb->mode . " ne " . $Tmode . ", exiting" if ($sb->mode ne $Tmode); die $sb->uid . " ne " . $Tuid . ", exiting" if ($sb->uid ne $Tuid); die $sb->gid . " ne " . $Tgid . ", exiting" if ($sb->gid ne $Tgid); die $sb->size . " ne " . $Tsize . ", exiting" if ($sb->size ne $Tsize); die $sb->mtime . " ne " . $Tmtime . ", exiting" if ($sb->mtime ne $Tmtime); die $sb->ctime . " ne " . $Tctime . ", exiting" if ($sb->ctime ne $Tctime); } Debug 3, "Had to manually load InodeOfFile in Md5sumOf for file $SumFile, why?."; } $InodeSpec = $InodeOfFile{$SumFile}; #if (defined($NewMd5sums{$InodeSpec})) { # $CachedSums++; # ####$SumToReturn = $NewMd5sums{$InodeSpec}; # Debug 3, "Checksum came from new cache."; #Add in ...unpack("SLSSSLLL", $InodeSpec)... perhaps later #} els if (defined($KnownMd5sums{$InodeSpec})) { $CachedSums++; Debug 3, "Checksum came from known cache."; #Add in ...unpack("SLSSSLLL", $InodeSpec)... perhaps later } else { $FromDiskSums++; open(FILE, $SumFile) or die "Can't open '$SumFile': $!"; binmode(FILE); #$KnownMd5sums{$InodeSpec} = Digest::MD5->new->addfile(*FILE)->hexdigest; my $TempSum = Digest::MD5->new->addfile(*FILE)->hexdigest; $KnownMd5sums{$InodeSpec} = pack("H32", $TempSum); $NewMd5sums{$InodeSpec} = pack("H32", $TempSum); if ($InternalChecks) { if (unpack("H32", $KnownMd5sums{$InodeSpec}) ne $TempSum) { die "Unpack failure " . $KnownMd5sums{$InodeSpec} . " ne " . $TempSum . ", exiting."; } } Debug 3, "Checksum came from physical disk."; #add in ...unpack("SLSSSLLL", $InodeSpec)... perhaps later } Debug 3, "File: $SumFile, Sum: " . unpack("H32", $KnownMd5sums{$InodeSpec}) . "."; return $KnownMd5sums{$InodeSpec}; } #End sub Md5sumOf sub IndexFile { my $OneFile = shift; my $sb = stat($OneFile); if (defined $sb) { my $InodeSpec=pack("SLSSSLLL",$sb->dev, $sb->ino, $sb->mode, $sb->uid, $sb->gid, $sb->size, $sb->mtime, $sb->ctime); if ($InternalChecks) { my ($Tdev, $Tino, $Tmode, $Tuid, $Tgid, $Tsize, $Tmtime, $Tctime) = unpack("SLSSSLLL", $InodeSpec); die $sb->dev . " dev ne " . $Tdev . ", exiting" if ($sb->dev ne $Tdev); die $sb->ino . " ino ne " . $Tino . ", exiting" if ($sb->ino ne $Tino); die $sb->mode . " mode ne " . $Tmode . ", exiting" if ($sb->mode ne $Tmode); die $sb->uid . " uid ne " . $Tuid . ", exiting" if ($sb->uid ne $Tuid); die $sb->gid . " gid ne " . $Tgid . ", exiting" if ($sb->gid ne $Tgid); die $sb->size . " size ne " . $Tsize . ", exiting" if ($sb->size ne $Tsize); die $sb->mtime . " mtime ne " . $Tmtime . ", exiting" if ($sb->mtime ne $Tmtime); die $sb->ctime . " ctime ne " . $Tctime . ", exiting" if ($sb->ctime ne $Tctime); } #Check uniqueness by scanning FilesOfInode my $FileAlreadyInFOI = 0; #False foreach my $ExistingFile (@{$FilesOfInode{$InodeSpec}}) { if ($OneFile eq $ExistingFile) { $FileAlreadyInFOI = 1; last; #exit foreach now } } if ($FileAlreadyInFOI) { #Already in there Debug 3, " NOT adding $OneFile to FilesOfInode, already there."; } else { Debug 3, " Adding $OneFile to FilesOfInode."; $UniqueFilesScanned++; if (defined($FilesOfInode{$InodeSpec})) { push @{$FilesOfInode{$InodeSpec}}, $OneFile; } else { $FilesOfInode{$InodeSpec} = [ $OneFile ]; } if (defined($InodesOfSize{$sb->size})) { #Check to see if $InodeSpec already in $InodesOfSize{$sb->size} my $InodeAlreadyInIOS = 0; #False foreach my $OneInodeSpec (@{$InodesOfSize{$sb->size}}) { if ($OneInodeSpec eq $InodeSpec) { $InodeAlreadyInIOS = 1; last; #exit foreach loop now. } } if ($InodeAlreadyInIOS) { #Already in there Debug 3, " NOT Adding $InodeSpec to InodesOfSize, already there."; } else { $UniqueInodesScanned++; Debug 3, " Adding $InodeSpec to InodesOfSize"; push @{$InodesOfSize{$sb->size}}, $InodeSpec; } } else { $UniqueInodesScanned++; Debug 4, " Initial add $InodeSpec to InodesOfSize"; $InodesOfSize{$sb->size} = [ $InodeSpec ]; } } } else { Debug 2, "Can't stat $OneFile: $!"; } } #End sub IndexFile #Function provided by find2perl ... -type f -a -size +3366c sub wanted { my ($dev,$ino,$mode,$nlink,$uid,$gid); (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && -f _ && (int(-s _) > $MinSize) && IndexFile $File::Find::name; #Is there a different form that would handle partial paths? } #End sub wanted sub LinkFiles { #FIXME - this modifies %FilesOfInode and $InodeOfFile. Check to see if upper layers care. (partially checked) #clean up? %md5sums{InodeSpec} - remove inode entry on last unlink. (DONE) #clean up? %InodeOfFile{Filename} - Reset to new inode after hardlink and reinstate Identical Inode warning. (DONE) #clean up? %FilesOfInode{InodeSpec} - Move file from old inode to new (DONE) my $BaseFile = shift; #Filename of file which will stay as is my $LinkName = shift; #Filename which will end up as a link to BaseFile if ( ($FileNamesEqual) && (basename($BaseFile) ne basename($LinkName)) ) { Debug 3, "$BaseFile and $LinkName have different filenames at the end, not linking."; return; } if ($Paranoid) { #Check that file hasn't been modified since it was stat'd right after find. I'm aborting the program if changes occur; that tends to point #to a file that's actively being modified. This shouldn't happen. #Note that the following are duplicate checks; the file has already passed these once. Failing now means that file(s) is/are actively being #changed under us. my $Fsb=stat($BaseFile); my $Ssb=stat($LinkName); if (!(-e "$BaseFile")) { die ("LinkFile: $BaseFile no longer exists or is not a file anymore. Exiting."); } if (!(-e "$LinkName")) { die ("LinkFile: $LinkName no longer exists or is not a file anymore. Exiting."); } if ( ! ( ($Fsb->mode == $Ssb->mode) && ($Fsb->uid == $Ssb->uid) && ($Fsb->gid == $Ssb->gid) && ($Fsb->size == $Ssb->size) ) ) { Debug 0, "File1: $InodeOfFile{$BaseFile}, Associated files: @{$FilesOfInode{$InodeOfFile{$BaseFile}}}, md5sum: $KnownMd5sums{$InodeOfFile{$BaseFile}}."; Debug 0, "File2: $InodeOfFile{$LinkName}, Associated files: @{$FilesOfInode{$InodeOfFile{$LinkName}}}, md5sum: $KnownMd5sums{$InodeOfFile{$LinkName}}."; die ("LinkFile: paranoid stat checks failed! Please check failure in linking $BaseFile and $LinkName. Exiting."); } if (compare("$BaseFile","$LinkName") != 0) { #Byte for byte compare not equal Debug 0, "File1: $InodeOfFile{$BaseFile}, Associated files: @{$FilesOfInode{$InodeOfFile{$BaseFile}}}, md5sum: $KnownMd5sums{$InodeOfFile{$BaseFile}}."; Debug 0, "File2: $InodeOfFile{$LinkName}, Associated files: @{$FilesOfInode{$InodeOfFile{$LinkName}}}, md5sum: $KnownMd5sums{$InodeOfFile{$LinkName}}."; die ("LinkFile: paranoid file comparison failed for $BaseFile and $LinkName, please check why. Exiting."); } $Fsb=stat($BaseFile); #Refresh stat blocks in case either changed during file compare. $Ssb=stat($LinkName); if ( ! ( ($Fsb->mode == $Ssb->mode) && ($Fsb->uid == $Ssb->uid) && ($Fsb->gid == $Ssb->gid) && ($Fsb->size == $Ssb->size) ) ) { Debug 0, "File1: $InodeOfFile{$BaseFile}, Associated files: @{$FilesOfInode{$InodeOfFile{$BaseFile}}}, md5sum: $KnownMd5sums{$InodeOfFile{$BaseFile}}."; Debug 0, "File2: $InodeOfFile{$LinkName}, Associated files: @{$FilesOfInode{$InodeOfFile{$LinkName}}}, md5sum: $KnownMd5sums{$InodeOfFile{$LinkName}}."; die ("LinkFile: Second paranoid stat checks failed! Please check failure in linking $BaseFile and $LinkName. Exiting."); } #If the user asked to check mtime and the timestamps are not equal, something's wrong if ( ($DatesEqual) && ($Fsb->mtime != $Ssb->mtime) ) { Debug 0, "File1: $InodeOfFile{$BaseFile}, Associated files: @{$FilesOfInode{$InodeOfFile{$BaseFile}}}, md5sum: $KnownMd5sums{$InodeOfFile{$BaseFile}}."; Debug 0, "File2: $InodeOfFile{$LinkName}, Associated files: @{$FilesOfInode{$InodeOfFile{$LinkName}}}, md5sum: $KnownMd5sums{$InodeOfFile{$LinkName}}."; die ("LinkFile: mtime paranoid check failed! Please check failure in linking $BaseFile and $LinkName. Exiting."); } Debug 2, " Paranoid checks passed for $BaseFile and $LinkName."; } #Actually link and check return code if ($ActuallyLink) { my $Ssb=stat($LinkName); #Have to grab stat before link or else you're looking at nlinks of the merged inode. #FIXME - how to handle case where LinkName unlinked but link call fails? if (unlink($LinkName) && link($BaseFile,$LinkName)) { Debug 1, " linked $BaseFile $LinkName"; if ($Ssb->nlink == 1) { #undef LinkName md5sum here, we just removed the last dentry pointing at the file. #FIXME - do we need to check if it exists before undef'ing? #if (defined($KnownMd5sums{$InodeOfFile{$LinkName}})) { undef $KnownMd5sums{$InodeOfFile{$LinkName}}; #} #if (defined($NewMd5sums{$InodeOfFile{$LinkName}})) { undef $NewMd5sums{$InodeOfFile{$LinkName}}; #} $SpaceSaved += $Ssb->size; } #Add $LinkName to the list of files on the same inode as $BaseFile push @{$FilesOfInode{$InodeOfFile{$BaseFile}}}, $LinkName; #FIXME Should/could we strip the ProcessInodesOfSize::$Tail directly instead? Not sure it would influence the link. #Perhaps ProcessInodesOfSize could use a hand crafted walk through an array (that this routine could modify directly) instead? #Strip $LinkName from $FilesOfInode{$InodeOfFile{$LinkName}} my @TempFiles = @{$FilesOfInode{$InodeOfFile{$LinkName}}}; if ($#TempFiles == -1) { die "Empty FOI-IOF array for $LinkName, $InodeOfFile{$LinkName}, shouldn't happen."; } elsif ($#TempFiles == 0) { if ( ($Paranoid) && ($FilesOfInode{$InodeOfFile{$LinkName}}[0] ne $LinkName) ) { die "Single Element list $FilesOfInode{$InodeOfFile{$LinkName}}[0] doesn't match $LinkName."; } #Only a single element, undef it undef $FilesOfInode{$InodeOfFile{$LinkName}}; } else { #At least 2 array elements undef $FilesOfInode{$InodeOfFile{$LinkName}}; #Start fresh foreach my $AFileName (@TempFiles) { if ($AFileName ne $LinkName) { if (defined($FilesOfInode{$InodeOfFile{$LinkName}})) { push @{$FilesOfInode{$InodeOfFile{$LinkName}}}, $AFileName; } else { $FilesOfInode{$InodeOfFile{$LinkName}} = [ $AFileName ]; } } } } #Setting the correct Inode for this file must come after the above. $InodeOfFile{$LinkName}=$InodeOfFile{$BaseFile}; } else { Debug 1, " Failed to link $BaseFile $LinkName"; } } else { Debug 1, " Would have linked $BaseFile $LinkName"; } } #End sub LinkFiles sub LinkInodes { my $FirstInode = shift; my $SecondInode = shift; my $PreferredInode; my @FirstFilenames = @{$FilesOfInode{$FirstInode}}; my @SecondFilenames = @{$FilesOfInode{$SecondInode}}; my $Fsb=stat($FirstFilenames[0]); my $Ssb=stat($SecondFilenames[0]); if (! defined($FirstFilenames[0])) { Debug 3, "No filenames for $FirstInode, why?."; return; } if (! defined($SecondFilenames[0])) { Debug 3, "No filenames for $SecondInode, why?."; return } #Show progress with file size display if ($LastSizeLinked != $Fsb->size) { $LastSizeLinked = $Fsb->size; Debug 1, " " . $Fsb->size; } #Who links to who? First, make the choice: #If one of the inodes is a more sparse file, we link to that. In the end that gives more space savings if ($Fsb->blocks < $Ssb->blocks) { #Link SecondFiles to more sparse FirstInode Debug 3, " First more sparse."; #The files are stripped from FilesOfInode by LinkFiles one by one as they're processed. That's OK. $PreferredInode="first"; } elsif ($Fsb->blocks > $Ssb->blocks) { Debug 3, " Second more sparse."; $PreferredInode="second"; #Next, if one of the files is older (smaller modification time) link both to the older inode. } elsif ($Fsb->mtime > $Ssb->mtime) { #First file newer, link it to Second Debug 3, " First newer."; $PreferredInode="second"; } elsif ($Ssb->mtime > $Fsb->mtime) { #Second file newer, link it to First Debug 3, " Second newer."; $PreferredInode="first"; #Finally, if they use the same amount of space on disk and have the same mtime, see if one has more links than the other and glue both to the inode with more links. } elsif ($Ssb->nlink > $Fsb->nlink) { #Second inode has more hardlinks, link all firsts to it Debug 3, " Second more hardlinks."; $PreferredInode="second"; #(If they have the same amount of links or the first has more links, we'll hit this case and simply link any second files to the first inode by default.) } else { Debug 3, " First more hardlinks or equal."; $PreferredInode="first"; } #Second, actually perform the links (or at least record estimated savings, which needs to be done here at the inode level) if ($PreferredInode eq "first") { #Get estimated space savings on dry run if ($ActuallyLink) { foreach my $OneSecondFilename (@SecondFilenames) { #Link all second inode filenames to the preferred first inode LinkFiles $FirstFilenames[0], $OneSecondFilename; } } else { #Debug 4, "FirEstUpdate: " . @SecondFilenames . "," . $Ssb->nlink; if (@SecondFilenames == $Ssb->nlink) { $EstimatedSpaceSaved += $Ssb->size; } elsif (@SecondFilenames > $Ssb->nlink) { die @SecondFilenames . " second filenames can't be larger than " . $Ssb->nlink . ", why is it?"; } #no savings, nothing to do if (@SecondFilenames < $Ssb->nlink) foreach my $OneSecondFilename (@SecondFilenames) { #Link all second inode filenames to the preferred first inode #This is a mini version of LinkFiles for ActuallyLink=no if ( ($FileNamesEqual) && (basename($FirstFilenames[0]) ne basename($OneSecondFilename)) ) { Debug 3, $FirstFilenames[0] . " and $OneSecondFilename have different filenames at the end, not linking."; return; } else { Debug 1, " Would have linked " . $FirstFilenames[0] . " $OneSecondFilename"; } } } } elsif ($PreferredInode eq "second") { if ($ActuallyLink) { foreach my $OneFirstFilename (@FirstFilenames) { #Link all first inode filenames to the preferred second inode LinkFiles $SecondFilenames[0], $OneFirstFilename; } } else { #Debug 4, "SecEstUpdate: " . @FirstFilenames . "," . $Fsb->nlink; if (@FirstFilenames == $Fsb->nlink ) { $EstimatedSpaceSaved += $Fsb->size; } elsif (@FirstFilenames > $Fsb->nlink ) { die @FirstFilenames . " first filenames can't be larger than " . $Fsb->nlink . ", why is it?"; } #no savings, nothing to do if (@FirstFilenames < $Fsb->nlink ) foreach my $OneFirstFilename (@FirstFilenames) { #Link all first inode filenames to the preferred second inode #This is a mini version of LinkFiles for ActuallyLink=no if ( ($FileNamesEqual) && (basename($SecondFilenames[0]) ne basename($OneFirstFilename)) ) { Debug 3, $SecondFilenames[0] . " and $OneFirstFilename have different filenames at the end, not linking."; return; } else { Debug 1, " Would have linked " . $SecondFilenames[0] . " $OneFirstFilename"; } } } } else { die "Internal error, PreferredInode is $PreferredInode."; } } #End sub LinkInodes sub CheckForLinkableInodes { my $FirstInode = shift; my $SecondInode = shift; #FIXME - printing packed format #Debug 2, " Comparing $FirstInode to $SecondInode"; #Here we're using the file characteristics encoded in the InodeSpec to indentify candidates for compare. If Paranoid is turned on, we'll re-verify #all this just before linking. Turning Paranoid off risks problems with files being modified under us or a checksum cache with invalid entries. my ($Fdev, $Fino, $Fmode, $Fuid, $Fgid, $Fsize, $Fmtime, $Fctime) = unpack("SLSSSLLL", $FirstInode); #Debug 4, "$Fdev, $Fino, $Fmode, $Fuid, $Fgid, $Fsize, $Fmtime, $Fctime"; my ($Sdev, $Sino, $Smode, $Suid, $Sgid, $Ssize, $Smtime, $Sctime) = unpack("SLSSSLLL", $SecondInode); #Debug 4, "$Sdev, $Sino, $Smode, $Suid, $Sgid, $Ssize, $Smtime, $Sctime"; if ($Fdev == $Sdev) { #Same device if ($Fino == $Sino) { Debug 1, " Tried to link identical Inodes, should not have happened."; } else { #Same device, different inodes. Can we link them? if ( ($Fmode == $Smode) && ($Fuid == $Suid) && ($Fgid == $Sgid) && ($Fsize == $Ssize) ) { #Same device, different inodes, same base characteristics. Check modification time if the user wanted it. #The following loosely translates to "Continue on with the link checks if the user didn't care or the files have the same time anyways." if ( (!($DatesEqual)) || ($Fmtime == $Smtime) ) { #Same device, different inodes, same characteristics. Checksums match? #Note - we can't check for FileNamesEqual here. We'll leave that until we actually have filenames to compare and check #that in LinkFiles if (defined($FilesOfInode{$FirstInode}) && defined($FilesOfInode{$SecondInode})) { #@{$FilesOfInode{$FirstInode}}[0] is the first filename associated with $FirstInode #@{$FilesOfInode{$SecondInode}}[0] is the first filename associated with $SecondInode if ( Md5sumOf(@{$FilesOfInode{$FirstInode}}[0]) eq Md5sumOf(@{$FilesOfInode{$SecondInode}}[0]) ) { #DO NOT use == for md5sums; the sum appears to overflow perl integers, or ignore chars perhaps #my $FirstSumDebug=Md5sumOf(@{$FilesOfInode{$FirstInode}}[0]); #my $SecondSumDebug=Md5sumOf(@{$FilesOfInode{$SecondInode}}[0]); #Debug 4, "Sum1: $FirstSumDebug, Sum2: $SecondSumDebug"; Debug 2, " Identical, linking @{$FilesOfInode{$FirstInode}}[0] and @{$FilesOfInode{$SecondInode}}[0] and any other filenames."; LinkInodes $FirstInode, $SecondInode; } else { Debug 2, " Checksums don't match."; } } else { Debug 3, " Ignoring stripped file."; } } else { Debug 2, " Not linking, different mtimes and user specified DatesEqual."; } } else { Debug 2, " Can't hardlink, different attributes."; } } } else { Debug 3, " Different devices, no chance to link."; } } #End sub CheckForLinkableInodes sub ProcessInodesOfSize { #This is the only function that uses $InodesOfSize, so it can undef those entries. #clean up? %InodesOfSize{Size} - remove inode from list on last unlink? Not sure this is good my $OneSize=shift; if ( $#{$InodesOfSize{$OneSize}} > 0) { Debug 2, " More than one inode of size $OneSize."; $MultipleInodeSizes++; my @CurrentInodes = @{$InodesOfSize{$OneSize}}; #Here we load $InodeOfFile{} with _just_ the files/inodes of the current size, pulling from FilesOfInode{InodesOfSize} %InodeOfFile = ( ); foreach my $InodeToIndex (@CurrentInodes) { foreach my $FileToIndex (@{$FilesOfInode{$InodeToIndex}}) { $InodeOfFile{$FileToIndex} = $InodeToIndex; } } my $Head = shift @CurrentInodes; #$InodesOfSize{$OneSize} will get undef'd at the end of this subroutine while (defined(@CurrentInodes[0])) { #FIXME Should probably use a global list and strip here and in LinkFiles. Note, if we do this, either $Head or $OneTail may disappear from under us. foreach my $OneTail (@CurrentInodes) { CheckForLinkableInodes $Head, $OneTail; } $Head = shift @CurrentInodes; #Drop the head completely, we've compared that inode to all the others, and run the loop again with head being the head of the tail. } #Clear InodeOfFile entirely as we're done with this file size. %InodeOfFile = ( ); } else { #We technically shouldn't get here as we discarded solitary inodes earlier, but it's hardly fatal. #Only one inode? No chance of hardlinking, ignore this inode. Debug 3, " (PIOS) Only one inode of size $OneSize."; $SolitaryInodeSizes++; } my @CurrentInodes = @{$InodesOfSize{$OneSize}}; foreach my $OneInode (@CurrentInodes) { undef ($FilesOfInode{$OneInode}); } undef ($InodesOfSize{$OneSize}); } #End sub ProcessInodesOfSize my $USAGEMSG = < File that holds cached queries and responses ($CacheFile) * --datesequal|-d Require that the modification dates and times be equal before linking ($DatesEqual) --filenamesequal|-f Require that the two (pathless) filenames be equal before linking ($FileNamesEqual) --help|-h This help message --minsize|-m= Only consider files larger than this number of bytes ($MinSize) --paranoid|-p Recheck all file stats and completely compare every byte of the files just before linking. This should definitely be left on unless you are _positive_ that the md5 checksum cache is correct and there's no chance that files will be modified behind freedups' back. ($Paranoid) --quiet|-q Show almost nothing; forces verbosity to 0. --verbose|-v Show more detail (Default verbosity=$Verbosity) * For security reasons, this file must be created before starting freedups or it will not be used at all. Examples: To report on what files could be linked under any kernel source trees and preload the md5sum cache, but not actually link them: freedups /usr/src/linux-* To link identical files in those trees: freedups -a /usr/src/linux-* To be more strict; the modification time and filename need to be equal before two files can be linked: freedups -a --datesequal=yes -f /usr/doc /usr/share/doc Only link files with 1001 or more bytes. freedups --actuallylink=yes -m 1000 /usr/src/linux-* /usr/src/pcmcia-* USAGE #Load command line params. Directories to be scanned are left in ARGV so we can pull them with shift in a moment. die "$USAGEMSG" unless GetOptions( 'actuallylink|a!' => \$ActuallyLink, 'cachefile=s' => \$CacheFile, 'datesequal|d!' => \$DatesEqual, 'filenamesequal|f!' => \$FileNamesEqual, 'help|h' => \$Help, 'minsize|m=i' => \$MinSize, 'paranoid|p!' => \$Paranoid, 'quiet|q' => sub { $Verbosity = 0 }, 'verbose|v+' => \$Verbosity ); die "$USAGEMSG" if $Help; #Start main code print "Freedups Version $FreedupsVer\n"; print "Options Chosen: "; print "ActuallyLink " if $ActuallyLink; print "DatesEqual " if $DatesEqual; print "FileNamesEqual " if $FileNamesEqual; #If Help set, we won't get this far print "Paranoid " if $Paranoid; print "None " if (!( ($ActuallyLink) || ($DatesEqual) || ($FileNamesEqual) || ($Paranoid) )); print "Verbosity=$Verbosity "; print "CacheFile=$CacheFile "; my $SmallestFileSize = $MinSize + 1; print "MinSize=$MinSize (only consider files $SmallestFileSize bytes and larger) "; undef $SmallestFileSize; print "\n"; #Load dir specs from command line while (my $OneSpec = shift) { Debug 1, "Starting to scan $OneSpec"; #Check that it exists first if (-e "$OneSpec") { File::Find::find(\&wanted, $OneSpec); #subroutine could also be written {wanted => \&wanted} #This calls IndexFile(the_found_filename) which puts file info into the inode and file arrays. $NumSpecs++ } else { die "Could not find anything named $OneSpec, exiting.\n"; } } if ($NumSpecs == 0) { die "$USAGEMSG\nNo directories specified, exiting.\n"; } print "Finished loading filenames and characteristics, starting to load md5 checksum cache from $CacheFile.\n"; LoadSums $CacheFile; #Wait until we've verified the filespecs before loading the cache as this can take time. print "Finished loading checksum cache, starting to discard solitary inodes.\n"; #Undef all single inode sizes right now, freeing up memory foreach my $OneSize (keys %InodesOfSize) { if ($#{$InodesOfSize{$OneSize}} == 0) { #Only one inode? No chance of hardlinking, discard this inode. Debug 3, " (Early Discard) Only one inode of size $OneSize."; $SolitaryInodeSizes++; #Debug 4, "About to undef " . $OneSize . ", " . $InodesOfSize{$OneSize}[0] . " and " . scalar $FilesOfInode{$InodesOfSize{$OneSize}[0]}[0]; my @CurrentInodes = @{$InodesOfSize{$OneSize}}; foreach my $OneInode (@CurrentInodes) { undef ($FilesOfInode{$OneInode}); } undef ($InodesOfSize{$OneSize}); } } print "Finished discarding $SolitaryInodeSizes solitary inodes, starting to process inodes.\n"; #FIXME - forget the sort and take them in whatever order they show up. (probably not that much savings.) foreach my $OneSize (sort {$b <=> $a} (keys %InodesOfSize)) { #Process the inodes, starting with the largest. if (defined $InodesOfSize{$OneSize}) { Debug 2, "Processing files of size $OneSize"; ProcessInodesOfSize $OneSize; } } print "Finished processing inodes, appending new md5sums.\n"; SaveSums $CacheFile; print "Finished saving md5sums.\n"; print "$NumSpecs file specs searched.\n"; print "$UniqueFilesScanned Unique files scanned.\n"; print "$UniqueInodesScanned Unique inodes scanned.\n"; print "Cached checksums: $CachedSums, From disk checksums: $FromDiskSums.\n"; if ($ActuallyLink) { print "Space saved: $SpaceSaved\n"; } elsif ($EstimatedSpaceSaved == 0) { print "No space would have been saved.\n"; } else { print "Up to $EstimatedSpaceSaved bytes would have been saved.\n"; } print "$SolitaryInodeSizes file sizes for which there was a single inode.\n"; print "$MultipleInodeSizes file sizes for which there was more than one inode.\n";