#!/usr/bin/perl
#Copyright 2002, William Stearns
#Released under the GPL.

#FIXME - race where file replaced long after stat.
#FIXME - on ctrl-C perhaps write out cache.
#FIXME - Likely nasty failure cases if a file has 4 consecutive :'s in its name.
#FIXME - print stats on number of inodes, etc.
#FIXME - Progress headers
#FIXME - support minsize and maxsize range
#FIXME - careful walk through bash version to compare.
#FIXME - replace prints with appropriate Debug levels.
#FIXME - updates every N (hardcoded) seconds on what file size working on.
#FIXME - print stats on which criteria used to decide who links to who
#FIXME - add a compress option?

use strict;
use warnings;
use File::Find ();
use File::Compare;
use File::Basename;
use File::stat;		#Note: this replaces the builtin stat/lstat with object-returning versions.
use Digest::MD5;
use IO::File;
use Getopt::Long;

#Aliases generated by find2perl; kept so any code relying on $name/$dir/$prune still works.
use vars qw/*name *dir *prune/;
*name   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

use constant DEFAULT_CACHE_FILE => '/var/cache/md5sum-v1.cache';

my $FreedupsVer="0.5.4";

my %md5sums = ( );		#Key is device/inode/crucial_characteristics, value is md5sum of that inode. On-disk cache is loaded into this and saved to from this.
#The following hashes store info about files in the requested directory trees only; there could be others in the filesystem that freedups wasn't asked to index.
my %InodesOfSize;		#All the inodes of the size key. This is a hash whose values are arrays.
my %InodeOfFile;		#Provides the InodeSpec of the Filename key
my %FilesOfInode;		#The filenames associated with the inode key. This is a hash whose values are arrays.
my $NumSpecs = 0;		#Number of command line directory/file specs in which to search for candidate files
my %IndexedFiles = ( );		#Used to prune out duplicates
my $CachedSums = 0;		#How many checksums we were able to pull from cache.
my $FromDiskSums = 0;		#How many checksums we had to pull from media.
my $SpaceSaved = 0;		#How many bytes saved. Takes into account whether we're removing the last link to the file or other links exist.
my $SolitaryInodeSizes = 0;	#Sizes for which there was only one inode.
my $MultipleInodeSizes = 0;	#Sizes for which there was more than one inode.
my $UniqueFilesScanned = 0;	#How many unique filenames we inspected

#User options:
#1 (true) or 0 (false)
my $ActuallyLink = 0;			#Actually link the files. Otherwise, just report on potential savings and preload the md5sum cache.
my $CacheFile = DEFAULT_CACHE_FILE;	#File that holds the inode=md5sum cache between runs. Must be created before program runs.
my $DatesEqual = 0;			#Only link files if their modification times are identical
my $FileNamesEqual = 0;			#Require that the two (pathless) filenames be equal before linking.
my $Help = 0;				#Show help
my $MinSize = 0;			#Files _larger_ than this many bytes are considered for linking. 0 byte files are _never_ considered.
#I've found at least one program bug (don't use == with alphanumeric md5sums) with Paranoid. I recommend leaving it on for now.
my $Paranoid = 1;			#Set to 1 to force a strict compare just before pre-link.
my $Verbosity = 1;			#0 = Just intro and stats, 1 = Normal, 2 = some debugging, 3 = Show me everything!

#FIXME - prompts have not been strictly checked.


#Print a message if the current verbosity is at least the given level.
#Params: level (integer), message (string, printed verbatim).
sub Debug {
    my $DebugLevel = shift;
    if ($Verbosity >= $DebugLevel) {
        my $DebugString = shift;
        print "$DebugString";
    }
} #End sub Debug


#Load "inodespec=md5sum" lines from the cache file into %md5sums.
#Params: cache filename. A missing/unreadable file only produces a warning.
sub LoadSums {
    my $cache_filename = shift;

    if (my $cache_read_fh = IO::File->new($cache_filename, O_RDONLY)) {	# |O_CREAT not used, security risk
        my $loaded_pairs = 0;
        while (defined(my $cache_line = <$cache_read_fh>)) {
            #process one cache entry from local file.
            chomp $cache_line;
            my ($cache_inodespec, $cache_md5sum) = split(/=/, $cache_line, 2);
            #Skip blank or malformed lines rather than storing undef entries.
            next unless defined($cache_inodespec) && defined($cache_md5sum);
            $md5sums{$cache_inodespec} = $cache_md5sum;
            $loaded_pairs++;
        }
        close $cache_read_fh;
        Debug 2, "Initial load: loaded $loaded_pairs cached md5sums from $cache_filename.\n";
    } else {
        #Warn about missing or unreadable cache file.
        print "Local cache file $cache_filename unavailable or unreadable (create it if it's not there and check permissions, please): $!\n";
    }
} #End sub LoadSums


#Write %md5sums back to the cache file as sorted "inodespec=md5sum" lines.
#Params: cache filename. The file must already exist; it is never created here.
sub SaveSums {
    my $cache_filename = shift;

    # |O_CREAT not used, security risk. O_TRUNC so a shorter cache doesn't leave stale bytes from the previous one at the end.
    if (my $cache_write_fh = IO::File->new("$cache_filename", O_WRONLY|O_TRUNC)) {
        foreach my $key (sort(keys %md5sums)) {
            next unless defined $md5sums{$key};
            print {$cache_write_fh} $key, '=', $md5sums{$key}, "\n";
        }
        #Buffered write errors (disk full, etc.) only show up at close time.
        close $cache_write_fh
            or print "Error while closing local cache file $cache_filename, cache may be incomplete: $!.\n";
    } else {
        #Warn about missing or unwritable cache file.
        print "Local cache file $cache_filename unavailable or unwritable for storing new entries (create it if it's not there and check permissions, please): $!.\n";
    }
} #End sub SaveSums


#Build the InodeSpec key (dev/ino/mode/uid/gid/size/mtime/ctime) from a File::stat object.
sub InodeSpecOf {
    my $sb = shift;
    return join("/", $sb->dev, $sb->ino, $sb->mode, $sb->uid, $sb->gid, $sb->size, $sb->mtime, $sb->ctime);
} #End sub InodeSpecOf


#Return the hex md5sum of a file, using %md5sums as a cache keyed by InodeSpec.
#Params: filename. Dies if no name is given or the file can't be stat'd/opened.
#Side effects: updates %md5sums, %InodeOfFile, $CachedSums and $FromDiskSums.
sub Md5sumOf {
    my $SumFile = shift;
    die "No file specified in Md5sumOf: $!" unless (defined($SumFile) && length($SumFile));

    #Shouldn't ever need this, we should have stat'd all at the beginning, right after find.
    if (! defined($InodeOfFile{$SumFile})) {
        my $sb = stat($SumFile) or die "Can't stat '$SumFile': $!";
        $InodeOfFile{$SumFile} = InodeSpecOf($sb);
        Debug 3, "Had to manually load InodeOfFile in Md5sumOf for file $SumFile, why?.\n";
    }
    my $InodeSpec = $InodeOfFile{$SumFile};

    if (defined($md5sums{$InodeSpec})) {
        $CachedSums++;
        Debug 3, "Checksum of $InodeSpec came from cache.\n";
    } else {
        $FromDiskSums++;
        #Three-argument open: filenames come from the filesystem and may contain characters two-argument open would treat as a mode.
        open(my $sum_fh, '<', $SumFile) or die "Can't open '$SumFile': $!";
        binmode($sum_fh);
        $md5sums{$InodeSpec} = Digest::MD5->new->addfile($sum_fh)->hexdigest;
        close($sum_fh);
        Debug 3, "Checksum of $InodeSpec came from physical disk.\n";
    }
    Debug 3, "File: $SumFile, Sum: $md5sums{$InodeSpec}.\n";
    return $md5sums{$InodeSpec};
} #End sub Md5sumOf


#Record one filename in %InodeOfFile, %FilesOfInode and %InodesOfSize.
#Params: filename. Filenames already seen (tracked in %IndexedFiles) are ignored.
sub IndexFile {
    my $OneFile = shift;

    return if $IndexedFiles{$OneFile}++;

    my $sb = stat($OneFile);
    if (!$sb) {
        #File vanished or became unreadable between find and stat; skip it instead of crashing.
        Debug 1, "Could not stat $OneFile, skipping: $!\n";
        return;
    }

    $UniqueFilesScanned++;
    Debug 3, "adding $OneFile\n";

    my $InodeSpec = InodeSpecOf($sb);
    my $FileSize = $sb->size;

    $InodeOfFile{$OneFile} = $InodeSpec;

    #An InodeSpec is put on its size list exactly when its filename list is first created,
    #so "already has filenames" tells us whether it's already in InodesOfSize without scanning that list.
    my $InodeAlreadyKnown = defined($FilesOfInode{$InodeSpec});
    push @{$FilesOfInode{$InodeSpec}}, $OneFile;

    if ($InodeAlreadyKnown) {
        Debug 3, " NOT Adding $InodeSpec to InodesOfSize, already there\n";
    } else {
        if (defined($InodesOfSize{$FileSize})) {
            Debug 3, " Adding $InodeSpec to InodesOfSize\n";
        }
        push @{$InodesOfSize{$FileSize}}, $InodeSpec;
    }
} #End sub IndexFile


#File::Find callback. Function provided by find2perl ... -xdev -type f -a -size +3366c
#Indexes regular files (not symlinks) larger than $MinSize bytes.
sub wanted {
    #CORE::lstat is used explicitly: File::stat replaces lstat, and the -f _ / -s _ tests below need the builtin's stat buffer.
    #We don't prune on device here; dev is compared later, allowing us to link files on multiple devs.
    my @StatFields = CORE::lstat($_);
    return unless @StatFields;
    return unless -f _;
    return unless (int(-s _) > $MinSize);
    IndexFile($File::Find::name);
} #End sub wanted


#True if two File::stat objects agree on mode, uid, gid and size.
sub StatsMatch {
    my ($Fsb, $Ssb) = @_;
    return ( ($Fsb->mode == $Ssb->mode) && ($Fsb->uid == $Ssb->uid) && ($Fsb->gid == $Ssb->gid) && ($Fsb->size == $Ssb->size) );
} #End sub StatsMatch


#Print what we know about both files involved in a failed paranoid check.
sub ReportLinkPair {
    my ($BaseFile, $LinkName) = @_;
    my $Label = 1;
    foreach my $OneFile ($BaseFile, $LinkName) {
        my $Spec = $InodeOfFile{$OneFile};
        my @Associated = @{ $FilesOfInode{$Spec} || [] };
        print "File${Label}: $Spec, Associated files: @Associated, md5sum: $md5sums{$Spec}.\n";
        $Label++;
    }
} #End sub ReportLinkPair


#Replace $LinkName with a hardlink to $BaseFile without ever leaving $LinkName missing.
#Links to a temporary name in the same directory, then renames it over $LinkName.
#Returns 1 on success, 0 on failure (in which case $LinkName is untouched).
sub ReplaceWithLink {
    my ($BaseFile, $LinkName) = @_;
    my $TempName = dirname($LinkName) . "/.freedups-$$.tmp";

    link($BaseFile, $TempName) or return 0;
    my $Renamed = rename($TempName, $LinkName);
    #On failure the temp link must go; on success rename() is a no-op when both names already
    #share an inode, which would leave the temp name behind. Either way, make sure it's gone.
    unlink($TempName);
    return $Renamed ? 1 : 0;
} #End sub ReplaceWithLink


#Make $LinkName a hardlink to $BaseFile (or just report it if $ActuallyLink is off).
#Params: BaseFile (stays as is), LinkName (ends up as a link to BaseFile).
#With $Paranoid set, re-verifies stats and compares the files byte for byte first, and dies on any mismatch.
#Side effects: updates %FilesOfInode, %InodeOfFile and $SpaceSaved after a successful link.
sub LinkFiles {
    #FIXME - this modifies %FilesOfInode and $InodeOfFile. Check to see if upper layers care. (partially checked)
    #clean up? %md5sums{InodeSpec} - remove inode entry on last unlink. Wait, we can't, there may be links outside of the trees we're scanning.
    #clean up? %InodeOfFile{Filename} - Reset to new inode after hardlink and reinstate Identical Inode warning. (DONE)
    #clean up? %FilesOfInode{InodeSpec} - Move file from old inode to new (DONE)
    my $BaseFile = shift;	#Filename of file which will stay as is
    my $LinkName = shift;	#Filename which will end up as a link to BaseFile

    if ( ($FileNamesEqual) && (basename($BaseFile) ne basename($LinkName)) ) {
        Debug 3, "$BaseFile and $LinkName have different filenames at the end, not linking.\n";
        return;
    }

    if ($Paranoid) {
        #Check that file hasn't been modified since it was stat'd right after find. I'm aborting the program if changes occur; that tends to point
        #to a file that's actively being modified. This shouldn't happen.
        #Note that the following are duplicate checks; the file has already passed these once. Failing now means that file(s) is/are actively being
        #changed under us.
        my $Fsb = stat($BaseFile);
        my $Ssb = stat($LinkName);
        if (!$Fsb || !(-e "$BaseFile")) {
            die ("LinkFile: $BaseFile no longer exists or is not a file anymore. Exiting.");
        }
        if (!$Ssb || !(-e "$LinkName")) {
            die ("LinkFile: $LinkName no longer exists or is not a file anymore. Exiting.");
        }
        if (!StatsMatch($Fsb, $Ssb)) {
            ReportLinkPair($BaseFile, $LinkName);
            die ("LinkFile: paranoid stat checks failed! Please check failure in linking $BaseFile and $LinkName. Exiting.");
        }
        if (compare("$BaseFile","$LinkName") != 0) {	#Not equal (or compare error)
            ReportLinkPair($BaseFile, $LinkName);
            die ("LinkFile: paranoid file comparison failed for $BaseFile and $LinkName, please check why. Exiting.");
        }
        #Refresh stat blocks in case either changed during file compare.
        $Fsb = stat($BaseFile) or die ("LinkFile: $BaseFile no longer exists or is not a file anymore. Exiting.");
        $Ssb = stat($LinkName) or die ("LinkFile: $LinkName no longer exists or is not a file anymore. Exiting.");
        if (!StatsMatch($Fsb, $Ssb)) {
            ReportLinkPair($BaseFile, $LinkName);
            die ("LinkFile: Second paranoid stat checks failed! Please check failure in linking $BaseFile and $LinkName. Exiting.");
        }
        #If the user asked to check mtime and the timestamps are not equal, something's wrong
        if ( ($DatesEqual) && ($Fsb->mtime != $Ssb->mtime) ) {
            ReportLinkPair($BaseFile, $LinkName);
            die ("LinkFile: mtime paranoid check failed! Please check failure in linking $BaseFile and $LinkName. Exiting.");
        }
        Debug 2, " Paranoid checks passed for $BaseFile and $LinkName.\n";
    }

    if (!$ActuallyLink) {
        Debug 1, " Would have linked $BaseFile $LinkName\n";
        return;
    }

    #Actually link and check return code
    my $Ssb = stat($LinkName);	#Have to grab stat before link or else you're looking at nlinks of the merged inode.
    if (!($Ssb && ReplaceWithLink($BaseFile, $LinkName))) {
        Debug 1, " Failed to link $BaseFile $LinkName\n";
        return;
    }

    Debug 1, " linked $BaseFile $LinkName\n";
    if ($Ssb->nlink == 1) {
        #That was the last name for the old inode, so its space is actually freed.
        $SpaceSaved += $Ssb->size;
    }

    #Add $LinkName to the list of files on the same inode as $BaseFile
    push @{$FilesOfInode{$InodeOfFile{$BaseFile}}}, $LinkName;

    #FIXME Should/could we strip the ProcessInodesOfSize::$Tail directly instead? Not sure it would influence the link.
    #Perhaps ProcessInodesOfSize could use a hand crafted walk through an array (that this routine could modify directly) instead?
    #Strip $LinkName from the filename list of the inode it used to belong to.
    my $OldSpec = $InodeOfFile{$LinkName};
    my @OldFiles = @{ $FilesOfInode{$OldSpec} || [] };
    if (!@OldFiles) {
        die "Empty FOI-IOF array for $LinkName, $OldSpec, shouldn't happen.";
    } elsif (@OldFiles == 1) {
        if ( ($Paranoid) && ($OldFiles[0] ne $LinkName) ) {
            die "Single Element list $OldFiles[0] doesn't match $LinkName.";
        }
        #Only a single element, undef it
        undef $FilesOfInode{$OldSpec};
    } else {
        #At least 2 array elements; keep everything except $LinkName (undef if nothing is left).
        my @Remaining = grep { $_ ne $LinkName } @OldFiles;
        if (@Remaining) {
            $FilesOfInode{$OldSpec} = [ @Remaining ];
        } else {
            undef $FilesOfInode{$OldSpec};
        }
    }
    #Setting the correct Inode for this file must come after the above.
    $InodeOfFile{$LinkName} = $InodeOfFile{$BaseFile};
} #End sub LinkFiles


#Merge two inodes known to hold identical content: pick which inode survives, then link every
#filename of the other inode to it.
#Params: FirstInode, SecondInode (InodeSpec strings).
sub LinkInodes {
    my $FirstInode = shift;
    my $SecondInode = shift;

    my @FirstFilenames = @{ $FilesOfInode{$FirstInode} || [] };
    my @SecondFilenames = @{ $FilesOfInode{$SecondInode} || [] };

    #Check we have filenames before trying to stat them.
    if (! defined($FirstFilenames[0])) {
        Debug 3, "No filenames for $FirstInode, why?.\n";
        return;
    }
    if (! defined($SecondFilenames[0])) {
        Debug 3, "No filenames for $SecondInode, why?.\n";
        return;
    }

    #mtime is the 7th field of the InodeSpec.
    my $Fmtime = (split(/\//, $FirstInode))[6];
    my $Smtime = (split(/\//, $SecondInode))[6];

    my $Fsb = stat($FirstFilenames[0]) or die "LinkInodes: cannot stat $FirstFilenames[0]: $!";
    my $Ssb = stat($SecondFilenames[0]) or die "LinkInodes: cannot stat $SecondFilenames[0]: $!";

    #Who links to who?
    my $KeepFirst;
    if ($Fsb->blocks < $Ssb->blocks) {
        #If one of the inodes is a more sparse file, we link to that. In the end that gives more space savings
        Debug 3, " First more sparse.\n";
        $KeepFirst = 1;
    } elsif ($Fsb->blocks > $Ssb->blocks) {
        Debug 3, " Second more sparse.\n";
        $KeepFirst = 0;
    } elsif ($Fmtime > $Smtime) {
        #Next, if one of the files is older (smaller modification time) link both to the older inode.
        Debug 3, " First newer.\n";
        $KeepFirst = 0;
    } elsif ($Smtime > $Fmtime) {
        Debug 3, " Second newer.\n";
        $KeepFirst = 1;
    } elsif ($Ssb->nlink > $Fsb->nlink) {
        #Finally, if they use the same amount of space on disk and have the same mtime, glue both to the inode with more links.
        Debug 3, " Second more hardlinks.\n";
        $KeepFirst = 0;
    } else {
        #Same number of links or the first has more: link any second files to the first inode by default.
        Debug 3, " First more hardlinks or equal.\n";
        $KeepFirst = 1;
    }

    #The files are stripped from FilesOfInode by LinkFiles one by one as they're processed. That's OK; we iterate over our own copies.
    if ($KeepFirst) {
        foreach my $OneSecondFilename (@SecondFilenames) {
            LinkFiles($FirstFilenames[0], $OneSecondFilename);
        }
    } else {
        foreach my $OneFirstFilename (@FirstFilenames) {
            LinkFiles($SecondFilenames[0], $OneFirstFilename);
        }
    }
} #End sub LinkInodes


#Decide whether two inodes are candidates for linking and, if their checksums match, link them.
#Params: FirstInode, SecondInode (InodeSpec strings).
sub CheckForLinkableInodes {
    my $FirstInode = shift;
    my $SecondInode = shift;

    Debug 2, " Comparing $FirstInode to $SecondInode\n";
    #Here we're using the file characteristics encoded in the InodeSpec to identify candidates for compare. If Paranoid is turned on, we'll re-verify
    #all this just before linking. Turning Paranoid off risks problems with files being modified under us or a checksum cache with invalid entries.
    my ($Fdev, $Fino, $Fmode, $Fuid, $Fgid, $Fsize, $Fmtime, $Fctime) = split(/\//, $FirstInode);
    my ($Sdev, $Sino, $Smode, $Suid, $Sgid, $Ssize, $Smtime, $Sctime) = split(/\//, $SecondInode);

    if ($Fdev != $Sdev) {
        Debug 3, " Different devices, no chance to link\n";
        return;
    }
    if ($Fino == $Sino) {
        print " Tried to link identical Inodes, should not have happened\n";
        return;
    }
    #Same device, different inodes. Can we link them?
    if ( ! ( ($Fmode == $Smode) && ($Fuid == $Suid) && ($Fgid == $Sgid) && ($Fsize == $Ssize) ) ) {
        Debug 2, " Can't hardlink, different attributes\n";
        return;
    }
    #Same base characteristics. Check modification time if the user wanted it.
    if ( ($DatesEqual) && ($Fmtime != $Smtime) ) {
        Debug 2, " Not linking, different mtimes and user specified DatesEqual.\n";
        return;
    }
    #Note - we can't check for FileNamesEqual here. We'll leave that until we actually have filenames to compare and check that in LinkFiles
    if ( ! (defined($FilesOfInode{$FirstInode}) && defined($FilesOfInode{$SecondInode})) ) {
        Debug 3, " Ignoring stripped file.\n";
        return;
    }

    my $FirstName = $FilesOfInode{$FirstInode}[0];	#First filename associated with $FirstInode
    my $SecondName = $FilesOfInode{$SecondInode}[0];	#First filename associated with $SecondInode
    #DO NOT use == for md5sums; they're hex strings, not numbers.
    if ( Md5sumOf($FirstName) eq Md5sumOf($SecondName) ) {
        Debug 2, " Identical, linking $FirstName and $SecondName and any other filenames.\n";
        LinkInodes($FirstInode, $SecondInode);
    } else {
        Debug 2, " Checksums don't match.\n";
    }
} #End sub CheckForLinkableInodes


#Compare every pair of inodes of one file size, linking where possible, then release the bookkeeping for that size.
#Params: file size (key of %InodesOfSize).
sub ProcessInodesOfSize {
    #This is the only function that uses $InodesOfSize, so it can undef those entries.
    #clean up? %InodesOfSize{Size} - remove inode from list on last unlink? Not sure this is good
    my $OneSize = shift;

    my @AllInodes = @{ $InodesOfSize{$OneSize} || [] };

    if (@AllInodes > 1) {
        Debug 2, " More than one inode of size $OneSize.\n";
        $MultipleInodeSizes++;
        my @CurrentInodes = @AllInodes;
        my $Head = shift @CurrentInodes;
        while (defined($CurrentInodes[0])) {
            #FIXME Should probably use a global list and strip here and in LinkFiles. Note, if we do this, either $Head or $OneTail may disappear from under us.
            foreach my $OneTail (@CurrentInodes) {
                CheckForLinkableInodes($Head, $OneTail);
            }
            #Drop the head completely, we've compared that inode to all the others, and run the loop again with head being the head of the tail.
            $Head = shift @CurrentInodes;
        }
    } else {
        #We technically shouldn't get here as we discarded solitary inodes earlier, but it's hardly fatal.
        #Only one inode? No chance of hardlinking, ignore this inode.
        Debug 3, " (PIOS) Only one inode of size $OneSize.\n";
        $SolitaryInodeSizes++;
    }

    foreach my $OneInode (@AllInodes) {
        undef ($FilesOfInode{$OneInode});
    }
    undef ($InodesOfSize{$OneSize});
} #End sub ProcessInodesOfSize


#NOTE(review): the first lines of this usage text (up to the --cachefile argument name) and the --minsize argument name
#were missing from the source; they are reconstructed here from the GetOptions spec below - please confirm the wording.
my $USAGEMSG = <<USAGE;
Usage: freedups [options] <directory or file>...
Options (defaults in parentheses):
  --actuallylink|-a     Actually link the files; otherwise just report on potential savings and
                        preload the md5sum cache ($ActuallyLink)
  --cachefile=<file>    File that holds cached queries and responses ($CacheFile) *
  --datesequal|-d       Require that the modification dates and times be equal before linking ($DatesEqual)
  --filenamesequal|-f   Require that the two (pathless) filenames be equal before linking ($FileNamesEqual)
  --help|-h             This help message
  --minsize|-m=<bytes>  Only consider files larger than this number of bytes ($MinSize)
  --paranoid|-p         Recheck all file stats and completely compare every byte of the files just before
                        linking. This should definitely be left on unless you are _positive_ that the md5
                        checksum cache is correct and there's no chance that files will be modified behind
                        freedups' back. ($Paranoid)
  --quiet|-q            Show almost nothing; forces verbosity to 0.
  --verbose|-v          Show more detail (Default verbosity=$Verbosity)

* For security reasons, this file must be created before starting freedups or it will not be used at all.

Examples:
To report on what files could be linked under any kernel source trees and preload the md5sum cache, but not actually link them:
    freedups /usr/src/linux-*
To link identical files in those trees:
    freedups -a /usr/src/linux-*
To be more strict; the modification time and filename need to be equal before two files can be linked:
    freedups -a --datesequal -f /usr/doc /usr/share/doc
Only link files with 1001 or more bytes.
    freedups --actuallylink -m 1000 /usr/src/linux-* /usr/src/pcmcia-*
USAGE


#Load command line params. Directories to be scanned are left in ARGV so we can pull them with shift in a moment.
die "$USAGEMSG" unless GetOptions(
    'actuallylink|a!'   => \$ActuallyLink,
    'cachefile=s'       => \$CacheFile,
    'datesequal|d!'     => \$DatesEqual,
    'filenamesequal|f!' => \$FileNamesEqual,
    'help|h'            => \$Help,
    'minsize|m=i'       => \$MinSize,
    'paranoid|p!'       => \$Paranoid,
    'quiet|q'           => sub { $Verbosity = 0 },
    'verbose|v+'        => \$Verbosity
);

die "$USAGEMSG" if $Help;


#Start main code
print "Freedups Version $FreedupsVer\n";
print "Options Chosen: ";
print "ActuallyLink " if $ActuallyLink;
print "DatesEqual " if $DatesEqual;
#NOTE(review): LinkFiles does check $FileNamesEqual, so the "not implemented yet" note below looks out of date - confirm before changing the output.
print "FileNamesEqual (NOTE - not implemented yet)! " if $FileNamesEqual;
#If Help set, we won't get this far
print "Paranoid " if $Paranoid;
print "None " if (!( ($ActuallyLink) || ($DatesEqual) || ($FileNamesEqual) || ($Paranoid) ));
print "Verbosity=$Verbosity ";
print "CacheFile=$CacheFile ";
my $SmallestFileSize = $MinSize + 1;
print "MinSize=$MinSize (only consider files $SmallestFileSize bytes and larger) ";
undef $SmallestFileSize;
print "\n";

#Load dir specs from command line. defined() so that a spec literally named "0" doesn't end the loop early.
while (defined(my $OneSpec = shift)) {
    print "Starting to scan $OneSpec\n";
    #Check that it exists first
    if (-e "$OneSpec") {
        #This calls IndexFile(the_found_filename) which puts file info into the inode and file arrays.
        File::Find::find(\&wanted, $OneSpec);	#subroutine could also be written {wanted => \&wanted}
        $NumSpecs++;
    } else {
        die "Could not find anything named $OneSpec, exiting.\n";
    }
}
undef %IndexedFiles;	#Only used to figure out if a given file is already indexed.

if ($NumSpecs == 0) {
    die "$USAGEMSG\nNo directories specified, exiting.\n";
}

print "Finished loading filenames and characteristics, starting to load md5 checksum cache.\n";

LoadSums($CacheFile);	#Wait until we've verified the filespecs before loading the cache as this can take time.

print "Finished loading checksum cache, starting to discard solitary inodes.\n";

#Undef all single inode sizes right now, freeing up memory
foreach my $OneSize (keys %InodesOfSize) {
    if ($#{$InodesOfSize{$OneSize}} == 0) {
        #Only one inode? No chance of hardlinking, discard this inode.
        Debug 3, " (Early Discard) Only one inode of size $OneSize.\n";
        $SolitaryInodeSizes++;
        my @CurrentInodes = @{$InodesOfSize{$OneSize}};
        foreach my $OneInode (@CurrentInodes) {
            undef ($FilesOfInode{$OneInode});
        }
        undef ($InodesOfSize{$OneSize});
    }
}

print "Finished discarding $SolitaryInodeSizes solitary inodes, starting to process inodes.\n";

#FIXME - forget the sort and take them in whatever order they show up. (probably not that much savings.)
foreach my $OneSize (sort {$b <=> $a} (keys %InodesOfSize)) {
    #Process the inodes, starting with the largest.
    if (defined $InodesOfSize{$OneSize}) {
        Debug 2, "Processing files of size $OneSize\n";
        ProcessInodesOfSize($OneSize);
    }
}

SaveSums($CacheFile);

print "$NumSpecs file specs searched.\n";
print "$UniqueFilesScanned Unique files scanned.\n";
print "Cached checksums: $CachedSums, From disk checksums: $FromDiskSums.\n";
print "Space saved: $SpaceSaved\n";
print "$SolitaryInodeSizes file sizes for which there was a single inode.\n";
print "$MultipleInodeSizes file sizes for which there was more than one inode.\n";


#$DatesEqual	mtime equal	continue to try to link?
#true		false		false
#true		true		true
#false		false		true
#false		true		true
# so negate $DatesEqual to get:
#!$DatesEqual	mtime equal	continue to try to link?
#false		false		false
#false		true		true
#true		false		true
#true		true		true


#Debugging
#print "@Allfiles\n";
#my $NumFiles=$#Allfiles + 1;
#print "$NumFiles (possibly non-unique) files retrieved from requested directories.\n";
#undef $NumFiles;

#Debugging: show the contents of the Inode array.
#print "now showing.\n";
#foreach my $OneSize (sort {$b <=> $a} (keys %InodesOfSize)) {
#	print "$OneSize\n";
#
#	foreach my $OneInode (split(/::::/, $InodesOfSize{$OneSize})) {		#No longer works
#		print " $FilesOfInode{$OneInode}\n";
#
#		foreach my $Filename (split(/::::/, $FilesOfInode{$OneInode})) {
#			my $OneSum = Md5sumOf("$Filename");
#			#show_stat "$Filename";
#			print(" $OneSum $Filename\n");
#		}
#	}
#}