#!/usr/bin/perl
use strict;
use warnings;

# findDupeFiles:
# This script attempts to identify which files might be duplicates.
# It searches specified directories for files with a given suffix
# and reports on files that have the same MD5 digest.
# The suffix or suffixes to be searched for are specified by the first
# command-line argument - each suffix separated from the next by a vertical bar.
# The subsequent command-line arguments specify the directories to be searched.
# If no directories are specified on the command-line,
# it searches the current directory.
# Files whose names start with "._" are ignored.
#
# Cameron Hayne (macdev@hayne.net)  January 2006  (revised March 2006)
#
#
# Examples of use:
# ----------------
# findDupeFiles '.aif|.aiff' AAA BBB CCC
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the directories AAA, BBB, and CCC
#
# findDupeFiles '.aif|.aiff'
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the current directory
#
# findDupeFiles '' AAA BBB CCC
# would look for duplicates among all the files (no matter what suffix)
# under the directories AAA, BBB, and CCC
#
# findDupeFiles
# would look for duplicates among all the files (no matter what suffix)
# under the current directory
# -----------------------------------------------------------------------------

use File::Find;
use File::stat;
use Digest::MD5;
use Fcntl;

# The HFS+ filesystem used on OS X has resource forks as well as data forks.
# This script checks the resource forks of files with duplicate data
# and issues a message if the resource forks are different.
# The check is enabled by default only when running on macOS ("darwin"),
# because the "..namedfork/rsrc" path does not exist on other systems.
# Set the 'checkRsrc' variable to 0 or 1 here to override that default.
my $checkRsrc = ($^O eq 'darwin') ? 1 : 0; # whether to check the resource forks

# buildSuffixMatcher: passed the suffix specification from the command line
# (suffixes separated by vertical bars - e.g. ".mp3|.aiff").
# Returns a reference to a subroutine that takes a filename and returns 1
# if the filename ends with one of the suffixes (case-insensitively),
# 0 otherwise.
# Returns undef if the specification is undefined or contains no suffixes,
# which means that all files are to be looked at.
sub buildSuffixMatcher
{
    my ($suffixSpec) = @_;

    return unless defined($suffixSpec);
    my @suffixes = split(/\|/, $suffixSpec);
    return unless @suffixes;

    # Each suffix is escaped with quotemeta so that it is matched literally;
    # otherwise the "." in ".aif" would match any character.
    my $alternatives = join('|', map { quotemeta($_) } @suffixes);
    my $suffixRegex = qr/(?:$alternatives)\z/i;

    return sub {
        my ($name) = @_;
        return ($name =~ $suffixRegex) ? 1 : 0;
    };
}

my $matchSomeSuffix; # reference to a subroutine for matching suffixes
if (defined($ARGV[0]))
{
    # the list of desired suffixes is supplied in $ARGV[0]
    # Note that if $ARGV[0] is '', then all files will be looked at
    $matchSomeSuffix = buildSuffixMatcher(shift @ARGV);
}

# if no dirs supplied as command-line args, we search the current directory
my @searchDirs = @ARGV ? @ARGV : ".";

# verify that these are in fact directories
foreach my $dir (@searchDirs)
{
    die "\"$dir\" is not a directory\n" unless -d "$dir";
}

my %filesByDataLength; # global variable holding hash of arrays of fileInfo's

# calcMd5: returns the MD5 digest (as a hex string) of the given file.
# Returns the string "unsupported" if passed a directory.
# Dies if the file cannot be opened.
sub calcMd5
{
    my ($filename) = @_;

    if (-d $filename)
    {
        # doing MD5 on a directory is not supported
        return "unsupported"; # we need to return something
    }

    # We use 'sysopen' in order to be able to handle
    # filenames with leading whitespace or leading "-"
    # (the two-argument form of 'open' would misinterpret such names).
    sysopen(my $fh, $filename, O_RDONLY)
        or die "Unable to open file \"$filename\": $!\n";
    binmode($fh); # just in case we're on Windows!
    my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
    close($fh);
    return $md5;
}

# hashByMd5: passed a ref to an array of fileInfo's
# Returns a ref to a hash by md5 of the fileInfo's
# A file that cannot be read is reported with a warning and left out of
# the result, so that one unreadable file does not abort the whole run.
sub hashByMd5
{
    my ($fileInfoListRef) = @_;

    my %filesByMd5;
    foreach my $fileInfo (@{$fileInfoListRef})
    {
        my $filepath = "$fileInfo->{dirname}/$fileInfo->{filename}";
        my $md5 = eval { calcMd5($filepath) };
        if (!defined($md5))
        {
            warn $@; # the message from calcMd5 already ends with a newline
            next;
        }
        push(@{$filesByMd5{$md5}}, $fileInfo);
    }
    return \%filesByMd5;
}

# checkFile: invoked from the 'find' routine on each file or directory in turn
# Records each file of interest in the global %filesByDataLength,
# keyed by the size of the file.
sub checkFile
{
    return unless -f $_; # only interested in files, not directories

    my $filename = $_;
    my $dirname = $File::Find::dir;
    return if $filename =~ /^\._/; # ignore files whose names start with "._"

    if (defined($matchSomeSuffix))
    {
        return unless $matchSomeSuffix->($filename);
    }

    my $statInfo = stat($filename);
    if (!$statInfo)
    {
        warn "Can't stat file \"$dirname/$filename\": $!\n";
        return;
    }

    my $fileInfo = {
        'dirname'  => $dirname,
        'filename' => $filename,
    };
    push(@{$filesByDataLength{$statInfo->size}}, $fileInfo);
    return;
}

MAIN:
{
    # traverse the directories and collate the files by data length
    # in the global variable %filesByDataLength
    find(\&checkFile, @searchDirs);

    my $numDupes = 0;
    my $numDupeBytes = 0;

    # process the files by size, starting with the largest
    # (only files of the same size can be duplicates, so the MD5 digests
    # are calculated only for sizes shared by more than one file)
    foreach my $size (sort { $b <=> $a } keys %filesByDataLength)
    {
        my $sameSizeListRef = $filesByDataLength{$size};
        next unless scalar(@{$sameSizeListRef}) > 1;

        my $filesByMd5Ref = hashByMd5($sameSizeListRef);

        # the digests are sorted so that the output order is repeatable
        foreach my $md5 (sort keys %{$filesByMd5Ref})
        {
            my @sameMd5List = @{$filesByMd5Ref->{$md5}};
            my $numSameMd5 = scalar(@sameMd5List);
            next unless $numSameMd5 > 1;

            # for each set of dupes, print the full path to the files
            my $rsrcMd5; # digest of the first readable resource fork in the set
            foreach my $fileInfo (@sameMd5List)
            {
                my $filepath = "$fileInfo->{dirname}/$fileInfo->{filename}";
                print "$filepath\n";
                next unless $checkRsrc;

                my $thisRsrcMd5 = eval { calcMd5("$filepath/..namedfork/rsrc") };
                if (!defined($thisRsrcMd5))
                {
                    warn $@; # could not read the resource fork
                }
                elsif (!defined($rsrcMd5))
                {
                    $rsrcMd5 = $thisRsrcMd5;
                }
                elsif ($rsrcMd5 ne $thisRsrcMd5)
                {
                    print "Resource fork differs\n";
                }
            }
            print "----------\n";

            # one file of each set is the "original"; the rest are dupes
            $numDupes += ($numSameMd5 - 1);
            $numDupeBytes += ($size * ($numSameMd5 - 1));
        }
    }

    my $numDupeMegabytes = sprintf("%.1f", $numDupeBytes / (1024 * 1024));
    print "Number of duplicate files: $numDupes\n";
    print "Megabytes duplicated: $numDupeMegabytes\n";
}