#!/usr/bin/perl
use strict;
use warnings;

# findDupeFiles:
# This script attempts to identify which files might be duplicates.
# It searches specified directories for files with a given suffix
# and reports on files that have the same MD5 digest.
# The suffix or suffixes to be searched for are specified by the first 
# command-line argument - each suffix separated from the next by a vertical bar.
# The subsequent command-line arguments specify the directories to be searched.
# If no directories are specified on the command-line, 
# it searches the current directory.
# Files whose names start with "._" are ignored.
#
# Cameron Hayne (macdev@hayne.net)  January 2006 (revised March 2006)
#
#
# Examples of use:
# ----------------
# findDupeFiles '.aif|.aiff' AAA BBB CCC
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the directories AAA, BBB, and CCC
#
# findDupeFiles '.aif|.aiff'
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the current directory
#
# findDupeFiles '' AAA BBB CCC
# would look for duplicates among all the files (no matter what suffix)
# under the directories AAA, BBB, and CCC
#
# findDupeFiles
# would look for duplicates among all the files (no matter what suffix)
# under the current directory
# -----------------------------------------------------------------------------


use File::Find;
use File::stat;
use Digest::MD5;
use Fcntl;

# The HFS+ filesystem used on OS X has resource forks as well as data forks
# When 'checkRsrc' is true, this script checks the resource forks of files
# with duplicate data and issues a message if the resource forks are different.
# The resource fork is read through the "<file>/..namedfork/rsrc" path, which
# is only available on OS X (where Perl reports $^O as "darwin"). On any
# other system that path cannot be opened and the script would abort at the
# first set of duplicates, so the check is enabled by default only on OS X.
# To force the check on or off, set the 'checkRsrc' variable to 1 or 0.
my $checkRsrc = ($^O eq 'darwin') ? 1 : 0;  # whether to check the resource forks

my $matchSomeSuffix; # reference to a subroutine for matching suffixes

# buildSuffixMatcher: turns a suffix specification into a matching subroutine
#   $suffixSpec - the desired suffixes separated by vertical bars
#                 (e.g. ".mp3|.aiff")
# Returns a ref to a subroutine that tests whether $_ ends with one of the
# suffixes (ignoring case), or undef if the specification contains no
# suffixes (which means that all files will be looked at).
sub buildSuffixMatcher
{
    my ($suffixSpec) = @_;

    my @suffixes = split(/\|/, $suffixSpec);
    return unless @suffixes;

    # The suffixes are literal text, not patterns, so each one is quoted
    # before going into the regex. Without this the "." of ".aif" would match
    # any character (so "waif" would count as an ".aif" file) and a suffix
    # containing a character such as "(" would be an invalid pattern.
    my $alternatives = join('|', map { quotemeta($_) } @suffixes);

    # a single precompiled regex replaces the string-eval'ed chain of matches
    my $suffixRegex = qr/(?:$alternatives)$/i;

    return sub { $_ =~ $suffixRegex };
}

if (defined($ARGV[0]))
{
    # the list of desired suffixes is supplied in $ARGV[0]
    # separated by vertical bars - e.g. ".mp3|.aiff"
    # Note that if $ARGV[0] is '', then all files will be looked at
    # ($matchSomeSuffix stays undefined in that case)
    $matchSomeSuffix = buildSuffixMatcher($ARGV[0]);
    shift @ARGV;
}

# The directories to search are whatever is left on the command line;
# when nothing is left we fall back to the current directory
my @searchDirs = @ARGV;
@searchDirs = (".") unless @searchDirs;

# refuse to start if any of the requested locations is not a directory
for my $candidate (@searchDirs)
{
    next if -d $candidate;
    die "\"$candidate\" is not a directory\n";
}

# global variable: hash, keyed by file size, of arrays of fileInfo's
my %filesByDataLength;

# calcMd5: returns the MD5 digest of the given file
#   $filename - path of the file whose contents are to be digested
# Returns the digest as a hexadecimal string, or the string "unsupported"
# if the path is a directory.
# Dies (with a message naming the file) if the file cannot be opened.
sub calcMd5($)
{
    my ($filename) = @_;

    # doing MD5 on a directory is not supported,
    # but we need to return something
    return "unsupported" if -d $filename;

    # We use 'sysopen' instead of just 'open' in order to be able to handle
    # filenames with leading whitespace or leading "-"
    # The usual trick to protect against leading whitespace or "-" is to do
    # $filename =~ s#^(\s)#./$1#; open(FILE, "< $filename\0")
    # but that fails if the filename is something like "- foo"
    # (i.e. if there is an initial "-" followed by whitespace)

    # A lexical filehandle is used (rather than a bareword like FILE) so that
    # the handle is private to this call and is closed automatically
    # even if reading the file fails part-way through.
    sysopen(my $fh, $filename, O_RDONLY)
         or die "Unable to open file \"$filename\": $!\n";
    binmode($fh); # just in case we're on Windows!
    my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
    close($fh);
    return $md5;
}

# hashByMd5: groups files by the MD5 digest of their contents
#   $fileInfoListRef - ref to an array of fileInfo's
#                      (hash refs with 'dirname' and 'filename' entries)
# Returns a ref to a hash whose keys are MD5 digests and whose values are
# refs to arrays of the fileInfo's that have that digest.
# A file whose digest cannot be computed (e.g. because it is unreadable or
# has disappeared since the directory traversal) is reported with a warning
# and left out of the result, so that one bad file does not abort the run.
sub hashByMd5($)
{
    my ($fileInfoListRef) = @_;

    my %filesByMd5;
    foreach my $fileInfo (@{$fileInfoListRef})
    {
        my $dirname = $fileInfo->{dirname};
        my $filename = $fileInfo->{filename};
        my $filepath = "$dirname/$filename";

        # calcMd5 dies if it cannot open the file - we trap that here
        my $md5 = eval { calcMd5($filepath) };
        if (!defined($md5))
        {
            warn($@ || "Unable to calculate MD5 of \"$filepath\"\n");
            next;
        }

        push(@{$filesByMd5{$md5}}, $fileInfo);
    }

    return \%filesByMd5;
}

# "device:inode" identities of the files recorded so far by 'checkFile'
my %seenFileIds;

# checkFile: invoked from the 'find' routine on each file or directory in turn
# Expects the name of the current entry in $_ and the name of the directory
# containing it in $File::Find::dir.
# Each regular file that passes the filters below is added to the global
# %filesByDataLength (a hash, keyed by file size, of arrays of fileInfo's).
# The following are skipped:
#  - anything that is not a regular file
#  - symbolic links (a link to a file is not a copy of that file)
#  - files whose names start with "._"
#  - files that don't have one of the desired suffixes
#  - files already recorded under another name or via another search
#    directory (same device and inode), since these are the same file
#    and not duplicates
sub checkFile()
{
    my $filename = $_;
    my $dirname = $File::Find::dir;

    # '-f' follows symbolic links, so the links have to be weeded out first
    return if -l $filename;
    return unless -f $filename; # only interested in files, not directories

    return if $filename =~ /^\._/; # ignore files whose names start with "._"

    if (defined($matchSomeSuffix))
    {
        # the matching subroutine looks at $_ (which is still the filename)
        return unless $matchSomeSuffix->();
    }

    my $statInfo = stat($filename);
    if (!$statInfo)
    {
        warn "Can't stat file \"$dirname/$filename\": $!\n";
        return;
    }

    # Overlapping search directories and hard links lead to the same file
    # being visited more than once - we only record it the first time.
    # (Some systems report an inode of 0 for all files; no check is made then.)
    if ($statInfo->ino)
    {
        my $fileId = join(':', $statInfo->dev, $statInfo->ino);
        return if $seenFileIds{$fileId}++;
    }

    my $fileInfo = {
        'dirname'  => $dirname,
        'filename' => $filename,
        };

    push(@{$filesByDataLength{$statInfo->size}}, $fileInfo);
}

MAIN:
{
    # Walk all the search directories; 'checkFile' records each file of
    # interest in the global variable %filesByDataLength, keyed by its size
    find(\&checkFile, @searchDirs);

    my $bytesPerMegabyte = 1024 * 1024;
    my ($dupeCount, $dupeBytes) = (0, 0);

    # Files can only be duplicates if they have the same size,
    # so we go through the sizes one by one, biggest first
    my @sizesDescending = sort { $b <=> $a } keys %filesByDataLength;
    for my $size (@sizesDescending)
    {
        my $candidatesRef = $filesByDataLength{$size};
        next if scalar(@{$candidatesRef}) < 2;

        # split the files of this size into groups with identical MD5 digests
        my %groupByDigest = %{ hashByMd5($candidatesRef) };
        for my $digest (keys %groupByDigest)
        {
            my @group = @{ $groupByDigest{$digest} };
            my $extraCopies = scalar(@group) - 1;
            next if $extraCopies < 1;

            # list the full path of every member of this set of dupes
            my $firstRsrcDigest; # resource fork digest of the first member
            for my $member (@group)
            {
                my $path = join('/', $member->{dirname}, $member->{filename});
                print "$path\n";
                next unless $checkRsrc;

                # the resource fork of each member is compared
                # with that of the first member of the set
                my $rsrcDigest = calcMd5("$path/..namedfork/rsrc");
                if (defined($firstRsrcDigest))
                {
                    print "Resource fork differs\n"
                        if $firstRsrcDigest ne $rsrcDigest;
                }
                else
                {
                    $firstRsrcDigest = $rsrcDigest;
                }
            }
            print "----------\n";

            # all but one of the files in the set count as duplicates
            $dupeCount += $extraCopies;
            $dupeBytes += $size * $extraCopies;
        }
    }

    print "Number of duplicate files: $dupeCount\n";
    print "Megabytes duplicated: "
        . sprintf("%.1f", $dupeBytes / $bytesPerMegabyte) . "\n";
}

