#!/usr/bin/perl
use strict;
use warnings;

use LWP;
use HTTP::Cookies;
use URI;
require Crypt::SSLeay; # seems to be needed for HTTPS with LWP

# getDarwinSource:
# This script can be used to download the Darwin source code
# (i.e. the open-source part of OS X)
#
# The files that are downloaded are those that meet the requirements of
# the 'shouldDownload' routine. It looks at the @inclusions and @exclusions
# arrays of filename patterns. It also looks at the file size restriction
# specified by $maxSizeMB.
# The tarballs are expanded and put in a newly-created sub-directory of the
# current directory named "Darwin$version"
# The script checks for existing source code directories under the 
# "Darwin$version" directory and avoids downloading the same code again. This
# means that you can run it a second time after changing some parameters to
# pick up some files that were missed the first time.
#
# Cameron Hayne (macdev@hayne.net)  January 2006


# Global variables (change these to fit your situation)
# -----------------------------------------------------
my $accountName = "your_name\@your_Isp.com";
my $accountPassword = undef;

# Set $version to the version of OS X that you want to get the source code for
my $version = "10.4.7.ppc"; # must match what is in the URL

my $maxSizeMB = 2.0;    # maximum size (in megabytes) of individual files

# inclusions: patterns of files to definitely be downloaded
#             Inclusions take precedence over exclusions and file size limits
# (the following merely reflects the files I was interested in
#  - you should edit the list to suit your interests)
my @inclusions = (
                  "^BootX.*",
                  "^CF.*",
                  "^DiskArbitration.*",
                  "^Libc.*",
                  "^Libsystem.*",
                  "^PowerManagement.*",
                  "^Secur.*",
                  "^bash.*",
                  "^curl.*",
                  "^gnutar.*",
                  "^sudo.*",
                  "^xnu.*",
                 );
                 
# exclusions: patterns of files to avoid                
my @exclusions = (
                  "^Apple.*", 
                  "^[A-Z].*",
                  "^apache.*",
                  "^gcc.*", 
                  "^gdb.*",
                  "^libsecurity.*",
                 );

 my $darwinsourceUrl = "http://www.opensource.apple.com/darwinsource";
 my $browser; # LWP agent used for all web access


#----- FUNCTIONS -----

sub startLogFile()
{
    open(LOG, ">LOG") or die "Failed to open LOG file\n";
}

sub endLogFile()
{
    close(LOG);
}

# printMsg: print a message to STDOUT and to LOG
sub printMsg($$)
{
    my ($msg, $shouldLog) = @_;

    print "$msg\n";
    if ($shouldLog)
    {
        print LOG "$msg\n";
    }
}

# newBrowser: returns a new LWP browser
sub newBrowser()
{
    my $browser = LWP::UserAgent->new();  

    # write cookies to an in-memory hash
    my $cookieJar = HTTP::Cookies->new();     
    $browser->cookie_jar($cookieJar);

    # allow redirections from POST requests
    push(@{$browser->requests_redirectable}, 'POST');

    return $browser;    
}

# this is just a wrapper around LWP::UserAgent.head()
sub httpHead($)
{
    my ($url) = @_;

    my $response = $browser->head($url);
    return $response;
}

# this is just a wrapper around LWP::UserAgent.get()
sub httpGet($)
{
    my ($url) = @_;

    my $response = $browser->get($url);
    return $response;
}

# httpGetFile: this is just a wrapper around LWP::UserAgent.get()
#              that uses the pseudo-header ":content_file"
#              to put the content into the specified file
sub httpGetFile($$)
{
    my ($url, $filename) = @_;

    my $response = $browser->get($url, ':content_file' => $filename);
    return $response;
}

# httpPost: this is just a wrapper around LWP::UserAgent.post()
sub httpPost($$)
{
    my ($url, $values) = @_;

    my $response = $browser->post($url, $values);
    return $response;
}

# httpCookies: returns cookies as a string
sub httpCookies()
{
    my $cookies = $browser->cookie_jar->as_string();
    return $cookies;
}

# printResponseInfo: for debugging
sub printResponseInfo($)
{    
    my ($response) = @_;

    my $statusLine = $response->status_line();
    my $requestedUrl = $response->request->url;

    print "Response->request->url: $requestedUrl\n";
    print "Response->status_line: $statusLine\n";
}

# printCookies: for debugging
sub printCookies($)
{
    my ($msg) = @_;

    my $cookies = httpCookies();
    print "$msg\n", $cookies, "\n";
}

# login: returns 1 on success, 0 on failure
sub login($$)
{
    my ($accountName, $accountPassword) = @_;

    print "Starting login\n";
    my $apslTarballsUrl = "$darwinsourceUrl/tarballs/apsl/";
    my $response = httpGet($apslTarballsUrl);
    unless ($response->is_success)
    {
        print "Failed to get APSL tarballs page ($apslTarballsUrl)\n";
        printResponseInfo($response);
        return 0;
    }
    
    # The above page should have redirected to the APSL login page
    # which has an HTML form which we want to post to.
    # First we need to extract the form's action URL
    # Note that since the login page uses HTTPS,
    # it seems that LWP requires that the Crypt::SSLeay module be installed

    my $loginUrl = $response->request->url;
    #print "loginUrl: $loginUrl\n";

    my $contents = $response->content;
    $contents =~ s/\n/ /g; # remove carriage returns
    my $str = qq(<form method="post" name="appleConnectForm" action=);
    my ($actionUrl) = $contents =~ m{$str\"([^"]*)\">}io;
    unless ($actionUrl)
    {
        print "Failed to extract action URL from APSL login page ($loginUrl)\n";
        return 0;
    }
    #print "actionUrl: $actionUrl\n";
    my $loginUri = URI->new($loginUrl);
    my $actionFullUrl = $loginUri->scheme . "://" . $loginUri->host 
                        . $actionUrl;
    #print "actionFullUrl: $actionFullUrl\n";
    
    # Now we are ready to login
    my $values = [
                'theAccountName' => $accountName,
                'theAccountPW'   => $accountPassword,
                ];

    $response = httpPost($actionFullUrl, $values);
    unless ($response->is_success)
    {
        print "Posting of the login form failed\n";
        printResponseInfo($response);
        return 0;
    }

    # If Apple didn't give us cookies, the login must have failed
    #printCookies("Cookies after login:");
    unless (httpCookies())
    {
        print "Didn't get any cookies from login "
              . "- probably due to incorrect password\n";
        return 0;
    }
    
    return 1;
}

# getSizeOfDownload: returns the size in bytes of the file specified by URL
sub getSizeOfDownload($)
{
    my ($url) = @_;
    
    my $response = httpHead($url);
        unless ($response->is_success)
    {
        print "Failed to get headers for $url\n";
        return -1;
    }

    my $size = $response->content_length();
    $size = -1 unless defined($size);
    return $size;
}

# getTarUrls: returns an array of the tarball URLs
sub getTarUrls($)
{
    my ($baseUrl) = @_;
    
    my $response = httpGet($baseUrl);
    unless ($response->is_success)
    {
        die "Failed to get darwinsource index page ($baseUrl)\n";
    }

    my $contents = $response->content;
    $contents =~ s/\n/ /g; # remove carriage returns

    # Links on the darwinsource page are like:
    # <a href="../tarballs/other/basic_cmds-43.tar.gz">.tar.gz</a>
    # Directory is either 'other' or 'apsl'

    my @tarUrls = $contents =~ m{<a href=\"([^"]*)\">\.tar\.gz<\/a>}ig;

    return @tarUrls;
}

# shouldDownload: handles the rules for including or excluding files
#                 Returns 1 iff the file should be downloaded
sub shouldDownload($$$)
{
    my ($name, $type, $fullUrl) = @_;
    
    if (-d $name)
    {
        printMsg("Skipping $name since previously downloaded", 0);
        return 0;
    }
    
    foreach my $incl (@inclusions)
    {
        if ($name =~ /$incl/)
        {
            printMsg("Downloading $name since it matches inclusion $incl", 0);
            return 1;
        }
    }
    
    foreach my $excl (@exclusions)
    {
        if ($name =~ /$excl/)
        {
            printMsg("Skipping $name since it matches exclusion $excl", 1);
            return 0;
        }
    }
    
    my $size = getSizeOfDownload($fullUrl);
    #print "name $name  size: $size\n";
    if ($size > 0)
    {
        my $sizeMB = sprintf("%.2f", $size / (1024 * 1024));
        if ($sizeMB > $maxSizeMB)
        {
            my $msg = "Skipping $name since size ($sizeMB MB) "
                      . "is greater than max ($maxSizeMB MB)";
            printMsg($msg, 1);
            return 0;
        }
    }
    
    print "Downloading $name\n";
    return 1;
}

MAIN:
{
    unless (defined($accountPassword))
    {
        print STDERR "Password for APSL account: ";
        chomp($accountPassword = <STDIN>);
        print STDERR "\n";
    }
    
    my $baseUrl = "$darwinsourceUrl/$version/";
    
    $browser = newBrowser();
    login($accountName, $accountPassword) or die "Failed to login\n";
    
    my @tarUrls = getTarUrls($baseUrl);
    if (scalar(@tarUrls) < 1)
    {
        die "Failed to extract tarUrls from index page\n";
    }
    
    # create the directory where the source code will be put
    # and cd to that directory
    my $darwinSrcDir = "Darwin$version";
    unless (-d $darwinSrcDir)
    {
        mkdir $darwinSrcDir or die "Can't create directory $darwinSrcDir: $!\n";
    }
    chdir $darwinSrcDir or die "Can't chdir to $darwinSrcDir: $!\n";

    startLogFile();

    foreach my $url (@tarUrls)
    {
        #print "url: $url\n";
        my $fullUrl = $baseUrl . $url;
        
        my $type;
        my $tarball;
        unless (($type, $tarball) = $url =~ m{(other|apsl)/(.*)$})
        {
            die "Malformed URL: '$url'\n";
        }
        my ($name) = $tarball =~ m{^(.*)\.tar\.gz$};
        #print "type: $type  tarball: $tarball  name: $name\n";
        
        if (shouldDownload($name, $type, $fullUrl))
        {
            my $tempFilename = "temp_tarball";
            httpGetFile($fullUrl, $tempFilename);

            system("/usr/bin/tar zxf $tempFilename");
            unlink $tempFilename;
        }
    }

    endLogFile();
}
