#!/home/staff/jhi/SPAM/bin/perl

#
# check-mirror-status
#
# Checks for the status (up-to-dateness) of CPAN mirrors.
# The check is very rudimentary and based on one file: the CPAN master
# site updates every hour the file MIRRORING.FROM, and most importantly
# (for this script) it updates one timestamp (UTC) within the file (note:
# not *of* the file).  Since mirrors are supposed to pick up changed files,
# this allows us to track how uptodate each mirror is.  (Of course at worst
# they could be mirroring just the MIRRORING.FROM and not mirror anything
# else, but really checking for the uptodateness (or correctness) of several
# hundred megabytes of the hundreds of CPAN sites is not feasible.)
#
# The logic of the script:
# (1) read in the file /pub/CPAN/MIRRORED.BY which lists the mirror sites
#     and the ftp and http URLs they serve CPAN from
# (2) Using the Net::FTP and LWP start at most 16 concurrent connections
#     which will connect the service and retrieve the MIRRORING.FROM,
#     extract the timestamp, and return it to the master process
#     (each connection is a forked slave process, the result is read
#     back through a pipe).  Timeout the connections properly, try up
#     to three times (timeouts at 10, 20, and 30 seconds).  If forks
#     fail, sleep for a while.  The URLs are tried out in random order.
# (3) Collect and display the results: how far behind the URL is,
#     what was the response time.
#
# This script very probably requires a UNIX to run since it uses fork,
# signals, pipes, and select().
#

require 5.6.1;

use strict;
use warnings;

my $DEBUG = $ENV{DEBUG};

use File::Path  qw(rmtree);
use File::Spec;
use File::Temp	qw(tempfile tempdir);
use Getopt::Long;
use Time::HiRes	'time';

my $verbose = '';

GetOptions( 'verbose' => \$verbose );

my $tempdir = tempdir(File::Spec->catdir(File::Spec->tmpdir(), "XXXXXXXX"));

print "tempdir = $tempdir\n" if $DEBUG;

# Paranoia check.
unless(-d $tempdir && $tempdir =~ m:^/tmp/:) {
    print "tempdir '$tempdir' doesn't look right, aborting.\n";
    exit(1);
}

&main;

sub main {
    my $status;

    $status = 
	check_mirrors
	    (
	     configfile  => "/pub/CPAN/MIRRORED.BY",
	     timeouts    => [10, 20, 30],
	     concurrency => 16,
	    );
    show_results($status);

    chdir("/") && rmtree($tempdir);

    exit(0);
}

sub debug {
    return unless $verbose;
    print STDERR @_;
}

sub get_ftp {
    my ($url, $timeout) = @_;

    my ($host, $path) = ($url =~ m!^ftp://(.+?)/(.*)!);

    $| = 1;
    
    use Net::FTP;

    my $ftp;

    unless ($ftp =
	  Net::FTP->new($host, Timeout => $timeout, Debug => $DEBUG)) {
	return (0, "connect: $@");
    }

    unless ($ftp->login('anonymous', 'cpan@perl.org')) {
	return (0, "login: $!");
    }
    
    unless ($ftp->cwd($path)) {
	$ftp->quit;
	return (0, "cwd: $!");
    }
    
    my ($fh, $filename) = tempfile("XXXXXXXX", DIR => $tempdir, UNLINK => 1);

    my $start = time();
    
    unless ($ftp->get('MIRRORING.FROM', $filename)) {
	return (0, "get: $!");
    }
    
    unless ($ftp->quit) {
	close($fh);
	return (0, "quit");
    }
    
    my $stop = time();
    
    return (1, $stop - $start, $fh, $filename);
}

sub get_http {
    my ($url, $timeout) = @_;
    
    use LWP::Debug;
    LWP::Debug::level('+trace') if $DEBUG;
    
    require HTTP::Request;
    require LWP::UserAgent;
    
    $url .= "/" unless $url =~ m!/$!;

    my $request = HTTP::Request->new(GET => "${url}MIRRORING.FROM");
    return (0, 0) unless $request;
    
    my $ua = LWP::UserAgent->new(timeout => $timeout);
    return (0, 0) unless $ua;

    my ($fh, $filename) = tempfile("XXXXXXXX", DIR => $tempdir, UNLINK => 1);

    my $start = time();

    my $response = $ua->request($request, $filename);

    my $stop  = time();

    if ($response->is_success) {
	return (1, $stop - $start, $fh, $filename);
    } else {
	close($fh);
	unlink($filename);
        my $status = $response->status_line;
        $status =~ s/\s+\z//;
        return (0, $status);
    }
}
 
sub get_urls {
    my $configfile = shift;
    my %url;
	     
    if (open(my $configfh, $configfile)) {
	local $/;
		 
	my $slurp = <$configfh>;
		 
	close($configfh);
		 
	while ($slurp =~
	       /^([\w\-]+(?:\.[\w\-]+)+): # a hostname (id for site)
	       ((?:\s+\w+\s*=\s*".*?")+)  # attr="value"
	       /mgx) {
	    my ($hostid, $hostconfig) = ($1, $2);
	    my %hostconfig;
	    while ($hostconfig =~ /(\w+)\s*=\s*"(.*?)"/mg) {
		$hostconfig{$1} = $2;
	    }
	    foreach my $urlid (qw(dst_ftp dst_http)) {
		if (exists $hostconfig{$urlid} &&
		    length $hostconfig{$urlid}) {
		    $url{$hostconfig{$urlid}} =
			[ $hostid, \%hostconfig ];
		}
	    }
	}
    } else {
	warn qq($0: open "$configfile" failed: $!\n);
    }
    
    return \%url;
}

sub check_url_with_timeout {
    my ($wfh, $url, $timeout) = @_;

    my $got   = 0;
    my $epoch = 0;
    my $info  = 0;
    my ($fh, $filename);
		    
    local $SIG{ALRM} =
	sub {
	    print $wfh "$url 0 0 0 alarm\n";
	    if (defined $fh && defined $filename) {
		close($fh);
		unlink($filename);
	    }
	    exit(1);
	};
    alarm($timeout);
		    
    my $start = int(time()); # the epoch time is integral, too

    if ($url =~ m|^ftp://|) {
	($got, $info, $fh, $filename) = get_ftp($url, $timeout);
    } elsif ($url =~ m|^http://|) {
	($got, $info, $fh, $filename) = get_http($url, $timeout);
    }
		    
    if ($got) {
	my $sawend;
	
	while (<$fh>) {
	    if (/UNIX epoch seconds = (\d+)/) {
		$epoch = $1;
	    } elsif (/^\# End of MIRRORING\.FROM/) {
		$sawend = 1;
	    }
	}

	unless ($sawend) {
	    ($got, $epoch, $info) = (0, 0, 0);
	}
    }
    
    if (defined $fh && defined $filename) {
	close($fh);
	unlink($filename);
    }

    $start = 0 unless $epoch; # Never mind, then.

    debug "# $url $got $epoch $start $info $timeout\n";
    print $wfh "$url $got $epoch $start $info\n";
    
    exit($got ? 0 : 1);
}

sub check_urls_with_timeout {
    my ($urls, $timeout, $concurrency, $status) = @_;

    my %todo = %$urls;
    my %fail;

    my $max = $concurrency;
    my $act = 0;
    my $rin = '';
    my %rfh;
    my %wfh;
    my %url;
    my %pid;
    my $pid;
    
    while (keys %todo) {
	while ($act < $max && $act < keys %todo) {
	    my ($rfh, $wfh);
	    
	    if (pipe($rfh, $wfh)) {
		my $rfd = fileno($rfh);

		$rfh{$rfd} = $rfh;
		vec($rin, $rfd, 1) = 1;
		$wfh{$rfd} = $wfh;
		
		my @urls = keys %todo;
		my $url  = $urls[rand @urls];
		
		delete $todo{$url};
		$status->{$url}->{host} = $urls->{$url}->[0];
		
		$url{$rfd} = $url;
		
		$pid = fork();
		
		if (defined $pid) {
		    if ($pid) {
			$pid{$rfd} = $pid;
			$act++;
		    } else {
			check_url_with_timeout($wfh, $url, $timeout);
		    }
		} else {
		    warn "$0: fork failed: $!\n";
		    sleep(2); # melting CPU? We're patient!
		    
		    # Reinstate the URL.
		    
		    delete $rfh{$rfd};
		    vec($rin, $rfd, 1) = 0;
		    delete $wfh{$rfd};
		    
		    $todo{$url} = $urls->{$url};

		    delete $url{$rfd};
		}
	    } else {
		warn "$0: pipe failed: $!\n";
	    }
	}
	
	debug "# todo ", scalar keys %todo, " active $act fail ", scalar keys %fail, "\n";

	my $rout;
	my $nfound = select($rout = $rin, undef, undef, $timeout);
	
	if ($nfound) {
	    while ($nfound--) {
                my $bits = unpack("b*", $rout);
		my $rfd = index $bits, 1, 0;
		my $url = $url{$rfd}; 

		vec($rin, $rfd, 1) = vec($rout, $rfd, 1) = 0;

		my $result = readline($rfh{$rfd});
		my ($got, $epoch, $start, $info);
		if (defined $result) {
		    chomp($result);
		    ($url, $got, $epoch, $start, $info) =
			($result =~ /^(\S+) (0|1) (\d+) (\d+) (.*)/);
		    unless (defined $got) {
			die "bad result '$result'\n";
		    }
		} else {
		    ($got, $epoch, $start, $info) = (0, 0, 0, 0);
		}

		unless ($got) {
		    $fail{$url} = $urls->{$url};
		}
		 
		$status->{$url}->{got}   = $got;
		$status->{$url}->{epoch} = $epoch;
		$status->{$url}->{lag}   = $start - $epoch;
		$status->{$url}->{info}  = $info;

		delete $rfh{$rfd};
		delete $wfh{$rfd};

		delete $url{$rfd};

		kill 'TERM', $pid{$rfd};
		waitpid($pid{$rfd}, 0);
		delete $pid{$rfd};

		$act--;
	    }
	} else {
	    %rfh = %wfh = ();
	    $rin = '';
	    
	    my @fail = values %url;
	    @fail{@fail} = @{$urls}{@fail};
	    for my $url (@fail) {
		debug "# $url 0 0 0 $timeout\n";
		$status->{$url}->{got}    = 0;
		$status->{$url}->{epoch}  = 0;
		$status->{$url}->{lag}    = 0;
		$status->{$url}->{info}   = 0;
	    }
	    %url = ();
	    
	    my @pid = values %pid;
	    kill 'TERM', @pid;
	    for my $pid (@pid) { waitpid($pid, 0) }
	    %pid = ();
	    
	    $act = 0;
	}
    }
    
    %$urls = %fail;
}

sub check_urls_with_timeouts {
    my ($urls, $timeouts, $concurrency, $status) = @_;

    for my $timeout (@$timeouts) {
	check_urls_with_timeout($urls, $timeout, $concurrency, $status);
	last unless keys %$urls;
    }
}

sub check_mirrors {
    my %config = @_;

    my $urls = get_urls($config{configfile});

    my %status;

    check_urls_with_timeouts($urls,
			     $config{timeouts},
			     $config{concurrency},
			     \%status);

    return \%status;
}

sub show_results {
    my $status = shift;

    my @good;
    my @fail;

    for my $url (keys %$status) {
	if ($status->{$url}->{got}) {
	    push @good,
	         [$status->{$url}->{host},
		  $url,
		  $status->{$url}->{epoch},
		  $status->{$url}->{lag},
		  $status->{$url}->{info}];
	} else {
	    push @fail,
                 [$status->{$url}->{host},
		  $url,
		  $status->{$url}->{epoch},
		  $status->{$url}->{lag},
		  $status->{$url}->{info}];
	}
    }

    @good = sort { $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] } @good;
    @fail = sort { $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] } @fail;

    print "OK @$_ \n"   for @good;
    print "FAIL @$_ \n" for @fail;
}