#!/usr/bin/perl -w

# Copyright (c) 2003, 2004 Joseph Walton <joe@kafsemo.org>
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# $Id: download-files.pl,v 1.52 2004/08/03 01:38:20 joe Exp $

# Download files (presumably RSS files), doing as good a job
#  as possible of only downloading files that have changed.

# Usage:
#  download-files.pl --output <output-filename>
#
# Reads input of the form:
#
# ---
# <filename>
# Location: <url>
# X-Refresh-Minutes: 360
#
# <filename2>
# Location: <url2>
#
# ---
# 
# That is, stanzas consisting of a filename, any number of HTTP headers
# (that must include a 'Location'), and terminated with a blank line.
# Output is a file of the same format that should be passed back in on
# the next run. The names of updated local files will be printed to
# standard output.


use strict;

use threads;
use threads::shared;

use IO::Handle;
use IO::File;

use Data::Dumper;

use LWP::UserAgent;
use HTTP::Date;
use HTTP::Status;
use Compress::Zlib;
use Time::Duration;

use Getopt::Long;

my ($version) = ('$Revision: 1.52 $' =~ /([\d\.]+)/);


# Authentication data
my %auth;
share(%auth);

{
	package AuthUserAgent;

	our (@ISA);
	push @ISA, 'LWP::UserAgent';

	sub get_basic_credentials
	{
		my ($self, $realm, $uri, $isproxy) = @_;

		# We know nothing about authenticating to a proxy
		if ($isproxy) {
			return ();
		}

		lock(%auth);
		my $u = $auth{lc($uri->host)};
		if (defined($u)) {
			return @{$u};
		} else {
			return ();
		}
	}
}

# Calculate the appropriate expiry time, taking into account
#  what the server says and some clamp limits
sub calculateExpires($$$)
{
	my ($date, $expires, $refreshMinutes) = @_;

	if (!defined($refreshMinutes)) {
		$refreshMinutes = 60;
	}

	# Ignore suspiciously early expirations; clamp
	#  tediously long ones
	if (defined($expires)) {
		my $diff = $expires - $date;
		if ($diff < 60 * 10) {
			undef($expires);
		} elsif ($diff > 60 * 60 * 3) {
			$expires = $date + 60 * 60 * 3;
		}
	}

	if (!defined($expires) && defined($date)) {
		$expires = $date + $refreshMinutes * 60;
	}

	return $expires;
}

my $outputFilename;
my $authFilename;

GetOptions(
	"output|O=s" => \$outputFilename,
	"auth|A=s" => \$authFilename
) or die "Unable to parse command-line options: $!";

die "Usage: download.pl --output <filename> [--auth <filename>]" unless defined($outputFilename);

if (defined($authFilename)) {
	my $authfile = new IO::File($authFilename, '<') or die "Unable to open auth file $authFilename: $!";
	while (<$authfile>) {
		chomp;
		my @pl = split(',', $_, 3);
		if (@pl != 3) {
			die "Password file format: <host>,<usercode>,<password> (was '$_')";
		}

		my ($host, $username, $pass) = @pl;

		my @u;
		share(@u);
		push @u, $username, $pass;
		$auth{lc($host)} = \@u; #&share([]); #$username, $pass]);
	}
	close($authfile) or die "Unable to close auth file $authFilename: $!";
}


my $outputLock;
my $output = new IO::File($outputFilename, '>') or die "Unable to open $outputFilename: $!";
my $changedFileOutput = IO::Handle->new_from_fd(fileno(STDOUT), '>') or die "Unable to open output for changed files: $!";
my $running;
my @queue;
my %threadStatus;

share($outputLock);
share($running);
share(@queue);
share(%threadStatus);

my (@fileCount, @byteCount);
my $compressedCount = 0;
my $totalDownloadedBytes = 0;

share(@fileCount);
share(@byteCount);
share($compressedCount);
share($totalDownloadedBytes);

@fileCount = (0, 0, 0);
@byteCount = (0, 0, 0);

$running = 1;

my @workers;

foreach (1..5) {
	my $w = threads->create("fetch_worker") or die "Unable to create worker thread: $!";
	push @workers, $w;
}

my $filename;
my %rh;

while (<>) {
	chomp;

	if (! /^$/) {
		if (my ($f, $v) = (/^\s*(.*?):\s*(.*)$/)) {
			$rh{$f} = $v;
		} else {
			if (!defined($filename)) {
				$filename = $_;
			} else {
				die "Unexpected line: $_\n";
			}
		}
	} else {
		pend($filename, \%rh);

		undef($filename);
		%rh = ();
	}
}

if (defined($filename)) {
	pend($filename, \%rh);
}

{
lock($running);
$running = 0;
cond_broadcast($running);
}

foreach (@workers) {
	my $tid = $_->tid();

	my $f;
	{
	 lock($running);
	 $f = $threadStatus{$tid};
	}
	if (defined($f)) {
		print STDERR "Waiting for: $f\n";
	}
	defined($_->join()) or die "Thread failed.\n";
}

print STDERR "Downloaded files: ",join('/', reverse(@fileCount)),", bytes: ",join('/', reverse(@byteCount));
if ($fileCount[2] > 0) {
	print STDERR " (",$compressedCount,'/',$fileCount[2]," compressed";

	if ($byteCount[2] > 0) {
		print STDERR ', ',sprintf('%.2f', ($totalDownloadedBytes * 100) / $byteCount[2]),"% bandwidth used";
	}
	print STDERR ")";
}
print STDERR ".\n";

close($output) or die "Unable to close output file: $!";


# addStats(level, size, downloadedSize)
# Level: 0 => Checked local file
#        1 => Check remote site
#        2 => Downloaded data from remote site
# downloadedSize, if defined, implies that this was downloaded compressed
sub addStats($$$)
{
	lock(@fileCount);
	lock(@byteCount);
	lock($compressedCount);
	lock($totalDownloadedBytes);

	my ($level, $size, $downloadedBytes) = @_;

	if (defined($downloadedBytes)) {
		$compressedCount++;
		$totalDownloadedBytes += $downloadedBytes;
	} else {
		if ($level >= 2) {
			$totalDownloadedBytes += $size;
		}
	}

	while ($level >= 0) {
		$fileCount[$level]++;
		$byteCount[$level] += $size;
		$level--;
	}
}

sub pend
{
	my $x = Data::Dumper->Dump(\@_, ['filename', 'headers']);
	share($x);

	lock($running);
	push @queue, $x;
	cond_signal($running);
}

sub fetch_worker
{
	my $ua = AuthUserAgent->new(parse_head => 0);

	$ua->agent("download-files.pl/$version ".$ua->_agent()) if (defined($version));

	$| = 1;

	my $isRunning = 1;

	my $tid = threads->tid();

	while ($isRunning) {
		my ($filename, $headers);
		{
			lock($running);
			undef($threadStatus{$tid});
			while (scalar(@queue) == 0) {
				if ($running) {
					cond_wait($running);
				} else {
					$isRunning = 0;
					last;
				}
			}

			if (scalar(@queue) > 0) {
				eval(pop(@queue));
				$threadStatus{$tid} = $filename;
			}
		}

		if (defined($filename)) {
			fetch($ua, $filename, $headers);
		}

	}

#	print STDERR $tid.": is exiting\n";
}

sub fetch
{
my ($ua, $filename, $href) = @_;

my ($feedId) = ($filename =~ /^(.*)\..*$/);

# print STDERR threads->self()->tid().": would fetch $filename, ".%{$href}."\n";

my $location;

my $req = HTTP::Request->new();
$req->method('GET');

my $date;
my $expires;
my $contentLength;
my $etag;
my $lastModified;
my $refreshMinutes;

foreach(keys(%$href)) {
	my ($f, $v) = (lc($_), $href->{$_});

	if ($f eq 'location') {
		$location = $v;
	} elsif ($f eq 'etag') {
		$etag = $v;
	} elsif ($f eq 'last-modified') {
		$lastModified = str2time($v);
	} elsif ($f eq 'date') {
		$date = str2time($v);
	} elsif ($f eq 'expires') {
		$expires = str2time($v);
	} elsif ($f eq 'content-length') {
		$contentLength = $v;
	} elsif ($f eq 'x-refresh-minutes') {
		$refreshMinutes = $v;
	} else {
		# Ignore unknown headers
#		print STDERR "Unparsed header - $f: $v\n";
	}
}

die "No Location: for $feedId" unless defined($location);

$req->uri($location);

$req->header('Accept-Encoding', ('gzip', 'x-gzip'));

$expires = calculateExpires($date, $expires, $refreshMinutes);

my $exists = -f $filename;

if (!$exists) {
	print STDERR "($filename does not currently exist)\n";
}

# Clear $exists if the file has zero length, or doesn't match the expected
#  length
if ($exists) {
	($exists, my $lm) = (stat($filename))[7,9]; # mtime
	if (defined($exists)) {
		if ($exists == 0) {
			print STDERR "($filename has zero length; discarding)\n";
		} elsif (defined($contentLength) && ($exists != $contentLength)) {
			print STDERR "($filename is $exists bytes, should be $contentLength; discarding)\n";
			$exists = 0;
		}
	}

	# Drop the file if the last modified date doesn't match
	#  what the feeds file says.
	if (defined($lm) && defined($lastModified)) {
		if ($lm != $lastModified) {
			print STDERR "(${filename}'s last modified date is wrong; discarding)\n";
			$exists = 0;
		}
	} elsif (defined($lm)) {
		$lastModified = $lm;
	}
}
 
my $now = time();

if ($exists) {
	if (defined($expires) && ($expires > time())) {
		print STDERR "$filename: hasn't expired yet (".scalar(localtime($expires)).")\n";

		lock($outputLock);
		print $output "$filename\n" or die "Unable to write to output: $!";
#		print $output "Date: ".time2str()."\n" or die "Unable to write to output: $!";
		my ($key, $value);
		while (($key, $value) = each(%$href)) {
			print $output "$key: $value\n" or die "Unable to write to output: $!"; # unless ($key eq 'Date');
		}
		print $output "\n" or die "Unable to write to output: $!";
		$output->flush();

		addStats(0, $exists, undef);
		return;
	}

	if (defined($lastModified)) {
		# If age is less than (five hours into the future)...
		if ($now - $lastModified < -5 * 60 * 60) {
			print STDERR "$filename: Implausible last modification time (".scalar(localtime($lastModified)).")\n";
		} else {
			$req->if_modified_since($lastModified);
		}
	}

	if (defined($etag)) {
		$req->header('If-None-Match', $etag);
	}
}

if ($filename =~ /\.rss$/) {
	$req->header('Accept', ['application/rss+xml;revision=2.0', 'application/rss+xml;q=0.99', 'text/xml;q=0.9', 'application/xml;q=0.9', '*/*;q=0.01']);
} elsif ($filename =~ /\.html$/) {
	$req->header('Accept', ['application/xhtml+xml', 'text/html;q=0.8']);
}

# Turn them into a single header
if (defined($req->header('Accept'))) {
	$req->header('Accept', scalar($req->header('Accept')));
}

my $res = $ua->request($req);

# Check for permanent redirection
# We want the Location: from the most recent 301 that's caused
#  solely by 301s. (That is, the effective new permanent location.)
# i.e., ^ 301 * (301)
my $newLocation;
my $r = $res;
while (defined($r = $r->previous())) {
	if ($r->code == RC_MOVED_PERMANENTLY) {
		if (!defined($newLocation)) {
			$newLocation = $r->headers->header('Location');
		}
	} else {
		$newLocation = undef;
	}
}

if (defined($newLocation)) {
#	print STDERR "PERMANENT REDIRECTION: $location -> $newLocation\n";
	$location = $newLocation;
}

lock($outputLock);

print $output "$filename\n" or die "Unable to write to output: $!";
print $output "Location: ".$location."\n" or die "Unable to write to output: $!";

# Why no, I don't want to trust the server
# Drop time-related headers if the server clock is egregiously wrong.
#  RFC 2616, 14.18, y'all ('SHOULD use a mechanism, such as NTP', otherwise
#  'responses MUST NOT include a Date header field').
$now = time();
my $serverDate = $res->header('Date');
if (defined($serverDate)) {
	my $srvNow= str2time($serverDate);
	if ($srvNow) {
		my $drift = $srvNow - $now;
		if (abs($drift) > 360) {
			print STDERR "Bad time from server (",duration(abs($drift))," ",
				($drift >= 0) ? 'fast' : 'slow',")\n";

			$res->headers()->remove_header('Expires', 'Last-Modified');
		}
	}
}

print $output "Date: ".time2str($now)."\n" or die "Unable to write to output: $!";

print STDERR "$filename: ";

if ($res->code == RC_NOT_MODIFIED) {
	my ($key, $value);
	while (($key, $value) = each(%$href)) {
		next if ($key eq 'Location' or $key eq 'Date');
		my $hv;

		# We only take very specific headers from the new response;
		# IIS sends a 'Content-Length: 0' which we'd like to avoid.
		if (grep /$key/i, ('server', 'expires')) {
			$hv = $res->headers->header($key);
		}

		if (defined($hv)) {
			print $output "$key: $hv\n" or die "Unable to write to output: $!";
		} else {
			print $output "$key: $value\n" or die "Unable to write to output: $!";
		}
	}

	print STDERR "Not modified.\n";

	addStats(1, $exists, undef);
} elsif ($res->is_success) {
	my $isGzipped;

	my $contentEncoding = $res->header('Content-Encoding');
	if (!defined($contentEncoding)) {
		$isGzipped = 0;
	} elsif ((lc($contentEncoding) eq 'gzip') || (lc($contentEncoding) eq 'x-gzip'))
	{
		$isGzipped = 1;
	} else {
		die "Unexpected Content-Encoding: $contentEncoding";
	}

	my $stamp = $res->headers->last_modified();

	my $tmpname = $filename.'.tmp'; # Not necessarily safe, but we trust the user not to try anything silly

	open(HANDLE,'>',$tmpname) or die "Unable to open $tmpname to save content for $location: $!";

	my ($downloadedBytes, $contentBytes);

	if ($isGzipped) {
		my $uncompressed = Compress::Zlib::memGunzip($res->content);
		die "Unable to decompress content for $feedId: $!" unless defined($uncompressed);
		print HANDLE $uncompressed;
		$contentBytes = length($uncompressed);
		$downloadedBytes = length($res->content);
	} else {
		print HANDLE $res->content;
		$contentBytes = length($res->content);
	}

	close(HANDLE) or die "Unable to close $tmpname for saved content of $location: $!\n";
	if (defined($stamp)) {
		utime(time(), $stamp, $tmpname);
	}

	rename($tmpname, $filename) or die "Unable to rename $tmpname to $filename: $!";

	if (defined($refreshMinutes)) {
		print $output "X-Refresh-Minutes: $refreshMinutes\n" or die "Unable to write to output: $!";
	}

	foreach my $t ('ETag', 'Content-Type', 'Content-Length', 'Server') {
		# Don't record the content-length if our local copy has been decoded
		if ($isGzipped && ($t eq 'Content-Length')) {
			next;
		}
		my ($v, @unexpected) = $res->headers->header($t);
		if (defined($v)) {
			print $output "$t: $v\n" or die "Unable to write to output: $!";

			foreach (@unexpected) {
				print $output "Unexpected: $_\n" or die "Unable to write to output: $!";
			}
		}
	}

	my $expires = $res->headers->expires();
	if (defined($expires)) {
		print $output "Expires: ".time2str($expires)."\n" or die "Unable to write to output: $!";
	}

	my $lms;

	if (defined($stamp)) {
		$lms = time2str($stamp);
		print $output "Last-Modified: $lms\n" or die "Unable to write to output: $!";
	} else {
		$lms = '[no stamp]';
	}

	print STDERR "New content ($lms, ",$contentBytes,"b";
	if ($isGzipped) {
		print STDERR ", ",sprintf('%.2f', (100 * $downloadedBytes) / $contentBytes),'%';
	}
	print STDERR ")\n";

	print $changedFileOutput $filename,"\n";

	addStats(2, $contentBytes, $downloadedBytes);
} else {
	print STDERR "Was not successful (".$res->code." ".$res->message.")\n";
}

	print $output "\n" or die "Unable to write to output: $!";
	$output->flush() or die "Unable to write to output: $!";
}
