#!/usr/bin/perl -w

# Digest VSFTPD logs from a CPAN mirror into YAML statistics.

# (Reads log lines from the files named on the command line, or from
#  standard input if none are given, and sends YAML to standard output.)

# Per-file download counts, the number of unique clients, and a few
#  other things, are included. The parsing is a very simple regexp,
#  so handling logs from other daemons would be a simple change.

# Placed in the public domain, July 2004.

# Path prefix under which the mirror is served. Only downloads whose logged
#  path starts with this prefix are counted, and the prefix is stripped from
#  the file names that appear in the statistics.
my $PREFIX = '/pub/CPAN';
# Time zone in which the timestamps found in the log are interpreted when
#  they are converted to epoch seconds.
my $TZ = 'Europe/London';

use strict;

use Date::Parse;
use Digest::SHA1;
use DateTime;
use YAML();

# Render an epoch timestamp as an ISO 8601 string.
#
#  Takes a single scalar: seconds since the epoch, or undef.
#  Returns the iso8601() rendering of the DateTime built from that epoch,
#  or a defined-but-false value when the argument is undef.
sub date2str($)
{
	my ($epoch) = @_;

	# !1 is the same false value that defined() itself yields
	return !1 unless defined($epoch);

	my $when = DateTime->from_epoch(epoch => $epoch);
	return $when->iso8601();
}

# Number of input lines that could not be used.
my $badlinecount = 0;

# Totals over all counted downloads: how many there were, and how many
#  bytes they transferred.
my $filecount = 0;
my $bytecount = 0;
# Hashed client ID => number of counted downloads made by that client.
my %clients;

# Earliest and latest timestamps (epoch seconds) among the counted
#  downloads. Both stay undefined until the first download is seen.
my $sstamp;
my $estamp;

# File name (relative to $PREFIX) => number of times it was downloaded.
my %files;

# This isn't very random, but it's good enough
#  (It's used to reduce client IDs, immediately, to a run-unique token. We're
#   not doing anything with client identity at the moment, but if we were
#   then we'd use the randomised token.)
my $salt = pack('F', rand());

# A single digest object, reset and reused for every client ID.
my $sha1 = Digest::SHA1->new;

# Main pass over the log. Every line is expected to look like
#   <24-character timestamp> [pid N] <message>
#  and anything else is counted as a bad line. Of the well-formed lines,
#  only "[ftp] OK DOWNLOAD" messages for module distributions below $PREFIX
#  contribute to the statistics; all other messages are silently ignored.
while (<>) {
	my ($d, $r) = /(.{24})\s\[pid \d+\]\s(.*)$/;
	if (!defined($d)) {
		$badlinecount++;
		next;
	}

	# \Q..\E makes $PREFIX match literally even if it ever contains regexp
	#  metacharacters. The transfer rate that follows the byte count is not
	#  used, so it is not captured.
	my ($client, $file, $length) =
		($r =~ m{\[ftp\] OK DOWNLOAD: Client "([^"]*)", "\Q$PREFIX\E/([^"]+)", (\d+) bytes, });
	next unless defined($file);

	# Only count things that look like modules
	next if $file !~ m{^authors/id/};
	next if $file =~ m{/CHECKSUMS$};

	if ($file !~ /\.(?:tar\.gz|tar\.bz2|zip|tgz|pm\.gz)$/) {
		print STDERR "No match for $file\n";
		next;
	}

	# The timestamp is converted only now that the line is known to count,
	#  so that no DateTime object is built for lines that are thrown away.
	my @arr = Date::Parse::strptime($d);
	my $stamp;
	if (@arr) {
		# strptime gives years since 1900 and zero-based months
		$arr[5] += 1900;
		$arr[4] ++;
		my %param;
		@param{('second', 'minute', 'hour', 'day', 'month', 'year')} = @arr;
		# DateTime dies on out-of-range fields and on local times that do
		#  not exist in $TZ; one such line must not abort the whole run.
		$stamp = eval { DateTime->new(%param, 'time_zone' => $TZ)->epoch };
	}
	if (!defined($stamp)) {
		print STDERR "Bad timestamp '$d'\n";
		$badlinecount++;
		next;
	}

	# Replace the client ID by its salted hash straight away
	$sha1->reset();
	$sha1->add($salt);
	$sha1->add($client);
	$client = $sha1->hexdigest();

	# Widen the covered time range to include this download
	$sstamp = $stamp if !defined($sstamp) || $stamp < $sstamp;
	$estamp = $stamp if !defined($estamp) || $stamp > $estamp;

	$filecount++;
	$bytecount += $length;
	$clients{$client}++;

	$files{$file}++;
}

# Unusable lines are dropped from the statistics; say so on standard error
#  so that a change in the log format does not go unnoticed. Standard
#  output carries nothing but the YAML document.
if ($badlinecount) {
	print STDERR "Skipped $badlinecount unparseable line(s)\n";
}

# The summary document:
#   stampStart, stampEnd - time range covered by the counted downloads
#   filecount, bytecount - number of counted downloads and bytes transferred
#   clientcount          - number of distinct (hashed) client IDs
#   stats                - per-file download counts
my $result = {
	stampStart => date2str($sstamp),
	stampEnd => date2str($estamp),
	filecount => $filecount,
	bytecount => $bytecount,

	clientcount => scalar(keys(%clients)),

	stats => \%files
};

print YAML::Dump($result);

