#!/bin/perl5 -w

$\ = "\n";
#useful variables
@short_month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul",
		      "Aug", "Sep", "Oct", "Nov", "Dec");
#@long_month_names = ("January", "February", "March", "April", "May", "June",
#		     "July", "August", "September", "October", "November",
#		     "December"); 
%month_numbers = ("Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4,
		  "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8,
		  "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12);
@year_to_date = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 );

$usage = 
    "Usage: $0 [-w] [-d directory] [-o output-file]\n" .
    "\t\t\t[-r num|num-num] [-e expression]\n";


$expression = ".";
@logfiles = ();
$directory = "/import/ftp/pub/lennox/superguy/";
$OUTPUT_FILE = "-";
$AUTHORS = "${directory}perl/author-addrs";
$warn = 0;

while (defined($argument = shift(@ARGV)) and $argument =~ /\A-/) {
    @args = split(//, $argument);
    shift(@args);		# Lose the leading -
    foreach $arg (@args) {
	if ($arg eq "l" or $arg eq "r") { 
	    push(@logfiles, shift(@ARGV));
	    if (!defined($logfiles[$#logfiles])) {
		die "-$arg: requires an argument.\n";
	    }
	}
	elsif ($arg eq "e") {
	    $expression = shift(@ARGV);
	    if (!defined($expression)) {
		die "-$arg: requires an argument.\n";
	    }
	}
	elsif ($arg eq "o") {
	    $OUTPUT_FILE = shift(@ARGV);
	    if (!defined($OUTPUT_FILE)) {
		die "-$arg: requires an argument.\n";
	    }
	}
	elsif ($arg eq "w") { $warn++ }	# Allow arbitrarily many w's.
	elsif ($arg eq "d") {
	    $directory = shift(@ARGV);
	    if (!defined($directory)) {
		die "-$arg: requires an argument.\n";
	    }
	}
	elsif ($arg eq "a") {
	    $AUTHORS = shift(@ARGV);
	    if (!defined($AUTHORS)) {
		die "-$arg: requires an argument.\n";
	    }
	}
	elsif ($arg eq "h" or $arg eq "?") { 
	    print STDERR $usage;
	    exit 1;
	}
	else { warn "Unknown argument -$arg ignored.\n" }
    }
}

if (! -d $directory) { die "Couldn't open directory $directory\n" }

if ($directory !~ m|/\Z|) { $directory .= "/" }

# Convert the logfile ranges to a proper list.
@logfiles = convert_range(@logfiles);

# Figure out which index files to use.
@ARGV = index_files(@logfiles);

# Look for the index files in the specified directory.
foreach (@ARGV) {s/\A/$directory/}

# Set up the output file.
if ($OUTPUT_FILE eq "-") {
    open(OUTPUT_FILE, ">&STDOUT") or die("couldn't dup stdout, died");
    open(STDOUT, ">&STDERR") or die("couldn't dup stderr, died");
}
else {
    open(OUTPUT_FILE, ">$OUTPUT_FILE")
	or die("couldn't open $OUTPUT_FILE, aborting.\n");
}

# Set up the authors assoc. array, if we're doing warnings.
if ($warn) {
    printf "Reading $AUTHORS...";	# printf to avoid $\
    open AUTHORS or die("Couldn't open authors file $AUTHORS, aborting.\n");
    $author_name = "";
    while (<AUTHORS>) {
	chomp;
	if (/^\t/) {
	    s/^\s*//;
	    $author_addresses{lc($_)} = $author_name;
	}
	else {
	    $author_name = $_;
	}
    }
    close AUTHORS;
    print "done.";
}


# State variables for the loop
$current_ind_searched = -1;	# -1 means not currently looking at an index.
$ind_to_search = shift(@logfiles);
$currently_searching = 0;	# Are we looking at something we care about?
@lines_in_current = ();		# The lines from the current index
				# entry
$found_something = 0;		# Something in the current entry matched.

# Loop through the index files
while (<>) {
    chomp;
    # Determine index currently being searched
    if (!/^ /) {		# Either not an index line, or a new
				# entry.  Either way, it's the end of
				# an entry if $currently_searching.
	if (!/^[\d ]/) { $current_ind_searched = -1 }
	elsif (/^(\d\d\d)/) { $current_ind_searched = $1 }
	if ($currently_searching) {
	    $currently_searching = 0;
	    if ($found_something) {
		do_read_log();
	    }

	    @lines_in_current = ();
	    $found_something = 0;
	    $ind_to_search = shift(@logfiles);
	    if (!defined($ind_to_search)) {
		last;
	    }
	}
    }

    if ($current_ind_searched < $ind_to_search) {
	next;
    }
    elsif ($current_ind_searched > $ind_to_search) {
	die "Hey!  $current_ind_searched is more than $ind_to_search!  Died";
    }
    else {			# We want to search this one
	$currently_searching = 1;
	if (/$expression/) { $found_something = 1 }
	push(@lines_in_current, $_);
    }
}

# Things to do at the end.  Should only execute if we searched the
# last index entry.
if ($currently_searching) {
    $currently_searching = 0;
    if ($found_something) {
	do_read_log();
    }

    $ind_to_search = shift(@logfiles);
    if (defined($ind_to_search)) {
	die "Hey, there were logs left to search when we ran out of index!  Died";
    }
}

print OUTPUT_FILE "Posts to all subtopics but AD:";
print OUTPUT_FILE
"Name                       Episodes     Posts     Lines     Words     Chars";

foreach $author (sort(keys(%statistics))) {
    $chars = 0;
    $words = 0;
    $lines = 0;
    $posts = 0;
    $eps = 0;
    foreach $subtopic (keys(%{$statistics{$author}})) {
	if ($subtopic !~ /^(LNH-old|ADMINISTRIVIA)$/) {
	    $chars += $statistics{$author}{$subtopic}{'chars'};
	    $words += $statistics{$author}{$subtopic}{'words'};
	    $lines += $statistics{$author}{$subtopic}{'lines'};
	    $posts += $statistics{$author}{$subtopic}{'posts'};
	    $eps += $statistics{$author}{$subtopic}{'eps'};
	}
    }
    printf(OUTPUT_FILE "%-25s  %8d  %8d  %8d  %8d  %8d\n",
	   $author, $eps, $posts, $lines, $words, $chars);
}


close(OUTPUT_FILE);


# Subroutines

sub convert_range {
    my @ranges = @_;
    my @files = ();
    my $range;
    my $lower = 10000;
    my $upper = -1;
    my($bot, $top);

    # I might want the pattern more general than this some day.
    my @logfilesfound = glob("$directory/superguy.*[0-9] $directory/superguy.*.gz");

    foreach $file (@logfilesfound) {
	$file =~ /superguy\.(\d*)/;
	if ($1 < $lower) { $lower = $1 }
	if ($1 > $upper) { $upper = $1 }
    }

    if ($#ranges == -1) {	# There weren't any files specified.
	push(@ranges, "$lower-$upper");
    }

    foreach $range (@ranges) {
	if ($range =~ /\A(\d*)-(\d*)\Z/) {
	    $bot = $1;
	    $top = $2;
	    if ( $bot eq "" ) { $bot = $lower }
	    if ( $top eq "" ) { $top = $upper }
	    if ( $bot < $lower ) {
		warn("Bad first parameter in $range: first file is $lower\n");
		$bot = $lower;
	    }
	    if ( $top > $upper ) {
		warn("Bad second parameter in $range: last file is $upper\n");
		$top = $upper;
	    }
	    push(@files, $bot..$top);
	}
	elsif ($range =~ /\A\d+\Z/) {
	    if ( $range < $lower or $range > $upper ) {
		warn("Parameter $range out of range, skipping\n");
	    }
	    else {
		push(@files, $range);
	    }
	}
	else { warn("Bad parameter $range, skipping\n") }
    }

    return(sort {$a <=> $b} @files);
}

sub index_files {
    my @indfiles = ();
    my $old_ind = 0;
    my $ind;
    my $end;
    my $log_num;

    while ($log_num = shift(@_)) {
	$ind = 30* int(($log_num-1)/30)+1;
	if ($ind != $old_ind) {
	    push(@indfiles,
		 sprintf("Superguy.Index.%03d-%03d.txt", $ind,
			 30*(int(($log_num-1)/30)+1)));
	    $old_ind = $ind;
	}
    }
#    return @indfiles;
    return ("Superguy.Index.complete.txt"); # XXX  Kludge, kludge...
					    # there must be a better way.
}


# Read the log file.  $ind_to_search, @lines_in_current, and
# OUTPUT_FILE should be set up properly.
sub do_read_log {
    my $LOG;

    $LOG = sprintf("superguy.%03d", $ind_to_search);
    if (-r "$directory$LOG") {
	open(LOG, "$directory$LOG")
	    or die("Couldn't open $directory$LOG\n");
	if ($warn) {print("Scanning $LOG...")}
    }
    elsif (-r "$directory$LOG.gz") {
	open(LOG, "gzcat $directory$LOG.gz |")
	    or die("Couldn't exec gzcat or couldn't open $directory$LOG.gz\n");
	if ($warn) {print("Scanning $LOG.gz...")}
    }
    else { die("Couldn't find $LOG\n") }
    
    # There are four types of separators separating entries in the logs.
    #   Logs 1 - 6: '<<\d\d\d>>' begins an entry.
    #   Logs 7 - 14, 26 - 126: "NICBBS":
    #      '\*\*\*\*\* Received \d\d:\d\d:\d\d on \d\d/\d\d/\d\d,' ...
    #      _ends_ an entry.  (This is usually the only date indication
    #      we have.)
    #   Logs 15 - 25: '={78,79}' begins an entry.
    #   Logs 126 on: "UCF1VM": '={73}' begins an entry.
    #  Note that the transition from NICBBS to UCF1VM happens _within_
    #   log 126.  *Both* separators occur between Ramrod:  Media
    #   Darling #5 parts 2 and 3 (as would be expected).
    #  We handle this by assuming instead that separators end entries,
    #   but allowing eof() as a separator, and disregarding "entries"
    #   of less than five lines.

    my @entry = ();
    my($log_line, %header, $parsing_a_header, $partial_header, $month);
    my($complain, $key);

    while ($this_index = shift(@lines_in_current)) {
	@entry = ();
	undef %header;
	undef %index;
	$parsing_a_header = 0;
	$complain = 0;
	$provisional_subtopic = "";

	while ($log_line = <LOG>) {
	    chomp $log_line;
	    if (($ind_to_search >= 1 and $ind_to_search <= 6)
		and $log_line =~ /^<<\d\d\d>>$/ ) { last }

	    elsif ((($ind_to_search >= 7 and $ind_to_search <= 14) or
		   ($ind_to_search >= 26 and $ind_to_search <= 126))
		   and $log_line =~
		   m|^\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d)|) {
		# Assemble a proper "Date:" header.
		$month = $short_month_names[$2-1];
		my $date = "$3 $month 19$4 $1";

		# Take the date in the received line as canonical.
		if ($warn and exists($header{'date'})) {
		    print("!!! Duplicate date $header{'date'}");
		}
		$header{'date'} = sprintf("19%02d-%02d-%02d %s", $4, $2, $3,
					  $1); 

		# Put "Date:" after leading blank lines
		my @blanks = ();
		while (($log_line = shift(@entry)) =~ /^\s*$/) {
		    unshift(@blanks, $log_line);
		}
		unshift(@entry, $log_line);
		unshift(@entry, "Date:        $date");
		foreach $log_line (@blanks) { unshift(@entry, $log_line) }
		last;
	    }

	    elsif (($ind_to_search >= 15 and $ind_to_search <= 25)
		   and $log_line =~ /^={78,79}$/ ) { last }

	    elsif (($ind_to_search >= 126)
		   and $log_line =~ /^={73}$/ ) { last }

	    # Parse out headers

	    if ($parsing_a_header) {
		# A continued header line -- begins with whitespace, and
		# contains some non-whitespace.

		if ( $log_line =~ /^\s+(\S.*)/ ) {
		    $partial_header .= $1;
		}
		else {
		    if ( $partial_header =~ /\AFrom: / ) {
			$partial_header =~ s/\AFrom: *//;
			$partial_header =~ s/.*<(.*)>/$1/;
			if (!exists($header{'from'})) {
			    $header{'from'} = $partial_header;
			}
			elsif ( $warn ) {
			    print "!!! duplicate sender \"$partial_header\"";
			}
		    }
		    elsif ( $partial_header =~ /\ASubject: .*/ ) {
			$partial_header =~ s/^Subject: *//;
			$partial_header =~ s/<(Superguy|Sfstory) \d*> *//;
			if ( defined $1 ) { $provisional_subtopic = $1 }

			if (!exists($header{'subject'})) {
			    $header{'subject'} = $partial_header;
			    if(defined($provisional_subtopic)) {
				if ($provisional_subtopic eq "Superguy") {
				    $header{'subtopic'} = "SG";
				}
				elsif ($provisional_subtopic eq "Sfstory") {
				    $header{'subtopic'} = "SF";
				}
			    }
			    elsif ($partial_header =~
				   m|\A(Re: *)?(Repost: *)?([\w/]*):|i) {
				$header{'subtopic'} = $3;
			    }
			}
			elsif ( $warn ) {
			    print "!!! duplicate subject \"$partial_header\"";
			}
		    }
		    else { die "Saw a header I wasn't expecting, stopped" }
		    $parsing_a_header = 0;
		}
	    }

	    if ($log_line =~ /^Subject: / or $log_line =~ /^From: /) {
		$partial_header = $log_line;
		$parsing_a_header = 1;
	    }

	    elsif ($log_line =~ /^Date: /) {	# Date header.
		($newdate = $log_line) =~ s/^Date:\s*//;
		if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) {
		    if (exists ($month_numbers{$2})) {
			$month = $month_numbers{$2};
		    }
		    else {
			warn "!!! unknown month \"$2\"";
			$month = 0;
		    }
		    $newdate = sprintf("%04d-%02d-%02d %s", $3, $month,
				       $1, $4);
		    if (!exists($header{'date'})) {
			$header{'date'} = $newdate;
		    }
		    elsif ($warn) { print "!!! duplicate date \"$newdate\"" }
		}
		elsif ($warn) { 
		    print "!!! unknown date format \"$newdate\" -- ignored.";
		}

	    }
	    
	    # Kill control-Z's, they're annoying.
	    $log_line =~ s/\032//g;
	    
	    # Skip those stupid "Archived" lines.
	    if ($log_line =~ /\*\*\*\*\* Archived/) { next }

	    push(@entry, $log_line);
	}

	if ($#entry < 5) { redo } # Entry too small, must be bogus.

	if ($warn >= 2) {
	    if (!exists($header{'subject'})) {print "!!! subject missing"}
	    if (!exists($header{'from'}))    {print "!!! sender missing"}
	    if (!exists($header{'date'}))    {print "!!! date missing"}
	}

	$index{'date'}     = substr($this_index, 5,  8);
	$index{'subtopic'} = substr($this_index, 15, 2);
	$index{'subject'}  = substr($this_index, 19, 37);
	$index{'author'}   = substr($this_index, 57, 25);
	
	$index{'subject'} =~ s/\s*\Z//;
	$index{'author'}  =~ s/\s*\Z//;
	$index{'date'}    =~ s|(..)/(..)/(..)|19$3-$1-$2|;

	$index{'subtopic'} =~ s/LN/LNH/g;
	if ($index{'subtopic'} =~ /LNH/ and $ind_to_search < 591) {
	    $index{'subtopic'} =~ s/LNH/LNH-old/;
	}
	$index{'subtopic'} =~ s/AD/ADMINISTRIVIA/g;
	    
	if ($warn) {
	    if (exists($header{'date'}) and
		abs(days_since_1900($index{'date'}) -
		    days_since_1900($header{'date'})) > 2) {
		print "!!! Index and Log disagree about date.";
		$complain = 1;
	    }
	    
	    if (exists($header{'subtopic'}) and
		lc($header{'subtopic'}) ne lc($index{'subtopic'})) {
		print "!!! Index and Log disagree about subtopic.";
		$complain = 1;
	    }

	    
	    if (exists($header{'from'})) {
		if(exists($author_addresses{lc($header{'from'})})) {
		    $header{'from_name'} = $author_addresses{lc($header{'from'})};
		    if ($index{'author'} !~ /$header{'from_name'}/i
			and $index{'subtopic'} ne "LNH"
			# Many LNH reposts were posted by someone other than
			# the author 
			and $index{'author'} !~ m|/|) # Collaborations
		    {
			print "!!! Author and sender are different.";
			$complain = 1;
		    }
		    
		}
		else {
		    print "!!! Sender address $header{'from'} not in the address database.";
		    $complain = 1; # XXX
		}
	    }

	    if ($complain) {
		printf("!!! Index:  %s  %s: %s  %s\n",
		       $index{'date'}, $index{'subtopic'}, $index{'subject'},
		       $index{'author'}); 
		printf("!!! Log:    %s  %s  %s <%s>\n",
		       exists($header{'date'})      ? $header{'date'}     : "",
		       exists($header{'subject'})   ? $header{'subject'}  : "",
		       exists($header{'from_name'}) ? $header{'from_name'}: "",
		       exists($header{'from'})      ? $header{'from'}     : "");
	    }
	}

	
	if (eof(LOG) and $#lines_in_current != -1) {
	    warn("Uh-oh, not enough log for all the indices, " .
		 "something's probably gone wrong.\n");
	    last;
	}

	if ($index{'subject'} =~ /History.*Superguy/) {
	    $index{'subtopic'} = "ADMINISTRIVIA";
	}

	if ($this_index =~ /$expression/) {
	    if (!exists($statistics{$index{'author'}}{$index{'subtopic'}})) {
		$statistics{$index{'author'}}{$index{'subtopic'}}{'chars'} = 0;
		$statistics{$index{'author'}}{$index{'subtopic'}}{'words'} = 0;
		$statistics{$index{'author'}}{$index{'subtopic'}}{'lines'} = 0;
		$statistics{$index{'author'}}{$index{'subtopic'}}{'posts'} = 0;
		$statistics{$index{'author'}}{$index{'subtopic'}}{'eps'} = 0;
		$statistics{$index{'author'}}{$index{'subtopic'}}{'oldtitle'} = "";
	    }
	    
	
	    foreach $line (@entry) {
		$statistics{$index{'author'}}{$index{'subtopic'}}{'chars'} +=
		    length($line);
		@foo = split(' ', $line);
		$statistics{$index{'author'}}{$index{'subtopic'}}{'words'} +=
		    scalar(@foo);
		$statistics{$index{'author'}}{$index{'subtopic'}}{'lines'} ++;
	    }
	    
	    $statistics{$index{'author'}}{$index{'subtopic'}}{'posts'} ++;
	    
	    ($title = $index{'subject'}) =~ s|\([^)/]+/[^)]+\).*||g;
	    if ($statistics{$index{'author'}}{$index{'subtopic'}}{'oldtitle'}
		ne $title) {
		$statistics{$index{'author'}}{$index{'subtopic'}}{'eps'} ++;
	    }
	    $statistics{$index{'author'}}{$index{'subtopic'}}{'oldtitle'} = $title;
	    
	}
    }
    if (!eof(LOG) and $#entry >= 5) {
	warn("Uh-oh, not enough index for all the logs, " .
	     "something's probably gone wrong.\n");
    }

    close(LOG);
}

sub days_since_1900		# Takes a date in YYYY-MM-DD, returns
				# days since Jan 1 1900
				# valid for Mar 1 1900 - Feb 28 2100
{
    my $date = shift;
    $date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/;
    my $year = $1;
    my $month = $2;
    my $day = $3;
    
    my $total = ($year - 1900) * 365; # regular days since 1900
    $total += int( ($year - 1900) / 4 ); # leap days since 1900, not
				         # counting this year
    if ($year % 4 == 0 && $month > 2) { $total++ } # leap year this year
    if ($month < 1 or $month > 12) { die "month \#$month???  date = $date, stopped" }
    $total += $year_to_date[$month - 1]; # earlier months this year
    $total += $day;		# days this month
    $total;			# return this value
}