#!/bin/perl5 -w

$\ = "\n";
$directory = "/import/ftp/pub/lennox/superguy/";

$progname = "autocollect";

$usage = 
    "Usage: $progname [options] search-pattern\n\n" .
    "Options:\n" .
    "  -s          Search using case-sensitive string matching.\n" .
    "  -i          Search using case-insensitive string matching (default).\n" .
    "  -e          Search using regular expression matching.\n" .
    "  -r num or -r num-num\n" .
    "              Search only the logs specified (multiple ranges can be given).\n" .
    "  -x          Output index entries for matching posts, rather than the\n" .
    "              posts themselves.\n" .
    "  -d <dir>    Use <dir> as archive directory.\n" .
    "  -o <file>   Output to file <file>.\n" .
    "  -v          Give verbose output of what logs are being scanned.\n" .
    "  -w          Warn of log inconsistencies.\n" .
    "              (Give more -w's for more warnings.)\n" .
    "  -a <file>   Use <file> to specify author e-mail addresses.\n" .
    "              (Used in inconsistency checking.)\n" .
    "  -h, -?      Print this help information.";

# useful constants for inconsistency-checking.
@short_month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul",
		      "Aug", "Sep", "Oct", "Nov", "Dec");
%month_numbers = ("Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4,
		  "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8,
		  "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12);
@year_to_date = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 );

# variables set by argument parsing ($directory can also be set).

$search_type = 1; # 0 - string 1 - casestring 2 - regexp
@logfiles = ();
$output_index = 0;
$OUTPUT_FILE = "-";
$verbose = 0;
$warn = 0;

while (defined($argument = shift(@ARGV))) {
    if ($argument =~ /\A-/) {
	@args = split(//, $argument);
	shift(@args);		# Lose the leading -
	foreach $arg (@args) {
	    if ($arg eq "s") { $search_type = 0 }
	    elsif ($arg eq "i") { $search_type = 1 }
	    elsif ($arg eq "e") { $search_type = 2 }
	    elsif ($arg eq "r") { 
		push(@logfiles, shift(@ARGV));
		if (!defined($logfiles[$#logfiles])) {
		    die "-$arg: requires an argument.\n";
		}
	    }
	    elsif ($arg eq "x") { $output_index = 1 }
	    elsif ($arg eq "d") {
		$directory = shift(@ARGV);
		if (!defined($directory)) {
		    die "-$arg: requires an argument.\n";
		}
	    }
	    elsif ($arg eq "o") {
		$OUTPUT_FILE = shift(@ARGV);
		if (!defined($OUTPUT_FILE)) {
		    die "-$arg: requires an argument.\n";
		}
	    }
	    elsif ($arg eq "v") { $verbose = 1 }
	    elsif ($arg eq "w") { $warn++ }	# Allow arbitrarily many w's.
	    elsif ($arg eq "a") {
		$AUTHORS = shift(@ARGV);
		if (!defined($AUTHORS)) {
		    die "-$arg: requires an argument.\n";
		}
	    }
	    elsif ($arg eq "h" or $arg eq "?") { 
		print STDERR $usage;
		exit 1;
	    }
	    else { warn "Unknown argument -$arg ignored.\n" }
	}
    } else {
	$expression = $argument;
    }
}
	
if (!defined($expression)) {
    print STDERR "No search pattern specified.";
    print STDERR "Use the search string '.' to extract all material in the range specified.\n";
    print STDERR "Type $progname -h for usage information.";
    exit 1;
}

if ($search_type == 0 or $search_type == 1) {
    if ($expression ne ".") {
	$expression = quotemeta($expression);
    }
} elsif (!defined(eval "'' =~ /\$expression/")) {
    print STDERR "Invalid regular expression \"$expression\".\n";
    print STDERR "Type $progname -h for usage information.";
    exit 1;
}

if ($search_type == 1) {
    $expression = "(?i)" . $expression;
}


if (! -d $directory) { die "Couldn't open directory $directory\n" }

if ($directory !~ m|/\Z|) { $directory .= "/" }
if (!defined($AUTHORS)) {
    $AUTHORS = "${directory}perl/author-addrs";
}

if ($warn) {
    $verbose = 1;
}

# Convert the logfile ranges to a proper list.
@logfiles = convert_range(@logfiles);

# Figure out which index files to use.
@ARGV = index_files(@logfiles);

# Look for the index files in the specified directory.
foreach (@ARGV) {s/\A/$directory/}

# Set up the output file.
if ($OUTPUT_FILE eq "-") {
    open(OUTPUT_FILE, ">&STDOUT") or die("couldn't dup stdout, died");
    open(STDOUT, ">&STDERR") or die("couldn't dup stderr, died");
}
else {
    open(OUTPUT_FILE, ">$OUTPUT_FILE")
	or die("couldn't open $OUTPUT_FILE, aborting.\n");
}

# Set up the authors assoc. array, if we're doing warnings.
if ($warn) {
    printf "Reading $AUTHORS...";	# printf to avoid $\
    open AUTHORS or die("Couldn't open authors file $AUTHORS, aborting.\n");
    $author_name = "";
    while (<AUTHORS>) {
	chomp;
	if (/^\s+/) {
	    s/^\s*//;
	    $author_addresses{lc($_)} = $author_name;
	}
	else {
	    $author_name = $_;
	}
    }
    close AUTHORS;
    print "done.";
}


# State variables for the loop
$current_ind_searched = -1;	# -1 means not currently looking at an index.
$ind_to_search = shift(@logfiles);
$currently_searching = 0;	# Are we looking at something we care about?
@lines_in_current = ();		# The lines from the current index
				# entry
$found_something = 0;		# Something in the current entry matched.
$printed_something = 0; 	# We actually output something.

# Loop through the index files
while (<>) {
    chomp;
    # Determine index currently being searched
    if (!/^ /) {		# Either not an index line, or a new
				# entry.  Either way, it's the end of
				# an entry if $currently_searching.
	if (!/^[\d ]/) { $current_ind_searched = -1 }
	elsif (/^(\d\d\d\d)/ or /^(\d\d\d) /) { $current_ind_searched = $1 }
	if ($currently_searching) {
	    $currently_searching = 0;
	    if ($found_something) {
		if ($output_index) { do_print_index(); }
		else { do_read_log(); }
	    }

	    @lines_in_current = ();
	    $found_something = 0;
	    $ind_to_search = shift(@logfiles);
	    if (!defined($ind_to_search)) {
		last;
	    }
	}
    }

    if ($current_ind_searched < $ind_to_search) {
	next;
    }
    elsif ($current_ind_searched > $ind_to_search) {
	die "Hey!  $current_ind_searched is more than $ind_to_search!  Died";
    }
    else {			# We want to search this one
	$currently_searching = 1;
	if (/$expression/) { $found_something = 1 }
	push(@lines_in_current, $_);
    }
}

# Things to do at the end.  Should only execute if we searched the
# last index entry.
if ($currently_searching) {
    $currently_searching = 0;
    if ($found_something) {
	if ($output_index) { do_print_index(); }
	else { do_read_log(); }
    }

    $ind_to_search = shift(@logfiles);
    if (defined($ind_to_search)) {
	die "Hey, there were logs left to search when we ran out of index!  Died";
    }
}

if (!$printed_something) {
    # For the command line version we don't have to do anything here.
}

close(OUTPUT_FILE);


# Subroutines

sub convert_range {
    my @ranges = @_;
    my @files = ();
    my $range;
    my $lower = 10000;
    my $upper = -1;
    my($bot, $top);

    # I might want the pattern more general than this some day.
    my @logfilesfound = glob("$directory/sguy*[0-9] $directory/sguy*.gz");

    foreach $file (@logfilesfound) {
	$file =~ /sguy(\d*)/;
	if ($1 < $lower) { $lower = $1 }
	if ($1 > $upper) { $upper = $1 }
    }

    if ($#ranges == -1) {	# There weren't any files specified.
	push(@ranges, "$lower-$upper");
    }

    foreach $range (@ranges) {
	if ($range =~ /\A(\d*)-(\d*)\Z/) {
	    $bot = $1;
	    $top = $2;
	    if ( $bot eq "" ) { $bot = $lower }
	    if ( $top eq "" ) { $top = $upper }
	    if ( $bot < $lower ) {
		warn("Bad first parameter in $range: first file is $lower\n");
		$bot = $lower;
	    }
	    if ( $top > $upper ) {
		warn("Bad second parameter in $range: last file is $upper\n");
		$top = $upper;
	    }
	    push(@files, $bot..$top);
	}
	elsif ($range =~ /\A\d+\Z/) {
	    if ( $range < $lower or $range > $upper ) {
		warn("Parameter $range out of range, skipping\n");
	    }
	    else {
		push(@files, $range);
	    }
	}
	else { warn("Bad parameter $range, skipping\n") }
    }

    return(sort {$a <=> $b} @files);
}

sub index_files {
# This is pretty trivial, but it didn't used to be, and I didn't feel like
# changing the calling functions.  Besides, we might need it again someday.
    return ("Superguy.Index.complete.txt");
}

# Print the relevant index entries.  $ind_to_search, @lines_in_current, and
# OUTPUT_FILE should be set up properly.
# XXX: warnings don't work for index-only.
sub do_print_index {
    my($first, $line);
    $first = 0;
    while (defined($line = shift(@lines_in_current))) {
	if ($line =~ /$expression/) {
	    if (!$first) {
		# Force the index number on, regardless of whether it was there
		# originally.
		$line =~ s/^....//;
		$line = sprintf("%04d", $ind_to_search) . $line;
		$first = 1;
	    }
	    if (!$printed_something) {
		printf OUTPUT_FILE <<END;
Vol    Date    ID  Title                                 Author
---- --------  --  ------------------------------------  -------------------
END
		$printed_something = 1;
	    }	
	    print(OUTPUT_FILE $line);
	}
    }
}
 
# Read the log file.  $ind_to_search, @lines_in_current, and
# OUTPUT_FILE should be set up properly.
sub do_read_log {
    my $LOG;

    $LOG = sprintf("sguy%04d", $ind_to_search);
    if (-r "$directory$LOG") {
	open(LOG, "$directory$LOG")
	    or die("Couldn't open $directory$LOG\n");
	if ($verbose) {
	    print("Scanning $LOG...");
	}
    }
    elsif (-r "$directory$LOG.gz") {
	open(LOG, "gzcat $directory$LOG.gz |")
	    or die("Couldn't exec gzcat or couldn't open $directory$LOG.gz\n");
	if ($verbose) {
	    print("Scanning $LOG.gz...");
	}
    }
    else { die("Couldn't find $LOG\n") }
    
    # There are four types of separators separating entries in the logs.
    #   Logs 1 - 6: '<<\d\d\d>>' begins an entry.
    #   Logs 7 - 14, 26 - 126: "NICBBS":
    #      '\*\*\*\*\* Received \d\d:\d\d:\d\d on \d\d/\d\d/\d\d,' ...
    #      _ends_ an entry.  (This is usually the only date indication
    #      we have.)
    #   Logs 15 - 25: '={78,79}' begins an entry.
    #   Logs 126 on: "UCF1VM": '={73}' begins an entry.
    #  Note that the transition from NICBBS to UCF1VM happens _within_
    #   log 126.  *Both* separators occur between Ramrod:  Media
    #   Darling #5 parts 2 and 3 (as would be expected).
    #  We handle this by assuming instead that separators end entries,
    #   but allowing eof() as a separator, and disregarding "entries"
    #   of less than five lines.

    my @entry = ();
    my($log_line, %header, $parsing_a_header, $partial_header, $month);
    my($complain, $key);

    while (defined($this_index = shift(@lines_in_current))) {
	@entry = ();
	undef %header;
	undef %index;
	$parsing_a_header = 0;
	$complain = 0;
	$provisional_subtopic = "";

	while (defined($log_line = <LOG>)) {
	    chomp $log_line;
	    if (($ind_to_search >= 1 and $ind_to_search <= 6)
		and $log_line =~ /^<<\d\d\d>>$/ ) { last }

	    elsif ((($ind_to_search >= 7 and $ind_to_search <= 14) or
		   ($ind_to_search >= 26 and $ind_to_search <= 126))
		   and $log_line =~
		   m|^\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d)|) {
		# Assemble a proper "Date:" header.
		$month = $short_month_names[$2-1];
		my $date = "$3 $month 19$4 $1";

		# Take the date in the received line as canonical.
		if ($warn and exists($header{'date'})) {
		    print("!!! Duplicate date $header{'date'}");
		}
                # This is not a Y2K bug, since this only applies to logs <=
                # 126, which was in 1991
		$header{'date'} = sprintf("19%02d-%02d-%02d %s", $4, $2, $3,
					  $1); 

		# Put "Date:" after leading blank lines
		my @blanks = ();
		while (($log_line = shift(@entry)) =~ /^\s*$/) {
		    unshift(@blanks, $log_line);
		}
		unshift(@entry, $log_line);
		unshift(@entry, "Date:        $date");
		foreach $log_line (@blanks) { unshift(@entry, $log_line) }
		last;
	    }

	    elsif (($ind_to_search >= 15 and $ind_to_search <= 25)
		   and $log_line =~ /^={78,79}$/ ) { last }

	    elsif (($ind_to_search >= 126)
		   and $log_line =~ /^={73}$/ ) { last }

	    # Parse out headers

	    if ($parsing_a_header) {
		# A continued header line -- begins with whitespace, and
		# contains some non-whitespace.

		if ( $log_line =~ /^\s+(\S.*)/ ) {
		    $partial_header .= $1;
		}
		else {
		    if ( $partial_header =~ /\AFrom: / ) {
			$partial_header =~ s/\AFrom: *//;
			$partial_header =~ s/.*<(.*)>/$1/ or
			    $partial_header =~ s/ +\(.*\)$//;
			if (!exists($header{'from'})) {
			    $header{'from'} = $partial_header;
			}
			elsif ( $warn ) {
			    print "!!! duplicate sender \"$partial_header\"";
			}
		    }
		    elsif ( $partial_header =~ /\ASubject: .*/ ) {
			$partial_header =~ s/^Subject: *//;
			$partial_header =~ s/<(Superguy|Sfstory) \d*> *//;
			if ( defined $1 ) { $provisional_subtopic = $1 }

			if (!exists($header{'subject'})) {
			    $header{'subject'} = $partial_header;
			    if(defined($provisional_subtopic)) {
				if ($provisional_subtopic eq "Superguy") {
				    $header{'subtopic'} = "SG";
				}
				elsif ($provisional_subtopic eq "Sfstory") {
				    $header{'subtopic'} = "SF";
				}
			    }
			    elsif ($partial_header =~
				   m|\A(Re: *)?(Repost: *)?([\w/]*):|i) {
				$header{'subtopic'} = $3;
			    }
			}
			elsif ( $warn ) {
			    print "!!! duplicate subject \"$partial_header\"";
			}
		    }
		    else { die "Saw a header I wasn't expecting, stopped" }
		    $parsing_a_header = 0;
		}
	    }

	    if ($log_line =~ /^Subject: / or $log_line =~ /^From: /) {
		$partial_header = $log_line;
		$parsing_a_header = 1;
	    }

	    elsif ($log_line =~ /^Date: /) {	# Date header.
		($newdate = $log_line) =~ s/^Date:\s*//;
		if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) {
		    if (exists ($month_numbers{$2})) {
			$month = $month_numbers{$2};
		    }
		    else {
			warn "!!! unknown month \"$2\"";
			$month = 0;
		    }
		    $newdate = sprintf("%04d-%02d-%02d %s", $3, $month,
				       $1, $4);
		    if (!exists($header{'date'})) {
			$header{'date'} = $newdate;
		    }
		    elsif ($warn) { print "!!! duplicate date \"$newdate\"" }
		}
		elsif ($warn) { 
		    print "!!! unknown date format \"$newdate\" -- ignored.";
		}

	    }
	    
	    # Kill control-Z's, they're annoying.
	    $log_line =~ s/\032//g;
	    
	    # Skip those stupid "Archived" lines.
	    if ($log_line =~ /\*\*\*\*\* Archived/) { next }

	    push(@entry, $log_line);
	}

	if ($#entry < 5) { redo } # Entry too small, must be bogus.

	if ($warn >= 2) {
	    if (!exists($header{'subject'})) {print "!!! subject missing"}
	    if (!exists($header{'from'}))    {print "!!! sender missing"}
	    if (!exists($header{'date'}))    {print "!!! date missing"}
	}

	if ($warn) {
	    $index{'date'}     = substr($this_index, 5,  8);
	    $index{'subtopic'} = substr($this_index, 15, 2);
	    $index{'subject'}  = substr($this_index, 19, 37);
	    $index{'author'}   = substr($this_index, 57, 25);
	    
	    $index{'subject'} =~ s/\s*\Z//;
	    $index{'author'}  =~ s/\s*\Z//;
	    $index{'date'}    =  idxdate2iso($index{'date'});

	    if (exists($header{'date'}) and
		abs(days_since_1900($index{'date'}) -
		    days_since_1900($header{'date'})) > 2) {
		print "!!! Index and Log disagree about date.";
		$complain = 1;
	    }
	    
	    $index{'subtopic'} =~ s/LN/LNH/g;
	    $index{'subtopic'} =~ s/AD/ADMINISTRIVIA/g;
	    
	    if (exists($header{'subtopic'}) and
		lc($header{'subtopic'}) ne lc($index{'subtopic'})) {
		print "!!! Index and Log disagree about subtopic.";
		$complain = 1;
	    }

	    
	    if (exists($header{'from'})) {
		if(exists($author_addresses{lc($header{'from'})})) {
		    $header{'from_name'} = $author_addresses{lc($header{'from'})};
		    if ($index{'author'} !~ /$header{'from_name'}/i
			and $index{'subtopic'} ne "LNH"
			# Many LNH reposts were posted by someone other than
			# the author 
			and $index{'author'} !~ m|/|) # Collaborations
		    {
			print "!!! Author and sender are different.";
			$complain = 1;
		    }
		    
		}
		else {
		    print "!!! Sender address $header{'from'} not in the address database.";
		    $complain = 1; # XXX
		}
	    }

	    if ($complain) {
		printf("!!! Index:  %s  %s: %s  %s\n",
		       $index{'date'}, $index{'subtopic'}, $index{'subject'},
		       $index{'author'}); 
		printf("!!! Log:    %s  %s  %s <%s>\n",
		       exists($header{'date'})      ? $header{'date'}     : "",
		       exists($header{'subject'})   ? $header{'subject'}  : "",
		       exists($header{'from_name'}) ? $header{'from_name'}: "",
		       exists($header{'from'})      ? $header{'from'}     : "");
	    }
	}

	
	if (eof(LOG) and $#lines_in_current != -1) {
	    warn("Uh-oh, not enough log for all the indices, " .
		 "something's probably gone wrong.\n");
	    last;
	}

	if ($this_index =~ /$expression/) {
	    $this_index =~ s/^.....//;
	    if ($verbose) {
		print("$this_index");
	    }
	    if (!$printed_something) {
		$printed_something = 1;
	    }
	    print(OUTPUT_FILE "="x73);
	    foreach (@entry) {
		print OUTPUT_FILE;
	    }
	}
    }
    if (!eof(LOG) and $#entry >= 5) {
	warn("Uh-oh, not enough index for all the logs, " .
	     "something's probably gone wrong.\n");
    }

    close(LOG);
}

sub days_since_1900		# Takes a date in YYYY-MM-DD, returns
				# days since Jan 1 1900
				# valid for Mar 1 1900 - Feb 28 2100
{
    my $date = shift;
    $date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/;
    my $year = $1;
    my $month = $2;
    my $day = $3;
    
    my $total = ($year - 1900) * 365; # regular days since 1900
    $total += int( ($year - 1900) / 4 ); # leap days since 1900, not
				         # counting this year
    if ($year % 4 == 0 && $month > 2) { $total++ } # leap year this year
    if ($month < 1 or $month > 12) { die "month \#$month???  date = $date, stopped" }
    $total += $year_to_date[$month - 1]; # earlier months this year
    $total += $day;		# days this month
    $total;			# return this value
}

sub idxdate2iso($) {
    my ($idx) = @_;
    my ($yr, $mo, $day, $fullyear);
    if ($idx =~ m|(..)/(..)/(..)|) {
        $mo = $1;
        $day = $2;
        $yr = $3;
        # Pivot year on 1970.  XXX: Y2K-esque bug
        if ($yr < 70) { $fullyear = $yr + 2000 }
        else {$fullyear = $yr + 1900}
        $idx = "$fullyear-$mo-$day";
    }
    return $idx;
}
