#!/bin/perl5 -w

# The combination of all the other little perl programs lying around
# this directory.  Arguments are as indicated by usage.

$\ = "\n";

$usage = "$0 [-w] [-d directory] [-a author-file] -i index-files... -l log_files...";

# Option-parsing flags
$currently_reading = 0;		# 0 = unknown
				# 1 = index files
				# 2 = log files
				# 3 = directory
				# 4 = author file

# variables set by the option parsing
$write_files = 0;
$directory = "/tmp/foo$$";	# Arbitrary and hopefully harmless
$AUTHORS = "author-addrs";

while ($_ = shift(@ARGV)) {
    if (/\A-i/) {		# "-index"
	$currently_reading = 1;
    }

    elsif (/\A-l/) {		# "-log"
	$currently_reading = 2;
    }

    elsif (/\A-w/) {		# "-write"
	$write_files = 1;
	$currently_reading = 0;
    }

    elsif (/\A-d/) {		# "-directory"
	$currently_reading = 3;
    }

    elsif (/\A-a/) {		# "-author-file"
	$currently_reading = 4;
    }

    elsif (/\A-[?hu]/) {	# -?, -help, -usage, etc.
	print $usage;
	exit;
    }

    elsif (/\A-/) {
	die("Unknown option $_\n$usage\n");
    }

    else			# Must be a file
    {
	$name = $_;
	$name =~ s/(.*\.(Z|z|gz)\Z)/gzcat $1|/;
	
	if ($currently_reading == 1) {
	    push(@INDICES, $name);
	}

	elsif ($currently_reading == 2) {
	    push(@LOGS, $name);
	}

	elsif ($currently_reading == 3) {
	    $directory = $_;
	    $currently_reading = 0;
	}

	elsif ($currently_reading == 4) {
	    $AUTHORS = $name;
	    $currently_reading = 0;
	}

	else { warn "Argument $_ seen in an unexpected context, ignoring.\n" }
    }
}

if (scalar(@INDICES == 0) and scalar(@LOGS == 0)) {
    die "You must specify at least one index or one log.\n";
}
elsif (scalar(@INDICES == 0)) {
    warn "Reading index from stdin...\n";
    push(@INDICES, "-");
}
elsif (scalar(@LOGS == 0)) {
    warn "Reading log from stdin...\n";
    push(@LOGS, "-");
}

@ARGV = @LOGS;

# read the file of author addresses, and create the assoc. array.
printf "Reading $AUTHORS...";	# printf to avoid $\
open AUTHORS;
$author_name = "";
while (<AUTHORS>) {
    chomp;
    if (/^\t/) {
	s/^\s*//;
	$author_addresses{lc($_)} = $author_name;
    }
    else {
	$author_name = $_;
    }
}
close AUTHORS;
print "done.";

# some support routines and data structures

# Date handling
%month_numbers = ("Jan" => "01", "Feb" => "02", "Mar" => "03", "Apr" => "04",
		  "May" => "05", "Jun" => "06", "Jul" => "07", "Aug" => "08",
		  "Sep" => "09", "Oct" => "10", "Nov" => "11", "Dec" => "12");

@month_names = ("January", "February", "March", "April", "May", "June", "July",
		"August", "September", "October", "November", "December");

@year_to_date = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 );

sub days_since_1900		# Takes a date in YYYY-MM-DD, returns
				# days since Jan 1 1900
				# valid for Mar 1 1900 - Feb 28 2100
{
    $date = shift;
    $date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/;
    $year = $1;
    $month = $2;
    $day = $3;
    
    $total = ($year - 1900) * 365; # regular days since 1900
    $total += int( ($year - 1900) / 4 ); # leap days since 1900, not
				         # counting this year
    if ($year % 4 == 0 && $month > 2) { $total++ } # leap year this year
    $total += $year_to_date[$month - 1]; # earlier months this ear
    $total += $day;		# days this month
    $total;			# return this value
}



# handle the parsing of the index files.  Skip over non-useful lines.
$INDEX = shift(@INDICES);
unless (open(INDEX)) {
    die "$0: Couldn't open file $INDEX, aborting.\n";
}

sub get_index_line
{
  LINE: {
      do {
	  while($line = <INDEX>) {
	      if ($line =~ /^[\d ]/) {
		  last LINE;
	      }
	  }
      } while ($INDEX = shift(@INDICES) and
	       (open(INDEX) or
		die "$0: Couldn't open file $INDEX, aborting.\n"));
  }
    $line;
}


# Read in and parse the log file.

# The three fields extracted from a log entry.
$sender = "";
$subject = "";
$date = "";

# Dealing with multi-line headers.
$partial_header = "";
$parsing_a_header = 0;

$ucf1vm_separator =
    "=========================================================================";

$entry_not_empty = 0;

$ind_logfile = "";

$do_logfile = 1;

while ($index = get_index_line()) {
    # Do file_opening things here
  ENTRY: while(<>) {
      if ($do_logfile) {
	  $new_ind_logfile = substr($index, 0, 3);
      
	  ( $log_logfile = $ARGV ) =~ s/\D*(\d+)\D*/$1/;

	  if ($new_ind_logfile ne "   ") {
	      $ind_logfile = $new_ind_logfile;
	      print "[$ind_logfile]";
	  }

	  if ($ind_logfile ne $log_logfile) {
	      print "!!! Index and Log disagree about file:";
	      print "!!!       Index: $ind_logfile    Log: $log_logfile";
	  }

	  $do_logfile = 0;
      }

      chomp;

      if ($parsing_a_header == 1) {
	  if (/^\s.*\S/) {	# A continued header line -- begins with
			  	# whitespace, and contains some
			  	# non-whitespace.
	      s/\s*/ /;
	      $partial_header .= $_;
	  }
	  else {		# Not a continued header -- thus we
				# should parse the accumulated header.
	      if ($partial_header =~ /\AFrom:/) {
		  $partial_header =~ s/\AFrom: *//;
		  $partial_header =~ s/.*<(.*)>/$1/;

		  if ($sender eq "") { $sender = $partial_header }
		  else { print "!!! duplicate sender \"$partial_header\"" }
	      }
	      elsif ($partial_header =~ /\ASubject:/) {
		  $partial_header =~ s/^Subject: *//;
		  $partial_header =~ s/<Superguy \d*> *//;
		  $partial_header =~ s/<Sfstory \d*> *//;

		  if ($subject eq "") { $subject = $partial_header }
		  else { print "!!! duplicate subject \"$partial_header\"" }
	      }
	      else { die "Saw a header I wasn't expecting, stopped" }
	      $parsing_a_header = 0;
	  }
      }

      if (/^Subject: / or /^From: /) {
	  $partial_header = $_;
	  $parsing_a_header = 1;
      }

      elsif (/^Date: /) {	# Date header.
	  $newdate = $_;
	  $newdate =~ s/^Date:\s*//;
	  if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) {
	      $day = $1;
	      $spelledmonth = $2;
	      $year = $3;
	      $time = $4;
	      $month = $month_numbers{$spelledmonth};
	      if ($month eq "") {
		  print "!!! unknown month spelling \"$spelledmonth\"";
		  $month = "xx";
	      }
	      if ($day =~ /\A\d\Z/) {$day = "0$day"}
	      $newdate = "$year-$month-$day $time";
	  }
	  else {print "!!! unknown date format \"$newdate\""}

	  if ($date eq "") { $date = $newdate }
	  else { print "!!! duplicate date \"$newdate\"" }
      }

      elsif (/^\*\*\*\*\* Archived/) {print "!!! $_"}

      elsif (/^\*\*\*\*\* Received/) { # End of a NICBBS-style entry
	  if ($date ne "") {print "!!! duplicate date \"$date\"";}
	  $date = $_;
	  $date =~ s|\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d).*|19$4-$2-$3 $1|;
	  last ENTRY;		# Go to end-of-entry parsing
      }

      elsif (/^$ucf1vm_separator$/) { # Top of a ucf1vm entry
	  last ENTRY;
      }

      else {			# Generic other line
	  # Write the line out to a file
	  $entry_not_empty = 1;
      }
  }

    if ($entry_not_empty) {
	if ($subject eq "") {print "!!! subject missing"}
	if ($sender eq "") {print "!!! sender missing"}
	if ($date eq "") {print "!!! date missing"}

	$ind_date = substr($index,5,8);
	$ind_subtopic = substr($index, 15, 2);
	$ind_subject = substr($index, 19, 37);
	$ind_author = substr($index, 57, 25);

	$ind_subject =~ s/\s*\Z//;
	$ind_author =~ s/\s*\Z//;

	$complain = 0;

	$ind_date =~ s|(..)/(..)/(..)|19$3-$1-$2|;

	if (abs(days_since_1900($ind_date) - days_since_1900($date)) > 2) {
	    print "!!! Index and Log disagree about date.";
	    $complain = 1;
	}

	if (exists $author_addresses{lc($sender)}) {
	    $sender_name =  $author_addresses{lc($sender)};
	    if ($ind_author !~ /$sender_name/i) {
		print "!!! Author and sender are different.";
		$complain = 1;
	    }
	}
	else {
	    print "!!! Sender address $sender not in the address database.";
	    $sender_name = "";
	}

	if ($subject =~ m|\A[\w/]*:|) {
	    $log_subtopic = $subject;
	    $log_subtopic =~ s/:.*//;
	}
	else { $log_subtopic = "" }
	
	$ind_subtopic =~ s/LN/LNH/g;
	$ind_subtopic =~ s/AD/ADMINISTRIVIA/g;

	if ($log_subtopic ne "" and $log_subtopic ne $ind_subtopic) {
	    print "!!! Index and Log disagree about subtopic.";
	    $complain = 1;
	}

	if ($complain) {
	    print "Index:  $ind_date  $ind_subtopic: $ind_subject  $ind_author";
	    print "Log:    $date  $subject  $sender_name <$sender>";
	}

	$ind_date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/;
	$year = $1;
	$month = "$2-" . lc($month_names[$2-1]);

	$filename = "$ind_subtopic/$year/$month/$ind_subject";

	# XXX File stuff here
	
	print "would write $filename";
	$date = "";
	$subject = "";
	$sender = "";
	$entry_not_empty = 0;
	$do_logfile = 1;
    }
    else { redo }		# We skipped an empty log entry
}
