#!/bin/perl5 -w
$\ = "\n";
$directory = "/import/ftp/pub/lennox/superguy/";
$progname = "autocollect";
$usage =
"Usage: $progname [options] search-pattern\n\n" .
"Options:\n" .
" -s Search using case-sensitive string matching.\n" .
" -i Search using case-insensitive string matching (default).\n" .
" -e Search using regular expression matching.\n" .
" -r num or -r num-num\n" .
" Search only the logs specified (multiple ranges can be given).\n" .
" -x Output index entries for matching posts, rather than the\n" .
" posts themselves.\n" .
" -d
Use as archive directory.\n" .
" -o Output to file .\n" .
" -v Give verbose output of what logs are being scanned.\n" .
" -w Warn of log inconsistencies.\n" .
" (Give more -w's for more warnings.)\n" .
" -a Use to specify author e-mail addresses.\n" .
" (Used in inconsistency checking.)\n" .
" -h, -? Print this help information.";
# useful constants for inconsistency-checking.
@short_month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul",
"Aug", "Sep", "Oct", "Nov", "Dec");
%month_numbers = ("Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4,
"May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8,
"Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12);
@year_to_date = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 );
# variables set by argument parsing ($directory can also be set).
$search_type = 1; # 0 - string 1 - casestring 2 - regexp
@logfiles = ();
$output_index = 0;
$OUTPUT_FILE = "-";
$verbose = 0;
$warn = 0;
while (defined($argument = shift(@ARGV))) {
if ($argument =~ /\A-/) {
@args = split(//, $argument);
shift(@args); # Lose the leading -
foreach $arg (@args) {
if ($arg eq "s") { $search_type = 0 }
elsif ($arg eq "i") { $search_type = 1 }
elsif ($arg eq "e") { $search_type = 2 }
elsif ($arg eq "r") {
push(@logfiles, shift(@ARGV));
if (!defined($logfiles[$#logfiles])) {
die "-$arg: requires an argument.\n";
}
}
elsif ($arg eq "x") { $output_index = 1 }
elsif ($arg eq "d") {
$directory = shift(@ARGV);
if (!defined($directory)) {
die "-$arg: requires an argument.\n";
}
}
elsif ($arg eq "o") {
$OUTPUT_FILE = shift(@ARGV);
if (!defined($OUTPUT_FILE)) {
die "-$arg: requires an argument.\n";
}
}
elsif ($arg eq "v") { $verbose = 1 }
elsif ($arg eq "w") { $warn++ } # Allow arbitrarily many w's.
elsif ($arg eq "a") {
$AUTHORS = shift(@ARGV);
if (!defined($AUTHORS)) {
die "-$arg: requires an argument.\n";
}
}
elsif ($arg eq "h" or $arg eq "?") {
print STDERR $usage;
exit 1;
}
else { warn "Unknown argument -$arg ignored.\n" }
}
} else {
$expression = $argument;
}
}
if (!defined($expression)) {
print STDERR "No search pattern specified.";
print STDERR "Use the search string '.' to extract all material in the range specified.\n";
print STDERR "Type $progname -h for usage information.";
exit 1;
}
if ($search_type == 0 or $search_type == 1) {
if ($expression ne ".") {
$expression = quotemeta($expression);
}
} elsif (!defined(eval "'' =~ /\$expression/")) {
print STDERR "Invalid regular expression \"$expression\".\n";
print STDERR "Type $progname -h for usage information.";
exit 1;
}
if ($search_type == 1) {
$expression = "(?i)" . $expression;
}
if (! -d $directory) { die "Couldn't open directory $directory\n" }
if ($directory !~ m|/\Z|) { $directory .= "/" }
if (!defined($AUTHORS)) {
$AUTHORS = "${directory}perl/author-addrs";
}
if ($warn) {
$verbose = 1;
}
# Convert the logfile ranges to a proper list.
@logfiles = convert_range(@logfiles);
# Figure out which index files to use.
@ARGV = index_files(@logfiles);
# Look for the index files in the specified directory.
foreach (@ARGV) {s/\A/$directory/}
# Set up the output file.
if ($OUTPUT_FILE eq "-") {
open(OUTPUT_FILE, ">&STDOUT") or die("couldn't dup stdout, died");
open(STDOUT, ">&STDERR") or die("couldn't dup stderr, died");
}
else {
open(OUTPUT_FILE, ">$OUTPUT_FILE")
or die("couldn't open $OUTPUT_FILE, aborting.\n");
}
# Set up the authors assoc. array, if we're doing warnings.
if ($warn) {
printf "Reading $AUTHORS..."; # printf to avoid $\
open AUTHORS or die("Couldn't open authors file $AUTHORS, aborting.\n");
$author_name = "";
while () {
chomp;
if (/^\s+/) {
s/^\s*//;
$author_addresses{lc($_)} = $author_name;
}
else {
$author_name = $_;
}
}
close AUTHORS;
print "done.";
}
# State variables for the loop
$current_ind_searched = -1; # -1 means not currently looking at an index.
$ind_to_search = shift(@logfiles);
$currently_searching = 0; # Are we looking at something we care about?
@lines_in_current = (); # The lines from the current index
# entry
$found_something = 0; # Something in the current entry matched.
$printed_something = 0; # We actually output something.
# Loop through the index files
while (<>) {
chomp;
# Determine index currently being searched
if (!/^ /) { # Either not an index line, or a new
# entry. Either way, it's the end of
# an entry if $currently_searching.
if (!/^[\d ]/) { $current_ind_searched = -1 }
elsif (/^(\d\d\d\d)/ or /^(\d\d\d) /) { $current_ind_searched = $1 }
if ($currently_searching) {
$currently_searching = 0;
if ($found_something) {
if ($output_index) { do_print_index(); }
else { do_read_log(); }
}
@lines_in_current = ();
$found_something = 0;
$ind_to_search = shift(@logfiles);
if (!defined($ind_to_search)) {
last;
}
}
}
if ($current_ind_searched < $ind_to_search) {
next;
}
elsif ($current_ind_searched > $ind_to_search) {
die "Hey! $current_ind_searched is more than $ind_to_search! Died";
}
else { # We want to search this one
$currently_searching = 1;
if (/$expression/) { $found_something = 1 }
push(@lines_in_current, $_);
}
}
# Things to do at the end. Should only execute if we searched the
# last index entry.
if ($currently_searching) {
$currently_searching = 0;
if ($found_something) {
if ($output_index) { do_print_index(); }
else { do_read_log(); }
}
$ind_to_search = shift(@logfiles);
if (defined($ind_to_search)) {
die "Hey, there were logs left to search when we ran out of index! Died";
}
}
if (!$printed_something) {
# For the command line version we don't have to do anything here.
}
close(OUTPUT_FILE);
# Subroutines
sub convert_range {
my @ranges = @_;
my @files = ();
my $range;
my $lower = 10000;
my $upper = -1;
my($bot, $top);
# I might want the pattern more general than this some day.
my @logfilesfound = glob("$directory/sguy*[0-9] $directory/sguy*.gz");
foreach $file (@logfilesfound) {
$file =~ /sguy(\d*)/;
if ($1 < $lower) { $lower = $1 }
if ($1 > $upper) { $upper = $1 }
}
if ($#ranges == -1) { # There weren't any files specified.
push(@ranges, "$lower-$upper");
}
foreach $range (@ranges) {
if ($range =~ /\A(\d*)-(\d*)\Z/) {
$bot = $1;
$top = $2;
if ( $bot eq "" ) { $bot = $lower }
if ( $top eq "" ) { $top = $upper }
if ( $bot < $lower ) {
warn("Bad first parameter in $range: first file is $lower\n");
$bot = $lower;
}
if ( $top > $upper ) {
warn("Bad second parameter in $range: last file is $upper\n");
$top = $upper;
}
push(@files, $bot..$top);
}
elsif ($range =~ /\A\d+\Z/) {
if ( $range < $lower or $range > $upper ) {
warn("Parameter $range out of range, skipping\n");
}
else {
push(@files, $range);
}
}
else { warn("Bad parameter $range, skipping\n") }
}
return(sort {$a <=> $b} @files);
}
sub index_files {
# This is pretty trivial, but it didn't used to be, and I didn't feel like
# changing the calling functions. Besides, we might need it again someday.
return ("Superguy.Index.complete.txt");
}
# Print the relevant index entries. $ind_to_search, @lines_in_current, and
# OUTPUT_FILE should be set up properly.
# XXX: warnings don't work for index-only.
sub do_print_index {
my($first, $line);
$first = 0;
while (defined($line = shift(@lines_in_current))) {
if ($line =~ /$expression/) {
if (!$first) {
# Force the index number on, regardless of whether it was there
# originally.
$line =~ s/^....//;
$line = sprintf("%04d", $ind_to_search) . $line;
$first = 1;
}
if (!$printed_something) {
printf OUTPUT_FILE <>' begins an entry.
# Logs 7 - 14, 26 - 126: "NICBBS":
# '\*\*\*\*\* Received \d\d:\d\d:\d\d on \d\d/\d\d/\d\d,' ...
# _ends_ an entry. (This is usually the only date indication
# we have.)
# Logs 15 - 25: '={78,79}' begins an entry.
# Logs 126 on: "UCF1VM": '={73}' begins an entry.
# Note that the transition from NICBBS to UCF1VM happens _within_
# log 126. *Both* separators occur between Ramrod: Media
# Darling #5 parts 2 and 3 (as would be expected).
# We handle this by assuming instead that separators end entries,
# but allowing eof() as a separator, and disregarding "entries"
# of less than five lines.
my @entry = ();
my($log_line, %header, $parsing_a_header, $partial_header, $month);
my($complain, $key);
while (defined($this_index = shift(@lines_in_current))) {
@entry = ();
undef %header;
undef %index;
$parsing_a_header = 0;
$complain = 0;
$provisional_subtopic = "";
while (defined($log_line = )) {
chomp $log_line;
if (($ind_to_search >= 1 and $ind_to_search <= 6)
and $log_line =~ /^<<\d\d\d>>$/ ) { last }
elsif ((($ind_to_search >= 7 and $ind_to_search <= 14) or
($ind_to_search >= 26 and $ind_to_search <= 126))
and $log_line =~
m|^\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d)|) {
# Assemble a proper "Date:" header.
$month = $short_month_names[$2-1];
my $date = "$3 $month 19$4 $1";
# Take the date in the received line as canonical.
if ($warn and exists($header{'date'})) {
print("!!! Duplicate date $header{'date'}");
}
# This is not a Y2K bug, since this only applies to logs <=
# 126, which was in 1991
$header{'date'} = sprintf("19%02d-%02d-%02d %s", $4, $2, $3,
$1);
# Put "Date:" after leading blank lines
my @blanks = ();
while (($log_line = shift(@entry)) =~ /^\s*$/) {
unshift(@blanks, $log_line);
}
unshift(@entry, $log_line);
unshift(@entry, "Date: $date");
foreach $log_line (@blanks) { unshift(@entry, $log_line) }
last;
}
elsif (($ind_to_search >= 15 and $ind_to_search <= 25)
and $log_line =~ /^={78,79}$/ ) { last }
elsif (($ind_to_search >= 126)
and $log_line =~ /^={73}$/ ) { last }
# Parse out headers
if ($parsing_a_header) {
# A continued header line -- begins with whitespace, and
# contains some non-whitespace.
if ( $log_line =~ /^\s+(\S.*)/ ) {
$partial_header .= $1;
}
else {
if ( $partial_header =~ /\AFrom: / ) {
$partial_header =~ s/\AFrom: *//;
$partial_header =~ s/.*<(.*)>/$1/ or
$partial_header =~ s/ +\(.*\)$//;
if (!exists($header{'from'})) {
$header{'from'} = $partial_header;
}
elsif ( $warn ) {
print "!!! duplicate sender \"$partial_header\"";
}
}
elsif ( $partial_header =~ /\ASubject: .*/ ) {
$partial_header =~ s/^Subject: *//;
$partial_header =~ s/<(Superguy|Sfstory) \d*> *//;
if ( defined $1 ) { $provisional_subtopic = $1 }
if (!exists($header{'subject'})) {
$header{'subject'} = $partial_header;
if(defined($provisional_subtopic)) {
if ($provisional_subtopic eq "Superguy") {
$header{'subtopic'} = "SG";
}
elsif ($provisional_subtopic eq "Sfstory") {
$header{'subtopic'} = "SF";
}
}
elsif ($partial_header =~
m|\A(Re: *)?(Repost: *)?([\w/]*):|i) {
$header{'subtopic'} = $3;
}
}
elsif ( $warn ) {
print "!!! duplicate subject \"$partial_header\"";
}
}
else { die "Saw a header I wasn't expecting, stopped" }
$parsing_a_header = 0;
}
}
if ($log_line =~ /^Subject: / or $log_line =~ /^From: /) {
$partial_header = $log_line;
$parsing_a_header = 1;
}
elsif ($log_line =~ /^Date: /) { # Date header.
($newdate = $log_line) =~ s/^Date:\s*//;
if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) {
if (exists ($month_numbers{$2})) {
$month = $month_numbers{$2};
}
else {
warn "!!! unknown month \"$2\"";
$month = 0;
}
$newdate = sprintf("%04d-%02d-%02d %s", $3, $month,
$1, $4);
if (!exists($header{'date'})) {
$header{'date'} = $newdate;
}
elsif ($warn) { print "!!! duplicate date \"$newdate\"" }
}
elsif ($warn) {
print "!!! unknown date format \"$newdate\" -- ignored.";
}
}
# Kill control-Z's, they're annoying.
$log_line =~ s/\032//g;
# Skip those stupid "Archived" lines.
if ($log_line =~ /\*\*\*\*\* Archived/) { next }
push(@entry, $log_line);
}
if ($#entry < 5) { redo } # Entry too small, must be bogus.
if ($warn >= 2) {
if (!exists($header{'subject'})) {print "!!! subject missing"}
if (!exists($header{'from'})) {print "!!! sender missing"}
if (!exists($header{'date'})) {print "!!! date missing"}
}
if ($warn) {
$index{'date'} = substr($this_index, 5, 8);
$index{'subtopic'} = substr($this_index, 15, 2);
$index{'subject'} = substr($this_index, 19, 37);
$index{'author'} = substr($this_index, 57, 25);
$index{'subject'} =~ s/\s*\Z//;
$index{'author'} =~ s/\s*\Z//;
$index{'date'} = idxdate2iso($index{'date'});
if (exists($header{'date'}) and
abs(days_since_1900($index{'date'}) -
days_since_1900($header{'date'})) > 2) {
print "!!! Index and Log disagree about date.";
$complain = 1;
}
$index{'subtopic'} =~ s/LN/LNH/g;
$index{'subtopic'} =~ s/AD/ADMINISTRIVIA/g;
if (exists($header{'subtopic'}) and
lc($header{'subtopic'}) ne lc($index{'subtopic'})) {
print "!!! Index and Log disagree about subtopic.";
$complain = 1;
}
if (exists($header{'from'})) {
if(exists($author_addresses{lc($header{'from'})})) {
$header{'from_name'} = $author_addresses{lc($header{'from'})};
if ($index{'author'} !~ /$header{'from_name'}/i
and $index{'subtopic'} ne "LNH"
# Many LNH reposts were posted by someone other than
# the author
and $index{'author'} !~ m|/|) # Collaborations
{
print "!!! Author and sender are different.";
$complain = 1;
}
}
else {
print "!!! Sender address $header{'from'} not in the address database.";
$complain = 1; # XXX
}
}
if ($complain) {
printf("!!! Index: %s %s: %s %s\n",
$index{'date'}, $index{'subtopic'}, $index{'subject'},
$index{'author'});
printf("!!! Log: %s %s %s <%s>\n",
exists($header{'date'}) ? $header{'date'} : "",
exists($header{'subject'}) ? $header{'subject'} : "",
exists($header{'from_name'}) ? $header{'from_name'}: "",
exists($header{'from'}) ? $header{'from'} : "");
}
}
if (eof(LOG) and $#lines_in_current != -1) {
warn("Uh-oh, not enough log for all the indices, " .
"something's probably gone wrong.\n");
last;
}
if ($this_index =~ /$expression/) {
$this_index =~ s/^.....//;
if ($verbose) {
print("$this_index");
}
if (!$printed_something) {
$printed_something = 1;
}
print(OUTPUT_FILE "="x73);
foreach (@entry) {
print OUTPUT_FILE;
}
}
}
if (!eof(LOG) and $#entry >= 5) {
warn("Uh-oh, not enough index for all the logs, " .
"something's probably gone wrong.\n");
}
close(LOG);
}
sub days_since_1900 # Takes a date in YYYY-MM-DD, returns
# days since Jan 1 1900
# valid for Mar 1 1900 - Feb 28 2100
{
my $date = shift;
$date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/;
my $year = $1;
my $month = $2;
my $day = $3;
my $total = ($year - 1900) * 365; # regular days since 1900
$total += int( ($year - 1900) / 4 ); # leap days since 1900, not
# counting this year
if ($year % 4 == 0 && $month > 2) { $total++ } # leap year this year
if ($month < 1 or $month > 12) { die "month \#$month??? date = $date, stopped" }
$total += $year_to_date[$month - 1]; # earlier months this year
$total += $day; # days this month
$total; # return this value
}
sub idxdate2iso($) {
my ($idx) = @_;
my ($yr, $mo, $day, $fullyear);
if ($idx =~ m|(..)/(..)/(..)|) {
$mo = $1;
$day = $2;
$yr = $3;
# Pivot year on 1970. XXX: Y2K-esque bug
if ($yr < 70) { $fullyear = $yr + 2000 }
else {$fullyear = $yr + 1900}
$idx = "$fullyear-$mo-$day";
}
return $idx;
}