#!/bin/perl5 -w $\ = "\n"; $directory = "/import/ftp/pub/lennox/superguy/"; $progname = "autocollect"; $usage = "Usage: $progname [options] search-pattern\n\n" . "Options:\n" . " -s Search using case-sensitive string matching.\n" . " -i Search using case-insensitive string matching (default).\n" . " -e Search using regular expression matching.\n" . " -r num or -r num-num\n" . " Search only the logs specified (multiple ranges can be given).\n" . " -x Output index entries for matching posts, rather than the\n" . " posts themselves.\n" . " -d Use as archive directory.\n" . " -o Output to file .\n" . " -v Give verbose output of what logs are being scanned.\n" . " -w Warn of log inconsistencies.\n" . " (Give more -w's for more warnings.)\n" . " -a Use to specify author e-mail addresses.\n" . " (Used in inconsistency checking.)\n" . " -h, -? Print this help information."; # useful constants for inconsistency-checking. @short_month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); %month_numbers = ("Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12); @year_to_date = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 ); # variables set by argument parsing ($directory can also be set). $search_type = 1; # 0 - string 1 - casestring 2 - regexp @logfiles = (); $output_index = 0; $OUTPUT_FILE = "-"; $verbose = 0; $warn = 0; while (defined($argument = shift(@ARGV))) { if ($argument =~ /\A-/) { @args = split(//, $argument); shift(@args); # Lose the leading - foreach $arg (@args) { if ($arg eq "s") { $search_type = 0 } elsif ($arg eq "i") { $search_type = 1 } elsif ($arg eq "e") { $search_type = 2 } elsif ($arg eq "r") { push(@logfiles, shift(@ARGV)); if (!defined($logfiles[$#logfiles])) { die "-$arg: requires an argument.\n"; } } elsif ($arg eq "x") { $output_index = 1 } elsif ($arg eq "d") { $directory = shift(@ARGV); if (!defined($directory)) { die "-$arg: requires an argument.\n"; } } elsif ($arg eq "o") { $OUTPUT_FILE = shift(@ARGV); if (!defined($OUTPUT_FILE)) { die "-$arg: requires an argument.\n"; } } elsif ($arg eq "v") { $verbose = 1 } elsif ($arg eq "w") { $warn++ } # Allow arbitrarily many w's. elsif ($arg eq "a") { $AUTHORS = shift(@ARGV); if (!defined($AUTHORS)) { die "-$arg: requires an argument.\n"; } } elsif ($arg eq "h" or $arg eq "?") { print STDERR $usage; exit 1; } else { warn "Unknown argument -$arg ignored.\n" } } } else { $expression = $argument; } } if (!defined($expression)) { print STDERR "No search pattern specified."; print STDERR "Use the search string '.' to extract all material in the range specified.\n"; print STDERR "Type $progname -h for usage information."; exit 1; } if ($search_type == 0 or $search_type == 1) { if ($expression ne ".") { $expression = quotemeta($expression); } } elsif (!defined(eval "'' =~ /\$expression/")) { print STDERR "Invalid regular expression \"$expression\".\n"; print STDERR "Type $progname -h for usage information."; exit 1; } if ($search_type == 1) { $expression = "(?i)" . $expression; } if (! -d $directory) { die "Couldn't open directory $directory\n" } if ($directory !~ m|/\Z|) { $directory .= "/" } if (!defined($AUTHORS)) { $AUTHORS = "${directory}perl/author-addrs"; } if ($warn) { $verbose = 1; } # Convert the logfile ranges to a proper list. @logfiles = convert_range(@logfiles); # Figure out which index files to use. @ARGV = index_files(@logfiles); # Look for the index files in the specified directory. foreach (@ARGV) {s/\A/$directory/} # Set up the output file. if ($OUTPUT_FILE eq "-") { open(OUTPUT_FILE, ">&STDOUT") or die("couldn't dup stdout, died"); open(STDOUT, ">&STDERR") or die("couldn't dup stderr, died"); } else { open(OUTPUT_FILE, ">$OUTPUT_FILE") or die("couldn't open $OUTPUT_FILE, aborting.\n"); } # Set up the authors assoc. array, if we're doing warnings. if ($warn) { printf "Reading $AUTHORS..."; # printf to avoid $\ open AUTHORS or die("Couldn't open authors file $AUTHORS, aborting.\n"); $author_name = ""; while () { chomp; if (/^\s+/) { s/^\s*//; $author_addresses{lc($_)} = $author_name; } else { $author_name = $_; } } close AUTHORS; print "done."; } # State variables for the loop $current_ind_searched = -1; # -1 means not currently looking at an index. $ind_to_search = shift(@logfiles); $currently_searching = 0; # Are we looking at something we care about? @lines_in_current = (); # The lines from the current index # entry $found_something = 0; # Something in the current entry matched. $printed_something = 0; # We actually output something. # Loop through the index files while (<>) { chomp; # Determine index currently being searched if (!/^ /) { # Either not an index line, or a new # entry. Either way, it's the end of # an entry if $currently_searching. if (!/^[\d ]/) { $current_ind_searched = -1 } elsif (/^(\d\d\d\d)/ or /^(\d\d\d) /) { $current_ind_searched = $1 } if ($currently_searching) { $currently_searching = 0; if ($found_something) { if ($output_index) { do_print_index(); } else { do_read_log(); } } @lines_in_current = (); $found_something = 0; $ind_to_search = shift(@logfiles); if (!defined($ind_to_search)) { last; } } } if ($current_ind_searched < $ind_to_search) { next; } elsif ($current_ind_searched > $ind_to_search) { die "Hey! $current_ind_searched is more than $ind_to_search! Died"; } else { # We want to search this one $currently_searching = 1; if (/$expression/) { $found_something = 1 } push(@lines_in_current, $_); } } # Things to do at the end. Should only execute if we searched the # last index entry. if ($currently_searching) { $currently_searching = 0; if ($found_something) { if ($output_index) { do_print_index(); } else { do_read_log(); } } $ind_to_search = shift(@logfiles); if (defined($ind_to_search)) { die "Hey, there were logs left to search when we ran out of index! Died"; } } if (!$printed_something) { # For the command line version we don't have to do anything here. } close(OUTPUT_FILE); # Subroutines sub convert_range { my @ranges = @_; my @files = (); my $range; my $lower = 10000; my $upper = -1; my($bot, $top); # I might want the pattern more general than this some day. my @logfilesfound = glob("$directory/sguy*[0-9] $directory/sguy*.gz"); foreach $file (@logfilesfound) { $file =~ /sguy(\d*)/; if ($1 < $lower) { $lower = $1 } if ($1 > $upper) { $upper = $1 } } if ($#ranges == -1) { # There weren't any files specified. push(@ranges, "$lower-$upper"); } foreach $range (@ranges) { if ($range =~ /\A(\d*)-(\d*)\Z/) { $bot = $1; $top = $2; if ( $bot eq "" ) { $bot = $lower } if ( $top eq "" ) { $top = $upper } if ( $bot < $lower ) { warn("Bad first parameter in $range: first file is $lower\n"); $bot = $lower; } if ( $top > $upper ) { warn("Bad second parameter in $range: last file is $upper\n"); $top = $upper; } push(@files, $bot..$top); } elsif ($range =~ /\A\d+\Z/) { if ( $range < $lower or $range > $upper ) { warn("Parameter $range out of range, skipping\n"); } else { push(@files, $range); } } else { warn("Bad parameter $range, skipping\n") } } return(sort {$a <=> $b} @files); } sub index_files { # This is pretty trivial, but it didn't used to be, and I didn't feel like # changing the calling functions. Besides, we might need it again someday. return ("Superguy.Index.complete.txt"); } # Print the relevant index entries. $ind_to_search, @lines_in_current, and # OUTPUT_FILE should be set up properly. # XXX: warnings don't work for index-only. sub do_print_index { my($first, $line); $first = 0; while (defined($line = shift(@lines_in_current))) { if ($line =~ /$expression/) { if (!$first) { # Force the index number on, regardless of whether it was there # originally. $line =~ s/^....//; $line = sprintf("%04d", $ind_to_search) . $line; $first = 1; } if (!$printed_something) { printf OUTPUT_FILE <>' begins an entry. # Logs 7 - 14, 26 - 126: "NICBBS": # '\*\*\*\*\* Received \d\d:\d\d:\d\d on \d\d/\d\d/\d\d,' ... # _ends_ an entry. (This is usually the only date indication # we have.) # Logs 15 - 25: '={78,79}' begins an entry. # Logs 126 on: "UCF1VM": '={73}' begins an entry. # Note that the transition from NICBBS to UCF1VM happens _within_ # log 126. *Both* separators occur between Ramrod: Media # Darling #5 parts 2 and 3 (as would be expected). # We handle this by assuming instead that separators end entries, # but allowing eof() as a separator, and disregarding "entries" # of less than five lines. my @entry = (); my($log_line, %header, $parsing_a_header, $partial_header, $month); my($complain, $key); while (defined($this_index = shift(@lines_in_current))) { @entry = (); undef %header; undef %index; $parsing_a_header = 0; $complain = 0; $provisional_subtopic = ""; while (defined($log_line = )) { chomp $log_line; if (($ind_to_search >= 1 and $ind_to_search <= 6) and $log_line =~ /^<<\d\d\d>>$/ ) { last } elsif ((($ind_to_search >= 7 and $ind_to_search <= 14) or ($ind_to_search >= 26 and $ind_to_search <= 126)) and $log_line =~ m|^\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d)|) { # Assemble a proper "Date:" header. $month = $short_month_names[$2-1]; my $date = "$3 $month 19$4 $1"; # Take the date in the received line as canonical. if ($warn and exists($header{'date'})) { print("!!! Duplicate date $header{'date'}"); } # This is not a Y2K bug, since this only applies to logs <= # 126, which was in 1991 $header{'date'} = sprintf("19%02d-%02d-%02d %s", $4, $2, $3, $1); # Put "Date:" after leading blank lines my @blanks = (); while (($log_line = shift(@entry)) =~ /^\s*$/) { unshift(@blanks, $log_line); } unshift(@entry, $log_line); unshift(@entry, "Date: $date"); foreach $log_line (@blanks) { unshift(@entry, $log_line) } last; } elsif (($ind_to_search >= 15 and $ind_to_search <= 25) and $log_line =~ /^={78,79}$/ ) { last } elsif (($ind_to_search >= 126) and $log_line =~ /^={73}$/ ) { last } # Parse out headers if ($parsing_a_header) { # A continued header line -- begins with whitespace, and # contains some non-whitespace. if ( $log_line =~ /^\s+(\S.*)/ ) { $partial_header .= $1; } else { if ( $partial_header =~ /\AFrom: / ) { $partial_header =~ s/\AFrom: *//; $partial_header =~ s/.*<(.*)>/$1/ or $partial_header =~ s/ +$.*$$//; if (!exists($header{'from'})) { $header{'from'} = $partial_header; } elsif ( $warn ) { print "!!! duplicate sender \"$partial_header\""; } } elsif ( $partial_header =~ /\ASubject: .*/ ) { $partial_header =~ s/^Subject: *//; $partial_header =~ s/<(Superguy|Sfstory) \d*> *//; if ( defined $1 ) { $provisional_subtopic = $1 } if (!exists($header{'subject'})) { $header{'subject'} = $partial_header; if(defined($provisional_subtopic)) { if ($provisional_subtopic eq "Superguy") { $header{'subtopic'} = "SG"; } elsif ($provisional_subtopic eq "Sfstory") { $header{'subtopic'} = "SF"; } } elsif ($partial_header =~ m|\A(Re: *)?(Repost: *)?([\w/]*):|i) { $header{'subtopic'} = $3; } } elsif ( $warn ) { print "!!! duplicate subject \"$partial_header\""; } } else { die "Saw a header I wasn't expecting, stopped" } $parsing_a_header = 0; } } if ($log_line =~ /^Subject: / or $log_line =~ /^From: /) { $partial_header = $log_line; $parsing_a_header = 1; } elsif ($log_line =~ /^Date: /) { # Date header. ($newdate = $log_line) =~ s/^Date:\s*//; if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) { if (exists ($month_numbers{$2})) { $month = $month_numbers{$2}; } else { warn "!!! unknown month \"$2\""; $month = 0; } $newdate = sprintf("%04d-%02d-%02d %s", $3, $month, $1, $4); if (!exists($header{'date'})) { $header{'date'} = $newdate; } elsif ($warn) { print "!!! duplicate date \"$newdate\"" } } elsif ($warn) { print "!!! unknown date format \"$newdate\" -- ignored."; } } # Kill control-Z's, they're annoying. $log_line =~ s/\032//g; # Skip those stupid "Archived" lines. if ($log_line =~ /\*\*\*\*\* Archived/) { next } push(@entry, $log_line); } if ($#entry < 5) { redo } # Entry too small, must be bogus. if ($warn >= 2) { if (!exists($header{'subject'})) {print "!!! subject missing"} if (!exists($header{'from'})) {print "!!! sender missing"} if (!exists($header{'date'})) {print "!!! date missing"} } if ($warn) { $index{'date'} = substr($this_index, 5, 8); $index{'subtopic'} = substr($this_index, 15, 2); $index{'subject'} = substr($this_index, 19, 37); $index{'author'} = substr($this_index, 57, 25); $index{'subject'} =~ s/\s*\Z//; $index{'author'} =~ s/\s*\Z//; $index{'date'} = idxdate2iso($index{'date'}); if (exists($header{'date'}) and abs(days_since_1900($index{'date'}) - days_since_1900($header{'date'})) > 2) { print "!!! Index and Log disagree about date."; $complain = 1; } $index{'subtopic'} =~ s/LN/LNH/g; $index{'subtopic'} =~ s/AD/ADMINISTRIVIA/g; if (exists($header{'subtopic'}) and lc($header{'subtopic'}) ne lc($index{'subtopic'})) { print "!!! Index and Log disagree about subtopic."; $complain = 1; } if (exists($header{'from'})) { if(exists($author_addresses{lc($header{'from'})})) { $header{'from_name'} = $author_addresses{lc($header{'from'})}; if ($index{'author'} !~ /$header{'from_name'}/i and $index{'subtopic'} ne "LNH" # Many LNH reposts were posted by someone other than # the author and $index{'author'} !~ m|/|) # Collaborations { print "!!! Author and sender are different."; $complain = 1; } } else { print "!!! Sender address $header{'from'} not in the address database."; $complain = 1; # XXX } } if ($complain) { printf("!!! Index: %s %s: %s %s\n", $index{'date'}, $index{'subtopic'}, $index{'subject'}, $index{'author'}); printf("!!! Log: %s %s %s <%s>\n", exists($header{'date'}) ? $header{'date'} : "", exists($header{'subject'}) ? $header{'subject'} : "", exists($header{'from_name'}) ? $header{'from_name'}: "", exists($header{'from'}) ? $header{'from'} : ""); } } if (eof(LOG) and $#lines_in_current != -1) { warn("Uh-oh, not enough log for all the indices, " . "something's probably gone wrong.\n"); last; } if ($this_index =~ /$expression/) { $this_index =~ s/^.....//; if ($verbose) { print("$this_index"); } if (!$printed_something) { $printed_something = 1; } print(OUTPUT_FILE "="x73); foreach (@entry) { print OUTPUT_FILE; } } } if (!eof(LOG) and $#entry >= 5) { warn("Uh-oh, not enough index for all the logs, " . "something's probably gone wrong.\n"); } close(LOG); } sub days_since_1900 # Takes a date in YYYY-MM-DD, returns # days since Jan 1 1900 # valid for Mar 1 1900 - Feb 28 2100 { my $date = shift; $date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/; my $year = $1; my $month = $2; my $day = $3; my $total = ($year - 1900) * 365; # regular days since 1900 $total += int( ($year - 1900) / 4 ); # leap days since 1900, not # counting this year if ($year % 4 == 0 && $month > 2) { $total++ } # leap year this year if ($month < 1 or $month > 12) { die "month \#$month??? date = $date, stopped" } $total += $year_to_date[$month - 1]; # earlier months this year $total += $day; # days this month $total; # return this value } sub idxdate2iso($) { my ($idx) = @_; my ($yr, $mo, $day, $fullyear); if ($idx =~ m|(..)/(..)/(..)|) { $mo = $1; $day = $2; $yr = $3; # Pivot year on 1970. XXX: Y2K-esque bug if ($yr < 70) { $fullyear = $yr + 2000 } else {$fullyear = $yr + 1900} $idx = "$fullyear-$mo-$day"; } return $idx; }