#!/bin/perl5 $\ = "\n"; $directory = "/import/ftp/pub/lennox/superguy/"; # $progname = "autocollect"; use CGI; $query = new CGI; # useful constants for inconsistency-checking. @short_month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); %month_numbers = ("Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12); @year_to_date = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 ); # not all these variables can actually be set in the CGI ersion, but they # can be set in the command line version so I'm leaving them in to make # co-maintenance easier. $search_type = 1; # 0 - string 1 - casestring 2 - regexp @logfiles = (); $output_index = 0; $OUTPUT_FILE = "-"; $verbose = 0; $warn = 0; # This page must be named autocollect-srv.cgi, and autocollect.cgi must live # in the same directory. ($goback_url = $query->self_url) =~ s/autocollect-srv/autocollect/; $goback_url = "\"$goback_url\""; $expression = $query->param('pattern'); if ($query->param('searchtype') eq "string") { $search_type = 0; } elsif ($query->param('searchtype') eq "regexp") { $search_type = 2; } else { $search_type = 1; # Default } $start = $query->param('start'); $end = $query->param('end'); push(@logfiles, "$start-$end"); $output_index = ($query->param('index') eq 'on'); if (!defined($expression) or $expression eq "") { print $query->header; print $query->start_html('Bad Search String','lennox@cs.columbia.edu'); print <Bad Search String You didn't specify a search string to the Autocollector. If you really want to extract everything from the logs in the range you specified, use the search string "." (a single dot).

Try again... END print $query->end_html; exit; } if ($search_type == 0 or $search_type == 1) { if ($expression ne ".") { $expression = quotemeta($expression); } } elsif (!defined(eval "'' =~ /\$expression/")) { $expression = escapeHTML($expression); print $query->header; print $query->start_html('Bad Search String','lennox@cs.columbia.edu'); print <Bad Search String You specified the syntactically invalid regular expression "$expression" to the Autocollector.

The search string must be a syntactically correct regular expression; if you're not familiar with regular expressions, try prefixing any punctuation you might have in your search string with backslashes.

Try again... END print $query->end_html; exit; } if ($search_type == 1) { $expression = "(?i)" . $expression; } if (! -d $directory) { die "Couldn't open directory $directory\n" } if ($directory !~ m|/\Z|) { $directory .= "/" } if (!defined($AUTHORS)) { $AUTHORS = "${directory}perl/author-addrs"; } if ($warn) { $verbose = 1; } # Convert the logfile ranges to a proper list. @logfiles = convert_range(@logfiles); # Figure out which index files to use. @ARGV = index_files(@logfiles); # Look for the index files in the specified directory. foreach (@ARGV) {s/\A/$directory/} # Set up the output file. if ($OUTPUT_FILE eq "-") { open(OUTPUT_FILE, ">&STDOUT") or die("couldn't dup stdout, died"); open(STDOUT, ">&STDERR") or die("couldn't dup stderr, died"); } else { open(OUTPUT_FILE, ">$OUTPUT_FILE") or die("couldn't open $OUTPUT_FILE, aborting.\n"); } # Set up the authors assoc. array, if we're doing warnings. if ($warn) { printf "Reading $AUTHORS..."; # printf to avoid $\ open AUTHORS or die("Couldn't open authors file $AUTHORS, aborting.\n"); $author_name = ""; while () { chomp; if (/^\t/) { s/^\s*//; $author_addresses{lc($_)} = $author_name; } else { $author_name = $_; } } close AUTHORS; print "done."; } # State variables for the loop $current_ind_searched = -1; # -1 means not currently looking at an index. $ind_to_search = shift(@logfiles); $currently_searching = 0; # Are we looking at something we care about? @lines_in_current = (); # The lines from the current index # entry $found_something = 0; # Something in the current entry matched. $printed_something = 0; # We actually output something. # Loop through the index files while (<>) { chomp; # Determine index currently being searched if (!/^ /) { # Either not an index line, or a new # entry. Either way, it's the end of # an entry if $currently_searching. if (!/^[\d ]/) { $current_ind_searched = -1 } elsif (/^(\d\d\d\d)/ or /^(\d\d\d)/ ) { $current_ind_searched = $1 } if ($currently_searching) { $currently_searching = 0; if ($found_something) { if ($output_index) { do_print_index(); } else { do_read_log(); } } @lines_in_current = (); $found_something = 0; $ind_to_search = shift(@logfiles); if (!defined($ind_to_search)) { last; } } } if ($current_ind_searched < $ind_to_search) { next; } elsif ($current_ind_searched > $ind_to_search) { die "Hey! $current_ind_searched is more than $ind_to_search! Died"; } else { # We want to search this one $currently_searching = 1; if (/$expression/) { $found_something = 1 } push(@lines_in_current, $_); } } # Things to do at the end. Should only execute if we searched the # last index entry. if ($currently_searching) { $currently_searching = 0; if ($found_something) { if ($output_index) { do_print_index(); } else { do_read_log(); } } $ind_to_search = shift(@logfiles); if (defined($ind_to_search)) { die "Hey, there were logs left to search when we ran out of index! Died"; } } if (!$printed_something) { $expression = escapeHTML($query->param('pattern')); $start = escapeHTML($start); $end = escapeHTML($end); if ($search_type == 0) { $patt_desc = "string" } elsif ($search_type == 2) { $patt_desc = "regular expression" } else { $patt_desc = "case-insensitive string" } print OUTPUT_FILE $query->header; print OUTPUT_FILE $query->start_html('No Files Found','lennox@cs.columbia.edu'); print OUTPUT_FILE <Bad Search String Sorry, the $patt_desc "$expression" didn't match anything in logs $start through $end.

Try again... END print OUTPUT_FILE $query->end_html; exit; } close(OUTPUT_FILE); # Subroutines sub convert_range { my @ranges = @_; my @files = (); my $range; my $lower = 10000; my $upper = -1; my($bot, $top); my @logfilesfound = glob("$directory/sguy*[0-9] $directory/sguy*.gz"); foreach $file (@logfilesfound) { $file =~ /sguy(\d*)/; if ($1 < $lower) { $lower = $1 } if ($1 > $upper) { $upper = $1 } } if ($#ranges == -1) { # There weren't any files specified. push(@ranges, "$lower-$upper"); } foreach $range (@ranges) { if ($range =~ /\A(\d*)-(\d*)\Z/) { $bot = $1; $top = $2; if ( $bot eq "" ) { $bot = $lower } if ( $top eq "" ) { $top = $upper } if ( $bot < $lower ) { warn("Bad first parameter in $range: first file is $lower\n"); $bot = $lower; } if ( $top > $upper ) { warn("Bad second parameter in $range: last file is $upper\n"); $top = $upper; } push(@files, $bot..$top); } elsif ($range =~ /\A\d+\Z/) { if ( $range < $lower or $range > $upper ) { warn("Parameter $range out of range, skipping\n"); } else { push(@files, $range); } } else { warn("Bad parameter $range, skipping\n") } } return(sort {$a <=> $b} @files); } sub index_files { # This is pretty trivial, but it didn't used to be, and I didn't feel like # changing the calling functions. Besides, we might need it again someday. return ("Superguy.Index.complete.txt"); } # Print the relevant index entries. $ind_to_search, @lines_in_current, and # OUTPUT_FILE should be set up properly. # XXX: warnings don't work for index-only. sub do_print_index { my($first, $line); $first = 0; while ($line = shift(@lines_in_current)) { if ($line =~ /$expression/) { if (!$first) { # Force the index number on, regardless of whether it was there # originally. $line =~ s/^....//; $line = sprintf("%04d", $ind_to_search) . $line; $first = 1; } if (!$printed_something) { printf OUTPUT_FILE $query->header('text/plain'); printf OUTPUT_FILE <>' begins an entry. # Logs 7 - 14, 26 - 126: "NICBBS": # '\*\*\*\*\* Received \d\d:\d\d:\d\d on \d\d/\d\d/\d\d,' ... # _ends_ an entry. (This is usually the only date indication # we have.) # Logs 15 - 25: '={78,79}' begins an entry. # Logs 126 on: "UCF1VM": '={73}' begins an entry. # Note that the transition from NICBBS to UCF1VM happens _within_ # log 126. *Both* separators occur between Ramrod: Media # Darling #5 parts 2 and 3 (as would be expected). # We handle this by assuming instead that separators end entries, # but allowing eof() as a separator, and disregarding "entries" # of less than five lines. my @entry = (); my($log_line, %header, $parsing_a_header, $partial_header, $month); my($complain, $key); while ($this_index = shift(@lines_in_current)) { @entry = (); undef %header; undef %index; $parsing_a_header = 0; $complain = 0; $provisional_subtopic = ""; while ($log_line = ) { chomp $log_line; if (($ind_to_search >= 1 and $ind_to_search <= 6) and $log_line =~ /^<<\d\d\d>>$/ ) { last } elsif ((($ind_to_search >= 7 and $ind_to_search <= 14) or ($ind_to_search >= 26 and $ind_to_search <= 126)) and $log_line =~ m|^\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d)|) { # Assemble a proper "Date:" header. $month = $short_month_names[$2-1]; my $date = "$3 $month 19$4 $1"; # Take the date in the received line as canonical. if ($warn and exists($header{'date'})) { print("!!! Duplicate date $header{'date'}"); } $header{'date'} = sprintf("19%02d-%02d-%02d %s", $4, $2, $3, $1); # Put "Date:" after leading blank lines my @blanks = (); while (($log_line = shift(@entry)) =~ /^\s*$/) { unshift(@blanks, $log_line); } unshift(@entry, $log_line); unshift(@entry, "Date: $date"); foreach $log_line (@blanks) { unshift(@entry, $log_line) } last; } elsif (($ind_to_search >= 15 and $ind_to_search <= 25) and $log_line =~ /^={78,79}$/ ) { last } elsif (($ind_to_search >= 126) and $log_line =~ /^={73}$/ ) { last } # Parse out headers if ($parsing_a_header) { # A continued header line -- begins with whitespace, and # contains some non-whitespace. if ( $log_line =~ /^\s+(\S.*)/ ) { $partial_header .= $1; } else { if ( $partial_header =~ /\AFrom: / ) { $partial_header =~ s/\AFrom: *//; $partial_header =~ s/.*<(.*)>/$1/; if (!exists($header{'from'})) { $header{'from'} = $partial_header; } elsif ( $warn ) { print "!!! duplicate sender \"$partial_header\""; } } elsif ( $partial_header =~ /\ASubject: .*/ ) { $partial_header =~ s/^Subject: *//; $partial_header =~ s/<(Superguy|Sfstory) \d*> *//; if ( defined $1 ) { $provisional_subtopic = $1 } if (!exists($header{'subject'})) { $header{'subject'} = $partial_header; if(defined($provisional_subtopic)) { if ($provisional_subtopic eq "Superguy") { $header{'subtopic'} = "SG"; } elsif ($provisional_subtopic eq "Sfstory") { $header{'subtopic'} = "SF"; } } elsif ($partial_header =~ m|\A(Re: *)?(Repost: *)?([\w/]*):|i) { $header{'subtopic'} = $3; } } elsif ( $warn ) { print "!!! duplicate subject \"$partial_header\""; } } else { die "Saw a header I wasn't expecting, stopped" } $parsing_a_header = 0; } } if ($log_line =~ /^Subject: / or $log_line =~ /^From: /) { $partial_header = $log_line; $parsing_a_header = 1; } elsif ($log_line =~ /^Date: /) { # Date header. ($newdate = $log_line) =~ s/^Date:\s*//; if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) { if (exists ($month_numbers{$2})) { $month = $month_numbers{$2}; } else { warn "!!! unknown month \"$2\""; $month = 0; } $newdate = sprintf("%04d-%02d-%02d %s", $3, $month, $1, $4); if (!exists($header{'date'})) { $header{'date'} = $newdate; } elsif ($warn) { print "!!! duplicate date \"$newdate\"" } } elsif ($warn) { print "!!! unknown date format \"$newdate\" -- ignored."; } } # Kill control-Z's, they're annoying. $log_line =~ s/\032//g; # Skip those stupid "Archived" lines. if ($log_line =~ /\*\*\*\*\* Archived/) { next } push(@entry, $log_line); } if ($#entry < 5) { redo } # Entry too small, must be bogus. if ($warn >= 2) { if (!exists($header{'subject'})) {print "!!! subject missing"} if (!exists($header{'from'})) {print "!!! sender missing"} if (!exists($header{'date'})) {print "!!! date missing"} } if ($warn) { $index{'date'} = substr($this_index, 5, 8); $index{'subtopic'} = substr($this_index, 15, 2); $index{'subject'} = substr($this_index, 19, 37); $index{'author'} = substr($this_index, 57, 25); $index{'subject'} =~ s/\s*\Z//; $index{'author'} =~ s/\s*\Z//; $index{'date'} =~ s|(..)/(..)/(..)|19$3-$1-$2|; if (exists($header{'date'}) and abs(days_since_1900($index{'date'}) - days_since_1900($header{'date'})) > 2) { print "!!! Index and Log disagree about date."; $complain = 1; } $index{'subtopic'} =~ s/LN/LNH/g; $index{'subtopic'} =~ s/AD/ADMINISTRIVIA/g; if (exists($header{'subtopic'}) and lc($header{'subtopic'}) ne lc($index{'subtopic'})) { print "!!! Index and Log disagree about subtopic."; $complain = 1; } if (exists($header{'from'})) { if(exists($author_addresses{lc($header{'from'})})) { $header{'from_name'} = $author_addresses{lc($header{'from'})}; if ($index{'author'} !~ /$header{'from_name'}/i and $index{'subtopic'} ne "LNH" # Many LNH reposts were posted by someone other than # the author and $index{'author'} !~ m|/|) # Collaborations { print "!!! Author and sender are different."; $complain = 1; } } else { print "!!! Sender address $header{'from'} not in the address database."; $complain = 1; # XXX } } if ($complain) { printf("!!! Index: %s %s: %s %s\n", $index{'date'}, $index{'subtopic'}, $index{'subject'}, $index{'author'}); printf("!!! Log: %s %s %s <%s>\n", exists($header{'date'}) ? $header{'date'} : "", exists($header{'subject'}) ? $header{'subject'} : "", exists($header{'from_name'}) ? $header{'from_name'}: "", exists($header{'from'}) ? $header{'from'} : ""); } } if (eof(LOG) and $#lines_in_current != -1) { warn("Uh-oh, not enough log for all the indices, " . "something's probably gone wrong.\n"); last; } if ($this_index =~ /$expression/) { $this_index =~ s/^.....//; if ($verbose) { print("$this_index"); } if (!$printed_something) { printf OUTPUT_FILE $query->header('text/plain'); $printed_something = 1; } print(OUTPUT_FILE "="x73); foreach (@entry) { print OUTPUT_FILE; } } } if (!eof(LOG) and $#entry >= 5) { warn("Uh-oh, not enough index for all the logs, " . "something's probably gone wrong.\n"); } close(LOG); } sub days_since_1900 # Takes a date in YYYY-MM-DD, returns # days since Jan 1 1900 # valid for Mar 1 1900 - Feb 28 2100 { my $date = shift; $date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/; my $year = $1; my $month = $2; my $day = $3; my $total = ($year - 1900) * 365; # regular days since 1900 $total += int( ($year - 1900) / 4 ); # leap days since 1900, not # counting this year if ($year % 4 == 0 && $month > 2) { $total++ } # leap year this year if ($month < 1 or $month > 12) { die "month \#$month??? date = $date, stopped" } $total += $year_to_date[$month - 1]; # earlier months this year $total += $day; # days this month $total; # return this value } # Escape HTML stuff. Lifted directly from CGI.pm. sub escapeHTML { local($toencode) = @_; $toencode=~s/&/&/g; $toencode=~s/\"/"/g; $toencode=~s/>/>/g; $toencode=~s/