#!/usr/bin/perl -w ########################################################################## # Modules and globals. ########################################################################## $\ = "\n"; $directory = "/work/ftp/pub/superguy/"; $webdir = '/superguy/'; use CGI qw/:standard/; use Date::Calc; $query = new CGI; # useful constants for inconsistency-checking. @short_month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); %month_numbers = ("Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12); # not all these variables can actually be set in the CGI ersion, but they # can be set in the command line version so I'm leaving them in to make # co-maintenance easier. @logfiles = (); $output_index = 0; $OUTPUT_FILE = "-"; $verbose = 0; $warn = 0; # Find the various mappings, and their reverse-maps. %universe_map = read_mappings($directory.'map-universes'); foreach (keys %universe_map) { $universe_map_reverse{$universe_map{$_}} = $_; } %author_map = read_mappings($directory.'map-authors'); foreach (keys %author_map) { push(@{$author_map_reverse{$author_map{$_}}}, $_); } %title_map = read_mappings($directory.'map-titles'); foreach (keys %title_map) { push(@{$title_map_reverse{$title_map{$_}}}, $_); } # This page must be named superguy-srv, and superguy must live # in the same directory. ($goback_url = $query->self_url) =~ s/superguy-srv/superguy/; ########################################################################## # Reading the mappings ########################################################################## sub read_mappings { my ($fname) = @_; my ($lastmatch, %map); open (MAP, "< $fname") or die "Could not open mappings $fname: $!\n"; while () { chomp; my ($canonical, $alias) = (/^([^\t]*)\t+(.+)$/); next unless $alias; if ($canonical) { $lastmatch = $canonical } else { $canonical = $lastmatch } $map{$alias} = $canonical; } close MAP or die "Could not close mappings $fname: $!\n"; return %map; } ########################################################################## # Matching functions ########################################################################## # Main search function. This should be called any time we want to search, # and will delegate complicated searches down. sub match_any { my ($search_type, $expression, $line) = @_; if ($search_type eq 'author') { return 1 if match_author($expression, $line); } elsif ($search_type eq 'universe') { return 1 if match_universe($expression, $line); } elsif ($search_type eq 'title') { return 1 if match_title($expression, $line); } elsif ($line =~ /$expression/) { return 1; } return 0; } sub match_title { my ($search, $line) = @_; my ($log, $date, $universe, $title, @authors) = parse_index_line($line); print "Search: '$title' eq '$search'"; return 1 if $title =~ /^$search/; foreach my $variation (@{$title_map_reverse{$search}}) { return 1 if $title =~ /$variation/; } return 0; } # Match authors. Checks both for the exact name given, and through the # mappings of alternative author names. Author names have already been # split at any / and (ed.) removed. sub match_author { my ($search, $line) = @_; my ($log, $date, $universe, $title, @authors) = parse_index_line($line); foreach my $a (@authors) { return 1 if $a eq $search; foreach my $variation (@{$author_map_reverse{$search}}) { return 1 if $a eq $variation; } } return 0; } # Match universes. Checks for either a match of the search field to the # short form of the name (SG) or the mapped form (Superguy). sub match_universe { my ($search, $line) = @_; my ($log, $date, $universe, $title, @authors) = parse_index_line($line); return 1 if $universe eq $universe_map_reverse{$search}; return 1 if $universe eq $search; return 0; } ########################################################################## # Output ########################################################################## # Given a log 'entry' (a single post), print a separator, it, and any # opening header out. OUTPUT_FILE should already be ready. sub print_log_entry { my ($this_index, $printed_something, @entry) = @_; $this_index =~ s/^.{5}//; if ($verbose) { print("$this_index"); } if (!$printed_something) { printf OUTPUT_FILE $query->header('text/plain'); $printed_something = 1; } print(OUTPUT_FILE "="x73); foreach (@entry) { print OUTPUT_FILE; } return $printed_something; } # Print the relevant index entries. OUTPUT_FILE should be set up properly. # XXX: warnings don't work for index-only. sub do_print_index { my ($printed_something, @current_index) = @_; while (my $line = shift(@current_index)) { next unless match_any($search_type, $expression, $line); if (!$printed_something) { printf OUTPUT_FILE $query->header('text/html'); print OUTPUT_FILE $query->start_html('The Superguy Autocollector', 'lennox@cs.columbia.edu'); print OUTPUT_FILE "\n"; my (@header) = ('Vol', 'Date', 'ID', 'Title', 'Author'); my (@divider) = ('---', '--------', '--', '------------------------------------', '-------------------'); print OUTPUT_FILE "\n"; print OUTPUT_FILE "\n"; $printed_something = 1; } my (@row) = split_index_line($line); # Make a link from the log number to that complete log. if ($row[0] =~ /^(\d+)/) { my $index = $1; my $url = sprintf('%s', $webdir, $index, $index); $row[0] = $url; } print OUTPUT_FILE "\n"; } return $printed_something; } ########################################################################## # Parsing functions ########################################################################## # Given a single line from an index file, split it. sub split_index_line { my ($line) = @_; if ($line =~ /^(.{4})\s+(.{8})\s+(.{2})\s+(.{37})\s+(.+)$/) { my (@fields) = ($1, $2, $3, $4, $5); foreach (@fields) { s#\s+$## } return @fields; } return ''; } # Fully parse an index line for useful data. This includes skipping header # lines and splitting out authors. Might end up going away, subsumed into # other things. sub parse_index_line { my ($line) = @_; return unless /\S/; return if /^(Vol|---)/; my ($vol, $date, $universe, $title, $author) = split_index_line($line); my @authors; # $title =~ s#\s+$##; # $title =~ s#\s+\(\d+/\d+\)$##; # $title =~ s#\s*\#\S+$##; foreach my $a (split/\//, $author) { $a =~ s#\s+\(ed\.\)$##; push (@authors, $a); } return ($vol, $date, $universe, $title, @authors); } ########################################################################## # Misc functions ########################################################################## # This is pretty trivial, but it didn't used to be, and I didn't feel like # changing the calling functions. Besides, we might need it again someday. sub index_files { return ("Superguy.Index.complete.txt"); } ########################################################################## # Main program ########################################################################## # Grab the search data. $expression = $query->param('pattern'); $search_type = $query->param('searchtype'); $search_type = 'casestring' unless $search_type; # Check for the autosearches. if ($query->param('search_title') ne 'Search by Title') { $search_type = 'title'; $expression = $query->param('search_title'); } elsif ($query->param('search_author') ne 'Search by Author') { $search_type = 'author'; $expression = $query->param('search_author'); } elsif ($query->param('search_universe') ne 'Search by Universe') { $search_type = 'universe'; $expression = $query->param('search_universe'); } $start = $query->param('start'); $end = $query->param('end'); push(@logfiles, "$start-$end"); $output_index = ($query->param('index') && $query->param('index') eq 'on'); if (!defined($expression) or $expression eq '') { print $query->header; print $query->start_html('Bad Search String','lennox@cs.columbia.edu'); print <Bad Search String You didn't specify a search string to the Autocollector. If you really want to extract everything from the logs in the range you specified, use the search string "." (a single dot).

Try again... END print $query->end_html; exit; } if ($search_type eq 'string' or $search_type eq 'casestring') { if ($expression ne '.') { $expression = quotemeta($expression); } } elsif (!defined(eval "'' =~ /\$expression/")) { $expression = CGI::escapeHTML($expression); print $query->header; print $query->start_html('Bad Search String','lennox@cs.columbia.edu'); print <Bad Search String You specified the syntactically invalid regular expression "$expression" to the Autocollector.

The search string must be a syntactically correct regular expression; if you're not familiar with regular expressions, try prefixing any punctuation you might have in your search string with backslashes.

Try again... END print $query->end_html; exit; } if ($search_type eq 'casestring') { $expression = "(?i)" . $expression; } if (! -d $directory) { die "Couldn't open directory $directory\n" } if ($directory !~ m|/$|) { $directory .= "/" } if (!defined($AUTHORS)) { $AUTHORS = "${directory}perl/author-addrs"; } $verbose = 1 if $warn; # Convert the logfile ranges to a proper list. @logfiles = convert_range(@logfiles); # Figure out which index files to use. @ARGV = index_files(@logfiles); # Look for the index files in the specified directory. foreach (@ARGV) { s/^/$directory/ } # Set up the output file. if ($OUTPUT_FILE eq "-") { open(OUTPUT_FILE, ">&STDOUT") or die("couldn't dup stdout, died"); open(STDOUT, ">&STDERR") or die("couldn't dup stderr, died"); } else { open(OUTPUT_FILE, ">$OUTPUT_FILE") or die("couldn't open $OUTPUT_FILE, aborting.\n"); } # Set up the authors assoc. array, if we're doing warnings. if ($warn) { printf "Reading $AUTHORS..."; # printf to avoid $\ open AUTHORS or die("Couldn't open authors file $AUTHORS, aborting.\n"); $author_name = ""; while () { chomp; if (/^\t/) { s/^\s*//; $author_addresses{lc($_)} = $author_name; } else { $author_name = $_; } } close AUTHORS; print "done."; } # State variables for the loop $current_ind_searched = -1; # -1 means not currently looking at an index. $ind_to_search = shift(@logfiles); $currently_searching = 0; # Are we looking at something we care about? @lines_in_current = (); # The lines from the current index # entry $found_something = 0; # Something in the current entry matched. $printed_something = 0; # We actually output something. # Loop through the index files my $i = 0; while (<>) { chomp; # Any line not starting with a space either has the log number or is a # header line. if (!/^ /) { if (!/^[\d ]/) { $current_ind_searched = -1 } elsif (/^(\d{3,4})/) { $current_ind_searched = $1 } # If we had been in a series of posts from a log before, see if we had # found matches and print them out if so. if ($currently_searching) { $currently_searching = 0; if ($found_something) { if ($output_index) { $printed_something = do_print_index($printed_something, @lines_in_current); } else { do_read_log(); } } # Reset variables back to starting values. $i = 0; $found_something = 0; @lines_in_current = (); $ind_to_search = shift(@logfiles); last unless defined($ind_to_search); } # Make sure all lines get the leading log number. } else { if (length($ind_to_search) == 2) { s#^.{3}#0$ind_to_search# } elsif (length($ind_to_search) == 3) { s#^.{3}#$ind_to_search# } else { s#^.{4}#$ind_to_search# } } # Not yet to the index we wish to search. if ($current_ind_searched < $ind_to_search) { next; # Somehow past the index to search.. something went rather wrong. } elsif ($current_ind_searched > $ind_to_search) { die "Hey! $current_ind_searched is more than $ind_to_search! Died"; # At the correct index to search. Push the line onto stack and check # to see if it matches. Because of how the log splitting works, we want # to add any log in the right range, even though it means we search # again later. } else { $found_something = 1 if match_any($search_type, $expression, $_); push(@lines_in_current, $_); $currently_searching = 1; $i++; } } # Now that we're out of loop, do a final check for any matches. if ($found_something) { $currently_searching = 0; if (@lines_in_current) { if ($output_index) { $printed_something = do_print_index($printed_something, @lines_in_current); } else { do_read_log(); } } # If there were still indexes to search, something went wrong. Error. $ind_to_search = shift(@logfiles); if (defined($ind_to_search)) { die "Hey, there were logs left to search when we ran out of index! Died"; } } # If we didn't print anything, then nothing was found. Bring up an error # page instead. if (!$printed_something) { $expression = CGI::escapeHTML($query->param('pattern')); $start = CGI::escapeHTML($start); $end = CGI::escapeHTML($end); if ($search_type eq 'string') { $patt_desc = 'string' } elsif ($search_type eq 'regexp') { $patt_desc = 'regular expression' } else { $patt_desc = 'case-insensitive string' } print OUTPUT_FILE $query->header; print OUTPUT_FILE $query->start_html('No Files Found','lennox@cs.columbia.edu'); print OUTPUT_FILE h1('Bad Search String'); print OUTPUT_FILE p("Sorry, the $patt_desc \"$expression\" didn't ". "match anything in logs $start through $end."); print OUTPUT_FILE p("Try again."); print OUTPUT_FILE $query->end_html; # If we printed and were doing the indexes, then we need to close the table # and the HTML; } elsif ($output_index) { print OUTPUT_FILE "

" . join("", @header) . "
" . join("", @divider) . "
" . join("", @row) . "
\n"; print OUTPUT_FILE $query->end_html; } close(OUTPUT_FILE); # Subroutines sub convert_range { my @ranges = @_; my @files = (); my $range; my $lower = 10000; my $upper = -1; my($bot, $top); my @logfilesfound = glob("$directory/sguy*[0-9] $directory/sguy*.gz"); foreach $file (@logfilesfound) { $file =~ /sguy(\d*)/; if ($1 < $lower) { $lower = $1 } if ($1 > $upper) { $upper = $1 } } if ($#ranges == -1) { # There weren't any files specified. push(@ranges, "$lower-$upper"); } foreach $range (@ranges) { if ($range =~ /\A(\d*)-(\d*)\Z/) { $bot = $1; $top = $2; if ( $bot eq "" ) { $bot = $lower } if ( $top eq "" ) { $top = $upper } if ( $bot < $lower ) { warn("Bad first parameter in $range: first file is $lower\n"); $bot = $lower; } if ( $top > $upper ) { warn("Bad second parameter in $range: last file is $upper\n"); $top = $upper; } push(@files, $bot..$top); } elsif ($range =~ /\A\d+\Z/) { if ( $range < $lower or $range > $upper ) { warn("Parameter $range out of range, skipping\n"); } else { push(@files, $range); } } else { warn("Bad parameter $range, skipping\n") } } return(sort {$a <=> $b} @files); } # Read the log file. $ind_to_search, @lines_in_current, and # OUTPUT_FILE should be set up properly. sub do_read_log { my $LOG; $LOG = sprintf("sguy%04d", $ind_to_search); if (-r "$directory$LOG") { open(LOG, "$directory$LOG") or die("Couldn't open $directory$LOG\n"); if ($verbose) { print("Scanning $LOG..."); } } elsif (-r "$directory$LOG.gz") { open(LOG, "zcat $directory$LOG.gz |") or die("Couldn't exec zcat or couldn't open $directory$LOG.gz\n"); if ($verbose) { print("Scanning $LOG.gz..."); } } else { die("Couldn't find $LOG\n") } # There are four types of separators separating entries in the logs. # Logs 1 - 6: '<<\d\d\d>>' begins an entry. # Logs 7 - 14, 26 - 126: "NICBBS": # '\*\*\*\*\* Received \d\d:\d\d:\d\d on \d\d/\d\d/\d\d,' ... # _ends_ an entry. (This is usually the only date indication # we have.) # Logs 15 - 25: '={78,79}' begins an entry. # Logs 126 on: "UCF1VM": '={73}' begins an entry. # Note that the transition from NICBBS to UCF1VM happens _within_ # log 126. *Both* separators occur between Ramrod: Media # Darling #5 parts 2 and 3 (as would be expected). # We handle this by assuming instead that separators end entries, # but allowing eof() as a separator, and disregarding "entries" # of less than five lines. my @entry = (); my($log_line, %header, $parsing_a_header, $partial_header, $month); my($complain, $key); my $i = 0; while ($this_index = shift(@lines_in_current)) { @entry = (); undef %header; undef %index; $parsing_a_header = 0; $complain = 0; $provisional_subtopic = ""; while ($log_line = ) { chomp $log_line; if (($ind_to_search >= 1 and $ind_to_search <= 6) and $log_line =~ /^<<\d\d\d>>$/ ) { last } elsif ((($ind_to_search >= 7 and $ind_to_search <= 14) or ($ind_to_search >= 26 and $ind_to_search <= 126)) and $log_line =~ m|^\*\*\*\*\* Received (\d\d:\d\d:..) on (\d\d)/(\d\d)/(\d\d)|) { # Assemble a proper "Date:" header. $month = $short_month_names[$2-1]; my $date = "$3 $month 19$4 $1"; # Take the date in the received line as canonical. if ($warn and exists($header{'date'})) { print("!!! Duplicate date $header{'date'}"); } $header{'date'} = sprintf("19%02d-%02d-%02d %s", $4, $2, $3, $1); # Put "Date:" after leading blank lines my @blanks = (); while (($log_line = shift(@entry)) =~ /^\s*$/) { unshift(@blanks, $log_line); } unshift(@entry, $log_line); unshift(@entry, "Date: $date"); foreach $log_line (@blanks) { unshift(@entry, $log_line) } last; } elsif (($ind_to_search >= 15 and $ind_to_search <= 25) and $log_line =~ /^={78,79}$/ ) { last } elsif (($ind_to_search >= 126) and $log_line =~ /^={73}$/ ) { last } # Parse out headers if ($parsing_a_header) { # A continued header line -- begins with whitespace, and # contains some non-whitespace. if ( $log_line =~ /^\s+(\S.*)/ ) { $partial_header .= $1; } else { if ( $partial_header =~ /\AFrom: / ) { $partial_header =~ s/\AFrom: *//; $partial_header =~ s/.*<(.*)>/$1/; if (!exists($header{'from'})) { $header{'from'} = $partial_header; } elsif ( $warn ) { print "!!! duplicate sender \"$partial_header\""; } } elsif ( $partial_header =~ /\ASubject: .*/ ) { $partial_header =~ s/^Subject: *//; $partial_header =~ s/<(Superguy|Sfstory) \d*> *//; if ( defined $1 ) { $provisional_subtopic = $1 } if (!exists($header{'subject'})) { $header{'subject'} = $partial_header; if(defined($provisional_subtopic)) { if ($provisional_subtopic eq "Superguy") { $header{'subtopic'} = "SG"; } elsif ($provisional_subtopic eq "Sfstory") { $header{'subtopic'} = "SF"; } } elsif ($partial_header =~ m|\A(Re: *)?(Repost: *)?([\w/]*):|i) { $header{'subtopic'} = $3; } } elsif ( $warn ) { print "!!! duplicate subject \"$partial_header\""; } } else { die "Saw a header I wasn't expecting, stopped" } $parsing_a_header = 0; } } if ($log_line =~ /^Subject: / or $log_line =~ /^From: /) { $partial_header = $log_line; $parsing_a_header = 1; } elsif ($log_line =~ /^Date: /) { # Date header. ($newdate = $log_line) =~ s/^Date:\s*//; if ($newdate =~ /\w*, (\d*) (\w*) (\d*) ([\d:x]*).*/) { if (exists ($month_numbers{$2})) { $month = $month_numbers{$2}; } else { warn "!!! unknown month \"$2\""; $month = 0; } $newdate = sprintf("%04d-%02d-%02d %s", $3, $month, $1, $4); if (!exists($header{'date'})) { $header{'date'} = $newdate; } elsif ($warn) { print "!!! duplicate date \"$newdate\"" } } elsif ($warn) { print "!!! unknown date format \"$newdate\" -- ignored."; } } # Kill control-Z's, they're annoying. $log_line =~ s/\032//g; # Skip those stupid "Archived" lines. if ($log_line =~ /\*\*\*\*\* Archived/) { next } push(@entry, $log_line); } if ($#entry < 5) { redo } # Entry too small, must be bogus. if ($warn >= 2) { if (!exists($header{'subject'})) {print "!!! subject missing"} if (!exists($header{'from'})) {print "!!! sender missing"} if (!exists($header{'date'})) {print "!!! date missing"} } if ($warn) { $index{'date'} = substr($this_index, 5, 8); $index{'subtopic'} = substr($this_index, 15, 2); $index{'subject'} = substr($this_index, 19, 37); $index{'author'} = substr($this_index, 57, 25); $index{'subject'} =~ s/\s*\Z//; $index{'author'} =~ s/\s*\Z//; $index{'date'} =~ s|(..)/(..)/(..)|19$3-$1-$2|; if (exists($header{'date'}) && abs(str2time($index{'date'}) - str2time($header{'date'})) > (2*24*60*60)) { print "!!! Index and Log disagree about date."; $complain = 1; } $index{'subtopic'} =~ s/LN/LNH/g; $index{'subtopic'} =~ s/AD/ADMINISTRIVIA/g; if (exists($header{'subtopic'}) and lc($header{'subtopic'}) ne lc($index{'subtopic'})) { print "!!! Index and Log disagree about subtopic."; $complain = 1; } if (exists($header{'from'})) { if(exists($author_addresses{lc($header{'from'})})) { $header{'from_name'} = $author_addresses{lc($header{'from'})}; if ($index{'author'} !~ /$header{'from_name'}/i and $index{'subtopic'} ne "LNH" # Many LNH reposts were posted by someone other than # the author and $index{'author'} !~ m|/|) # Collaborations { print "!!! Author and sender are different."; $complain = 1; } } else { print "!!! Sender address $header{'from'} not in the address database."; $complain = 1; # XXX } } if ($complain) { printf("!!! Index: %s %s: %s %s\n", $index{'date'}, $index{'subtopic'}, $index{'subject'}, $index{'author'}); printf("!!! Log: %s %s %s <%s>\n", exists($header{'date'}) ? $header{'date'} : "", exists($header{'subject'}) ? $header{'subject'} : "", exists($header{'from_name'}) ? $header{'from_name'}: "", exists($header{'from'}) ? $header{'from'} : ""); } } if (eof(LOG) and $#lines_in_current != -1) { warn("Uh-oh, not enough log for all the indices, " . "something's probably gone wrong.\n"); last; } if (match_any($search_type, $expression, $this_index)) { $printed_something = print_log_entry($this_index, $printed_something, @entry); } } if (!eof(LOG) and $#entry >= 5) { warn("Uh-oh, not enough index for all the logs, " . "something's probably gone wrong.\n"); } close(LOG); }