#!/usr/bin/perl our $ID = q$Id: mdfrm-utf8,v 1.1 2007-08-01 09:10:49 eagle Exp $; # # mdfrm-utf8 -- View a summary of e-mail in a maildir (supporting Unicode). # # Copyright 1997, 1998, 2002, 2003, 2007 by Russ Allbery # # This program is free software; you may redistribute it and/or modify it # under the same terms as Perl itself. ############################################################################## # Site configuration ############################################################################## # The location of the user's incoming maildir. MAILDIR should always be # honored; the fallback location may be site-dependent. our $MAILDIR; ($MAILDIR = $ENV{MAILDIR} || $ENV{HOME} . '/Maildir') =~ s%/+$%%; # Ranges of characters that should be considered wide. our @WIDE = qw(\x{2E80}-\x{303E} \x{3041}-\x{33FF} \x{4E00}-\x{9FBB} \x{AC00}-\x{D7A3} \x{FF01}-\x{FF60}); # Character sets to try in order. The most restrictive should be listed # first. our @CHARSETS = qw(euc-kr big5 gb2312 iso-2022-jp shiftjis iso-8859-15); ############################################################################## # Modules and declarations ############################################################################## require 5.008; use strict; use Encode qw(decode); use Encode::Guess; ############################################################################## # Reading maildir contents ############################################################################## # Reports a file access error. sub open_fail { die "$0: cannot open $_[0]: $!\n" } # Returns a reference to a sorted array of all messages in either the cur or # new directories of maildir. Since the files are named with timestamps, this # will give us a sorted listing by delivery time, which is what I want. sub contents { my ($maildir, $onlynew, $onlyunseen, $sort) = @_; my @files; unless ($onlynew) { opendir (D, "$maildir/cur") or open_fail "$maildir/cur"; @files = map { ["$maildir/cur/", $_] } grep { ! /^\./ } readdir D; @files = grep { $$_[1] !~ /.*:2,[^,:]*[sS][^,:]*$/ } @files if $onlyunseen; } opendir (D, "$maildir/new") or open_fail "$maildir/new"; push (@files, map { ["$maildir/new/", $_] } grep { ! /^\./ } readdir D); if ($sort) { @files = map { $$_[0] . $$_[1] } sort { $$a[1] cmp $$b[1] } @files; } else { @files = map { $$_[0] . $$_[1] } @files; } \@files; } # Decodes a header, trying RFC 2047 encoding, character set guessing, and # falling back on just assuming it's ISO 8859-15, since that's the most common # case for me. sub header_decode { my ($header) = @_; if ($header =~ /=\?\S+\?[bq]\?.*\?=/i) { eval { $header = decode ('MIME-Header', $header) }; } elsif ($header =~ /[\e\x80-\xff]/) { my $decoder; for my $encoding (@CHARSETS) { $decoder = guess_encoding ($header, $encoding); last if ref $decoder; } if (ref $decoder) { $header = $decoder->decode ($header); } else { $header = decode ('iso-8859-15', $header); } } $header =~ s/\t/ /g; $header =~ s/[\x00-\x1f]//g; return $header; } # Returns who a given message is from and what the subject of the message is # (as a two-element list). We do this by parsing out the From and Subject # headers of the message. If our second argument is true, also grab the # contents of the Date header of the article and convert it to seconds from # epoch, returning that as well. sub headers { my ($file, $need_date) = @_; open (MESSAGE, $file) or open_fail $file; binmode (MESSAGE, ':bytes'); my ($from, $subject, $date); local $_; while () { last if /^$/; if (/^From:\s/i) { s/^From:\s+//i; ($from) = /^(.*)\s+<\S+>/; ($from) = /^\S+\s+\((.*)\)/ unless $from; $from = $_ unless $from; $from = header_decode ($from); $from =~ s/^\"//; $from =~ s/\"$//; $from =~ s/^\s+$//; $from =~ s/\s+$//; } elsif (/^Subject:\s/i) { (undef, $subject) = split (' ', $_, 2); $subject = header_decode ($subject); $subject =~ s/^\s+$//; $subject =~ s/\s+$//; chomp $subject; } elsif ($need_date && /^Date:\s/i) { (undef, $date) = split (' ', $_, 2); $date = Date::Parse::str2time ($date); } last if ($from && $subject && (!$need_date || $date)); } close MESSAGE; return ($from, $subject, $date); } # Stepping through a list of messages, read in the From and Subject (and maybe # the Date) and make a list of anonymous arrays containing that information. sub parse { my ($messages, $usedate) = @_; my @info; if ($usedate) { require Date::Parse } for (@$messages) { push (@info, [ headers ($_, $usedate) ]) } if ($usedate) { @info = sort { $$a[2] <=> $$b[2] } @info } \@info; } ############################################################################## # Output formatting ############################################################################## # Align a string that may contain double-wide characters. For right now, just # assume that all CJK idiographs are double-wide. There are other double-wide # characters that we're not dealing with properly yet. sub align { my ($string, $left) = @_; my $widechars = join ('', @WIDE); my @string = split (//, $string); my $output; while ($left > 0 && @string) { my $c = shift @string; my $width = 1; if ($c =~ /[$widechars]/o) { $width = 2; } elsif ($c =~ /\p{Mn}|\p{Me}|\p{Cf}/g) { $width = 0; } if ($width > $left) { last; } else { $output .= $c; $left -= $width; } } $output .= ' ' x $left if ($left > 0); return $output; } ############################################################################## # Main routine ############################################################################## # Used for heredocs to make them more readable. sub unquote { my ($string) = @_; $string =~ s/^: {0,7}//gm; $string } # Trim extraneous garbage from the path. my $fullpath = $0; $0 =~ s%^.*/%%; # Enable UTF-8 output by default. binmode (STDOUT, ':utf8'); # Get command-line options. my ($count, $onlynew, $onlyunseen, $summary, $usedate); while (@ARGV && $ARGV[0] =~ /^-/) { $_ = shift; if (/^-.*v/) { my $version = join (' ', (split (' ', $ID))[1..3]); $version =~ s/,v\b//; $version =~ s/(\S+)$/($1)/; $version =~ tr%/%-%; print "$version\n"; exit; } elsif (/^-.*h/) { print unquote (<<"EOM"); : Usage: mdfrm [-hvcdnu] [] : : Presents a summary of mail present in . If isn't : provided on the command line, \$MAILDIR is used; if it isn't set, the : default is $MAILDIR. : : -h Display this message. : -v Display the version number of $0. : -c Display only a count of messages. : -s Display a summary of messages (the default behavior). : -d Sort mail by message date. : -n Display only new mail (default if invoked as nfrm). : -u Display only unseen messages. EOM exit; } if (/^-.*c/) { $count = 1 } if (/^-.*d/) { $usedate = 1 } if (/^-.*n/) { $onlynew = 1 } if (/^-.*s/) { $summary = 1 } if (/^-.*u/) { $onlyunseen = 1 } if (!/^-[cdnsu]+$/) { die "$0: unknown flag $_\n"; } } $onlynew ||= ($0 =~ /nfrm/); $summary = $summary || !$count; die "$0: only one maildir argument supported\n" if @ARGV > 1; my $maildir = $ARGV[0] || $MAILDIR; # Get the list of messages and write out the listing. my $messages = contents ($maildir, $onlynew, $onlyunseen, !$usedate); if ($count) { my $desc = 'message' . (@$messages != 1 ? 's' : ''); $desc .= " in $maildir" if @ARGV; $desc = "unread $desc" if $onlyunseen; $desc = "new $desc" if $onlynew; print scalar @$messages, ' ', $desc, "\n"; } if ($summary) { my $info = parse ($messages, $usedate); for (@$info) { my ($author, $subject) = @$_; $author = align ($author, 25); $subject = align ($subject, 53); $subject =~ s/\s+$//; print "$author $subject\n"; } } exit; ############################################################################## # Documentation ############################################################################## =head1 NAME mdfrm-utf8 - List From and Subject of mail in a maildir in Unicode =head1 SYNOPSIS mdfrm-utf8 [B<-hvcdnsu>] [I] mdnfrm-utf8 [B<-hvcds>] [I] =head1 REQUIREMENTS Perl 5.8.0 or newer is required, including the Encode and Encode::Guess modules. If you want to use the B<-d> option, the Date::Parse module (part of the TimeDate distribution on CPAN) is also required. =head1 DESCRIPTION B prints the name from the From header and the Subject header of all mail stored in a given maildir sorted by arrival date. If no maildir is specified on the command line, mdfrm will default to the path in the environnment variable MAILDIR or to a default path of F<$HOME/Maildir> (configured at the top of this script). If invoked as B (or B), see the description of the option B<-n> below. If the From or Subject headers are encoded per RFC 2047, they're decoded into UTF-8 before display. Otherwise, B will attempt to guess the character set of the From and Subject headers by attempting several character sets in order. The default character sets tried (and the default order) is EUC-KR, Big5, GB2312, ISO-2022-JP, Shift JIS, and ISO-8859-15, in that order. The first character set that doesn't result in invalid sequences will be used. The character sets and order can be modified at the start of this script. B will attempt to adjust the formatting for Asian character sets that B renders using a wide font. The range of Unicode characters considered wide is set at the beginning of this script in case it needs modification. =head1 OPTIONS =over 4 =item B<-c> Rather than displaying a summary of the messages, just print a single line giving the count of messages. This option works in conjunction with B<-n> and B<-u> in the expected ways. If you want to see both the count and the summary, also use B<-s>. =item B<-d> Sort the displayed summary by the Date: headers of the mail rather than by arrival date. Use of this flag requires the Date::Parse module be installed (part of TimeDate). (If you do not use this flag, you do not need that module.) =item B<-h> Show a usage summary. =item B<-n> Limit the summary to new (unread) messages (messages located in the new subdirectory of the maildir and excluding messages in the cur subdirectory). This option is the default if this program is invoked as B or B (or any other name containing "nfrm"). =item B<-s> Print a summary of messages. This is the default behavior; this option is only needed in combination with B<-c> if you want to see both the count of messages and the summary. =item B<-u> Only show messages which haven't been seen (read with a mail client). This uses the status flags in the info section of a message name (the part after C<:2,>). See L for more information on status flags. This option doesn't really make sense in combination with B<-n> since new messages by definition haven't been read and a mail client should move messages into the cur subdirectory when the user looks at them. =item B<-v> Show the program version. =back =head1 DIAGNOSTICS =over 4 =item cannot open %s B was unable to open the specified file or directory. This can happen if the specified maildir was incorrect, or if mail was being read while B was running and some of the messages were moved or deleted while it was retrieving Subject or From headers. =item only one maildir argument supported More than one maildir was specified on the command line. Currently, B can only summarize one maildir at a time. To summarize multiple maildirs, run B multiple times with different arguments. =item unknown flag %s An unknown command-line flag was passed to B. =back =head1 EXAMPLES Summarize all of the mail in MAILDIR (F<$HOME/Maildir> if MAILDIR isn't set): mdfrm Display only the new messages in the maildir Mail/incoming, sorted by the Date headers rather than by file name (arrival time): mdfrm -nd Display a count of unseen messages (in both the new and cur directories): mdfrm -cu Show the count and summary of unseen messages, sorted by date. mdfrm -csd If maildirs are used exclusively on your system, it may be a good idea to install B as B. =head1 ENVIRONMENT =over 4 =item MAILDIR The path to the maildir to summarize, used if no maildir is given on the command line. =back =head1 BUGS B assumes that the output character set should be UTF-8 and does not support any other locale. The ordering of character sets when guessing, and the character sets attempted, should be configurable from the command line or via the environment, not only by modifying the script. The list of wide characters is not complete and needs to be further elaborated based on the Unicode data tables. Continuation lines are not handled in either Subject or From, even if the full header content would fit on a single line. =head1 SEE ALSO frm(1), maildir(5), xterm(1) The maildir format is documented at L. L will have the current version of this program. =head1 AUTHOR Russ Allbery =head1 COPYRIGHT AND LICENSE Copyright 1997, 1998, 2002, 2003, 2007 by Russ Allbery This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself. =head1 THANKS Mikko HEnninen and Sverre H. Huseby helped with the initial RFC 2047 decoding logic (no longer used). Support for nfrm (B<-n>) courtesy of VebjErn LjosE. Support for Date header sorting (B<-d>) inspired by the work of Matthew Mead. Documentation and usage message based on work by Sotiris Vassilopoulos. Implementation of B<-u>, B<-c>, and B<-s> from Russell Steinthal. =cut