#!/usr/bin/perl
our $ID = q$Id: mdfrm-utf8,v 1.1 2007-08-01 09:10:49 eagle Exp $;
#
# mdfrm-utf8 -- View a summary of e-mail in a maildir (supporting Unicode).
#
# Copyright 1997, 1998, 2002, 2003, 2007 by Russ Allbery <rra@stanford.edu>
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.

##############################################################################
# Site configuration
##############################################################################

# The location of the user's incoming maildir.  MAILDIR should always be
# honored; the fallback location may be site-dependent.
our $MAILDIR;
($MAILDIR = $ENV{MAILDIR} || $ENV{HOME} . '/Maildir') =~ s%/+$%%;

# Ranges of characters that should be considered wide.
our @WIDE = qw(\x{2E80}-\x{303E} \x{3041}-\x{33FF} \x{4E00}-\x{9FBB}
               \x{AC00}-\x{D7A3} \x{FF01}-\x{FF60});

# Character sets to try in order.  The most restrictive should be listed
# first.
our @CHARSETS = qw(euc-kr big5 gb2312 iso-2022-jp shiftjis iso-8859-15);

##############################################################################
# Modules and declarations
##############################################################################

require 5.008;

use strict;

use Encode qw(decode);
use Encode::Guess;

##############################################################################
# Reading maildir contents
##############################################################################

# Reports a file access error.
sub open_fail { die "$0: cannot open $_[0]: $!\n" }

# Returns a reference to a sorted array of all messages in either the cur or
# new directories of maildir.  Since the files are named with timestamps, this
# will give us a sorted listing by delivery time, which is what I want.
sub contents {
    my ($maildir, $onlynew, $onlyunseen, $sort) = @_;
    my @files;
    unless ($onlynew) {
        opendir (D, "$maildir/cur") or open_fail "$maildir/cur";
        @files = map { ["$maildir/cur/", $_] } grep { ! /^\./ } readdir D;
        @files = grep { $$_[1] !~ /.*:2,[^,:]*[sS][^,:]*$/ } @files
            if $onlyunseen;
    }
    opendir (D, "$maildir/new") or open_fail "$maildir/new";
    push (@files, map { ["$maildir/new/", $_] } grep { ! /^\./ } readdir D);
    if ($sort) {
        @files = map { $$_[0] . $$_[1] } sort { $$a[1] cmp $$b[1] } @files;
    } else {
        @files = map { $$_[0] . $$_[1] } @files;
    }
    \@files;
}

# Decodes a header, trying RFC 2047 encoding, character set guessing, and
# falling back on just assuming it's ISO 8859-15, since that's the most common
# case for me.
sub header_decode {
    my ($header) = @_;
    if ($header =~ /=\?\S+\?[bq]\?.*\?=/i) {
        eval { $header = decode ('MIME-Header', $header) };
    } elsif ($header =~ /[\e\x80-\xff]/) {
        my $decoder;
        for my $encoding (@CHARSETS) {
            $decoder = guess_encoding ($header, $encoding);
            last if ref $decoder;
        }
        if (ref $decoder) {
            $header = $decoder->decode ($header);
        } else {
            $header = decode ('iso-8859-15', $header);
        }
    }
    $header =~ s/\t/ /g;
    $header =~ s/[\x00-\x1f]//g;
    return $header;
}

# Returns who a given message is from and what the subject of the message is
# (as a two-element list).  We do this by parsing out the From and Subject
# headers of the message.  If our second argument is true, also grab the
# contents of the Date header of the article and convert it to seconds from
# epoch, returning that as well.
sub headers {
    my ($file, $need_date) = @_;
    open (MESSAGE, $file) or open_fail $file;
    binmode (MESSAGE, ':bytes');
    my ($from, $subject, $date);
    local $_;
    while (<MESSAGE>) {
        last if /^$/;
        if (/^From:\s/i) {
            s/^From:\s+//i;
            ($from) = /^(.*)\s+<\S+>/;
            ($from) = /^\S+\s+\((.*)\)/ unless $from;
            $from = $_ unless $from;
            $from = header_decode ($from);
            $from =~ s/^\"//;
            $from =~ s/\"$//;
            $from =~ s/^\s+$//;
            $from =~ s/\s+$//;
        } elsif (/^Subject:\s/i) {
            (undef, $subject) = split (' ', $_, 2);
            $subject = header_decode ($subject);
            $subject =~ s/^\s+$//;
            $subject =~ s/\s+$//;
            chomp $subject;
        } elsif ($need_date && /^Date:\s/i) {
            (undef, $date) = split (' ', $_, 2);
            $date = Date::Parse::str2time ($date);
        }
        last if ($from && $subject && (!$need_date || $date));
    }
    close MESSAGE;
    return ($from, $subject, $date);
}

# Stepping through a list of messages, read in the From and Subject (and maybe
# the Date) and make a list of anonymous arrays containing that information.
sub parse {
    my ($messages, $usedate) = @_;
    my @info;
    if ($usedate) { require Date::Parse }
    for (@$messages) { push (@info, [ headers ($_, $usedate) ]) }
    if ($usedate) { @info = sort { $$a[2] <=> $$b[2] } @info }
    \@info;
}

##############################################################################
# Output formatting
##############################################################################

# Align a string that may contain double-wide characters.  For right now, just
# assume that all CJK idiographs are double-wide.  There are other double-wide
# characters that we're not dealing with properly yet.
sub align {
    my ($string, $left) = @_;
    my $widechars = join ('', @WIDE);
    my @string = split (//, $string);
    my $output;
    while ($left > 0 && @string) {
        my $c = shift @string;
        my $width = 1;
        if ($c =~ /[$widechars]/o) {
            $width = 2;
        } elsif ($c =~ /\p{Mn}|\p{Me}|\p{Cf}/g) {
            $width = 0;
        }
        if ($width > $left) {
            last;
        } else {
            $output .= $c;
            $left -= $width;
        }
    }
    $output .= ' ' x $left if ($left > 0);
    return $output;
}

##############################################################################
# Main routine
##############################################################################

# Used for heredocs to make them more readable.
sub unquote { my ($string) = @_; $string =~ s/^: {0,7}//gm; $string }

# Trim extraneous garbage from the path.
my $fullpath = $0;
$0 =~ s%^.*/%%;

# Enable UTF-8 output by default.
binmode (STDOUT, ':utf8');

# Get command-line options.
my ($count, $onlynew, $onlyunseen, $summary, $usedate);
while (@ARGV && $ARGV[0] =~ /^-/) {
    $_ = shift;
    if (/^-.*v/) {
        my $version = join (' ', (split (' ', $ID))[1..3]);
        $version =~ s/,v\b//;
        $version =~ s/(\S+)$/($1)/;
        $version =~ tr%/%-%;
        print "$version\n";
        exit;
    } elsif (/^-.*h/) {
        print unquote (<<"EOM");
:       Usage: mdfrm [-hvcdnu] [<maildir>]
:
:       Presents a summary of mail present in <maildir>.  If <maildir> isn't
:       provided on the command line, \$MAILDIR is used; if it isn't set, the
:       default is $MAILDIR.
:
:         -h      Display this message.
:         -v      Display the version number of $0.
:         -c      Display only a count of messages.
:         -s      Display a summary of messages (the default behavior).
:         -d      Sort mail by message date.
:         -n      Display only new mail (default if invoked as nfrm).
:         -u      Display only unseen messages.
EOM
        exit;
    }
    if (/^-.*c/) { $count = 1      }
    if (/^-.*d/) { $usedate = 1    }
    if (/^-.*n/) { $onlynew = 1    }
    if (/^-.*s/) { $summary = 1    }
    if (/^-.*u/) { $onlyunseen = 1 }
    if (!/^-[cdnsu]+$/) {
        die "$0: unknown flag $_\n";
    }
}
$onlynew ||= ($0 =~ /nfrm/);
$summary = $summary || !$count;
die "$0: only one maildir argument supported\n" if @ARGV > 1;
my $maildir = $ARGV[0] || $MAILDIR;

# Get the list of messages and write out the listing.
my $messages = contents ($maildir, $onlynew, $onlyunseen, !$usedate);
if ($count) {
    my $desc = 'message' . (@$messages != 1 ? 's' : '');
    $desc .= " in $maildir" if @ARGV;
    $desc = "unread $desc" if $onlyunseen;
    $desc = "new $desc" if $onlynew;
    print scalar @$messages, ' ', $desc, "\n";
}
if ($summary) {
    my $info = parse ($messages, $usedate);
    for (@$info) {
        my ($author, $subject) = @$_;
        $author = align ($author, 25);
        $subject = align ($subject, 53);
        $subject =~ s/\s+$//;
        print "$author  $subject\n";
    }
}
exit;

##############################################################################
# Documentation
##############################################################################

=head1 NAME

mdfrm-utf8 - List From and Subject of mail in a maildir in Unicode

=head1 SYNOPSIS

mdfrm-utf8 [B<-hvcdnsu>] [I<maildir>]

mdnfrm-utf8 [B<-hvcds>] [I<maildir>]

=head1 REQUIREMENTS

Perl 5.8.0 or newer is required, including the Encode and Encode::Guess
modules.  If you want to use the B<-d> option, the Date::Parse module (part
of the TimeDate distribution on CPAN) is also required.

=head1 DESCRIPTION

B<mdfrm-utf8> prints the name from the From header and the Subject header of
all mail stored in a given maildir sorted by arrival date.  If no maildir is
specified on the command line, mdfrm will default to the path in the
environnment variable MAILDIR or to a default path of F<$HOME/Maildir>
(configured at the top of this script).  If invoked as B<mdnfrm-utf8> (or
B<nfrm>), see the description of the option B<-n> below.

If the From or Subject headers are encoded per RFC 2047, they're decoded
into UTF-8 before display.  Otherwise, B<mdfrm-utf8> will attempt to guess
the character set of the From and Subject headers by attempting several
character sets in order.  The default character sets tried (and the default
order) is EUC-KR, Big5, GB2312, ISO-2022-JP, Shift JIS, and ISO-8859-15, in
that order.  The first character set that doesn't result in invalid
sequences will be used.  The character sets and order can be modified at the
start of this script.

B<mdfrm-utf8> will attempt to adjust the formatting for Asian character sets
that B<xterm> renders using a wide font.  The range of Unicode characters
considered wide is set at the beginning of this script in case it needs
modification.

=head1 OPTIONS

=over 4

=item B<-c>

Rather than displaying a summary of the messages, just print a single line
giving the count of messages.  This option works in conjunction with B<-n>
and B<-u> in the expected ways.  If you want to see both the count and the
summary, also use B<-s>.

=item B<-d>

Sort the displayed summary by the Date: headers of the mail rather than by
arrival date.  Use of this flag requires the Date::Parse module be installed
(part of TimeDate).  (If you do not use this flag, you do not need that
module.)

=item B<-h>

Show a usage summary.

=item B<-n>

Limit the summary to new (unread) messages (messages located in the new
subdirectory of the maildir and excluding messages in the cur subdirectory).
This option is the default if this program is invoked as B<mdnfrm> or
B<nfrm> (or any other name containing "nfrm").

=item B<-s>

Print a summary of messages.  This is the default behavior; this option is
only needed in combination with B<-c> if you want to see both the count of
messages and the summary.

=item B<-u>

Only show messages which haven't been seen (read with a mail client).  This
uses the status flags in the info section of a message name (the part after
C<:2,>).  See L<http://cr.yp.to/proto/maildir.html> for more information on
status flags.

This option doesn't really make sense in combination with B<-n> since new
messages by definition haven't been read and a mail client should move
messages into the cur subdirectory when the user looks at them.

=item B<-v>

Show the program version.

=back

=head1 DIAGNOSTICS

=over 4

=item cannot open %s

B<mdfrm> was unable to open the specified file or directory.  This can
happen if the specified maildir was incorrect, or if mail was being read
while B<mdfrm> was running and some of the messages were moved or deleted
while it was retrieving Subject or From headers.

=item only one maildir argument supported

More than one maildir was specified on the command line.  Currently,
B<mdfrm> can only summarize one maildir at a time.  To summarize multiple
maildirs, run B<mdfrm> multiple times with different arguments.

=item unknown flag %s

An unknown command-line flag was passed to B<mdfrm>.

=back

=head1 EXAMPLES

Summarize all of the mail in MAILDIR (F<$HOME/Maildir> if MAILDIR isn't
set):

    mdfrm

Display only the new messages in the maildir Mail/incoming, sorted by the
Date headers rather than by file name (arrival time):

    mdfrm -nd

Display a count of unseen messages (in both the new and cur directories):

    mdfrm -cu

Show the count and summary of unseen messages, sorted by date.

    mdfrm -csd

If maildirs are used exclusively on your system, it may be a good idea to
install B<mdfrm> as B<frm>.

=head1 ENVIRONMENT

=over 4

=item MAILDIR

The path to the maildir to summarize, used if no maildir is given on the
command line.

=back

=head1 BUGS

B<mdfrm-utf8> assumes that the output character set should be UTF-8 and does
not support any other locale.

The ordering of character sets when guessing, and the character sets
attempted, should be configurable from the command line or via the
environment, not only by modifying the script.

The list of wide characters is not complete and needs to be further
elaborated based on the Unicode data tables.

Continuation lines are not handled in either Subject or From, even if the
full header content would fit on a single line.

=head1 SEE ALSO

frm(1), maildir(5), xterm(1)

The maildir format is documented at L<http://cr.yp.to/proto/maildir.html>.

L<http://www.eyrie.org/~eagle/software/mdfrm/> will have the current
version of this program.

=head1 AUTHOR

Russ Allbery <rra@stanford.edu>

=head1 COPYRIGHT AND LICENSE

Copyright 1997, 1998, 2002, 2003, 2007 by Russ Allbery <rra@stanford.edu>

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=head1 THANKS

Mikko HE<auml>nninen and Sverre H. Huseby helped with the initial RFC 2047
decoding logic (no longer used).  Support for nfrm (B<-n>) courtesy of
VebjE<oslash>rn LjosE<aring>.  Support for Date header sorting (B<-d>)
inspired by the work of Matthew Mead.  Documentation and usage message based
on work by Sotiris Vassilopoulos.  Implementation of B<-u>, B<-c>, and B<-s>
from Russell Steinthal.

=cut