X-Git-Url: https://diplodocus.org/git/minc/blobdiff_plain/569e7c8a550bb469aa95d31c340830d567635269..cdef2c782619aacd262bf37efa5308073dff5c66:/minc

diff --git a/minc b/minc
index 743accc..22ef143 100755
--- a/minc
+++ b/minc
@@ -1,6 +1,4 @@
-#! /usr/local/bin/perl
-
-# $Id$
+#!/usr/bin/perl
 
 =head1 NAME
 
@@ -8,32 +6,49 @@ B<minc> - incorporate mail from a maildir into mh folders
 
 =head1 SYNOPSIS
 
+B<minc> [B<-m> I<MAX>] [B<-n>]
+
 B<minc> B<-d>
 
-B<minc> [B<-m>] [B<-n>] [B<-p>]
+B<minc> B<-r>
 
 =head1 DESCRIPTION
 
 B<minc> incorporates mail from a maildir to a mh folder hierarchy.  It
 takes mail from a maildir folder (not a maildir folder hierarchy),
-optionally checks for spam with a user-defined spam-checking function,
-and optionally filters mail into separate mh folders.
-
-The filtering is quite sophisticated, as it is done using real Perl
-matching (m//) commands.
+running each message through regular expression based filter and hook
+functions to determine in which folder to store it or whether it's
+spam.  Post-processing hooks may be applied to each message.
+
+As it processes each message, B<minc> prints a line for each message
+similar to B<inc(1)> and B<scan(1)>.  This line includes the folder
+and message number in which the message was stored, the last 'From'
+header, and the last 'Subject' header.  These fields are truncated to
+fit in the user's terminal (see COLUMNS in B<ENVIRONMENT> below) in
+the following proportions: folder (0.1), message number (0.0625), from
+header (0.175).  Any of these may be overridden with $SCAN_P_FOLDER,
+$SCAN_P_MESSAGE, or $SCAN_P_FROM.  The subject always fills out the
+rest of the line.
 
 =cut
 
 use strict;
 use warnings;
 
+$SIG{'PIPE'} = 'IGNORE';
+
 use Data::Dumper;
 use Errno;
-use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
+use Fcntl;
 use FileHandle;
+use File::FcntlLock;
+use File::Temp qw(tempfile);
 use File::stat;
-use Getopt::Std; $Getopt::Std::STANDARD_HELP_VERSION = 1;
-use POSIX qw(strftime WEXITSTATUS WIFEXITED);
+use Getopt::Long qw(:config gnu_getopt no_ignore_case);
+use POSIX qw(WIFEXITED WEXITSTATUS WIFSIGNALED WTERMSIG);
+use Pod::Usage;
+
+use constant LOCK_TRIES => 60;
 
 our $VERSION = 1;
 
@@ -42,9 +57,11 @@ our $VERSION = 1;
 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
 my $MAGIC_TO_TOKEN = ' TO';
 
-# List of SPAM message numbers, scanned at the end so the user can
-# check for false positives.
-my @SPAM;
+# Mapping of message numbers to array references.  The first element is set by
+# filter_mail to a reference to a header hash for the message; the second is
+# set by maildier_spam to the name of the message file in the spam maildir.
+# scan_spam scans this at the end so the user can check for false positives.
+my %SPAM;
 
 =head1 OPTIONS
 
@@ -53,9 +70,9 @@ my @SPAM;
 =item B<-d>
 
 Dump (using Data::Dumper) the FILTERS list and exit.  This is useful
-for testing the syntax of .mincfilter.
+for testing the syntax of .minc.
 
-=item B<--help>
+=item B<-h>
 
 Show help.
 
@@ -68,48 +85,45 @@ Stop processing after I<MAX> messages.
 Dry run; do not actually incorporate the mail, but log and report to
 stdout/stderr as normal.
 
-=item B<-p>
+=item B<-r>
 
-Print the filename of each message before checking it for spam.  This
-can be handy if a particular message is giving the spam checker a
-problem.
+Rebuild `mhpath +`/.folders from scratch, processing no mail.
 
 =back
 
 =cut
 
-my $dumpfilters = 0;
+my $dumpfilters;
+my $help;
 my $maxmsgs;
-our $run = 1;
-my $printfilenames = 0;
-
-my %opts;
-
-if (not getopts('dm:np', \%opts)) {
-    exit(2);
-}
-
-if ($opts{'d'}) {
-    $dumpfilters = 1;
-} elsif ($opts{'n'}) {
-    $run = 0;
+my $norun;
+my $rebuild_dot_folders;
+
+if (!caller()) {
+GetOptions(
+           'd' => \$dumpfilters,
+           'h|help' => \$help,
+           'm=i' => \$maxmsgs,
+           'n' => \$norun,
+           'r' => \$rebuild_dot_folders,
+          ) or pod2usage();
+$help and pod2usage(-exitstatus=>0, -verbose=>1);
 }
 
-if (defined($opts{'m'})) {
-    $maxmsgs = $opts{'m'};
-}
-
-if ($opts{'p'}) {
-    $printfilenames = 1;
-}
+our $run = !$norun;
 
 =head1 ENVIRONMENT
 
 =over 4
 
+=item COLUMNS
+
+How many columns the user's terminal can hold, used to print scan
+lines for each processed message.  Defaults to 80.
+
 =item HOME
 
-Where configuration files (.mincfilter) are found.  Also,
+Where the configuration file (.minc) is found.  Also,
 $HOME/Maildir is used for the maildir if MAILDIR is not set.
 
 =item MAILDIR
@@ -120,54 +134,28 @@ Where mail is delivered.
 
 =cut
 
-use Env qw(HOME MAILDIR);
+use Env qw(COLUMNS HOME MAILDIR);
+
+$COLUMNS ||= 80;
 
 if (not $HOME) {
     die("HOME environment variable must be set.\n");
 }
-if (not $MAILDIR) {
-    $MAILDIR = "$HOME/Maildir";
-}
 
 =head1 FILES
 
 =over 4
 
-=item $HOME/.mincfilter
-
-This file is Perl code (included via the 'require' directive) which is
-expected to define the FILTERS list.
-
-=item $HOME/.mincspam
-
-If this file exists, B<minc> will include it with the expectation that
-it will define a B<spam_check> function.  This function takes a
-message filename as an argument and returns 1 if the message is spam,
-else 0.  If this file does not exist, B<minc> will define a simple
-function that always returns 0.
-
-One of B<minc>'s global variables is available to the user-defined
-B<spam_check> function: $run.  This boolean should be honored;
-B<spam_check> should only take real action (i.e. removing or creating
-files, running external programs, etc.) if $run is non-zero.
-
-This file may also declare two other functions: B<spam_start_hook> and
-B<spam_stop_hook>.  The former is passed no arguments and is expected
-to return a list.  This list is a "baton" that will also be passed to
-B<spam_check> and B<spam_stop_hook>.  It can hold anything
-B<spam_check> will need to do its job, whether network sockets, pipes,
-or whatever.
-
-XXX: need more details about the spam-handling process; for now read
-the code.
-
-=item `mhpath +`/logs/minc.log
+=item $HOME/.minc
 
-Where B<minc> logs what it does, unless in -n mode.
+This file is Perl code (included via the 'require' directive) which
+may define the FILTERS list, @start_hooks, @filter_hooks,
+@post_store_hooks, and @stop_hooks.
 
-=item `mhpath +`/logs/dryrun.log
+=item `mhpath +`/.folders
 
-Where B<minc> logs what it would do; used in -n mode.
+B<minc> adds all folders it filters into to this file, which is used
+by lukem's B<new(1)> (XXX need a link).
 
 =item `mhpath +`/.minc.context
 
@@ -185,90 +173,73 @@ marked unseen.
 =cut
 
 our @FILTERS;
-require "$HOME/.mincfilter";
+our (@start_hooks, @stop_hooks, @filter_hooks, @post_store_hooks);
+our $SCAN_P_FOLDER = 0.1;
+our $SCAN_P_MESSAGE = 0.0625;
+our $SCAN_P_FROM = 0.175;
+our @folder_sort_list = (qr/^inbox$/);
+our $folder_sorter = sub { sort_by_list(@_, @folder_sort_list) };
 
 my $mh;
-my $logfile;
 
-$mh = `mhpath +`;
+$mh = `mhpath +`; $? >= 0 or die;
 chomp($mh);
-
-if ($run) {
-    $logfile = $mh . '/logs/minc.log';
-} else {
-    $logfile = $mh . '/logs/dryrun.log';
+if (!$mh) {
+    die('mhpath did not give MH mail path: ' . exit_msg($?));
 }
 
 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
 
+# TODO: Support dot-locking too.  This works on FreeBSD and Linux.
+my $locking = `mhparam datalocking`;
+$? < 0 && die;
+$? == 0 || die('mhparam datalocking: ' . exit_msg($?));
+chomp($locking);
+if ($locking ne 'fcntl') {
+    die("mh locking $locking not supported; only fcntl");
+}
+
 
 ###############################################################################
-# Logging
-
-sub mylog {
-    my $timestamp = strftime('%b %e %H:%M:%S', localtime);
-    my $msg;
-    foreach my $part (@_) {
-        if (defined($part)) {
-            $msg .= $part;
-        }
-    }
-    # no newlines in the log message, thanks
-    $msg =~ s/\n/ /gm;
+# Utility procedures
 
-    open(LOG, ">>$logfile") or die("open(>>$logfile): $!");
-    print(LOG "$timestamp $msg\n") or die("print(>>$logfile): $!");
-    close(LOG) or die("close($logfile): $!");
+sub exit_msg {
+    my $status = shift;
+    WIFEXITED($status) && return 'exited with status ' . WEXITSTATUS($status);
+    WIFSIGNALED($status) && return 'killed with signal ' . WTERMSIG($status);
+    # WTF
+    "died ($status)";
 }
 
-sub logheader {
-    my ($text, @contents) = @_;
-    my $last;
+sub sort_by_list {
+    my $a = shift;
+    my $b = shift;
 
-    if (@contents) {
-        $last = $contents[-1];
-    } else {
-        $last = '';
+    for my $i (@_) {
+        my $am = $a =~ $i;
+        my $bm = $b =~ $i;
+        if ($am) {
+            if ($bm) {
+                last;
+            }
+            return -1;
+        } elsif ($bm) {
+            return 1;
+        }
     }
 
-    mylog('<< ', $text, $last);
-}
-
-sub log_headers {
-    my %headers = @_;
-
-    # For an explanation of the %headers structure, see the
-    # get_headers function below.
-    logheader('From:       ', @{$headers{'return-path'}});
-    logheader('To:         ', @{$headers{'to'}});
-    logheader('Subject:    ', @{$headers{'subject'}});
-    logheader('Message-Id: ', @{$headers{'message-id'}});
+    return $a cmp $b;
 }
 
-
-###############################################################################
-# Utility procedures
-
 sub mkfolder {
-    my $mhfolder = shift;
-    my $folder;
+    my $folder = shift;
     my $target;
     my $component;
 
-    $folder = $mh . '/' . $mhfolder;
-    $target = '';
-
-    if (not -d $folder) {
-        foreach $component (split('/', $folder)) {
-            if (defined($component) and length($component) >= 1) {
-                $target = $target . '/' . $component;
-                if (-d $target or mkdir($target)) {
-                    next;
-                } else {
-                    die("Failed to create +$mhfolder ($component)");
-                }
-            }
-        }
+    $target = $mh;
+    foreach $component (split('/', $folder)) {
+        $target = join('/', $target, $component);
+        -d $target or mkdir($target) or die("mkdir($target): $!");
     }
 }
 
@@ -277,7 +248,7 @@ sub getfiles {
     my @result;
 
     if (not opendir(DIR, $dir)) {
-        die("Failed opendir($dir)");
+        die("opendir($dir): $!");
     }
 
     # Initialize $! to 0 (success) because stupid stupid Perl provides
@@ -286,27 +257,28 @@ sub getfiles {
     # to detect an error.  Real Programmers don't handle errors,
     # right?  >sigh<
     $! = 0;
-    @result = grep {
-        ($_ ne '.' and $_ ne '..')
-          and $_ = "$MAILDIR/new/$_"
-      } readdir(DIR);
-
+    @result = readdir(DIR);
     if ($! != 0) {
-        die("Failed readdir($dir)");
+        die("readdir($dir): $!");
     }
 
-    if (scalar(@result) == 0) {
-        exit(0);
+    if (@result <= 2) {
+        return ();
     }
 
     STDOUT->autoflush(1);
-    print(scalar(@result), " messages...");
+    if (@result == 3) {
+        print('1 message...');
+    } else {
+        print(@result - 2, ' messages...');
+    }
 
     closedir(DIR);
 
     return @result;
 }
 
+my %msgnum_cache;
 sub get_highest_msgnum {
     my $mhfolder = shift;
     my $dir;
@@ -314,16 +286,21 @@ sub get_highest_msgnum {
     my $highest;
     my $msgnum;
 
+    if (defined($msgnum_cache{$mhfolder})) {
+        return $msgnum_cache{$mhfolder}++;
+    }
+
     $dir = "$mh/$mhfolder";
     if (not opendir(DIR, $dir)) {
-        die("Failed opendir($dir)");
+        die("opendir($dir): $!");
     }
 
+    # Insert rant from getfiles here.
     $! = 0;
     @list = readdir(DIR);
 
     if ($! != 0) {
-        die("Failed readdir($dir)");
+        die("readdir($dir): $!");
     }
 
     closedir(DIR);
@@ -338,92 +315,193 @@ sub get_highest_msgnum {
         }
     }
 
-    return $highest;
+    $msgnum_cache{$mhfolder} = $highest;
+    return $msgnum_cache{$mhfolder}++;
+}
+
+sub mark {
+    my $folder = shift;
+    my $msgnum = shift;
+    my $seq = shift;
+
+    my $fn = "$mh/$folder/.mh_sequences";
+    my ($fh, $e) = lkopen_fcntl($fn, O_RDWR | O_CREAT, 0600);
+
+    my @sequences = <$fh>;
+    chomp(@sequences);
+
+    seek($fh, 0, SEEK_SET) or die("seek($fn): $!");
+    truncate($fh, 0) or die("truncate($fn): $!");
+
+    my $marked = 0;
+    for $_ (@sequences) {
+        if (/^$seq: (.*)/) {
+            my @parts;
+            my @result;
+            my $done = 0;
+            for my $part (split(' ', $1)) {
+                if (not $done) {
+                    my ($st, $en) = split('-', $part);
+                    if ((defined($en) and ($msgnum >= $st and $msgnum <= $en))
+                        or $msgnum == $st) {
+                        # It's already there.
+                        $done = 1;
+                    }
+                    if (defined($en)) {
+                        if ($st - 1 == $msgnum) {
+                            $part = "$msgnum-$en";
+                            $done = 1;
+                        } elsif ($en + 1 == $msgnum) {
+                            $part = "$st-$msgnum";
+                            $done = 1;
+                        }
+                    } else {
+                        if ($part - 1 == $msgnum) {
+                            $part = "$msgnum-$part";
+                            $done = 1;
+                        } elsif ($part + 1 == $msgnum) {
+                            $part = "$part-$msgnum";
+                            $done = 1;
+                        }
+                    }
+                }
+                push(@result, $part);
+            }
+            if (not $done) {
+                push(@result, $msgnum);
+            }
+            print($fh "$seq: ", join(' ', @result), "\n");
+            $marked = 1;
+        } else {
+            print($fh "$_\n");
+        }
+    }
+    $marked or print($fh "$seq: $msgnum\n");
+    close($fh) or die("close(>$fn): $!");
+}
+
+# Based on nmh's lkopen_fcntl
+# Return 0 for success, errno on failure.
+sub lkopen_fcntl {
+    my $fn = shift;
+    my $access = shift;
+    my $mode = shift;
+    my $errno;
+
+    # The assumption here is that if you open the file for writing, you
+    # need an exclusive lock.
+
+    my $tries = LOCK_TRIES;
+    for (;;) {
+        sysopen(my $fh, $fn, $access, $mode) or die("sysopen($fn): $!");
+
+        my $flk = File::FcntlLock->new;
+        $flk->l_start(0);
+        $flk->l_len(0);
+        $flk->l_type(($access & O_ACCMODE) == O_RDONLY ? F_RDLCK : F_WRLCK);
+        $flk->l_whence(SEEK_SET);
+
+        # Really should only retry on EAGAIN and EINTR...
+        if ($flk->lock($fh, F_SETLK)) {
+            return $fh;
+        }
+
+        $errno = $flk->lock_errno;
+        close($fh) or die("close($fn): $!");
+
+        if (--$tries == 0) {
+            last;
+        }
+        sleep(1);
+    }
+
+    local $! = $errno;
+    die("failed to lock $fn: $!");
 }
 
+my @filtered;
 sub store_message {
-    my $msg = shift;
+    my $inbox = shift;
+    my $msg = shift;            # rename $src_msg
     my $mhfolder = shift;
-    my $msgnum;
-    my $try;
-    my $mhmsg;
-    my $status;
+    my ($msgnum, $mhmsg);       # rename $dst_msg and $dst_msgpath
 
     # We must do this even in -n mode because later steps fail without
     # it.  This should be harmless.
     mkfolder($mhfolder);
 
-    # This loop is a modified version of the maildir delivery algorithm.
-    $msgnum = get_highest_msgnum($mhfolder);
-    for ($try = 0; ; $try++) {
-        $msgnum++;
+    if ("+$mhfolder" eq $inbox) {
+        # If @filtered is empty, this message already has the right number.
+        if (!($msgnum = shift(@filtered))) {
+            $msg =~ m|.*/(\d+)$|;
+            return $1;
+        }
         $mhmsg = "$mh/$mhfolder/$msgnum";
-
-        if (not stat($mhmsg)) {
-            if ($!{ENOENT}) {
-                # Now we have a non-existent file, let's try to create
-                # it.  We must create a zero-byte file first because a
-                # file my appear between our happy stat results and
-                # our later rename(2), which would clobber said file.
-                # So attempt to create a file with this name.  If it
-                # succeeds, in just a bit here we'll knowingly clobber
-                # this file with the rename(2) call.
-
-                # Another way to do this is not to use rename(2), but
-                # use link(2) + unlink(2) instead.  That's how the
-                # standard maildir algorithm does it.  Each method has
-                # a disadvantage: the program may crash between the
-                # link(2) and unlink(2) calls.  With the standard
-                # maildir algorithm, that means the message will end
-                # up duplicated.  The advantage of creating an empty
-                # file followed by rename(2) is that an extra empty
-                # file is left behind as opposed to a duplicate
-                # message.  This is more easily detected by the user.
-                if ($run) {
-                    if (sysopen(MSG, "$mhmsg",
-                                O_WRONLY | O_EXCL | O_CREAT, 0600)) {
-                        close(MSG);
+    } else {
+        # This loop is a modified version of the maildir delivery algorithm.
+        $msgnum = get_highest_msgnum($mhfolder);
+        my $try;
+        for ($try = 0; ; $try++) {
+            $msgnum++;
+            $mhmsg = "$mh/$mhfolder/$msgnum";
+
+            if (not stat($mhmsg)) {
+                if ($!{ENOENT}) {
+                    # Now we have a non-existent file, let's try to create
+                    # it.  We must create a zero-byte file first because a
+                    # file my appear between our happy stat results and
+                    # our later rename(2), which would clobber said file.
+                    # So attempt to create a file with this name.  If it
+                    # succeeds, in just a bit here we'll knowingly clobber
+                    # this file with the rename(2) call.
+
+                    # Another way to do this is not to use rename(2), but
+                    # use link(2) + unlink(2) instead.  That's how the
+                    # standard maildir algorithm does it.  Each method has
+                    # a disadvantage: the program may crash between the
+                    # link(2) and unlink(2) calls.  With the standard
+                    # maildir algorithm, that means the message will end
+                    # up duplicated.  The advantage of creating an empty
+                    # file followed by rename(2) is that an extra empty
+                    # file is left behind as opposed to a duplicate
+                    # message.  This is more easily detected by the user.
+                    if ($run) {
+                        if (sysopen(MSG, $mhmsg,
+                                    O_WRONLY | O_EXCL | O_CREAT, 0600)) {
+                            close(MSG);
+                            last;
+                        }
+                    } else {
                         last;
                     }
-                } else {
-                    last;
                 }
             }
-        }
 
-        # This algorithm is different from the maildir one; let's make
-        # 10 tries instead of 3.
-        if ($try == 9) {
-            die("Attempted filename $mhmsg exists.");
-        }
+            # This algorithm is different from the maildir one; let's make
+            # 10 tries instead of 3.
+            if ($try == 9) {
+                die("Attempted filename $mhmsg exists.");
+            }
 
-        # This algorithm is different; i don't think we need to sleep.
-        #sleep(2);
-    }
+            # This algorithm is different; i don't think we need to sleep.
+            #sleep(2);
+        }
 
-    if ($mhfolder ne 'SPAM') {
-        mylog('+', $mhfolder);
+        if ($run) {
+            # Mark each message as soon as we store it and bomb if that
+            # fails.  While it is slow, it is not safe to store multiple
+            # messages and then have a failure before marking some (or
+            # all).
+            if ($mhfolder ne 'SPAM') {
+                mark($mhfolder, $msgnum, 'unseen');
+            }
+        }
     }
+    push(@filtered, $msg);
 
     if ($run) {
         if (not rename($msg, $mhmsg)) {
-            die("Failed rename($msg, $mhmsg)");
-        }
-
-        # Mark each message as soon as we store it and bomb if that
-        # fails.  While it is slow, it is not safe to store multiple
-        # messages and then have a failure before marking some (or
-        # all).
-        if ($mhfolder eq 'SPAM') {
-            push(@SPAM, $msgnum);
-        } else {
-            $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
-                             'unseen', '-add');
-            if (not WIFEXITED($status)) {
-                die("Failed to run mark");
-            } elsif (WEXITSTATUS($status) != 0) {
-                die("Failed to mark message unseen.");
-            }
+            die("rename($msg, $mhmsg): $!");
         }
     }
 
@@ -444,7 +522,7 @@ sub get_headers {
     my $fieldname;              # unmolested header name
     my $contents;               # contents of header
 
-    open(MSG, $msg);
+    open(MSG, $msg) or die("open(MSG, $msg): $!");
     while (<MSG>) {
         chomp;
         if (length == 0) {
@@ -455,8 +533,8 @@ sub get_headers {
             # folded header continuation
 
             if (not defined($current)) {
-                print(STDERR "Malformed message, cannot parse headers.\n");
-                return ();
+                warn('Malformed message, cannot parse headers.');
+                next;
             }
 
             @{$headers{$current}}[-1] .= $_;
@@ -476,7 +554,7 @@ sub get_headers {
     }
     close(MSG);
 
-    return %headers;
+    return \%headers;
 }
 
 
@@ -485,128 +563,349 @@ sub get_headers {
 
 sub find_mh_folder {
     my $msg = shift;
-    my %headers;
-    my $filterref;
-    my @filter;
-    my $header;
-    my $contents;
-    my $pair;
-    my $match;
-    my $expression;
-    my $result;
-
-    %headers = get_headers($msg);
-    if (not %headers) {
+    my $header = shift;
+
+    if (not %$header) {
         return 'malformed';
     }
 
-    log_headers(%headers);
-
     # Walk the list of filters.  This structure is documented in
     # pod at the end of the program.
-    foreach $filterref (@FILTERS) {
-        @filter = @$filterref;
-        $header = shift(@filter);
-
-        # Handle filters using the magic TO header specially.
-        if ($header eq $MAGIC_TO_TOKEN) {
-            foreach $header (keys(%headers)) {
-                if ($header =~ /$MAGIC_TO_REGEX/i) {
-                    foreach $contents (@{$headers{lc($header)}}) {
-                        foreach $pair (@filter) {
-                            ($match, $expression) = @$pair;
-                            if ($contents =~ /$match/) {
-                                return $expression;
-                            }
-                        }
-                    }
-                }
-            }
-
-            # Now that it's been processed specially, skip normal handling.
+    for my $filterref (@FILTERS) {
+        if (ref($filterref) eq 'CODE') {
+            my $m = $filterref->($header, $msg);
+            $m && return $m;
             next;
         }
 
-        # Walk the list of message headers matching the filter's
-        # specified header.
-        foreach $contents (@{$headers{lc($header)}}) {
-            # Walk the filter's list of match/expression pairs.
-            foreach $pair (@filter) {
-                ($match, $expression) = @$pair;
-                if ($contents =~ /$match/i) {
-                    if (eval "\$result = \"$expression\"") {
-                        return $result;
-                    }
+        my $m = match($header, @$filterref);
+        $m && return $m;
+    }
+
+    return 'inbox';
+}
+
+# Test all the header fields against each [regexp, folder-expression] pair.
+sub match {
+    my $header = shift;
+    my $filter_field = shift;
+    my @filters = @_;
+
+    # Handle filters using the magic TO header specially.
+    if ($filter_field eq $MAGIC_TO_TOKEN) {
+        for my $field_name (keys(%$header)) {
+            if ($field_name =~ /$MAGIC_TO_REGEX/i) {
+                my $m = match_one_field($header->{$field_name}, \@filters);
+                $m && return $m;
+            }
+        }
+        # Now that it's been processed specially, skip normal handling.
+        return;
+    }
+
+    # Walk the list of header fields matching the filter's specified header.
+    my $m = match_one_field($header->{lc($filter_field)}, \@filters);
+    $m && return $m;
+}
+
+# Test all the values of one header field against each [regexp,
+# folder-expression] pair.
+sub match_one_field {
+    my $values = shift;
+    my $filters = shift;
+    for my $value (@$values) {
+        for my $pair (@$filters) {
+            my ($regexp, $expression) = @$pair;
+            if ($value =~ $regexp) {
+                my $result;
+                if (eval "\$result = \"$expression\"") {
+                    return $result;
                 }
             }
         }
     }
+}
 
-    return 'inbox';
+sub scan_line {
+    my ($headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns) = @_;
+    my $from = '';
+    my $subject = '';
+    # Sometimes these headers are missing...
+    eval { $from = [@{$headers->{'from'}}]->[-1] };
+    eval { $subject = [@{$headers->{'subject'}}]->[-1] };
+    # Replace garbage characters.
+    for ($from, $subject) {
+        tr/\x00-\x1f\x80-\xff/?/;
+    }
+    return sprintf("\%-${nf}s \%${nm}d \%-${nF}s \%s",
+                   substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm),
+                   substr($from, 0, $nF),
+                   substr($subject, 0, $ns));
 }
 
 sub filter_mail {
-    my @msglist = @_;
-    my $msgcount = scalar(@msglist);
-    my $msgcountlen = length($msgcount);
+    my $inbox = shift;
+    @_ or return ();
+    my $msgcount = @_ - 2; # don't count . and ..
+    my $len = length($msgcount);
     my @baton;
     my $msg;
     my $mhfolder;
     my $spam = 0;
     my $saved = 0;
-    my %FOLDERS = ('SPAM'=>1);
+    my $msgnum;
+    my %folders;
+
+    # XXX lame names
+    my $nf = int($COLUMNS * $SCAN_P_FOLDER);
+    my $nm = int($COLUMNS * $SCAN_P_MESSAGE);
+    my $nF = int($COLUMNS * $SCAN_P_FROM);
+    my $ns = $COLUMNS - $nf - $nm - $nF - 3;
+
+    my %batons;
+    for my $hook (@start_hooks) {
+        my ($handle, @baton) = $hook->();
+        if (defined($handle)) {
+            $batons{$handle} = [@baton];
+        }
+    }
 
-    if (-f "$HOME/.mincspam") {
-        require "$HOME/.mincspam";
-    } else {
-        eval "sub spam_start_hook { return (); }";
-        eval "sub spam_stop_hook { }";
-        eval "sub spam_check { return 0; }";
+    my $tty = -t STDOUT;
+    if (not $tty) {
+        # Print a newline after the incomplete "N messages..." line.
+        print("\n");
     }
 
-    @baton = spam_start_hook();
+    for $msg (@_) {
+        ($msg eq '.' or $msg eq '..') and next;
+
+        my $headers = get_headers($msg);
+
+        undef($mhfolder);
+        for my $hook (@filter_hooks) {
+            my $result = $hook->(\%batons, $headers, $msg);
+            defined($result) and ($mhfolder = $result);
+        }
+
+        defined($mhfolder) or ($mhfolder = find_mh_folder($msg, $headers));
+
+        $msgnum = store_message($inbox, $msg, $mhfolder);
+        $folders{$mhfolder}++;
+
+        if ($tty) {
+            print("\r");
+        }
 
-    foreach $msg (@msglist) {
-        if (spam_check($msg, @baton)) {
-            $mhfolder = 'SPAM';
-            $spam = scalar(@SPAM)+ 1;
+        if ($mhfolder eq 'SPAM') {
+            $spam++;
+            $SPAM{$msgnum} = [$headers, undef];
         } else {
-            $mhfolder = find_mh_folder($msg);
             $saved++;
+            print(scan_line($headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns),
+                  "\n");
         }
 
-        store_message($msg, $mhfolder);
-
-        print("\r");
-        if (not $FOLDERS{$mhfolder}) {
-            print(' ' x $msgcountlen);
-            print("                                \r+$mhfolder\n");
-            $FOLDERS{$mhfolder} = 1;
+        for my $hook (@post_store_hooks) {
+            $hook->(\%batons, $headers, $mhfolder, $msgnum);
         }
 
-        printf('%6d SPAM %6d saved %6d/%1d',
-               $spam, $saved, $spam + $saved, $msgcount);
+        if ($tty) {
+            printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d",
+                   $spam, $saved, $spam + $saved, $msgcount);
+        }
 
         defined($maxmsgs) and ($spam + $saved < $maxmsgs or last);
     }
     print("\n");
 
-    spam_stop_hook(@baton);
+    for my $hook (@stop_hooks) {
+        $hook->(\%batons);
+    }
+
+    return %folders;
+}
+
+sub build_dot_folders {
+    my $folders = shift;
+    my $fh = shift;
+    my $fn;
+
+    if (defined($fh)) {
+        while (<$fh>) {
+            chomp;
+            $folders->{$_}++;
+        }
+    }
+
+    eval { ($fh, $fn) = tempfile("$mh/.folders.XXXXX") };
+    if ($@) {
+        warn("$@");
+        return;
+    }
+
+    for my $folder (sort { $folder_sorter->($a,$b) } keys(%$folders)) {
+        print($fh "$folder\n");
+    }
+
+    if (not close($fh)) {
+        warn("close($fn): $!");
+        unlink($fn) or warn("unlink($fn): $!");
+        return;
+    }
+
+    rename($fn, "$mh/.folders") or warn("rename($fn, $mh/.folders): $!");
+}
+
+sub create_dot_folders {
+    if (-f "$HOME/.minc") {
+        require "$HOME/.minc";
+    }
+
+    my %folders;
+    open(my $fh, '-|', 'folders', '-fast', '-recur')
+      or die("open(folders|): $!");
+    build_dot_folders(\%folders, $fh);
+    return 0;
+}
+
+sub update_dot_folders {
+    my $folders = shift;
+    my $fh;
+
+    if (not open($fh, "$mh/.folders") and not $!{ENOENT}) {
+        # For ENOENT, we go ahead and create it, else we error and
+        # don't clobber it.
+        warn("open($mh/.folders): $!");
+        return;
+    }
+
+    build_dot_folders($folders, $fh);
+}
+
+# XXX Could use some unification with getfiles...
+sub maildir_spam {
+    my $dir = "$MAILDIR/spam/new";
+
+    if (not chdir($dir)) {
+        $!{ENOENT} or print(STDERR "skipping maildir spam: chdir($dir): $!\n");
+        return;
+    }
+
+    if (not opendir(DIR, '.')) {
+        print(STDERR "skipping maildir spam: opendir($dir): $!\n");
+        return;
+    }
+
+    $! = 0;
+    my @spams = readdir(DIR);
+    if ($! != 0) {
+        print(STDERR "skipping maildir spam: readdir($dir): $!\n");
+        return;
+    }
+
+    closedir(DIR);
+
+    for my $msg (@spams) {
+        ($msg eq '.' or $msg eq '..') and next;
+        my $msgnum = store_message($msg, 'SPAM');
+        # Store the original file name for scan_spam in -n mode.
+        $SPAM{$msgnum} = [undef, $msg];
+    }
+}
+
+sub scan_spam {
+    my ($msgnum, $header, $tuple, $msg);
+
+    # Unlike filter_mail, we don't need to print the folder name.
+    # Calculate how many columns would be allocated to it...
+    my $nf = int($COLUMNS * $SCAN_P_FOLDER);
+    # ...and add that amount to COLUMNS to calculate the number of columns to
+    # allocate to msgnum and from snippet, thus filling the line without
+    # printing the folder name.
+    my $nm = int(($COLUMNS + $nf) * $SCAN_P_MESSAGE);
+    my $nF = int(($COLUMNS + $nf) * $SCAN_P_FROM);
+    my $ns = $COLUMNS - $nm - $nF - 3;
+
+    for $msgnum (sort(keys(%SPAM))) {
+        $tuple = $SPAM{$msgnum};
+        if (defined($tuple->[0])) {
+            # Filed by filter_mail, so we have the header.
+            $header = $tuple->[0];
+        } elsif (defined($tuple->[1])) {
+            # Filed by maildir_spam, so we don't have the header.
+            if ($run) {
+                # The message has been filed, load it from $mh.
+                $msg = "$mh/SPAM/$msgnum";
+            } else {
+                # The message has not been filed, load it from the maildir.
+                # $tuple->[1] is just a basename, not a path; this works
+                # because maildir_spam did chdir(Maildir/spam/new).
+                $msg = $tuple->[1];
+            }
+            $header = get_headers($msg);
+        } else {
+            print(STDERR
+                  "BUG: corrupt SPAM tuple, neither element defined",
+                  " for message $msgnum\n");
+            next;
+        }
+        print(scan_line($header, '', $msgnum, 0, $nm, $nF, $ns),
+              "\n");
+    }
 }
 
 
-MAIN: {
-    my @msglist;
+if (!caller()) {
     my $st;
 
     if ($dumpfilters) {
+        require "$HOME/.minc";
         $Data::Dumper::Indent = 1;
         print(Dumper(\@FILTERS));
         exit;
     }
 
-    @msglist = (
-                map { $_->[1] }
+    $rebuild_dot_folders and exit(create_dot_folders);
+
+    if (-f "$HOME/.minc") {
+        require "$HOME/.minc";
+    }
+
+    my $inbox;
+    if ($MAILDIR) {
+        $inbox = $MAILDIR;
+    } elsif (-d "$HOME/Maildir") {
+        $inbox = "$HOME/Maildir";
+    }
+
+    if (@ARGV > 0) {
+        $inbox = shift(@ARGV);
+    }
+
+    my @files;
+    if (substr($inbox, 0, 1) eq '+') {
+        my @msgs = @ARGV;
+        if (!@msgs) {
+            @msgs = ('cur-last');
+        }
+        open(my $fh, '-|', 'mhpath', @msgs) or die("open(mhpath|): $!");
+        chomp(@files = <$fh>);
+        if (!close($fh)) {
+            $! == 0 or die("open(mhpath|): $!");
+            die('mhpath '. exit_msg($?));
+        }
+        @files > 0 or exit 0;
+    } else {
+        @ARGV == 0 or pod2usage();
+        chdir("$inbox/new") or die("chdir($inbox/new): $!");
+        @files = map { $_->[1] }
+          sort { $a->[0] <=> $b->[0] }
+          map {
+              $st = stat($_) or die("stat($_): $!");
+              [ $st->mtime, $_ ]
+          } getfiles('.');
+
+        @files = map { $_->[1] }
                 sort { $a->[0] <=> $b->[0] }
                 map {
                     if (not ($st = stat($_))) {
@@ -614,26 +913,31 @@ MAIN: {
                     }
                     [$st->mtime, $_]
                 }
-                getfiles("$MAILDIR/new"));
+                getfiles('.');
+    }
 
-    filter_mail(@msglist);
+    my %folders = filter_mail($inbox, @files);
 
-    @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
+    $run and %folders and update_dot_folders(\%folders);
+
+    maildir_spam();
+    scan_spam();
 }
 
+1;
+
 
 __END__
 
 =head1 THE FILTERS STRUCTURE
 
-The user's .mincfilter file must define the @FILTERS structure.  This
-structure is an array.  Each element of @FILTERS is a filter.  A
-filter is itself an array.  The first element of a filter is a string,
-the name of the header this filter acts upon.  The header name is not
-case-sensitive.  Each subsequent element of a filter is a pair (i.e. a
-two-element array): first, a regular expression B<minc> uses to
-determine whether this filter matches or not, and second, an
-expression which B<minc> evaluates to get the folder name.
+Each element of @FILTERS is a filter.  A filter is itself an array.
+The first element of a filter is a string, the name of the header this
+filter acts upon.  The header name is not case-sensitive.  Each
+subsequent element of a filter is a pair (i.e. a two-element array):
+first, a regular expression B<minc> uses to determine whether this
+filter matches or not, and second, an expression which B<minc>
+evaluates to get the folder name.
 
 B<minc> decides where to store a message by iterating over the
 @FILTERS array.  It tests each regexp of each filter against all
@@ -656,6 +960,31 @@ header are matched against a collection of headers related to (and
 including) the To.  Which headers to use is determined by a regular
 expression borrowed from procmail.
 
+=head1 HOOKS
+
+Filter hooks take a reference to a hash of batons, a reference to a
+hash of headers, and the message filename as arguments.  It returns
+undef to decline filtering of this message (thus falling back to
+subsequent filter hooks, and finally @FILTERS), or the name of the
+folder to store this message into.
+
+One of B<minc>'s global variables is available to the user-defined
+hooks: $run.  This boolean should be honored; hooks should only take
+real action (i.e. removing or creating files, running external
+programs, etc.) if $run is non-zero.
+
+The baton hash is created simply from the start hooks; if the hook
+returns at least one defined value, this value is used as the key and
+all other return values are put into a list reference as the value.
+This hash is then passed by reference to all filter, post-store, and
+stop hooks.
+
+Post store hooks take a reference to a hash of batons, a reference to
+a hash of headers, the folder this message was stored in, and its new
+message number.
+
+XXX: need more details about the hook process; for now read the code.
+
 =head1 EXAMPLES
 
 @FILTERS = (
@@ -691,8 +1020,7 @@ apache.org subdomain, with mailing lists hosted there.  So, given a
 list such as dev@httpd.apache.org, this filter will create the folder
 name l/apache/httpd/dev.
 
-For an example B<spam_check> function, see
-L<http:E<047>E<047>pretzelnet.orgE<047>cvsE<047>dotfilesE<047>.mincspam>
+XXX Need hook examples.
 
 =head1 AUTHORS