Killed the sysexits stuff a while back.

[minc] / minc
diff --git a/minc b/minc

index 82734c86186da682c97ab9b6eb323e28a72348ac..c144f2ec2b76ed11054bfcda02ee40d82d382ab5 100755 (executable)
--- a/minc
+++ b/minc
@@ -8,35 +8,44 @@ B<minc> - Incorporate mail from a maildir into mh folders.
  
  =head1 SYNOPSIS
  
-B<minc> [-B<dhns>]
+B<minc> [-B<dmnp>]
  
  =head1 DESCRIPTION
  
-B<minc> is a program for incorporating mail from a maildir to a mh
-folder hierarchy.  It takes mail from a maildir folder (not a maildir
-folder hierarchy), optionally checks for spam with a user-defined
-spam-checking function, and optionally filters mail into separate mh
-folders.
+B<minc> incorporates mail from a maildir to a mh folder hierarchy.  It
+takes mail from a maildir folder (not a maildir folder hierarchy),
+optionally checks for spam with a user-defined spam-checking function,
+and optionally filters mail into separate mh folders.
  
  The filtering is quite sophisticated, as it is done using real Perl
-substitution (s//) commands.
+matching (m//) commands.
  
  =cut
  
  use strict;
  use warnings;
  
-require 'sysexits.ph';
-
  use Data::Dumper;
  use Errno;
  use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
+use FileHandle;
  use File::Basename;
-use Getopt::Std;
+use File::stat;
+use Getopt::Std; $Getopt::Std::STANDARD_HELP_VERSION = 1;
  use Log::Dispatch;
  use Log::Dispatch::File;
  use POSIX qw(strftime WEXITSTATUS WIFEXITED);
-use Tie::IxHash;
+
+our $VERSION = 1;
+
+# If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
+# against headers matching this regex (taken from procmail).
+my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
+my $MAGIC_TO_TOKEN = ' TO';
+
+# List of SPAM message numbers, scanned at the end so the user can
+# check for false positives.
+my @SPAM;
  
  =head1 OPTIONS
  
@@ -44,59 +53,55 @@ use Tie::IxHash;
  
  =item B<-d>
  
-Dump (using Data::Dumper) the FILTERS hash and exit.  This is useful
+Dump (using Data::Dumper) the FILTERS list and exit.  This is useful
  for testing the syntax of .mincfilter.
  
-=item B<-f>
+=item B<--help>
  
-Filter only, then exit.  This is useful after running B<minc -s>.
+Show help.
  
-=item B<-h>
+=item B<-m> I<MAX>
  
-Show help.
+Stop processing after I<MAX> messages.
  
  =item B<-n>
  
  Dry run; do not actually incorporate the mail, but log and report to
  stdout/stderr as normal.
  
-=item B<-s>
+=item B<-p>
  
-Process SPAM only, then exit, leaving all non-spam messages in the
-maildir.
+Print the filename of each message before checking it for spam.  This
+can be handy if a particular message is giving the spam checker a
+problem.
  
  =back
  
  =cut
  
  my $dumpfilters = 0;
-my $filteronly = 0;
+my $maxmsgs;
  our $run = 1;
-my $spamonly = 0;
+my $printfilenames = 0;
  
-our $opt_d;
-our $opt_f;
-our $opt_h;
-our $opt_n;
-our $opt_s;                     # ;; # stupid cperl-mode
+my %opts;
  
-if (not getopts('dfhns')) {
-    exit(&EX_USAGE);
+if (not getopts('dm:np', \%opts)) {
+    exit(2);
  }
  
-if ($opt_d) {
+if ($opts{'d'}) {
      $dumpfilters = 1;
-} elsif ($opt_h) {
-    print("Sorry bub, no help.\n");
-    exit(&EX_OK);
-} elsif ($opt_n) {
+} elsif ($opts{'n'}) {
      $run = 0;
  }
  
-if ($opt_f) {
-    $filteronly = 1;
-} elsif ($opt_s) {              # ))){ # stupid cperl-mode
-    $spamonly = 1;
+if (defined($opts{'m'})) {
+    $maxmsgs = $opts{'m'};
+}
+
+if ($opts{'p'}) {
+    $printfilenames = 1;
  }
  
  =head1 ENVIRONMENT
@@ -132,46 +137,63 @@ if (not $MAILDIR) {
  =item $HOME/.mincfilter
  
  This file is Perl code (included via the 'require' directive) which is
-expected to define the FILTERS hash.
+expected to define the FILTERS list.
  
  =item $HOME/.mincspam
  
-If this file exists, minc will include it with the expectation that it
-will define a B<is_spam> function.  This function takes a message
-filename as an argument and returns 1 if the message is spam, else 0.
-If this file does not exist, minc will define a simple function that
-always returns 0.
+If this file exists, B<minc> will include it with the expectation that
+it will define a B<spam_check> function.  This function takes a
+message filename as an argument and returns 1 if the message is spam,
+else 0.  If this file does not exist, B<minc> will define a simple
+function that always returns 0.
+
+One of B<minc>'s global variables is available to the user-defined
+B<spam_check> function: $run.  This boolean should be honored;
+B<spam_check> should only take real action (i.e. removing or creating
+files, running external programs, etc.) if $run is non-zero.
  
-One of minc's global variables is available to the user-defined
-is_spam function: $run.  This boolean should be honored; is_spam
-should only take real action (i.e. removing or creating files, running
-external programs, etc.) if $run is non-zero.
+This file may also declare two other functions: B<spam_start_hook> and
+B<spam_stop_hook>.  The former is passed no arguments and is expected
+to return a list.  This list is a "baton" that will also be passed to
+B<spam_check> and B<spam_stop_hook>.  It can hold anything
+B<spam_check> will need to do its job, whether network sockets, pipes,
+or whatever.
  
  XXX: need more details about the spam-handling process; for now read
  the code.
  
  =item `mhpath +`/logs/minc.log
  
-Where minc logs what it does, unless in -n mode.
+Where B<minc> logs what it does, unless in -n mode.
  
  =item `mhpath +`/logs/dryrun.log
  
-Where minc logs what it would do; used in -n mode.
+Where B<minc> logs what it would do; used in -n mode.
+
+=item `mhpath +`/.minc.context
+
+B<minc> uses this file for context (i.e. current folder) instead of
+`mhpath +`/context.  This allows you some amount of freedom to use mh
+while B<minc> is running.  For example, you can changed folders without
+causing a message to be stored in the wrong folder.  Care must still
+be taken, however, about the unseen sequence; if you change it
+(whether via show, or mark, or anything else) while in the same folder
+as B<minc>, it is likely the new message B<minc> stores will not end up
+marked unseen.
  
  =back
  
  =cut
  
-our %FILTERS;
-tie(%FILTERS, 'Tie::IxHash');
+our @FILTERS;
  require "$HOME/.mincfilter";
  
  if (-f "$HOME/.mincspam") {
      require "$HOME/.mincspam";
  } else {
-    sub is_spam {
-        return 0;
-    }
+    eval "sub spam_start_hook { return (); }";
+    eval "sub spam_stop_hook { }";
+    eval "sub spam_check { return 0; }";
  }
  
  my $mh;
@@ -186,6 +208,8 @@ if ($run) {
      $logfile = $mh . '/logs/dryrun.log';
  }
  
+$ENV{"MHCONTEXT"} = $mh . '/.minc.context';
+
  \f
  ###############################################################################
  # Logging
@@ -230,55 +254,40 @@ sub mylog {
      }
  }
  
-sub logincoming { mylog('info', INCOMINGCHAR, @_); }
  sub logsave { mylog('notice', SAVECHAR, @_); }
  sub loginfo { mylog('info', INFOCHAR, @_); }
  sub logdebug { mylog('debug', DEBUGCHAR, @_); }
  sub logwarn { mylog('warning', WARNCHAR, @_); }
  sub logerr { mylog('error', ERRORCHAR, @_); }
  
+sub logincoming {
+    my ($text, @contents) = @_;
+    my $last;
+
+    if (@contents) {
+        $last = $contents[-1];
+    } else {
+        $last = '';
+    }
+
+    mylog('info', INCOMINGCHAR, $text, $last);
+}
+
  sub log_headers {
      my %headers = @_;
  
-    logincoming('From:       ', $headers{'return-path'});
-    logincoming('To:         ', $headers{'to'});
-    logincoming('Subject:    ', $headers{'subject'});
-    logincoming('Message-Id: ', $headers{'message-id'});
+    # For an explanation of the %headers structure, see the
+    # get_headers function below.
+    logincoming('From:       ', @{$headers{'return-path'}});
+    logincoming('To:         ', @{$headers{'to'}});
+    logincoming('Subject:    ', @{$headers{'subject'}});
+    logincoming('Message-Id: ', @{$headers{'message-id'}});
  }
  
  \f
  ###############################################################################
  # Utility procedures
  
-sub _errprint {
-    printf('%s:%s', __FILE__, __LINE__);
-
-    if (@_) {
-        print(': ');
-        foreach (@_) {
-            print;
-        }
-    }
-}
-
-sub err {
-    my $ex = shift;
-
-    _errprint(@_);
-    print(": $!\n");
-
-    exit($ex);
-}
-
-sub errx {
-    my $ex = shift;
-
-    _errprint(@_);
-    print("\n");
-
-    exit($ex);
-}
-
  sub mkfolder {
      my $mhfolder = shift;
      my $folder;
@@ -295,8 +304,7 @@ sub mkfolder {
                  if (-d $target or mkdir($target)) {
                      next;
                  } else {
-                    err(&EX_OSERR,
-                        "Failed to create +$mhfolder ($component)");
+                    die("Failed to create +$mhfolder ($component)");
                  }
              }
          }
@@ -308,7 +316,7 @@ sub getfiles {
      my @result;
  
      if (not opendir(DIR, $dir)) {
-        err(&EX_OSERR, "Failed opendir($dir)");
+        die("Failed opendir($dir)");
      }
  
      # Initialize $! to 0 (success) because stupid stupid Perl provides
@@ -323,7 +331,11 @@ sub getfiles {
        } readdir(DIR);
  
      if ($! != 0) {
-        err(&EX_OSERR, "Failed readdir($dir)");
+        die("Failed readdir($dir)");
+    }
+
+    if (scalar(@result) == 0) {
+        exit(0);
      }
  
      closedir(DIR);
@@ -340,14 +352,14 @@ sub get_highest_msgnum {
  
      $dir = "$mh/$mhfolder";
      if (not opendir(DIR, $dir)) {
-        err(&EX_OSERR, "Failed opendir($dir)");
+        die("Failed opendir($dir)");
      }
  
      $! = 0;
      @list = readdir(DIR);
  
      if ($! != 0) {
-        err(&EX_OSERR, "Failed readdir($dir)");
+        die("Failed readdir($dir)");
      }
  
      closedir(DIR);
@@ -365,12 +377,6 @@ sub get_highest_msgnum {
      return $highest;
  }
  
-# We want to print the name of each list that has new mail only once,
-# so use this hash to store the lists that have already been printed.
-# Start the list out with SPAM already in it, since we don't care when
-# new messages are added to it.
-my %FOLDERS = ('SPAM'=>1);
-
  sub store_message {
      my $msg = shift;
      my $mhfolder = shift;
@@ -398,6 +404,17 @@ sub store_message {
                  # So attempt to create a file with this name.  If it
                  # succeeds, in just a bit here we'll knowingly clobber
                  # this file with the rename(2) call.
+
+                # Another way to do this is not to use rename(2), but
+                # use link(2) + unlink(2) instead.  That's how the
+                # standard maildir algorithm does it.  Each method has
+                # a disadvantage: the program may crash between the
+                # link(2) and unlink(2) calls.  With the standard
+                # maildir algorithm, that means the message will end
+                # up duplicated.  The advantage of creating an empty
+                # file followed by rename(2) is that an extra empty
+                # file is left behind as opposed to a duplicate
+                # message.  This is more easily detected by the user.
                  if ($run) {
                      if (sysopen(MSG, "$mhmsg",
                                  O_WRONLY | O_EXCL | O_CREAT, 0600)) {
@@ -413,45 +430,55 @@ sub store_message {
          # This algorithm is different from the maildir one; let's make
          # 10 tries instead of 3.
          if ($try == 9) {
-            errx(&EX_TEMPFAIL, "Attempted filename $mhmsg exists.");
+            die("Attempted filename $mhmsg exists.");
          }
  
          # This algorithm is different; i don't think we need to sleep.
          #sleep(2);
      }
  
-    logsave("+$mhfolder");
+    if ($mhfolder ne 'SPAM') {
+        logsave("+$mhfolder");
+    }
  
      if ($run) {
          if (not rename($msg, $mhmsg)) {
-            err(&EX_OSERR, "Failed rename($msg, $mhmsg)");
+            die("Failed rename($msg, $mhmsg)");
          }
  
          # Mark each message as soon as we store it and bomb if that
          # fails.  While it is slow, it is not safe to store multiple
          # messages and then have a failure before marking some (or
          # all).
-        if ($mhfolder ne 'SPAM') {
-            $status = system("mark +$mhfolder $msgnum -sequence unseen -add");
+        if ($mhfolder eq 'SPAM') {
+            push(@SPAM, $msgnum);
+        } else {
+            $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
+                             'unseen', '-add');
              if (not WIFEXITED($status)) {
-                err(&EX_OSERR, "Failed to run mark");
+                die("Failed to run mark");
              } elsif (WEXITSTATUS($status) != 0) {
-                errx(&EX_SOFTWARE, "Failed to mark message unseen.");
+                die("Failed to mark message unseen.");
              }
          }
      }
  
-    if (not $FOLDERS{$mhfolder}) {
-        print("+$mhfolder\n");
-        $FOLDERS{$mhfolder} = 1;
-    }
+    return $msgnum;
  }
  
+# Parse a message file into a structure describing the headers.  The
+# structure is a hash of arrays.  The hash keys are the names of the
+# headers found in the message, made all lower-case.  Each item in
+# the hash is an array of header text.  The array has one element
+# per occurrence of the header.  Most headers will only have a single
+# element because they occur only once.  The Received header is the
+# most common header with multiple texts.
  sub get_headers {
      my $msg = shift;
      my %headers;
      my $current;                # current header, used for unfolding lines
      my $fieldname;              # unmolested header name
+    my $contents;               # contents of header
  
      open(MSG, $msg);
      while (<MSG>) {
@@ -464,15 +491,23 @@ sub get_headers {
              # folded header continuation
  
              if (not defined($current)) {
-                print("Malformed message, cannot parse headers.\n");
+                print(STDERR "Malformed message, cannot parse headers.\n");
                  return ();
              }
  
-            $headers{$current} .= $_;
+            @{$headers{$current}}[-1] .= $_;
          } else {
              ($fieldname) = split(/:/);
              $current = lc($fieldname);
-            (undef, $headers{$current}) = split(/^\Q$fieldname\E:\s*/);
+            (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
+
+            if (defined($headers{$current})) {
+                # multiple occurence of same header
+                push(@{$headers{$current}}, $contents);
+            } else {
+                # first occurence of a header
+                $headers{$current} = [$contents];
+            }
          }
      }
      close(MSG);
@@ -480,33 +515,6 @@ sub get_headers {
      return %headers;
  }
  
-###############################################################################
-# Spam handling
-
-sub kill_spam {
-    my @msglist = @_;
-    my @result;
-    my $msg;
-    my $i;
-
-    @result = ();
-
-    $i = 0;
-    foreach $msg (@msglist) {
-        printf('%sChecking for spam... %6d/%d',
-               "\r", ++$i, scalar(@msglist));
-        if (is_spam($msg)) {
-            print(" SPAM\n");
-            store_message($msg, 'SPAM');
-        } else {
-            push(@result, $msg);
-        }
-    }
-    print("\nDone: ", scalar(@result), " survivors\n");
-
-    return @result;
-}
-
  \f
  ###############################################################################
  # Filtering
@@ -514,30 +522,57 @@ sub kill_spam {
  sub find_mh_folder {
      my $msg = shift;
      my %headers;
+    my $filterref;
+    my @filter;
      my $header;
      my $contents;
-    my $regex;
-    my $subst;
-    my $try;
+    my $pair;
+    my $match;
+    my $expression;
+    my $result;
  
      %headers = get_headers($msg);
      if (not %headers) {
-        return 'inbox';
+        return 'malformed';
      }
  
      log_headers(%headers);
  
-    foreach $header (keys(%FILTERS)) {
-        $contents = $headers{lc($header)};
+    # Walk the list of filters.  This structure is documented in
+    # pod at the end of the program.
+    foreach $filterref (@FILTERS) {
+        @filter = @$filterref;
+        $header = shift(@filter);
+
+        # Handle filters using the magic TO header specially.
+        if ($header eq $MAGIC_TO_TOKEN) {
+            foreach $header (keys(%headers)) {
+                if ($header =~ /$MAGIC_TO_REGEX/i) {
+                    foreach $contents (@{$headers{lc($header)}}) {
+                        foreach $pair (@filter) {
+                            ($match, $expression) = @$pair;
+                            if ($contents =~ /$match/) {
+                                return $expression;
+                            }
+                        }
+                    }
+                }
+            }
  
-        if (defined($contents)) {
-            foreach $regex (keys(%{$FILTERS{$header}})) {
-                $subst = $FILTERS{$header}->{$regex};
+            # Now that it's been processed specially, skip normal handling.
+            next;
+        }
  
-                $try = '$contents =~';
-                $try .= " s|$regex|$subst|is";
-                if (eval $try) {
-                    return $contents;
+        # Walk the list of message headers matching the filter's
+        # specified header.
+        foreach $contents (@{$headers{lc($header)}}) {
+            # Walk the filter's list of match/expression pairs.
+            foreach $pair (@filter) {
+                ($match, $expression) = @$pair;
+                if ($contents =~ /$match/i) {
+                    if (eval "\$result = \"$expression\"") {
+                        return $result;
+                    }
                  }
              }
          }
@@ -548,51 +583,126 @@ sub find_mh_folder {
  
  sub filter_mail {
      my @msglist = @_;
+    my $msgcount = scalar(@msglist);
+    my @baton;
      my $msg;
      my $mhfolder;
+    my $spam = 0;
+    my $saved = 0;
+    my %FOLDERS = ('SPAM'=>1);
  
+    @baton = spam_start_hook();
+
+    STDOUT->autoflush(1);
+    print("$msgcount messages...");
      foreach $msg (@msglist) {
-        $mhfolder = find_mh_folder($msg);
+        if (spam_check($msg, @baton)) {
+            $mhfolder = 'SPAM';
+            $spam = scalar(@SPAM)+ 1;
+        } else {
+            $mhfolder = find_mh_folder($msg);
+            $saved++;
+        }
+
          store_message($msg, $mhfolder);
+
+        print("\r");
+        if (not $FOLDERS{$mhfolder}) {
+            print(' ' x length($msgcount));
+            print("                                \r+$mhfolder\n");
+            $FOLDERS{$mhfolder} = 1;
+        }
+
+        printf('%6d SPAM %6d saved %6d/%1d',
+               $spam, $saved, $spam + $saved, $msgcount);
+
+        defined($maxmsgs) and ($spam + $saved < $maxmsgs or last);
      }
+    print("\n");
+
+    spam_stop_hook(@baton);
  }
  
  \f
  MAIN: {
      my @msglist;
+    my $st;
  
      if ($dumpfilters) {
          $Data::Dumper::Indent = 1;
-        print(Dumper(\%FILTERS));
-        exit(&EX_OK);
+        print(Dumper(\@FILTERS));
+        exit;
      }
  
-    @msglist = getfiles("$MAILDIR/new");
+    @msglist = (
+                map { $_->[1] }
+                sort { $a->[0] <=> $b->[0] }
+                map {
+                    if (not ($st = stat($_))) {
+                        die("stat($_): $!");
+                    }
+                    [$st->mtime, $_]
+                }
+                getfiles("$MAILDIR/new"));
  
-    if (not $filteronly) {
-        @msglist = kill_spam(@msglist);
-    }
+    filter_mail(@msglist);
  
-    if (not $spamonly) {
-        filter_mail(@msglist);
-    }
+    @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
  }
  
+\f
  __END__
  
+=head1 THE FILTERS STRUCTURE
+
+The user's .mincfilter file must define the @FILTERS structure.  This
+structure is an array.  Each element of @FILTERS is a filter.  A
+filter is itself an array.  The first element of a filter is a string,
+the name of the header this filter acts upon.  The header name is not
+case-sensitive.  Each subsequent element of a filter is a pair (i.e. a
+two-element array): first, a regular expression B<minc> uses to
+determine whether this filter matches or not, and second, an
+expression which B<minc> evaluates to get the folder name.
+
+B<minc> decides where to store a message by iterating over the
+@FILTERS array.  It tests each regexp of each filter against all
+headers matching that filter's specified header.  As soon as a match
+is found, B<minc> evaluates the second part of the pair.  This part
+may contain positional parameters from the matched regexp ($1, $2,
+etc.).  The result of this expression is used as the folder name.
+
+Multiple occurrences of the same header are preserved and tested
+individually against the filters.  That means, for example, that
+filters can search all the Received headers for a certain string.
+
+It is important to note that all the arrays of the @FILTERS structure
+are traversed I<in order>.  This means the filters can be arranged so
+that some have priority over others.  XXX: get Doug to write an
+example of having the same header matched more than once.
+
+Lastly, B<minc> supports a magic ' TO' header.  Filters using this
+header are matched against a collection of headers related to (and
+including) the To.  Which headers to use is determined by a regular
+expression borrowed from procmail.
+
  =head1 EXAMPLES
  
-%FILTERS =
-  ('List-Id:'=>
-   {'.*<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>.*'=>'l/$1/$2',
-    '.*<([[:graph:]]+?)(-list)?\.freedesktop\.org>.*'=>'l/freedesktop/$1',
-    '.*<swig\.cs\.uchicago\.edu>.*'=>'l/swig'},
+@FILTERS = (
+
+  ['List-Id',
+   ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
+   ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
+   ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
+  ],
  
-   'List-Post:'=>
-   {'<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>'=>'l/apache/$2/$1'},
+  ['List-Post',
+   ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
+  ],
  
-   'To:'=>
-   {'root'=>'root'});
+  ['To',
+   ['root', 'root'],
+  ],
+);
  
  The first List-Id filter is a surprisingly effective one which creates
  appropriate folder names (such as l/htdig/updates and l/rox/devel) for
@@ -610,7 +720,7 @@ apache.org subdomain, with mailing lists hosted there.  So, given a
  list such as dev@httpd.apache.org, this filter will create the folder
  name l/apache/httpd/dev.
  
-For an example is_spam function, see
+For an example B<spam_check> function, see
  L<http:E<047>E<047>pretzelnet.orgE<047>cvsE<047>dotfilesE<047>.mincspam>
  
  =head1 AUTHORS
@@ -620,4 +730,15 @@ stolen from Adam Lazur <adam@lazur.org>.
  
  Design by Eric Gillespie and Doug Porter <dsp@waterspout.com>.
  
+This program is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.
+
  =cut
+
+# Local variables:
+# cperl-indent-level: 4
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+
+# vi: set tabstop=4 expandtab: