#! /usr/local/bin/perl # $Id$ =head1 NAME B - incorporate mail from a maildir into mh folders =head1 SYNOPSIS B [B<-m> I] [B<-n>] [B<-p>] B B<-d> B B<-h> =head1 DESCRIPTION B incorporates mail from a maildir to a mh folder hierarchy. It takes mail from a maildir folder (not a maildir folder hierarchy), optionally checks for spam with a user-defined spam-checking function, and optionally filters mail into separate mh folders. The filtering is quite sophisticated, as it is done using real Perl matching (m//) commands. =cut use strict; use warnings; use Data::Dumper; use Errno; use Fcntl qw(O_WRONLY O_EXCL O_CREAT); use FileHandle; use File::stat; use Getopt::Long qw(:config gnu_getopt no_ignore_case); use POSIX qw(strftime WEXITSTATUS WIFEXITED); use Pod::Usage; our $VERSION = 1; # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared # against headers matching this regex (taken from procmail). my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)'; my $MAGIC_TO_TOKEN = ' TO'; # List of SPAM message numbers, scanned at the end so the user can # check for false positives. my @SPAM; =head1 OPTIONS =over 4 =item B<-d> Dump (using Data::Dumper) the FILTERS list and exit. This is useful for testing the syntax of .mincfilter. =item B<-h> Show help. =item B<-m> I Stop processing after I messages. =item B<-n> Dry run; do not actually incorporate the mail, but log and report to stdout/stderr as normal. =item B<-p> Print the filename of each message before checking it for spam. This can be handy if a particular message is giving the spam checker a problem. =back =cut my $dumpfilters; my $help; my $maxmsgs; my $norun; my $printfilenames; GetOptions( 'd' => \$dumpfilters, 'h|help' => \$help, 'm=i' => \$maxmsgs, 'n' => \$norun, 'p' => \$printfilenames, ) or pod2usage(); $help and pod2usage(-exitstatus=>0, -verbose=>1); @ARGV == 0 or pod2usage(); our $run = !$norun; =head1 ENVIRONMENT =over 4 =item HOME Where configuration files (.mincfilter) are found. Also, $HOME/Maildir is used for the maildir if MAILDIR is not set. =item MAILDIR Where mail is delivered. =back =cut use Env qw(HOME MAILDIR); if (not $HOME) { die("HOME environment variable must be set.\n"); } if (not $MAILDIR) { $MAILDIR = "$HOME/Maildir"; } =head1 FILES =over 4 =item $HOME/.mincfilter This file is Perl code (included via the 'require' directive) which is expected to define the FILTERS list. =item $HOME/.mincspam If this file exists, B will include it with the expectation that it will define a B function. This function takes a message filename as an argument and returns 1 if the message is spam, else 0. If this file does not exist, B will define a simple function that always returns 0. One of B's global variables is available to the user-defined B function: $run. This boolean should be honored; B should only take real action (i.e. removing or creating files, running external programs, etc.) if $run is non-zero. This file may also declare two other functions: B and B. The former is passed no arguments and is expected to return a list. This list is a "baton" that will also be passed to B and B. It can hold anything B will need to do its job, whether network sockets, pipes, or whatever. XXX: need more details about the spam-handling process; for now read the code. =item `mhpath +`/logs/minc.log Where B logs what it does, unless in -n mode. =item `mhpath +`/logs/dryrun.log Where B logs what it would do; used in -n mode. =item `mhpath +`/.minc.context B uses this file for context (i.e. current folder) instead of `mhpath +`/context. This allows you some amount of freedom to use mh while B is running. For example, you can changed folders without causing a message to be stored in the wrong folder. Care must still be taken, however, about the unseen sequence; if you change it (whether via show, or mark, or anything else) while in the same folder as B, it is likely the new message B stores will not end up marked unseen. =back =cut our @FILTERS; my $mh; my $logfile; $mh = `mhpath +`; chomp($mh); if ($run) { $logfile = $mh . '/logs/minc.log'; } else { $logfile = $mh . '/logs/dryrun.log'; } $ENV{"MHCONTEXT"} = $mh . '/.minc.context'; ############################################################################### # Logging sub mylog { my $timestamp = strftime('%b %e %H:%M:%S', localtime); my $msg; foreach my $part (@_) { if (defined($part)) { $msg .= $part; } } # no newlines in the log message, thanks $msg =~ s/\n/ /gm; open(LOG, ">>$logfile") or die("open(>>$logfile): $!"); print(LOG "$timestamp $msg\n") or die("print(>>$logfile): $!"); close(LOG) or die("close($logfile): $!"); } sub logheader { my ($text, @contents) = @_; my $last; if (@contents) { $last = $contents[-1]; } else { $last = ''; } mylog('<< ', $text, $last); } sub log_headers { my %headers = @_; # For an explanation of the %headers structure, see the # get_headers function below. logheader('From: ', @{$headers{'return-path'}}); logheader('To: ', @{$headers{'to'}}); logheader('Subject: ', @{$headers{'subject'}}); logheader('Message-Id: ', @{$headers{'message-id'}}); } ############################################################################### # Utility procedures sub mkfolder { my $folder = shift; my $target; my $component; $target = $mh; foreach $component (split('/', $folder)) { $target = join('/', $target, $component); -d $target or mkdir($target) or die("mkdir($target): $!"); } } sub getfiles { my $dir = shift; my @result; if (not opendir(DIR, $dir)) { die("opendir($dir): $!"); } # Initialize $! to 0 (success) because stupid stupid Perl provides # no way to distinguish an error from an empty directory; that # means setting $! to 0 and testing it afterwards is the only way # to detect an error. Real Programmers don't handle errors, # right? >sigh< $! = 0; @result = readdir(DIR); if ($! != 0) { die("readdir($dir): $!"); } if (@result <= 2) { exit(0); } STDOUT->autoflush(1); print(@result - 2, " messages..."); closedir(DIR); return @result; } sub get_highest_msgnum { my $mhfolder = shift; my $dir; my @list; my $highest; my $msgnum; $dir = "$mh/$mhfolder"; if (not opendir(DIR, $dir)) { die("opendir($dir): $!"); } # Insert rant from getfiles here. $! = 0; @list = readdir(DIR); if ($! != 0) { die("readdir($dir): $!"); } closedir(DIR); $highest = 0; foreach $msgnum (@list) { # Look for integers. if ($msgnum =~ /^[0-9]+$/) { if ($msgnum > $highest) { $highest = $msgnum; } } } return $highest; } sub store_message { my $msg = shift; my $mhfolder = shift; my $msgnum; my $try; my $mhmsg; my $status; # We must do this even in -n mode because later steps fail without # it. This should be harmless. mkfolder($mhfolder); # This loop is a modified version of the maildir delivery algorithm. $msgnum = get_highest_msgnum($mhfolder); for ($try = 0; ; $try++) { $msgnum++; $mhmsg = "$mh/$mhfolder/$msgnum"; if (not stat($mhmsg)) { if ($!{ENOENT}) { # Now we have a non-existent file, let's try to create # it. We must create a zero-byte file first because a # file my appear between our happy stat results and # our later rename(2), which would clobber said file. # So attempt to create a file with this name. If it # succeeds, in just a bit here we'll knowingly clobber # this file with the rename(2) call. # Another way to do this is not to use rename(2), but # use link(2) + unlink(2) instead. That's how the # standard maildir algorithm does it. Each method has # a disadvantage: the program may crash between the # link(2) and unlink(2) calls. With the standard # maildir algorithm, that means the message will end # up duplicated. The advantage of creating an empty # file followed by rename(2) is that an extra empty # file is left behind as opposed to a duplicate # message. This is more easily detected by the user. if ($run) { if (sysopen(MSG, $mhmsg, O_WRONLY | O_EXCL | O_CREAT, 0600)) { close(MSG); last; } } else { last; } } } # This algorithm is different from the maildir one; let's make # 10 tries instead of 3. if ($try == 9) { die("Attempted filename $mhmsg exists."); } # This algorithm is different; i don't think we need to sleep. #sleep(2); } if ($mhfolder ne 'SPAM') { mylog('+', $mhfolder); } if ($run) { if (not rename($msg, $mhmsg)) { die("rename($msg, $mhmsg): $!"); } # Mark each message as soon as we store it and bomb if that # fails. While it is slow, it is not safe to store multiple # messages and then have a failure before marking some (or # all). if ($mhfolder eq 'SPAM') { push(@SPAM, $msgnum); } else { $status = system('mark', "+$mhfolder", "$msgnum", '-sequence', 'unseen', '-add'); # XXX need to handle signalled and stopped, and print # the exit code or signal number. if (not WIFEXITED($status)) { die("Failed to run mark"); } elsif (WEXITSTATUS($status) != 0) { die("Failed to mark message unseen."); } } } return $msgnum; } # Parse a message file into a structure describing the headers. The # structure is a hash of arrays. The hash keys are the names of the # headers found in the message, made all lower-case. Each item in # the hash is an array of header text. The array has one element # per occurrence of the header. Most headers will only have a single # element because they occur only once. The Received header is the # most common header with multiple texts. sub get_headers { my $msg = shift; my %headers; my $current; # current header, used for unfolding lines my $fieldname; # unmolested header name my $contents; # contents of header open(MSG, $msg) or die("open(MSG, $msg): $!"); while () { chomp; if (length == 0) { last; } if (/^\s/) { # folded header continuation if (not defined($current)) { print(STDERR "Malformed message, cannot parse headers.\n"); return (); } @{$headers{$current}}[-1] .= $_; } else { ($fieldname) = split(/:/); $current = lc($fieldname); (undef, $contents) = split(/^\Q$fieldname\E:\s*/); if (defined($headers{$current})) { # multiple occurence of same header push(@{$headers{$current}}, $contents); } else { # first occurence of a header $headers{$current} = [$contents]; } } } close(MSG); return %headers; } ############################################################################### # Filtering sub find_mh_folder { my $msg = shift; my %headers; my $filterref; my @filter; my $header; my $contents; my $pair; my $match; my $expression; my $result; %headers = get_headers($msg); if (not %headers) { return 'malformed'; } log_headers(%headers); # Walk the list of filters. This structure is documented in # pod at the end of the program. foreach $filterref (@FILTERS) { @filter = @$filterref; $header = shift(@filter); # Handle filters using the magic TO header specially. if ($header eq $MAGIC_TO_TOKEN) { foreach $header (keys(%headers)) { if ($header =~ /$MAGIC_TO_REGEX/i) { foreach $contents (@{$headers{lc($header)}}) { foreach $pair (@filter) { ($match, $expression) = @$pair; if ($contents =~ /$match/) { return $expression; } } } } } # Now that it's been processed specially, skip normal handling. next; } # Walk the list of message headers matching the filter's # specified header. foreach $contents (@{$headers{lc($header)}}) { # Walk the filter's list of match/expression pairs. foreach $pair (@filter) { ($match, $expression) = @$pair; if ($contents =~ /$match/i) { if (eval "\$result = \"$expression\"") { return $result; } } } } } return 'inbox'; } sub filter_mail { my @msglist = @_; my $msgcount = @msglist - 2; # don't count . and .. my $len = length($msgcount); my @baton; my $msg; my $mhfolder; my $spam = 0; my $saved = 0; my %FOLDERS = ('SPAM'=>1); if (-f "$HOME/.mincspam") { require "$HOME/.mincspam"; } else { eval "sub spam_start_hook { return (); }"; eval "sub spam_stop_hook { }"; eval "sub spam_check { return 0; }"; } if (-f "$HOME/.mincfilter") { require "$HOME/.mincfilter"; } @baton = spam_start_hook(); foreach $msg (@msglist) { ($msg eq '.' or $msg eq '..') and next; if ($printfilenames) { print("$msg\n"); } if (spam_check($msg, @baton)) { $mhfolder = 'SPAM'; $spam = @SPAM + 1; } else { $mhfolder = find_mh_folder($msg); $saved++; } store_message($msg, $mhfolder); print("\r"); if (not $FOLDERS{$mhfolder}) { print(' ' x $len); print(" \r$mhfolder\n"); $FOLDERS{$mhfolder} = 1; } printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d", $spam, $saved, $spam + $saved, $msgcount); defined($maxmsgs) and ($spam + $saved < $maxmsgs or last); } print("\n"); spam_stop_hook(@baton); } MAIN: { my $st; if ($dumpfilters) { require "$HOME/.mincfilter"; $Data::Dumper::Indent = 1; print(Dumper(\@FILTERS)); exit; } chdir("$MAILDIR/new") or die("chdir($MAILDIR/new): $!"); filter_mail(map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { if (not ($st = stat($_))) { die("stat($_): $!"); } [$st->mtime, $_] } getfiles('.')); @SPAM and (exec('scan', '+SPAM', @SPAM) or die); } __END__ =head1 THE FILTERS STRUCTURE The user's .mincfilter file must define the @FILTERS structure. This structure is an array. Each element of @FILTERS is a filter. A filter is itself an array. The first element of a filter is a string, the name of the header this filter acts upon. The header name is not case-sensitive. Each subsequent element of a filter is a pair (i.e. a two-element array): first, a regular expression B uses to determine whether this filter matches or not, and second, an expression which B evaluates to get the folder name. B decides where to store a message by iterating over the @FILTERS array. It tests each regexp of each filter against all headers matching that filter's specified header. As soon as a match is found, B evaluates the second part of the pair. This part may contain positional parameters from the matched regexp ($1, $2, etc.). The result of this expression is used as the folder name. Multiple occurrences of the same header are preserved and tested individually against the filters. That means, for example, that filters can search all the Received headers for a certain string. It is important to note that all the arrays of the @FILTERS structure are traversed I. This means the filters can be arranged so that some have priority over others. XXX: get Doug to write an example of having the same header matched more than once. Lastly, B supports a magic ' TO' header. Filters using this header are matched against a collection of headers related to (and including) the To. Which headers to use is determined by a regular expression borrowed from procmail. =head1 EXAMPLES @FILTERS = ( ['List-Id', ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'], ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'], ['', 'l/swig'], ], ['List-Post', ['', 'l/apache/$2/$1'], ], ['To', ['root', 'root'], ], ); The first List-Id filter is a surprisingly effective one which creates appropriate folder names (such as l/htdig/updates and l/rox/devel) for all Sourceforge lists to which i subscribe. Certainly there are lists hosted at Sourceforge which do not work well with this, and this is where it is important to remember that the first match is the one uses. Simply put the more specific rules before this one. The next List-Id example is simple. The swig example demonstrates that the folder name does not have to use a portion of the matched header; you can just hardcode the folder name. The List-Post example is a nice one. Most ASF projects have their own apache.org subdomain, with mailing lists hosted there. So, given a list such as dev@httpd.apache.org, this filter will create the folder name l/apache/httpd/dev. For an example B function, see L =head1 AUTHORS Written by Eric Gillespie . Design by Eric Gillespie and Doug Porter . This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut # Local variables: # cperl-indent-level: 4 # perl-indent-level: 4 # indent-tabs-mode: nil # End: # vi: set tabstop=4 expandtab: