X-Git-Url: https://diplodocus.org/git/minc/blobdiff_plain/e0fd2b7eae13eced0ceeabba651c85fb3a664e75..68a4ee3a17937925b02ea2197e6d30e745d4dc3b:/minc diff --git a/minc b/minc index 29a02d7..e216504 100755 --- a/minc +++ b/minc @@ -1,41 +1,53 @@ -#! /usr/local/bin/perl +#! /usr/bin/env perl # $Id$ =head1 NAME -B - Incorporate mail from a maildir into mh folders. +B - incorporate mail from a maildir into mh folders =head1 SYNOPSIS -B [-B] +B [B<-m> I] [B<-n>] [B<-p>] + +B B<-d> + +B B<-h> =head1 DESCRIPTION -B is a program for incorporating mail from a maildir to a mh -folder hierarchy. It takes mail from a maildir folder (not a maildir -folder hierarchy), checks for spam with razor -(http://razor.sourceforge.net/), and optionally filters mail into -separate mh folders. +B incorporates mail from a maildir to a mh folder hierarchy. It +takes mail from a maildir folder (not a maildir folder hierarchy), +optionally checks for spam with a user-defined spam-checking function, +and optionally filters mail into separate mh folders. The filtering is quite sophisticated, as it is done using real Perl -substitution (s//) commands. +matching (m//) commands. =cut use strict; use warnings; -require 'sysexits.ph'; - use Data::Dumper; use Errno; use Fcntl qw(O_WRONLY O_EXCL O_CREAT); -use File::Basename; -use Getopt::Std; -use Log::Dispatch; -use Log::Dispatch::File; +use FileHandle; +use File::stat; +use Getopt::Long qw(:config gnu_getopt no_ignore_case); use POSIX qw(strftime WEXITSTATUS WIFEXITED); +use Pod::Usage; + +our $VERSION = 1; + +# If a filter set's header is $MAGIC_TO_TOKEN, that set is compared +# against headers matching this regex (taken from procmail). +my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)'; +my $MAGIC_TO_TOKEN = ' TO'; + +# List of SPAM message numbers, scanned at the end so the user can +# check for false positives. +my @SPAM; =head1 OPTIONS @@ -43,60 +55,49 @@ use POSIX qw(strftime WEXITSTATUS WIFEXITED); =item B<-d> -Dump (using Data::Dumper) the FILTERS hash and exit. This is useful +Dump (using Data::Dumper) the FILTERS list and exit. This is useful for testing the syntax of .mincfilter. -=item B<-f> - -Filter only, then exit. This is useful after running B. - =item B<-h> Show help. +=item B<-m> I + +Stop processing after I messages. + =item B<-n> Dry run; do not actually incorporate the mail, but log and report to stdout/stderr as normal. -=item B<-s> +=item B<-p> -Process SPAM only, then exit, leaving all non-spam messages in the -maildir. +Print the filename of each message before checking it for spam. This +can be handy if a particular message is giving the spam checker a +problem. =back =cut -my $dumpfilters = 0; -my $filteronly = 0; -my $run = 1; -my $spamonly = 0; - -our $opt_d; -our $opt_f; -our $opt_h; -our $opt_n; -our $opt_s; # ;; # stupid cperl-mode - -if (not getopts('dhns')) { - exit(&EX_USAGE); -} +my $dumpfilters; +my $help; +my $maxmsgs; +my $norun; +my $printfilenames; -if ($opt_d) { - $dumpfilters = 1; -} elsif ($opt_h) { - print("Sorry bub, no help.\n"); - exit(&EX_OK); -} elsif ($opt_n) { - $run = 0; -} +GetOptions( + 'd' => \$dumpfilters, + 'h|help' => \$help, + 'm=i' => \$maxmsgs, + 'n' => \$norun, + 'p' => \$printfilenames, + ) or pod2usage(); +$help and pod2usage(-exitstatus=>0, -verbose=>1); +@ARGV == 0 or pod2usage(); -if ($opt_f) { - $filteronly = 1; -} elsif ($opt_s) { # ))){ # stupid cperl-mode - $spamonly = 1; -} +our $run = !$norun; =head1 ENVIRONMENT @@ -131,22 +132,55 @@ if (not $MAILDIR) { =item $HOME/.mincfilter This file is Perl code (included via the 'require' directive) which is -expected to define the FILTERS hash. +expected to define the FILTERS list. + +=item $HOME/.mincspam + +If this file exists, B will include it with the expectation that +it will define a B function. This function takes a +message filename as an argument and returns 1 if the message is spam, +else 0. If this file does not exist, B will define a simple +function that always returns 0. + +One of B's global variables is available to the user-defined +B function: $run. This boolean should be honored; +B should only take real action (i.e. removing or creating +files, running external programs, etc.) if $run is non-zero. + +This file may also declare two other functions: B and +B. The former is passed no arguments and is expected +to return a list. This list is a "baton" that will also be passed to +B and B. It can hold anything +B will need to do its job, whether network sockets, pipes, +or whatever. + +XXX: need more details about the spam-handling process; for now read +the code. =item `mhpath +`/logs/minc.log -Where minc logs what it does, unless in -n mode. +Where B logs what it does, unless in -n mode. =item `mhpath +`/logs/dryrun.log -Where minc logs what it would do; used in -n mode. +Where B logs what it would do; used in -n mode. + +=item `mhpath +`/.minc.context + +B uses this file for context (i.e. current folder) instead of +`mhpath +`/context. This allows you some amount of freedom to use mh +while B is running. For example, you can changed folders without +causing a message to be stored in the wrong folder. Care must still +be taken, however, about the unseen sequence; if you change it +(whether via show, or mark, or anything else) while in the same folder +as B, it is likely the new message B stores will not end up +marked unseen. =back =cut -our %FILTERS; -require "$HOME/.mincfilter"; +our @FILTERS; my $mh; my $logfile; @@ -160,33 +194,13 @@ if ($run) { $logfile = $mh . '/logs/dryrun.log'; } +$ENV{"MHCONTEXT"} = $mh . '/.minc.context'; + ############################################################################### # Logging -# debuglevels for the logger -use constant LOG_DEBUGLEVEL => 'info'; -use constant SCREEN_DEBUGLEVEL => 'debug'; - -# characters used to wrap around the id field in the log so I can more -# easily parse the msg flow -use constant DEBUGCHAR => '%'; -use constant INFOCHAR => '='; -use constant INCOMINGCHAR => '<'; -use constant SAVECHAR => '>'; -use constant WARNCHAR => 'W'; -use constant ERRORCHAR => 'E'; - -my $logger = new Log::Dispatch; -$logger->add(new Log::Dispatch::File (name=>'logfile', - filename=>$logfile, - min_level=>'info', - mode=>'append')); - -# log alias, handles getting passed vars that are undef sub mylog { - my $level = shift; - my $act = shift; my $timestamp = strftime('%b %e %H:%M:%S', localtime); my $msg; foreach my $part (@_) { @@ -196,84 +210,49 @@ sub mylog { } # no newlines in the log message, thanks $msg =~ s/\n/ /gm; - $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg); - $logger->log(level=>$level, message=>$msg); - if ($act eq SAVECHAR) { - $logger->log(level=>$level, message=>"\n"); - } + open(LOG, ">>$logfile") or die("open(>>$logfile): $!"); + print(LOG "$timestamp $msg\n") or die("print(>>$logfile): $!"); + close(LOG) or die("close($logfile): $!"); } -sub logincoming { mylog('info', INCOMINGCHAR, @_); } -sub logsave { mylog('notice', SAVECHAR, @_); } -sub loginfo { mylog('info', INFOCHAR, @_); } -sub logdebug { mylog('debug', DEBUGCHAR, @_); } -sub logwarn { mylog('warning', WARNCHAR, @_); } -sub logerr { mylog('error', ERRORCHAR, @_); } +sub logheader { + my ($text, @contents) = @_; + my $last; + + if (@contents) { + $last = $contents[-1]; + } else { + $last = ''; + } + + mylog('<< ', $text, $last); +} sub log_headers { my %headers = @_; - logincoming('From: ', $headers{'return-path'}); - logincoming('To: ', $headers{'to'}); - logincoming('Subject: ', $headers{'subject'}); - logincoming('Message-Id: ', $headers{'message-id'}); + # For an explanation of the %headers structure, see the + # get_headers function below. + logheader('From: ', @{$headers{'return-path'}}); + logheader('To: ', @{$headers{'to'}}); + logheader('Subject: ', @{$headers{'subject'}}); + logheader('Message-Id: ', @{$headers{'message-id'}}); } ############################################################################### # Utility procedures -sub _errprint { - printf('%s:%s', __FILE__, __LINE__); - - if (@_) { - print(': '); - foreach (@_) { - print; - } - } -} - -sub err { - my $ex = shift; - - _errprint(@_); - print(": $!\n"); - - exit($ex); -} - -sub errx { - my $ex = shift; - - _errprint(@_); - print("\n"); - - exit($ex); -} - sub mkfolder { - my $mhfolder = shift; - my $folder; + my $folder = shift; my $target; my $component; - $folder = $mh . '/' . $mhfolder; - $target = ''; - - if (not -d $folder) { - foreach $component (split('/', $folder)) { - if (defined($component) and length($component) >= 1) { - $target = $target . '/' . $component; - if (-d $target or mkdir($target)) { - next; - } else { - err(&EX_OSERR, - "Failed to create +$mhfolder ($component)"); - } - } - } + $target = $mh; + foreach $component (split('/', $folder)) { + $target = join('/', $target, $component); + -d $target or mkdir($target) or die("mkdir($target): $!"); } } @@ -282,7 +261,7 @@ sub getfiles { my @result; if (not opendir(DIR, $dir)) { - err(&EX_OSERR, "Failed opendir($dir)"); + die("opendir($dir): $!"); } # Initialize $! to 0 (success) because stupid stupid Perl provides @@ -291,15 +270,18 @@ sub getfiles { # to detect an error. Real Programmers don't handle errors, # right? >sigh< $! = 0; - @result = grep { - ($_ ne '.' and $_ ne '..') - and $_ = "$MAILDIR/new/$_" - } readdir(DIR); - + @result = readdir(DIR); if ($! != 0) { - err(&EX_OSERR, "Failed readdir($dir)"); + die("readdir($dir): $!"); } + if (@result <= 2) { + exit(0); + } + + STDOUT->autoflush(1); + print(@result - 2, " messages..."); + closedir(DIR); return @result; @@ -314,14 +296,15 @@ sub get_highest_msgnum { $dir = "$mh/$mhfolder"; if (not opendir(DIR, $dir)) { - err(&EX_OSERR, "Failed opendir($dir)"); + die("opendir($dir): $!"); } + # Insert rant from getfiles here. $! = 0; @list = readdir(DIR); if ($! != 0) { - err(&EX_OSERR, "Failed readdir($dir)"); + die("readdir($dir): $!"); } closedir(DIR); @@ -339,12 +322,6 @@ sub get_highest_msgnum { return $highest; } -# We want to print the name of each list that has new mail only once, -# so use this hash to store the lists that have already been printed. -# Start the list out with SPAM already in it, since we don't care when -# new messages are added to it. -my %FOLDERS = ('SPAM'=>1); - sub store_message { my $msg = shift; my $mhfolder = shift; @@ -357,6 +334,7 @@ sub store_message { # it. This should be harmless. mkfolder($mhfolder); + # This loop is a modified version of the maildir delivery algorithm. $msgnum = get_highest_msgnum($mhfolder); for ($try = 0; ; $try++) { $msgnum++; @@ -371,8 +349,19 @@ sub store_message { # So attempt to create a file with this name. If it # succeeds, in just a bit here we'll knowingly clobber # this file with the rename(2) call. + + # Another way to do this is not to use rename(2), but + # use link(2) + unlink(2) instead. That's how the + # standard maildir algorithm does it. Each method has + # a disadvantage: the program may crash between the + # link(2) and unlink(2) calls. With the standard + # maildir algorithm, that means the message will end + # up duplicated. The advantage of creating an empty + # file followed by rename(2) is that an extra empty + # file is left behind as opposed to a duplicate + # message. This is more easily detected by the user. if ($run) { - if (sysopen(MSG, "$mhmsg", + if (sysopen(MSG, $mhmsg, O_WRONLY | O_EXCL | O_CREAT, 0600)) { close(MSG); last; @@ -386,47 +375,59 @@ sub store_message { # This algorithm is different from the maildir one; let's make # 10 tries instead of 3. if ($try == 9) { - errx(&EX_TEMPFAIL, "Attempted filename $mhmsg exists."); + die("Attempted filename $mhmsg exists."); } # This algorithm is different; i don't think we need to sleep. #sleep(2); } - logsave("+$mhfolder"); + if ($mhfolder ne 'SPAM') { + mylog('+', $mhfolder); + } if ($run) { if (not rename($msg, $mhmsg)) { - err(&EX_OSERR, "Failed rename($msg, $mhmsg)"); + die("rename($msg, $mhmsg): $!"); } # Mark each message as soon as we store it and bomb if that # fails. While it is slow, it is not safe to store multiple # messages and then have a failure before marking some (or # all). - if ($mhfolder ne 'SPAM') { - $status = system("mark +$mhfolder $msgnum -sequence unseen -add"); + if ($mhfolder eq 'SPAM') { + push(@SPAM, $msgnum); + } else { + $status = system('mark', "+$mhfolder", "$msgnum", '-sequence', + 'unseen', '-add'); + # XXX need to handle signalled and stopped, and print + # the exit code or signal number. if (not WIFEXITED($status)) { - err(&EX_OSERR, "Failed to run mark"); - } elsif (WEXITSTATUS($status) == 0) { - errx(&EX_SOFTWARE, "Failed to mark message unseen."); + die("Failed to run mark"); + } elsif (WEXITSTATUS($status) != 0) { + die("Failed to mark message unseen."); } } } - if (not $FOLDERS{$mhfolder}) { - print("+$mhfolder\n"); - $FOLDERS{$mhfolder} = 1; - } + return $msgnum; } +# Parse a message file into a structure describing the headers. The +# structure is a hash of arrays. The hash keys are the names of the +# headers found in the message, made all lower-case. Each item in +# the hash is an array of header text. The array has one element +# per occurrence of the header. Most headers will only have a single +# element because they occur only once. The Received header is the +# most common header with multiple texts. sub get_headers { my $msg = shift; my %headers; my $current; # current header, used for unfolding lines my $fieldname; # unmolested header name + my $contents; # contents of header - open(MSG, $msg); + open(MSG, $msg) or die("open(MSG, $msg): $!"); while () { chomp; if (length == 0) { @@ -437,15 +438,23 @@ sub get_headers { # folded header continuation if (not defined($current)) { - print("Malformed message, cannot parse headers.\n"); - return (); + warn('Malformed message, cannot parse headers.'); + next; } - $headers{$current} .= $_; + @{$headers{$current}}[-1] .= $_; } else { ($fieldname) = split(/:/); $current = lc($fieldname); - (undef, $headers{$current}) = split(/^$fieldname:\s*/); + (undef, $contents) = split(/^\Q$fieldname\E:\s*/); + + if (defined($headers{$current})) { + # multiple occurence of same header + push(@{$headers{$current}}, $contents); + } else { + # first occurence of a header + $headers{$current} = [$contents]; + } } } close(MSG); @@ -453,88 +462,6 @@ sub get_headers { return %headers; } - -############################################################################### -# Spam handling - -sub is_spam { - my $msg = shift; - my $err; - my $sig; - my $line; - my $message; - my $status; - - $err = $sig = $msg; - $err =~ s|/new/|/err/|; - $sig =~ s|/new/|/sig/|; - if (-f $err) { - open(ERR, $err); - $line = ; - close(ERR); - - chomp($line); - ($message, $!) = split(/:/, $line); - - logerr("$err: $message: $!"); - - if ($run) { - if (unlink($err) != 1) { - err(&EX_OSERR, "Failed unlink($err)"); - } - } - } elsif (-f $sig) { - # This is supposed to be a signature created with razor-check - # directly after delivery. Currently this isn't supported - # because it isn't clear to me how to get that signature back - # into razor-check. For now, just unlink any sig files we - # find and proceed with full razor-check mode. - - if ($run) { - if (unlink($sig) != 1) { - err(&EX_OSERR, "Failed unlink($sig)"); - } - } - } - - if ($run) { - $status = system("razor-check < $msg"); - if (not WIFEXITED($status)) { - err(&EX_OSERR, "Failed to run razor-check < $msg"); - } elsif (WEXITSTATUS($status) == 0) { - return 1; - } else { - return 0; - } - } else { - return 0; - } -} - -sub kill_spam { - my @msglist = @_; - my @result; - my $msg; - my $i; - - @result = (); - - $i = 0; - foreach $msg (@msglist) { - printf('%sChecking for spam... %6d/%d', - "\r", ++$i, scalar(@msglist)); - if (is_spam($msg)) { - print(" SPAM\n"); - store_message($msg, 'SPAM'); - } else { - push(@result, $msg); - } - } - print("\nDone: ", scalar(@result), " survivors\n"); - - return @result; -} - ############################################################################### # Filtering @@ -542,30 +469,57 @@ sub kill_spam { sub find_mh_folder { my $msg = shift; my %headers; + my $filterref; + my @filter; my $header; my $contents; - my $regex; - my $subst; - my $try; + my $pair; + my $match; + my $expression; + my $result; %headers = get_headers($msg); if (not %headers) { - return 'inbox'; + return 'malformed'; } log_headers(%headers); - foreach $header (keys(%FILTERS)) { - $contents = $headers{lc($header)}; + # Walk the list of filters. This structure is documented in + # pod at the end of the program. + foreach $filterref (@FILTERS) { + @filter = @$filterref; + $header = shift(@filter); + + # Handle filters using the magic TO header specially. + if ($header eq $MAGIC_TO_TOKEN) { + foreach $header (keys(%headers)) { + if ($header =~ /$MAGIC_TO_REGEX/i) { + foreach $contents (@{$headers{lc($header)}}) { + foreach $pair (@filter) { + ($match, $expression) = @$pair; + if ($contents =~ /$match/) { + return $expression; + } + } + } + } + } - if (defined($contents)) { - foreach $regex (keys(%{$FILTERS{$header}})) { - $subst = $FILTERS{$header}->{$regex}; + # Now that it's been processed specially, skip normal handling. + next; + } - $try = '$contents =~'; - $try .= " s|$regex|$subst|is"; - if (eval $try) { - return $contents; + # Walk the list of message headers matching the filter's + # specified header. + foreach $contents (@{$headers{lc($header)}}) { + # Walk the filter's list of match/expression pairs. + foreach $pair (@filter) { + ($match, $expression) = @$pair; + if ($contents =~ /$match/i) { + if (eval "\$result = \"$expression\"") { + return $result; + } } } } @@ -576,51 +530,150 @@ sub find_mh_folder { sub filter_mail { my @msglist = @_; + my $msgcount = @msglist - 2; # don't count . and .. + my $len = length($msgcount); + my @baton; my $msg; my $mhfolder; + my $spam = 0; + my $saved = 0; + my $msgnum; + my %FOLDERS = ('SPAM'=>1); + + if (-f "$HOME/.mincspam") { + require "$HOME/.mincspam"; + } else { + eval "sub spam_start_hook { return (); }"; + eval "sub spam_stop_hook { }"; + eval "sub spam_check { return 0; }"; + } + + if (-f "$HOME/.mincfilter") { + require "$HOME/.mincfilter"; + } + + if (-f "$HOME/.minchooks") { + require "$HOME/.minchooks"; + } else { + eval "sub post_store_hook { }"; + } + + @baton = spam_start_hook(); foreach $msg (@msglist) { - $mhfolder = find_mh_folder($msg); - store_message($msg, $mhfolder); + ($msg eq '.' or $msg eq '..') and next; + + if ($printfilenames) { + print("$msg\n"); + } + + if (spam_check($msg, @baton)) { + $mhfolder = 'SPAM'; + $spam = @SPAM + 1; + } else { + $mhfolder = find_mh_folder($msg); + $saved++; + } + + $msgnum = store_message($msg, $mhfolder); + + post_store_hook($mhfolder, $msgnum); + + print("\r"); + if (not $FOLDERS{$mhfolder}) { + print(' ' x $len); + print(" \r$mhfolder\n"); + $FOLDERS{$mhfolder} = 1; + } + + printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d", + $spam, $saved, $spam + $saved, $msgcount); + + defined($maxmsgs) and ($spam + $saved < $maxmsgs or last); } + print("\n"); + + spam_stop_hook(@baton); } MAIN: { - my @msglist; + my $st; if ($dumpfilters) { + require "$HOME/.mincfilter"; $Data::Dumper::Indent = 1; - print(Dumper(\%FILTERS)); - exit(&EX_OK); + print(Dumper(\@FILTERS)); + exit; } - @msglist = getfiles("$MAILDIR/new"); - - if (not $filteronly) { - @msglist = kill_spam(@msglist); - } + chdir("$MAILDIR/new") or die("chdir($MAILDIR/new): $!"); + filter_mail(map { $_->[1] } + sort { $a->[0] <=> $b->[0] } + map { + if (not ($st = stat($_))) { + die("stat($_): $!"); + } + [$st->mtime, $_] + } + getfiles('.')); - if (not $spamonly) { - filter_mail(@msglist); - } + @SPAM and (exec('scan', '+SPAM', @SPAM) or die); } + __END__ +=head1 THE FILTERS STRUCTURE + +The user's .mincfilter file must define the @FILTERS structure. This +structure is an array. Each element of @FILTERS is a filter. A +filter is itself an array. The first element of a filter is a string, +the name of the header this filter acts upon. The header name is not +case-sensitive. Each subsequent element of a filter is a pair (i.e. a +two-element array): first, a regular expression B uses to +determine whether this filter matches or not, and second, an +expression which B evaluates to get the folder name. + +B decides where to store a message by iterating over the +@FILTERS array. It tests each regexp of each filter against all +headers matching that filter's specified header. As soon as a match +is found, B evaluates the second part of the pair. This part +may contain positional parameters from the matched regexp ($1, $2, +etc.). The result of this expression is used as the folder name. + +Multiple occurrences of the same header are preserved and tested +individually against the filters. That means, for example, that +filters can search all the Received headers for a certain string. + +It is important to note that all the arrays of the @FILTERS structure +are traversed I. This means the filters can be arranged so +that some have priority over others. XXX: get Doug to write an +example of having the same header matched more than once. + +Lastly, B supports a magic ' TO' header. Filters using this +header are matched against a collection of headers related to (and +including) the To. Which headers to use is determined by a regular +expression borrowed from procmail. + =head1 EXAMPLES -%FILTERS = - ('List-Id:'=> - {'.*<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>.*'=>'l/$1/$2', - '.*<([[:graph:]]+?)(-list)?\.freedesktop\.org>.*'=>'l/freedesktop/$1', - '.*.*'=>'l/swig'}, +@FILTERS = ( - 'List-Post:'=> - {''=>'l/apache/$2/$1'}, + ['List-Id', + ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'], + ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'], + ['', 'l/swig'], + ], - 'To:'=> - {'root'=>'root'}); + ['List-Post', + ['', 'l/apache/$2/$1'], + ], + + ['To', + ['root', 'root'], + ], +); The first List-Id filter is a surprisingly effective one which creates appropriate folder names (such as l/htdig/updates and l/rox/devel) for @@ -638,11 +691,23 @@ apache.org subdomain, with mailing lists hosted there. So, given a list such as dev@httpd.apache.org, this filter will create the folder name l/apache/httpd/dev. +For an example B function, see +L + =head1 AUTHORS -Written by Eric Gillespie with logging code -stolen from Adam Lazur . +Written by Eric Gillespie . Design by Eric +Gillespie and Doug Porter . -Design by Eric Gillespie and Doug Porter . +This program is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. =cut + +# Local variables: +# cperl-indent-level: 4 +# perl-indent-level: 4 +# indent-tabs-mode: nil +# End: + +# vi: set tabstop=4 expandtab: