#! /usr/local/bin/perl # $Id$ =head1 NAME B - Incorporate mail from a maildir into mh folders. =head1 SYNOPSIS B [-B] =head1 DESCRIPTION B is a program for incorporating mail from a maildir to a mh folder hierarchy. It takes mail from a maildir folder (not a maildir folder hierarchy), optionally checks for spam with a user-defined spam-checking function, and optionally filters mail into separate mh folders. The filtering is quite sophisticated, as it is done using real Perl substitution (s//) commands. =cut use strict; use warnings; require 'sysexits.ph'; use Data::Dumper; use Errno; use Fcntl qw(O_WRONLY O_EXCL O_CREAT); use File::Basename; use Getopt::Std; use Log::Dispatch; use Log::Dispatch::File; use POSIX qw(strftime WEXITSTATUS WIFEXITED); use Tie::IxHash; =head1 OPTIONS =over 4 =item B<-d> Dump (using Data::Dumper) the FILTERS hash and exit. This is useful for testing the syntax of .mincfilter. =item B<-f> Filter only, then exit. This is useful after running B. =item B<-h> Show help. =item B<-n> Dry run; do not actually incorporate the mail, but log and report to stdout/stderr as normal. =item B<-s> Process SPAM only, then exit, leaving all non-spam messages in the maildir. =back =cut my $dumpfilters = 0; my $filteronly = 0; our $run = 1; my $spamonly = 0; our $opt_d; our $opt_f; our $opt_h; our $opt_n; our $opt_s; # ;; # stupid cperl-mode if (not getopts('dfhns')) { exit(&EX_USAGE); } if ($opt_d) { $dumpfilters = 1; } elsif ($opt_h) { print("Sorry bub, no help.\n"); exit(&EX_OK); } elsif ($opt_n) { $run = 0; } if ($opt_f) { $filteronly = 1; } elsif ($opt_s) { # ))){ # stupid cperl-mode $spamonly = 1; } =head1 ENVIRONMENT =over 4 =item HOME Where configuration files (.mincfilter) are found. Also, $HOME/Maildir is used for the maildir if MAILDIR is not set. =item MAILDIR Where mail is delivered. =back =cut use Env qw(HOME MAILDIR); if (not $HOME) { die("HOME environment variable must be set.\n"); } if (not $MAILDIR) { $MAILDIR = "$HOME/Maildir"; } =head1 FILES =over 4 =item $HOME/.mincfilter This file is Perl code (included via the 'require' directive) which is expected to define the FILTERS hash. =item $HOME/.mincspam If this file exists, minc will include it with the expectation that it will define a B function. This function takes a message filename as an argument and returns 1 if the message is spam, else 0. If this file does not exist, minc will define a simple function that always returns 0. One of minc's global variables is available to the user-defined is_spam function: $run. This boolean should be honored; is_spam should only take real action (i.e. removing or creating files, running external programs, etc.) if $run is non-zero. XXX: need more details about the spam-handling process; for now read the code. =item `mhpath +`/logs/minc.log Where minc logs what it does, unless in -n mode. =item `mhpath +`/logs/dryrun.log Where minc logs what it would do; used in -n mode. =back =cut our %FILTERS; tie(%FILTERS, 'Tie::IxHash'); require "$HOME/.mincfilter"; if (-f "$HOME/.mincspam") { require "$HOME/.mincspam"; } else { sub is_spam { return 0; } } my $mh; my $logfile; $mh = `mhpath +`; chomp($mh); if ($run) { $logfile = $mh . '/logs/minc.log'; } else { $logfile = $mh . '/logs/dryrun.log'; } ############################################################################### # Logging # debuglevels for the logger use constant LOG_DEBUGLEVEL => 'info'; use constant SCREEN_DEBUGLEVEL => 'debug'; # characters used to wrap around the id field in the log so I can more # easily parse the msg flow use constant DEBUGCHAR => '%'; use constant INFOCHAR => '='; use constant INCOMINGCHAR => '<'; use constant SAVECHAR => '>'; use constant WARNCHAR => 'W'; use constant ERRORCHAR => 'E'; my $logger = new Log::Dispatch; $logger->add(new Log::Dispatch::File (name=>'logfile', filename=>$logfile, min_level=>'info', mode=>'append')); # log alias, handles getting passed vars that are undef sub mylog { my $level = shift; my $act = shift; my $timestamp = strftime('%b %e %H:%M:%S', localtime); my $msg; foreach my $part (@_) { if (defined($part)) { $msg .= $part; } } # no newlines in the log message, thanks $msg =~ s/\n/ /gm; $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg); $logger->log(level=>$level, message=>$msg); if ($act eq SAVECHAR) { $logger->log(level=>$level, message=>"\n"); } } sub logincoming { mylog('info', INCOMINGCHAR, @_); } sub logsave { mylog('notice', SAVECHAR, @_); } sub loginfo { mylog('info', INFOCHAR, @_); } sub logdebug { mylog('debug', DEBUGCHAR, @_); } sub logwarn { mylog('warning', WARNCHAR, @_); } sub logerr { mylog('error', ERRORCHAR, @_); } sub log_headers { my %headers = @_; logincoming('From: ', $headers{'return-path'}); logincoming('To: ', $headers{'to'}); logincoming('Subject: ', $headers{'subject'}); logincoming('Message-Id: ', $headers{'message-id'}); } ############################################################################### # Utility procedures sub _errprint { printf('%s:%s', __FILE__, __LINE__); if (@_) { print(': '); foreach (@_) { print; } } } sub err { my $ex = shift; _errprint(@_); print(": $!\n"); exit($ex); } sub errx { my $ex = shift; _errprint(@_); print("\n"); exit($ex); } sub mkfolder { my $mhfolder = shift; my $folder; my $target; my $component; $folder = $mh . '/' . $mhfolder; $target = ''; if (not -d $folder) { foreach $component (split('/', $folder)) { if (defined($component) and length($component) >= 1) { $target = $target . '/' . $component; if (-d $target or mkdir($target)) { next; } else { err(&EX_OSERR, "Failed to create +$mhfolder ($component)"); } } } } } sub getfiles { my $dir = shift; my @result; if (not opendir(DIR, $dir)) { err(&EX_OSERR, "Failed opendir($dir)"); } # Initialize $! to 0 (success) because stupid stupid Perl provides # no way to distinguish an error from an empty directory; that # means setting $! to 0 and testing it afterwards is the only way # to detect an error. Real Programmers don't handle errors, # right? >sigh< $! = 0; @result = grep { ($_ ne '.' and $_ ne '..') and $_ = "$MAILDIR/new/$_" } readdir(DIR); if ($! != 0) { err(&EX_OSERR, "Failed readdir($dir)"); } closedir(DIR); return @result; } sub get_highest_msgnum { my $mhfolder = shift; my $dir; my @list; my $highest; my $msgnum; $dir = "$mh/$mhfolder"; if (not opendir(DIR, $dir)) { err(&EX_OSERR, "Failed opendir($dir)"); } $! = 0; @list = readdir(DIR); if ($! != 0) { err(&EX_OSERR, "Failed readdir($dir)"); } closedir(DIR); $highest = 0; foreach $msgnum (@list) { # Look for integers. if ($msgnum =~ /^[0-9]+$/) { if ($msgnum > $highest) { $highest = $msgnum; } } } return $highest; } # We want to print the name of each list that has new mail only once, # so use this hash to store the lists that have already been printed. # Start the list out with SPAM already in it, since we don't care when # new messages are added to it. my %FOLDERS = ('SPAM'=>1); sub store_message { my $msg = shift; my $mhfolder = shift; my $msgnum; my $try; my $mhmsg; my $status; # We must do this even in -n mode because later steps fail without # it. This should be harmless. mkfolder($mhfolder); # This loop is a modified version of the maildir delivery algorithm. $msgnum = get_highest_msgnum($mhfolder); for ($try = 0; ; $try++) { $msgnum++; $mhmsg = "$mh/$mhfolder/$msgnum"; if (not stat($mhmsg)) { if ($!{ENOENT}) { # Now we have a non-existent file, let's try to create # it. We must create a zero-byte file first because a # file my appear between our happy stat results and # our later rename(2), which would clobber said file. # So attempt to create a file with this name. If it # succeeds, in just a bit here we'll knowingly clobber # this file with the rename(2) call. if ($run) { if (sysopen(MSG, "$mhmsg", O_WRONLY | O_EXCL | O_CREAT, 0600)) { close(MSG); last; } } else { last; } } } # This algorithm is different from the maildir one; let's make # 10 tries instead of 3. if ($try == 9) { errx(&EX_TEMPFAIL, "Attempted filename $mhmsg exists."); } # This algorithm is different; i don't think we need to sleep. #sleep(2); } logsave("+$mhfolder"); if ($run) { if (not rename($msg, $mhmsg)) { err(&EX_OSERR, "Failed rename($msg, $mhmsg)"); } # Mark each message as soon as we store it and bomb if that # fails. While it is slow, it is not safe to store multiple # messages and then have a failure before marking some (or # all). if ($mhfolder ne 'SPAM') { $status = system("mark +$mhfolder $msgnum -sequence unseen -add"); if (not WIFEXITED($status)) { err(&EX_OSERR, "Failed to run mark"); } elsif (WEXITSTATUS($status) != 0) { errx(&EX_SOFTWARE, "Failed to mark message unseen."); } } } if (not $FOLDERS{$mhfolder}) { print("+$mhfolder\n"); $FOLDERS{$mhfolder} = 1; } } sub get_headers { my $msg = shift; my %headers; my $current; # current header, used for unfolding lines my $fieldname; # unmolested header name open(MSG, $msg); while () { chomp; if (length == 0) { last; } if (/^\s/) { # folded header continuation if (not defined($current)) { print("Malformed message, cannot parse headers.\n"); return (); } $headers{$current} .= $_; } else { ($fieldname) = split(/:/); $current = lc($fieldname); (undef, $headers{$current}) = split(/^\Q$fieldname\E:\s*/); } } close(MSG); return %headers; } ############################################################################### # Spam handling sub kill_spam { my @msglist = @_; my @result; my $msg; my $i; @result = (); $i = 0; foreach $msg (@msglist) { printf('%sChecking for spam... %6d/%d', "\r", ++$i, scalar(@msglist)); if (is_spam($msg)) { print(" SPAM\n"); store_message($msg, 'SPAM'); } else { push(@result, $msg); } } print("\nDone: ", scalar(@result), " survivors\n"); return @result; } ############################################################################### # Filtering sub find_mh_folder { my $msg = shift; my %headers; my $header; my $contents; my $regex; my $subst; my $try; %headers = get_headers($msg); if (not %headers) { return 'inbox'; } log_headers(%headers); foreach $header (keys(%FILTERS)) { $contents = $headers{lc($header)}; if (defined($contents)) { foreach $regex (keys(%{$FILTERS{$header}})) { $subst = $FILTERS{$header}->{$regex}; $try = '$contents =~'; $try .= " s|$regex|$subst|is"; if (eval $try) { return $contents; } } } } return 'inbox'; } sub filter_mail { my @msglist = @_; my $msg; my $mhfolder; foreach $msg (@msglist) { $mhfolder = find_mh_folder($msg); store_message($msg, $mhfolder); } } MAIN: { my @msglist; if ($dumpfilters) { $Data::Dumper::Indent = 1; print(Dumper(\%FILTERS)); exit(&EX_OK); } @msglist = getfiles("$MAILDIR/new"); if (not $filteronly) { @msglist = kill_spam(@msglist); } if (not $spamonly) { filter_mail(@msglist); } } __END__ =head1 EXAMPLES %FILTERS = ('List-Id:'=> {'.*<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>.*'=>'l/$1/$2', '.*<([[:graph:]]+?)(-list)?\.freedesktop\.org>.*'=>'l/freedesktop/$1', '.*.*'=>'l/swig'}, 'List-Post:'=> {''=>'l/apache/$2/$1'}, 'To:'=> {'root'=>'root'}); The first List-Id filter is a surprisingly effective one which creates appropriate folder names (such as l/htdig/updates and l/rox/devel) for all Sourceforge lists to which i subscribe. Certainly there are lists hosted at Sourceforge which do not work well with this, and this is where it is important to remember that the first match is the one uses. Simply put the more specific rules before this one. The next List-Id example is simple. The swig example demonstrates that the folder name does not have to use a portion of the matched header; you can just hardcode the folder name. The List-Post example is a nice one. Most ASF projects have their own apache.org subdomain, with mailing lists hosted there. So, given a list such as dev@httpd.apache.org, this filter will create the folder name l/apache/httpd/dev. For an example is_spam function, see LEpretzelnet.orgEcvsEdotfilesE.mincspam> =head1 AUTHORS Written by Eric Gillespie with logging code stolen from Adam Lazur . Design by Eric Gillespie and Doug Porter . =cut