#!/usr/bin/perl =head1 NAME B - incorporate mail from a maildir into mh folders =head1 SYNOPSIS B [B<-m> I] [B<-n>] B B<-d> B B<-r> =head1 DESCRIPTION B incorporates mail from a maildir to a mh folder hierarchy. It takes mail from a maildir folder (not a maildir folder hierarchy), running each message through regular expression based filter and hook functions to determine in which folder to store it or whether it's spam. Post-processing hooks may be applied to each message. As it processes each message, B prints a line for each message similar to B and B. This line includes the folder and message number in which the message was stored, the last 'From' header, and the last 'Subject' header. These fields are truncated to fit in the user's terminal (see COLUMNS in B below) in the following proportions: folder (0.1), message number (0.0625), from header (0.175). Any of these may be overridden with $SCAN_P_FOLDER, $SCAN_P_MESSAGE, or $SCAN_P_FROM. The subject always fills out the rest of the line. =cut use strict; use warnings; $SIG{'PIPE'} = 'IGNORE'; use Data::Dumper; use Errno; use Fcntl qw(O_WRONLY O_EXCL O_CREAT); use FileHandle; use File::Temp qw(tempfile); use File::stat; use Getopt::Long qw(:config gnu_getopt no_ignore_case); use Pod::Usage; our $VERSION = 1; # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared # against headers matching this regex (taken from procmail). my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)'; my $MAGIC_TO_TOKEN = ' TO'; # Mapping of message numbers to array references. The first element is set by # filter_mail to a reference to a header hash for the message; the second is # set by maildier_spam to the name of the message file in the spam maildir. # scan_spam scans this at the end so the user can check for false positives. my %SPAM; =head1 OPTIONS =over 4 =item B<-d> Dump (using Data::Dumper) the FILTERS list and exit. This is useful for testing the syntax of .minc. =item B<-h> Show help. =item B<-m> I Stop processing after I messages. =item B<-n> Dry run; do not actually incorporate the mail, but log and report to stdout/stderr as normal. =item B<-r> Rebuild `mhpath +`/.folders from scratch, processing no mail. =back =cut my $dumpfilters; my $help; my $maxmsgs; my $norun; my $rebuild_dot_folders; if (!caller()) { GetOptions( 'd' => \$dumpfilters, 'h|help' => \$help, 'm=i' => \$maxmsgs, 'n' => \$norun, 'r' => \$rebuild_dot_folders, ) or pod2usage(); $help and pod2usage(-exitstatus=>0, -verbose=>1); @ARGV == 0 or pod2usage(); } our $run = !$norun; =head1 ENVIRONMENT =over 4 =item COLUMNS How many columns the user's terminal can hold, used to print scan lines for each processed message. Defaults to 80. =item HOME Where the configuration file (.minc) is found. Also, $HOME/Maildir is used for the maildir if MAILDIR is not set. =item MAILDIR Where mail is delivered. =back =cut use Env qw(COLUMNS HOME MAILDIR); $COLUMNS ||= 80; if (not $HOME) { die("HOME environment variable must be set.\n"); } if (not $MAILDIR) { $MAILDIR = "$HOME/Maildir"; } =head1 FILES =over 4 =item $HOME/.minc This file is Perl code (included via the 'require' directive) which may define the FILTERS list, @start_hooks, @filter_hooks, @post_store_hooks, and @stop_hooks. =item `mhpath +`/.folders B adds all folders it filters into to this file, which is used by lukem's B (XXX need a link). =item `mhpath +`/.minc.context B uses this file for context (i.e. current folder) instead of `mhpath +`/context. This allows you some amount of freedom to use mh while B is running. For example, you can changed folders without causing a message to be stored in the wrong folder. Care must still be taken, however, about the unseen sequence; if you change it (whether via show, or mark, or anything else) while in the same folder as B, it is likely the new message B stores will not end up marked unseen. =back =cut our @FILTERS; our (@start_hooks, @stop_hooks, @filter_hooks, @post_store_hooks); our $SCAN_P_FOLDER = 0.1; our $SCAN_P_MESSAGE = 0.0625; our $SCAN_P_FROM = 0.175; our @folder_sort_list = (qr/^inbox$/); our $folder_sorter = sub { sort_by_list(@_, @folder_sort_list) }; my $mh; $mh = `mhpath +`; chomp($mh); $ENV{"MHCONTEXT"} = $mh . '/.minc.context'; ############################################################################### # Utility procedures sub sort_by_list { my $a = shift; my $b = shift; for my $i (@_) { my $am = $a =~ $i; my $bm = $b =~ $i; if ($am) { if ($bm) { last; } return -1; } elsif ($bm) { return 1; } } return $a cmp $b; } sub mkfolder { my $folder = shift; my $target; my $component; $target = $mh; foreach $component (split('/', $folder)) { $target = join('/', $target, $component); -d $target or mkdir($target) or die("mkdir($target): $!"); } } sub getfiles { my $dir = shift; my @result; if (not opendir(DIR, $dir)) { die("opendir($dir): $!"); } # Initialize $! to 0 (success) because stupid stupid Perl provides # no way to distinguish an error from an empty directory; that # means setting $! to 0 and testing it afterwards is the only way # to detect an error. Real Programmers don't handle errors, # right? >sigh< $! = 0; @result = readdir(DIR); if ($! != 0) { die("readdir($dir): $!"); } if (@result <= 2) { return (); } STDOUT->autoflush(1); if (@result == 3) { print('1 message...'); } else { print(@result - 2, ' messages...'); } closedir(DIR); return @result; } my %msgnum_cache; sub get_highest_msgnum { my $mhfolder = shift; my $dir; my @list; my $highest; my $msgnum; if (defined($msgnum_cache{$mhfolder})) { return $msgnum_cache{$mhfolder}++; } $dir = "$mh/$mhfolder"; if (not opendir(DIR, $dir)) { die("opendir($dir): $!"); } # Insert rant from getfiles here. $! = 0; @list = readdir(DIR); if ($! != 0) { die("readdir($dir): $!"); } closedir(DIR); $highest = 0; foreach $msgnum (@list) { # Look for integers. if ($msgnum =~ /^[0-9]+$/) { if ($msgnum > $highest) { $highest = $msgnum; } } } $msgnum_cache{$mhfolder} = $highest; return $msgnum_cache{$mhfolder}++; } sub mark { my $folder = shift; my $msgnum = shift; my $seq = shift; my $fn = "$mh/$folder/.mh_sequences"; my $fh; my @sequences; if (not open($fh, $fn)) { $!{ENOENT} or die("open($fn): $!"); } else { @sequences = <$fh>; chomp(@sequences); } my $marked = 0; open($fh, '>', $fn) or die("open(>$fn): $!"); for $_ (@sequences) { if (/^$seq: (.*)/) { my @parts; my @result; my $done = 0; for my $part (split(' ', $1)) { if (not $done) { my ($st, $en) = split('-', $part); if ((defined($en) and ($msgnum >= $st and $msgnum <= $en)) or $msgnum == $st) { # It's already there. $done = 1; } if (defined($en)) { if ($st - 1 == $msgnum) { $part = "$msgnum-$en"; $done = 1; } elsif ($en + 1 == $msgnum) { $part = "$st-$msgnum"; $done = 1; } } else { if ($part - 1 == $msgnum) { $part = "$msgnum-$part"; $done = 1; } elsif ($part + 1 == $msgnum) { $part = "$part-$msgnum"; $done = 1; } } } push(@result, $part); } if (not $done) { push(@result, $msgnum); } print($fh "$seq: ", join(' ', @result), "\n"); $marked = 1; } else { print($fh "$_\n"); } } $marked or print($fh "$seq: $msgnum\n"); close($fh) or die("close(>$fn): $!"); } sub store_message { my $msg = shift; my $mhfolder = shift; my $msgnum; my $try; my $mhmsg; # We must do this even in -n mode because later steps fail without # it. This should be harmless. mkfolder($mhfolder); # This loop is a modified version of the maildir delivery algorithm. $msgnum = get_highest_msgnum($mhfolder); for ($try = 0; ; $try++) { $msgnum++; $mhmsg = "$mh/$mhfolder/$msgnum"; if (not stat($mhmsg)) { if ($!{ENOENT}) { # Now we have a non-existent file, let's try to create # it. We must create a zero-byte file first because a # file my appear between our happy stat results and # our later rename(2), which would clobber said file. # So attempt to create a file with this name. If it # succeeds, in just a bit here we'll knowingly clobber # this file with the rename(2) call. # Another way to do this is not to use rename(2), but # use link(2) + unlink(2) instead. That's how the # standard maildir algorithm does it. Each method has # a disadvantage: the program may crash between the # link(2) and unlink(2) calls. With the standard # maildir algorithm, that means the message will end # up duplicated. The advantage of creating an empty # file followed by rename(2) is that an extra empty # file is left behind as opposed to a duplicate # message. This is more easily detected by the user. if ($run) { if (sysopen(MSG, $mhmsg, O_WRONLY | O_EXCL | O_CREAT, 0600)) { close(MSG); last; } } else { last; } } } # This algorithm is different from the maildir one; let's make # 10 tries instead of 3. if ($try == 9) { die("Attempted filename $mhmsg exists."); } # This algorithm is different; i don't think we need to sleep. #sleep(2); } if ($run) { if (not rename($msg, $mhmsg)) { die("rename($msg, $mhmsg): $!"); } # Mark each message as soon as we store it and bomb if that # fails. While it is slow, it is not safe to store multiple # messages and then have a failure before marking some (or # all). if ($mhfolder ne 'SPAM') { mark($mhfolder, $msgnum, 'unseen'); } } return $msgnum; } # Parse a message file into a structure describing the headers. The # structure is a hash of arrays. The hash keys are the names of the # headers found in the message, made all lower-case. Each item in # the hash is an array of header text. The array has one element # per occurrence of the header. Most headers will only have a single # element because they occur only once. The Received header is the # most common header with multiple texts. sub get_headers { my $msg = shift; my %headers; my $current; # current header, used for unfolding lines my $fieldname; # unmolested header name my $contents; # contents of header open(MSG, $msg) or die("open(MSG, $msg): $!"); while () { chomp; if (length == 0) { last; } if (/^\s/) { # folded header continuation if (not defined($current)) { warn('Malformed message, cannot parse headers.'); next; } @{$headers{$current}}[-1] .= $_; } else { ($fieldname) = split(/:/); $current = lc($fieldname); (undef, $contents) = split(/^\Q$fieldname\E:\s*/); if (defined($headers{$current})) { # multiple occurence of same header push(@{$headers{$current}}, $contents); } else { # first occurence of a header $headers{$current} = [$contents]; } } } close(MSG); return \%headers; } ############################################################################### # Filtering sub find_mh_folder { my $msg = shift; my $header = shift; if (not %$header) { return 'malformed'; } # Walk the list of filters. This structure is documented in # pod at the end of the program. for my $filterref (@FILTERS) { if (ref($filterref) eq 'CODE') { my $m = $filterref->($header, $msg); $m && return $m; next; } my $m = match($header, @$filterref); $m && return $m; } return 'inbox'; } # Test all the header fields against each [regexp, folder-expression] pair. sub match { my $header = shift; my $filter_field = shift; my @filters = @_; # Handle filters using the magic TO header specially. if ($filter_field eq $MAGIC_TO_TOKEN) { for my $field_name (keys(%$header)) { if ($field_name =~ /$MAGIC_TO_REGEX/i) { my $m = match_one_field($header->{$field_name}, \@filters); $m && return $m; } } # Now that it's been processed specially, skip normal handling. return; } # Walk the list of header fields matching the filter's specified header. my $m = match_one_field($header->{lc($filter_field)}, \@filters); $m && return $m; } # Test all the values of one header field against each [regexp, # folder-expression] pair. sub match_one_field { my $values = shift; my $filters = shift; for my $value (@$values) { for my $pair (@$filters) { my ($regexp, $expression) = @$pair; if ($value =~ $regexp) { my $result; if (eval "\$result = \"$expression\"") { return $result; } } } } } sub scan_line { my ($headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns) = @_; my $from = ''; my $subject = ''; # Sometimes these headers are missing... eval { $from = [@{$headers->{'from'}}]->[-1] }; eval { $subject = [@{$headers->{'subject'}}]->[-1] }; # Replace garbage characters. for ($from, $subject) { tr/\x00-\x1f\x80-\xff/?/; } return sprintf("\%-${nf}s \%${nm}d \%-${nF}s \%s", substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm), substr($from, 0, $nF), substr($subject, 0, $ns)); } sub filter_mail { @_ or return (); my $msgcount = @_ - 2; # don't count . and .. my $len = length($msgcount); my @baton; my $msg; my $mhfolder; my $spam = 0; my $saved = 0; my $msgnum; my %folders; if (-f "$HOME/.minc") { require "$HOME/.minc"; } # XXX lame names my $nf = int($COLUMNS * $SCAN_P_FOLDER); my $nm = int($COLUMNS * $SCAN_P_MESSAGE); my $nF = int($COLUMNS * $SCAN_P_FROM); my $ns = $COLUMNS - $nf - $nm - $nF - 3; my %batons; for my $hook (@start_hooks) { my ($handle, @baton) = $hook->(); if (defined($handle)) { $batons{$handle} = [@baton]; } } my $tty = -t STDOUT; if (not $tty) { # Print a newline after the incomplete "N messages..." line. print("\n"); } for $msg (@_) { ($msg eq '.' or $msg eq '..') and next; my $headers = get_headers($msg); undef($mhfolder); for my $hook (@filter_hooks) { my $result = $hook->(\%batons, $headers, $msg); defined($result) and ($mhfolder = $result); } defined($mhfolder) or ($mhfolder = find_mh_folder($msg, $headers)); $msgnum = store_message($msg, $mhfolder); $folders{$mhfolder}++; if ($tty) { print("\r"); } if ($mhfolder eq 'SPAM') { $spam++; $SPAM{$msgnum} = [$headers, undef]; } else { $saved++; print(scan_line($headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns), "\n"); } for my $hook (@post_store_hooks) { $hook->(\%batons, $headers, $mhfolder, $msgnum); } if ($tty) { printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d", $spam, $saved, $spam + $saved, $msgcount); } defined($maxmsgs) and ($spam + $saved < $maxmsgs or last); } print("\n"); for my $hook (@stop_hooks) { $hook->(\%batons); } return %folders; } sub build_dot_folders { my $folders = shift; my $fh = shift; my $fn; if (defined($fh)) { while (<$fh>) { chomp; $folders->{$_}++; } } eval { ($fh, $fn) = tempfile("$mh/.folders.XXXXX") }; if ($@) { warn("$@"); return; } for my $folder (sort { $folder_sorter->($a,$b) } keys(%$folders)) { print($fh "$folder\n"); } if (not close($fh)) { warn("close($fn): $!"); unlink($fn) or warn("unlink($fn): $!"); return; } rename($fn, "$mh/.folders") or warn("rename($fn, $mh/.folders): $!"); } sub create_dot_folders { if (-f "$HOME/.minc") { require "$HOME/.minc"; } my %folders; open(my $fh, '-|', 'folders', '-fast', '-recur') or die("open(folders|): $!"); build_dot_folders(\%folders, $fh); return 0; } sub update_dot_folders { my $folders = shift; my $fh; if (not open($fh, "$mh/.folders") and not $!{ENOENT}) { # For ENOENT, we go ahead and create it, else we error and # don't clobber it. warn("open($mh/.folders): $!"); return; } build_dot_folders($folders, $fh); } # XXX Could use some unification with getfiles... sub maildir_spam { my $dir = "$MAILDIR/spam/new"; if (not chdir($dir)) { $!{ENOENT} or print(STDERR "skipping maildir spam: chdir($dir): $!\n"); return; } if (not opendir(DIR, '.')) { print(STDERR "skipping maildir spam: opendir($dir): $!\n"); return; } $! = 0; my @spams = readdir(DIR); if ($! != 0) { print(STDERR "skipping maildir spam: readdir($dir): $!\n"); return; } closedir(DIR); for my $msg (@spams) { ($msg eq '.' or $msg eq '..') and next; my $msgnum = store_message($msg, 'SPAM'); # Store the original file name for scan_spam in -n mode. $SPAM{$msgnum} = [undef, $msg]; } } sub scan_spam { my ($msgnum, $header, $tuple, $msg); # Unlike filter_mail, we don't need to print the folder name. # Calculate how many columns would be allocated to it... my $nf = int($COLUMNS * $SCAN_P_FOLDER); # ...and add that amount to COLUMNS to calculate the number of columns to # allocate to msgnum and from snippet, thus filling the line without # printing the folder name. my $nm = int(($COLUMNS + $nf) * $SCAN_P_MESSAGE); my $nF = int(($COLUMNS + $nf) * $SCAN_P_FROM); my $ns = $COLUMNS - $nm - $nF - 3; for $msgnum (sort(keys(%SPAM))) { $tuple = $SPAM{$msgnum}; if (defined($tuple->[0])) { # Filed by filter_mail, so we have the header. $header = $tuple->[0]; } elsif (defined($tuple->[1])) { # Filed by maildir_spam, so we don't have the header. if ($run) { # The message has been filed, load it from $mh. $msg = "$mh/SPAM/$msgnum"; } else { # The message has not been filed, load it from the maildir. # $tuple->[1] is just a basename, not a path; this works # because maildir_spam did chdir(Maildir/spam/new). $msg = $tuple->[1]; } $header = get_headers($msg); } else { print(STDERR "BUG: corrupt SPAM tuple, neither element defined", " for message $msgnum\n"); next; } print(scan_line($header, '', $msgnum, 0, $nm, $nF, $ns), "\n"); } } if (!caller()) { my $st; if ($dumpfilters) { require "$HOME/.minc"; $Data::Dumper::Indent = 1; print(Dumper(\@FILTERS)); exit; } $rebuild_dot_folders and exit(create_dot_folders); chdir("$MAILDIR/new") or die("chdir($MAILDIR/new): $!"); my %folders = filter_mail(map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { if (not ($st = stat($_))) { die("stat($_): $!"); } [$st->mtime, $_] } getfiles('.')); $run and %folders and update_dot_folders(\%folders); maildir_spam(); scan_spam(); } 1; __END__ =head1 THE FILTERS STRUCTURE Each element of @FILTERS is a filter. A filter is itself an array. The first element of a filter is a string, the name of the header this filter acts upon. The header name is not case-sensitive. Each subsequent element of a filter is a pair (i.e. a two-element array): first, a regular expression B uses to determine whether this filter matches or not, and second, an expression which B evaluates to get the folder name. B decides where to store a message by iterating over the @FILTERS array. It tests each regexp of each filter against all headers matching that filter's specified header. As soon as a match is found, B evaluates the second part of the pair. This part may contain positional parameters from the matched regexp ($1, $2, etc.). The result of this expression is used as the folder name. Multiple occurrences of the same header are preserved and tested individually against the filters. That means, for example, that filters can search all the Received headers for a certain string. It is important to note that all the arrays of the @FILTERS structure are traversed I. This means the filters can be arranged so that some have priority over others. XXX: get Doug to write an example of having the same header matched more than once. Lastly, B supports a magic ' TO' header. Filters using this header are matched against a collection of headers related to (and including) the To. Which headers to use is determined by a regular expression borrowed from procmail. =head1 HOOKS Filter hooks take a reference to a hash of batons, a reference to a hash of headers, and the message filename as arguments. It returns undef to decline filtering of this message (thus falling back to subsequent filter hooks, and finally @FILTERS), or the name of the folder to store this message into. One of B's global variables is available to the user-defined hooks: $run. This boolean should be honored; hooks should only take real action (i.e. removing or creating files, running external programs, etc.) if $run is non-zero. The baton hash is created simply from the start hooks; if the hook returns at least one defined value, this value is used as the key and all other return values are put into a list reference as the value. This hash is then passed by reference to all filter, post-store, and stop hooks. Post store hooks take a reference to a hash of batons, a reference to a hash of headers, the folder this message was stored in, and its new message number. XXX: need more details about the hook process; for now read the code. =head1 EXAMPLES @FILTERS = ( ['List-Id', ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'], ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'], ['', 'l/swig'], ], ['List-Post', ['', 'l/apache/$2/$1'], ], ['To', ['root', 'root'], ], ); The first List-Id filter is a surprisingly effective one which creates appropriate folder names (such as l/htdig/updates and l/rox/devel) for all Sourceforge lists to which i subscribe. Certainly there are lists hosted at Sourceforge which do not work well with this, and this is where it is important to remember that the first match is the one uses. Simply put the more specific rules before this one. The next List-Id example is simple. The swig example demonstrates that the folder name does not have to use a portion of the matched header; you can just hardcode the folder name. The List-Post example is a nice one. Most ASF projects have their own apache.org subdomain, with mailing lists hosted there. So, given a list such as dev@httpd.apache.org, this filter will create the folder name l/apache/httpd/dev. XXX Need hook examples. =head1 AUTHORS Written by Eric Gillespie . Design by Eric Gillespie and Doug Porter . This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut # Local variables: # cperl-indent-level: 4 # perl-indent-level: 4 # indent-tabs-mode: nil # End: # vi: set tabstop=4 expandtab: