diplodocus.org Git - minc/blob - minc

   1 #! /usr/bin/env perl
   2
   3 # $Id$
   4
   5 =head1 NAME
   6
   7 B<minc> - incorporate mail from a maildir into mh folders
   8
   9 =head1 SYNOPSIS
  10
  11 B<minc> [B<-m> I<MAX>] [B<-n>] [B<-p>]
  12
  13 B<minc> B<-d>
  14
  15 B<minc> B<-h>
  16
  17 =head1 DESCRIPTION
  18
  19 B<minc> incorporates mail from a maildir to a mh folder hierarchy.  It
  20 takes mail from a maildir folder (not a maildir folder hierarchy),
  21 optionally checks for spam with a user-defined spam-checking function,
  22 and optionally filters mail into separate mh folders.
  23
  24 The filtering is quite sophisticated, as it is done using real Perl
  25 matching (m//) commands.
  26
  27 =cut
  28
  29 use strict;
  30 use warnings;
  31
  32 use Data::Dumper;
  33 use Errno;
  34 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
  35 use FileHandle;
  36 use File::stat;
  37 use Getopt::Long qw(:config gnu_getopt no_ignore_case);
  38 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
  39 use Pod::Usage;
  40
  41 our $VERSION = 1;
  42
  43 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
  44 # against headers matching this regex (taken from procmail).
  45 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
  46 my $MAGIC_TO_TOKEN = ' TO';
  47
  48 # List of SPAM message numbers, scanned at the end so the user can
  49 # check for false positives.
  50 my @SPAM;
  51
  52 =head1 OPTIONS
  53
  54 =over 4
  55
  56 =item B<-d>
  57
  58 Dump (using Data::Dumper) the FILTERS list and exit.  This is useful
  59 for testing the syntax of .mincfilter.
  60
  61 =item B<-h>
  62
  63 Show help.
  64
  65 =item B<-m> I<MAX>
  66
  67 Stop processing after I<MAX> messages.
  68
  69 =item B<-n>
  70
  71 Dry run; do not actually incorporate the mail, but log and report to
  72 stdout/stderr as normal.
  73
  74 =item B<-p>
  75
  76 Print the filename of each message before checking it for spam.  This
  77 can be handy if a particular message is giving the spam checker a
  78 problem.
  79
  80 =back
  81
  82 =cut
  83
  84 my $dumpfilters;
  85 my $help;
  86 my $maxmsgs;
  87 my $norun;
  88 my $printfilenames;
  89
  90 GetOptions(
  91            'd' => \$dumpfilters,
  92            'h|help' => \$help,
  93            'm=i' => \$maxmsgs,
  94            'n' => \$norun,
  95            'p' => \$printfilenames,
  96           ) or pod2usage();
  97 $help and pod2usage(-exitstatus=>0, -verbose=>1);
  98 @ARGV == 0 or pod2usage();
  99
 100 our $run = !$norun;
 101
 102 =head1 ENVIRONMENT
 103
 104 =over 4
 105
 106 =item HOME
 107
 108 Where configuration files (.mincfilter) are found.  Also,
 109 $HOME/Maildir is used for the maildir if MAILDIR is not set.
 110
 111 =item MAILDIR
 112
 113 Where mail is delivered.
 114
 115 =back
 116
 117 =cut
 118
 119 use Env qw(COLUMNS HOME MAILDIR);
 120
 121 $COLUMNS ||= 80;
 122
 123 if (not $HOME) {
 124     die("HOME environment variable must be set.\n");
 125 }
 126 if (not $MAILDIR) {
 127     $MAILDIR = "$HOME/Maildir";
 128 }
 129
 130 =head1 FILES
 131
 132 =over 4
 133
 134 =item $HOME/.mincfilter
 135
 136 This file is Perl code (included via the 'require' directive) which is
 137 expected to define the FILTERS list.
 138
 139 =item $HOME/.mincspam
 140
 141 If this file exists, B<minc> will include it with the expectation that
 142 it will define a B<spam_check> function.  This function takes a
 143 message filename as an argument and returns 1 if the message is spam,
 144 else 0.  If this file does not exist, B<minc> will define a simple
 145 function that always returns 0.
 146
 147 One of B<minc>'s global variables is available to the user-defined
 148 B<spam_check> function: $run.  This boolean should be honored;
 149 B<spam_check> should only take real action (i.e. removing or creating
 150 files, running external programs, etc.) if $run is non-zero.
 151
 152 This file may also declare two other functions: B<spam_start_hook> and
 153 B<spam_stop_hook>.  The former is passed no arguments and is expected
 154 to return a list.  This list is a "baton" that will also be passed to
 155 B<spam_check> and B<spam_stop_hook>.  It can hold anything
 156 B<spam_check> will need to do its job, whether network sockets, pipes,
 157 or whatever.
 158
 159 XXX: need more details about the spam-handling process; for now read
 160 the code.
 161
 162 =item `mhpath +`/logs/minc.log
 163
 164 Where B<minc> logs what it does, unless in -n mode.
 165
 166 =item `mhpath +`/logs/dryrun.log
 167
 168 Where B<minc> logs what it would do; used in -n mode.
 169
 170 =item `mhpath +`/.minc.context
 171
 172 B<minc> uses this file for context (i.e. current folder) instead of
 173 `mhpath +`/context.  This allows you some amount of freedom to use mh
 174 while B<minc> is running.  For example, you can changed folders without
 175 causing a message to be stored in the wrong folder.  Care must still
 176 be taken, however, about the unseen sequence; if you change it
 177 (whether via show, or mark, or anything else) while in the same folder
 178 as B<minc>, it is likely the new message B<minc> stores will not end up
 179 marked unseen.
 180
 181 =back
 182
 183 =cut
 184
 185 our @FILTERS;
 186 our (@start_hooks, @stop_hooks, @filter_hooks, @post_store_hooks);
 187
 188 my $mh;
 189 my $logfile;
 190
 191 $mh = `mhpath +`;
 192 chomp($mh);
 193
 194 if ($run) {
 195     $logfile = $mh . '/logs/minc.log';
 196 } else {
 197     $logfile = $mh . '/logs/dryrun.log';
 198 }
 199
 200 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
 201
 202 \f
 203 ###############################################################################
 204 # Logging
 205
 206 sub mylog {
 207     my $timestamp = strftime('%b %e %H:%M:%S', localtime);
 208     my $msg;
 209     foreach my $part (@_) {
 210         if (defined($part)) {
 211             $msg .= $part;
 212         }
 213     }
 214     # no newlines in the log message, thanks
 215     $msg =~ s/\n/ /gm;
 216
 217     open(LOG, ">>$logfile") or die("open(>>$logfile): $!");
 218     print(LOG "$timestamp $msg\n") or die("print(>>$logfile): $!");
 219     close(LOG) or die("close($logfile): $!");
 220 }
 221
 222 sub logheader {
 223     my ($text, @contents) = @_;
 224     my $last;
 225
 226     if (@contents) {
 227         $last = $contents[-1];
 228     } else {
 229         $last = '';
 230     }
 231
 232     mylog('<< ', $text, $last);
 233 }
 234
 235 sub log_headers {
 236     my %headers = @_;
 237
 238     # For an explanation of the %headers structure, see the
 239     # get_headers function below.
 240     logheader('From:       ', @{$headers{'return-path'}});
 241     logheader('To:         ', @{$headers{'to'}});
 242     logheader('Subject:    ', @{$headers{'subject'}});
 243     logheader('Message-Id: ', @{$headers{'message-id'}});
 244 }
 245
 246 \f
 247 ###############################################################################
 248 # Utility procedures
 249
 250 sub mkfolder {
 251     my $folder = shift;
 252     my $target;
 253     my $component;
 254
 255     $target = $mh;
 256     foreach $component (split('/', $folder)) {
 257         $target = join('/', $target, $component);
 258         -d $target or mkdir($target) or die("mkdir($target): $!");
 259     }
 260 }
 261
 262 sub getfiles {
 263     my $dir = shift;
 264     my @result;
 265
 266     if (not opendir(DIR, $dir)) {
 267         die("opendir($dir): $!");
 268     }
 269
 270     # Initialize $! to 0 (success) because stupid stupid Perl provides
 271     # no way to distinguish an error from an empty directory; that
 272     # means setting $! to 0 and testing it afterwards is the only way
 273     # to detect an error.  Real Programmers don't handle errors,
 274     # right?  >sigh<
 275     $! = 0;
 276     @result = readdir(DIR);
 277     if ($! != 0) {
 278         die("readdir($dir): $!");
 279     }
 280
 281     if (@result <= 2) {
 282         exit(0);
 283     }
 284
 285     STDOUT->autoflush(1);
 286     print(@result - 2, " messages...");
 287
 288     closedir(DIR);
 289
 290     return @result;
 291 }
 292
 293 sub get_highest_msgnum {
 294     my $mhfolder = shift;
 295     my $dir;
 296     my @list;
 297     my $highest;
 298     my $msgnum;
 299
 300     $dir = "$mh/$mhfolder";
 301     if (not opendir(DIR, $dir)) {
 302         die("opendir($dir): $!");
 303     }
 304
 305     # Insert rant from getfiles here.
 306     $! = 0;
 307     @list = readdir(DIR);
 308
 309     if ($! != 0) {
 310         die("readdir($dir): $!");
 311     }
 312
 313     closedir(DIR);
 314
 315     $highest = 0;
 316     foreach $msgnum (@list) {
 317         # Look for integers.
 318         if ($msgnum =~ /^[0-9]+$/) {
 319             if ($msgnum > $highest) {
 320                 $highest = $msgnum;
 321             }
 322         }
 323     }
 324
 325     return $highest;
 326 }
 327
 328 sub store_message {
 329     my $msg = shift;
 330     my $mhfolder = shift;
 331     my $msgnum;
 332     my $try;
 333     my $mhmsg;
 334     my $status;
 335
 336     # We must do this even in -n mode because later steps fail without
 337     # it.  This should be harmless.
 338     mkfolder($mhfolder);
 339
 340     # This loop is a modified version of the maildir delivery algorithm.
 341     $msgnum = get_highest_msgnum($mhfolder);
 342     for ($try = 0; ; $try++) {
 343         $msgnum++;
 344         $mhmsg = "$mh/$mhfolder/$msgnum";
 345
 346         if (not stat($mhmsg)) {
 347             if ($!{ENOENT}) {
 348                 # Now we have a non-existent file, let's try to create
 349                 # it.  We must create a zero-byte file first because a
 350                 # file my appear between our happy stat results and
 351                 # our later rename(2), which would clobber said file.
 352                 # So attempt to create a file with this name.  If it
 353                 # succeeds, in just a bit here we'll knowingly clobber
 354                 # this file with the rename(2) call.
 355
 356                 # Another way to do this is not to use rename(2), but
 357                 # use link(2) + unlink(2) instead.  That's how the
 358                 # standard maildir algorithm does it.  Each method has
 359                 # a disadvantage: the program may crash between the
 360                 # link(2) and unlink(2) calls.  With the standard
 361                 # maildir algorithm, that means the message will end
 362                 # up duplicated.  The advantage of creating an empty
 363                 # file followed by rename(2) is that an extra empty
 364                 # file is left behind as opposed to a duplicate
 365                 # message.  This is more easily detected by the user.
 366                 if ($run) {
 367                     if (sysopen(MSG, $mhmsg,
 368                                 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
 369                         close(MSG);
 370                         last;
 371                     }
 372                 } else {
 373                     last;
 374                 }
 375             }
 376         }
 377
 378         # This algorithm is different from the maildir one; let's make
 379         # 10 tries instead of 3.
 380         if ($try == 9) {
 381             die("Attempted filename $mhmsg exists.");
 382         }
 383
 384         # This algorithm is different; i don't think we need to sleep.
 385         #sleep(2);
 386     }
 387
 388     if ($mhfolder ne 'SPAM') {
 389         mylog('+', $mhfolder);
 390     }
 391
 392     if ($run) {
 393         if (not rename($msg, $mhmsg)) {
 394             die("rename($msg, $mhmsg): $!");
 395         }
 396
 397         # Mark each message as soon as we store it and bomb if that
 398         # fails.  While it is slow, it is not safe to store multiple
 399         # messages and then have a failure before marking some (or
 400         # all).
 401         if ($mhfolder eq 'SPAM') {
 402             push(@SPAM, $msgnum);
 403         } else {
 404             $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
 405                              'unseen', '-add');
 406             # XXX need to handle signalled and stopped, and print
 407             # the exit code or signal number.
 408             if (not WIFEXITED($status)) {
 409                 die("Failed to run mark");
 410             } elsif (WEXITSTATUS($status) != 0) {
 411                 die("Failed to mark message unseen.");
 412             }
 413         }
 414     }
 415
 416     return $msgnum;
 417 }
 418
 419 # Parse a message file into a structure describing the headers.  The
 420 # structure is a hash of arrays.  The hash keys are the names of the
 421 # headers found in the message, made all lower-case.  Each item in
 422 # the hash is an array of header text.  The array has one element
 423 # per occurrence of the header.  Most headers will only have a single
 424 # element because they occur only once.  The Received header is the
 425 # most common header with multiple texts.
 426 sub get_headers {
 427     my $msg = shift;
 428     my %headers;
 429     my $current;                # current header, used for unfolding lines
 430     my $fieldname;              # unmolested header name
 431     my $contents;               # contents of header
 432
 433     open(MSG, $msg) or die("open(MSG, $msg): $!");
 434     while (<MSG>) {
 435         chomp;
 436         if (length == 0) {
 437             last;
 438         }
 439
 440         if (/^\s/) {
 441             # folded header continuation
 442
 443             if (not defined($current)) {
 444                 warn('Malformed message, cannot parse headers.');
 445                 next;
 446             }
 447
 448             @{$headers{$current}}[-1] .= $_;
 449         } else {
 450             ($fieldname) = split(/:/);
 451             $current = lc($fieldname);
 452             (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
 453
 454             if (defined($headers{$current})) {
 455                 # multiple occurence of same header
 456                 push(@{$headers{$current}}, $contents);
 457             } else {
 458                 # first occurence of a header
 459                 $headers{$current} = [$contents];
 460             }
 461         }
 462     }
 463     close(MSG);
 464
 465     return %headers;
 466 }
 467
 468 \f
 469 ###############################################################################
 470 # Filtering
 471
 472 sub find_mh_folder {
 473     my $msg = shift;
 474     my %headers = @_;
 475     my $filterref;
 476     my @filter;
 477     my $header;
 478     my $contents;
 479     my $pair;
 480     my $match;
 481     my $expression;
 482     my $result;
 483
 484     if (not %headers) {
 485         return 'malformed';
 486     }
 487
 488     # Walk the list of filters.  This structure is documented in
 489     # pod at the end of the program.
 490     foreach $filterref (@FILTERS) {
 491         @filter = @$filterref;
 492         $header = shift(@filter);
 493
 494         # Handle filters using the magic TO header specially.
 495         if ($header eq $MAGIC_TO_TOKEN) {
 496             foreach $header (keys(%headers)) {
 497                 if ($header =~ /$MAGIC_TO_REGEX/i) {
 498                     foreach $contents (@{$headers{lc($header)}}) {
 499                         foreach $pair (@filter) {
 500                             ($match, $expression) = @$pair;
 501                             if ($contents =~ /$match/) {
 502                                 return $expression;
 503                             }
 504                         }
 505                     }
 506                 }
 507             }
 508
 509             # Now that it's been processed specially, skip normal handling.
 510             next;
 511         }
 512
 513         # Walk the list of message headers matching the filter's
 514         # specified header.
 515         foreach $contents (@{$headers{lc($header)}}) {
 516             # Walk the filter's list of match/expression pairs.
 517             foreach $pair (@filter) {
 518                 ($match, $expression) = @$pair;
 519                 if ($contents =~ /$match/i) {
 520                     if (eval "\$result = \"$expression\"") {
 521                         return $result;
 522                     }
 523                 }
 524             }
 525         }
 526     }
 527
 528     return 'inbox';
 529 }
 530
 531 sub filter_mail {
 532     my $msgcount = @_ - 2; # don't count . and ..
 533     my $len = length($msgcount);
 534     my @baton;
 535     my $msg;
 536     my $mhfolder;
 537     my $spam = 0;
 538     my $saved = 0;
 539     my $msgnum;
 540
 541     # XXX lame names and hard-coded proportions.
 542     my $nf = int($COLUMNS * 0.1);
 543     my $nm = int($COLUMNS * 0.0625);
 544     my $nF = int($COLUMNS * 0.175);
 545     my $ns = $COLUMNS - $nf - $nm - $nF - 3;
 546
 547     if (-f "$HOME/.minc") {
 548         require "$HOME/.minc";
 549     }
 550
 551     my %batons;
 552     for my $hook (@start_hooks) {
 553         my ($handle, @baton) = $hook->();
 554         if (defined($handle)) {
 555             $batons{$handle} = [@baton];
 556         }
 557     }
 558
 559     for $msg (@_) {
 560         ($msg eq '.' or $msg eq '..') and next;
 561
 562         if ($printfilenames) {
 563             print("$msg\n");
 564         }
 565
 566         my %headers = get_headers($msg);
 567         log_headers(%headers);
 568
 569         undef($mhfolder);
 570         for my $hook (@filter_hooks) {
 571             my $result = $hook->(\%batons, \%headers, $msg);
 572             defined($result) and ($mhfolder = $result);
 573         }
 574
 575         defined($mhfolder) or ($mhfolder = find_mh_folder($msg, %headers));
 576
 577         $msgnum = store_message($msg, $mhfolder);
 578
 579         if ($mhfolder eq 'SPAM') {
 580             $spam++;
 581         } else {
 582             $saved++;
 583             my $from = [@{$headers{'from'}}]->[-1];
 584             my $subject = [@{$headers{'subject'}}]->[-1];
 585             for ($from, $subject) {
 586                 tr/\x00-\x1f\x80-\xff/ /;
 587             }
 588             print("\r");
 589             print(' ' x $COLUMNS);
 590             printf("\r\%-${nf}s \%${nm}d \%-${nF}s \%s\n",
 591                    substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm),
 592                    # XXX shouldn't pop, as these are about to be
 593                    # passed to post_store_hooks
 594                    substr($from, 0, $nF),
 595                    substr($subject, 0, $ns));
 596         }
 597
 598         for my $hook (@post_store_hooks) {
 599             $hook->(\%batons, \%headers, $mhfolder, $msgnum);
 600         }
 601
 602         print("\r");
 603         printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d",
 604                $spam, $saved, $spam + $saved, $msgcount);
 605
 606         defined($maxmsgs) and ($spam + $saved < $maxmsgs or last);
 607     }
 608     print("\n");
 609
 610     for my $hook (@stop_hooks) {
 611         $hook->(\%batons);
 612     }
 613 }
 614
 615 \f
 616 MAIN: {
 617     my $st;
 618
 619     if ($dumpfilters) {
 620         require "$HOME/.minc";
 621         $Data::Dumper::Indent = 1;
 622         print(Dumper(\@FILTERS));
 623         exit;
 624     }
 625
 626     chdir("$MAILDIR/new") or die("chdir($MAILDIR/new): $!");
 627     filter_mail(map { $_->[1] }
 628                 sort { $a->[0] <=> $b->[0] }
 629                 map {
 630                     if (not ($st = stat($_))) {
 631                         die("stat($_): $!");
 632                     }
 633                     [$st->mtime, $_]
 634                 }
 635                 getfiles('.'));
 636
 637     @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
 638 }
 639
 640 \f
 641 __END__
 642
 643 =head1 THE FILTERS STRUCTURE
 644
 645 The user's .mincfilter file must define the @FILTERS structure.  This
 646 structure is an array.  Each element of @FILTERS is a filter.  A
 647 filter is itself an array.  The first element of a filter is a string,
 648 the name of the header this filter acts upon.  The header name is not
 649 case-sensitive.  Each subsequent element of a filter is a pair (i.e. a
 650 two-element array): first, a regular expression B<minc> uses to
 651 determine whether this filter matches or not, and second, an
 652 expression which B<minc> evaluates to get the folder name.
 653
 654 B<minc> decides where to store a message by iterating over the
 655 @FILTERS array.  It tests each regexp of each filter against all
 656 headers matching that filter's specified header.  As soon as a match
 657 is found, B<minc> evaluates the second part of the pair.  This part
 658 may contain positional parameters from the matched regexp ($1, $2,
 659 etc.).  The result of this expression is used as the folder name.
 660
 661 Multiple occurrences of the same header are preserved and tested
 662 individually against the filters.  That means, for example, that
 663 filters can search all the Received headers for a certain string.
 664
 665 It is important to note that all the arrays of the @FILTERS structure
 666 are traversed I<in order>.  This means the filters can be arranged so
 667 that some have priority over others.  XXX: get Doug to write an
 668 example of having the same header matched more than once.
 669
 670 Lastly, B<minc> supports a magic ' TO' header.  Filters using this
 671 header are matched against a collection of headers related to (and
 672 including) the To.  Which headers to use is determined by a regular
 673 expression borrowed from procmail.
 674
 675 =head1 EXAMPLES
 676
 677 @FILTERS = (
 678
 679   ['List-Id',
 680    ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
 681    ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
 682    ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
 683   ],
 684
 685   ['List-Post',
 686    ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
 687   ],
 688
 689   ['To',
 690    ['root', 'root'],
 691   ],
 692 );
 693
 694 The first List-Id filter is a surprisingly effective one which creates
 695 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
 696 all Sourceforge lists to which i subscribe.  Certainly there are lists
 697 hosted at Sourceforge which do not work well with this, and this is
 698 where it is important to remember that the first match is the one
 699 uses.  Simply put the more specific rules before this one.
 700
 701 The next List-Id example is simple.  The swig example demonstrates
 702 that the folder name does not have to use a portion of the matched
 703 header; you can just hardcode the folder name.
 704
 705 The List-Post example is a nice one.  Most ASF projects have their own
 706 apache.org subdomain, with mailing lists hosted there.  So, given a
 707 list such as dev@httpd.apache.org, this filter will create the folder
 708 name l/apache/httpd/dev.
 709
 710 For an example B<spam_check> function, see
 711 L<http://pretzelnet.org/cvs/dotfiles/.mincspam>
 712
 713 =head1 AUTHORS
 714
 715 Written by Eric Gillespie <epg@pretzelnet.org>.  Design by Eric
 716 Gillespie and Doug Porter <dsp@waterspout.com>.
 717
 718 This program is free software; you can redistribute it and/or modify
 719 it under the same terms as Perl itself.
 720
 721 =cut
 722
 723 # Local variables:
 724 # cperl-indent-level: 4
 725 # perl-indent-level: 4
 726 # indent-tabs-mode: nil
 727 # End:
 728
 729 # vi: set tabstop=4 expandtab: