diplodocus.org Git - minc/blob - minc

   1 #! /usr/local/bin/perl
   2
   3 # $Id$
   4
   5 =head1 NAME
   6
   7 B<minc> - Incorporate mail from a maildir into mh folders.
   8
   9 =head1 SYNOPSIS
  10
  11 B<minc> [-B<dnp>]
  12
  13 =head1 DESCRIPTION
  14
  15 B<minc> incorporates mail from a maildir to a mh folder hierarchy.  It
  16 takes mail from a maildir folder (not a maildir folder hierarchy),
  17 optionally checks for spam with a user-defined spam-checking function,
  18 and optionally filters mail into separate mh folders.
  19
  20 The filtering is quite sophisticated, as it is done using real Perl
  21 matching (m//) commands.
  22
  23 =cut
  24
  25 use strict;
  26 use warnings;
  27
  28 use Data::Dumper;
  29 use Errno;
  30 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
  31 use FileHandle;
  32 use File::Basename;
  33 use File::stat;
  34 use Getopt::Std; $Getopt::Std::STANDARD_HELP_VERSION = 1;
  35 use Log::Dispatch;
  36 use Log::Dispatch::File;
  37 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
  38
  39 our $VERSION = 1;
  40
  41 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
  42 # against headers matching this regex (taken from procmail).
  43 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
  44 my $MAGIC_TO_TOKEN = ' TO';
  45
  46 # List of SPAM message numbers, scanned at the end so the user can
  47 # check for false positives.
  48 my @SPAM;
  49
  50 =head1 OPTIONS
  51
  52 =over 4
  53
  54 =item B<-d>
  55
  56 Dump (using Data::Dumper) the FILTERS list and exit.  This is useful
  57 for testing the syntax of .mincfilter.
  58
  59 =item B<--help>
  60
  61 Show help.
  62
  63 =item B<-n>
  64
  65 Dry run; do not actually incorporate the mail, but log and report to
  66 stdout/stderr as normal.
  67
  68 =item B<-p>
  69
  70 Print the filename of each message before checking it for spam.  This
  71 can be handy if a particular message is giving the spam checker a
  72 problem.
  73
  74 =back
  75
  76 =cut
  77
  78 my $dumpfilters = 0;
  79 our $run = 1;
  80 my $printfilenames = 0;
  81
  82 our $opt_d;
  83 our $opt_n;
  84 our $opt_p;
  85
  86 if (not getopts('dnp')) {
  87     exit(2);
  88 }
  89
  90 if ($opt_d) {
  91     $dumpfilters = 1;
  92 } elsif ($opt_n) {
  93     $run = 0;
  94 }
  95
  96 if ($opt_p) {
  97     $printfilenames = 1;
  98 }
  99
 100 =head1 ENVIRONMENT
 101
 102 =over 4
 103
 104 =item HOME
 105
 106 Where configuration files (.mincfilter) are found.  Also,
 107 $HOME/Maildir is used for the maildir if MAILDIR is not set.
 108
 109 =item MAILDIR
 110
 111 Where mail is delivered.
 112
 113 =back
 114
 115 =cut
 116
 117 use Env qw(HOME MAILDIR);
 118
 119 if (not $HOME) {
 120     die("HOME environment variable must be set.\n");
 121 }
 122 if (not $MAILDIR) {
 123     $MAILDIR = "$HOME/Maildir";
 124 }
 125
 126 =head1 FILES
 127
 128 =over 4
 129
 130 =item $HOME/.mincfilter
 131
 132 This file is Perl code (included via the 'require' directive) which is
 133 expected to define the FILTERS list.
 134
 135 =item $HOME/.mincspam
 136
 137 If this file exists, B<minc> will include it with the expectation that
 138 it will define a B<spam_check> function.  This function takes a
 139 message filename as an argument and returns 1 if the message is spam,
 140 else 0.  If this file does not exist, B<minc> will define a simple
 141 function that always returns 0.
 142
 143 One of B<minc>'s global variables is available to the user-defined
 144 B<spam_check> function: $run.  This boolean should be honored;
 145 B<spam_check> should only take real action (i.e. removing or creating
 146 files, running external programs, etc.) if $run is non-zero.
 147
 148 This file may also declare two other functions: B<spam_start_hook> and
 149 B<spam_stop_hook>.  The former is passed no arguments and is expected
 150 to return a list.  This list is a "baton" that will also be passed to
 151 B<spam_check> and B<spam_stop_hook>.  It can hold anything
 152 B<spam_check> will need to do its job, whether network sockets, pipes,
 153 or whatever.
 154
 155 XXX: need more details about the spam-handling process; for now read
 156 the code.
 157
 158 =item `mhpath +`/logs/minc.log
 159
 160 Where B<minc> logs what it does, unless in -n mode.
 161
 162 =item `mhpath +`/logs/dryrun.log
 163
 164 Where B<minc> logs what it would do; used in -n mode.
 165
 166 =item `mhpath +`/.minc.context
 167
 168 B<minc> uses this file for context (i.e. current folder) instead of
 169 `mhpath +`/context.  This allows you some amount of freedom to use mh
 170 while B<minc> is running.  For example, you can changed folders without
 171 causing a message to be stored in the wrong folder.  Care must still
 172 be taken, however, about the unseen sequence; if you change it
 173 (whether via show, or mark, or anything else) while in the same folder
 174 as B<minc>, it is likely the new message B<minc> stores will not end up
 175 marked unseen.
 176
 177 =back
 178
 179 =cut
 180
 181 our @FILTERS;
 182 require "$HOME/.mincfilter";
 183
 184 if (-f "$HOME/.mincspam") {
 185     require "$HOME/.mincspam";
 186 } else {
 187     eval "sub spam_start_hook { return (); }";
 188     eval "sub spam_stop_hook { }";
 189     eval "sub spam_check { return 0; }";
 190 }
 191
 192 my $mh;
 193 my $logfile;
 194
 195 $mh = `mhpath +`;
 196 chomp($mh);
 197
 198 if ($run) {
 199     $logfile = $mh . '/logs/minc.log';
 200 } else {
 201     $logfile = $mh . '/logs/dryrun.log';
 202 }
 203
 204 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
 205
 206 \f
 207 ###############################################################################
 208 # Logging
 209
 210 # debuglevels for the logger
 211 use constant LOG_DEBUGLEVEL     => 'info';
 212 use constant SCREEN_DEBUGLEVEL  => 'debug';
 213
 214 # characters used to wrap around the id field in the log so I can more
 215 # easily parse the msg flow
 216 use constant DEBUGCHAR          => '%';
 217 use constant INFOCHAR           => '=';
 218 use constant INCOMINGCHAR       => '<';
 219 use constant SAVECHAR           => '>';
 220 use constant WARNCHAR           => 'W';
 221 use constant ERRORCHAR          => 'E';
 222
 223 my $logger = new Log::Dispatch;
 224 $logger->add(new Log::Dispatch::File (name=>'logfile',
 225                                       filename=>$logfile,
 226                                       min_level=>'info',
 227                                       mode=>'append'));
 228
 229 # log alias, handles getting passed vars that are undef
 230 sub mylog {
 231     my $level = shift;
 232     my $act = shift;
 233     my $timestamp = strftime('%b %e %H:%M:%S', localtime);
 234     my $msg;
 235     foreach my $part (@_) {
 236         if (defined($part)) {
 237             $msg .= $part;
 238         }
 239     }
 240     # no newlines in the log message, thanks
 241     $msg =~ s/\n/ /gm;
 242     $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg);
 243     $logger->log(level=>$level, message=>$msg);
 244
 245     if ($act eq SAVECHAR) {
 246         $logger->log(level=>$level, message=>"\n");
 247     }
 248 }
 249
 250 sub logsave { mylog('notice', SAVECHAR, @_); }
 251 sub loginfo { mylog('info', INFOCHAR, @_); }
 252 sub logdebug { mylog('debug', DEBUGCHAR, @_); }
 253 sub logwarn { mylog('warning', WARNCHAR, @_); }
 254 sub logerr { mylog('error', ERRORCHAR, @_); }
 255
 256 sub logincoming {
 257     my ($text, @contents) = @_;
 258     my $last;
 259
 260     if (@contents) {
 261         $last = $contents[-1];
 262     } else {
 263         $last = '';
 264     }
 265
 266     mylog('info', INCOMINGCHAR, $text, $last);
 267 }
 268
 269 sub log_headers {
 270     my %headers = @_;
 271
 272     # For an explanation of the %headers structure, see the
 273     # get_headers function below.
 274     logincoming('From:       ', @{$headers{'return-path'}});
 275     logincoming('To:         ', @{$headers{'to'}});
 276     logincoming('Subject:    ', @{$headers{'subject'}});
 277     logincoming('Message-Id: ', @{$headers{'message-id'}});
 278 }
 279
 280 \f
 281 ###############################################################################
 282 # Utility procedures
 283
 284 sub mkfolder {
 285     my $mhfolder = shift;
 286     my $folder;
 287     my $target;
 288     my $component;
 289
 290     $folder = $mh . '/' . $mhfolder;
 291     $target = '';
 292
 293     if (not -d $folder) {
 294         foreach $component (split('/', $folder)) {
 295             if (defined($component) and length($component) >= 1) {
 296                 $target = $target . '/' . $component;
 297                 if (-d $target or mkdir($target)) {
 298                     next;
 299                 } else {
 300                     die("Failed to create +$mhfolder ($component)");
 301                 }
 302             }
 303         }
 304     }
 305 }
 306
 307 sub getfiles {
 308     my $dir = shift;
 309     my @result;
 310
 311     if (not opendir(DIR, $dir)) {
 312         die("Failed opendir($dir)");
 313     }
 314
 315     # Initialize $! to 0 (success) because stupid stupid Perl provides
 316     # no way to distinguish an error from an empty directory; that
 317     # means setting $! to 0 and testing it afterwards is the only way
 318     # to detect an error.  Real Programmers don't handle errors,
 319     # right?  >sigh<
 320     $! = 0;
 321     @result = grep {
 322         ($_ ne '.' and $_ ne '..')
 323           and $_ = "$MAILDIR/new/$_"
 324       } readdir(DIR);
 325
 326     if ($! != 0) {
 327         die("Failed readdir($dir)");
 328     }
 329
 330     if (scalar(@result) == 0) {
 331         exit(0);
 332     }
 333
 334     closedir(DIR);
 335
 336     return @result;
 337 }
 338
 339 sub get_highest_msgnum {
 340     my $mhfolder = shift;
 341     my $dir;
 342     my @list;
 343     my $highest;
 344     my $msgnum;
 345
 346     $dir = "$mh/$mhfolder";
 347     if (not opendir(DIR, $dir)) {
 348         die("Failed opendir($dir)");
 349     }
 350
 351     $! = 0;
 352     @list = readdir(DIR);
 353
 354     if ($! != 0) {
 355         die("Failed readdir($dir)");
 356     }
 357
 358     closedir(DIR);
 359
 360     $highest = 0;
 361     foreach $msgnum (@list) {
 362         # Look for integers.
 363         if ($msgnum =~ /^[0-9]+$/) {
 364             if ($msgnum > $highest) {
 365                 $highest = $msgnum;
 366             }
 367         }
 368     }
 369
 370     return $highest;
 371 }
 372
 373 sub store_message {
 374     my $msg = shift;
 375     my $mhfolder = shift;
 376     my $msgnum;
 377     my $try;
 378     my $mhmsg;
 379     my $status;
 380
 381     # We must do this even in -n mode because later steps fail without
 382     # it.  This should be harmless.
 383     mkfolder($mhfolder);
 384
 385     # This loop is a modified version of the maildir delivery algorithm.
 386     $msgnum = get_highest_msgnum($mhfolder);
 387     for ($try = 0; ; $try++) {
 388         $msgnum++;
 389         $mhmsg = "$mh/$mhfolder/$msgnum";
 390
 391         if (not stat($mhmsg)) {
 392             if ($!{ENOENT}) {
 393                 # Now we have a non-existent file, let's try to create
 394                 # it.  We must create a zero-byte file first because a
 395                 # file my appear between our happy stat results and
 396                 # our later rename(2), which would clobber said file.
 397                 # So attempt to create a file with this name.  If it
 398                 # succeeds, in just a bit here we'll knowingly clobber
 399                 # this file with the rename(2) call.
 400
 401                 # Another way to do this is not to use rename(2), but
 402                 # use link(2) + unlink(2) instead.  That's how the
 403                 # standard maildir algorithm does it.  Each method has
 404                 # a disadvantage: the program may crash between the
 405                 # link(2) and unlink(2) calls.  With the standard
 406                 # maildir algorithm, that means the message will end
 407                 # up duplicated.  The advantage of creating an empty
 408                 # file followed by rename(2) is that an extra empty
 409                 # file is left behind as opposed to a duplicate
 410                 # message.  This is more easily detected by the user.
 411                 if ($run) {
 412                     if (sysopen(MSG, "$mhmsg",
 413                                 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
 414                         close(MSG);
 415                         last;
 416                     }
 417                 } else {
 418                     last;
 419                 }
 420             }
 421         }
 422
 423         # This algorithm is different from the maildir one; let's make
 424         # 10 tries instead of 3.
 425         if ($try == 9) {
 426             die("Attempted filename $mhmsg exists.");
 427         }
 428
 429         # This algorithm is different; i don't think we need to sleep.
 430         #sleep(2);
 431     }
 432
 433     if ($mhfolder ne 'SPAM') {
 434         logsave("+$mhfolder");
 435     }
 436
 437     if ($run) {
 438         if (not rename($msg, $mhmsg)) {
 439             die("Failed rename($msg, $mhmsg)");
 440         }
 441
 442         # Mark each message as soon as we store it and bomb if that
 443         # fails.  While it is slow, it is not safe to store multiple
 444         # messages and then have a failure before marking some (or
 445         # all).
 446         if ($mhfolder eq 'SPAM') {
 447             push(@SPAM, $msgnum);
 448         } else {
 449             $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
 450                              'unseen', '-add');
 451             if (not WIFEXITED($status)) {
 452                 die("Failed to run mark");
 453             } elsif (WEXITSTATUS($status) != 0) {
 454                 die("Failed to mark message unseen.");
 455             }
 456         }
 457     }
 458
 459     return $msgnum;
 460 }
 461
 462 # Parse a message file into a structure describing the headers.  The
 463 # structure is a hash of arrays.  The hash keys are the names of the
 464 # headers found in the message, made all lower-case.  Each item in
 465 # the hash is an array of header text.  The array has one element
 466 # per occurrence of the header.  Most headers will only have a single
 467 # element because they occur only once.  The Received header is the
 468 # most common header with multiple texts.
 469 sub get_headers {
 470     my $msg = shift;
 471     my %headers;
 472     my $current;                # current header, used for unfolding lines
 473     my $fieldname;              # unmolested header name
 474     my $contents;               # contents of header
 475
 476     open(MSG, $msg);
 477     while (<MSG>) {
 478         chomp;
 479         if (length == 0) {
 480             last;
 481         }
 482
 483         if (/^\s/) {
 484             # folded header continuation
 485
 486             if (not defined($current)) {
 487                 print(STDERR "Malformed message, cannot parse headers.\n");
 488                 return ();
 489             }
 490
 491             @{$headers{$current}}[-1] .= $_;
 492         } else {
 493             ($fieldname) = split(/:/);
 494             $current = lc($fieldname);
 495             (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
 496
 497             if (defined($headers{$current})) {
 498                 # multiple occurence of same header
 499                 push(@{$headers{$current}}, $contents);
 500             } else {
 501                 # first occurence of a header
 502                 $headers{$current} = [$contents];
 503             }
 504         }
 505     }
 506     close(MSG);
 507
 508     return %headers;
 509 }
 510
 511 \f
 512 ###############################################################################
 513 # Filtering
 514
 515 sub find_mh_folder {
 516     my $msg = shift;
 517     my %headers;
 518     my $filterref;
 519     my @filter;
 520     my $header;
 521     my $contents;
 522     my $pair;
 523     my $match;
 524     my $expression;
 525     my $result;
 526
 527     %headers = get_headers($msg);
 528     if (not %headers) {
 529         return 'malformed';
 530     }
 531
 532     log_headers(%headers);
 533
 534     # Walk the list of filters.  This structure is documented in
 535     # pod at the end of the program.
 536     foreach $filterref (@FILTERS) {
 537         @filter = @$filterref;
 538         $header = shift(@filter);
 539
 540         # Handle filters using the magic TO header specially.
 541         if ($header eq $MAGIC_TO_TOKEN) {
 542             foreach $header (keys(%headers)) {
 543                 if ($header =~ /$MAGIC_TO_REGEX/i) {
 544                     foreach $contents (@{$headers{lc($header)}}) {
 545                         foreach $pair (@filter) {
 546                             ($match, $expression) = @$pair;
 547                             if ($contents =~ /$match/) {
 548                                 return $expression;
 549                             }
 550                         }
 551                     }
 552                 }
 553             }
 554
 555             # Now that it's been processed specially, skip normal handling.
 556             next;
 557         }
 558
 559         # Walk the list of message headers matching the filter's
 560         # specified header.
 561         foreach $contents (@{$headers{lc($header)}}) {
 562             # Walk the filter's list of match/expression pairs.
 563             foreach $pair (@filter) {
 564                 ($match, $expression) = @$pair;
 565                 if ($contents =~ /$match/i) {
 566                     if (eval "\$result = \"$expression\"") {
 567                         return $result;
 568                     }
 569                 }
 570             }
 571         }
 572     }
 573
 574     return 'inbox';
 575 }
 576
 577 sub filter_mail {
 578     my @msglist = @_;
 579     my $msgcount = scalar(@msglist);
 580     my @baton;
 581     my $msg;
 582     my $mhfolder;
 583     my $spam = 0;
 584     my $saved = 0;
 585     my %FOLDERS = ('SPAM'=>1);
 586
 587     @baton = spam_start_hook();
 588
 589
 590     STDOUT->autoflush(1);
 591     print("$msgcount messages...");
 592     foreach $msg (@msglist) {
 593         if (spam_check($msg, @baton)) {
 594             $mhfolder = 'SPAM';
 595             $spam = scalar(@SPAM)+ 1;
 596         } else {
 597             $mhfolder = find_mh_folder($msg);
 598             $saved++;
 599         }
 600
 601         store_message($msg, $mhfolder);
 602
 603         print("\r");
 604         if (not $FOLDERS{$mhfolder}) {
 605             print(' ' x length($msgcount));
 606             print("                                \r+$mhfolder\n");
 607             $FOLDERS{$mhfolder} = 1;
 608         }
 609
 610         printf('%6d SPAM %6d saved %6d/%1d',
 611                $spam, $saved, $spam + $saved, $msgcount);
 612     }
 613     print("\n");
 614
 615     spam_stop_hook(@baton);
 616 }
 617
 618 \f
 619 MAIN: {
 620     my @msglist;
 621     my $st;
 622
 623     if ($dumpfilters) {
 624         $Data::Dumper::Indent = 1;
 625         print(Dumper(\@FILTERS));
 626         exit;
 627     }
 628
 629     @msglist = (
 630                 map { $_->[1] }
 631                 sort { $a->[0] <=> $b->[0] }
 632                 map {
 633                     if (not ($st = stat($_))) {
 634                         die("stat($_): $!");
 635                     }
 636                     [$st->mtime, $_]
 637                 }
 638                 getfiles("$MAILDIR/new"));
 639
 640     filter_mail(@msglist);
 641
 642     @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
 643 }
 644
 645 \f
 646 __END__
 647
 648 =head1 THE FILTERS STRUCTURE
 649
 650 The user's .mincfilter file must define the @FILTERS structure.  This
 651 structure is an array.  Each element of @FILTERS is a filter.  A
 652 filter is itself an array.  The first element of a filter is a string,
 653 the name of the header this filter acts upon.  The header name is not
 654 case-sensitive.  Each subsequent element of a filter is a pair (i.e. a
 655 two-element array): first, a regular expression B<minc> uses to
 656 determine whether this filter matches or not, and second, an
 657 expression which B<minc> evaluates to get the folder name.
 658
 659 B<minc> decides where to store a message by iterating over the
 660 @FILTERS array.  It tests each regexp of each filter against all
 661 headers matching that filter's specified header.  As soon as a match
 662 is found, B<minc> evaluates the second part of the pair.  This part
 663 may contain positional parameters from the matched regexp ($1, $2,
 664 etc.).  The result of this expression is used as the folder name.
 665
 666 Multiple occurrences of the same header are preserved and tested
 667 individually against the filters.  That means, for example, that
 668 filters can search all the Received headers for a certain string.
 669
 670 It is important to note that all the arrays of the @FILTERS structure
 671 are traversed I<in order>.  This means the filters can be arranged so
 672 that some have priority over others.  XXX: get Doug to write an
 673 example of having the same header matched more than once.
 674
 675 Lastly, B<minc> supports a magic ' TO' header.  Filters using this
 676 header are matched against a collection of headers related to (and
 677 including) the To.  Which headers to use is determined by a regular
 678 expression borrowed from procmail.
 679
 680 =head1 EXAMPLES
 681
 682 @FILTERS = (
 683
 684   ['List-Id',
 685    ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
 686    ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
 687    ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
 688   ],
 689
 690   ['List-Post',
 691    ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
 692   ],
 693
 694   ['To',
 695    ['root', 'root'],
 696   ],
 697 );
 698
 699 The first List-Id filter is a surprisingly effective one which creates
 700 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
 701 all Sourceforge lists to which i subscribe.  Certainly there are lists
 702 hosted at Sourceforge which do not work well with this, and this is
 703 where it is important to remember that the first match is the one
 704 uses.  Simply put the more specific rules before this one.
 705
 706 The next List-Id example is simple.  The swig example demonstrates
 707 that the folder name does not have to use a portion of the matched
 708 header; you can just hardcode the folder name.
 709
 710 The List-Post example is a nice one.  Most ASF projects have their own
 711 apache.org subdomain, with mailing lists hosted there.  So, given a
 712 list such as dev@httpd.apache.org, this filter will create the folder
 713 name l/apache/httpd/dev.
 714
 715 For an example B<spam_check> function, see
 716 L<http:E<047>E<047>pretzelnet.orgE<047>cvsE<047>dotfilesE<047>.mincspam>
 717
 718 =head1 AUTHORS
 719
 720 Written by Eric Gillespie <epg@pretzelnet.org> with logging code
 721 stolen from Adam Lazur <adam@lazur.org>.
 722
 723 Design by Eric Gillespie and Doug Porter <dsp@waterspout.com>.
 724
 725 This program is free software; you can redistribute it and/or modify
 726 it under the same terms as Perl itself.
 727
 728 =cut
 729
 730 # Local variables:
 731 # cperl-indent-level: 4
 732 # perl-indent-level: 4
 733 # indent-tabs-mode: nil
 734 # End:
 735
 736 # vi: set tabstop=4 expandtab: