diplodocus.org Git - minc/blob - minc

   1 #! /usr/local/bin/perl
   2
   3 # $Id$
   4
   5 =head1 NAME
   6
   7 B<minc> - Incorporate mail from a maildir into mh folders.
   8
   9 =head1 SYNOPSIS
  10
  11 B<minc> [-B<dhns>]
  12
  13 =head1 DESCRIPTION
  14
  15 B<minc> is a program for incorporating mail from a maildir to a mh
  16 folder hierarchy.  It takes mail from a maildir folder (not a maildir
  17 folder hierarchy), optionally checks for spam with a user-defined
  18 spam-checking function, and optionally filters mail into separate mh
  19 folders.
  20
  21 The filtering is quite sophisticated, as it is done using real Perl
  22 matching (m//) commands.
  23
  24 =cut
  25
  26 use strict;
  27 use warnings;
  28
  29 require 'sysexits.ph';
  30
  31 use Data::Dumper;
  32 use Errno;
  33 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
  34 use File::Basename;
  35 use Getopt::Std;
  36 use Log::Dispatch;
  37 use Log::Dispatch::File;
  38 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
  39
  40 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
  41 # against headers matching this regex (taken from procmail).
  42 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
  43 my $MAGIC_TO_TOKEN = ' TO';
  44
  45 =head1 OPTIONS
  46
  47 =over 4
  48
  49 =item B<-d>
  50
  51 Dump (using Data::Dumper) the FILTERS list and exit.  This is useful
  52 for testing the syntax of .mincfilter.
  53
  54 =item B<-f>
  55
  56 Filter only, then exit.  This is useful after running B<minc -s>.
  57
  58 =item B<-h>
  59
  60 Show help.
  61
  62 =item B<-n>
  63
  64 Dry run; do not actually incorporate the mail, but log and report to
  65 stdout/stderr as normal.
  66
  67 =item B<-s>
  68
  69 Process SPAM only, then exit, leaving all non-spam messages in the
  70 maildir.
  71
  72 =back
  73
  74 =cut
  75
  76 my $dumpfilters = 0;
  77 my $filteronly = 0;
  78 our $run = 1;
  79 my $spamonly = 0;
  80
  81 our $opt_d;
  82 our $opt_f;
  83 our $opt_h;
  84 our $opt_n;
  85 our $opt_s;                     # ;; # stupid cperl-mode
  86
  87 if (not getopts('dfhns')) {
  88     exit(&EX_USAGE);
  89 }
  90
  91 if ($opt_d) {
  92     $dumpfilters = 1;
  93 } elsif ($opt_h) {
  94     print("Sorry bub, no help.\n");
  95     exit(&EX_OK);
  96 } elsif ($opt_n) {
  97     $run = 0;
  98 }
  99
 100 if ($opt_f) {
 101     $filteronly = 1;
 102 } elsif ($opt_s) {              # ))){ # stupid cperl-mode
 103     $spamonly = 1;
 104 }
 105
 106 =head1 ENVIRONMENT
 107
 108 =over 4
 109
 110 =item HOME
 111
 112 Where configuration files (.mincfilter) are found.  Also,
 113 $HOME/Maildir is used for the maildir if MAILDIR is not set.
 114
 115 =item MAILDIR
 116
 117 Where mail is delivered.
 118
 119 =back
 120
 121 =cut
 122
 123 use Env qw(HOME MAILDIR);
 124
 125 if (not $HOME) {
 126     die("HOME environment variable must be set.\n");
 127 }
 128 if (not $MAILDIR) {
 129     $MAILDIR = "$HOME/Maildir";
 130 }
 131
 132 =head1 FILES
 133
 134 =over 4
 135
 136 =item $HOME/.mincfilter
 137
 138 This file is Perl code (included via the 'require' directive) which is
 139 expected to define the FILTERS list.
 140
 141 =item $HOME/.mincspam
 142
 143 If this file exists, B<minc> will include it with the expectation that it
 144 will define a B<is_spam> function.  This function takes a message
 145 filename as an argument and returns 1 if the message is spam, else 0.
 146 If this file does not exist, B<minc> will define a simple function that
 147 always returns 0.
 148
 149 One of B<minc>'s global variables is available to the user-defined
 150 is_spam function: $run.  This boolean should be honored; is_spam
 151 should only take real action (i.e. removing or creating files, running
 152 external programs, etc.) if $run is non-zero.
 153
 154 XXX: need more details about the spam-handling process; for now read
 155 the code.
 156
 157 =item `mhpath +`/logs/minc.log
 158
 159 Where B<minc> logs what it does, unless in -n mode.
 160
 161 =item `mhpath +`/logs/dryrun.log
 162
 163 Where B<minc> logs what it would do; used in -n mode.
 164
 165 =item `mhpath +`/.minc.context
 166
 167 B<minc> uses this file for context (i.e. current folder) instead of
 168 `mhpath +`/context.  This allows you some amount of freedom to use mh
 169 while B<minc> is running.  For example, you can changed folders without
 170 causing a message to be stored in the wrong folder.  Care must still
 171 be taken, however, about the unseen sequence; if you change it
 172 (whether via show, or mark, or anything else) while in the same folder
 173 as B<minc>, it is likely the new message B<minc> stores will not end up
 174 marked unseen.
 175
 176 =back
 177
 178 =cut
 179
 180 our @FILTERS;
 181 require "$HOME/.mincfilter";
 182
 183 if (-f "$HOME/.mincspam") {
 184     require "$HOME/.mincspam";
 185 } else {
 186     sub is_spam {
 187         return 0;
 188     }
 189 }
 190
 191 my $mh;
 192 my $logfile;
 193
 194 $mh = `mhpath +`;
 195 chomp($mh);
 196
 197 if ($run) {
 198     $logfile = $mh . '/logs/minc.log';
 199 } else {
 200     $logfile = $mh . '/logs/dryrun.log';
 201 }
 202
 203 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
 204
 205 \f
 206 ###############################################################################
 207 # Logging
 208
 209 # debuglevels for the logger
 210 use constant LOG_DEBUGLEVEL     => 'info';
 211 use constant SCREEN_DEBUGLEVEL  => 'debug';
 212
 213 # characters used to wrap around the id field in the log so I can more
 214 # easily parse the msg flow
 215 use constant DEBUGCHAR          => '%';
 216 use constant INFOCHAR           => '=';
 217 use constant INCOMINGCHAR       => '<';
 218 use constant SAVECHAR           => '>';
 219 use constant WARNCHAR           => 'W';
 220 use constant ERRORCHAR          => 'E';
 221
 222 my $logger = new Log::Dispatch;
 223 $logger->add(new Log::Dispatch::File (name=>'logfile',
 224                                       filename=>$logfile,
 225                                       min_level=>'info',
 226                                       mode=>'append'));
 227
 228 # log alias, handles getting passed vars that are undef
 229 sub mylog {
 230     my $level = shift;
 231     my $act = shift;
 232     my $timestamp = strftime('%b %e %H:%M:%S', localtime);
 233     my $msg;
 234     foreach my $part (@_) {
 235         if (defined($part)) {
 236             $msg .= $part;
 237         }
 238     }
 239     # no newlines in the log message, thanks
 240     $msg =~ s/\n/ /gm;
 241     $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg);
 242     $logger->log(level=>$level, message=>$msg);
 243
 244     if ($act eq SAVECHAR) {
 245         $logger->log(level=>$level, message=>"\n");
 246     }
 247 }
 248
 249 sub logsave { mylog('notice', SAVECHAR, @_); }
 250 sub loginfo { mylog('info', INFOCHAR, @_); }
 251 sub logdebug { mylog('debug', DEBUGCHAR, @_); }
 252 sub logwarn { mylog('warning', WARNCHAR, @_); }
 253 sub logerr { mylog('error', ERRORCHAR, @_); }
 254
 255 sub logincoming {
 256     my ($text, @contents) = @_;
 257     my $last;
 258
 259     if (@contents) {
 260         $last = $contents[-1];
 261     } else {
 262         $last = '';
 263     }
 264
 265     mylog('info', INCOMINGCHAR, $text, $last);
 266 }
 267
 268 sub log_headers {
 269     my %headers = @_;
 270
 271     # For an explanation of the %headers structure, see the
 272     # get_headers function below.
 273     logincoming('From:       ', @{$headers{'return-path'}});
 274     logincoming('To:         ', @{$headers{'to'}});
 275     logincoming('Subject:    ', @{$headers{'subject'}});
 276     logincoming('Message-Id: ', @{$headers{'message-id'}});
 277 }
 278
 279 \f
 280 ###############################################################################
 281 # Utility procedures
 282
 283 sub _errprint {
 284     printf(STDERR '%s:%s', __FILE__, __LINE__);
 285
 286     if (@_) {
 287         print(STDERR ': ');
 288         foreach (@_) {
 289             print(STDERR);
 290         }
 291     }
 292 }
 293
 294 sub err {
 295     my $ex = shift;
 296
 297     _errprint(@_);
 298     print(STDERR ": $!\n");
 299
 300     exit($ex);
 301 }
 302
 303 sub errx {
 304     my $ex = shift;
 305
 306     _errprint(@_);
 307     print(STDERR "\n");
 308
 309     exit($ex);
 310 }
 311
 312 sub mkfolder {
 313     my $mhfolder = shift;
 314     my $folder;
 315     my $target;
 316     my $component;
 317
 318     $folder = $mh . '/' . $mhfolder;
 319     $target = '';
 320
 321     if (not -d $folder) {
 322         foreach $component (split('/', $folder)) {
 323             if (defined($component) and length($component) >= 1) {
 324                 $target = $target . '/' . $component;
 325                 if (-d $target or mkdir($target)) {
 326                     next;
 327                 } else {
 328                     err(&EX_OSERR,
 329                         "Failed to create +$mhfolder ($component)");
 330                 }
 331             }
 332         }
 333     }
 334 }
 335
 336 sub getfiles {
 337     my $dir = shift;
 338     my @result;
 339
 340     if (not opendir(DIR, $dir)) {
 341         err(&EX_OSERR, "Failed opendir($dir)");
 342     }
 343
 344     # Initialize $! to 0 (success) because stupid stupid Perl provides
 345     # no way to distinguish an error from an empty directory; that
 346     # means setting $! to 0 and testing it afterwards is the only way
 347     # to detect an error.  Real Programmers don't handle errors,
 348     # right?  >sigh<
 349     $! = 0;
 350     @result = grep {
 351         ($_ ne '.' and $_ ne '..')
 352           and $_ = "$MAILDIR/new/$_"
 353       } readdir(DIR);
 354
 355     if ($! != 0) {
 356         err(&EX_OSERR, "Failed readdir($dir)");
 357     }
 358
 359     closedir(DIR);
 360
 361     return @result;
 362 }
 363
 364 sub get_highest_msgnum {
 365     my $mhfolder = shift;
 366     my $dir;
 367     my @list;
 368     my $highest;
 369     my $msgnum;
 370
 371     $dir = "$mh/$mhfolder";
 372     if (not opendir(DIR, $dir)) {
 373         err(&EX_OSERR, "Failed opendir($dir)");
 374     }
 375
 376     $! = 0;
 377     @list = readdir(DIR);
 378
 379     if ($! != 0) {
 380         err(&EX_OSERR, "Failed readdir($dir)");
 381     }
 382
 383     closedir(DIR);
 384
 385     $highest = 0;
 386     foreach $msgnum (@list) {
 387         # Look for integers.
 388         if ($msgnum =~ /^[0-9]+$/) {
 389             if ($msgnum > $highest) {
 390                 $highest = $msgnum;
 391             }
 392         }
 393     }
 394
 395     return $highest;
 396 }
 397
 398 # We want to print the name of each list that has new mail only once,
 399 # so use this hash to store the lists that have already been printed.
 400 # Start the list out with SPAM already in it, since we don't care when
 401 # new messages are added to it.
 402 my %FOLDERS = ('SPAM'=>1);
 403
 404 sub store_message {
 405     my $msg = shift;
 406     my $mhfolder = shift;
 407     my $msgnum;
 408     my $try;
 409     my $mhmsg;
 410     my $status;
 411
 412     # We must do this even in -n mode because later steps fail without
 413     # it.  This should be harmless.
 414     mkfolder($mhfolder);
 415
 416     # This loop is a modified version of the maildir delivery algorithm.
 417     $msgnum = get_highest_msgnum($mhfolder);
 418     for ($try = 0; ; $try++) {
 419         $msgnum++;
 420         $mhmsg = "$mh/$mhfolder/$msgnum";
 421
 422         if (not stat($mhmsg)) {
 423             if ($!{ENOENT}) {
 424                 # Now we have a non-existent file, let's try to create
 425                 # it.  We must create a zero-byte file first because a
 426                 # file my appear between our happy stat results and
 427                 # our later rename(2), which would clobber said file.
 428                 # So attempt to create a file with this name.  If it
 429                 # succeeds, in just a bit here we'll knowingly clobber
 430                 # this file with the rename(2) call.
 431
 432                 # Another way to do this is not to use rename(2), but
 433                 # use link(2) + unlink(2) instead.  That's how the
 434                 # standard maildir algorithm does it.  Each method has
 435                 # a disadvantage: the program may crash between the
 436                 # link(2) and unlink(2) calls.  With the standard
 437                 # maildir algorithm, that means the message will end
 438                 # up duplicated.  The advantage of creating an empty
 439                 # file followed by rename(2) is that an extra empty
 440                 # file is left behind as opposed to a duplicate
 441                 # message.  This is more easiliy detected by the user.
 442                 if ($run) {
 443                     if (sysopen(MSG, "$mhmsg",
 444                                 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
 445                         close(MSG);
 446                         last;
 447                     }
 448                 } else {
 449                     last;
 450                 }
 451             }
 452         }
 453
 454         # This algorithm is different from the maildir one; let's make
 455         # 10 tries instead of 3.
 456         if ($try == 9) {
 457             errx(&EX_TEMPFAIL, "Attempted filename $mhmsg exists.");
 458         }
 459
 460         # This algorithm is different; i don't think we need to sleep.
 461         #sleep(2);
 462     }
 463
 464     logsave("+$mhfolder");
 465
 466     if ($run) {
 467         if (not rename($msg, $mhmsg)) {
 468             err(&EX_OSERR, "Failed rename($msg, $mhmsg)");
 469         }
 470
 471         # Mark each message as soon as we store it and bomb if that
 472         # fails.  While it is slow, it is not safe to store multiple
 473         # messages and then have a failure before marking some (or
 474         # all).
 475         if ($mhfolder ne 'SPAM') {
 476             $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
 477                              'unseen', '-add');
 478             if (not WIFEXITED($status)) {
 479                 err(&EX_OSERR, "Failed to run mark");
 480             } elsif (WEXITSTATUS($status) != 0) {
 481                 errx(&EX_SOFTWARE, "Failed to mark message unseen.");
 482             }
 483         }
 484     }
 485
 486     if (not $FOLDERS{$mhfolder}) {
 487         print("+$mhfolder\n");
 488         $FOLDERS{$mhfolder} = 1;
 489     }
 490 }
 491
 492 # Parse a message file into a structure describing the headers.  The
 493 # structure is a hash of arrays.  The hash keys are the names of the
 494 # headers found in the message, made all lower-case.  Each item in
 495 # the hash is an array of header text.  The array has one element
 496 # per occurrence of the header.  Most headers will only have a single
 497 # element because they occur only once.  The Received header is the
 498 # most common header with multiple texts.
 499 sub get_headers {
 500     my $msg = shift;
 501     my %headers;
 502     my $current;                # current header, used for unfolding lines
 503     my $fieldname;              # unmolested header name
 504     my $contents;               # contents of header
 505
 506     open(MSG, $msg);
 507     while (<MSG>) {
 508         chomp;
 509         if (length == 0) {
 510             last;
 511         }
 512
 513         if (/^\s/) {
 514             # folded header continuation
 515
 516             if (not defined($current)) {
 517                 print(STDERR "Malformed message, cannot parse headers.\n");
 518                 return ();
 519             }
 520
 521             @{$headers{$current}}[-1] .= $_;
 522         } else {
 523             ($fieldname) = split(/:/);
 524             $current = lc($fieldname);
 525             (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
 526
 527             if (defined($headers{$current})) {
 528                 # multiple occurence of same header
 529                 push(@{$headers{$current}}, $contents);
 530             } else {
 531                 # first occurence of a header
 532                 $headers{$current} = [$contents];
 533             }
 534         }
 535     }
 536     close(MSG);
 537
 538     return %headers;
 539 }
 540
 541 ###############################################################################
 542 # Spam handling
 543
 544 sub kill_spam {
 545     my @msglist = @_;
 546     my @result;
 547     my $msg;
 548     my $i;
 549
 550     @result = ();
 551
 552     $i = 0;
 553     foreach $msg (@msglist) {
 554         printf('%sChecking for spam... %6d/%d',
 555                "\r", ++$i, scalar(@msglist));
 556         if (is_spam($msg)) {
 557             print(" SPAM\n");
 558             store_message($msg, 'SPAM');
 559         } else {
 560             push(@result, $msg);
 561         }
 562     }
 563     print("\nDone: ", scalar(@result), " survivors\n");
 564
 565     return @result;
 566 }
 567
 568 \f
 569 ###############################################################################
 570 # Filtering
 571
 572 sub find_mh_folder {
 573     my $msg = shift;
 574     my %headers;
 575     my $filterref;
 576     my @filter;
 577     my $header;
 578     my $contents;
 579     my $pair;
 580     my $match;
 581     my $expression;
 582     my $result;
 583
 584     %headers = get_headers($msg);
 585     if (not %headers) {
 586         return 'malformed';
 587     }
 588
 589     log_headers(%headers);
 590
 591     # Walk the list of filters.  This structure is documented in
 592     # pod at the end of the program.
 593     foreach $filterref (@FILTERS) {
 594         @filter = @$filterref;
 595         $header = shift(@filter);
 596
 597         # Handle filters using the magic TO header specially.
 598         if ($header eq $MAGIC_TO_TOKEN) {
 599             foreach $header (keys(%headers)) {
 600                 if ($header =~ /$MAGIC_TO_REGEX/i) {
 601                     foreach $contents (@{$headers{lc($header)}}) {
 602                         foreach $pair (@filter) {
 603                             ($match, $expression) = @$pair;
 604                             if ($contents =~ /$match/) {
 605                                 return $expression;
 606                             }
 607                         }
 608                     }
 609                 }
 610             }
 611
 612             # Now that it's been processed specially, skip normal handling.
 613             next;
 614         }
 615
 616         # Walk the list of message headers matching the filter's
 617         # specified header.
 618         foreach $contents (@{$headers{lc($header)}}) {
 619             # Walk the filter's list of match/expression pairs.
 620             foreach $pair (@filter) {
 621                 ($match, $expression) = @$pair;
 622                 if ($contents =~ /$match/) {
 623                     if (eval "\$result = \"$expression\"") {
 624                         return $result;
 625                     }
 626                 }
 627             }
 628         }
 629     }
 630
 631     return 'inbox';
 632 }
 633
 634 sub filter_mail {
 635     my @msglist = @_;
 636     my $msg;
 637     my $mhfolder;
 638
 639     foreach $msg (@msglist) {
 640         $mhfolder = find_mh_folder($msg);
 641         store_message($msg, $mhfolder);
 642     }
 643 }
 644
 645 \f
 646 MAIN: {
 647     my @msglist;
 648
 649     if ($dumpfilters) {
 650         $Data::Dumper::Indent = 1;
 651         print(Dumper(\@FILTERS));
 652         exit(&EX_OK);
 653     }
 654
 655     @msglist = getfiles("$MAILDIR/new");
 656
 657     if (not $filteronly) {
 658         @msglist = kill_spam(@msglist);
 659     }
 660
 661     if (not $spamonly) {
 662         filter_mail(@msglist);
 663     }
 664 }
 665
 666 \f
 667 __END__
 668
 669 =head1 THE FILTERS STRUCTURE
 670
 671 The user's .mincfilter file must define the @FILTERS structure.  This
 672 structure is an array.  Each element of @FILTERS is a filter.  A
 673 filter is itself an array.  The first element of a filter is a string,
 674 the name of the header this filter acts upon.  The header name is not
 675 case-sensitive.  Each subsequent element of a filter is a pair (i.e. a
 676 two-element array): first, a regular expression B<minc> uses to
 677 determine whether this filter matches or not, and second, an
 678 expression which B<minc> evaluates to get the folder name.
 679
 680 B<minc> decides where to store a message by iterating over the
 681 @FILTERS array.  It tests each regexp of each filter against all
 682 headers matching that filter's specified header.  As soon as a match
 683 is found, B<minc> evaluates the second part of the pair.  This part
 684 may contain positional parameters from the matched regexp ($1, $2,
 685 etc.).  The result of this expression is used as the folder name.
 686
 687 Multiple occurrences of the same header are preserved and tested
 688 individually against the filters.  That means, for example, that
 689 filters can search all the Received headers for a certain string.
 690
 691 It is important to note that all the arrays of the @FILTERS structure
 692 are traversed I<in order>.  This means the filters can be arranged so
 693 that some have priority over others.  XXX: get Doug to write an
 694 example of having the same header matched more than once.
 695
 696 Lastly, B<minc> supports a magic ' TO' header.  Filters using this
 697 header are matched against a collection of headers related to (and
 698 including) the To.  Which headers to use is determined by a regular
 699 expression borrowed from procmail.
 700
 701 =head1 EXAMPLES
 702
 703 @FILTERS = (
 704
 705   ['List-Id',
 706    ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
 707    ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
 708    ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
 709   ],
 710
 711   ['List-Post',
 712    ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
 713   ],
 714
 715   ['To',
 716    ['root', 'root'],
 717   ],
 718 );
 719
 720 The first List-Id filter is a surprisingly effective one which creates
 721 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
 722 all Sourceforge lists to which i subscribe.  Certainly there are lists
 723 hosted at Sourceforge which do not work well with this, and this is
 724 where it is important to remember that the first match is the one
 725 uses.  Simply put the more specific rules before this one.
 726
 727 The next List-Id example is simple.  The swig example demonstrates
 728 that the folder name does not have to use a portion of the matched
 729 header; you can just hardcode the folder name.
 730
 731 The List-Post example is a nice one.  Most ASF projects have their own
 732 apache.org subdomain, with mailing lists hosted there.  So, given a
 733 list such as dev@httpd.apache.org, this filter will create the folder
 734 name l/apache/httpd/dev.
 735
 736 For an example is_spam function, see
 737 L<http:E<047>E<047>pretzelnet.orgE<047>cvsE<047>dotfilesE<047>.mincspam>
 738
 739 =head1 AUTHORS
 740
 741 Written by Eric Gillespie <epg@pretzelnet.org> with logging code
 742 stolen from Adam Lazur <adam@lazur.org>.
 743
 744 Design by Eric Gillespie and Doug Porter <dsp@waterspout.com>.
 745
 746 =cut
 747
 748 # Local variables:
 749 # cperl-indent-level: 4
 750 # perl-indent-level: 4
 751 # indent-tabs-mode: nil
 752 # End:
 753
 754 # vi: set tabstop=4 expandtab: