]> diplodocus.org Git - minc/blob - minc
mdeliver/current/mdeliver.c:
[minc] / minc
1 #! /usr/local/bin/perl
2
3 # $Id$
4
5 =head1 NAME
6
7 B<minc> - Incorporate mail from a maildir into mh folders.
8
9 =head1 SYNOPSIS
10
11 B<minc> [-B<dhns>]
12
13 =head1 DESCRIPTION
14
15 B<minc> is a program for incorporating mail from a maildir to a mh
16 folder hierarchy. It takes mail from a maildir folder (not a maildir
17 folder hierarchy), optionally checks for spam with a user-defined
18 spam-checking function, and optionally filters mail into separate mh
19 folders.
20
21 The filtering is quite sophisticated, as it is done using real Perl
22 matching (m//) commands.
23
24 =cut
25
26 use strict;
27 use warnings;
28
29 require 'sysexits.ph';
30
31 use Data::Dumper;
32 use Errno;
33 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
34 use File::Basename;
35 use Getopt::Std;
36 use Log::Dispatch;
37 use Log::Dispatch::File;
38 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
39
40 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
41 # against headers matching this regex (taken from procmail).
42 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
43 my $MAGIC_TO_TOKEN = ' TO';
44
45 =head1 OPTIONS
46
47 =over 4
48
49 =item B<-d>
50
51 Dump (using Data::Dumper) the FILTERS list and exit. This is useful
52 for testing the syntax of .mincfilter.
53
54 =item B<-f>
55
56 Filter only, then exit. This is useful after running B<minc -s>.
57
58 =item B<-h>
59
60 Show help.
61
62 =item B<-n>
63
64 Dry run; do not actually incorporate the mail, but log and report to
65 stdout/stderr as normal.
66
67 =item B<-s>
68
69 Process SPAM only, then exit, leaving all non-spam messages in the
70 maildir.
71
72 =back
73
74 =cut
75
76 my $dumpfilters = 0;
77 my $filteronly = 0;
78 our $run = 1;
79 my $spamonly = 0;
80
81 our $opt_d;
82 our $opt_f;
83 our $opt_h;
84 our $opt_n;
85 our $opt_s; # ;; # stupid cperl-mode
86
87 if (not getopts('dfhns')) {
88 exit(&EX_USAGE);
89 }
90
91 if ($opt_d) {
92 $dumpfilters = 1;
93 } elsif ($opt_h) {
94 print("Sorry bub, no help.\n");
95 exit(&EX_OK);
96 } elsif ($opt_n) {
97 $run = 0;
98 }
99
100 if ($opt_f) {
101 $filteronly = 1;
102 } elsif ($opt_s) { # ))){ # stupid cperl-mode
103 $spamonly = 1;
104 }
105
106 =head1 ENVIRONMENT
107
108 =over 4
109
110 =item HOME
111
112 Where configuration files (.mincfilter) are found. Also,
113 $HOME/Maildir is used for the maildir if MAILDIR is not set.
114
115 =item MAILDIR
116
117 Where mail is delivered.
118
119 =back
120
121 =cut
122
123 use Env qw(HOME MAILDIR);
124
125 if (not $HOME) {
126 die("HOME environment variable must be set.\n");
127 }
128 if (not $MAILDIR) {
129 $MAILDIR = "$HOME/Maildir";
130 }
131
132 =head1 FILES
133
134 =over 4
135
136 =item $HOME/.mincfilter
137
138 This file is Perl code (included via the 'require' directive) which is
139 expected to define the FILTERS list.
140
141 =item $HOME/.mincspam
142
143 If this file exists, B<minc> will include it with the expectation that it
144 will define a B<is_spam> function. This function takes a message
145 filename as an argument and returns 1 if the message is spam, else 0.
146 If this file does not exist, B<minc> will define a simple function that
147 always returns 0.
148
149 One of B<minc>'s global variables is available to the user-defined
150 is_spam function: $run. This boolean should be honored; is_spam
151 should only take real action (i.e. removing or creating files, running
152 external programs, etc.) if $run is non-zero.
153
154 XXX: need more details about the spam-handling process; for now read
155 the code.
156
157 =item `mhpath +`/logs/minc.log
158
159 Where B<minc> logs what it does, unless in -n mode.
160
161 =item `mhpath +`/logs/dryrun.log
162
163 Where B<minc> logs what it would do; used in -n mode.
164
165 =item `mhpath +`/.minc.context
166
167 B<minc> uses this file for context (i.e. current folder) instead of
168 `mhpath +`/context. This allows you some amount of freedom to use mh
169 while B<minc> is running. For example, you can changed folders without
170 causing a message to be stored in the wrong folder. Care must still
171 be taken, however, about the unseen sequence; if you change it
172 (whether via show, or mark, or anything else) while in the same folder
173 as B<minc>, it is likely the new message B<minc> stores will not end up
174 marked unseen.
175
176 =back
177
178 =cut
179
180 our @FILTERS;
181 require "$HOME/.mincfilter";
182
183 if (-f "$HOME/.mincspam") {
184 require "$HOME/.mincspam";
185 } else {
186 sub is_spam {
187 return 0;
188 }
189 }
190
191 my $mh;
192 my $logfile;
193
194 $mh = `mhpath +`;
195 chomp($mh);
196
197 if ($run) {
198 $logfile = $mh . '/logs/minc.log';
199 } else {
200 $logfile = $mh . '/logs/dryrun.log';
201 }
202
203 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
204
205 \f
206 ###############################################################################
207 # Logging
208
209 # debuglevels for the logger
210 use constant LOG_DEBUGLEVEL => 'info';
211 use constant SCREEN_DEBUGLEVEL => 'debug';
212
213 # characters used to wrap around the id field in the log so I can more
214 # easily parse the msg flow
215 use constant DEBUGCHAR => '%';
216 use constant INFOCHAR => '=';
217 use constant INCOMINGCHAR => '<';
218 use constant SAVECHAR => '>';
219 use constant WARNCHAR => 'W';
220 use constant ERRORCHAR => 'E';
221
222 my $logger = new Log::Dispatch;
223 $logger->add(new Log::Dispatch::File (name=>'logfile',
224 filename=>$logfile,
225 min_level=>'info',
226 mode=>'append'));
227
228 # log alias, handles getting passed vars that are undef
229 sub mylog {
230 my $level = shift;
231 my $act = shift;
232 my $timestamp = strftime('%b %e %H:%M:%S', localtime);
233 my $msg;
234 foreach my $part (@_) {
235 if (defined($part)) {
236 $msg .= $part;
237 }
238 }
239 # no newlines in the log message, thanks
240 $msg =~ s/\n/ /gm;
241 $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg);
242 $logger->log(level=>$level, message=>$msg);
243
244 if ($act eq SAVECHAR) {
245 $logger->log(level=>$level, message=>"\n");
246 }
247 }
248
249 sub logsave { mylog('notice', SAVECHAR, @_); }
250 sub loginfo { mylog('info', INFOCHAR, @_); }
251 sub logdebug { mylog('debug', DEBUGCHAR, @_); }
252 sub logwarn { mylog('warning', WARNCHAR, @_); }
253 sub logerr { mylog('error', ERRORCHAR, @_); }
254
255 sub logincoming {
256 my ($text, @contents) = @_;
257 my $last;
258
259 if (@contents) {
260 $last = $contents[-1];
261 } else {
262 $last = '';
263 }
264
265 mylog('info', INCOMINGCHAR, $text, $last);
266 }
267
268 sub log_headers {
269 my %headers = @_;
270
271 # For an explanation of the %headers structure, see the
272 # get_headers function below.
273 logincoming('From: ', @{$headers{'return-path'}});
274 logincoming('To: ', @{$headers{'to'}});
275 logincoming('Subject: ', @{$headers{'subject'}});
276 logincoming('Message-Id: ', @{$headers{'message-id'}});
277 }
278
279 \f
280 ###############################################################################
281 # Utility procedures
282
283 sub _errprint {
284 printf(STDERR '%s:%s', __FILE__, __LINE__);
285
286 if (@_) {
287 print(STDERR ': ');
288 foreach (@_) {
289 print(STDERR);
290 }
291 }
292 }
293
294 sub err {
295 my $ex = shift;
296
297 _errprint(@_);
298 print(STDERR ": $!\n");
299
300 exit($ex);
301 }
302
303 sub errx {
304 my $ex = shift;
305
306 _errprint(@_);
307 print(STDERR "\n");
308
309 exit($ex);
310 }
311
312 sub mkfolder {
313 my $mhfolder = shift;
314 my $folder;
315 my $target;
316 my $component;
317
318 $folder = $mh . '/' . $mhfolder;
319 $target = '';
320
321 if (not -d $folder) {
322 foreach $component (split('/', $folder)) {
323 if (defined($component) and length($component) >= 1) {
324 $target = $target . '/' . $component;
325 if (-d $target or mkdir($target)) {
326 next;
327 } else {
328 err(&EX_OSERR,
329 "Failed to create +$mhfolder ($component)");
330 }
331 }
332 }
333 }
334 }
335
336 sub getfiles {
337 my $dir = shift;
338 my @result;
339
340 if (not opendir(DIR, $dir)) {
341 err(&EX_OSERR, "Failed opendir($dir)");
342 }
343
344 # Initialize $! to 0 (success) because stupid stupid Perl provides
345 # no way to distinguish an error from an empty directory; that
346 # means setting $! to 0 and testing it afterwards is the only way
347 # to detect an error. Real Programmers don't handle errors,
348 # right? >sigh<
349 $! = 0;
350 @result = grep {
351 ($_ ne '.' and $_ ne '..')
352 and $_ = "$MAILDIR/new/$_"
353 } readdir(DIR);
354
355 if ($! != 0) {
356 err(&EX_OSERR, "Failed readdir($dir)");
357 }
358
359 closedir(DIR);
360
361 return @result;
362 }
363
364 sub get_highest_msgnum {
365 my $mhfolder = shift;
366 my $dir;
367 my @list;
368 my $highest;
369 my $msgnum;
370
371 $dir = "$mh/$mhfolder";
372 if (not opendir(DIR, $dir)) {
373 err(&EX_OSERR, "Failed opendir($dir)");
374 }
375
376 $! = 0;
377 @list = readdir(DIR);
378
379 if ($! != 0) {
380 err(&EX_OSERR, "Failed readdir($dir)");
381 }
382
383 closedir(DIR);
384
385 $highest = 0;
386 foreach $msgnum (@list) {
387 # Look for integers.
388 if ($msgnum =~ /^[0-9]+$/) {
389 if ($msgnum > $highest) {
390 $highest = $msgnum;
391 }
392 }
393 }
394
395 return $highest;
396 }
397
398 # We want to print the name of each list that has new mail only once,
399 # so use this hash to store the lists that have already been printed.
400 # Start the list out with SPAM already in it, since we don't care when
401 # new messages are added to it.
402 my %FOLDERS = ('SPAM'=>1);
403
404 sub store_message {
405 my $msg = shift;
406 my $mhfolder = shift;
407 my $msgnum;
408 my $try;
409 my $mhmsg;
410 my $status;
411
412 # We must do this even in -n mode because later steps fail without
413 # it. This should be harmless.
414 mkfolder($mhfolder);
415
416 # This loop is a modified version of the maildir delivery algorithm.
417 $msgnum = get_highest_msgnum($mhfolder);
418 for ($try = 0; ; $try++) {
419 $msgnum++;
420 $mhmsg = "$mh/$mhfolder/$msgnum";
421
422 if (not stat($mhmsg)) {
423 if ($!{ENOENT}) {
424 # Now we have a non-existent file, let's try to create
425 # it. We must create a zero-byte file first because a
426 # file my appear between our happy stat results and
427 # our later rename(2), which would clobber said file.
428 # So attempt to create a file with this name. If it
429 # succeeds, in just a bit here we'll knowingly clobber
430 # this file with the rename(2) call.
431
432 # Another way to do this is not to use rename(2), but
433 # use link(2) + unlink(2) instead. That's how the
434 # standard maildir algorithm does it. Each method has
435 # a disadvantage: the program may crash between the
436 # link(2) and unlink(2) calls. With the standard
437 # maildir algorithm, that means the message will end
438 # up duplicated. The advantage of creating an empty
439 # file followed by rename(2) is that an extra empty
440 # file is left behind as opposed to a duplicate
441 # message. This is more easiliy detected by the user.
442 if ($run) {
443 if (sysopen(MSG, "$mhmsg",
444 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
445 close(MSG);
446 last;
447 }
448 } else {
449 last;
450 }
451 }
452 }
453
454 # This algorithm is different from the maildir one; let's make
455 # 10 tries instead of 3.
456 if ($try == 9) {
457 errx(&EX_TEMPFAIL, "Attempted filename $mhmsg exists.");
458 }
459
460 # This algorithm is different; i don't think we need to sleep.
461 #sleep(2);
462 }
463
464 logsave("+$mhfolder");
465
466 if ($run) {
467 if (not rename($msg, $mhmsg)) {
468 err(&EX_OSERR, "Failed rename($msg, $mhmsg)");
469 }
470
471 # Mark each message as soon as we store it and bomb if that
472 # fails. While it is slow, it is not safe to store multiple
473 # messages and then have a failure before marking some (or
474 # all).
475 if ($mhfolder ne 'SPAM') {
476 $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
477 'unseen', '-add');
478 if (not WIFEXITED($status)) {
479 err(&EX_OSERR, "Failed to run mark");
480 } elsif (WEXITSTATUS($status) != 0) {
481 errx(&EX_SOFTWARE, "Failed to mark message unseen.");
482 }
483 }
484 }
485
486 if (not $FOLDERS{$mhfolder}) {
487 print("+$mhfolder\n");
488 $FOLDERS{$mhfolder} = 1;
489 }
490 }
491
492 # Parse a message file into a structure describing the headers. The
493 # structure is a hash of arrays. The hash keys are the names of the
494 # headers found in the message, made all lower-case. Each item in
495 # the hash is an array of header text. The array has one element
496 # per occurrence of the header. Most headers will only have a single
497 # element because they occur only once. The Received header is the
498 # most common header with multiple texts.
499 sub get_headers {
500 my $msg = shift;
501 my %headers;
502 my $current; # current header, used for unfolding lines
503 my $fieldname; # unmolested header name
504 my $contents; # contents of header
505
506 open(MSG, $msg);
507 while (<MSG>) {
508 chomp;
509 if (length == 0) {
510 last;
511 }
512
513 if (/^\s/) {
514 # folded header continuation
515
516 if (not defined($current)) {
517 print(STDERR "Malformed message, cannot parse headers.\n");
518 return ();
519 }
520
521 @{$headers{$current}}[-1] .= $_;
522 } else {
523 ($fieldname) = split(/:/);
524 $current = lc($fieldname);
525 (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
526
527 if (defined($headers{$current})) {
528 # multiple occurence of same header
529 push(@{$headers{$current}}, $contents);
530 } else {
531 # first occurence of a header
532 $headers{$current} = [$contents];
533 }
534 }
535 }
536 close(MSG);
537
538 return %headers;
539 }
540
541 ###############################################################################
542 # Spam handling
543
544 sub kill_spam {
545 my @msglist = @_;
546 my @result;
547 my $msg;
548 my $i;
549
550 @result = ();
551
552 $i = 0;
553 foreach $msg (@msglist) {
554 printf('%sChecking for spam... %6d/%d',
555 "\r", ++$i, scalar(@msglist));
556 if (is_spam($msg)) {
557 print(" SPAM\n");
558 store_message($msg, 'SPAM');
559 } else {
560 push(@result, $msg);
561 }
562 }
563 print("\nDone: ", scalar(@result), " survivors\n");
564
565 return @result;
566 }
567
568 \f
569 ###############################################################################
570 # Filtering
571
572 sub find_mh_folder {
573 my $msg = shift;
574 my %headers;
575 my $filterref;
576 my @filter;
577 my $header;
578 my $contents;
579 my $pair;
580 my $match;
581 my $expression;
582 my $result;
583
584 %headers = get_headers($msg);
585 if (not %headers) {
586 return 'malformed';
587 }
588
589 log_headers(%headers);
590
591 # Walk the list of filters. This structure is documented in
592 # pod at the end of the program.
593 foreach $filterref (@FILTERS) {
594 @filter = @$filterref;
595 $header = shift(@filter);
596
597 # Handle filters using the magic TO header specially.
598 if ($header eq $MAGIC_TO_TOKEN) {
599 foreach $header (keys(%headers)) {
600 if ($header =~ /$MAGIC_TO_REGEX/i) {
601 foreach $contents (@{$headers{lc($header)}}) {
602 foreach $pair (@filter) {
603 ($match, $expression) = @$pair;
604 if ($contents =~ /$match/) {
605 return $expression;
606 }
607 }
608 }
609 }
610 }
611
612 # Now that it's been processed specially, skip normal handling.
613 next;
614 }
615
616 # Walk the list of message headers matching the filter's
617 # specified header.
618 foreach $contents (@{$headers{lc($header)}}) {
619 # Walk the filter's list of match/expression pairs.
620 foreach $pair (@filter) {
621 ($match, $expression) = @$pair;
622 if ($contents =~ /$match/) {
623 if (eval "\$result = \"$expression\"") {
624 return $result;
625 }
626 }
627 }
628 }
629 }
630
631 return 'inbox';
632 }
633
634 sub filter_mail {
635 my @msglist = @_;
636 my $msg;
637 my $mhfolder;
638
639 foreach $msg (@msglist) {
640 $mhfolder = find_mh_folder($msg);
641 store_message($msg, $mhfolder);
642 }
643 }
644
645 \f
646 MAIN: {
647 my @msglist;
648
649 if ($dumpfilters) {
650 $Data::Dumper::Indent = 1;
651 print(Dumper(\@FILTERS));
652 exit(&EX_OK);
653 }
654
655 @msglist = getfiles("$MAILDIR/new");
656
657 if (not $filteronly) {
658 @msglist = kill_spam(@msglist);
659 }
660
661 if (not $spamonly) {
662 filter_mail(@msglist);
663 }
664 }
665
666 \f
667 __END__
668
669 =head1 THE FILTERS STRUCTURE
670
671 The user's .mincfilter file must define the @FILTERS structure. This
672 structure is an array. Each element of @FILTERS is a filter. A
673 filter is itself an array. The first element of a filter is a string,
674 the name of the header this filter acts upon. The header name is not
675 case-sensitive. Each subsequent element of a filter is a pair (i.e. a
676 two-element array): first, a regular expression B<minc> uses to
677 determine whether this filter matches or not, and second, an
678 expression which B<minc> evaluates to get the folder name.
679
680 B<minc> decides where to store a message by iterating over the
681 @FILTERS array. It tests each regexp of each filter against all
682 headers matching that filter's specified header. As soon as a match
683 is found, B<minc> evaluates the second part of the pair. This part
684 may contain positional parameters from the matched regexp ($1, $2,
685 etc.). The result of this expression is used as the folder name.
686
687 Multiple occurrences of the same header are preserved and tested
688 individually against the filters. That means, for example, that
689 filters can search all the Received headers for a certain string.
690
691 It is important to note that all the arrays of the @FILTERS structure
692 are traversed I<in order>. This means the filters can be arranged so
693 that some have priority over others. XXX: get Doug to write an
694 example of having the same header matched more than once.
695
696 Lastly, B<minc> supports a magic ' TO' header. Filters using this
697 header are matched against a collection of headers related to (and
698 including) the To. Which headers to use is determined by a regular
699 expression borrowed from procmail.
700
701 =head1 EXAMPLES
702
703 @FILTERS = (
704
705 ['List-Id',
706 ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
707 ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
708 ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
709 ],
710
711 ['List-Post',
712 ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
713 ],
714
715 ['To',
716 ['root', 'root'],
717 ],
718 );
719
720 The first List-Id filter is a surprisingly effective one which creates
721 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
722 all Sourceforge lists to which i subscribe. Certainly there are lists
723 hosted at Sourceforge which do not work well with this, and this is
724 where it is important to remember that the first match is the one
725 uses. Simply put the more specific rules before this one.
726
727 The next List-Id example is simple. The swig example demonstrates
728 that the folder name does not have to use a portion of the matched
729 header; you can just hardcode the folder name.
730
731 The List-Post example is a nice one. Most ASF projects have their own
732 apache.org subdomain, with mailing lists hosted there. So, given a
733 list such as dev@httpd.apache.org, this filter will create the folder
734 name l/apache/httpd/dev.
735
736 For an example is_spam function, see
737 L<http:E<047>E<047>pretzelnet.orgE<047>cvsE<047>dotfilesE<047>.mincspam>
738
739 =head1 AUTHORS
740
741 Written by Eric Gillespie <epg@pretzelnet.org> with logging code
742 stolen from Adam Lazur <adam@lazur.org>.
743
744 Design by Eric Gillespie and Doug Porter <dsp@waterspout.com>.
745
746 =cut
747
748 # Local variables:
749 # cperl-indent-level: 4
750 # perl-indent-level: 4
751 # indent-tabs-mode: nil
752 # End:
753
754 # vi: set tabstop=4 expandtab: