]> diplodocus.org Git - minc/blob - minc
Note licensing terms (same as Perl itself).
[minc] / minc
1 #! /usr/local/bin/perl
2
3 # $Id$
4
5 =head1 NAME
6
7 B<minc> - Incorporate mail from a maildir into mh folders.
8
9 =head1 SYNOPSIS
10
11 B<minc> [-B<dnp>]
12
13 =head1 DESCRIPTION
14
15 B<minc> incorporates mail from a maildir to a mh folder hierarchy. It
16 takes mail from a maildir folder (not a maildir folder hierarchy),
17 optionally checks for spam with a user-defined spam-checking function,
18 and optionally filters mail into separate mh folders.
19
20 The filtering is quite sophisticated, as it is done using real Perl
21 matching (m//) commands.
22
23 =cut
24
25 use strict;
26 use warnings;
27
28 use Data::Dumper;
29 use Errno;
30 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
31 use FileHandle;
32 use File::Basename;
33 use File::stat;
34 use Getopt::Std; $Getopt::Std::STANDARD_HELP_VERSION = 1;
35 use Log::Dispatch;
36 use Log::Dispatch::File;
37 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
38
39 our $VERSION = 1;
40
41 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
42 # against headers matching this regex (taken from procmail).
43 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
44 my $MAGIC_TO_TOKEN = ' TO';
45
46 # List of SPAM message numbers, scanned at the end so the user can
47 # check for false positives.
48 my @SPAM;
49
50 =head1 OPTIONS
51
52 =over 4
53
54 =item B<-d>
55
56 Dump (using Data::Dumper) the FILTERS list and exit. This is useful
57 for testing the syntax of .mincfilter.
58
59 =item B<--help>
60
61 Show help.
62
63 =item B<-n>
64
65 Dry run; do not actually incorporate the mail, but log and report to
66 stdout/stderr as normal.
67
68 =item B<-p>
69
70 Print the filename of each message before checking it for spam. This
71 can be handy if a particular message is giving the spam checker a
72 problem.
73
74 =back
75
76 =cut
77
78 my $dumpfilters = 0;
79 our $run = 1;
80 my $printfilenames = 0;
81
82 our $opt_d;
83 our $opt_n;
84 our $opt_p;
85
86 if (not getopts('dnp')) {
87 exit(2);
88 }
89
90 if ($opt_d) {
91 $dumpfilters = 1;
92 } elsif ($opt_n) {
93 $run = 0;
94 }
95
96 if ($opt_p) {
97 $printfilenames = 1;
98 }
99
100 =head1 ENVIRONMENT
101
102 =over 4
103
104 =item HOME
105
106 Where configuration files (.mincfilter) are found. Also,
107 $HOME/Maildir is used for the maildir if MAILDIR is not set.
108
109 =item MAILDIR
110
111 Where mail is delivered.
112
113 =back
114
115 =cut
116
117 use Env qw(HOME MAILDIR);
118
119 if (not $HOME) {
120 die("HOME environment variable must be set.\n");
121 }
122 if (not $MAILDIR) {
123 $MAILDIR = "$HOME/Maildir";
124 }
125
126 =head1 FILES
127
128 =over 4
129
130 =item $HOME/.mincfilter
131
132 This file is Perl code (included via the 'require' directive) which is
133 expected to define the FILTERS list.
134
135 =item $HOME/.mincspam
136
137 If this file exists, B<minc> will include it with the expectation that
138 it will define a B<spam_check> function. This function takes a
139 message filename as an argument and returns 1 if the message is spam,
140 else 0. If this file does not exist, B<minc> will define a simple
141 function that always returns 0.
142
143 One of B<minc>'s global variables is available to the user-defined
144 B<spam_check> function: $run. This boolean should be honored;
145 B<spam_check> should only take real action (i.e. removing or creating
146 files, running external programs, etc.) if $run is non-zero.
147
148 This file may also declare two other functions: B<spam_start_hook> and
149 B<spam_stop_hook>. The former is passed no arguments and is expected
150 to return a list. This list is a "baton" that will also be passed to
151 B<spam_check> and B<spam_stop_hook>. It can hold anything
152 B<spam_check> will need to do its job, whether network sockets, pipes,
153 or whatever.
154
155 XXX: need more details about the spam-handling process; for now read
156 the code.
157
158 =item `mhpath +`/logs/minc.log
159
160 Where B<minc> logs what it does, unless in -n mode.
161
162 =item `mhpath +`/logs/dryrun.log
163
164 Where B<minc> logs what it would do; used in -n mode.
165
166 =item `mhpath +`/.minc.context
167
168 B<minc> uses this file for context (i.e. current folder) instead of
169 `mhpath +`/context. This allows you some amount of freedom to use mh
170 while B<minc> is running. For example, you can changed folders without
171 causing a message to be stored in the wrong folder. Care must still
172 be taken, however, about the unseen sequence; if you change it
173 (whether via show, or mark, or anything else) while in the same folder
174 as B<minc>, it is likely the new message B<minc> stores will not end up
175 marked unseen.
176
177 =back
178
179 =cut
180
181 our @FILTERS;
182 require "$HOME/.mincfilter";
183
184 if (-f "$HOME/.mincspam") {
185 require "$HOME/.mincspam";
186 } else {
187 eval "sub spam_start_hook { return (); }";
188 eval "sub spam_stop_hook { }";
189 eval "sub spam_check { return 0; }";
190 }
191
192 my $mh;
193 my $logfile;
194
195 $mh = `mhpath +`;
196 chomp($mh);
197
198 if ($run) {
199 $logfile = $mh . '/logs/minc.log';
200 } else {
201 $logfile = $mh . '/logs/dryrun.log';
202 }
203
204 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
205
206 \f
207 ###############################################################################
208 # Logging
209
210 # debuglevels for the logger
211 use constant LOG_DEBUGLEVEL => 'info';
212 use constant SCREEN_DEBUGLEVEL => 'debug';
213
214 # characters used to wrap around the id field in the log so I can more
215 # easily parse the msg flow
216 use constant DEBUGCHAR => '%';
217 use constant INFOCHAR => '=';
218 use constant INCOMINGCHAR => '<';
219 use constant SAVECHAR => '>';
220 use constant WARNCHAR => 'W';
221 use constant ERRORCHAR => 'E';
222
223 my $logger = new Log::Dispatch;
224 $logger->add(new Log::Dispatch::File (name=>'logfile',
225 filename=>$logfile,
226 min_level=>'info',
227 mode=>'append'));
228
229 # log alias, handles getting passed vars that are undef
230 sub mylog {
231 my $level = shift;
232 my $act = shift;
233 my $timestamp = strftime('%b %e %H:%M:%S', localtime);
234 my $msg;
235 foreach my $part (@_) {
236 if (defined($part)) {
237 $msg .= $part;
238 }
239 }
240 # no newlines in the log message, thanks
241 $msg =~ s/\n/ /gm;
242 $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg);
243 $logger->log(level=>$level, message=>$msg);
244
245 if ($act eq SAVECHAR) {
246 $logger->log(level=>$level, message=>"\n");
247 }
248 }
249
250 sub logsave { mylog('notice', SAVECHAR, @_); }
251 sub loginfo { mylog('info', INFOCHAR, @_); }
252 sub logdebug { mylog('debug', DEBUGCHAR, @_); }
253 sub logwarn { mylog('warning', WARNCHAR, @_); }
254 sub logerr { mylog('error', ERRORCHAR, @_); }
255
256 sub logincoming {
257 my ($text, @contents) = @_;
258 my $last;
259
260 if (@contents) {
261 $last = $contents[-1];
262 } else {
263 $last = '';
264 }
265
266 mylog('info', INCOMINGCHAR, $text, $last);
267 }
268
269 sub log_headers {
270 my %headers = @_;
271
272 # For an explanation of the %headers structure, see the
273 # get_headers function below.
274 logincoming('From: ', @{$headers{'return-path'}});
275 logincoming('To: ', @{$headers{'to'}});
276 logincoming('Subject: ', @{$headers{'subject'}});
277 logincoming('Message-Id: ', @{$headers{'message-id'}});
278 }
279
280 \f
281 ###############################################################################
282 # Utility procedures
283
284 sub mkfolder {
285 my $mhfolder = shift;
286 my $folder;
287 my $target;
288 my $component;
289
290 $folder = $mh . '/' . $mhfolder;
291 $target = '';
292
293 if (not -d $folder) {
294 foreach $component (split('/', $folder)) {
295 if (defined($component) and length($component) >= 1) {
296 $target = $target . '/' . $component;
297 if (-d $target or mkdir($target)) {
298 next;
299 } else {
300 die("Failed to create +$mhfolder ($component)");
301 }
302 }
303 }
304 }
305 }
306
307 sub getfiles {
308 my $dir = shift;
309 my @result;
310
311 if (not opendir(DIR, $dir)) {
312 die("Failed opendir($dir)");
313 }
314
315 # Initialize $! to 0 (success) because stupid stupid Perl provides
316 # no way to distinguish an error from an empty directory; that
317 # means setting $! to 0 and testing it afterwards is the only way
318 # to detect an error. Real Programmers don't handle errors,
319 # right? >sigh<
320 $! = 0;
321 @result = grep {
322 ($_ ne '.' and $_ ne '..')
323 and $_ = "$MAILDIR/new/$_"
324 } readdir(DIR);
325
326 if ($! != 0) {
327 die("Failed readdir($dir)");
328 }
329
330 if (scalar(@result) == 0) {
331 exit(0);
332 }
333
334 closedir(DIR);
335
336 return @result;
337 }
338
339 sub get_highest_msgnum {
340 my $mhfolder = shift;
341 my $dir;
342 my @list;
343 my $highest;
344 my $msgnum;
345
346 $dir = "$mh/$mhfolder";
347 if (not opendir(DIR, $dir)) {
348 die("Failed opendir($dir)");
349 }
350
351 $! = 0;
352 @list = readdir(DIR);
353
354 if ($! != 0) {
355 die("Failed readdir($dir)");
356 }
357
358 closedir(DIR);
359
360 $highest = 0;
361 foreach $msgnum (@list) {
362 # Look for integers.
363 if ($msgnum =~ /^[0-9]+$/) {
364 if ($msgnum > $highest) {
365 $highest = $msgnum;
366 }
367 }
368 }
369
370 return $highest;
371 }
372
373 sub store_message {
374 my $msg = shift;
375 my $mhfolder = shift;
376 my $msgnum;
377 my $try;
378 my $mhmsg;
379 my $status;
380
381 # We must do this even in -n mode because later steps fail without
382 # it. This should be harmless.
383 mkfolder($mhfolder);
384
385 # This loop is a modified version of the maildir delivery algorithm.
386 $msgnum = get_highest_msgnum($mhfolder);
387 for ($try = 0; ; $try++) {
388 $msgnum++;
389 $mhmsg = "$mh/$mhfolder/$msgnum";
390
391 if (not stat($mhmsg)) {
392 if ($!{ENOENT}) {
393 # Now we have a non-existent file, let's try to create
394 # it. We must create a zero-byte file first because a
395 # file my appear between our happy stat results and
396 # our later rename(2), which would clobber said file.
397 # So attempt to create a file with this name. If it
398 # succeeds, in just a bit here we'll knowingly clobber
399 # this file with the rename(2) call.
400
401 # Another way to do this is not to use rename(2), but
402 # use link(2) + unlink(2) instead. That's how the
403 # standard maildir algorithm does it. Each method has
404 # a disadvantage: the program may crash between the
405 # link(2) and unlink(2) calls. With the standard
406 # maildir algorithm, that means the message will end
407 # up duplicated. The advantage of creating an empty
408 # file followed by rename(2) is that an extra empty
409 # file is left behind as opposed to a duplicate
410 # message. This is more easily detected by the user.
411 if ($run) {
412 if (sysopen(MSG, "$mhmsg",
413 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
414 close(MSG);
415 last;
416 }
417 } else {
418 last;
419 }
420 }
421 }
422
423 # This algorithm is different from the maildir one; let's make
424 # 10 tries instead of 3.
425 if ($try == 9) {
426 die("Attempted filename $mhmsg exists.");
427 }
428
429 # This algorithm is different; i don't think we need to sleep.
430 #sleep(2);
431 }
432
433 if ($mhfolder ne 'SPAM') {
434 logsave("+$mhfolder");
435 }
436
437 if ($run) {
438 if (not rename($msg, $mhmsg)) {
439 die("Failed rename($msg, $mhmsg)");
440 }
441
442 # Mark each message as soon as we store it and bomb if that
443 # fails. While it is slow, it is not safe to store multiple
444 # messages and then have a failure before marking some (or
445 # all).
446 if ($mhfolder eq 'SPAM') {
447 push(@SPAM, $msgnum);
448 } else {
449 $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
450 'unseen', '-add');
451 if (not WIFEXITED($status)) {
452 die("Failed to run mark");
453 } elsif (WEXITSTATUS($status) != 0) {
454 die("Failed to mark message unseen.");
455 }
456 }
457 }
458
459 return $msgnum;
460 }
461
462 # Parse a message file into a structure describing the headers. The
463 # structure is a hash of arrays. The hash keys are the names of the
464 # headers found in the message, made all lower-case. Each item in
465 # the hash is an array of header text. The array has one element
466 # per occurrence of the header. Most headers will only have a single
467 # element because they occur only once. The Received header is the
468 # most common header with multiple texts.
469 sub get_headers {
470 my $msg = shift;
471 my %headers;
472 my $current; # current header, used for unfolding lines
473 my $fieldname; # unmolested header name
474 my $contents; # contents of header
475
476 open(MSG, $msg);
477 while (<MSG>) {
478 chomp;
479 if (length == 0) {
480 last;
481 }
482
483 if (/^\s/) {
484 # folded header continuation
485
486 if (not defined($current)) {
487 print(STDERR "Malformed message, cannot parse headers.\n");
488 return ();
489 }
490
491 @{$headers{$current}}[-1] .= $_;
492 } else {
493 ($fieldname) = split(/:/);
494 $current = lc($fieldname);
495 (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
496
497 if (defined($headers{$current})) {
498 # multiple occurence of same header
499 push(@{$headers{$current}}, $contents);
500 } else {
501 # first occurence of a header
502 $headers{$current} = [$contents];
503 }
504 }
505 }
506 close(MSG);
507
508 return %headers;
509 }
510
511 \f
512 ###############################################################################
513 # Filtering
514
515 sub find_mh_folder {
516 my $msg = shift;
517 my %headers;
518 my $filterref;
519 my @filter;
520 my $header;
521 my $contents;
522 my $pair;
523 my $match;
524 my $expression;
525 my $result;
526
527 %headers = get_headers($msg);
528 if (not %headers) {
529 return 'malformed';
530 }
531
532 log_headers(%headers);
533
534 # Walk the list of filters. This structure is documented in
535 # pod at the end of the program.
536 foreach $filterref (@FILTERS) {
537 @filter = @$filterref;
538 $header = shift(@filter);
539
540 # Handle filters using the magic TO header specially.
541 if ($header eq $MAGIC_TO_TOKEN) {
542 foreach $header (keys(%headers)) {
543 if ($header =~ /$MAGIC_TO_REGEX/i) {
544 foreach $contents (@{$headers{lc($header)}}) {
545 foreach $pair (@filter) {
546 ($match, $expression) = @$pair;
547 if ($contents =~ /$match/) {
548 return $expression;
549 }
550 }
551 }
552 }
553 }
554
555 # Now that it's been processed specially, skip normal handling.
556 next;
557 }
558
559 # Walk the list of message headers matching the filter's
560 # specified header.
561 foreach $contents (@{$headers{lc($header)}}) {
562 # Walk the filter's list of match/expression pairs.
563 foreach $pair (@filter) {
564 ($match, $expression) = @$pair;
565 if ($contents =~ /$match/i) {
566 if (eval "\$result = \"$expression\"") {
567 return $result;
568 }
569 }
570 }
571 }
572 }
573
574 return 'inbox';
575 }
576
577 sub filter_mail {
578 my @msglist = @_;
579 my $msgcount = scalar(@msglist);
580 my @baton;
581 my $msg;
582 my $mhfolder;
583 my $spam = 0;
584 my $saved = 0;
585 my %FOLDERS = ('SPAM'=>1);
586
587 @baton = spam_start_hook();
588
589
590 STDOUT->autoflush(1);
591 print("$msgcount messages...");
592 foreach $msg (@msglist) {
593 if (spam_check($msg, @baton)) {
594 $mhfolder = 'SPAM';
595 $spam = scalar(@SPAM)+ 1;
596 } else {
597 $mhfolder = find_mh_folder($msg);
598 $saved++;
599 }
600
601 store_message($msg, $mhfolder);
602
603 print("\r");
604 if (not $FOLDERS{$mhfolder}) {
605 print(' ' x length($msgcount));
606 print(" \r+$mhfolder\n");
607 $FOLDERS{$mhfolder} = 1;
608 }
609
610 printf('%6d SPAM %6d saved %6d/%1d',
611 $spam, $saved, $spam + $saved, $msgcount);
612 }
613 print("\n");
614
615 spam_stop_hook(@baton);
616 }
617
618 \f
619 MAIN: {
620 my @msglist;
621 my $st;
622
623 if ($dumpfilters) {
624 $Data::Dumper::Indent = 1;
625 print(Dumper(\@FILTERS));
626 exit;
627 }
628
629 @msglist = (
630 map { $_->[1] }
631 sort { $a->[0] <=> $b->[0] }
632 map {
633 if (not ($st = stat($_))) {
634 die("stat($_): $!");
635 }
636 [$st->mtime, $_]
637 }
638 getfiles("$MAILDIR/new"));
639
640 filter_mail(@msglist);
641
642 @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
643 }
644
645 \f
646 __END__
647
648 =head1 THE FILTERS STRUCTURE
649
650 The user's .mincfilter file must define the @FILTERS structure. This
651 structure is an array. Each element of @FILTERS is a filter. A
652 filter is itself an array. The first element of a filter is a string,
653 the name of the header this filter acts upon. The header name is not
654 case-sensitive. Each subsequent element of a filter is a pair (i.e. a
655 two-element array): first, a regular expression B<minc> uses to
656 determine whether this filter matches or not, and second, an
657 expression which B<minc> evaluates to get the folder name.
658
659 B<minc> decides where to store a message by iterating over the
660 @FILTERS array. It tests each regexp of each filter against all
661 headers matching that filter's specified header. As soon as a match
662 is found, B<minc> evaluates the second part of the pair. This part
663 may contain positional parameters from the matched regexp ($1, $2,
664 etc.). The result of this expression is used as the folder name.
665
666 Multiple occurrences of the same header are preserved and tested
667 individually against the filters. That means, for example, that
668 filters can search all the Received headers for a certain string.
669
670 It is important to note that all the arrays of the @FILTERS structure
671 are traversed I<in order>. This means the filters can be arranged so
672 that some have priority over others. XXX: get Doug to write an
673 example of having the same header matched more than once.
674
675 Lastly, B<minc> supports a magic ' TO' header. Filters using this
676 header are matched against a collection of headers related to (and
677 including) the To. Which headers to use is determined by a regular
678 expression borrowed from procmail.
679
680 =head1 EXAMPLES
681
682 @FILTERS = (
683
684 ['List-Id',
685 ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
686 ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
687 ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
688 ],
689
690 ['List-Post',
691 ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
692 ],
693
694 ['To',
695 ['root', 'root'],
696 ],
697 );
698
699 The first List-Id filter is a surprisingly effective one which creates
700 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
701 all Sourceforge lists to which i subscribe. Certainly there are lists
702 hosted at Sourceforge which do not work well with this, and this is
703 where it is important to remember that the first match is the one
704 uses. Simply put the more specific rules before this one.
705
706 The next List-Id example is simple. The swig example demonstrates
707 that the folder name does not have to use a portion of the matched
708 header; you can just hardcode the folder name.
709
710 The List-Post example is a nice one. Most ASF projects have their own
711 apache.org subdomain, with mailing lists hosted there. So, given a
712 list such as dev@httpd.apache.org, this filter will create the folder
713 name l/apache/httpd/dev.
714
715 For an example B<spam_check> function, see
716 L<http:E<047>E<047>pretzelnet.orgE<047>cvsE<047>dotfilesE<047>.mincspam>
717
718 =head1 AUTHORS
719
720 Written by Eric Gillespie <epg@pretzelnet.org> with logging code
721 stolen from Adam Lazur <adam@lazur.org>.
722
723 Design by Eric Gillespie and Doug Porter <dsp@waterspout.com>.
724
725 This program is free software; you can redistribute it and/or modify
726 it under the same terms as Perl itself.
727
728 =cut
729
730 # Local variables:
731 # cperl-indent-level: 4
732 # perl-indent-level: 4
733 # indent-tabs-mode: nil
734 # End:
735
736 # vi: set tabstop=4 expandtab: