]> diplodocus.org Git - minc/blob - minc
(filter_mail): Build and return a hash of all folders to which we add
[minc] / minc
1 #! /usr/bin/env perl
2
3 # $Id$
4
5 =head1 NAME
6
7 B<minc> - incorporate mail from a maildir into mh folders
8
9 =head1 SYNOPSIS
10
11 B<minc> [B<-m> I<MAX>] [B<-n>] [B<-p>]
12
13 B<minc> B<-d>
14
15 B<minc> B<-h>
16
17 =head1 DESCRIPTION
18
19 B<minc> incorporates mail from a maildir to a mh folder hierarchy. It
20 takes mail from a maildir folder (not a maildir folder hierarchy),
21 optionally checks for spam with a user-defined spam-checking function,
22 and optionally filters mail into separate mh folders.
23
24 The filtering is quite sophisticated, as it is done using real Perl
25 matching (m//) commands.
26
27 =cut
28
29 use strict;
30 use warnings;
31
32 use Data::Dumper;
33 use Errno;
34 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
35 use FileHandle;
36 use File::Temp qw(tempfile);
37 use File::stat;
38 use Getopt::Long qw(:config gnu_getopt no_ignore_case);
39 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
40 use Pod::Usage;
41
42 our $VERSION = 1;
43
44 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
45 # against headers matching this regex (taken from procmail).
46 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
47 my $MAGIC_TO_TOKEN = ' TO';
48
49 # List of SPAM message numbers, scanned at the end so the user can
50 # check for false positives.
51 my @SPAM;
52
53 =head1 OPTIONS
54
55 =over 4
56
57 =item B<-d>
58
59 Dump (using Data::Dumper) the FILTERS list and exit. This is useful
60 for testing the syntax of .mincfilter.
61
62 =item B<-h>
63
64 Show help.
65
66 =item B<-m> I<MAX>
67
68 Stop processing after I<MAX> messages.
69
70 =item B<-n>
71
72 Dry run; do not actually incorporate the mail, but log and report to
73 stdout/stderr as normal.
74
75 =item B<-p>
76
77 Print the filename of each message before checking it for spam. This
78 can be handy if a particular message is giving the spam checker a
79 problem.
80
81 =back
82
83 =cut
84
85 my $dumpfilters;
86 my $help;
87 my $maxmsgs;
88 my $norun;
89 my $printfilenames;
90
91 GetOptions(
92 'd' => \$dumpfilters,
93 'h|help' => \$help,
94 'm=i' => \$maxmsgs,
95 'n' => \$norun,
96 'p' => \$printfilenames,
97 ) or pod2usage();
98 $help and pod2usage(-exitstatus=>0, -verbose=>1);
99 @ARGV == 0 or pod2usage();
100
101 our $run = !$norun;
102
103 =head1 ENVIRONMENT
104
105 =over 4
106
107 =item HOME
108
109 Where configuration files (.mincfilter) are found. Also,
110 $HOME/Maildir is used for the maildir if MAILDIR is not set.
111
112 =item MAILDIR
113
114 Where mail is delivered.
115
116 =back
117
118 =cut
119
120 use Env qw(COLUMNS HOME MAILDIR);
121
122 $COLUMNS ||= 80;
123
124 if (not $HOME) {
125 die("HOME environment variable must be set.\n");
126 }
127 if (not $MAILDIR) {
128 $MAILDIR = "$HOME/Maildir";
129 }
130
131 =head1 FILES
132
133 =over 4
134
135 =item $HOME/.mincfilter
136
137 This file is Perl code (included via the 'require' directive) which is
138 expected to define the FILTERS list.
139
140 =item $HOME/.mincspam
141
142 If this file exists, B<minc> will include it with the expectation that
143 it will define a B<spam_check> function. This function takes a
144 message filename as an argument and returns 1 if the message is spam,
145 else 0. If this file does not exist, B<minc> will define a simple
146 function that always returns 0.
147
148 One of B<minc>'s global variables is available to the user-defined
149 B<spam_check> function: $run. This boolean should be honored;
150 B<spam_check> should only take real action (i.e. removing or creating
151 files, running external programs, etc.) if $run is non-zero.
152
153 This file may also declare two other functions: B<spam_start_hook> and
154 B<spam_stop_hook>. The former is passed no arguments and is expected
155 to return a list. This list is a "baton" that will also be passed to
156 B<spam_check> and B<spam_stop_hook>. It can hold anything
157 B<spam_check> will need to do its job, whether network sockets, pipes,
158 or whatever.
159
160 XXX: need more details about the spam-handling process; for now read
161 the code.
162
163 =item `mhpath +`/logs/minc.log
164
165 Where B<minc> logs what it does, unless in -n mode.
166
167 =item `mhpath +`/logs/dryrun.log
168
169 Where B<minc> logs what it would do; used in -n mode.
170
171 =item `mhpath +`/.minc.context
172
173 B<minc> uses this file for context (i.e. current folder) instead of
174 `mhpath +`/context. This allows you some amount of freedom to use mh
175 while B<minc> is running. For example, you can changed folders without
176 causing a message to be stored in the wrong folder. Care must still
177 be taken, however, about the unseen sequence; if you change it
178 (whether via show, or mark, or anything else) while in the same folder
179 as B<minc>, it is likely the new message B<minc> stores will not end up
180 marked unseen.
181
182 =back
183
184 =cut
185
186 our @FILTERS;
187 our (@start_hooks, @stop_hooks, @filter_hooks, @post_store_hooks);
188
189 my $mh;
190 my $logfile;
191
192 $mh = `mhpath +`;
193 chomp($mh);
194
195 if ($run) {
196 $logfile = $mh . '/logs/minc.log';
197 } else {
198 $logfile = $mh . '/logs/dryrun.log';
199 }
200
201 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
202
203 \f
204 ###############################################################################
205 # Logging
206
207 sub mylog {
208 my $timestamp = strftime('%b %e %H:%M:%S', localtime);
209 my $msg;
210 foreach my $part (@_) {
211 if (defined($part)) {
212 $msg .= $part;
213 }
214 }
215 # no newlines in the log message, thanks
216 $msg =~ s/\n/ /gm;
217
218 open(LOG, ">>$logfile") or die("open(>>$logfile): $!");
219 print(LOG "$timestamp $msg\n") or die("print(>>$logfile): $!");
220 close(LOG) or die("close($logfile): $!");
221 }
222
223 sub logheader {
224 my ($text, @contents) = @_;
225 my $last;
226
227 if (@contents) {
228 $last = $contents[-1];
229 } else {
230 $last = '';
231 }
232
233 mylog('<< ', $text, $last);
234 }
235
236 sub log_headers {
237 my %headers = @_;
238
239 # For an explanation of the %headers structure, see the
240 # get_headers function below.
241 logheader('From: ', @{$headers{'return-path'}});
242 logheader('To: ', @{$headers{'to'}});
243 logheader('Subject: ', @{$headers{'subject'}});
244 logheader('Message-Id: ', @{$headers{'message-id'}});
245 }
246
247 \f
248 ###############################################################################
249 # Utility procedures
250
251 sub mkfolder {
252 my $folder = shift;
253 my $target;
254 my $component;
255
256 $target = $mh;
257 foreach $component (split('/', $folder)) {
258 $target = join('/', $target, $component);
259 -d $target or mkdir($target) or die("mkdir($target): $!");
260 }
261 }
262
263 sub getfiles {
264 my $dir = shift;
265 my @result;
266
267 if (not opendir(DIR, $dir)) {
268 die("opendir($dir): $!");
269 }
270
271 # Initialize $! to 0 (success) because stupid stupid Perl provides
272 # no way to distinguish an error from an empty directory; that
273 # means setting $! to 0 and testing it afterwards is the only way
274 # to detect an error. Real Programmers don't handle errors,
275 # right? >sigh<
276 $! = 0;
277 @result = readdir(DIR);
278 if ($! != 0) {
279 die("readdir($dir): $!");
280 }
281
282 if (@result <= 2) {
283 exit(0);
284 }
285
286 STDOUT->autoflush(1);
287 print(@result - 2, " messages...");
288
289 closedir(DIR);
290
291 return @result;
292 }
293
294 sub get_highest_msgnum {
295 my $mhfolder = shift;
296 my $dir;
297 my @list;
298 my $highest;
299 my $msgnum;
300
301 $dir = "$mh/$mhfolder";
302 if (not opendir(DIR, $dir)) {
303 die("opendir($dir): $!");
304 }
305
306 # Insert rant from getfiles here.
307 $! = 0;
308 @list = readdir(DIR);
309
310 if ($! != 0) {
311 die("readdir($dir): $!");
312 }
313
314 closedir(DIR);
315
316 $highest = 0;
317 foreach $msgnum (@list) {
318 # Look for integers.
319 if ($msgnum =~ /^[0-9]+$/) {
320 if ($msgnum > $highest) {
321 $highest = $msgnum;
322 }
323 }
324 }
325
326 return $highest;
327 }
328
329 sub store_message {
330 my $msg = shift;
331 my $mhfolder = shift;
332 my $msgnum;
333 my $try;
334 my $mhmsg;
335 my $status;
336
337 # We must do this even in -n mode because later steps fail without
338 # it. This should be harmless.
339 mkfolder($mhfolder);
340
341 # This loop is a modified version of the maildir delivery algorithm.
342 $msgnum = get_highest_msgnum($mhfolder);
343 for ($try = 0; ; $try++) {
344 $msgnum++;
345 $mhmsg = "$mh/$mhfolder/$msgnum";
346
347 if (not stat($mhmsg)) {
348 if ($!{ENOENT}) {
349 # Now we have a non-existent file, let's try to create
350 # it. We must create a zero-byte file first because a
351 # file my appear between our happy stat results and
352 # our later rename(2), which would clobber said file.
353 # So attempt to create a file with this name. If it
354 # succeeds, in just a bit here we'll knowingly clobber
355 # this file with the rename(2) call.
356
357 # Another way to do this is not to use rename(2), but
358 # use link(2) + unlink(2) instead. That's how the
359 # standard maildir algorithm does it. Each method has
360 # a disadvantage: the program may crash between the
361 # link(2) and unlink(2) calls. With the standard
362 # maildir algorithm, that means the message will end
363 # up duplicated. The advantage of creating an empty
364 # file followed by rename(2) is that an extra empty
365 # file is left behind as opposed to a duplicate
366 # message. This is more easily detected by the user.
367 if ($run) {
368 if (sysopen(MSG, $mhmsg,
369 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
370 close(MSG);
371 last;
372 }
373 } else {
374 last;
375 }
376 }
377 }
378
379 # This algorithm is different from the maildir one; let's make
380 # 10 tries instead of 3.
381 if ($try == 9) {
382 die("Attempted filename $mhmsg exists.");
383 }
384
385 # This algorithm is different; i don't think we need to sleep.
386 #sleep(2);
387 }
388
389 if ($mhfolder ne 'SPAM') {
390 mylog('+', $mhfolder);
391 }
392
393 if ($run) {
394 if (not rename($msg, $mhmsg)) {
395 die("rename($msg, $mhmsg): $!");
396 }
397
398 # Mark each message as soon as we store it and bomb if that
399 # fails. While it is slow, it is not safe to store multiple
400 # messages and then have a failure before marking some (or
401 # all).
402 if ($mhfolder eq 'SPAM') {
403 push(@SPAM, $msgnum);
404 } else {
405 $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
406 'unseen', '-add');
407 # XXX need to handle signalled and stopped, and print
408 # the exit code or signal number.
409 if (not WIFEXITED($status)) {
410 die("Failed to run mark");
411 } elsif (WEXITSTATUS($status) != 0) {
412 die("Failed to mark message unseen.");
413 }
414 }
415 }
416
417 return $msgnum;
418 }
419
420 # Parse a message file into a structure describing the headers. The
421 # structure is a hash of arrays. The hash keys are the names of the
422 # headers found in the message, made all lower-case. Each item in
423 # the hash is an array of header text. The array has one element
424 # per occurrence of the header. Most headers will only have a single
425 # element because they occur only once. The Received header is the
426 # most common header with multiple texts.
427 sub get_headers {
428 my $msg = shift;
429 my %headers;
430 my $current; # current header, used for unfolding lines
431 my $fieldname; # unmolested header name
432 my $contents; # contents of header
433
434 open(MSG, $msg) or die("open(MSG, $msg): $!");
435 while (<MSG>) {
436 chomp;
437 if (length == 0) {
438 last;
439 }
440
441 if (/^\s/) {
442 # folded header continuation
443
444 if (not defined($current)) {
445 warn('Malformed message, cannot parse headers.');
446 next;
447 }
448
449 @{$headers{$current}}[-1] .= $_;
450 } else {
451 ($fieldname) = split(/:/);
452 $current = lc($fieldname);
453 (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
454
455 if (defined($headers{$current})) {
456 # multiple occurence of same header
457 push(@{$headers{$current}}, $contents);
458 } else {
459 # first occurence of a header
460 $headers{$current} = [$contents];
461 }
462 }
463 }
464 close(MSG);
465
466 return %headers;
467 }
468
469 \f
470 ###############################################################################
471 # Filtering
472
473 sub find_mh_folder {
474 my $msg = shift;
475 my %headers = @_;
476 my $filterref;
477 my @filter;
478 my $header;
479 my $contents;
480 my $pair;
481 my $match;
482 my $expression;
483 my $result;
484
485 if (not %headers) {
486 return 'malformed';
487 }
488
489 # Walk the list of filters. This structure is documented in
490 # pod at the end of the program.
491 foreach $filterref (@FILTERS) {
492 @filter = @$filterref;
493 $header = shift(@filter);
494
495 # Handle filters using the magic TO header specially.
496 if ($header eq $MAGIC_TO_TOKEN) {
497 foreach $header (keys(%headers)) {
498 if ($header =~ /$MAGIC_TO_REGEX/i) {
499 foreach $contents (@{$headers{lc($header)}}) {
500 foreach $pair (@filter) {
501 ($match, $expression) = @$pair;
502 if ($contents =~ /$match/) {
503 return $expression;
504 }
505 }
506 }
507 }
508 }
509
510 # Now that it's been processed specially, skip normal handling.
511 next;
512 }
513
514 # Walk the list of message headers matching the filter's
515 # specified header.
516 foreach $contents (@{$headers{lc($header)}}) {
517 # Walk the filter's list of match/expression pairs.
518 foreach $pair (@filter) {
519 ($match, $expression) = @$pair;
520 if ($contents =~ /$match/i) {
521 if (eval "\$result = \"$expression\"") {
522 return $result;
523 }
524 }
525 }
526 }
527 }
528
529 return 'inbox';
530 }
531
532 sub filter_mail {
533 my $msgcount = @_ - 2; # don't count . and ..
534 my $len = length($msgcount);
535 my @baton;
536 my $msg;
537 my $mhfolder;
538 my $spam = 0;
539 my $saved = 0;
540 my $msgnum;
541 my %folders;
542
543 # XXX lame names and hard-coded proportions.
544 my $nf = int($COLUMNS * 0.1);
545 my $nm = int($COLUMNS * 0.0625);
546 my $nF = int($COLUMNS * 0.175);
547 my $ns = $COLUMNS - $nf - $nm - $nF - 3;
548
549 if (-f "$HOME/.minc") {
550 require "$HOME/.minc";
551 }
552
553 my %batons;
554 for my $hook (@start_hooks) {
555 my ($handle, @baton) = $hook->();
556 if (defined($handle)) {
557 $batons{$handle} = [@baton];
558 }
559 }
560
561 for $msg (@_) {
562 ($msg eq '.' or $msg eq '..') and next;
563
564 if ($printfilenames) {
565 print("$msg\n");
566 }
567
568 my %headers = get_headers($msg);
569 log_headers(%headers);
570
571 undef($mhfolder);
572 for my $hook (@filter_hooks) {
573 my $result = $hook->(\%batons, \%headers, $msg);
574 defined($result) and ($mhfolder = $result);
575 }
576
577 defined($mhfolder) or ($mhfolder = find_mh_folder($msg, %headers));
578
579 $msgnum = store_message($msg, $mhfolder);
580 $folders{$mhfolder}++;
581
582 if ($mhfolder eq 'SPAM') {
583 $spam++;
584 } else {
585 $saved++;
586 my $from = [@{$headers{'from'}}]->[-1];
587 my $subject = [@{$headers{'subject'}}]->[-1];
588 for ($from, $subject) {
589 tr/\x00-\x1f\x80-\xff/ /;
590 }
591 print("\r");
592 print(' ' x $COLUMNS);
593 printf("\r\%-${nf}s \%${nm}d \%-${nF}s \%s\n",
594 substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm),
595 # XXX shouldn't pop, as these are about to be
596 # passed to post_store_hooks
597 substr($from, 0, $nF),
598 substr($subject, 0, $ns));
599 }
600
601 for my $hook (@post_store_hooks) {
602 $hook->(\%batons, \%headers, $mhfolder, $msgnum);
603 }
604
605 print("\r");
606 printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d",
607 $spam, $saved, $spam + $saved, $msgcount);
608
609 defined($maxmsgs) and ($spam + $saved < $maxmsgs or last);
610 }
611 print("\n");
612
613 for my $hook (@stop_hooks) {
614 $hook->(\%batons);
615 }
616
617 return %folders;
618 }
619
620 sub update_folders {
621 my $folders = shift;
622 my ($fh, $fn);
623
624 if (not open($fh, "$mh/.folders")) {
625 warn("open($mh/.folders): $!");
626 return;
627 }
628
629 while (<$fh>) {
630 chomp;
631 $folders->{$_}++;
632 }
633
634 eval { ($fh, $fn) = tempfile("$mh/.folders.XXXXX") };
635 if ($@) {
636 warn("$@");
637 return;
638 }
639
640 for my $folder (sort(keys(%$folders))) {
641 print($fh "$folder\n");
642 }
643
644 if (not close($fh)) {
645 warn("close($fn): $!");
646 unlink($fn) or warn("unlink($fn): $!");
647 return;
648 }
649
650 rename($fn, "$mh/.folders") or warn("rename($fn, $mh/.folders): $!");
651 }
652
653 \f
654 MAIN: {
655 my $st;
656
657 if ($dumpfilters) {
658 require "$HOME/.minc";
659 $Data::Dumper::Indent = 1;
660 print(Dumper(\@FILTERS));
661 exit;
662 }
663
664 chdir("$MAILDIR/new") or die("chdir($MAILDIR/new): $!");
665 my %folders = filter_mail(map { $_->[1] }
666 sort { $a->[0] <=> $b->[0] }
667 map {
668 if (not ($st = stat($_))) {
669 die("stat($_): $!");
670 }
671 [$st->mtime, $_]
672 }
673 getfiles('.'));
674
675 update_folders(\%folders);
676
677 @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
678 }
679
680 \f
681 __END__
682
683 =head1 THE FILTERS STRUCTURE
684
685 The user's .mincfilter file must define the @FILTERS structure. This
686 structure is an array. Each element of @FILTERS is a filter. A
687 filter is itself an array. The first element of a filter is a string,
688 the name of the header this filter acts upon. The header name is not
689 case-sensitive. Each subsequent element of a filter is a pair (i.e. a
690 two-element array): first, a regular expression B<minc> uses to
691 determine whether this filter matches or not, and second, an
692 expression which B<minc> evaluates to get the folder name.
693
694 B<minc> decides where to store a message by iterating over the
695 @FILTERS array. It tests each regexp of each filter against all
696 headers matching that filter's specified header. As soon as a match
697 is found, B<minc> evaluates the second part of the pair. This part
698 may contain positional parameters from the matched regexp ($1, $2,
699 etc.). The result of this expression is used as the folder name.
700
701 Multiple occurrences of the same header are preserved and tested
702 individually against the filters. That means, for example, that
703 filters can search all the Received headers for a certain string.
704
705 It is important to note that all the arrays of the @FILTERS structure
706 are traversed I<in order>. This means the filters can be arranged so
707 that some have priority over others. XXX: get Doug to write an
708 example of having the same header matched more than once.
709
710 Lastly, B<minc> supports a magic ' TO' header. Filters using this
711 header are matched against a collection of headers related to (and
712 including) the To. Which headers to use is determined by a regular
713 expression borrowed from procmail.
714
715 =head1 EXAMPLES
716
717 @FILTERS = (
718
719 ['List-Id',
720 ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
721 ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
722 ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
723 ],
724
725 ['List-Post',
726 ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
727 ],
728
729 ['To',
730 ['root', 'root'],
731 ],
732 );
733
734 The first List-Id filter is a surprisingly effective one which creates
735 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
736 all Sourceforge lists to which i subscribe. Certainly there are lists
737 hosted at Sourceforge which do not work well with this, and this is
738 where it is important to remember that the first match is the one
739 uses. Simply put the more specific rules before this one.
740
741 The next List-Id example is simple. The swig example demonstrates
742 that the folder name does not have to use a portion of the matched
743 header; you can just hardcode the folder name.
744
745 The List-Post example is a nice one. Most ASF projects have their own
746 apache.org subdomain, with mailing lists hosted there. So, given a
747 list such as dev@httpd.apache.org, this filter will create the folder
748 name l/apache/httpd/dev.
749
750 For an example B<spam_check> function, see
751 L<http://pretzelnet.org/cvs/dotfiles/.mincspam>
752
753 =head1 AUTHORS
754
755 Written by Eric Gillespie <epg@pretzelnet.org>. Design by Eric
756 Gillespie and Doug Porter <dsp@waterspout.com>.
757
758 This program is free software; you can redistribute it and/or modify
759 it under the same terms as Perl itself.
760
761 =cut
762
763 # Local variables:
764 # cperl-indent-level: 4
765 # perl-indent-level: 4
766 # indent-tabs-mode: nil
767 # End:
768
769 # vi: set tabstop=4 expandtab: