]> diplodocus.org Git - minc/blob - minc
(filter_mail): Don't move @_ to @msglist, just use @_.
[minc] / minc
1 #! /usr/bin/env perl
2
3 # $Id$
4
5 =head1 NAME
6
7 B<minc> - incorporate mail from a maildir into mh folders
8
9 =head1 SYNOPSIS
10
11 B<minc> [B<-m> I<MAX>] [B<-n>] [B<-p>]
12
13 B<minc> B<-d>
14
15 B<minc> B<-h>
16
17 =head1 DESCRIPTION
18
19 B<minc> incorporates mail from a maildir to a mh folder hierarchy. It
20 takes mail from a maildir folder (not a maildir folder hierarchy),
21 optionally checks for spam with a user-defined spam-checking function,
22 and optionally filters mail into separate mh folders.
23
24 The filtering is quite sophisticated, as it is done using real Perl
25 matching (m//) commands.
26
27 =cut
28
29 use strict;
30 use warnings;
31
32 use Data::Dumper;
33 use Errno;
34 use Fcntl qw(O_WRONLY O_EXCL O_CREAT);
35 use FileHandle;
36 use File::stat;
37 use Getopt::Long qw(:config gnu_getopt no_ignore_case);
38 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
39 use Pod::Usage;
40
41 our $VERSION = 1;
42
43 # If a filter set's header is $MAGIC_TO_TOKEN, that set is compared
44 # against headers matching this regex (taken from procmail).
45 my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
46 my $MAGIC_TO_TOKEN = ' TO';
47
48 # List of SPAM message numbers, scanned at the end so the user can
49 # check for false positives.
50 my @SPAM;
51
52 =head1 OPTIONS
53
54 =over 4
55
56 =item B<-d>
57
58 Dump (using Data::Dumper) the FILTERS list and exit. This is useful
59 for testing the syntax of .mincfilter.
60
61 =item B<-h>
62
63 Show help.
64
65 =item B<-m> I<MAX>
66
67 Stop processing after I<MAX> messages.
68
69 =item B<-n>
70
71 Dry run; do not actually incorporate the mail, but log and report to
72 stdout/stderr as normal.
73
74 =item B<-p>
75
76 Print the filename of each message before checking it for spam. This
77 can be handy if a particular message is giving the spam checker a
78 problem.
79
80 =back
81
82 =cut
83
84 my $dumpfilters;
85 my $help;
86 my $maxmsgs;
87 my $norun;
88 my $printfilenames;
89
90 GetOptions(
91 'd' => \$dumpfilters,
92 'h|help' => \$help,
93 'm=i' => \$maxmsgs,
94 'n' => \$norun,
95 'p' => \$printfilenames,
96 ) or pod2usage();
97 $help and pod2usage(-exitstatus=>0, -verbose=>1);
98 @ARGV == 0 or pod2usage();
99
100 our $run = !$norun;
101
102 =head1 ENVIRONMENT
103
104 =over 4
105
106 =item HOME
107
108 Where configuration files (.mincfilter) are found. Also,
109 $HOME/Maildir is used for the maildir if MAILDIR is not set.
110
111 =item MAILDIR
112
113 Where mail is delivered.
114
115 =back
116
117 =cut
118
119 use Env qw(COLUMNS HOME MAILDIR);
120
121 $COLUMNS ||= 80;
122
123 if (not $HOME) {
124 die("HOME environment variable must be set.\n");
125 }
126 if (not $MAILDIR) {
127 $MAILDIR = "$HOME/Maildir";
128 }
129
130 =head1 FILES
131
132 =over 4
133
134 =item $HOME/.mincfilter
135
136 This file is Perl code (included via the 'require' directive) which is
137 expected to define the FILTERS list.
138
139 =item $HOME/.mincspam
140
141 If this file exists, B<minc> will include it with the expectation that
142 it will define a B<spam_check> function. This function takes a
143 message filename as an argument and returns 1 if the message is spam,
144 else 0. If this file does not exist, B<minc> will define a simple
145 function that always returns 0.
146
147 One of B<minc>'s global variables is available to the user-defined
148 B<spam_check> function: $run. This boolean should be honored;
149 B<spam_check> should only take real action (i.e. removing or creating
150 files, running external programs, etc.) if $run is non-zero.
151
152 This file may also declare two other functions: B<spam_start_hook> and
153 B<spam_stop_hook>. The former is passed no arguments and is expected
154 to return a list. This list is a "baton" that will also be passed to
155 B<spam_check> and B<spam_stop_hook>. It can hold anything
156 B<spam_check> will need to do its job, whether network sockets, pipes,
157 or whatever.
158
159 XXX: need more details about the spam-handling process; for now read
160 the code.
161
162 =item `mhpath +`/logs/minc.log
163
164 Where B<minc> logs what it does, unless in -n mode.
165
166 =item `mhpath +`/logs/dryrun.log
167
168 Where B<minc> logs what it would do; used in -n mode.
169
170 =item `mhpath +`/.minc.context
171
172 B<minc> uses this file for context (i.e. current folder) instead of
173 `mhpath +`/context. This allows you some amount of freedom to use mh
174 while B<minc> is running. For example, you can changed folders without
175 causing a message to be stored in the wrong folder. Care must still
176 be taken, however, about the unseen sequence; if you change it
177 (whether via show, or mark, or anything else) while in the same folder
178 as B<minc>, it is likely the new message B<minc> stores will not end up
179 marked unseen.
180
181 =back
182
183 =cut
184
185 our @FILTERS;
186 our (@start_hooks, @stop_hooks, @filter_hooks, @post_store_hooks);
187
188 my $mh;
189 my $logfile;
190
191 $mh = `mhpath +`;
192 chomp($mh);
193
194 if ($run) {
195 $logfile = $mh . '/logs/minc.log';
196 } else {
197 $logfile = $mh . '/logs/dryrun.log';
198 }
199
200 $ENV{"MHCONTEXT"} = $mh . '/.minc.context';
201
202 \f
203 ###############################################################################
204 # Logging
205
206 sub mylog {
207 my $timestamp = strftime('%b %e %H:%M:%S', localtime);
208 my $msg;
209 foreach my $part (@_) {
210 if (defined($part)) {
211 $msg .= $part;
212 }
213 }
214 # no newlines in the log message, thanks
215 $msg =~ s/\n/ /gm;
216
217 open(LOG, ">>$logfile") or die("open(>>$logfile): $!");
218 print(LOG "$timestamp $msg\n") or die("print(>>$logfile): $!");
219 close(LOG) or die("close($logfile): $!");
220 }
221
222 sub logheader {
223 my ($text, @contents) = @_;
224 my $last;
225
226 if (@contents) {
227 $last = $contents[-1];
228 } else {
229 $last = '';
230 }
231
232 mylog('<< ', $text, $last);
233 }
234
235 sub log_headers {
236 my %headers = @_;
237
238 # For an explanation of the %headers structure, see the
239 # get_headers function below.
240 logheader('From: ', @{$headers{'return-path'}});
241 logheader('To: ', @{$headers{'to'}});
242 logheader('Subject: ', @{$headers{'subject'}});
243 logheader('Message-Id: ', @{$headers{'message-id'}});
244 }
245
246 \f
247 ###############################################################################
248 # Utility procedures
249
250 sub mkfolder {
251 my $folder = shift;
252 my $target;
253 my $component;
254
255 $target = $mh;
256 foreach $component (split('/', $folder)) {
257 $target = join('/', $target, $component);
258 -d $target or mkdir($target) or die("mkdir($target): $!");
259 }
260 }
261
262 sub getfiles {
263 my $dir = shift;
264 my @result;
265
266 if (not opendir(DIR, $dir)) {
267 die("opendir($dir): $!");
268 }
269
270 # Initialize $! to 0 (success) because stupid stupid Perl provides
271 # no way to distinguish an error from an empty directory; that
272 # means setting $! to 0 and testing it afterwards is the only way
273 # to detect an error. Real Programmers don't handle errors,
274 # right? >sigh<
275 $! = 0;
276 @result = readdir(DIR);
277 if ($! != 0) {
278 die("readdir($dir): $!");
279 }
280
281 if (@result <= 2) {
282 exit(0);
283 }
284
285 STDOUT->autoflush(1);
286 print(@result - 2, " messages...");
287
288 closedir(DIR);
289
290 return @result;
291 }
292
293 sub get_highest_msgnum {
294 my $mhfolder = shift;
295 my $dir;
296 my @list;
297 my $highest;
298 my $msgnum;
299
300 $dir = "$mh/$mhfolder";
301 if (not opendir(DIR, $dir)) {
302 die("opendir($dir): $!");
303 }
304
305 # Insert rant from getfiles here.
306 $! = 0;
307 @list = readdir(DIR);
308
309 if ($! != 0) {
310 die("readdir($dir): $!");
311 }
312
313 closedir(DIR);
314
315 $highest = 0;
316 foreach $msgnum (@list) {
317 # Look for integers.
318 if ($msgnum =~ /^[0-9]+$/) {
319 if ($msgnum > $highest) {
320 $highest = $msgnum;
321 }
322 }
323 }
324
325 return $highest;
326 }
327
328 sub store_message {
329 my $msg = shift;
330 my $mhfolder = shift;
331 my $msgnum;
332 my $try;
333 my $mhmsg;
334 my $status;
335
336 # We must do this even in -n mode because later steps fail without
337 # it. This should be harmless.
338 mkfolder($mhfolder);
339
340 # This loop is a modified version of the maildir delivery algorithm.
341 $msgnum = get_highest_msgnum($mhfolder);
342 for ($try = 0; ; $try++) {
343 $msgnum++;
344 $mhmsg = "$mh/$mhfolder/$msgnum";
345
346 if (not stat($mhmsg)) {
347 if ($!{ENOENT}) {
348 # Now we have a non-existent file, let's try to create
349 # it. We must create a zero-byte file first because a
350 # file my appear between our happy stat results and
351 # our later rename(2), which would clobber said file.
352 # So attempt to create a file with this name. If it
353 # succeeds, in just a bit here we'll knowingly clobber
354 # this file with the rename(2) call.
355
356 # Another way to do this is not to use rename(2), but
357 # use link(2) + unlink(2) instead. That's how the
358 # standard maildir algorithm does it. Each method has
359 # a disadvantage: the program may crash between the
360 # link(2) and unlink(2) calls. With the standard
361 # maildir algorithm, that means the message will end
362 # up duplicated. The advantage of creating an empty
363 # file followed by rename(2) is that an extra empty
364 # file is left behind as opposed to a duplicate
365 # message. This is more easily detected by the user.
366 if ($run) {
367 if (sysopen(MSG, $mhmsg,
368 O_WRONLY | O_EXCL | O_CREAT, 0600)) {
369 close(MSG);
370 last;
371 }
372 } else {
373 last;
374 }
375 }
376 }
377
378 # This algorithm is different from the maildir one; let's make
379 # 10 tries instead of 3.
380 if ($try == 9) {
381 die("Attempted filename $mhmsg exists.");
382 }
383
384 # This algorithm is different; i don't think we need to sleep.
385 #sleep(2);
386 }
387
388 if ($mhfolder ne 'SPAM') {
389 mylog('+', $mhfolder);
390 }
391
392 if ($run) {
393 if (not rename($msg, $mhmsg)) {
394 die("rename($msg, $mhmsg): $!");
395 }
396
397 # Mark each message as soon as we store it and bomb if that
398 # fails. While it is slow, it is not safe to store multiple
399 # messages and then have a failure before marking some (or
400 # all).
401 if ($mhfolder eq 'SPAM') {
402 push(@SPAM, $msgnum);
403 } else {
404 $status = system('mark', "+$mhfolder", "$msgnum", '-sequence',
405 'unseen', '-add');
406 # XXX need to handle signalled and stopped, and print
407 # the exit code or signal number.
408 if (not WIFEXITED($status)) {
409 die("Failed to run mark");
410 } elsif (WEXITSTATUS($status) != 0) {
411 die("Failed to mark message unseen.");
412 }
413 }
414 }
415
416 return $msgnum;
417 }
418
419 # Parse a message file into a structure describing the headers. The
420 # structure is a hash of arrays. The hash keys are the names of the
421 # headers found in the message, made all lower-case. Each item in
422 # the hash is an array of header text. The array has one element
423 # per occurrence of the header. Most headers will only have a single
424 # element because they occur only once. The Received header is the
425 # most common header with multiple texts.
426 sub get_headers {
427 my $msg = shift;
428 my %headers;
429 my $current; # current header, used for unfolding lines
430 my $fieldname; # unmolested header name
431 my $contents; # contents of header
432
433 open(MSG, $msg) or die("open(MSG, $msg): $!");
434 while (<MSG>) {
435 chomp;
436 if (length == 0) {
437 last;
438 }
439
440 if (/^\s/) {
441 # folded header continuation
442
443 if (not defined($current)) {
444 warn('Malformed message, cannot parse headers.');
445 next;
446 }
447
448 @{$headers{$current}}[-1] .= $_;
449 } else {
450 ($fieldname) = split(/:/);
451 $current = lc($fieldname);
452 (undef, $contents) = split(/^\Q$fieldname\E:\s*/);
453
454 if (defined($headers{$current})) {
455 # multiple occurence of same header
456 push(@{$headers{$current}}, $contents);
457 } else {
458 # first occurence of a header
459 $headers{$current} = [$contents];
460 }
461 }
462 }
463 close(MSG);
464
465 return %headers;
466 }
467
468 \f
469 ###############################################################################
470 # Filtering
471
472 sub find_mh_folder {
473 my $msg = shift;
474 my %headers = @_;
475 my $filterref;
476 my @filter;
477 my $header;
478 my $contents;
479 my $pair;
480 my $match;
481 my $expression;
482 my $result;
483
484 if (not %headers) {
485 return 'malformed';
486 }
487
488 # Walk the list of filters. This structure is documented in
489 # pod at the end of the program.
490 foreach $filterref (@FILTERS) {
491 @filter = @$filterref;
492 $header = shift(@filter);
493
494 # Handle filters using the magic TO header specially.
495 if ($header eq $MAGIC_TO_TOKEN) {
496 foreach $header (keys(%headers)) {
497 if ($header =~ /$MAGIC_TO_REGEX/i) {
498 foreach $contents (@{$headers{lc($header)}}) {
499 foreach $pair (@filter) {
500 ($match, $expression) = @$pair;
501 if ($contents =~ /$match/) {
502 return $expression;
503 }
504 }
505 }
506 }
507 }
508
509 # Now that it's been processed specially, skip normal handling.
510 next;
511 }
512
513 # Walk the list of message headers matching the filter's
514 # specified header.
515 foreach $contents (@{$headers{lc($header)}}) {
516 # Walk the filter's list of match/expression pairs.
517 foreach $pair (@filter) {
518 ($match, $expression) = @$pair;
519 if ($contents =~ /$match/i) {
520 if (eval "\$result = \"$expression\"") {
521 return $result;
522 }
523 }
524 }
525 }
526 }
527
528 return 'inbox';
529 }
530
531 sub filter_mail {
532 my $msgcount = @_ - 2; # don't count . and ..
533 my $len = length($msgcount);
534 my @baton;
535 my $msg;
536 my $mhfolder;
537 my $spam = 0;
538 my $saved = 0;
539 my $msgnum;
540
541 # XXX lame names and hard-coded proportions.
542 my $nf = int($COLUMNS * 0.1);
543 my $nm = int($COLUMNS * 0.0625);
544 my $nF = int($COLUMNS * 0.175);
545 my $ns = $COLUMNS - $nf - $nm - $nF - 3;
546
547 if (-f "$HOME/.minc") {
548 require "$HOME/.minc";
549 }
550
551 my %batons;
552 for my $hook (@start_hooks) {
553 my ($handle, @baton) = $hook->();
554 if (defined($handle)) {
555 $batons{$handle} = [@baton];
556 }
557 }
558
559 for $msg (@_) {
560 ($msg eq '.' or $msg eq '..') and next;
561
562 if ($printfilenames) {
563 print("$msg\n");
564 }
565
566 my %headers = get_headers($msg);
567 log_headers(%headers);
568
569 undef($mhfolder);
570 for my $hook (@filter_hooks) {
571 my $result = $hook->(\%batons, \%headers, $msg);
572 defined($result) and ($mhfolder = $result);
573 }
574
575 defined($mhfolder) or ($mhfolder = find_mh_folder($msg, %headers));
576
577 $msgnum = store_message($msg, $mhfolder);
578
579 if ($mhfolder eq 'SPAM') {
580 $spam++;
581 } else {
582 $saved++;
583 my $from = [@{$headers{'from'}}]->[-1];
584 my $subject = [@{$headers{'subject'}}]->[-1];
585 for ($from, $subject) {
586 tr/\x00-\x1f\x80-\xff/ /;
587 }
588 print("\r");
589 print(' ' x $COLUMNS);
590 printf("\r\%-${nf}s \%${nm}d \%-${nF}s \%s\n",
591 substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm),
592 # XXX shouldn't pop, as these are about to be
593 # passed to post_store_hooks
594 substr($from, 0, $nF),
595 substr($subject, 0, $ns));
596 }
597
598 for my $hook (@post_store_hooks) {
599 $hook->(\%batons, \%headers, $mhfolder, $msgnum);
600 }
601
602 print("\r");
603 printf(" \%${len}d SPAM \%${len}d saved \%${len}d/%1d",
604 $spam, $saved, $spam + $saved, $msgcount);
605
606 defined($maxmsgs) and ($spam + $saved < $maxmsgs or last);
607 }
608 print("\n");
609
610 for my $hook (@stop_hooks) {
611 $hook->(\%batons);
612 }
613 }
614
615 \f
616 MAIN: {
617 my $st;
618
619 if ($dumpfilters) {
620 require "$HOME/.minc";
621 $Data::Dumper::Indent = 1;
622 print(Dumper(\@FILTERS));
623 exit;
624 }
625
626 chdir("$MAILDIR/new") or die("chdir($MAILDIR/new): $!");
627 filter_mail(map { $_->[1] }
628 sort { $a->[0] <=> $b->[0] }
629 map {
630 if (not ($st = stat($_))) {
631 die("stat($_): $!");
632 }
633 [$st->mtime, $_]
634 }
635 getfiles('.'));
636
637 @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
638 }
639
640 \f
641 __END__
642
643 =head1 THE FILTERS STRUCTURE
644
645 The user's .mincfilter file must define the @FILTERS structure. This
646 structure is an array. Each element of @FILTERS is a filter. A
647 filter is itself an array. The first element of a filter is a string,
648 the name of the header this filter acts upon. The header name is not
649 case-sensitive. Each subsequent element of a filter is a pair (i.e. a
650 two-element array): first, a regular expression B<minc> uses to
651 determine whether this filter matches or not, and second, an
652 expression which B<minc> evaluates to get the folder name.
653
654 B<minc> decides where to store a message by iterating over the
655 @FILTERS array. It tests each regexp of each filter against all
656 headers matching that filter's specified header. As soon as a match
657 is found, B<minc> evaluates the second part of the pair. This part
658 may contain positional parameters from the matched regexp ($1, $2,
659 etc.). The result of this expression is used as the folder name.
660
661 Multiple occurrences of the same header are preserved and tested
662 individually against the filters. That means, for example, that
663 filters can search all the Received headers for a certain string.
664
665 It is important to note that all the arrays of the @FILTERS structure
666 are traversed I<in order>. This means the filters can be arranged so
667 that some have priority over others. XXX: get Doug to write an
668 example of having the same header matched more than once.
669
670 Lastly, B<minc> supports a magic ' TO' header. Filters using this
671 header are matched against a collection of headers related to (and
672 including) the To. Which headers to use is determined by a regular
673 expression borrowed from procmail.
674
675 =head1 EXAMPLES
676
677 @FILTERS = (
678
679 ['List-Id',
680 ['<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>', 'l/$1/$2'],
681 ['<([[:graph:]]+?)(-list)?\.freedesktop\.org>', 'l/freedesktop/$1'],
682 ['<swig\.cs\.uchicago\.edu>', 'l/swig'],
683 ],
684
685 ['List-Post',
686 ['<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>', 'l/apache/$2/$1'],
687 ],
688
689 ['To',
690 ['root', 'root'],
691 ],
692 );
693
694 The first List-Id filter is a surprisingly effective one which creates
695 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
696 all Sourceforge lists to which i subscribe. Certainly there are lists
697 hosted at Sourceforge which do not work well with this, and this is
698 where it is important to remember that the first match is the one
699 uses. Simply put the more specific rules before this one.
700
701 The next List-Id example is simple. The swig example demonstrates
702 that the folder name does not have to use a portion of the matched
703 header; you can just hardcode the folder name.
704
705 The List-Post example is a nice one. Most ASF projects have their own
706 apache.org subdomain, with mailing lists hosted there. So, given a
707 list such as dev@httpd.apache.org, this filter will create the folder
708 name l/apache/httpd/dev.
709
710 For an example B<spam_check> function, see
711 L<http://pretzelnet.org/cvs/dotfiles/.mincspam>
712
713 =head1 AUTHORS
714
715 Written by Eric Gillespie <epg@pretzelnet.org>. Design by Eric
716 Gillespie and Doug Porter <dsp@waterspout.com>.
717
718 This program is free software; you can redistribute it and/or modify
719 it under the same terms as Perl itself.
720
721 =cut
722
723 # Local variables:
724 # cperl-indent-level: 4
725 # perl-indent-level: 4
726 # indent-tabs-mode: nil
727 # End:
728
729 # vi: set tabstop=4 expandtab: