]> diplodocus.org Git - minc/blob - minc
Write a couple XXX todo comments.
[minc] / minc
1 #! /usr/local/bin/perl
2
3 # $Id$
4
5 =head1 NAME
6
7 B<minc> - Incorporate mail from a maildir into mh folders.
8
9 =head1 SYNOPSIS
10
11 B<minc> [-B<dhns>]
12
13 =head1 DESCRIPTION
14
15 B<minc> is a program for incorporating mail from a maildir to a mh
16 folder hierarchy. It takes mail from a maildir folder (not a maildir
17 folder hierarchy), checks for spam with razor (XXX crossref), and
18 optionally filters mail into separate mh folders.
19
20 The filtering is quite sophisticated, as it is done using real Perl
21 substitution (s//) commands.
22
23 =cut
24
25 use strict;
26 use warnings;
27
28 require 'sysexits.ph';
29
30 use Data::Dumper;
31 use File::Basename;
32 use Getopt::Std;
33 use Log::Dispatch;
34 use Log::Dispatch::File;
35 use POSIX qw(strftime WEXITSTATUS WIFEXITED);
36
37 =head1 OPTIONS
38
39 =over 4
40
41 =item B<-d>
42
43 Dump (using Data::Dumper) the FILTERS hash and exit. This is useful
44 for testing the syntax of .mincfilter.
45
46 =item B<-h>
47
48 Show help.
49
50 =item B<-n>
51
52 Dry run; do not actually incorporate the mail, but log and report to
53 stdout/stderr as normal.
54
55 =item B<-s>
56
57 Process SPAM only, then exit, leaving all non-spam messages in the
58 maildir.
59
60 =back
61
62 =cut
63
64 my $dumpfilters = 0;
65 my $run = 1;
66 my $spamonly = 0;
67
68 our $opt_d;
69 our $opt_h;
70 our $opt_n;
71 our $opt_s; # ;; # stupid cperl-mode
72
73 if (!getopts('dhns')) {
74 exit(&EX_USAGE);
75 }
76
77 if ($opt_d) {
78 $dumpfilters = 1;
79 } elsif ($opt_h) {
80 print("Sorry bub, no help.\n");
81 exit(&EX_OK);
82 } elsif ($opt_n) {
83 $run = 0;
84 }
85
86 if ($opt_s) { # ))){ # stupid cperl-mode
87 $spamonly = 1;
88 }
89
90 =head1 ENVIRONMENT
91
92 =over 4
93
94 =item HOME
95
96 Where configuration files (.mincfilter) are found. Also,
97 $HOME/Maildir is used for the maildir if MAILDIR is not set.
98
99 =item MAILDIR
100
101 Where mail is delivered.
102
103 =back
104
105 =cut
106
107 use Env qw(HOME MAILDIR);
108
109 if (not $HOME) {
110 die("HOME environment variable must be set.\n");
111 }
112 if (not $MAILDIR) {
113 $MAILDIR = "$HOME/Maildir";
114 }
115
116 =head1 FILES
117
118 =over 4
119
120 =item $HOME/.mincfilter
121
122 This file is Perl code (included via the 'require' directive) which is
123 expected to define the FILTERS hash.
124
125 =item `mhpath +`/logs/minc.log
126
127 Where minc logs what it does, unless in -n mode.
128
129 =item `mhpath +`/logs/dryrun.log
130
131 Where minc logs what it would do; used in -n mode.
132
133 =cut
134
135 our %FILTERS;
136 require "$HOME/.mincfilter";
137
138 my $mh;
139 my $logfile;
140
141 $mh = `mhpath +`;
142 chomp($mh);
143
144 if ($run) {
145 $logfile = $mh . '/logs/minc.log';
146 } else {
147 $logfile = $mh . '/logs/dryrun.log';
148 }
149
150 \f
151 ###############################################################################
152 # Logging
153
154 # debuglevels for the logger
155 use constant LOG_DEBUGLEVEL => 'info';
156 use constant SCREEN_DEBUGLEVEL => 'debug';
157
158 # characters used to wrap around the id field in the log so I can more
159 # easily parse the msg flow
160 use constant DEBUGCHAR => '%';
161 use constant INFOCHAR => '=';
162 use constant INCOMINGCHAR => '<';
163 use constant SAVECHAR => '>';
164 use constant WARNCHAR => 'W';
165 use constant ERRORCHAR => 'E';
166
167 my $logger = new Log::Dispatch;
168 $logger->add(new Log::Dispatch::File (name=>'logfile',
169 filename=>$logfile,
170 min_level=>'info',
171 mode=>'append'));
172
173 # log alias, handles getting passed vars that are undef
174 sub mylog {
175 my $level = shift;
176 my $act = shift;
177 my $timestamp = strftime('%b %e %H:%M:%S', localtime);
178 my $msg;
179 foreach my $part (@_) {
180 if (defined($part)) {
181 $msg .= $part;
182 }
183 }
184 # no newlines in the log message, thanks
185 $msg =~ s/\n/ /gm;
186 $msg = sprintf("%s %s%s %s\n", $timestamp, $act, $act, $msg);
187 $logger->log(level=>$level, message=>$msg);
188
189 if ($act eq SAVECHAR) {
190 $logger->log(level=>$level, message=>"\n");
191 }
192 }
193
194 sub logincoming { mylog('info', INCOMINGCHAR, @_); }
195 sub logsave { mylog('notice', SAVECHAR, @_); }
196 sub loginfo { mylog('info', INFOCHAR, @_); }
197 sub logdebug { mylog('debug', DEBUGCHAR, @_); }
198 sub logwarn { mylog('warning', WARNCHAR, @_); }
199 sub logerr { mylog('error', ERRORCHAR, @_); }
200
201 sub log_headers {
202 my %headers = @_;
203
204 logincoming('From: ', $headers{'return-path'});
205 logincoming('To: ', $headers{'to'});
206 logincoming('Subject: ', $headers{'subject'});
207 logincoming('Message-Id: ', $headers{'message-id'});
208 }
209
210 \f
211 ###############################################################################
212 # Utility procedures
213
214 sub _errprint {
215 printf('%s:%s', __FILE__, __LINE__);
216
217 if (@_) {
218 print(': ');
219 foreach (@_) {
220 print;
221 }
222 }
223 }
224
225 sub err {
226 my $ex = shift;
227
228 _errprint(@_);
229 print(": $!\n");
230
231 exit($ex);
232 }
233
234 sub errx {
235 my $ex = shift;
236
237 _errprint(@_);
238 print("\n");
239
240 exit($ex);
241 }
242
243 sub mkfolder {
244 my $mhfolder = shift;
245 my $folder;
246 my $target;
247 my $component;
248
249 $folder = $mh . '/' . $mhfolder;
250 $target = '';
251
252 if (! -d $folder) {
253 foreach $component (split('/', $folder)) {
254 if (defined($component) and length($component) >= 1) {
255 $target = $target . '/' . $component;
256 if (-d $target or mkdir($target)) {
257 next;
258 } else {
259 err(&EX_TEMPFAIL,
260 "Failed to create +$mhfolder ($component)");
261 }
262 }
263 }
264 }
265 }
266
267 # We want to print the name of each list that has new mail only once,
268 # so use this hash to store the lists that have already been printed.
269 # Start the list out with SPAM already in it, since we don't care when
270 # new messages are added to it.
271 my %FOLDERS = ('SPAM'=>1);
272
273 sub store_message {
274 my $msg = shift;
275 my $mhfolder = shift;
276 my $mhmsg;
277 my $msgnum;
278 my $status;
279
280 # We must do this even in -n mode because later steps fail without
281 # it. This should be harmless.
282 mkfolder($mhfolder);
283
284 # XXX: Grab the 'mhpath new' algorithm and implement it internally.
285 $mhmsg = `mhpath +$mhfolder new`;
286 chomp($mhmsg);
287 $msgnum = basename($mhmsg);
288
289 logsave("+$mhfolder");
290
291 if ($run) {
292 if (not rename($msg, $mhmsg)) {
293 err(&EX_TEMPFAIL, "Failed rename($msg, $mhmsg)");
294 }
295
296 # XXX: Lame! Instead, keep a hash of arrays. Keys are folder
297 # names, array elements are message numbers in that list.
298 # Then after all the messages have been sorted, run one mark
299 # command per folder, marking all messages for that folder in
300 # one sweep.
301 if ($mhfolder ne 'SPAM') {
302 $status = system("mark +$mhfolder $msgnum -sequence unseen -add");
303 if (not (WIFEXITED($status) and WEXITSTATUS($status) == 0)) {
304 errx(&EX_TEMPFAIL, "Failed to mark message unseen");
305 }
306 }
307 }
308
309 if (not $FOLDERS{$mhfolder}) {
310 print("+$mhfolder\n");
311 $FOLDERS{$mhfolder} = 1;
312 }
313 }
314
315 sub get_headers {
316 my $msg = shift;
317 my %headers;
318 my $current; # current header, used for unfolding lines
319 my $fieldname; # unmolested header name
320
321 open(MSG, $msg);
322 while (<MSG>) {
323 chomp;
324 if (length == 0) {
325 last;
326 }
327
328 if (/^\s/) {
329 # folded header continuation
330
331 if (!defined($current)) {
332 print("Malformed message, cannot parse headers.\n");
333 return ();
334 }
335
336 $headers{$current} .= $_;
337 } else {
338 ($fieldname) = split(/:/);
339 $current = lc($fieldname);
340 (undef, $headers{$current}) = split(/^$fieldname:\s*/);
341 }
342 }
343 close(MSG);
344
345 return %headers;
346 }
347
348 \f
349 ###############################################################################
350 # Spam handling
351
352 sub is_spam {
353 my $msg = shift;
354 my $err;
355 my $sig;
356 my $status;
357
358 $err = $sig = $msg;
359 $err =~ s|/new/|/err/|;
360 $sig =~ s|/new/|/sig/|;
361 if (-f $err) {
362 my $line;
363 my $message;
364
365 open(ERR, $err);
366 $line = <ERR>;
367 close(ERR);
368
369 chomp($line);
370 ($message, $!) = split(/:/, $line);
371
372 logerr("$err: $message: $!");
373
374 if ($run) {
375 if (unlink($err) != 1) {
376 err(&EX_TEMPFAIL, "Failed unlink($err)");
377 }
378 }
379 } elsif (-f $sig) {
380 # This is supposed to be a signature created with razor-check
381 # directly after delivery. Currently this isn't supported
382 # because it isn't clear to me how to get that signature back
383 # into razor-check. For now, just unlink any sig files we
384 # find and proceed with full razor-check mode.
385
386 if ($run) {
387 if (unlink($sig) != 1) {
388 err(&EX_TEMPFAIL, "Failed unlink($sig)");
389 }
390 }
391 }
392
393 if ($run) {
394 $status = system("razor-check < $msg");
395 if (not WIFEXITED($status)) {
396 err(&EX_TEMPFAIL, "Failed to run razor-check < $msg");
397 } elsif (WEXITSTATUS($status) == 0) {
398 return 1;
399 } else {
400 return 0;
401 }
402 } else {
403 return 0;
404 }
405 }
406
407 sub kill_spam {
408 my @msglist = @_;
409 my @result;
410 my $msg;
411 my $i;
412
413 @result = ();
414
415 $i = 0;
416 foreach $msg (@msglist) {
417 printf('%sChecking for spam... %6d/%d',
418 "\r", ++$i, scalar(@msglist));
419 if (is_spam($msg)) {
420 print(" SPAM\n");
421 store_message($msg, 'SPAM');
422 } else {
423 push(@result, $msg);
424 }
425 }
426 print("\nDone: ", scalar(@result), " survivors\n");
427
428 return @result;
429 }
430
431 \f
432 ###############################################################################
433 # Filtering
434
435 sub find_mh_folder {
436 my $msg = shift;
437 my %headers;
438 my $header;
439 my $contents;
440 my $regex;
441 my $subst;
442 my $try;
443
444 %headers = get_headers($msg);
445 if (not %headers) {
446 return 'inbox';
447 }
448
449 log_headers(%headers);
450
451 foreach $header (keys(%FILTERS)) {
452 $contents = $headers{lc($header)};
453
454 if (defined($contents)) {
455 foreach $regex (keys(%{$FILTERS{$header}})) {
456 $subst = $FILTERS{$header}->{$regex};
457
458 $try = '$contents =~';
459 $try .= " s|$regex|$subst|is";
460 if (eval $try) {
461 return $contents;
462 }
463 }
464 }
465 }
466
467 return 'inbox';
468 }
469
470 sub filter_mail {
471 my @msglist = @_;
472 my $msg;
473 my $mhfolder;
474
475 foreach $msg (@msglist) {
476 $mhfolder = find_mh_folder($msg);
477 store_message($msg, $mhfolder);
478 }
479 }
480
481 \f
482 MAIN: {
483 my @msglist;
484
485 if ($dumpfilters) {
486 $Data::Dumper::Indent = 1;
487 print(Dumper(\%FILTERS));
488 exit(&EX_OK);
489 }
490
491 @msglist = kill_spam(glob("$MAILDIR/new/*"));
492
493 if (not $spamonly) {
494 filter_mail(@msglist);
495 }
496 }
497
498 __END__
499
500 =head1 EXAMPLES
501
502 %FILTERS =
503 ('List-Id:'=>
504 {'.*<([[:graph:]]+)-([[:graph:]]+)\.lists\.sourceforge\.net>.*'=>'l/$1/$2',
505 '.*<([[:graph:]]+?)(-list)?\.freedesktop\.org>.*'=>'l/freedesktop/$1',
506 '.*<swig\.cs\.uchicago\.edu>.*'=>'l/swig'},
507
508 'List-Post:'=>
509 {'<mailto:([[:graph:]]+)+@([[:graph:]]+)\.apache\.org>'=>'l/apache/$2/$1'},
510
511 'To:'=>
512 {'root'=>'root'});
513
514 The first List-Id filter is a surprisingly effective one which creates
515 appropriate folder names (such as l/htdig/updates and l/rox/devel) for
516 all Sourceforge lists to which i subscribe. Certainly there are lists
517 hosted at Sourceforge which do not work well with this, and this is
518 where it is important to remember that the first match is the one
519 uses. Simply put the more specific rules before this one.
520
521 The next List-Id example is simple. The swig example demonstrates
522 that the folder name does not have to use a portion of the matched
523 header; you can just hardcode the folder name.
524
525 The List-Post example is a nice one. Most ASF projects have their own
526 apache.org subdomain, with mailing lists hosted there. So, given a
527 list such as dev@httpd.apache.org, this filter will create the folder
528 name l/apache/httpd/dev.
529
530 =head1 AUTHORS
531
532 Written by Eric Gillespie <epg@pretzelnet.org> with logging code
533 stolen from Adam Lazur <adam@lazur.org>.
534
535 Design by Eric Gillespie and Doug Porter <dsp@waterspout.com>.
536
537 =cut