From: epg@google.com <> Date: Wed, 1 Apr 2009 23:40:27 +0000 (-0700) Subject: Implement custom scan of SPAM messages instead of using scan(1). X-Git-Url: https://diplodocus.org/git/minc/commitdiff_plain/6d0449f39ed28ad05afe9806afc664d6bc204fc6?ds=inline;hp=fbeff8e387d16985ccfbacd67f013703a32301f0 Implement custom scan of SPAM messages instead of using scan(1). If the header of a message contains garbage characters (not 2047-quoted garbage characters, actual multi-byte garbage!) (Chinese or Russian spam) scan(1) will happily print them to the tty, which hurts. (%SPAM): Replace @SPAM global list with a hash. (store_message): Drop unused $status variable. Don't update @SPAM. (scan_line): Factor out the actual scan line formatting from filter_mail to this new function. (filter_mail): Append message number and header hash to %SPAM. (maildir_spam): Append message number and spam maildir file name to %SPAM. (scan_spam): Add function to scan spam messages with scan_line. Deal with 3 cases: spam filtered via filter_mail and spam filtered via maildir_spam in either run or -n mode. For the latter two, open the message file from the mh SPAM folder or from the spam maildir, respectively, to load the header. --- diff --git a/minc b/minc index eec56e4..9b6f791 100755 --- a/minc +++ b/minc @@ -54,9 +54,13 @@ our $VERSION = 1; my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)'; my $MAGIC_TO_TOKEN = ' TO'; -# List of SPAM message numbers, scanned at the end so the user can -# check for false positives. -my @SPAM; +# XXX re-document + +# Mapping of message numbers to array references. The first element is set by +# filter_mail to a reference to a header hash for the message; the second is +# set by maildir_spam to the name of the message file in the spam maildir. +# scan_spam scans this at the end so the user can check for false positives. 
+my %SPAM; =head1 OPTIONS @@ -417,7 +421,6 @@ sub store_message { my $msgnum; my $try; my $mhmsg; - my $status; # We must do this even in -n mode because later steps fail without # it. This should be harmless. @@ -482,9 +485,7 @@ sub store_message { # fails. While it is slow, it is not safe to store multiple # messages and then have a failure before marking some (or # all). - if ($mhfolder eq 'SPAM') { - push(@SPAM, $msgnum); - } else { + if ($mhfolder ne 'SPAM') { mark($mhfolder, $msgnum, 'unseen'); } } @@ -606,6 +607,23 @@ sub find_mh_folder { return 'inbox'; } +sub scan_line { + my ($headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns) = @_; + my $from = ''; + my $subject = ''; + # Sometimes these headers are missing... + eval { $from = [@{$headers->{'from'}}]->[-1] }; + eval { $subject = [@{$headers->{'subject'}}]->[-1] }; + # Replace garbage characters. + for ($from, $subject) { + tr/\x00-\x1f\x80-\xff/?/; + } + return sprintf("\%-${nf}s \%${nm}d \%-${nF}s \%s", + substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm), + substr($from, 0, $nF), + substr($subject, 0, $ns)); +} + sub filter_mail { @_ or return (); my $msgcount = @_ - 2; # don't count . and .. @@ -665,20 +683,11 @@ sub filter_mail { if ($mhfolder eq 'SPAM') { $spam++; + $SPAM{$msgnum} = [\%headers, undef]; } else { $saved++; - my $from = ''; - my $subject = ''; - # Sometimes these headers are missing... - eval { $from = [@{$headers{'from'}}]->[-1] }; - eval { $subject = [@{$headers{'subject'}}]->[-1] }; - for ($from, $subject) { - tr/\x00-\x1f\x80-\xff/ /; - } - printf("\%-${nf}s \%${nm}d \%-${nF}s \%s\n", - substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm), - substr($from, 0, $nF), - substr($subject, 0, $ns)); + print(scan_line(\%headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns), + "\n"); } for my $hook (@post_store_hooks) { @@ -783,7 +792,50 @@ sub maildir_spam { for my $msg (@spams) { ($msg eq '.' 
or $msg eq '..') and next; - store_message($msg, 'SPAM'); + my $msgnum = store_message($msg, 'SPAM'); + # Store the original file name for scan_spam in -n mode. + $SPAM{$msgnum} = [undef, $msg]; + } +} + +sub scan_spam { + my ($msgnum, %header, $tuple, $msg); + + # Unlike filter_mail, we don't need to print the folder name. + # Calculate how many columns would be allocated to it... + my $nf = int($COLUMNS * $SCAN_P_FOLDER); + # ...and add that amount to COLUMNS to calculate the number of columns to + # allocate to msgnum and from snippet, thus filling the line without + # printing the folder name. + my $nm = int(($COLUMNS + $nf) * $SCAN_P_MESSAGE); + my $nF = int(($COLUMNS + $nf) * $SCAN_P_FROM); + my $ns = $COLUMNS - $nm - $nF - 3; + + for $msgnum (sort(keys(%SPAM))) { + $tuple = $SPAM{$msgnum}; + if (defined($tuple->[0])) { + # Filed by filter_mail, so we have the header. + %header = %{$tuple->[0]}; + } elsif (defined($tuple->[1])) { + # Filed by maildir_spam, so we don't have the header. + if ($run) { + # The message has been filed, load it from $mh. + $msg = "$mh/SPAM/$msgnum"; + } else { + # The message has not been filed, load it from the maildir. + # $tuple->[1] is just a basename, not a path; this works + # because maildir_spam did chdir(Maildir/spam/new). + $msg = $tuple->[1]; + } + %header = get_headers($msg); + } else { + print(STDERR + "BUG: corrupt SPAM tuple, neither element defined", + " for message $msgnum\n"); + next; + } + print(scan_line(\%header, '', $msgnum, 0, $nm, $nF, $ns), + "\n"); } } @@ -814,8 +866,7 @@ MAIN: { $run and %folders and update_dot_folders(\%folders); maildir_spam(); - - @SPAM and (exec('scan', '+SPAM', @SPAM) or die); + scan_spam(); }