Implement custom scan of SPAM messages instead of using scan(1).

author epg@google.com <>

Wed, 1 Apr 2009 23:40:27 +0000 (16:40 -0700)

committer epg@google.com <>

Wed, 1 Apr 2009 23:40:27 +0000 (16:40 -0700)
author epg@google.com <>
Wed, 1 Apr 2009 23:40:27 +0000 (16:40 -0700)
committer epg@google.com <>
Wed, 1 Apr 2009 23:40:27 +0000 (16:40 -0700)
diff --git a/minc b/minc

index eec56e447881dc8d72b843c32674313290536cd6..9b6f7915f6d3b1af08c8a137f86d1f7ac5443fea 100755 (executable)
--- a/minc
+++ b/minc
@@ -54,9 +54,13 @@ our $VERSION = 1;
  my $MAGIC_TO_REGEX = '^((Original-)?(Resent-)?(To|Cc|Bcc)|(X-Envelope |Apparently(-Resent)?)-To)';
  my $MAGIC_TO_TOKEN = ' TO';
  
-# List of SPAM message numbers, scanned at the end so the user can
-# check for false positives.
-my @SPAM;
+# XXX re-document
+
+# Mapping of message numbers to array references.  The first element is set by
+# filter_mail to a reference to a header hash for the message; the second is
+# set by maildir_spam to the name of the message file in the spam maildir.
+# scan_spam scans this at the end so the user can check for false positives.
+my %SPAM;
  
  =head1 OPTIONS
  
@@ -417,7 +421,6 @@ sub store_message {
      my $msgnum;
      my $try;
      my $mhmsg;
-    my $status;
  
      # We must do this even in -n mode because later steps fail without
      # it.  This should be harmless.
@@ -482,9 +485,7 @@ sub store_message {
          # fails.  While it is slow, it is not safe to store multiple
          # messages and then have a failure before marking some (or
          # all).
-        if ($mhfolder eq 'SPAM') {
-            push(@SPAM, $msgnum);
-        } else {
+        if ($mhfolder ne 'SPAM') {
              mark($mhfolder, $msgnum, 'unseen');
          }
      }
@@ -606,6 +607,23 @@ sub find_mh_folder {
      return 'inbox';
  }
  
+# Format one scan(1)-style summary line for a message.
+#
+#   $headers  - reference to a hash of headers; the 'from' and 'subject'
+#               entries, when present, are array references and the last
+#               element of each is the value displayed.
+#   $mhfolder - folder name, left-justified and truncated to $nf columns.
+#   $msgnum   - message number, right-justified in $nm columns.
+#   $nF, $ns  - maximum number of characters taken from the From and
+#               Subject values; From is also padded out to $nF columns.
+#
+# Returns the formatted line without a trailing newline.
+sub scan_line {
+    my ($headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns) = @_;
+    # Sometimes these headers are missing, or present with no values.
+    # Look them up explicitly so the result is always a defined string;
+    # trapping the failure with eval still left undef behind when the
+    # value list was empty, which produced warnings further down.
+    my $last_value = sub {
+        my ($name) = @_;
+        my $values = ref($headers) eq 'HASH' ? $headers->{$name} : undef;
+        return '' unless ref($values) eq 'ARRAY' && defined($values->[-1]);
+        return $values->[-1];
+    };
+    my $from = $last_value->('from');
+    my $subject = $last_value->('subject');
+    # Replace garbage characters (control and 8-bit bytes) so they cannot
+    # disturb the terminal or the column layout.
+    for ($from, $subject) {
+        tr/\x00-\x1f\x80-\xff/?/;
+    }
+    return sprintf("\%-${nf}s \%${nm}d \%-${nF}s \%s",
+                   substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm),
+                   substr($from, 0, $nF),
+                   substr($subject, 0, $ns));
+}
+
  sub filter_mail {
      @_ or return ();
      my $msgcount = @_ - 2; # don't count . and ..
@@ -665,20 +683,11 @@ sub filter_mail {
  
          if ($mhfolder eq 'SPAM') {
              $spam++;
+            $SPAM{$msgnum} = [\%headers, undef];
          } else {
              $saved++;
-            my $from = '';
-            my $subject = '';
-            # Sometimes these headers are missing...
-            eval { $from = [@{$headers{'from'}}]->[-1] };
-            eval { $subject = [@{$headers{'subject'}}]->[-1] };
-            for ($from, $subject) {
-                tr/\x00-\x1f\x80-\xff/ /;
-            }
-            printf("\%-${nf}s \%${nm}d \%-${nF}s \%s\n",
-                   substr($mhfolder, 0, $nf), substr($msgnum, 0, $nm),
-                   substr($from, 0, $nF),
-                   substr($subject, 0, $ns));
+            print(scan_line(\%headers, $mhfolder, $msgnum, $nf, $nm, $nF, $ns),
+                  "\n");
          }
  
          for my $hook (@post_store_hooks) {
@@ -783,7 +792,50 @@ sub maildir_spam {
  
      for my $msg (@spams) {
          ($msg eq '.' or $msg eq '..') and next;
-        store_message($msg, 'SPAM');
+        my $msgnum = store_message($msg, 'SPAM');
+        # Store the original file name for scan_spam in -n mode.
+        $SPAM{$msgnum} = [undef, $msg];
+    }
+}
+
+# Print a scan(1)-style summary line on STDOUT for every message recorded
+# in %SPAM, in ascending message-number order, so the user can check for
+# false positives.  Takes no arguments.  Entries whose tuple has neither
+# element defined are reported on STDERR and skipped.
+sub scan_spam {
+    # Unlike filter_mail, we don't need to print the folder name.
+    # Calculate how many columns would be allocated to it...
+    my $nf = int($COLUMNS * $SCAN_P_FOLDER);
+    # ...and add that amount to COLUMNS to calculate the number of columns to
+    # allocate to msgnum and from snippet, thus filling the line without
+    # printing the folder name.
+    my $nm = int(($COLUMNS + $nf) * $SCAN_P_MESSAGE);
+    my $nF = int(($COLUMNS + $nf) * $SCAN_P_FROM);
+    # The subject gets whatever is left once the three single-space
+    # separators in scan_line's format are accounted for.
+    my $ns = $COLUMNS - $nm - $nF - 3;
+
+    # Message numbers are integers; compare them numerically so that
+    # message 9 is listed before message 10 (a plain sort is lexical).
+    for my $msgnum (sort { $a <=> $b } keys(%SPAM)) {
+        my $tuple = $SPAM{$msgnum};
+        my %header;
+        if (defined($tuple->[0])) {
+            # Filed by filter_mail, so we have the header.
+            %header = %{$tuple->[0]};
+        } elsif (defined($tuple->[1])) {
+            # Filed by maildir_spam, so we don't have the header.
+            my $msg;
+            if ($run) {
+                # The message has been filed, load it from $mh.
+                $msg = "$mh/SPAM/$msgnum";
+            } else {
+                # The message has not been filed, load it from the maildir.
+                # $tuple->[1] is just a basename, not a path; this works
+                # because maildir_spam did chdir(Maildir/spam/new).
+                $msg = $tuple->[1];
+            }
+            %header = get_headers($msg);
+        } else {
+            print(STDERR
+                  "BUG: corrupt SPAM tuple, neither element defined",
+                  " for message $msgnum\n");
+            next;
+        }
+        # Empty folder name with a zero-width folder column: only the
+        # message number, sender and subject are shown.
+        print(scan_line(\%header, '', $msgnum, 0, $nm, $nF, $ns),
+              "\n");
      }
  }
  
@@ -814,8 +866,7 @@ MAIN: {
      $run and %folders and update_dot_folders(\%folders);
  
      maildir_spam();
-
-    @SPAM and (exec('scan', '+SPAM', @SPAM) or die);
+    scan_spam();
  }
  
  \f
author	epg@google.com <>
	Wed, 1 Apr 2009 23:40:27 +0000 (16:40 -0700)
committer	epg@google.com <>
	Wed, 1 Apr 2009 23:40:27 +0000 (16:40 -0700)