diplodocus.org Git - nmh/blob - docs/contrib/replyfilter

   1 #!/usr/bin/perl
   2 #
   3 # replyfilter - A reply filter for nmh
   4 #
   5 # The idea behind this program is that it will act as a format filter
   6 # for nmh.  It will try to extract out all text/plain parts and format
   7 # them if necessary using a filter program.
   8 #
   9 # To use this program, configure nmh in the following way (nmh 1.5 or later):
  10 #
  11 # - Put the path to this program in your .mh_profile under formatproc:
  12 #
  13 #   formatproc: replyfilter
  14 #
  15 # - Create a mhl reply filter that consists of the following line:
  16 #
  17 #   body:nocomponent,format,nowrap,formatarg="%(trim{content-type})%(putstr)",formatarg="%(trim{content-transfer-encoding})%(putstr)",formatarg=">"
  18 #
  19 #   To decode this a bit:
  20 #
  21 #   body        - Output the "body" component
  22 #   nocomponent - Don't output a component prefix (normally here we use a
  23 #                 component prefix of ">" as a quote character, but we're
  24 #                 going to have replyfilter do that).
  25 #   nowrap      - Don't wrap lines if they exceed the column width
  26 #   formatarg   - Arguments to fmtproc.  The first argument is the value of
  27 #                 the Content-type header; the second is the value of the
  28 #                 Content-Transfer-Encoding header.  The last "formatarg"
  29 #                 is used as your quoting prefix.  Replace it with whatever
  30 #                 you want.
  31 #
  32
  33 use Mail::Field;
  34 use MIME::Head;
  35 use MIME::QuotedPrint;
  36 use MIME::Base64;
  37 use Encode;
  38
  39 #
  40 # The program we use to format "long" text.  Should be capable of reading
  41 # from standard input and sending the formatted text to standard output.
  42 #
  43
  44 $filterprogram = 'par';
  45
  46 #
  47 # If the above filter program has problems with some input, use the following
  48 # regular expression to remove any problematic input.  In this example we
  49 # filter out the UTF-8 non-breaking space (U+00A0) because that makes par
  50 # mangle the output.  Uncomment this if this ends up being a problem for
  51 # you, or feel free to add others.
  52 #
  53
  54 #%filterreplace = ( "\N{U+a0}" => " " );
  55
  56 #
  57 # Our output character set.  This script assumes a UTF-8 locale, but if you
  58 # want to run under a different locale the change it here.
  59 #
  60
  61 $outcharset = 'utf-8';
  62
  63 #
  64 # Maximum column width (used by the HTML converter and to decide if we need
  65 # to invoke the filter program
  66 #
  67
  68 $maxcolwidth = 78;
  69
  70 #
  71 # Out HTML converter program & arguments
  72 #
  73
  74 @htmlconv = ('w3m', '-dump', '-cols', $maxcolwidth - 2, '-T', 'text/html',
  75              '-O', $outcharset);
  76
  77
  78 die "Usage: $0 Content-type content-transfer-encoding quote-prefix\n"
  79                                 if $#ARGV != 2;
  80
  81 if ($ARGV[0] ne "") {
  82         my $ctype = Mail::Field->new('Content-Type', $ARGV[0]);
  83         $content_type =  $ctype->type;
  84         $charset = $ctype->charset;
  85         $boundary = $ctype->boundary;
  86 } else {
  87         $content_type = 'text/plain';
  88         $charset = 'us-ascii';
  89 }
  90
  91 $encoding = $ARGV[1] eq "" ? '7bit' : lc($ARGV[1]);
  92 $quoteprefix = $ARGV[2];
  93
  94 #
  95 # Set up our output to be in our character set
  96 #
  97
  98 binmode(STDOUT, ":encoding($outcharset)");
  99
 100 #
 101 # The simplest case: if we have a single type of text/plain, send it
 102 # to our format subroutine.
 103 #
 104
 105 if ($content_type eq 'text/plain') {
 106         process_text(\*STDIN, $encoding, $charset);
 107         exit 0;
 108 }
 109
 110 #
 111 # Alright, here's what we need to do.
 112 #
 113 # Find any text/plain parts and decode them.  Decode them via base64 or
 114 # quoted-printable, and feed them to our formatting filter when appropriate.
 115 # Put markers in the output for other content types.
 116 #
 117
 118 ($type) = (split('/', $content_type));
 119
 120 if ($type eq 'multipart') {
 121
 122         #
 123         # For multipart messages we have to do a little extra.
 124         # Eat the MIME prologue (everything up until the first boundary)
 125         #
 126
 127         if (! defined $boundary || $boundary eq '') {
 128                 print "No boundary in Content-Type header!\n";
 129                 eat_part(\*STDIN);
 130                 exit 1;
 131         }
 132
 133         while (<STDIN>) {
 134                 last if match_boundary($_, $boundary);
 135         }
 136
 137         if (eof(STDIN)) {
 138                 print "Unable to find boundary in message\n";
 139                 exit 1;
 140         }
 141 } else {
 142         undef $boundary;
 143 }
 144
 145 process_part(\*STDIN, $content_type, $encoding, $charset, $boundary);
 146
 147 if ($boundary) {
 148         #
 149         # Eat the MIME epilog
 150         #
 151         eat_part(\*STDIN);
 152 }
 153
 154 exit 0;
 155
 156 #
 157 # Handled encoded text.  I think we can assume if the encoding is q-p
 158 # or base64 to feed it into a formatting filter.
 159 #
 160
 161 sub process_text (*$$;$)
 162 {
 163         my ($input, $encoding, $charset, $boundary) = @_;
 164         my $text, $filterpid, $prefixpid, $finread, $finwrite;
 165         my $foutread, $foutwrite, $decoder, $ret, $filterflag;
 166         my $text, $maxline = 0;
 167
 168         #
 169         # In the simple case, just spit out the text prefixed by the
 170         # quote character
 171         #
 172
 173         if ($encoding eq '7bit' || $encoding eq '8bit') {
 174                 #
 175                 # Switch the character set to whatever is specified by
 176                 # the MIME message
 177                 #
 178                 binmode($input, ":encoding($charset)");
 179                 while (<$input>) {
 180                         $ret = match_boundary($_, $boundary);
 181                         if (defined $ret) {
 182                                 binmode($input, ':encoding(us-ascii)');
 183                                 return $ret;
 184                         }
 185                         print $quoteprefix, $_;
 186                 }
 187                 return 'EOF';
 188         } else {
 189                 #
 190                 # If we've got some other encoding, the input text is almost
 191                 # certainly US-ASCII
 192                 #
 193
 194                 binmode($input, ':encoding(us-ascii)');
 195
 196                 $decoder = find_decoder(lc($encoding));
 197                 if (! defined $decoder) {
 198                         return 'EOF';
 199                 }
 200         }
 201
 202         #
 203         # Okay, assume that the encoding will make it so that we MIGHT need
 204         # to filter it.  Read it in; if the lines are too long, filter it
 205         #
 206
 207         my $chardecode = find_encoding($charset);
 208
 209         while (<$input>) {
 210                 my @lines, $len;
 211
 212                 last if ($ret = match_boundary($_, $boundary));
 213
 214                 $text .= $_;
 215
 216         }
 217
 218         binmode($input, ':encoding(us-ascii)');
 219
 220         $text = $chardecode->decode(&$decoder($text));
 221
 222         grep {
 223                 my $len;
 224                 if (($len = length) > $maxline) {
 225                         $maxline = $len;
 226                 }} split(/^/, $text);
 227
 228         if (! defined $ret) {
 229                 $ret = 'EOF';
 230         }
 231
 232         if ($maxline <= $maxcolwidth) {
 233                 #
 234                 # These are short enough; just output it now as-is
 235                 #
 236                 foreach my $line (split(/^/, $text)) {
 237                         print STDOUT $quoteprefix, $line;
 238                 }
 239                 return $ret;
 240         }
 241
 242         #
 243         # We fork a copy of ourselves to read the output from the filter
 244         # program and prefix the quote character.
 245         #
 246
 247         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 248         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 249
 250         binmode($finread, ":encoding($outcharset)");
 251         binmode($finwrite, ":encoding($outcharset)");
 252         binmode($foutread, ":encoding($outcharset)");
 253         binmode($foutwrite, ":encoding($outcharset)");
 254
 255         if ($filterpid = fork) {
 256                 #
 257                 # Close the pipes in the parent that we're not using
 258                 #
 259
 260                 close($finread);
 261                 close($foutwrite);
 262         } elsif (defined $filterpid) {
 263                 #
 264                 # Close our ununsed filehandles
 265                 #
 266
 267                 close($finwrite);
 268                 close($foutread);
 269
 270                 #
 271                 # Dup() down the filehandles to standard input and output
 272                 #
 273
 274                 open(STDIN, "<&", $finread) ||
 275                                         die "dup(filterin) failed: $!\n";
 276                 open(STDOUT, ">&", $foutwrite) ||
 277                                         die "dup(filterout) failed: $!\n";
 278
 279                 #
 280                 # Close our copies.
 281                 #
 282
 283                 close($finread);
 284                 close($foutwrite);
 285
 286                 #
 287                 # Exec our filter
 288                 #
 289
 290                 exec $filterprogram ||
 291                                 die "Unable to exec $filterprogram: $!\n";
 292         } else {
 293                 die "Fork for $filterprogram failed: $!\n";
 294         }
 295
 296         #
 297         # Fork our output handler.
 298         #
 299
 300         if ($prefixpid = fork) {
 301                 #
 302                 # We don't need these anymore
 303                 #
 304                 close($foutread);
 305
 306         } elsif (defined $prefixpid) {
 307                 #
 308                 # Read from foutwrite, and output (with prefix) to stdout
 309                 #
 310
 311                 close($finwrite);
 312
 313                 while (<$foutread>) {
 314                         print STDOUT $quoteprefix, $_;
 315                 }
 316
 317                 exit 0;
 318         }
 319
 320         #
 321         # Send our input to the filter program
 322         #
 323
 324         if (%filterreplace) {
 325                 foreach my $match (keys %filterreplace) {
 326                          $text =~ s/$match/$filterreplace{$match}/g;
 327                 }
 328         }
 329
 330         print $finwrite $text;
 331
 332         close($finwrite);
 333         waitpid $filterpid, 0;
 334         warn "Filter process exited with ", ($? >> 8), "\n" if $?;
 335         waitpid $prefixpid, 0;
 336         warn "Pipe reader process exited with ", ($? >> 8), "\n" if $?;
 337
 338         return $ret;
 339 }
 340
 341 #
 342 # Filter HTML through a converter program
 343 #
 344
 345 sub process_html (*$$;$)
 346 {
 347         my ($input, $encoding, $charset, $boundary) = @_;
 348         my $filterpid, $prefixpid, $finread, $finwrite;
 349         my $foutread, $foutwrite, $decoder, $ret;
 350
 351         if (! defined($decoder = find_decoder(lc($encoding)))) {
 352                 return 'EOF';
 353         }
 354
 355         #
 356         # We fork a copy of ourselves to read the output from the filter
 357         # program and prefix the quote character.
 358         #
 359
 360         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 361         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 362
 363         binmode($finread, ":encoding($outcharset)");
 364         binmode($finread, ":encoding($outcharset)");
 365         binmode($foutread, ":encoding($outcharset)");
 366         binmode($foutwrite, ":encoding($outcharset)");
 367
 368         if ($filterpid = fork) {
 369                 #
 370                 # Close the pipes in the parent that we're not using
 371                 #
 372
 373                 close($finread);
 374                 close($foutwrite);
 375         } elsif (defined $filterpid) {
 376                 #
 377                 # Close our ununsed filehandles
 378                 #
 379
 380                 close($finwrite);
 381                 close($foutread);
 382
 383                 #
 384                 # Dup() down the filehandles to standard input and output
 385                 #
 386
 387                 open(STDIN, "<&", $finread) ||
 388                                         die "dup(filterin) failed: $!\n";
 389                 open(STDOUT, ">&", $foutwrite) ||
 390                                         die "dup(filterout) failed: $!\n";
 391
 392                 #
 393                 # Close our copies.
 394                 #
 395
 396                 close($finread);
 397                 close($foutwrite);
 398
 399                 #
 400                 # Exec our converter
 401                 #
 402
 403                 exec (@htmlconv) ||
 404                                 die "Unable to exec $filterprogram: $!\n";
 405         } else {
 406                 die "Fork for $htmlconv[0] failed: $!\n";
 407         }
 408
 409         #
 410         # Fork our output handler.
 411         #
 412
 413         if ($prefixpid = fork) {
 414                 #
 415                 # We don't need these anymore
 416                 #
 417                 close($foutread);
 418
 419         } elsif (defined $prefixpid) {
 420                 #
 421                 # Read from foutwrite, and output (with prefix) to stdout
 422                 #
 423
 424                 close($finwrite);
 425
 426                 while (<$foutread>) {
 427                         print STDOUT $quoteprefix, $_;
 428                 }
 429
 430                 exit 0;
 431         }
 432
 433         #
 434         # Send our input to the filter program
 435         #
 436
 437         while (<$input>) {
 438                 last if ($ret = match_boundary($_, $boundary));
 439                 print $finwrite (&$decoder($_));
 440         }
 441
 442         if (! defined $ret) {
 443                 $ret = 'EOF';
 444         }
 445
 446         close($finwrite);
 447         waitpid $filterpid, 0;
 448         warn "HTML converter process exited with ", scalar($? >> 8), "\n" if $?;
 449         waitpid $prefixpid, 0;
 450         warn "Pipe reader process exited with ", $? >> 8, "\n" if $?;
 451
 452         return $ret;
 453 }
 454
 455 #
 456 # Decide what to do, based on what kind of content it is.
 457 #
 458
 459 sub process_part (*$$$$;$)
 460 {
 461         my ($input, $content_type, $encoding, $charset, $boundary, $name) = @_;
 462         my ($type, $subtype) = (split('/', $content_type, -1), '');
 463
 464         if ($type eq 'text') {
 465                 #
 466                 # If this is a text part, right now we only deal with
 467                 # plain and HTML parts.
 468                 #
 469                 if ($subtype eq 'plain') {
 470                         return process_text($input, $encoding, $charset,
 471                                             $boundary);
 472                 } elsif ($subtype eq 'html') {
 473                         return process_html($input, $encoding, $charset,
 474                                             $boundary);
 475                 } else {
 476                         print ">>> $content_type content\n";
 477                         return eat_part($input, $boundary);
 478                 }
 479         } elsif ($type eq 'multipart') {
 480                 return process_multipart($input, $subtype, $boundary);
 481         } else {
 482                 #
 483                 # Other types we're not sure what to do with right now
 484                 # Just put a marker in there
 485                 #
 486
 487                 print ">>> $content_type attachment";
 488                 if (defined $name) {
 489                         print ", name=$name";
 490                 }
 491                 print "\n";
 492
 493                 return eat_part($input, $boundary);
 494         }
 495 }
 496
 497 #
 498 # Process a multipart message.
 499 #
 500 # When called, we should be right after the beginning of the first
 501 # boundary marker.  So we should be pointed at header lines which describe
 502 # the content of this part
 503 #
 504
 505 sub process_multipart ($$$)
 506 {
 507         my ($input, $subtype, $boundary) = @_;
 508         my $altout;
 509
 510         while (1) {
 511                 my $encoding, $type, $end, $name, $charset;
 512
 513                 #
 514                 # Use the Mail::Header package to read in any headers
 515                 # corresponding to this part
 516                 #
 517
 518                 my $head = Mail::Header->new($input, (MailFrom => 'IGNORE'));
 519
 520                 #
 521                 # Extract out any Content-Type, Content-Transfer-Encoding, and
 522                 # Content-Disposition headers
 523                 #
 524
 525                 my $ctype = Mail::Field->extract('Content-Type', $head);
 526                 my $cte = Mail::Field->extract('Content-Transfer-Encoding',
 527                                                $head);
 528                 my $cdispo = Mail::Field->extract('Content-Disposition', $head);
 529
 530                 if (defined $ctype) {
 531                         $type = $ctype->type;
 532                         $charset = $ctype->charset;
 533                 } else {
 534                         $type = 'text/plain';
 535                         $charset = 'us-ascii';
 536                 }
 537
 538                 $encoding = defined $cte ? lc($cte->param('_')) : '7bit';
 539                 $name = defined $cdispo ? $cdispo->param('filename') : undef;
 540
 541                 #
 542                 # Special handling for multipart/alternative; pick
 543                 # the "first" one we can handle (which is usually
 544                 # text/plain) and silently eat the rest, but output a
 545                 # warning if we can't handle anything.
 546                 #
 547
 548                 if ($altout) {
 549                         $end = eat_part($input, $boundary);
 550                 } else {
 551                         my $subboundary = $boundary;
 552                         my $maintype = (split('/', $type))[0];
 553
 554                         if ($maintype eq 'multipart') {
 555                                 $subboundary = $ctype->boundary;
 556                                 #
 557                                 # Go until we find our beginning of this
 558                                 # part
 559                                 #
 560                                 my $subend = eat_part($input, $subboundary);
 561
 562                                 if ($subend ne 'EOP') {
 563                                         print ">>> WARNING: malformed ",
 564                                                 "nested multipart\n";
 565                                         return $subend;
 566                                 }
 567                         }
 568
 569                         $end = process_part($input, $type, $encoding,
 570                                             $charset, $subboundary, $name);
 571
 572                         if ($subtype eq 'alternative' && ! defined $altout &&
 573                             $type eq 'text/plain') {
 574                                 $altout = 1;
 575                         }
 576
 577                         #
 578                         # Since we changed the semantics of $boundary
 579                         # above for nested multiparts, if we are
 580                         # handling a nested multipart then find the end
 581                         # of our current part
 582                         #
 583
 584                         if ($maintype eq 'multipart') {
 585                                 $end = eat_part($input, $boundary);
 586                         }
 587
 588                 }
 589
 590                 if ($end eq 'EOM' || $end eq 'EOF') {
 591                         if ($subtype eq 'alternative' && !defined $altout) {
 592                                 print ">>>multipart/alternative: no suitable ",
 593                                         "parts\n";
 594                         }
 595                         return $end;
 596                 }
 597         }
 598 }
 599
 600 #
 601 # "Eat" a MIME part; consume content until we hit the boundary or EOF
 602 #
 603
 604 sub eat_part ($$)
 605 {
 606         my ($input, $boundary) = @_;
 607         my $ret;
 608
 609         #
 610         # If we weren't given a boundary just eat input until EOF
 611         #
 612
 613         if (! defined $boundary) {
 614                 while (<$input>) { }
 615                 return 'EOF';
 616         }
 617
 618         #
 619         # Otherwise, consume data until we hit our boundary
 620         #
 621
 622         while (<$input>) {
 623                 if ($ret = match_boundary($_, $boundary)) {
 624                         return $ret;
 625                 }
 626         }
 627
 628         return 'EOF';
 629 }
 630
 631 #
 632 # Return the decoder subroutine to use
 633 #
 634
 635 sub find_decoder ($)
 636 {
 637         my ($encoding) = @_;
 638
 639         if ($encoding eq '7bit' || $encoding eq '8bit') {
 640                 return \&null_decoder;
 641         } elsif ($encoding eq 'base64') {
 642                 return \&decode_base64;
 643         } elsif ($encoding eq 'quoted-printable') {
 644                 return \&decode_qp;
 645         } else {
 646                 warn "Unknown encoding: $encoding\n";
 647                 return undef;
 648         }
 649 }
 650
 651 sub null_decoder ($)
 652 {
 653         my ($input) = @_;
 654
 655         return $input;
 656 }
 657
 658 #
 659 # Match a line against the boundary string
 660 #
 661
 662 sub match_boundary($$)
 663 {
 664         my ($line, $boundary) = @_;
 665
 666         return if ! defined $boundary;
 667
 668         if (substr($line, 0, 2) eq '--') {
 669                 $line =~ s/[ \t\r\n]+\Z//;
 670                 if ($line eq "--$boundary") {
 671                         return 'EOP';
 672                 } elsif ($line eq "--$boundary--") {
 673                         return 'EOM';
 674                 }
 675         }
 676
 677         return undef;
 678 }