diplodocus.org Git - nmh/blob - docs/contrib/replyfilter

   1 #!/usr/bin/perl
   2 #
   3 # replyfilter - A reply filter for nmh
   4 #
   5 # The idea behind this program is that it will act as a format filter
   6 # for nmh.  It will try to extract out all text/plain parts and format
   7 # them if necessary using a filter program.
   8 #
   9 # To use this program, configure nmh in the following way (nmh 1.5 or later):
  10 #
  11 # - Put the path to this program in your .mh_profile under formatproc:
  12 #
  13 #   formatproc: replyfilter
  14 #
  15 # - Create a mhl reply filter that consists of the following line:
  16 #
  17 #   body:nocomponent,format,nowrap,formatarg="%(trim{content-type})%(putstr)",formatarg="%(trim{content-transfer-encoding})%(putstr)",formatarg=">"
  18 #
  19 #   To decode this a bit:
  20 #
  21 #   body        - Output the "body" component
  22 #   nocomponent - Don't output a component prefix (normally here we use a
  23 #                 component prefix of ">" as a quote character, but we're
  24 #                 going to have replyfilter do that).
  25 #   nowrap      - Don't wrap lines if they exceed the column width
  26 #   formatarg   - Arguments to fmtproc.  The first argument is the value of
  27 #                 the Content-type header; the second is the value of the
  28 #                 Content-Transfer-Encoding header.  The last "formatarg"
  29 #                 is used as your quoting prefix.  Replace it with whatever
  30 #                 you want.
  31 #
  32
  33 use Mail::Field;
  34 use MIME::Head;
  35 use MIME::QuotedPrint;
  36 use MIME::Base64;
  37 use Encode;
  38
  39 #
  40 # The program we use to format "long" text.  Should be capable of reading
  41 # from standard input and sending the formatted text to standard output.
  42 #
  43
  44 $filterprogram = 'par';
  45
  46 #
  47 # If the above filter program has problems with some input, use the following
  48 # regular expression to remove any problematic input.  In this example we
  49 # filter out the UTF-8 non-breaking space (U+00A0) because that makes par
  50 # mangle the output.  Uncomment this if this ends up being a problem for
  51 # you, or feel free to add others.
  52 #
  53
  54 #%filterreplace = ( "\N{U+a0}" => " " );
  55
  56 #
  57 # Our output character set.  This script assumes a UTF-8 locale, but if you
  58 # want to run under a different locale the change it here.
  59 #
  60
  61 $outcharset = 'utf-8';
  62
  63 #
  64 # Maximum column width (used by the HTML converter and to decide if we need
  65 # to invoke the filter program
  66 #
  67
  68 $maxcolwidth = 78;
  69
  70 #
  71 # Out HTML converter program & arguments. charset will be appended
  72 #
  73
  74 @htmlconv = ('w3m', '-dump', '-cols', $maxcolwidth - 2, '-T', 'text/html',
  75              '-O', $outcharset, '-I');
  76
  77
  78 die "Usage: $0 Content-type content-transfer-encoding quote-prefix\n"
  79                                 if $#ARGV != 2;
  80
  81 if ($ARGV[0] ne "") {
  82         my $ctype = Mail::Field->new('Content-Type', $ARGV[0]);
  83         $content_type =  $ctype->type;
  84         $charset = $ctype->charset;
  85         $boundary = $ctype->boundary;
  86 } else {
  87         $content_type = 'text/plain';
  88         $charset = 'us-ascii';
  89 }
  90
  91 $encoding = $ARGV[1] eq "" ? '7bit' : lc($ARGV[1]);
  92 $quoteprefix = $ARGV[2];
  93
  94 #
  95 # Set up our output to be in our character set
  96 #
  97
  98 binmode(STDOUT, ":encoding($outcharset)");
  99
 100 #
 101 # The simplest case: if we have a single type of text/plain, send it
 102 # to our format subroutine.
 103 #
 104
 105 if ($content_type eq 'text/plain') {
 106         process_text(\*STDIN, $encoding, $charset);
 107         exit 0;
 108 }
 109
 110 #
 111 # Alright, here's what we need to do.
 112 #
 113 # Find any text/plain parts and decode them.  Decode them via base64 or
 114 # quoted-printable, and feed them to our formatting filter when appropriate.
 115 # Put markers in the output for other content types.
 116 #
 117
 118 ($type) = (split('/', $content_type));
 119
 120 if ($type eq 'multipart') {
 121
 122         #
 123         # For multipart messages we have to do a little extra.
 124         # Eat the MIME prologue (everything up until the first boundary)
 125         #
 126
 127         if (! defined $boundary || $boundary eq '') {
 128                 print "No boundary in Content-Type header!\n";
 129                 eat_part(\*STDIN);
 130                 exit 1;
 131         }
 132
 133         while (<STDIN>) {
 134                 last if match_boundary($_, $boundary);
 135         }
 136
 137         if (eof(STDIN)) {
 138                 print "Unable to find boundary in message\n";
 139                 exit 1;
 140         }
 141 } else {
 142         undef $boundary;
 143 }
 144
 145 process_part(\*STDIN, $content_type, $encoding, $charset, $boundary);
 146
 147 if ($boundary) {
 148         #
 149         # Eat the MIME epilog
 150         #
 151         eat_part(\*STDIN);
 152 }
 153
 154 exit 0;
 155
 156 #
  157 # Handle encoded text.  I think we can assume that if the encoding is
  158 # q-p or base64 we should feed it into a formatting filter.
 159 #
 160
 161 sub process_text (*$$;$)
 162 {
 163         my ($input, $encoding, $charset, $boundary) = @_;
 164         my $text, $filterpid, $prefixpid, $finread, $finwrite;
 165         my $foutread, $foutwrite, $decoder, $ret, $filterflag;
 166         my $text, $maxline = 0;
 167
 168         #
 169         # In the simple case, just spit out the text prefixed by the
 170         # quote character
 171         #
 172
 173         if ($encoding eq '7bit' || $encoding eq '8bit') {
 174                 #
 175                 # Switch the character set to whatever is specified by
 176                 # the MIME message
 177                 #
 178                 binmode($input, ":encoding($charset)");
 179                 while (<$input>) {
 180                         $ret = match_boundary($_, $boundary);
 181                         if (defined $ret) {
 182                                 binmode($input, ':encoding(us-ascii)');
 183                                 return $ret;
 184                         }
 185                         print $quoteprefix, $_;
 186                 }
 187                 return 'EOF';
 188         } else {
 189                 #
 190                 # If we've got some other encoding, the input text is almost
 191                 # certainly US-ASCII
 192                 #
 193
 194                 binmode($input, ':encoding(us-ascii)');
 195
 196                 $decoder = find_decoder(lc($encoding));
 197                 if (! defined $decoder) {
 198                         return 'EOF';
 199                 }
 200         }
 201
 202         #
 203         # Okay, assume that the encoding will make it so that we MIGHT need
 204         # to filter it.  Read it in; if the lines are too long, filter it
 205         #
 206
 207         my $chardecode = find_encoding($charset);
 208
 209         while (<$input>) {
 210                 my @lines, $len;
 211
 212                 last if ($ret = match_boundary($_, $boundary));
 213
 214                 $text .= $_;
 215
 216         }
 217
 218         binmode($input, ':encoding(us-ascii)');
 219
 220         $text = $chardecode->decode(&$decoder($text));
 221
 222         grep {
 223                 my $len;
 224                 if (($len = length) > $maxline) {
 225                         $maxline = $len;
 226                 }} split(/^/, $text);
 227
 228         if (! defined $ret) {
 229                 $ret = 'EOF';
 230         }
 231
 232         if ($maxline <= $maxcolwidth) {
 233                 #
 234                 # These are short enough; just output it now as-is
 235                 #
 236                 foreach my $line (split(/^/, $text)) {
 237                         print STDOUT $quoteprefix, $line;
 238                 }
 239                 return $ret;
 240         }
 241
 242         #
 243         # We fork a copy of ourselves to read the output from the filter
 244         # program and prefix the quote character.
 245         #
 246
 247         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 248         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 249
 250         binmode($finread, ":encoding($outcharset)");
 251         binmode($finwrite, ":encoding($outcharset)");
 252         binmode($foutread, ":encoding($outcharset)");
 253         binmode($foutwrite, ":encoding($outcharset)");
 254
 255         if ($filterpid = fork) {
 256                 #
 257                 # Close the pipes in the parent that we're not using
 258                 #
 259
 260                 close($finread);
 261                 close($foutwrite);
 262         } elsif (defined $filterpid) {
 263                 #
 264                 # Close our ununsed filehandles
 265                 #
 266
 267                 close($finwrite);
 268                 close($foutread);
 269
 270                 #
 271                 # Dup() down the filehandles to standard input and output
 272                 #
 273
 274                 open(STDIN, "<&", $finread) ||
 275                                         die "dup(filterin) failed: $!\n";
 276                 open(STDOUT, ">&", $foutwrite) ||
 277                                         die "dup(filterout) failed: $!\n";
 278
 279                 #
 280                 # Close our copies.
 281                 #
 282
 283                 close($finread);
 284                 close($foutwrite);
 285
 286                 #
 287                 # Exec our filter
 288                 #
 289
 290                 exec $filterprogram ||
 291                                 die "Unable to exec $filterprogram: $!\n";
 292         } else {
 293                 die "Fork for $filterprogram failed: $!\n";
 294         }
 295
 296         #
 297         # Fork our output handler.
 298         #
 299
 300         if ($prefixpid = fork) {
 301                 #
 302                 # We don't need these anymore
 303                 #
 304                 close($foutread);
 305
 306         } elsif (defined $prefixpid) {
 307                 #
 308                 # Read from foutwrite, and output (with prefix) to stdout
 309                 #
 310
 311                 close($finwrite);
 312
 313                 while (<$foutread>) {
 314                         print STDOUT $quoteprefix, $_;
 315                 }
 316
 317                 exit 0;
 318         }
 319
 320         #
 321         # Send our input to the filter program
 322         #
 323
 324         if (%filterreplace) {
 325                 foreach my $match (keys %filterreplace) {
 326                          $text =~ s/$match/$filterreplace{$match}/g;
 327                 }
 328         }
 329
 330         print $finwrite $text;
 331
 332         close($finwrite);
 333         waitpid $filterpid, 0;
 334         warn "Filter process exited with ", ($? >> 8), "\n" if $?;
 335         waitpid $prefixpid, 0;
 336         warn "Pipe reader process exited with ", ($? >> 8), "\n" if $?;
 337
 338         return $ret;
 339 }
 340
 341 #
 342 # Filter HTML through a converter program
 343 #
 344
 345 sub process_html (*$$;$)
 346 {
 347         my ($input, $encoding, $charset, $boundary) = @_;
 348         my $filterpid, $prefixpid, $finread, $finwrite;
 349         my $foutread, $foutwrite, $decoder, $ret;
 350
 351         if (! defined($decoder = find_decoder(lc($encoding)))) {
 352                 return 'EOF';
 353         }
 354
 355         #
 356         # We fork a copy of ourselves to read the output from the filter
 357         # program and prefix the quote character.
 358         #
 359
 360         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 361         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 362
 363         binmode($finread, ":encoding($outcharset)");
 364         binmode($finread, ":encoding($outcharset)");
 365         binmode($foutread, ":encoding($outcharset)");
 366         binmode($foutwrite, ":encoding($outcharset)");
 367
 368         if ($filterpid = fork) {
 369                 #
 370                 # Close the pipes in the parent that we're not using
 371                 #
 372
 373                 close($finread);
 374                 close($foutwrite);
 375         } elsif (defined $filterpid) {
 376                 #
 377                 # Close our ununsed filehandles
 378                 #
 379
 380                 close($finwrite);
 381                 close($foutread);
 382
 383                 #
 384                 # Dup() down the filehandles to standard input and output
 385                 #
 386
 387                 open(STDIN, "<&", $finread) ||
 388                                         die "dup(filterin) failed: $!\n";
 389                 open(STDOUT, ">&", $foutwrite) ||
 390                                         die "dup(filterout) failed: $!\n";
 391
 392                 #
 393                 # Close our copies.
 394                 #
 395
 396                 close($finread);
 397                 close($foutwrite);
 398
 399                 #
 400                 # Exec our converter
 401                 #
 402
 403                 my @conv = (@htmlconv, $charset);
 404                 exec (@conv) ||
 405                                 die "Unable to exec $filterprogram: $!\n";
 406         } else {
 407                 die "Fork for $htmlconv[0] failed: $!\n";
 408         }
 409
 410         #
 411         # Fork our output handler.
 412         #
 413
 414         if ($prefixpid = fork) {
 415                 #
 416                 # We don't need these anymore
 417                 #
 418                 close($foutread);
 419
 420         } elsif (defined $prefixpid) {
 421                 #
 422                 # Read from foutwrite, and output (with prefix) to stdout
 423                 #
 424
 425                 close($finwrite);
 426
 427                 while (<$foutread>) {
 428                         print STDOUT $quoteprefix, $_;
 429                 }
 430
 431                 exit 0;
 432         }
 433
 434         #
 435         # Send our input to the filter program
 436         #
 437
 438         while (<$input>) {
 439                 last if ($ret = match_boundary($_, $boundary));
 440                 print $finwrite (&$decoder($_));
 441         }
 442
 443         if (! defined $ret) {
 444                 $ret = 'EOF';
 445         }
 446
 447         close($finwrite);
 448         waitpid $filterpid, 0;
 449         warn "HTML converter process exited with ", scalar($? >> 8), "\n" if $?;
 450         waitpid $prefixpid, 0;
 451         warn "Pipe reader process exited with ", $? >> 8, "\n" if $?;
 452
 453         return $ret;
 454 }
 455
 456 #
 457 # Decide what to do, based on what kind of content it is.
 458 #
 459
 460 sub process_part (*$$$$;$)
 461 {
 462         my ($input, $content_type, $encoding, $charset, $boundary, $name) = @_;
 463         my ($type, $subtype) = (split('/', $content_type, -1), '');
 464
 465         if ($type eq 'text') {
 466                 #
 467                 # If this is a text part, right now we only deal with
 468                 # plain and HTML parts.
 469                 #
 470                 if ($subtype eq 'plain') {
 471                         return process_text($input, $encoding, $charset,
 472                                             $boundary);
 473                 } elsif ($subtype eq 'html') {
 474                         return process_html($input, $encoding, $charset,
 475                                             $boundary);
 476                 } else {
 477                         print ">>> $content_type content\n";
 478                         return eat_part($input, $boundary);
 479                 }
 480         } elsif ($type eq 'multipart') {
 481                 return process_multipart($input, $subtype, $boundary);
 482         } else {
 483                 #
 484                 # Other types we're not sure what to do with right now
 485                 # Just put a marker in there
 486                 #
 487
 488                 print ">>> $content_type attachment";
 489                 if (defined $name) {
 490                         print ", name=$name";
 491                 }
 492                 print "\n";
 493
 494                 return eat_part($input, $boundary);
 495         }
 496 }
 497
 498 #
 499 # Process a multipart message.
 500 #
 501 # When called, we should be right after the beginning of the first
 502 # boundary marker.  So we should be pointed at header lines which describe
 503 # the content of this part
 504 #
 505
 506 sub process_multipart ($$$)
 507 {
 508         my ($input, $subtype, $boundary) = @_;
 509         my $altout;
 510
 511         while (1) {
 512                 my $encoding, $type, $end, $name, $charset;
 513
 514                 #
 515                 # Use the Mail::Header package to read in any headers
 516                 # corresponding to this part
 517                 #
 518
 519                 my $head = Mail::Header->new($input, (MailFrom => 'IGNORE'));
 520
 521                 #
 522                 # Extract out any Content-Type, Content-Transfer-Encoding, and
 523                 # Content-Disposition headers
 524                 #
 525
 526                 my $ctype = Mail::Field->extract('Content-Type', $head);
 527                 my $cte = Mail::Field->extract('Content-Transfer-Encoding',
 528                                                $head);
 529                 my $cdispo = Mail::Field->extract('Content-Disposition', $head);
 530
 531                 if (defined $ctype) {
 532                         $type = $ctype->type;
 533                         $charset = $ctype->charset;
 534                 } else {
 535                         $type = 'text/plain';
 536                         $charset = 'us-ascii';
 537                 }
 538
 539                 $encoding = defined $cte ? lc($cte->param('_')) : '7bit';
 540                 $name = defined $cdispo ? $cdispo->param('filename') : undef;
 541
 542                 #
 543                 # Special handling for multipart/alternative; pick
 544                 # the "first" one we can handle (which is usually
 545                 # text/plain) and silently eat the rest, but output a
 546                 # warning if we can't handle anything.
 547                 #
 548
 549                 if ($altout) {
 550                         $end = eat_part($input, $boundary);
 551                 } else {
 552                         my $subboundary = $boundary;
 553                         my $maintype = (split('/', $type))[0];
 554
 555                         if ($maintype eq 'multipart') {
 556                                 $subboundary = $ctype->boundary;
 557                                 #
 558                                 # Go until we find our beginning of this
 559                                 # part
 560                                 #
 561                                 my $subend = eat_part($input, $subboundary);
 562
 563                                 if ($subend ne 'EOP') {
 564                                         print ">>> WARNING: malformed ",
 565                                                 "nested multipart\n";
 566                                         return $subend;
 567                                 }
 568                         }
 569
 570                         $end = process_part($input, $type, $encoding,
 571                                             $charset, $subboundary, $name);
 572
 573                         if ($subtype eq 'alternative' && ! defined $altout &&
 574                             $type eq 'text/plain') {
 575                                 $altout = 1;
 576                         }
 577
 578                         #
 579                         # Since we changed the semantics of $boundary
 580                         # above for nested multiparts, if we are
 581                         # handling a nested multipart then find the end
 582                         # of our current part
 583                         #
 584
 585                         if ($maintype eq 'multipart') {
 586                                 $end = eat_part($input, $boundary);
 587                         }
 588
 589                 }
 590
 591                 if ($end eq 'EOM' || $end eq 'EOF') {
 592                         if ($subtype eq 'alternative' && !defined $altout) {
 593                                 print ">>>multipart/alternative: no suitable ",
 594                                         "parts\n";
 595                         }
 596                         return $end;
 597                 }
 598         }
 599 }
 600
 601 #
 602 # "Eat" a MIME part; consume content until we hit the boundary or EOF
 603 #
 604
 605 sub eat_part ($$)
 606 {
 607         my ($input, $boundary) = @_;
 608         my $ret;
 609
 610         #
 611         # If we weren't given a boundary just eat input until EOF
 612         #
 613
 614         if (! defined $boundary) {
 615                 while (<$input>) { }
 616                 return 'EOF';
 617         }
 618
 619         #
 620         # Otherwise, consume data until we hit our boundary
 621         #
 622
 623         while (<$input>) {
 624                 if ($ret = match_boundary($_, $boundary)) {
 625                         return $ret;
 626                 }
 627         }
 628
 629         return 'EOF';
 630 }
 631
 632 #
 633 # Return the decoder subroutine to use
 634 #
 635
 636 sub find_decoder ($)
 637 {
 638         my ($encoding) = @_;
 639
 640         if ($encoding eq '7bit' || $encoding eq '8bit') {
 641                 return \&null_decoder;
 642         } elsif ($encoding eq 'base64') {
 643                 return \&decode_base64;
 644         } elsif ($encoding eq 'quoted-printable') {
 645                 return \&decode_qp;
 646         } else {
 647                 warn "Unknown encoding: $encoding\n";
 648                 return undef;
 649         }
 650 }
 651
 652 sub null_decoder ($)
 653 {
 654         my ($input) = @_;
 655
 656         return $input;
 657 }
 658
 659 #
 660 # Match a line against the boundary string
 661 #
 662
 663 sub match_boundary($$)
 664 {
 665         my ($line, $boundary) = @_;
 666
 667         return if ! defined $boundary;
 668
 669         if (substr($line, 0, 2) eq '--') {
 670                 $line =~ s/[ \t\r\n]+\Z//;
 671                 if ($line eq "--$boundary") {
 672                         return 'EOP';
 673                 } elsif ($line eq "--$boundary--") {
 674                         return 'EOM';
 675                 }
 676         }
 677
 678         return undef;
 679 }