diplodocus.org Git - nmh/blob - docs/contrib/replyfilter

   1 #!/usr/bin/perl
   2 #
   3 # replyfilter - A reply filter for nmh
   4 #
   5 # The idea behind this program is that it will act as a format filter
   6 # for nmh.  It will try to extract out all text/plain parts and format
   7 # them if necessary using a filter program.
   8 #
   9 # To use this program, configure nmh in the following way (nmh 1.5 or later):
  10 #
  11 # - Put the path to this program in your .mh_profile under formatproc:
  12 #
  13 #   formatproc: replyfilter
  14 #
  15 # - Create a mhl reply filter that consists of the following line:
  16 #
  17 #   body:nocomponent,format,nowrap,formatarg="%(trim{content-type})%(putstr)",formatarg="%(trim{content-transfer-encoding})%(putstr)",formatarg=">"
  18 #
  19 #   To decode this a bit:
  20 #
  21 #   body        - Output the "body" component
  22 #   nocomponent - Don't output a component prefix (normally here we use a
  23 #                 component prefix of ">" as a quote character, but we're
  24 #                 going to have replyfilter do that).
  25 #   nowrap      - Don't wrap lines if they exceed the column width
  26 #   formatarg   - Arguments to fmtproc.  The first argument is the value of
  27 #                 the Content-type header; the second is the value of the
  28 #                 Content-Transfer-Encoding header.  The last "formatarg"
  29 #                 is used as your quoting prefix.  Replace it with whatever
  30 #                 you want.
  31 #
  32
  33 use Mail::Field;
  34 use MIME::Head;
  35 use MIME::QuotedPrint;
  36 use MIME::Base64;
  37 use Encode;
  38
  39 #
  40 # The program we use to format "long" text
  41 #
  42
  43 $filterprogram = 'par';
  44
  45 #
  46 # Our output character set.  This script assumes a UTF-8 locale, but if you
  47 # want to run under a different locale the change it here.
  48 #
  49
  50 $outcharset = 'utf-8';
  51
  52 #
  53 # Maximum column width (used by the HTML converter and to decide if we need
  54 # to invoke the filter program
  55 #
  56
  57 $maxcolwidth = 78;
  58
  59 #
  60 # Out HTML converter program & arguments
  61 #
  62
  63 @htmlconv = ('w3m', '-dump', '-cols', $maxcolwidth - 2, '-T', 'text/html',
  64              '-O', $outcharset);
  65
  66
  67 die "Usage: $0 Content-type content-transfer-encoding quote-prefix\n"
  68                                 if $#ARGV != 2;
  69
  70 if ($ARGV[0] ne "") {
  71         my $ctype = Mail::Field->new('Content-Type', $ARGV[0]);
  72         $content_type =  $ctype->type;
  73         $charset = $ctype->charset;
  74         $boundary = $ctype->boundary;
  75 } else {
  76         $content_type = 'text/plain';
  77         $charset = 'us-ascii';
  78 }
  79
  80 $encoding = $ARGV[1] eq "" ? '7bit' : lc($ARGV[1]);
  81 $quoteprefix = $ARGV[2];
  82
  83 #
  84 # Set up our output to be in our character set
  85 #
  86
  87 binmode(STDOUT, ":encoding($outcharset)");
  88
  89 #
  90 # The simplest case: if we have a single type of text/plain, send it
  91 # to our format subroutine.
  92 #
  93
  94 if ($content_type eq 'text/plain') {
  95         process_text(\*STDIN, $encoding, $charset);
  96         exit 0;
  97 }
  98
  99 #
 100 # Alright, here's what we need to do.
 101 #
 102 # Find any text/plain parts and decode them.  Decode them via base64 or
 103 # quoted-printable, and feed them to our formatting filter when appropriate.
 104 # Put markers in the output for other content types.
 105 #
 106
 107 ($type) = (split('/', $content_type));
 108
 109 if ($type eq 'multipart') {
 110
 111         #
 112         # For multipart messages we have to do a little extra.
 113         # Eat the MIME prologue (everything up until the first boundary)
 114         #
 115
 116         if (! defined $boundary || $boundary eq '') {
 117                 print "No boundary in Content-Type header!\n";
 118                 eat_part(\*STDIN);
 119                 exit 1;
 120         }
 121
 122         while (<STDIN>) {
 123                 last if match_boundary($_, $boundary);
 124         }
 125
 126         if (eof(STDIN)) {
 127                 print "Unable to find boundary in message\n";
 128                 exit 1;
 129         }
 130 } else {
 131         undef $boundary;
 132 }
 133
 134 process_part(\*STDIN, $content_type, $encoding, $charset, $boundary);
 135
 136 if ($boundary) {
 137         #
 138         # Eat the MIME epilog
 139         #
 140         eat_part(\*STDIN);
 141 }
 142
 143 exit 0;
 144
 145 #
 146 # Handled encoded text.  I think we can assume if the encoding is q-p
 147 # or base64 to feed it into a formatting filter.
 148 #
 149
 150 sub process_text (*$$;$)
 151 {
 152         my ($input, $encoding, $charset, $boundary) = @_;
 153         my $text, $filterpid, $prefixpid, $finread, $finwrite;
 154         my $foutread, $foutwrite, $decoder, $ret, $filterflag;
 155         my @text = ( '' ), $maxline = 0;
 156
 157         #
 158         # In the simple case, just spit out the text prefixed by the
 159         # quote character
 160         #
 161
 162         if ($encoding eq '7bit' || $encoding eq '8bit') {
 163                 #
 164                 # Switch the character set to whatever is specified by
 165                 # the MIME message
 166                 #
 167                 binmode($input, ":encoding($charset)");
 168                 while (<$input>) {
 169                         $ret = match_boundary($_, $boundary);
 170                         if (defined $ret) {
 171                                 binmode($input, ':encoding(us-ascii)');
 172                                 return $ret;
 173                         }
 174                         print $quoteprefix, $_;
 175                 }
 176                 return 'EOF';
 177         } else {
 178                 #
 179                 # If we've got some other encoding, the input text is almost
 180                 # certainly US-ASCII
 181                 #
 182
 183                 binmode($input, ':encoding(us-ascii)');
 184
 185                 $decoder = find_decoder($encoding);
 186                 if (! defined $decoder) {
 187                         return 'EOF';
 188                 }
 189         }
 190
 191         #
 192         # Okay, assume that the encoding will make it so that we MIGHT need
 193         # to filter it.  Read it in; if it's too long, filter it.
 194         #
 195
 196         my $chardecode = find_encoding($charset);
 197
 198         while (<$input>) {
 199                 my @lines, $len;
 200
 201                 last if ($ret = match_boundary($_, $boundary));
 202
 203                 @lines = split(/^/, $chardecode->decode(&$decoder($_)));
 204
 205                 if (substr($text[$#text], -1, 1) eq "\n") {
 206                         push @text, shift @lines;
 207                 } else {
 208                         $text[$#text] .= shift @lines;
 209                 }
 210                 if (($len = length($text[$#text])) > $maxline) {
 211                         $maxline = $len;
 212                 }
 213
 214                 if ($#lines > -1) {
 215                         push @text, @lines;
 216                 }
 217         }
 218
 219         binmode($input, ':encoding(us-ascii)');
 220
 221         if (! defined $ret) {
 222                 $ret = 'EOF';
 223         }
 224
 225         if ($maxline <= $maxcolwidth) {
 226                 #
 227                 # These are short enough; just output it now as-is
 228                 #
 229                 foreach my $line (@text) {
 230                         print STDOUT $quoteprefix, $line;
 231                 }
 232                 return $ret;
 233         }
 234
 235         #
 236         # We fork a copy of ourselves to read the output from the filter
 237         # program and prefix the quote character.
 238         #
 239
 240         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 241         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 242
 243         binmode($finread, ":encoding($outcharset)");
 244         binmode($finwrite, ":encoding($outcharset)");
 245         binmode($foutread, ":encoding($outcharset)");
 246         binmode($foutwrite, ":encoding($outcharset)");
 247
 248         if ($filterpid = fork) {
 249                 #
 250                 # Close the pipes in the parent that we're not using
 251                 #
 252
 253                 close($finread);
 254                 close($foutwrite);
 255         } elsif (defined $filterpid) {
 256                 #
 257                 # Close our ununsed filehandles
 258                 #
 259
 260                 close($finwrite);
 261                 close($foutread);
 262
 263                 #
 264                 # Dup() down the filehandles to standard input and output
 265                 #
 266
 267                 open(STDIN, "<&", $finread) ||
 268                                         die "dup(filterin) failed: $!\n";
 269                 open(STDOUT, ">&", $foutwrite) ||
 270                                         die "dup(filterout) failed: $!\n";
 271
 272                 #
 273                 # Close our copies.
 274                 #
 275
 276                 close($finread);
 277                 close($foutwrite);
 278
 279                 #
 280                 # Exec our filter
 281                 #
 282
 283                 exec $filterprogram ||
 284                                 die "Unable to exec $filterprogram: $!\n";
 285         } else {
 286                 die "Fork for $filterprogram failed: $!\n";
 287         }
 288
 289         #
 290         # Fork our output handler.
 291         #
 292
 293         if ($prefixpid = fork) {
 294                 #
 295                 # We don't need these anymore
 296                 #
 297                 close($foutread);
 298
 299         } elsif (defined $prefixpid) {
 300                 #
 301                 # Read from foutwrite, and output (with prefix) to stdout
 302                 #
 303
 304                 close($finwrite);
 305
 306                 while (<$foutread>) {
 307                         print STDOUT $quoteprefix, $_;
 308                 }
 309
 310                 exit 0;
 311         }
 312
 313         #
 314         # Send our input to the filter program
 315         #
 316
 317         print $finwrite @text;
 318
 319         close($finwrite);
 320         waitpid $filterpid, 0;
 321         warn "Filter process exited with ", ($? >> 8), "\n" if $?;
 322         waitpid $prefixpid, 0;
 323         warn "Pipe reader process exited with ", ($? >> 8), "\n" if $?;
 324
 325         return $ret;
 326 }
 327
 328 #
 329 # Filter HTML through a converter program
 330 #
 331
 332 sub process_html (*$$;$)
 333 {
 334         my ($input, $encoding, $charset, $boundary) = @_;
 335         my $filterpid, $prefixpid, $finread, $finwrite;
 336         my $foutread, $foutwrite, $decoder, $ret;
 337
 338         if (! defined($decoder = find_decoder($encoding))) {
 339                 return 'EOF';
 340         }
 341
 342         #
 343         # We fork a copy of ourselves to read the output from the filter
 344         # program and prefix the quote character.
 345         #
 346
 347         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 348         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 349
 350         binmode($finread, ":encoding($outcharset)");
 351         binmode($finread, ":encoding($outcharset)");
 352         binmode($foutread, ":encoding($outcharset)");
 353         binmode($foutwrite, ":encoding($outcharset)");
 354
 355         if ($filterpid = fork) {
 356                 #
 357                 # Close the pipes in the parent that we're not using
 358                 #
 359
 360                 close($finread);
 361                 close($foutwrite);
 362         } elsif (defined $filterpid) {
 363                 #
 364                 # Close our ununsed filehandles
 365                 #
 366
 367                 close($finwrite);
 368                 close($foutread);
 369
 370                 #
 371                 # Dup() down the filehandles to standard input and output
 372                 #
 373
 374                 open(STDIN, "<&", $finread) ||
 375                                         die "dup(filterin) failed: $!\n";
 376                 open(STDOUT, ">&", $foutwrite) ||
 377                                         die "dup(filterout) failed: $!\n";
 378
 379                 #
 380                 # Close our copies.
 381                 #
 382
 383                 close($finread);
 384                 close($foutwrite);
 385
 386                 #
 387                 # Exec our converter
 388                 #
 389
 390                 exec (@htmlconv) ||
 391                                 die "Unable to exec $filterprogram: $!\n";
 392         } else {
 393                 die "Fork for $htmlconv[0] failed: $!\n";
 394         }
 395
 396         #
 397         # Fork our output handler.
 398         #
 399
 400         if ($prefixpid = fork) {
 401                 #
 402                 # We don't need these anymore
 403                 #
 404                 close($foutread);
 405
 406         } elsif (defined $prefixpid) {
 407                 #
 408                 # Read from foutwrite, and output (with prefix) to stdout
 409                 #
 410
 411                 close($finwrite);
 412
 413                 while (<$foutread>) {
 414                         print STDOUT $quoteprefix, $_;
 415                 }
 416
 417                 exit 0;
 418         }
 419
 420         #
 421         # Send our input to the filter program
 422         #
 423
 424         while (<$input>) {
 425                 last if ($ret = match_boundary($_, $boundary));
 426                 print $finwrite (&$decoder($_));
 427         }
 428
 429         if (! defined $ret) {
 430                 $ret = 'EOF';
 431         }
 432
 433         close($finwrite);
 434         waitpid $filterpid, 0;
 435         warn "HTML converter process exited with ", scalar($? >> 8), "\n" if $?;
 436         waitpid $prefixpid, 0;
 437         warn "Pipe reader process exited with ", $? >> 8, "\n" if $?;
 438
 439         return $ret;
 440 }
 441
 442 #
 443 # Decide what to do, based on what kind of content it is.
 444 #
 445
 446 sub process_part (*$$$$;$)
 447 {
 448         my ($input, $content_type, $encoding, $charset, $boundary, $name) = @_;
 449         my ($type, $subtype) = (split('/', $content_type, -1), '');
 450
 451         if ($type eq 'text') {
 452                 #
 453                 # If this is a text part, right now we only deal with
 454                 # plain and HTML parts.
 455                 #
 456                 if ($subtype eq 'plain') {
 457                         return process_text($input, $encoding, $charset,
 458                                             $boundary);
 459                 } elsif ($subtype eq 'html') {
 460                         return process_html($input, $encoding, $charset,
 461                                             $boundary);
 462                 } else {
 463                         print ">>> $content_type content\n";
 464                         return eat_part($input, $boundary);
 465                 }
 466         } elsif ($type eq 'multipart') {
 467                 return process_multipart($input, $subtype, $boundary);
 468         } else {
 469                 #
 470                 # Other types we're not sure what to do with right now
 471                 # Just put a marker in there
 472                 #
 473
 474                 print ">>> $content_type attachment";
 475                 if (defined $name) {
 476                         print ", name=$name";
 477                 }
 478                 print "\n";
 479
 480                 return eat_part($input, $boundary);
 481         }
 482 }
 483
 484 #
 485 # Process a multipart message.
 486 #
 487 # When called, we should be right after the beginning of the first
 488 # boundary marker.  So we should be pointed at header lines which describe
 489 # the content of this part
 490 #
 491
 492 sub process_multipart ($$$)
 493 {
 494         my ($input, $subtype, $boundary) = @_;
 495         my $altout;
 496
 497         while (1) {
 498                 my $encoding, $type, $end, $name, $charset;
 499
 500                 #
 501                 # Use the Mail::Header package to read in any headers
 502                 # corresponding to this part
 503                 #
 504
 505                 my $head = Mail::Header->new($input, (MailFrom => 'IGNORE'));
 506
 507                 #
 508                 # Extract out any Content-Type, Content-Transfer-Encoding, and
 509                 # Content-Disposition headers
 510                 #
 511
 512                 my $ctype = Mail::Field->extract('Content-Type', $head);
 513                 my $cte = Mail::Field->extract('Content-Transfer-Encoding',
 514                                                $head);
 515                 my $cdispo = Mail::Field->extract('Content-Disposition', $head);
 516
 517                 if (defined $ctype) {
 518                         $type = $ctype->type;
 519                         $charset = $ctype->charset;
 520                 } else {
 521                         $type = 'text/plain';
 522                         $charset = 'us-ascii';
 523                 }
 524
 525                 $encoding = defined $cte ? $cte->param('_') : '7bit';
 526                 $name = defined $cdispo ? $cdispo->param('filename') : undef;
 527
 528                 #
 529                 # Special handling for multipart/alternative; pick
 530                 # the "first" one we can handle (which is usually
 531                 # text/plain) and silently eat the rest, but output a
 532                 # warning if we can't handle anything.
 533                 #
 534
 535                 if ($altout) {
 536                         $end = eat_part($input, $boundary);
 537                 } else {
 538                         my $subboundary = $boundary;
 539                         my $maintype = (split('/', $type))[0];
 540
 541                         if ($maintype eq 'multipart') {
 542                                 $subboundary = $ctype->boundary;
 543                                 #
 544                                 # Go until we find our beginning of this
 545                                 # part
 546                                 #
 547                                 my $subend = eat_part($input, $subboundary);
 548
 549                                 if ($subend ne 'EOP') {
 550                                         print ">>> WARNING: malformed ",
 551                                                 "nested multipart\n";
 552                                         return $subend;
 553                                 }
 554                         }
 555
 556                         $end = process_part($input, $type, $encoding,
 557                                             $charset, $subboundary, $name);
 558
 559                         if ($subtype eq 'alternative' && ! defined $altout &&
 560                             $type eq 'text/plain') {
 561                                 $altout = 1;
 562                         }
 563
 564                         #
 565                         # Since we changed the semantics of $boundary
 566                         # above for nested multiparts, if we are
 567                         # handling a nested multipart then find the end
 568                         # of our current part
 569                         #
 570
 571                         if ($maintype eq 'multipart') {
 572                                 $end = eat_part($input, $boundary);
 573                         }
 574
 575                 }
 576
 577                 if ($end eq 'EOM' || $end eq 'EOF') {
 578                         if ($subtype eq 'alternative' && !defined $altout) {
 579                                 print ">>>multipart/alternative: no suitable ",
 580                                         "parts\n";
 581                         }
 582                         return $end;
 583                 }
 584         }
 585 }
 586
 587 #
 588 # "Eat" a MIME part; consume content until we hit the boundary or EOF
 589 #
 590
 591 sub eat_part ($$)
 592 {
 593         my ($input, $boundary) = @_;
 594         my $ret;
 595
 596         #
 597         # If we weren't given a boundary just eat input until EOF
 598         #
 599
 600         if (! defined $boundary) {
 601                 while (<$input>) { }
 602                 return 'EOF';
 603         }
 604
 605         #
 606         # Otherwise, consume data until we hit our boundary
 607         #
 608
 609         while (<$input>) {
 610                 if ($ret = match_boundary($_, $boundary)) {
 611                         return $ret;
 612                 }
 613         }
 614
 615         return 'EOF';
 616 }
 617
 618 #
 619 # Return the decoder subroutine to use
 620 #
 621
 622 sub find_decoder ($)
 623 {
 624         my ($encoding) = @_;
 625
 626         if ($encoding eq '7bit' || $encoding eq '8bit') {
 627                 return \&null_decoder;
 628         } elsif ($encoding eq 'base64') {
 629                 return \&decode_base64;
 630         } elsif ($encoding eq 'quoted-printable') {
 631                 return \&decode_qp;
 632         } else {
 633                 warn "Unknown encoding: $encoding\n";
 634                 return undef;
 635         }
 636 }
 637
 638 sub null_decoder ($)
 639 {
 640         my ($input) = @_;
 641
 642         return $input;
 643 }
 644
 645 #
 646 # Match a line against the boundary string
 647 #
 648
 649 sub match_boundary($$)
 650 {
 651         my ($_, $boundary) = @_;
 652
 653         return if ! defined $boundary;
 654
 655         if (substr($_, 0, 2) eq '--') {
 656                 s/[ \t\r\n]+\Z//;
 657                 if ($_ eq "--$boundary") {
 658                         return 'EOP';
 659                 } elsif ($_ eq "--$boundary--") {
 660                         return 'EOM';
 661                 }
 662         }
 663
 664         return undef;
 665 }