diplodocus.org Git - nmh/blob - docs/contrib/replyfilter

   1 #!/usr/bin/perl
   2 #
   3 # replyfilter - A reply filter for nmh
   4 #
   5 # The idea behind this program is that it will act as a format filter
   6 # for nmh.  It will try to extract out all text/plain parts and format
   7 # them if necessary using a filter program.
   8 #
   9 # To use this program, configure nmh in the following way (nmh 1.5 or later):
  10 #
  11 # - Put the path to this program in your .mh_profile under formatproc:
  12 #
  13 #      formatproc: replyfilter
  14 #
  15 #   or invoke repl with "-fmtproc replyfilter".
  16 #
  17 # - Create an mhl reply filter that consists of the following line:
  18 #
  19 #      body:nocomponent,format,nowrap,formatarg="%(trim{content-type})%(putstr)",formatarg="%(trim{content-transfer-encoding})%(putstr)",formatarg=">"
  20 #
  21 #   By default, repl will look for the mhl reply filter by the name
  22 #   "mhl.reply", but it will look elsewhere if the -filter switch is given.
  23 #
  24 #   To decode this a bit:
  25 #
  26 #   body        - Output the "body" component
  27 #   nocomponent - Don't output a component prefix (normally here we use a
  28 #                 component prefix of ">" as a quote character, but we're
  29 #                 going to have replyfilter do that).
  30 #   nowrap      - Don't wrap lines if they exceed the column width
  31 #   formatarg   - Arguments to formatproc.  The first argument is the value
  32 #                 of the Content-type header; the second is the value of the
  33 #                 Content-Transfer-Encoding header.  The last "formatarg"
  34 #                 is used as your quoting prefix.  Replace it with whatever
  35 #                 you want.
  36 #
  37
  38 use Mail::Field;
  39 use MIME::Head;
  40 use MIME::QuotedPrint;
  41 use MIME::Base64;
  42 use Encode;
  43
  44 #
  45 # The program we use to format "long" text.  Should be capable of reading
  46 # from standard input and sending the formatted text to standard output.
  47 #
  48
  49 $filterprogram = 'par';
  50
  51 #
  52 # If the above filter program has problems with some input, use the following
  53 # regular expression to remove any problematic input.  In this example we
  54 # filter out the UTF-8 non-breaking space (U+00A0) because that makes par
  55 # mangle the output.  Uncomment this if this ends up being a problem for
  56 # you, or feel free to add others.
  57 #
  58
  59 #%filterreplace = ( "\N{U+a0}" => " " );
  60
  61 #
  62 # Our output character set.  This script assumes a UTF-8 locale, but if you
  63 # want to run under a different locale the change it here.
  64 #
  65
  66 $outcharset = 'utf-8';
  67
  68 #
  69 # Maximum column width (used by the HTML converter and to decide if we need
  70 # to invoke the filter program
  71 #
  72
  73 $maxcolwidth = 78;
  74
  75 #
  76 # Out HTML converter program & arguments. charset will be appended
  77 #
  78
  79 @htmlconv = ('w3m', '-dump', '-cols', $maxcolwidth - 2, '-T', 'text/html',
  80              '-O', $outcharset, '-I');
  81
  82
  83 die "Usage: $0 Content-type content-transfer-encoding quote-prefix\n"
  84                                 if $#ARGV != 2;
  85
  86 if ($ARGV[0] ne "") {
  87         my $ctype = Mail::Field->new('Content-Type', $ARGV[0]);
  88         $content_type =  $ctype->type;
  89         $charset = $ctype->charset;
  90         $boundary = $ctype->boundary;
  91 } else {
  92         $content_type = 'text/plain';
  93         $charset = 'us-ascii';
  94 }
  95
  96 $encoding = $ARGV[1] eq "" ? '7bit' : lc($ARGV[1]);
  97 $quoteprefix = $ARGV[2];
  98
  99 #
 100 # Set up our output to be in our character set
 101 #
 102
 103 binmode(STDOUT, ":encoding($outcharset)");
 104
 105 #
 106 # The simplest case: if we have a single type of text/plain, send it
 107 # to our format subroutine.
 108 #
 109
 110 if ($content_type eq 'text/plain') {
 111         process_text(\*STDIN, $encoding, $charset);
 112         exit 0;
 113 }
 114
 115 #
 116 # Alright, here's what we need to do.
 117 #
 118 # Find any text/plain parts and decode them.  Decode them via base64 or
 119 # quoted-printable, and feed them to our formatting filter when appropriate.
 120 # Put markers in the output for other content types.
 121 #
 122
 123 ($type) = (split('/', $content_type));
 124
 125 if ($type eq 'multipart') {
 126
 127         #
 128         # For multipart messages we have to do a little extra.
 129         # Eat the MIME prologue (everything up until the first boundary)
 130         #
 131
 132         if (! defined $boundary || $boundary eq '') {
 133                 print "No boundary in Content-Type header!\n";
 134                 eat_part(\*STDIN);
 135                 exit 1;
 136         }
 137
 138         while (<STDIN>) {
 139                 last if match_boundary($_, $boundary);
 140         }
 141
 142         if (eof(STDIN)) {
 143                 print "Unable to find boundary in message\n";
 144                 exit 1;
 145         }
 146 } else {
 147         undef $boundary;
 148 }
 149
 150 process_part(\*STDIN, $content_type, $encoding, $charset, $boundary);
 151
 152 if ($boundary) {
 153         #
 154         # Eat the MIME epilog
 155         #
 156         eat_part(\*STDIN);
 157 }
 158
 159 exit 0;
 160
 161 #
 162 # Handled encoded text.  I think we can assume if the encoding is q-p
 163 # or base64 to feed it into a formatting filter.
 164 #
 165
 166 sub process_text
 167 {
 168         my ($input, $encoding, $charset, $boundary) = @_;
 169         my $text, $filterpid, $prefixpid, $finread, $finwrite;
 170         my $foutread, $foutwrite, $decoder, $ret, $filterflag;
 171         my $text, $maxline = 0;
 172
 173         #
 174         # In the simple case, just spit out the text prefixed by the
 175         # quote character
 176         #
 177
 178         if ($encoding eq '7bit' || $encoding eq '8bit') {
 179                 #
 180                 # Switch the character set to whatever is specified by
 181                 # the MIME message
 182                 #
 183                 binmode($input, ":encoding($charset)");
 184                 while (<$input>) {
 185                         $ret = match_boundary($_, $boundary);
 186                         if (defined $ret) {
 187                                 binmode($input, ':encoding(us-ascii)');
 188                                 return $ret;
 189                         }
 190                         print $quoteprefix, $_;
 191                 }
 192                 return 'EOF';
 193         } else {
 194                 #
 195                 # If we've got some other encoding, the input text is almost
 196                 # certainly US-ASCII
 197                 #
 198
 199                 binmode($input, ':encoding(us-ascii)');
 200
 201                 $decoder = find_decoder(lc($encoding));
 202                 if (! defined $decoder) {
 203                         return 'EOF';
 204                 }
 205         }
 206
 207         #
 208         # Okay, assume that the encoding will make it so that we MIGHT need
 209         # to filter it.  Read it in; if the lines are too long, filter it
 210         #
 211
 212         my $chardecode = find_encoding($charset);
 213
 214         while (<$input>) {
 215                 my @lines, $len;
 216
 217                 last if ($ret = match_boundary($_, $boundary));
 218
 219                 $text .= $_;
 220
 221         }
 222
 223         binmode($input, ':encoding(us-ascii)');
 224
 225         $text = $chardecode->decode(&$decoder($text));
 226
 227         grep {
 228                 my $len;
 229                 if (($len = length) > $maxline) {
 230                         $maxline = $len;
 231                 }} split(/^/, $text);
 232
 233         if (! defined $ret) {
 234                 $ret = 'EOF';
 235         }
 236
 237         if ($maxline <= $maxcolwidth) {
 238                 #
 239                 # These are short enough; just output it now as-is
 240                 #
 241                 foreach my $line (split(/^/, $text)) {
 242                         print STDOUT $quoteprefix, $line;
 243                 }
 244                 return $ret;
 245         }
 246
 247         #
 248         # We fork a copy of ourselves to read the output from the filter
 249         # program and prefix the quote character.
 250         #
 251
 252         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 253         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 254
 255         binmode($finread, ":encoding($outcharset)");
 256         binmode($finwrite, ":encoding($outcharset)");
 257         binmode($foutread, ":encoding($outcharset)");
 258         binmode($foutwrite, ":encoding($outcharset)");
 259
 260         if ($filterpid = fork) {
 261                 #
 262                 # Close the pipes in the parent that we're not using
 263                 #
 264
 265                 close($finread);
 266                 close($foutwrite);
 267         } elsif (defined $filterpid) {
 268                 #
 269                 # Close our ununsed filehandles
 270                 #
 271
 272                 close($finwrite);
 273                 close($foutread);
 274
 275                 #
 276                 # Dup() down the filehandles to standard input and output
 277                 #
 278
 279                 open(STDIN, "<&", $finread) ||
 280                                         die "dup(filterin) failed: $!\n";
 281                 open(STDOUT, ">&", $foutwrite) ||
 282                                         die "dup(filterout) failed: $!\n";
 283
 284                 #
 285                 # Close our copies.
 286                 #
 287
 288                 close($finread);
 289                 close($foutwrite);
 290
 291                 #
 292                 # Exec our filter
 293                 #
 294
 295                 exec $filterprogram ||
 296                                 die "Unable to exec $filterprogram: $!\n";
 297         } else {
 298                 die "Fork for $filterprogram failed: $!\n";
 299         }
 300
 301         #
 302         # Fork our output handler.
 303         #
 304
 305         if ($prefixpid = fork) {
 306                 #
 307                 # We don't need these anymore
 308                 #
 309                 close($foutread);
 310
 311         } elsif (defined $prefixpid) {
 312                 #
 313                 # Read from foutwrite, and output (with prefix) to stdout
 314                 #
 315
 316                 close($finwrite);
 317
 318                 while (<$foutread>) {
 319                         print STDOUT $quoteprefix, $_;
 320                 }
 321
 322                 exit 0;
 323         }
 324
 325         #
 326         # Send our input to the filter program
 327         #
 328
 329         if (%filterreplace) {
 330                 foreach my $match (keys %filterreplace) {
 331                          $text =~ s/$match/$filterreplace{$match}/g;
 332                 }
 333         }
 334
 335         print $finwrite $text;
 336
 337         close($finwrite);
 338         waitpid $filterpid, 0;
 339         warn "Filter process exited with ", ($? >> 8), "\n" if $?;
 340         waitpid $prefixpid, 0;
 341         warn "Pipe reader process exited with ", ($? >> 8), "\n" if $?;
 342
 343         return $ret;
 344 }
 345
 346 #
 347 # Filter HTML through a converter program
 348 #
 349
 350 sub process_html
 351 {
 352         my ($input, $encoding, $charset, $boundary) = @_;
 353         my $filterpid, $prefixpid, $finread, $finwrite;
 354         my $foutread, $foutwrite, $decoder, $ret;
 355
 356         if (! defined($decoder = find_decoder(lc($encoding)))) {
 357                 return 'EOF';
 358         }
 359
 360         #
 361         # We fork a copy of ourselves to read the output from the filter
 362         # program and prefix the quote character.
 363         #
 364
 365         pipe($finread, $finwrite) || die "pipe() failed: $!\n";
 366         pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
 367
 368         binmode($finread, ":encoding($outcharset)");
 369         binmode($finread, ":encoding($outcharset)");
 370         binmode($foutread, ":encoding($outcharset)");
 371         binmode($foutwrite, ":encoding($outcharset)");
 372
 373         if ($filterpid = fork) {
 374                 #
 375                 # Close the pipes in the parent that we're not using
 376                 #
 377
 378                 close($finread);
 379                 close($foutwrite);
 380         } elsif (defined $filterpid) {
 381                 #
 382                 # Close our ununsed filehandles
 383                 #
 384
 385                 close($finwrite);
 386                 close($foutread);
 387
 388                 #
 389                 # Dup() down the filehandles to standard input and output
 390                 #
 391
 392                 open(STDIN, "<&", $finread) ||
 393                                         die "dup(filterin) failed: $!\n";
 394                 open(STDOUT, ">&", $foutwrite) ||
 395                                         die "dup(filterout) failed: $!\n";
 396
 397                 #
 398                 # Close our copies.
 399                 #
 400
 401                 close($finread);
 402                 close($foutwrite);
 403
 404                 #
 405                 # Exec our converter
 406                 #
 407
 408                 my @conv = (@htmlconv, $charset);
 409                 exec (@conv) ||
 410                                 die "Unable to exec $htmlconv[0]: $!\n";
 411         } else {
 412                 die "Fork for $htmlconv[0] failed: $!\n";
 413         }
 414
 415         #
 416         # Fork our output handler.
 417         #
 418
 419         if ($prefixpid = fork) {
 420                 #
 421                 # We don't need these anymore
 422                 #
 423                 close($foutread);
 424
 425         } elsif (defined $prefixpid) {
 426                 #
 427                 # Read from foutwrite, and output (with prefix) to stdout
 428                 #
 429
 430                 close($finwrite);
 431
 432                 while (<$foutread>) {
 433                         print STDOUT $quoteprefix, $_;
 434                 }
 435
 436                 exit 0;
 437         }
 438
 439         #
 440         # Send our input to the filter program
 441         #
 442
 443         while (<$input>) {
 444                 last if ($ret = match_boundary($_, $boundary));
 445                 print $finwrite (&$decoder($_));
 446         }
 447
 448         if (! defined $ret) {
 449                 $ret = 'EOF';
 450         }
 451
 452         close($finwrite);
 453         waitpid $filterpid, 0;
 454         warn "HTML converter process exited with ", scalar($? >> 8), "\n" if $?;
 455         waitpid $prefixpid, 0;
 456         warn "Pipe reader process exited with ", $? >> 8, "\n" if $?;
 457
 458         return $ret;
 459 }
 460
 461 #
 462 # Decide what to do, based on what kind of content it is.
 463 #
 464
 465 sub process_part
 466 {
 467         my ($input, $content_type, $encoding, $charset, $boundary, $name) = @_;
 468         my ($type, $subtype) = (split('/', $content_type, -1), '');
 469
 470         if ($type eq 'text') {
 471                 #
 472                 # If this is a text part, right now we only deal with
 473                 # plain and HTML parts.
 474                 #
 475                 if ($subtype eq 'plain') {
 476                         return process_text($input, $encoding, $charset,
 477                                             $boundary);
 478                 } elsif ($subtype eq 'html') {
 479                         return process_html($input, $encoding, $charset,
 480                                             $boundary);
 481                 } else {
 482                         print ">>> $content_type content\n";
 483                         return eat_part($input, $boundary);
 484                 }
 485         } elsif ($type eq 'multipart') {
 486                 return process_multipart($input, $subtype, $boundary);
 487         } else {
 488                 #
 489                 # Other types we're not sure what to do with right now
 490                 # Just put a marker in there
 491                 #
 492
 493                 print ">>> $content_type attachment";
 494                 if (defined $name) {
 495                         print ", name=$name";
 496                 }
 497                 print "\n";
 498
 499                 return eat_part($input, $boundary);
 500         }
 501 }
 502
 503 #
 504 # Process a multipart message.
 505 #
 506 # When called, we should be right after the beginning of the first
 507 # boundary marker.  So we should be pointed at header lines which describe
 508 # the content of this part
 509 #
 510
 511 sub process_multipart
 512 {
 513         my ($input, $subtype, $boundary) = @_;
 514         my $altout;
 515
 516         while (1) {
 517                 my $encoding, $type, $end, $name, $charset;
 518
 519                 #
 520                 # Use the Mail::Header package to read in any headers
 521                 # corresponding to this part
 522                 #
 523
 524                 my $head = Mail::Header->new($input, (MailFrom => 'IGNORE'));
 525
 526                 #
 527                 # Extract out any Content-Type, Content-Transfer-Encoding, and
 528                 # Content-Disposition headers
 529                 #
 530
 531                 my $ctype = Mail::Field->extract('Content-Type', $head);
 532                 my $cte = Mail::Field->extract('Content-Transfer-Encoding',
 533                                                $head);
 534                 my $cdispo = Mail::Field->extract('Content-Disposition', $head);
 535
 536                 if (defined $ctype) {
 537                         $type = $ctype->type;
 538                         $charset = $ctype->charset;
 539                 } else {
 540                         $type = 'text/plain';
 541                         $charset = 'us-ascii';
 542                 }
 543
 544                 $encoding = defined $cte ? lc($cte->param('_')) : '7bit';
 545                 $name = defined $cdispo ? $cdispo->param('filename') : undef;
 546
 547                 #
 548                 # Special handling for multipart/alternative; pick
 549                 # the "first" one we can handle (which is usually
 550                 # text/plain) and silently eat the rest, but output a
 551                 # warning if we can't handle anything.
 552                 #
 553
 554                 if ($altout) {
 555                         $end = eat_part($input, $boundary);
 556                 } else {
 557                         my $subboundary = $boundary;
 558                         my $maintype = (split('/', $type))[0];
 559
 560                         if ($maintype eq 'multipart') {
 561                                 $subboundary = $ctype->boundary;
 562                                 #
 563                                 # Go until we find our beginning of this
 564                                 # part
 565                                 #
 566                                 my $subend = eat_part($input, $subboundary);
 567
 568                                 if ($subend ne 'EOP') {
 569                                         print ">>> WARNING: malformed ",
 570                                                 "nested multipart\n";
 571                                         return $subend;
 572                                 }
 573                         }
 574
 575                         $end = process_part($input, $type, $encoding,
 576                                             $charset, $subboundary, $name);
 577
 578                         if ($subtype eq 'alternative' && ! defined $altout &&
 579                             $type eq 'text/plain') {
 580                                 $altout = 1;
 581                         }
 582
 583                         #
 584                         # Since we changed the semantics of $boundary
 585                         # above for nested multiparts, if we are
 586                         # handling a nested multipart then find the end
 587                         # of our current part
 588                         #
 589
 590                         if ($maintype eq 'multipart') {
 591                                 $end = eat_part($input, $boundary);
 592                         }
 593
 594                 }
 595
 596                 if ($end eq 'EOM' || $end eq 'EOF') {
 597                         if ($subtype eq 'alternative' && !defined $altout) {
 598                                 print ">>>multipart/alternative: no suitable ",
 599                                         "parts\n";
 600                         }
 601                         return $end;
 602                 }
 603         }
 604 }
 605
 606 #
 607 # "Eat" a MIME part; consume content until we hit the boundary or EOF
 608 #
 609
 610 sub eat_part
 611 {
 612         my ($input, $boundary) = @_;
 613         my $ret;
 614
 615         #
 616         # If we weren't given a boundary just eat input until EOF
 617         #
 618
 619         if (! defined $boundary) {
 620                 while (<$input>) { }
 621                 return 'EOF';
 622         }
 623
 624         #
 625         # Otherwise, consume data until we hit our boundary
 626         #
 627
 628         while (<$input>) {
 629                 if ($ret = match_boundary($_, $boundary)) {
 630                         return $ret;
 631                 }
 632         }
 633
 634         return 'EOF';
 635 }
 636
 637 #
 638 # Return the decoder subroutine to use
 639 #
 640
 641 sub find_decoder
 642 {
 643         my ($encoding) = @_;
 644
 645         if ($encoding eq '7bit' || $encoding eq '8bit') {
 646                 return \&null_decoder;
 647         } elsif ($encoding eq 'base64') {
 648                 return \&decode_base64;
 649         } elsif ($encoding eq 'quoted-printable') {
 650                 return \&decode_qp;
 651         } else {
 652                 warn "Unknown encoding: $encoding\n";
 653                 return undef;
 654         }
 655 }
 656
 657 sub null_decoder
 658 {
 659         my ($input) = @_;
 660
 661         return $input;
 662 }
 663
 664 #
 665 # Match a line against the boundary string
 666 #
 667
 668 sub match_boundary
 669 {
 670         my ($line, $boundary) = @_;
 671
 672         return if ! defined $boundary;
 673
 674         if (substr($line, 0, 2) eq '--') {
 675                 $line =~ s/[ \t\r\n]+\Z//;
 676                 if ($line eq "--$boundary") {
 677                         return 'EOP';
 678                 } elsif ($line eq "--$boundary--") {
 679                         return 'EOM';
 680                 }
 681         }
 682
 683         return undef;
 684 }