]> diplodocus.org Git - nmh/blob - docs/contrib/replyfilter
Update manpages to use .TP for tagged paragraphs (part I).
[nmh] / docs / contrib / replyfilter
1 #!/usr/bin/perl
2 #
3 # replyfilter - A reply filter for nmh
4 #
5 # The idea behind this program is that it will act as a format filter
6 # for nmh. It will try to extract out all text/plain parts and format
7 # them if necessary using a filter program.
8 #
9 # To use this program, configure nmh in the following way (nmh 1.5 or later):
10 #
11 # - Put the path to this program in your .mh_profile under formatproc:
12 #
13 # formatproc: replyfilter
14 #
15 # - Create a mhl reply filter that consists of the following line:
16 #
17 # body:nocomponent,format,nowrap,formatarg="%(trim{content-type})%(putstr)",formatarg="%(trim{content-transfer-encoding})%(putstr)",formatarg=">"
18 #
19 # To decode this a bit:
20 #
21 # body - Output the "body" component
22 # nocomponent - Don't output a component prefix (normally here we use a
23 # component prefix of ">" as a quote character, but we're
24 # going to have replyfilter do that).
25 # nowrap - Don't wrap lines if they exceed the column width
26 # formatarg - Arguments to fmtproc. The first argument is the value of
27 # the Content-type header; the second is the value of the
28 # Content-Transfer-Encoding header. The last "formatarg"
29 # is used as your quoting prefix. Replace it with whatever
30 # you want.
31 #
32
33 use Mail::Field;
34 use MIME::Head;
35 use MIME::QuotedPrint;
36 use MIME::Base64;
37 use Encode;
38
39 #
40 # The program we use to format "long" text. Should be capable of reading
41 # from standard input and sending the formatted text to standard output.
42 #
43
44 $filterprogram = 'par';
45
46 #
47 # If the above filter program has problems with some input, use the following
48 # regular expression to remove any problematic input. In this example we
49 # filter out the UTF-8 non-breaking space (U+00A0) because that makes par
50 # mangle the output. Uncomment this if this ends up being a problem for
51 # you, or feel free to add others.
52 #
53
54 #%filterreplace = ( "\N{U+a0}" => " " );
55
56 #
57 # Our output character set. This script assumes a UTF-8 locale, but if you
58 # want to run under a different locale the change it here.
59 #
60
61 $outcharset = 'utf-8';
62
63 #
64 # Maximum column width (used by the HTML converter and to decide if we need
65 # to invoke the filter program
66 #
67
68 $maxcolwidth = 78;
69
70 #
71 # Out HTML converter program & arguments
72 #
73
74 @htmlconv = ('w3m', '-dump', '-cols', $maxcolwidth - 2, '-T', 'text/html',
75 '-O', $outcharset);
76
77
78 die "Usage: $0 Content-type content-transfer-encoding quote-prefix\n"
79 if $#ARGV != 2;
80
81 if ($ARGV[0] ne "") {
82 my $ctype = Mail::Field->new('Content-Type', $ARGV[0]);
83 $content_type = $ctype->type;
84 $charset = $ctype->charset;
85 $boundary = $ctype->boundary;
86 } else {
87 $content_type = 'text/plain';
88 $charset = 'us-ascii';
89 }
90
91 $encoding = $ARGV[1] eq "" ? '7bit' : lc($ARGV[1]);
92 $quoteprefix = $ARGV[2];
93
94 #
95 # Set up our output to be in our character set
96 #
97
98 binmode(STDOUT, ":encoding($outcharset)");
99
100 #
101 # The simplest case: if we have a single type of text/plain, send it
102 # to our format subroutine.
103 #
104
105 if ($content_type eq 'text/plain') {
106 process_text(\*STDIN, $encoding, $charset);
107 exit 0;
108 }
109
110 #
111 # Alright, here's what we need to do.
112 #
113 # Find any text/plain parts and decode them. Decode them via base64 or
114 # quoted-printable, and feed them to our formatting filter when appropriate.
115 # Put markers in the output for other content types.
116 #
117
118 ($type) = (split('/', $content_type));
119
120 if ($type eq 'multipart') {
121
122 #
123 # For multipart messages we have to do a little extra.
124 # Eat the MIME prologue (everything up until the first boundary)
125 #
126
127 if (! defined $boundary || $boundary eq '') {
128 print "No boundary in Content-Type header!\n";
129 eat_part(\*STDIN);
130 exit 1;
131 }
132
133 while (<STDIN>) {
134 last if match_boundary($_, $boundary);
135 }
136
137 if (eof(STDIN)) {
138 print "Unable to find boundary in message\n";
139 exit 1;
140 }
141 } else {
142 undef $boundary;
143 }
144
145 process_part(\*STDIN, $content_type, $encoding, $charset, $boundary);
146
147 if ($boundary) {
148 #
149 # Eat the MIME epilog
150 #
151 eat_part(\*STDIN);
152 }
153
154 exit 0;
155
156 #
157 # Handled encoded text. I think we can assume if the encoding is q-p
158 # or base64 to feed it into a formatting filter.
159 #
160
161 sub process_text (*$$;$)
162 {
163 my ($input, $encoding, $charset, $boundary) = @_;
164 my $text, $filterpid, $prefixpid, $finread, $finwrite;
165 my $foutread, $foutwrite, $decoder, $ret, $filterflag;
166 my $text, $maxline = 0;
167
168 #
169 # In the simple case, just spit out the text prefixed by the
170 # quote character
171 #
172
173 if ($encoding eq '7bit' || $encoding eq '8bit') {
174 #
175 # Switch the character set to whatever is specified by
176 # the MIME message
177 #
178 binmode($input, ":encoding($charset)");
179 while (<$input>) {
180 $ret = match_boundary($_, $boundary);
181 if (defined $ret) {
182 binmode($input, ':encoding(us-ascii)');
183 return $ret;
184 }
185 print $quoteprefix, $_;
186 }
187 return 'EOF';
188 } else {
189 #
190 # If we've got some other encoding, the input text is almost
191 # certainly US-ASCII
192 #
193
194 binmode($input, ':encoding(us-ascii)');
195
196 $decoder = find_decoder(lc($encoding));
197 if (! defined $decoder) {
198 return 'EOF';
199 }
200 }
201
202 #
203 # Okay, assume that the encoding will make it so that we MIGHT need
204 # to filter it. Read it in; if the lines are too long, filter it
205 #
206
207 my $chardecode = find_encoding($charset);
208
209 while (<$input>) {
210 my @lines, $len;
211
212 last if ($ret = match_boundary($_, $boundary));
213
214 $text .= $_;
215
216 }
217
218 binmode($input, ':encoding(us-ascii)');
219
220 $text = $chardecode->decode(&$decoder($text));
221
222 grep {
223 my $len;
224 if (($len = length) > $maxline) {
225 $maxline = $len;
226 }} split(/^/, $text);
227
228 if (! defined $ret) {
229 $ret = 'EOF';
230 }
231
232 if ($maxline <= $maxcolwidth) {
233 #
234 # These are short enough; just output it now as-is
235 #
236 foreach my $line (split(/^/, $text)) {
237 print STDOUT $quoteprefix, $line;
238 }
239 return $ret;
240 }
241
242 #
243 # We fork a copy of ourselves to read the output from the filter
244 # program and prefix the quote character.
245 #
246
247 pipe($finread, $finwrite) || die "pipe() failed: $!\n";
248 pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
249
250 binmode($finread, ":encoding($outcharset)");
251 binmode($finwrite, ":encoding($outcharset)");
252 binmode($foutread, ":encoding($outcharset)");
253 binmode($foutwrite, ":encoding($outcharset)");
254
255 if ($filterpid = fork) {
256 #
257 # Close the pipes in the parent that we're not using
258 #
259
260 close($finread);
261 close($foutwrite);
262 } elsif (defined $filterpid) {
263 #
264 # Close our ununsed filehandles
265 #
266
267 close($finwrite);
268 close($foutread);
269
270 #
271 # Dup() down the filehandles to standard input and output
272 #
273
274 open(STDIN, "<&", $finread) ||
275 die "dup(filterin) failed: $!\n";
276 open(STDOUT, ">&", $foutwrite) ||
277 die "dup(filterout) failed: $!\n";
278
279 #
280 # Close our copies.
281 #
282
283 close($finread);
284 close($foutwrite);
285
286 #
287 # Exec our filter
288 #
289
290 exec $filterprogram ||
291 die "Unable to exec $filterprogram: $!\n";
292 } else {
293 die "Fork for $filterprogram failed: $!\n";
294 }
295
296 #
297 # Fork our output handler.
298 #
299
300 if ($prefixpid = fork) {
301 #
302 # We don't need these anymore
303 #
304 close($foutread);
305
306 } elsif (defined $prefixpid) {
307 #
308 # Read from foutwrite, and output (with prefix) to stdout
309 #
310
311 close($finwrite);
312
313 while (<$foutread>) {
314 print STDOUT $quoteprefix, $_;
315 }
316
317 exit 0;
318 }
319
320 #
321 # Send our input to the filter program
322 #
323
324 if (%filterreplace) {
325 foreach my $match (keys %filterreplace) {
326 $text =~ s/$match/$filterreplace{$match}/g;
327 }
328 }
329
330 print $finwrite $text;
331
332 close($finwrite);
333 waitpid $filterpid, 0;
334 warn "Filter process exited with ", ($? >> 8), "\n" if $?;
335 waitpid $prefixpid, 0;
336 warn "Pipe reader process exited with ", ($? >> 8), "\n" if $?;
337
338 return $ret;
339 }
340
341 #
342 # Filter HTML through a converter program
343 #
344
345 sub process_html (*$$;$)
346 {
347 my ($input, $encoding, $charset, $boundary) = @_;
348 my $filterpid, $prefixpid, $finread, $finwrite;
349 my $foutread, $foutwrite, $decoder, $ret;
350
351 if (! defined($decoder = find_decoder(lc($encoding)))) {
352 return 'EOF';
353 }
354
355 #
356 # We fork a copy of ourselves to read the output from the filter
357 # program and prefix the quote character.
358 #
359
360 pipe($finread, $finwrite) || die "pipe() failed: $!\n";
361 pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
362
363 binmode($finread, ":encoding($outcharset)");
364 binmode($finread, ":encoding($outcharset)");
365 binmode($foutread, ":encoding($outcharset)");
366 binmode($foutwrite, ":encoding($outcharset)");
367
368 if ($filterpid = fork) {
369 #
370 # Close the pipes in the parent that we're not using
371 #
372
373 close($finread);
374 close($foutwrite);
375 } elsif (defined $filterpid) {
376 #
377 # Close our ununsed filehandles
378 #
379
380 close($finwrite);
381 close($foutread);
382
383 #
384 # Dup() down the filehandles to standard input and output
385 #
386
387 open(STDIN, "<&", $finread) ||
388 die "dup(filterin) failed: $!\n";
389 open(STDOUT, ">&", $foutwrite) ||
390 die "dup(filterout) failed: $!\n";
391
392 #
393 # Close our copies.
394 #
395
396 close($finread);
397 close($foutwrite);
398
399 #
400 # Exec our converter
401 #
402
403 exec (@htmlconv) ||
404 die "Unable to exec $filterprogram: $!\n";
405 } else {
406 die "Fork for $htmlconv[0] failed: $!\n";
407 }
408
409 #
410 # Fork our output handler.
411 #
412
413 if ($prefixpid = fork) {
414 #
415 # We don't need these anymore
416 #
417 close($foutread);
418
419 } elsif (defined $prefixpid) {
420 #
421 # Read from foutwrite, and output (with prefix) to stdout
422 #
423
424 close($finwrite);
425
426 while (<$foutread>) {
427 print STDOUT $quoteprefix, $_;
428 }
429
430 exit 0;
431 }
432
433 #
434 # Send our input to the filter program
435 #
436
437 while (<$input>) {
438 last if ($ret = match_boundary($_, $boundary));
439 print $finwrite (&$decoder($_));
440 }
441
442 if (! defined $ret) {
443 $ret = 'EOF';
444 }
445
446 close($finwrite);
447 waitpid $filterpid, 0;
448 warn "HTML converter process exited with ", scalar($? >> 8), "\n" if $?;
449 waitpid $prefixpid, 0;
450 warn "Pipe reader process exited with ", $? >> 8, "\n" if $?;
451
452 return $ret;
453 }
454
455 #
456 # Decide what to do, based on what kind of content it is.
457 #
458
459 sub process_part (*$$$$;$)
460 {
461 my ($input, $content_type, $encoding, $charset, $boundary, $name) = @_;
462 my ($type, $subtype) = (split('/', $content_type, -1), '');
463
464 if ($type eq 'text') {
465 #
466 # If this is a text part, right now we only deal with
467 # plain and HTML parts.
468 #
469 if ($subtype eq 'plain') {
470 return process_text($input, $encoding, $charset,
471 $boundary);
472 } elsif ($subtype eq 'html') {
473 return process_html($input, $encoding, $charset,
474 $boundary);
475 } else {
476 print ">>> $content_type content\n";
477 return eat_part($input, $boundary);
478 }
479 } elsif ($type eq 'multipart') {
480 return process_multipart($input, $subtype, $boundary);
481 } else {
482 #
483 # Other types we're not sure what to do with right now
484 # Just put a marker in there
485 #
486
487 print ">>> $content_type attachment";
488 if (defined $name) {
489 print ", name=$name";
490 }
491 print "\n";
492
493 return eat_part($input, $boundary);
494 }
495 }
496
497 #
498 # Process a multipart message.
499 #
500 # When called, we should be right after the beginning of the first
501 # boundary marker. So we should be pointed at header lines which describe
502 # the content of this part
503 #
504
505 sub process_multipart ($$$)
506 {
507 my ($input, $subtype, $boundary) = @_;
508 my $altout;
509
510 while (1) {
511 my $encoding, $type, $end, $name, $charset;
512
513 #
514 # Use the Mail::Header package to read in any headers
515 # corresponding to this part
516 #
517
518 my $head = Mail::Header->new($input, (MailFrom => 'IGNORE'));
519
520 #
521 # Extract out any Content-Type, Content-Transfer-Encoding, and
522 # Content-Disposition headers
523 #
524
525 my $ctype = Mail::Field->extract('Content-Type', $head);
526 my $cte = Mail::Field->extract('Content-Transfer-Encoding',
527 $head);
528 my $cdispo = Mail::Field->extract('Content-Disposition', $head);
529
530 if (defined $ctype) {
531 $type = $ctype->type;
532 $charset = $ctype->charset;
533 } else {
534 $type = 'text/plain';
535 $charset = 'us-ascii';
536 }
537
538 $encoding = defined $cte ? lc($cte->param('_')) : '7bit';
539 $name = defined $cdispo ? $cdispo->param('filename') : undef;
540
541 #
542 # Special handling for multipart/alternative; pick
543 # the "first" one we can handle (which is usually
544 # text/plain) and silently eat the rest, but output a
545 # warning if we can't handle anything.
546 #
547
548 if ($altout) {
549 $end = eat_part($input, $boundary);
550 } else {
551 my $subboundary = $boundary;
552 my $maintype = (split('/', $type))[0];
553
554 if ($maintype eq 'multipart') {
555 $subboundary = $ctype->boundary;
556 #
557 # Go until we find our beginning of this
558 # part
559 #
560 my $subend = eat_part($input, $subboundary);
561
562 if ($subend ne 'EOP') {
563 print ">>> WARNING: malformed ",
564 "nested multipart\n";
565 return $subend;
566 }
567 }
568
569 $end = process_part($input, $type, $encoding,
570 $charset, $subboundary, $name);
571
572 if ($subtype eq 'alternative' && ! defined $altout &&
573 $type eq 'text/plain') {
574 $altout = 1;
575 }
576
577 #
578 # Since we changed the semantics of $boundary
579 # above for nested multiparts, if we are
580 # handling a nested multipart then find the end
581 # of our current part
582 #
583
584 if ($maintype eq 'multipart') {
585 $end = eat_part($input, $boundary);
586 }
587
588 }
589
590 if ($end eq 'EOM' || $end eq 'EOF') {
591 if ($subtype eq 'alternative' && !defined $altout) {
592 print ">>>multipart/alternative: no suitable ",
593 "parts\n";
594 }
595 return $end;
596 }
597 }
598 }
599
600 #
601 # "Eat" a MIME part; consume content until we hit the boundary or EOF
602 #
603
604 sub eat_part ($$)
605 {
606 my ($input, $boundary) = @_;
607 my $ret;
608
609 #
610 # If we weren't given a boundary just eat input until EOF
611 #
612
613 if (! defined $boundary) {
614 while (<$input>) { }
615 return 'EOF';
616 }
617
618 #
619 # Otherwise, consume data until we hit our boundary
620 #
621
622 while (<$input>) {
623 if ($ret = match_boundary($_, $boundary)) {
624 return $ret;
625 }
626 }
627
628 return 'EOF';
629 }
630
631 #
632 # Return the decoder subroutine to use
633 #
634
635 sub find_decoder ($)
636 {
637 my ($encoding) = @_;
638
639 if ($encoding eq '7bit' || $encoding eq '8bit') {
640 return \&null_decoder;
641 } elsif ($encoding eq 'base64') {
642 return \&decode_base64;
643 } elsif ($encoding eq 'quoted-printable') {
644 return \&decode_qp;
645 } else {
646 warn "Unknown encoding: $encoding\n";
647 return undef;
648 }
649 }
650
651 sub null_decoder ($)
652 {
653 my ($input) = @_;
654
655 return $input;
656 }
657
658 #
659 # Match a line against the boundary string
660 #
661
662 sub match_boundary($$)
663 {
664 my ($line, $boundary) = @_;
665
666 return if ! defined $boundary;
667
668 if (substr($line, 0, 2) eq '--') {
669 $line =~ s/[ \t\r\n]+\Z//;
670 if ($line eq "--$boundary") {
671 return 'EOP';
672 } elsif ($line eq "--$boundary--") {
673 return 'EOM';
674 }
675 }
676
677 return undef;
678 }