]> diplodocus.org Git - nmh/blob - docs/contrib/replyfilter
context_find.c: Hoist strlen(3) out of search loop.
[nmh] / docs / contrib / replyfilter
1 #!/usr/bin/perl
2 #
3 # replyfilter - A reply filter for nmh
4 #
5 # The idea behind this program is that it will act as a format filter
6 # for nmh. It will try to extract out all text/plain parts and format
7 # them if necessary using a filter program.
8 #
9 # To use this program, configure nmh in the following way (nmh 1.5 or later):
10 #
11 # - Put the path to this program in your .mh_profile under formatproc:
12 #
13 # formatproc: replyfilter
14 #
15 # or invoke repl with "-fmtproc replyfilter".
16 #
17 # - Create an mhl reply filter that consists of the following line:
18 #
19 # body:nocomponent,format,nowrap,formatarg="%(trim{content-type})%(putstr)",formatarg="%(trim{content-transfer-encoding})%(putstr)",formatarg=">"
20 #
21 # By default, repl will look for the mhl reply filter by the name
22 # "mhl.reply", but it will look elsewhere if the -filter switch is given.
23 #
24 # To decode this a bit:
25 #
26 # body - Output the "body" component
27 # nocomponent - Don't output a component prefix (normally here we use a
28 # component prefix of ">" as a quote character, but we're
29 # going to have replyfilter do that).
30 # nowrap - Don't wrap lines if they exceed the column width
31 # formatarg - Arguments to formatproc. The first argument is the value
32 # of the Content-type header; the second is the value of the
33 # Content-Transfer-Encoding header. The last "formatarg"
34 # is used as your quoting prefix. Replace it with whatever
35 # you want.
36 #
37
38 use Mail::Field;
39 use MIME::Head;
40 use MIME::QuotedPrint;
41 use MIME::Base64;
42 use Encode;
43
44 #
45 # The program we use to format "long" text. Should be capable of reading
46 # from standard input and sending the formatted text to standard output.
47 #
48
49 $filterprogram = 'par';
50
51 #
52 # If the above filter program has problems with some input, use the following
53 # regular expression to remove any problematic input. In this example we
54 # filter out the UTF-8 non-breaking space (U+00A0) because that makes par
55 # mangle the output. Uncomment this if this ends up being a problem for
56 # you, or feel free to add others.
57 #
58
59 #%filterreplace = ( "\N{U+a0}" => " " );
60
61 #
62 # Our output character set. This script assumes a UTF-8 locale, but if you
63 # want to run under a different locale the change it here.
64 #
65
66 $outcharset = 'utf-8';
67
68 #
69 # Maximum column width (used by the HTML converter and to decide if we need
70 # to invoke the filter program
71 #
72
73 $maxcolwidth = 78;
74
75 #
76 # Out HTML converter program & arguments. charset will be appended
77 #
78
79 @htmlconv = ('w3m', '-dump', '-cols', $maxcolwidth - 2, '-T', 'text/html',
80 '-O', $outcharset, '-I');
81
82
83 die "Usage: $0 Content-type content-transfer-encoding quote-prefix\n"
84 if $#ARGV != 2;
85
86 if ($ARGV[0] ne "") {
87 my $ctype = Mail::Field->new('Content-Type', $ARGV[0]);
88 $content_type = $ctype->type;
89 $charset = $ctype->charset;
90 $boundary = $ctype->boundary;
91 } else {
92 $content_type = 'text/plain';
93 $charset = 'us-ascii';
94 }
95
96 $encoding = $ARGV[1] eq "" ? '7bit' : lc($ARGV[1]);
97 $quoteprefix = $ARGV[2];
98
99 #
100 # Set up our output to be in our character set
101 #
102
103 binmode(STDOUT, ":encoding($outcharset)");
104
105 #
106 # The simplest case: if we have a single type of text/plain, send it
107 # to our format subroutine.
108 #
109
110 if ($content_type eq 'text/plain') {
111 process_text(\*STDIN, $encoding, $charset);
112 exit 0;
113 }
114
115 #
116 # Alright, here's what we need to do.
117 #
118 # Find any text/plain parts and decode them. Decode them via base64 or
119 # quoted-printable, and feed them to our formatting filter when appropriate.
120 # Put markers in the output for other content types.
121 #
122
123 ($type) = (split('/', $content_type));
124
125 if ($type eq 'multipart') {
126
127 #
128 # For multipart messages we have to do a little extra.
129 # Eat the MIME prologue (everything up until the first boundary)
130 #
131
132 if (! defined $boundary || $boundary eq '') {
133 print "No boundary in Content-Type header!\n";
134 eat_part(\*STDIN);
135 exit 1;
136 }
137
138 while (<STDIN>) {
139 last if match_boundary($_, $boundary);
140 }
141
142 if (eof(STDIN)) {
143 print "Unable to find boundary in message\n";
144 exit 1;
145 }
146 } else {
147 undef $boundary;
148 }
149
150 process_part(\*STDIN, $content_type, $encoding, $charset, $boundary);
151
152 if ($boundary) {
153 #
154 # Eat the MIME epilog
155 #
156 eat_part(\*STDIN);
157 }
158
159 exit 0;
160
161 #
162 # Handled encoded text. I think we can assume if the encoding is q-p
163 # or base64 to feed it into a formatting filter.
164 #
165
166 sub process_text
167 {
168 my ($input, $encoding, $charset, $boundary) = @_;
169 my $text, $filterpid, $prefixpid, $finread, $finwrite;
170 my $foutread, $foutwrite, $decoder, $ret, $filterflag;
171 my $text, $maxline = 0;
172
173 #
174 # In the simple case, just spit out the text prefixed by the
175 # quote character
176 #
177
178 if ($encoding eq '7bit' || $encoding eq '8bit') {
179 #
180 # Switch the character set to whatever is specified by
181 # the MIME message
182 #
183 binmode($input, ":encoding($charset)");
184 while (<$input>) {
185 $ret = match_boundary($_, $boundary);
186 if (defined $ret) {
187 binmode($input, ':encoding(us-ascii)');
188 return $ret;
189 }
190 print $quoteprefix, $_;
191 }
192 return 'EOF';
193 } else {
194 #
195 # If we've got some other encoding, the input text is almost
196 # certainly US-ASCII
197 #
198
199 binmode($input, ':encoding(us-ascii)');
200
201 $decoder = find_decoder(lc($encoding));
202 if (! defined $decoder) {
203 return 'EOF';
204 }
205 }
206
207 #
208 # Okay, assume that the encoding will make it so that we MIGHT need
209 # to filter it. Read it in; if the lines are too long, filter it
210 #
211
212 my $chardecode = find_encoding($charset);
213
214 while (<$input>) {
215 my @lines, $len;
216
217 last if ($ret = match_boundary($_, $boundary));
218
219 $text .= $_;
220
221 }
222
223 binmode($input, ':encoding(us-ascii)');
224
225 $text = $chardecode->decode(&$decoder($text));
226
227 grep {
228 my $len;
229 if (($len = length) > $maxline) {
230 $maxline = $len;
231 }} split(/^/, $text);
232
233 if (! defined $ret) {
234 $ret = 'EOF';
235 }
236
237 if ($maxline <= $maxcolwidth) {
238 #
239 # These are short enough; just output it now as-is
240 #
241 foreach my $line (split(/^/, $text)) {
242 print STDOUT $quoteprefix, $line;
243 }
244 return $ret;
245 }
246
247 #
248 # We fork a copy of ourselves to read the output from the filter
249 # program and prefix the quote character.
250 #
251
252 pipe($finread, $finwrite) || die "pipe() failed: $!\n";
253 pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
254
255 binmode($finread, ":encoding($outcharset)");
256 binmode($finwrite, ":encoding($outcharset)");
257 binmode($foutread, ":encoding($outcharset)");
258 binmode($foutwrite, ":encoding($outcharset)");
259
260 if ($filterpid = fork) {
261 #
262 # Close the pipes in the parent that we're not using
263 #
264
265 close($finread);
266 close($foutwrite);
267 } elsif (defined $filterpid) {
268 #
269 # Close our ununsed filehandles
270 #
271
272 close($finwrite);
273 close($foutread);
274
275 #
276 # Dup() down the filehandles to standard input and output
277 #
278
279 open(STDIN, "<&", $finread) ||
280 die "dup(filterin) failed: $!\n";
281 open(STDOUT, ">&", $foutwrite) ||
282 die "dup(filterout) failed: $!\n";
283
284 #
285 # Close our copies.
286 #
287
288 close($finread);
289 close($foutwrite);
290
291 #
292 # Exec our filter
293 #
294
295 exec $filterprogram ||
296 die "Unable to exec $filterprogram: $!\n";
297 } else {
298 die "Fork for $filterprogram failed: $!\n";
299 }
300
301 #
302 # Fork our output handler.
303 #
304
305 if ($prefixpid = fork) {
306 #
307 # We don't need these anymore
308 #
309 close($foutread);
310
311 } elsif (defined $prefixpid) {
312 #
313 # Read from foutwrite, and output (with prefix) to stdout
314 #
315
316 close($finwrite);
317
318 while (<$foutread>) {
319 print STDOUT $quoteprefix, $_;
320 }
321
322 exit 0;
323 }
324
325 #
326 # Send our input to the filter program
327 #
328
329 if (%filterreplace) {
330 foreach my $match (keys %filterreplace) {
331 $text =~ s/$match/$filterreplace{$match}/g;
332 }
333 }
334
335 print $finwrite $text;
336
337 close($finwrite);
338 waitpid $filterpid, 0;
339 warn "Filter process exited with ", ($? >> 8), "\n" if $?;
340 waitpid $prefixpid, 0;
341 warn "Pipe reader process exited with ", ($? >> 8), "\n" if $?;
342
343 return $ret;
344 }
345
346 #
347 # Filter HTML through a converter program
348 #
349
350 sub process_html
351 {
352 my ($input, $encoding, $charset, $boundary) = @_;
353 my $filterpid, $prefixpid, $finread, $finwrite;
354 my $foutread, $foutwrite, $decoder, $ret;
355
356 if (! defined($decoder = find_decoder(lc($encoding)))) {
357 return 'EOF';
358 }
359
360 #
361 # We fork a copy of ourselves to read the output from the filter
362 # program and prefix the quote character.
363 #
364
365 pipe($finread, $finwrite) || die "pipe() failed: $!\n";
366 pipe($foutread, $foutwrite) || die "pipe() (second) failed: $!\n";
367
368 binmode($finread, ":encoding($outcharset)");
369 binmode($finread, ":encoding($outcharset)");
370 binmode($foutread, ":encoding($outcharset)");
371 binmode($foutwrite, ":encoding($outcharset)");
372
373 if ($filterpid = fork) {
374 #
375 # Close the pipes in the parent that we're not using
376 #
377
378 close($finread);
379 close($foutwrite);
380 } elsif (defined $filterpid) {
381 #
382 # Close our ununsed filehandles
383 #
384
385 close($finwrite);
386 close($foutread);
387
388 #
389 # Dup() down the filehandles to standard input and output
390 #
391
392 open(STDIN, "<&", $finread) ||
393 die "dup(filterin) failed: $!\n";
394 open(STDOUT, ">&", $foutwrite) ||
395 die "dup(filterout) failed: $!\n";
396
397 #
398 # Close our copies.
399 #
400
401 close($finread);
402 close($foutwrite);
403
404 #
405 # Exec our converter
406 #
407
408 my @conv = (@htmlconv, $charset);
409 exec (@conv) ||
410 die "Unable to exec $htmlconv[0]: $!\n";
411 } else {
412 die "Fork for $htmlconv[0] failed: $!\n";
413 }
414
415 #
416 # Fork our output handler.
417 #
418
419 if ($prefixpid = fork) {
420 #
421 # We don't need these anymore
422 #
423 close($foutread);
424
425 } elsif (defined $prefixpid) {
426 #
427 # Read from foutwrite, and output (with prefix) to stdout
428 #
429
430 close($finwrite);
431
432 while (<$foutread>) {
433 print STDOUT $quoteprefix, $_;
434 }
435
436 exit 0;
437 }
438
439 #
440 # Send our input to the filter program
441 #
442
443 while (<$input>) {
444 last if ($ret = match_boundary($_, $boundary));
445 print $finwrite (&$decoder($_));
446 }
447
448 if (! defined $ret) {
449 $ret = 'EOF';
450 }
451
452 close($finwrite);
453 waitpid $filterpid, 0;
454 warn "HTML converter process exited with ", scalar($? >> 8), "\n" if $?;
455 waitpid $prefixpid, 0;
456 warn "Pipe reader process exited with ", $? >> 8, "\n" if $?;
457
458 return $ret;
459 }
460
461 #
462 # Decide what to do, based on what kind of content it is.
463 #
464
465 sub process_part
466 {
467 my ($input, $content_type, $encoding, $charset, $boundary, $name) = @_;
468 my ($type, $subtype) = (split('/', $content_type, -1), '');
469
470 if ($type eq 'text') {
471 #
472 # If this is a text part, right now we only deal with
473 # plain and HTML parts.
474 #
475 if ($subtype eq 'plain') {
476 return process_text($input, $encoding, $charset,
477 $boundary);
478 } elsif ($subtype eq 'html') {
479 return process_html($input, $encoding, $charset,
480 $boundary);
481 } else {
482 print ">>> $content_type content\n";
483 return eat_part($input, $boundary);
484 }
485 } elsif ($type eq 'multipart') {
486 return process_multipart($input, $subtype, $boundary);
487 } else {
488 #
489 # Other types we're not sure what to do with right now
490 # Just put a marker in there
491 #
492
493 print ">>> $content_type attachment";
494 if (defined $name) {
495 print ", name=$name";
496 }
497 print "\n";
498
499 return eat_part($input, $boundary);
500 }
501 }
502
503 #
504 # Process a multipart message.
505 #
506 # When called, we should be right after the beginning of the first
507 # boundary marker. So we should be pointed at header lines which describe
508 # the content of this part
509 #
510
511 sub process_multipart
512 {
513 my ($input, $subtype, $boundary) = @_;
514 my $altout;
515
516 while (1) {
517 my $encoding, $type, $end, $name, $charset;
518
519 #
520 # Use the Mail::Header package to read in any headers
521 # corresponding to this part
522 #
523
524 my $head = Mail::Header->new($input, (MailFrom => 'IGNORE'));
525
526 #
527 # Extract out any Content-Type, Content-Transfer-Encoding, and
528 # Content-Disposition headers
529 #
530
531 my $ctype = Mail::Field->extract('Content-Type', $head);
532 my $cte = Mail::Field->extract('Content-Transfer-Encoding',
533 $head);
534 my $cdispo = Mail::Field->extract('Content-Disposition', $head);
535
536 if (defined $ctype) {
537 $type = $ctype->type;
538 $charset = $ctype->charset;
539 } else {
540 $type = 'text/plain';
541 $charset = 'us-ascii';
542 }
543
544 $encoding = defined $cte ? lc($cte->param('_')) : '7bit';
545 $name = defined $cdispo ? $cdispo->param('filename') : undef;
546
547 #
548 # Special handling for multipart/alternative; pick
549 # the "first" one we can handle (which is usually
550 # text/plain) and silently eat the rest, but output a
551 # warning if we can't handle anything.
552 #
553
554 if ($altout) {
555 $end = eat_part($input, $boundary);
556 } else {
557 my $subboundary = $boundary;
558 my $maintype = (split('/', $type))[0];
559
560 if ($maintype eq 'multipart') {
561 $subboundary = $ctype->boundary;
562 #
563 # Go until we find our beginning of this
564 # part
565 #
566 my $subend = eat_part($input, $subboundary);
567
568 if ($subend ne 'EOP') {
569 print ">>> WARNING: malformed ",
570 "nested multipart\n";
571 return $subend;
572 }
573 }
574
575 $end = process_part($input, $type, $encoding,
576 $charset, $subboundary, $name);
577
578 if ($subtype eq 'alternative' && ! defined $altout &&
579 $type eq 'text/plain') {
580 $altout = 1;
581 }
582
583 #
584 # Since we changed the semantics of $boundary
585 # above for nested multiparts, if we are
586 # handling a nested multipart then find the end
587 # of our current part
588 #
589
590 if ($maintype eq 'multipart') {
591 $end = eat_part($input, $boundary);
592 }
593
594 }
595
596 if ($end eq 'EOM' || $end eq 'EOF') {
597 if ($subtype eq 'alternative' && !defined $altout) {
598 print ">>>multipart/alternative: no suitable ",
599 "parts\n";
600 }
601 return $end;
602 }
603 }
604 }
605
606 #
607 # "Eat" a MIME part; consume content until we hit the boundary or EOF
608 #
609
610 sub eat_part
611 {
612 my ($input, $boundary) = @_;
613 my $ret;
614
615 #
616 # If we weren't given a boundary just eat input until EOF
617 #
618
619 if (! defined $boundary) {
620 while (<$input>) { }
621 return 'EOF';
622 }
623
624 #
625 # Otherwise, consume data until we hit our boundary
626 #
627
628 while (<$input>) {
629 if ($ret = match_boundary($_, $boundary)) {
630 return $ret;
631 }
632 }
633
634 return 'EOF';
635 }
636
637 #
638 # Return the decoder subroutine to use
639 #
640
641 sub find_decoder
642 {
643 my ($encoding) = @_;
644
645 if ($encoding eq '7bit' || $encoding eq '8bit') {
646 return \&null_decoder;
647 } elsif ($encoding eq 'base64') {
648 return \&decode_base64;
649 } elsif ($encoding eq 'quoted-printable') {
650 return \&decode_qp;
651 } else {
652 warn "Unknown encoding: $encoding\n";
653 return undef;
654 }
655 }
656
657 sub null_decoder
658 {
659 my ($input) = @_;
660
661 return $input;
662 }
663
664 #
665 # Match a line against the boundary string
666 #
667
668 sub match_boundary
669 {
670 my ($line, $boundary) = @_;
671
672 return if ! defined $boundary;
673
674 if (substr($line, 0, 2) eq '--') {
675 $line =~ s/[ \t\r\n]+\Z//;
676 if ($line eq "--$boundary") {
677 return 'EOP';
678 } elsif ($line eq "--$boundary--") {
679 return 'EOM';
680 }
681 }
682
683 return undef;
684 }