]> diplodocus.org Git - nmh/blob - docs/historical/mh-6.8.5/sbr/m_getfld.c
sbr/mts.c: Delete mmdlm2; use same-valued mmdlm1 instead.
[nmh] / docs / historical / mh-6.8.5 / sbr / m_getfld.c
1 /* m_getfld.c - read/parse a message */
2 #ifndef lint
3 static char ident[] = "@(#)$Id: m_getfld.c,v 1.15 1993/02/26 21:57:14 jromine Exp $";
4 #endif /* lint */
5
6 #include "../h/mh.h"
7 #include <stdio.h>
8 #include "../zotnet/mts.h"
9 #include <ctype.h>
10
11
12 /* This module has a long and checkered history. First, it didn't burst
13 maildrops correctly because it considered two CTRL-A:s in a row to be
14 an inter-message delimiter. It really is four CTRL-A:s followed by a
15 newline. Unfortunately, MMDF will convert this delimiter *inside* a
16 message to a CTRL-B followed by three CTRL-A:s and a newline. This
17 caused the old version of m_getfld() to declare eom prematurely. The
18 fix was a lot slower than
19
20 c == '\001' && peekc (iob) == '\001'
21
22 but it worked, and to increase generality, UUCP style maildrops could
23 be parsed as well. Unfortunately the speed issue finally caught up with
24 us since this routine is at the very heart of MH.
25
26 To speed things up considerably, the routine Eom() was made an auxiliary
27 function called by the macro eom(). Unless we are bursting a maildrop,
28 the eom() macro returns FALSE saying we aren't at the end of the
29 message.
30
31 The next thing to do is to read the mtstailor file and initialize
32 delimiter[] and delimlen accordingly...
33
34 After mhl was made a built-in in msh, m_getfld() worked just fine
35 (using m_unknown() at startup). Until one day: a message which was
36 the result of a bursting was shown. Then, since the burst boundaries
37 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
38 Very sad. The solution: introduce m_eomsbr(). This hook gets called
39 after the end of each line (since testing for eom involves an fseek()).
40 This worked fine, until one day: a message with no body portion arrived.
41 Then the
42
43 while (eom (c = Getc (iob), iob))
44 continue;
45
46 loop caused m_getfld() to return FMTERR. So, that logic was changed to
47 check for (*eom_action) and act accordingly.
48
49 This worked fine, until one day: someone didn't use four CTRL:A's as
50 their delimiters. So, the bullet got bit and we read mts.h and
51 continue to struggle on. It's not that bad though, since the only time
52 the code gets executed is when inc (or msh) calls it, and both of these
53 have already called mts_init().
54
55 ------------------------
56 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
57
58 This routine was accounting for 60% of the cpu time used by most mh
59 programs. I spent a bit of time tuning and it now accounts for <10%
60 of the time used. Like any heavily tuned routine, it's a bit
61 complex and you want to be sure you understand everything that it's
62 doing before you start hacking on it. Let me try to emphasize
63 that: every line in this atrocity depends on every other line,
64 sometimes in subtle ways. You should understand it all, in detail,
65 before trying to change any part. If you do change it, test the
66 result thoroughly (I use a hand-constructed test file that exercises
67 all the ways a header name, header body, header continuation,
68 header-body separator, body line and body eom can align themselves
69 with respect to a buffer boundary). "Minor" bugs in this routine
70 result in garbaged or lost mail.
71
72 If you hack on this and slow it down, I, my children and my
73 children's children will curse you.
74
75 This routine gets used on three different types of files: normal,
76 single msg files, "packed" unix or mmdf mailboxs (when used by inc)
77 and packed, directoried bulletin board files (when used by msh).
78 The biggest impact of different file types is in "eom" testing. The
79 code has been carefully organized to test for eom at appropriate
80 times and at no other times (since the check is quite expensive).
81 I have tried to arrange things so that the eom check need only be
82 done on entry to this routine. Since an eom can only occur after a
83 newline, this is easy to manage for header fields. For the msg
84 body, we try to efficiently search the input buffer to see if it
85 contains the eom delimiter. If it does, we take up to the
86 delimiter, otherwise we take everything in the buffer. (The change
87 to the body eom/copy processing produced the most noticeable
88 performance difference, particularly for "inc" and "show".)
89
90 There are three qualitatively different things this routine busts
91 out of a message: field names, field text and msg bodies. Field
92 names are typically short (~8 char) and the loop that extracts them
93 might terminate on a colon, newline or max width. I considered
94 using a Vax "scanc" to locate the end of the field followed by a
95 "bcopy" but the routine call overhead on a Vax is too large for this
96 to work on short names. If Berkeley ever makes "inline" part of the
97 C optimiser (so things like "scanc" turn into inline instructions) a
98 change here would be worthwhile.
99
100 Field text is typically 60 - 100 characters so there's (barely)
101 a win in doing a routine call to something that does a "locc"
102 followed by a "bmove". About 30% of the fields have continuations
103 (usually the 822 "received:" lines) and each continuation generates
104 another routine call. "Inline" would be a big win here, as well.
105
106 Messages, as of this writing, seem to come in two flavors: small
107 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
108 so message bodies average at least a few hundred characters.
109 Assuming your system uses reasonably sized stdio buffers (1K or
110 more), this routine should be able to remove the body in large
111 (>500 byte) chunks. This makes the cost of a call to "bcopy"
112 small but there is a premium on checking for the eom in packed
113 maildrops. The eom pattern is always a simple string so we can
114 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
115 instruction). Some thought went into recognizing the start of
116 an eom that has been split across two buffers.
117
118 This routine wants to deal with large chunks of data so, rather
119 than "getc" into a local buffer, it uses stdio's buffer. If
120 you try to use it on a non-buffered file, you'll get what you
121 deserve. This routine "knows" that struct FILEs have a _ptr
122 and a _cnt to describe the current state of the buffer and
123 it knows that _filbuf ignores the _ptr & _cnt and simply fills
124 the buffer. If stdio on your system doesn't work this way, you
125 may have to make small changes in this routine.
126
127 This routine also "knows" that an EOF indication on a stream is
128 "sticky" (i.e., you will keep getting EOF until you reposition the
129 stream). If your system doesn't work this way it is broken and you
130 should complain to the vendor. As a consequence of the sticky
131 EOF, this routine will never return any kind of EOF status when
132 there is data in "name" or "buf".
133 */
134
135
/* Getc(iob): read the next character from stream iob; expands to getc(). */
136 #define Getc(iob) getc(iob)
/*
 * eom(c,iob): true when c, the character just read from iob, begins an
 * end-of-message boundary.  It is always false while msg_style is
 * MS_DEFAULT.  Otherwise it is true if c equals the first byte of
 * msg_delim and m_Eom() returns non-zero, or if an eom_action hook is
 * installed and returns non-zero for c.
 */
137 #define eom(c,iob) (msg_style != MS_DEFAULT && \
138 (((c) == *msg_delim && m_Eom(c,iob)) ||\
139 (eom_action && (*eom_action)(c))))
140
/* Forward declarations of the file-local search helpers defined below. */
141 static unsigned char *matchc();
142 static unsigned char *locc();
143
/*
 * Table indexed by byte value.  It is allocated and filled by m_unknown()
 * and consulted by the BODY case of m_getfld() to decide whether the
 * buffer ends with part of the delimiter.
 */
144 static unsigned char **pat_map;
145
146 extern int msg_count; /* defined in sbr/m_msgdef.c = 0
147 * disgusting hack for "inc" so it can
148 * know how many characters were stuffed
149 * in the buffer on the last call (see
150 * comments in uip/scansbr.c) */
151
152 extern int msg_style; /* defined in sbr/m_msgdef.c = MS_DEFAULT */
153 /*
154 * The "full" delimiter string for a packed maildrop consists
155 * of a newline followed by the actual delimiter. E.g., the
156 * full string for a Unix maildrop would be: "\n\nFrom ".
157 * "Fdelim" points to the start of the full string and is used
158 * in the BODY case of the main routine to search the buffer for
159 * a possible eom. Msg_delim points to the first character of
160 * the actual delim. string (i.e., fdelim+1). Edelim
161 * points to the 2nd character of actual delimiter string. It
162 * is used in m_Eom because the first character of the string
163 * has been read and matched before m_Eom is called.
164 */
165 extern char *msg_delim; /* defined in sbr/m_msgdef.c = "" */
/*
 * The variables below are all set by m_unknown(); m_eomsbr() later
 * adjusts fdelimlen and delimend.
 *   fdelim    - the '\n' stored just in front of the delimiter copy
 *   delimend  - msg_delim + edelimlen, i.e. the delimiter's last byte
 *   fdelimlen - delimiter length plus one (for the leading '\n')
 *   edelim    - second byte of the delimiter (msg_delim + 1)
 *   edelimlen - delimiter length minus one
 */
166 static unsigned char *fdelim;
167 static unsigned char *delimend;
168 static int fdelimlen;
169 static unsigned char *edelim;
170 static int edelimlen;
171
/* Optional end-of-message hook installed by m_eomsbr(); NULL when unset. */
172 static int (*eom_action) () = NULL;
173
/*
 * When _FSTDIO is defined, map the stdio buffer names used in this file
 * (_ptr, _cnt, _filbuf) onto _p, _r and __srget.
 */
174 #ifdef _FSTDIO
175 #define _ptr _p /* Gag */
176 #define _cnt _r /* Retch */
177 #define _filbuf __srget /* Puke */
178 #endif
179
180 /* \f */
181
182 m_getfld (state, name, buf, bufsz, iob)
183 int state;
184 int bufsz;
185 unsigned char *name,
186 *buf;
187 register FILE *iob;
188 {
189 register unsigned char *cp;
190 register unsigned char *bp;
191 register unsigned char *ep;
192 register unsigned char *sp;
193 register int cnt;
194 register int c;
195 register int i;
196 register int j;
197
198 if ((c = Getc(iob)) < 0) {
199 msg_count = 0;
200 *buf = 0;
201 return FILEEOF;
202 }
203 if (eom (c, iob)) {
204 if (! eom_action) {
205 /* flush null messages */
206 while ((c = Getc(iob)) >= 0 && eom (c, iob))
207 ;
208 if (c >= 0)
209 (void) ungetc(c, iob);
210 }
211 msg_count = 0;
212 *buf = 0;
213 return FILEEOF;
214 }
215
216 switch (state) {
217 case FLDEOF:
218 case BODYEOF:
219 case FLD:
220 if (c == '\n' || c == '-') {
221 /* we hit the header/body separator */
222 while (c != '\n' && (c = Getc(iob)) >= 0)
223 ;
224
225 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
226 if (! eom_action) {
227 /* flush null messages */
228 while ((c = Getc(iob)) >= 0 && eom (c, iob))
229 ;
230 if (c >= 0)
231 (void) ungetc(c, iob);
232 }
233 msg_count = 0;
234 *buf = 0;
235 return FILEEOF;
236 }
237 state = BODY;
238 goto body;
239 }
240 /*
241 * get the name of this component. take characters up
242 * to a ':', a newline or NAMESZ-1 characters, whichever
243 * comes first.
244 */
245 cp = name; i = NAMESZ - 1;
246 for (;;) {
247 bp = sp = (unsigned char *) iob->_ptr - 1;
248 j = (cnt = iob->_cnt+1) < i ? cnt : i;
249 while ((c = *bp++) != ':' && c != '\n' && --j >= 0)
250 *cp++ = c;
251
252 j = bp - sp;
253 if ((cnt -= j) <= 0) {
254 if (_filbuf(iob) == EOF) {
255 *cp = *buf = 0;
256 advise (NULLCP, "eof encountered in field \"%s\"",
257 name);
258 return FMTERR;
259 }
260 } else {
261 iob->_ptr = bp + 1;
262 iob->_cnt = cnt - 1;
263 }
264 if (c == ':')
265 break;
266
267 /*
268 * something went wrong. possibilities are:
269 * . hit a newline (error)
270 * . got more than namesz chars. (error)
271 * . hit the end of the buffer. (loop)
272 */
273 if (c == '\n') {
274 *cp = *buf = 0;
275 advise (NULLCP, "eol encountered in field \"%s\"", name);
276 state = FMTERR;
277 goto finish;
278 }
279 if ((i -= j) <= 0) {
280 *cp = *buf = 0;
281 advise (NULLCP, "field name \"%s\" exceeds %d bytes",
282 name, NAMESZ - 1);
283 state = LENERR;
284 goto finish;
285 }
286 }
287
288 while (isspace (*--cp) && cp >= name)
289 ;
290 *++cp = 0;
291 /* fall through */
292
293 case FLDPLUS:
294 /*
295 * get (more of) the text of a field. take
296 * characters up to the end of this field (newline
297 * followed by non-blank) or bufsz-1 characters.
298 */
299 cp = buf; i = bufsz-1;
300 for (;;) {
301 cnt = iob->_cnt++; bp = (unsigned char *) --iob->_ptr;
302 c = cnt < i ? cnt : i;
303 while (ep = locc( c, bp, '\n' )) {
304 /*
305 * if we hit the end of this field, return.
306 */
307 if ((j = *++ep) != ' ' && j != '\t') {
308 j = ep - (unsigned char *) iob->_ptr;
309 (void) bcopy( iob->_ptr, cp, j);
310 iob->_ptr = ep; iob->_cnt -= j;
311 cp += j;
312 state = FLD;
313 goto finish;
314 }
315 c -= ep - bp; bp = ep;
316 }
317 /*
318 * end of input or dest buffer - copy what we've found.
319 */
320 c += bp - (unsigned char *) iob->_ptr;
321 (void) bcopy( iob->_ptr, cp, c);
322 i -= c; cp += c;
323 if (i <= 0) {
324 /* the dest buffer is full */
325 iob->_cnt -= c; iob->_ptr += c;
326 state = FLDPLUS;
327 break;
328 }
329 /*
330 * There's one character left in the input buffer.
331 * Copy it & fill the buffer. If the last char
332 * was a newline and the next char is not whitespace,
333 * this is the end of the field. Otherwise loop.
334 */
335 --i;
336 *cp++ = j = *(iob->_ptr + c);
337 c = _filbuf(iob);
338 if ((j == '\0' || j == '\n') && c != ' ' && c != '\t') {
339 if (c != EOF)
340 --iob->_ptr, ++iob->_cnt;
341 state = FLD;
342 break;
343 }
344 }
345 break;
346
347 case BODY:
348 body:
349 /*
350 * get the message body up to bufsz characters or the
351 * end of the message. Sleazy hack: if bufsz is negative
352 * we assume that we were called to copy directly into
353 * the output buffer and we don't add an eos.
354 */
355 i = (bufsz < 0) ? -bufsz : bufsz-1;
356 bp = (unsigned char *) --iob->_ptr; cnt = ++iob->_cnt;
357 c = (cnt < i ? cnt : i);
358 if (msg_style != MS_DEFAULT && c > 1) {
359 /*
360 * packed maildrop - only take up to the (possible)
361 * start of the next message. This "matchc" should
362 * probably be a Boyer-Moore matcher for non-vaxen,
363 * particularly since we have the alignment table
364 * all built for the end-of-buffer test (next).
365 * But our vax timings indicate that the "matchc"
366 * instruction is 50% faster than a carefully coded
367 * B.M. matcher for most strings. (So much for elegant
368 * algorithms vs. brute force.) Since I (currently)
369 * run MH on a vax, we use the matchc instruction. --vj
370 */
371 if (ep = matchc( fdelimlen, fdelim, c, bp ) )
372 c = ep - bp + 1;
373 else {
374 /*
375 * There's no delim in the buffer but there may be
376 * a partial one at the end. If so, we want to leave
377 * it so the "eom" check on the next call picks it up.
378 * Use a modified Boyer-Moore matcher to make this
379 * check relatively cheap. The first "if" figures
380 * out what position in the pattern matches the last
381 * character in the buffer. The inner "while" matches
382 * the pattern against the buffer, backwards starting
383 * at that position. Note that unless the buffer
384 * ends with one of the characters in the pattern
385 * (excluding the first and last), we do only one test.
386 */
387 ep = bp + c - 1;
388 if (sp = pat_map[*ep]) {
389 do {
390 cp = sp;
391 while (*--ep == *--cp)
392 ;
393 if (cp < fdelim) {
394 if (ep >= bp)
395 /*
396 * ep < bp means that all the buffer
397 * contains is a prefix of delim.
398 * If this prefix is really a delim, the
399 * m_eom call at entry should have found
400 * it. Thus it's not a delim and we can
401 * take all of it.
402 */
403 c = (ep - bp) + 2;
404 break;
405 }
406 /* try matching one less char of delim string */
407 ep = bp + c - 1;
408 } while (--sp > fdelim);
409 }
410 }
411 }
412 (void) bcopy( bp, buf, c );
413 iob->_cnt -= c;
414 iob->_ptr += c;
415 if (bufsz < 0) {
416 msg_count = c;
417 return (state);
418 }
419 cp = buf + c;
420 break;
421
422 default:
423 adios (NULLCP, "m_getfld() called with bogus state of %d", state);
424 }
425 finish:;
426 *cp = 0;
427 msg_count = cp - buf;
428 return (state);
429 }
430
431 /* \f */
432
433 #ifdef RPATHS
434 static char unixbuf[BUFSIZ] = "";
435 #endif /* RPATHS */
436
437 void
438 m_unknown(iob)
439 register FILE *iob;
440 {
441 register int c;
442 register long pos;
443 char text[10];
444 register char *cp;
445 register char *delimstr;
446
447 msg_style = MS_UNKNOWN;
448
449 /* Figure out what the message delimitter string is for this
450 * maildrop. (This used to be part of m_Eom but I didn't like
451 * the idea of an "if" statement that could only succeed on the
452 * first call to m_Eom getting executed on each call, i.e., at
453 * every newline in the message).
454 *
455 * If the first line of the maildrop is a Unix "from" line, we say the
456 * style is UUCP and eat the rest of the line. Otherwise we say the style
457 * is MMDF & look for the delimiter string specified when MH was built
458 * (or from the mtstailor file).
459 */
460 pos = ftell (iob);
461 if (fread (text, sizeof *text, 5, iob) == 5
462 && strncmp (text, "From ", 5) == 0) {
463 msg_style = MS_UUCP;
464 delimstr = "\nFrom ";
465 #ifndef RPATHS
466 while ((c = getc (iob)) != '\n' && c >= 0)
467 ;
468 #else /* RPATHS */
469 cp = unixbuf;
470 while ((c = getc (iob)) != '\n')
471 *cp++ = c;
472 *cp = 0;
473 #endif /* RPATHS */
474 } else {
475 /* not a Unix style maildrop */
476 (void) fseek (iob, pos, 0);
477 if (mmdlm2 == NULLCP || *mmdlm2 == 0)
478 mmdlm2 = "\001\001\001\001\n";
479 delimstr = mmdlm2;
480 msg_style = MS_MMDF;
481 }
482 c = strlen (delimstr);
483 fdelim = (unsigned char *)malloc((unsigned)c + 3);
484 *fdelim++ = '\0';
485 *fdelim = '\n';
486 msg_delim = (char *)fdelim+1;
487 edelim = (unsigned char *)msg_delim+1;
488 fdelimlen = c + 1;
489 edelimlen = c - 1;
490 (void)strcpy(msg_delim, delimstr);
491 delimend = (unsigned char *)msg_delim + edelimlen;
492 if (edelimlen <= 1)
493 adios (NULLCP, "maildrop delimiter must be at least 2 bytes");
494 /*
495 * build a Boyer-Moore end-position map for the matcher in m_getfld.
496 * N.B. - we don't match just the first char (since it's the newline
497 * separator) or the last char (since the matchc would have found it
498 * if it was a real delim).
499 */
500 pat_map = (unsigned char **) calloc (256, sizeof (unsigned char *));
501
502 for (cp = (char *)fdelim + 1; cp < (char *)delimend; cp++ )
503 pat_map[*cp] = (unsigned char *)cp;
504
505 if (msg_style == MS_MMDF) {
506 /* flush extra msg hdrs */
507 while ((c = Getc(iob)) >= 0 && eom (c, iob))
508 ;
509 if (c >= 0)
510 (void) ungetc(c, iob);
511 }
512 }
513
514
515 void m_eomsbr (action)
516 int (*action) ();
517 {
518 if (eom_action = action) {
519 msg_style = MS_MSH;
520 *msg_delim = 0;
521 fdelimlen = 1;
522 delimend = fdelim;
523 } else {
524 msg_style = MS_MMDF;
525 msg_delim = (char *)fdelim + 1;
526 fdelimlen = strlen((char *)fdelim);
527 delimend = (unsigned char *)(msg_delim + edelimlen);
528 }
529 }
530
531 /* \f */
532
533 /* test for msg delimiter string */
534
535 int m_Eom (c, iob)
536 register int c;
537 register FILE *iob;
538 {
539 register long pos = 0L;
540 register int i;
541 char text[10];
542 #ifdef RPATHS
543 register char *cp;
544 #endif /* RPATHS */
545
546 pos = ftell (iob);
547 if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
548 || strncmp (text, (char *)edelim, edelimlen)) {
549 if (i == 0 && msg_style == MS_UUCP)
550 /* the final newline in the (brain damaged) unix-format
551 * maildrop is part of the delimitter - delete it.
552 */
553 return 1;
554
555 #ifdef notdef
556 (void) fseek (iob, pos, 0);
557 #else
558 (void) fseek (iob, (long)(pos-1), 0);
559 (void) getc (iob); /* should be OK */
560 #endif /* !notdef */
561 return 0;
562 }
563
564 if (msg_style == MS_UUCP) {
565 #ifndef RPATHS
566 while ((c = getc (iob)) != '\n')
567 if (c < 0)
568 break;
569 #else /* RPATHS */
570 cp = unixbuf;
571 while ((c = getc (iob)) != '\n' && c >= 0)
572 *cp++ = c;
573 *cp = 0;
574 #endif /* RPATHS */
575 }
576
577 return 1;
578 }
579
580 /* \f */
581
#ifdef RPATHS
/*
 * unixline - turn the saved UUCP "From " line text in unixbuf into a
 * return-path line.
 *
 * If the text contains "remote from HOST", the result starts with
 * "HOST!".  When at least 25 characters separate the first blank from
 * the end of the text (or from "remote from"), those last 25 characters
 * are cut off.  What is left of unixbuf is appended, followed by a
 * newline.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  unixbuf is emptied before returning.
 */
char *unixline () {
    register char *cp,
                  *dp,
                  *pp;
    /* room for a completely full unixbuf plus '\n' and the terminator */
    static char unixfrom[BUFSIZ + 2];

    pp = unixfrom;
    if ((cp = dp = index (unixbuf, ' '))) {
        while ((cp = index (cp + 1, 'r')))
            if (strncmp (cp, "remote from ", 12) == 0) {
                *cp = 0;
                (void) sprintf (pp, "%s!", cp + 12);
                pp += strlen (pp);
                break;
            }
        if (cp == NULL)
            cp = unixbuf + strlen (unixbuf);
        /* compare distances so no pointer is formed in front of unixbuf */
        if (cp - dp >= 25)
            cp[-25] = 0;
    }

    (void) sprintf (pp, "%s\n", unixbuf);
    unixbuf[0] = 0;
    return unixfrom;
}
#endif /* RPATHS */
609
610 /* \f */
611
#if (vax && !lint)
asm(".align 1");
asm("_matchc: .word 0");
asm(" movq 4(ap),r0");
asm(" movq 12(ap),r2");
asm(" matchc r0,(r1),r2,(r3)");
asm(" beql 1f");
asm(" movl 4(ap),r3");
asm("1: subl3 4(ap),r3,r0");
asm(" ret");
#else
/*
 * Find the first occurrence of the "patln" bytes at "pat" within the
 * first "strln" bytes of "str".
 *
 * Returns the address of the start of the match, or 0 when there is
 * none.  A match is only reported when it lies entirely inside the
 * strln bytes, and no byte beyond them is examined.  0 is also returned
 * when patln is not positive or exceeds strln.
 */
static unsigned char *
matchc( patln, pat, strln, str )
int patln;
char *pat;
int strln;
register char *str;
{
    register char *last;        /* last position a match may start at */
    register char *sp;
    register char *pp;
    register char *ep;

    if (patln <= 0 || strln < patln)
        return 0;

    last = str + (strln - patln);
    ep = pat + patln;

    for (; str <= last; str++) {
        if (*str != *pat)
            continue;

        /* first byte matches - compare the remainder */
        sp = str + 1;
        pp = pat + 1;
        while (pp < ep && *sp == *pp) {
            sp++;
            pp++;
        }
        if (pp >= ep)
            return ((unsigned char *)str);
    }
    return 0;
}
#endif
649
650 /* \f */
651
/*
 * Locate character "term" in the next "cnt" characters of "src".
 * If found, return its address, otherwise return 0.
 * In the C version nothing is read from "src" when "cnt" is zero or
 * negative.
 */
#if (vax && !lint)
asm(".align 1");
asm("_locc: .word 0");
asm(" movq 4(ap),r0");
asm(" locc 12(ap),r0,(r1)");
asm(" beql 1f");
asm(" movl r1,r0");
asm("1: ret");
#else
static unsigned char *
locc( cnt, src, term )
register int cnt;
register unsigned char *src;
register unsigned char term;
{
    for (; cnt > 0; cnt--, src++)
        if (*src == term)
            return src;

    return (unsigned char *)0;
}
#endif
676
677 /* \f */
678
679 #if !defined (BSD42) && !defined (bcopy)
/*
 * bcmp - compare the first "length" bytes of b1 and b2.
 * Returns 0 when they are all equal (or length is not positive) and 1
 * as soon as a differing pair is found.
 */
int bcmp (b1, b2, length)
register char *b1,
              *b2;
register int length;
{
    register int i;

    for (i = 0; i < length; i++)
        if (b1[i] != b2[i])
            return 1;

    return 0;
}
691
692
/*
 * bcopy - copy "length" bytes from b1 to b2.
 *
 * The two areas may overlap: when the destination starts inside the
 * source the bytes are copied from the end backwards so that none is
 * overwritten before it has been read.  Nothing is copied when length
 * is not positive.
 *
 * The return type is spelled out as int, which is what the historical
 * definition defaulted to; the value is always 0.
 */
int
bcopy (b1, b2, length)
register char *b1,
              *b2;
register int length;
{
    if (length > 0 && b2 > b1 && b2 < b1 + length) {
        /* destination overlaps the tail of the source */
        b1 += length;
        b2 += length;
        while (length-- > 0)
            *--b2 = *--b1;
    } else {
        while (length-- > 0)
            *b2++ = *b1++;
    }
    return 0;
}
701
702
/*
 * bzero - set "length" bytes starting at b to zero.
 * Nothing is written when length is not positive.
 *
 * The return type is spelled out as int, which is what the historical
 * definition defaulted to; the value is always 0.
 */
int
bzero (b, length)
register char *b;
register int length;
{
    while (length-- > 0)
        *b++ = 0;
    return 0;
}
710 #endif /* not BSD42 */