]> diplodocus.org Git - nmh/blob - sbr/m_getfld.c
new.c: Order two return statements to match comment.
[nmh] / sbr / m_getfld.c
1 /* m_getfld.c -- read/parse a message
2 *
3 * This code is Copyright (c) 2002, by the authors of nmh. See the
4 * COPYRIGHT file in the root directory of the nmh distribution for
5 * complete copyright information.
6 */
7
8 #include <h/mh.h>
9 #include <h/mts.h>
10 #include <h/utils.h>
11 #include <inttypes.h>
12
13 /*
14 Purpose
15 =======
16 Reads an Internet message (RFC 5322), or one or more messages
17 stored in a maildrop in mbox (RFC 4155) or MMDF format, from a file
18 stream. Each call to m_getfld() reads one header field, or a
19 portion of the body, in sequence.
20
21 Inputs
22 ======
23 gstate: opaque parse state
24 bufsz: maximum number of characters to load into buf
25 iob: input file stream
26
27 Outputs
28 =======
29 name: header field name (array of size NAMESZ=999)
30 buf: either a header field body or message body
31 bufsz: number of characters loaded into buf
32 (return value): message parse state on return from function
33
34 Functions
35 =========
36 void m_getfld_state_destroy (m_getfld_state_t *gstate): destroys
37 the parse state pointed to by the gstate argument.
38
39 m_getfld_state_reset (m_getfld_state_t *gstate): resets the parse
40 state to FLD.
41
42 void m_unknown(FILE *iob): Determines the message delimiter string
43 for the maildrop. Called by inc and scan when reading from a
44 maildrop file.
45
46 State variables
47 ===============
48 m_getfld() retains state internally between calls in the
49 m_getfld_state_t variable. These are used for detecting the end of
50 each message when reading maildrops:
51
52 char **pat_map
53 char *fdelim
54 char *delimend
55 int fdelimlen
56 char *edelim
57 int edelimlen
58 char *msg_delim
59 int msg_style
60
61 Usage
62 =====
63 m_getfld_state_t gstate;
64
65 gstate = m_getfld_state_init(mailfp);
66 Perhaps m_getfld_track_filepos2(&gstate);
67 ...
68 state = m_getfld2(&gstate, ...);
69 ...Repeat until finished with mailfp.
70 m_getfld_state_destroy (&gstate);
71
72 The state is retained internally by gstate. To reset its state to FLD:
73 m_getfld_state_reset (&gstate);
74 */
75
76 /* The following describes the old implementation. The high-level
77 structure hasn't changed, but some of the details have. I'm
78 leaving this as-is, though, for posterity.
79 */
80
81 /* This module has a long and checkered history. First, it didn't burst
82 maildrops correctly because it considered two CTRL-A:s in a row to be
83 an inter-message delimiter. It really is four CTRL-A:s followed by a
84 newline. Unfortunately, MMDF will convert this delimiter *inside* a
85 message to a CTRL-B followed by three CTRL-A:s and a newline. This
86 caused the old version of m_getfld() to declare eom prematurely. The
87 fix was a lot slower than
88
89 c == '\001' && peekc (iob) == '\001'
90
91 but it worked, and to increase generality, MBOX style maildrops could
92 be parsed as well. Unfortunately the speed issue finally caught up with
93 us since this routine is at the very heart of MH.
94
95 To speed things up considerably, the routine Eom() was made an auxiliary
96 function called by the macro eom(). Unless we are bursting a maildrop,
97 the eom() macro returns false saying we aren't at the end of the
98 message.
99
100 The next thing to do is to read the mts.conf file and initialize
101 delimiter[] and delimlen accordingly...
102
103 After mhl was made a built-in in msh, m_getfld() worked just fine
104 (using m_unknown() at startup). Until one day: a message which was
105 the result of a bursting was shown. Then, since the burst boundaries
106 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
107 Very sad. The solution: introduce m_eomsbr(). This hook gets called
108 after the end of each line (since testing for eom involves an fseek()).
109 This worked fine, until one day: a message with no body portion arrived.
110 Then the
111
112 while (eom (c = Getc (iob), iob))
113 continue;
114
115 loop caused m_getfld() to return FMTERR. So, that logic was changed to
116 check for (*eom_action) and act accordingly.
117
118 This worked fine, until one day: someone didn't use four CTRL:A's as
119 their delimiters. So, the bullet got bit and we read mts.h and
120 continue to struggle on. It's not that bad though, since the only time
121 the code gets executed is when inc (or msh) calls it, and both of these
122 have already called mts_init().
123
124 ------------------------
125 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
126
127 This routine was accounting for 60% of the cpu time used by most mh
128 programs. I spent a bit of time tuning and it now accounts for <10%
129 of the time used. Like any heavily tuned routine, it's a bit
130 complex and you want to be sure you understand everything that it's
131 doing before you start hacking on it. Let me try to emphasize
132 that: every line in this atrocity depends on every other line,
133 sometimes in subtle ways. You should understand it all, in detail,
134 before trying to change any part. If you do change it, test the
135 result thoroughly (I use a hand-constructed test file that exercises
136 all the ways a header name, header body, header continuation,
137 header-body separator, body line and body eom can align themselves
138 with respect to a buffer boundary). "Minor" bugs in this routine
139 result in garbaged or lost mail.
140
141 If you hack on this and slow it down, I, my children and my
142 children's children will curse you.
143
144 This routine gets used on three different types of files: normal,
145 single msg files, "packed" unix or mmdf mailboxes (when used by inc)
146 and packed, directoried bulletin board files (when used by msh).
147 The biggest impact of different file types is in "eom" testing. The
148 code has been carefully organized to test for eom at appropriate
149 times and at no other times (since the check is quite expensive).
150 I have tried to arrange things so that the eom check need only be
151 done on entry to this routine. Since an eom can only occur after a
152 newline, this is easy to manage for header fields. For the msg
153 body, we try to efficiently search the input buffer to see if it
154 contains the eom delimiter. If it does, we take up to the
155 delimiter, otherwise we take everything in the buffer. (The change
156 to the body eom/copy processing produced the most noticeable
157 performance difference, particularly for "inc" and "show".)
158
159 There are three qualitatively different things this routine busts
160 out of a message: field names, field text and msg bodies. Field
161 names are typically short (~8 char) and the loop that extracts them
162 might terminate on a colon, newline or max width. I considered
163 using a Vax "scanc" to locate the end of the field followed by a
164 "memmove" but the routine call overhead on a Vax is too large for this
165 to work on short names. If Berkeley ever makes "inline" part of the
166 C optimiser (so things like "scanc" turn into inline instructions) a
167 change here would be worthwhile.
168
169 Field text is typically 60 - 100 characters so there's (barely)
170 a win in doing a routine call to something that does a "locc"
171 followed by a "bmove". About 30% of the fields have continuations
172 (usually the 822 "received:" lines) and each continuation generates
173 another routine call. "Inline" would be a big win here, as well.
174
175 Messages, as of this writing, seem to come in two flavors: small
176 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
177 so message bodies average at least a few hundred characters.
178 Assuming your system uses reasonably sized stdio buffers (1K or
179 more), this routine should be able to remove the body in large
180 (>500 byte) chunks. This makes the cost of a call to "memmove"
181 small but there is a premium on checking for the eom in packed
182 maildrops. The eom pattern is always a simple string so we can
183 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
184 instruction). Some thought went into recognizing the start of
185 an eom that has been split across two buffers.
186
187 This routine wants to deal with large chunks of data so, rather
188 than "getc" into a local buffer, it uses stdio's buffer. If
189 you try to use it on a non-buffered file, you'll get what you
190 deserve. This routine "knows" that struct FILEs have a _ptr
191 and a _cnt to describe the current state of the buffer and
192 it knows that _filbuf ignores the _ptr & _cnt and simply fills
193 the buffer. If stdio on your system doesn't work this way, you
194 may have to make small changes in this routine.
195
196 This routine also "knows" that an EOF indication on a stream is
197 "sticky" (i.e., you will keep getting EOF until you reposition the
198 stream). If your system doesn't work this way it is broken and you
199 should complain to the vendor. As a consequence of the sticky
200 EOF, this routine will never return any kind of EOF status when
201 there is data in "name" or "buf").
202 */
203
204 /*
205 * static prototypes
206 */
207 static void Ungetc(m_getfld_state_t s);
208 static int m_Eom (m_getfld_state_t);
209
210 #define eom(c,s) (s->msg_style != MS_DEFAULT && \
211 ((c) == *s->msg_delim && m_Eom(s)))
212
213 /*
214 * Maildrop styles
215 */
216 #define MS_DEFAULT 0 /* default (one msg per file) */
217 #define MS_UNKNOWN 1 /* type not known yet */
218 #define MS_MBOX 2 /* Unix-style "from" lines */
219 #define MS_MMDF 3 /* string MMDF_DELIM */
220
221 /* This replaces the old approach, with its direct access to stdio
222 * internals. It uses one fread() to load a buffer that we manage.
223 *
224 * MSG_INPUT_SIZE is the size of the buffer.
225 * MAX_DELIMITER_SIZE is the maximum size of the delimiter used to
226 * separate messages in a maildrop, such as mbox "From ".
227 *
228 * Some of the tests in the test suite assume a MSG_INPUT_SIZE
229 * of 8192.
230 */
231 #define MSG_INPUT_SIZE NMH_BUFSIZ
232 #define MAX_DELIMITER_SIZE 5
233
234 struct m_getfld_state {
235 /* The file to read from; I/O block. Caller keeps passing it after
236 * initialisation due to historic interface so it keeps getting
237 * updated, presumably to the same value. */
238 FILE *iob;
239
240 /* Holds content of iob. */
241 char msg_buf[2 * MSG_INPUT_SIZE + MAX_DELIMITER_SIZE];
242 /* Points to the next byte to read from msg_buf. */
243 char *readpos;
244 /* Points to just after the last valid byte in msg_buf. If readpos
245 * equals end then msg_buf is empty. */
246 char *end;
247
248 /* Whether the caller intends to ftell(3)/fseek(3) iob's position,
249 * and thus whether m_getfld() needs to detect that and compensate. */
250 int track_filepos;
251 /* Position in iob given what's been consumed ready for returning to
252 * the caller. Further than this may have been read into msg_buf. */
253 off_t total_bytes_read;
254 /* Bytes of iob consumed during this call. */
255 off_t bytes_read;
256 /* What fseeko(3) tells us iob's position is having just explicitly
257 * set it to total_bytes_read. Surely always the same? */
258 off_t last_caller_pos;
259 /* Saved position in iob from filling msg_buf, prior to returning. */
260 off_t last_internal_pos;
261
262 /* One of the MS_* macros tracking the type of iob's content and
263 * thus if it's a single email, or several with delimeters. Default
264 * is MS_DEFAULT. */
265 int msg_style;
266
267 /* The message delimeter if iob has multiple emails, else NULL. For
268 * MS_MBOX it's the string that separates two emails, "\nFrom ",
269 * i.e. the terminating blank line of the previous email, and the
270 * starting From_ line of the next, but for MS_MMDF it's
271 * "\001\001\001\001\n" that may start or terminate an email. */
272 char *msg_delim;
273 /* The last non-NUL char of msg_delim. */
274 char *delimend;
275 /* When searching for msg_delim after an email, it's only of
276 * interest at the start of the line, i.e. when preceded by a
277 * linefeed. fdelim points to msg_delim[-1] that contains '\n' so
278 * it can be used as the needle. */
279 char *fdelim;
280 /* strlen(fdelim). */
281 int fdelimlen;
282 /* The second char of msg_delim. Used when the first char has
283 * already been matched to test the rest. */
284 char *edelim;
285 /* strlen(edelim). */
286 int edelimlen;
287 /* The relationship between all of these pointers and lengths for
288 * the two possible msg_delim values.
289 *
290 * "\0\n\nFrom \0" 9 "\0\n\001\001\001\001\n\0" 8
291 * | || | | | | |
292 * | || s->delimend | | | s->delimend
293 * | || | | |
294 * | |s->edelim s->edelimlen=5 | | s->edelim s->edelimlen=4
295 * | | | |
296 * | s->msg_delim | s->msg_delim
297 * | |
298 * s->fdelim s->fdelimlen=7 s->fdelim s->fdelimlen=6
299 */
300
301 /* Maps all the bytes of msg_delim, apart from the last two,
302 * including the NUL, onto the last position in msg_delim where they
303 * occur. Bytes not present are NULL. */
304 char **pat_map;
305
306 /* The parser's current state. Also returned to the caller, amongst
307 * other possible values, to indicate the token consumed. One of
308 * FLD, FLDPLUS, BODY, or FILEEOF. */
309 int state;
310 };
311
312 m_getfld_state_t m_getfld_state_init(FILE *iob)
313 {
314 m_getfld_state_t s;
315
316 NEW(s);
317 s->readpos = s->end = s->msg_buf;
318 s->bytes_read = s->total_bytes_read = 0;
319 s->last_caller_pos = s->last_internal_pos = 0;
320 s->iob = iob;
321 s->pat_map = NULL;
322 s->msg_style = MS_DEFAULT;
323 s->msg_delim = "";
324 s->fdelim = s->delimend = s->edelim = NULL;
325 s->fdelimlen = s->edelimlen = 0;
326 s->state = FLD;
327 s->track_filepos = 0;
328
329 return s;
330 }
331
332 /* scan() needs to force an initial state of FLD for each message. */
333 void
334 m_getfld_state_reset (m_getfld_state_t *gstate) {
335 if (*gstate) {
336 (*gstate)->state = FLD;
337 }
338 }
339
340 /* If the caller interleaves ftell*()/fseek*() calls with m_getfld()
341 calls, m_getfld() must keep track of the file position. The caller
342 must use this function to inform m_getfld(). */
343 void
344 m_getfld_track_filepos (m_getfld_state_t *gstate, FILE *iob) {
345 if (! *gstate) {
346 *gstate = m_getfld_state_init(iob);
347 }
348
349 (*gstate)->track_filepos = 1;
350 }
351
352 /* m_getfld_track_filepos() with the existing iob. */
353 void m_getfld_track_filepos2(m_getfld_state_t *gstate)
354 {
355 if (!*gstate)
356 adios(NULL, "m_getfld_track_filepos2 without gstate");
357
358 m_getfld_track_filepos(gstate, (*gstate)->iob);
359 }
360
361 void m_getfld_state_destroy (m_getfld_state_t *gstate) {
362 m_getfld_state_t s = *gstate;
363
364 if (s) {
365 if (s->fdelim) {
366 free (s->fdelim-1);
367 free (s->pat_map);
368 }
369 free (s);
370 *gstate = 0;
371 }
372 }
373
374 /*
375 Summary of file and message input buffer positions:
376
377 input file -------------------------------------------EOF
378 | |
379 last_caller_pos last_internal_pos
380
381
382 msg_buf --------------------EOF
383 | | |
384 msg_buf readpos end
385
386 |<>|=retained characters, difference
387 between last_internal_pos and
388 first readpos value after reading
389 in new chunk in read_more()
390
391 When returning from m_getfld()/m_unknown():
392 1) Save the internal file position in last_internal_pos. That's the
393 m_getfld() position reference in the input file.
394 2) Set file stream position so that callers can use ftell().
395
396 When entering m_getfld()/m_unknown():
397 Check to see if the caller had changed the file position. If so,
398 adjust the internal position reference accordingly. If not, restore
399 the internal file position from last_internal_pos.
400 */
401
402
403 static void
404 enter_getfld (m_getfld_state_t *gstate, FILE *iob) {
405 m_getfld_state_t s;
406 off_t pos;
407 off_t pos_movement;
408
409 if (! *gstate) {
410 *gstate = m_getfld_state_init(iob);
411 }
412 s = *gstate;
413 s->bytes_read = 0;
414
415 /* This is ugly and no longer necessary, but is retained just in
416 case it's needed again. The parser used to open the input file
417 multiple times, so we had to always use the FILE * that's
418 passed to m_getfld(). Now the parser inits a new
419 m_getfld_state for each file. See comment below about the
420 readpos shift code being currently unused. */
421 s->iob = iob;
422
423 if (!s->track_filepos)
424 return;
425
426 if ((pos = ftello(iob)) == -1)
427 adios("getfld's iob", "failed to get offset on entry");
428 if (pos == 0 && s->last_internal_pos == 0)
429 return;
430
431 if (s->last_internal_pos == 0) {
432 s->total_bytes_read = pos;
433 return;
434 }
435
436 pos_movement = pos - s->last_caller_pos; /* Can be < 0. */
437 if (pos_movement == 0) {
438 pos = s->last_internal_pos;
439 } else {
440 /* The current file stream position differs from the
441 last one, so caller must have called ftell/o().
442 Or, this is the first call and the file position
443 was not at 0. */
444
445 if (s->readpos + pos_movement >= s->msg_buf &&
446 s->readpos + pos_movement < s->end) {
447 /* This is currently unused. It could be used by
448 parse_mime() if it was changed to use a global
449 m_getfld_state. */
450 /* We can shift readpos and remain within the
451 bounds of msg_buf. */
452 s->readpos += pos_movement;
453 s->total_bytes_read += pos_movement;
454 pos = s->last_internal_pos;
455 } else {
456 off_t off;
457 size_t num_read;
458
459 /* This seek skips past an integral number of
460 chunks of size MSG_INPUT_SIZE. */
461 off = pos / MSG_INPUT_SIZE * MSG_INPUT_SIZE;
462 if (fseeko(iob, off, SEEK_SET) == -1)
463 adios("getfld's iob", "failed to set offset to skip: "
464 "%" PRIdMAX, (intmax_t)off);
465 num_read = fread (s->msg_buf, 1, MSG_INPUT_SIZE, iob);
466 s->readpos = s->msg_buf + pos % MSG_INPUT_SIZE;
467 s->end = s->msg_buf + num_read;
468 s->total_bytes_read = pos;
469 }
470 }
471
472 if (fseeko(iob, pos, SEEK_SET) == -1)
473 adios("getfld's iob", "failed to set offset on entry: %" PRIdMAX,
474 (intmax_t)pos);
475 }
476
477 static void
478 leave_getfld (m_getfld_state_t s) {
479 s->total_bytes_read += s->bytes_read;
480
481 if (s->track_filepos) {
482 /* Save the internal file position that we use for the input buffer. */
483 if ((s->last_internal_pos = ftello(s->iob)) == -1)
484 adios("getfld's iob", "failed to get offset before seek");
485
486 /* Set file stream position so that callers can use ftell(). */
487 if (fseeko(s->iob, s->total_bytes_read, SEEK_SET) == -1)
488 adios("getfld's iob", "failed to set offset: %" PRIdMAX,
489 (intmax_t)s->total_bytes_read);
490
491 s->last_caller_pos = s->total_bytes_read;
492 }
493 }
494
495 static size_t
496 read_more (m_getfld_state_t s) {
497 /* Retain at least edelimlen characters that have already been read,
498 if at least edelimlen have been read, so that we can back up to them
499 in m_Eom(). */
500 ssize_t retain = s->end - s->msg_buf < s->edelimlen ? 0 : s->edelimlen;
501 size_t num_read;
502
503 if (retain > 0) {
504 if (retain < s->end - s->readpos)
505 retain = s->end - s->readpos;
506 assert (retain <= s->readpos - s->msg_buf);
507
508 /* Move what we want to retain at end of the buffer to the beginning. */
509 memmove (s->msg_buf, s->readpos - retain, retain);
510 }
511
512 s->readpos = s->msg_buf + retain;
513 num_read = fread (s->readpos, 1, MSG_INPUT_SIZE, s->iob);
514 s->end = s->readpos + num_read;
515
516 return num_read;
517 }
518
519 /* Return the next character consumed from the input, fetching more of
520 * the input for the buffer if required, or EOF on end of file. */
521 static int
522 Getc (m_getfld_state_t s) {
523 if ((s->end - s->readpos < 1 && read_more (s) == 0) ||
524 s->readpos >= s->end)
525 return EOF;
526
527 s->bytes_read++;
528 return (unsigned char)*s->readpos++;
529 }
530
531 /* Return the next character that Getc() would return, which may be EOF. */
532 static int
533 Peek (m_getfld_state_t s)
534 {
535 int c;
536
537 c = Getc(s);
538 if (c != EOF)
539 Ungetc(s);
540
541 return c;
542 }
543
544 /* If there's room, undo the consumption of one character from msg_buf,
545 * rewinding so it's read next, else die. */
546 static void
547 Ungetc(m_getfld_state_t s)
548 {
549 if (s->readpos == s->msg_buf)
550 adios(NULL, "Ungetc() at start of message buffer.");
551
552 s->readpos--;
553 s->bytes_read--;
554 }
555
556
557 int
558 m_getfld (m_getfld_state_t *gstate, char name[NAMESZ], char *buf, int *bufsz,
559 FILE *iob)
560 {
561 m_getfld_state_t s;
562 char *cp;
563 int max, n, c;
564
565 enter_getfld (gstate, iob);
566 s = *gstate;
567
568 if ((c = Getc(s)) == EOF) {
569 *bufsz = *buf = 0;
570 leave_getfld (s);
571 return s->state = FILEEOF;
572 }
573 if (eom (c, s)) {
574 /* flush null messages */
575 while ((c = Getc(s)) != EOF && eom (c, s))
576 ;
577
578 if (c != EOF)
579 Ungetc(s);
580 *bufsz = *buf = 0;
581 leave_getfld (s);
582 return s->state = FILEEOF;
583 }
584
585 switch (s->state) {
586 case FLD:
587 if (c == '\n' || c == '-') {
588 /* we hit the header/body separator */
589 while (c != '\n' && (c = Getc(s)) != EOF)
590 ;
591
592 if (c == EOF || (c = Getc(s)) == EOF || eom (c, s)) {
593 /* flush null messages */
594 while ((c = Getc(s)) != EOF && eom (c, s))
595 ;
596 if (c != EOF)
597 Ungetc(s);
598 *bufsz = *buf = 0;
599 leave_getfld (s);
600 return s->state = FILEEOF;
601 }
602 s->state = BODY;
603 goto body;
604 }
605 /*
606 * get the name of this component. take characters up
607 * to a ':', a newline or NAMESZ-1 characters, whichever
608 * comes first.
609 */
610 cp = name;
611 max = NAMESZ - 1;
612 /* Get the field name. The first time through the loop,
613 this copies out the first character, which was loaded
614 into c prior to loop entry. Initialize n to 1 to
615 account for that. */
616 for (n = 1;
617 c != ':' && c != '\n' && c != EOF && n < max;
618 ++n, c = Getc (s)) {
619 *cp++ = c;
620 }
621
622 /* Check for next character, which is either the space after
623 the ':' or the first folded whitespace. */
624 {
625 int next_char;
626 if (c == EOF || (next_char = Peek (s)) == EOF) {
627 *bufsz = *cp = *buf = 0;
628 inform("eof encountered in field \"%s\"", name);
629 leave_getfld (s);
630 return s->state = FMTERR;
631 }
632 }
633
634 /* If c isn't ':' here, something went wrong. Possibilities are:
635 * . hit a newline (error)
636 * . got more than namesz chars. (error)
637 */
638 if (c == ':') {
639 /* Finished header name, fall through to FLDPLUS below. */
640 } else if (c == '\n') {
641 /* We hit the end of the line without seeing ':' to
642 * terminate the field name. This is usually (always?)
643 * spam. But, blowing up is lame, especially when
644 * scan(1)ing a folder with such messages. Pretend such
645 * lines are the first of the body (at least mutt also
646 * handles it this way). */
647
648 /* See if buf can hold this line, since we were assuming
649 * we had a buffer of NAMESZ, not bufsz. */
650 /* + 1 for the newline */
651 if (*bufsz < n + 1) {
652 /* No, it can't. Oh well, guess we'll blow up. */
653 *bufsz = *cp = *buf = 0;
654 inform("eol encountered in field \"%s\"", name);
655 s->state = FMTERR;
656 break;
657 }
658 memcpy (buf, name, n - 1);
659 buf[n - 1] = '\n';
660 buf[n] = '\0';
661 /* Indicate this wasn't a header field using a character
662 that can't appear in a header field. */
663 name[0] = ':';
664 /* The last character read was '\n'. s->bytes_read
665 (and n) include that, but it was not put into the
666 name array in the for loop above. So subtract 1. */
667 *bufsz = --s->bytes_read; /* == n - 1 */
668 leave_getfld (s);
669 return s->state = BODY;
670 }
671 if (max <= n) {
672 /* By design, the loop above discards the last character
673 it had read. It's in c, use it. */
674 *cp++ = c;
675 *bufsz = *cp = *buf = 0;
676 inform("field name \"%s\" exceeds %d bytes", name,
677 NAMESZ - 2);
678 s->state = LENERR;
679 break;
680 }
681
682 /* Trim any trailing spaces from the end of name. */
683 while (isspace ((unsigned char) *--cp) && cp >= name) continue;
684 *++cp = 0;
685 /* readpos points to the first character of the field body. */
686 /* FALLTHRU */
687
688 case FLDPLUS: {
689 /*
690 * get (more of) the text of a field. Take
691 * characters up to the end of this field (newline
692 * followed by non-blank) or bufsz-1 characters.
693 */
694 int finished;
695
696 cp = buf;
697 max = *bufsz-1;
698 n = 0;
699 for (finished = 0; ! finished; ) {
700 while (c != '\n' && c != EOF && n++ < max) {
701 if ((c = Getc (s)) != EOF)
702 *cp++ = c;
703 }
704
705 if (c != EOF)
706 c = Peek (s);
707 if (max < n) {
708 /* The dest buffer is full. Need to back the read
709 pointer up by one because when m_getfld() is
710 reentered, it will read a character. Then
711 we'll jump right to the FLDPLUS handling code,
712 which will not store that character, but
713 instead move on to the next one. */
714 if (s->readpos > s->msg_buf) {
715 --s->readpos;
716 --s->bytes_read;
717 }
718 s->state = FLDPLUS;
719 finished = 1;
720 } else if (c != ' ' && c != '\t') {
721 /* The next character is not folded whitespace, so
722 prepare to move on to the next field. It's OK
723 if c is EOF, it will be handled on the next
724 call to m_getfld (). */
725 s->state = FLD;
726 finished = 1;
727 } else {
728 /* Folded header field, continues on the next line. */
729 }
730 }
731 *bufsz = s->bytes_read;
732 break;
733 }
734
735 body:
736 case BODY: {
737 /*
738 * get the message body up to bufsz characters or the
739 * end of the message.
740 */
741 char *bp;
742
743 name[0] = '\0';
744 max = *bufsz-1;
745 /* Back up and store the current position. */
746 bp = --s->readpos;
747 c = min(s->end - s->readpos, max);
748 if (s->msg_style != MS_DEFAULT && c > 1) {
749 /*
750 * packed maildrop - only take up to the (possible)
751 * start of the next message. This "matchc" should
752 * probably be a Boyer-Moore matcher for non-vaxen,
753 * particularly since we have the alignment table
754 * all built for the end-of-buffer test (next).
755 * But our vax timings indicate that the "matchc"
756 * instruction is 50% faster than a carefully coded
757 * B.M. matcher for most strings. (So much for elegant
758 * algorithms vs. brute force.) Since I (currently)
759 * run MH on a vax, we use the matchc instruction. --vj
760 */
761 char *ep;
762
763 if ((ep = memmem(bp, c, s->fdelim, s->fdelimlen)))
764 /* Plus one to nab the '\n' that starts fdelim as
765 * that ends the previous line; it isn't part of
766 * msg_delim. */
767 c = ep - bp + 1;
768 else {
769 /*
770 * There's no delim in the buffer but there may be
771 * a partial one at the end. If so, we want to leave
772 * it so the "eom" check on the next call picks it up.
773 * Use a modified Boyer-Moore matcher to make this
774 * check relatively cheap. The first "if" figures
775 * out what position in the pattern matches the last
776 * character in the buffer. The inner "while" matches
777 * the pattern against the buffer, backwards starting
778 * at that position. Note that unless the buffer
779 * ends with one of the characters in the pattern
780 * (excluding the first and last), we do only one test.
781 */
782 char *sp;
783
784 ep = bp + c - 1; /* The last byte. */
785 if ((sp = s->pat_map[(unsigned char) *ep])) {
786 do {
787 /* This if() is true unless (a) the buffer is too
788 * small to contain this delimiter prefix, or
789 * (b) it contains exactly enough chars for the
790 * delimiter prefix.
791 * For case (a) obviously we aren't going to match.
792 * For case (b), if the buffer really contained exactly
793 * a delim prefix, then the m_eom call at entry
794 * should have found it. Thus it's not a delim
795 * and we know we won't get a match.
796 */
797 if (((sp - s->fdelim) + 2) <= c) {
798 cp = sp;
799 /* Unfortunately although fdelim has a preceding NUL
800 * we can't use this as a sentinel in case the buffer
801 * contains a NUL in exactly the wrong place (this
802 * would cause us to run off the front of fdelim).
803 */
804 while (*--ep == *--cp)
805 if (cp < s->fdelim)
806 break;
807 if (cp < s->fdelim) {
808 /* we matched the entire delim prefix,
809 * so only take the buffer up to there.
810 * we know ep >= bp -- check above prevents underrun
811 */
812 c = (ep - bp) + 2;
813 break;
814 }
815 }
816 /* try matching one less char of delim string */
817 ep = bp + c - 1;
818 } while (--sp > s->fdelim);
819 }
820 }
821 }
822 memcpy( buf, bp, c );
823 /* Advance the current position to reflect the copy out.
824 c is less than or equal to the number of bytes remaining
825 in the read buffer, so will not overrun it. */
826 s->readpos += c;
827 cp = buf + c;
828 /* Subtract 1 from c because the first character was read by
829 Getc(), and therefore already accounted for in s->bytes_read. */
830 s->bytes_read += c - 1;
831 *bufsz = s->bytes_read;
832 break;
833 }
834
835 default:
836 adios (NULL, "m_getfld() called with bogus state of %d", s->state);
837 }
838
839 *cp = 0;
840 leave_getfld (s);
841
842 return s->state;
843 }
844
845
846 /* m_getfld() with the existing iob. */
847 int m_getfld2(m_getfld_state_t *gstate, char name[NAMESZ], char *buf, int *bufsz)
848 {
849 if (!*gstate)
850 adios(NULL, "m_getfld2 without gstate");
851
852 return m_getfld(gstate, name, buf, bufsz, (*gstate)->iob);
853 }
854
855
856 void
857 m_unknown(m_getfld_state_t *gstate, FILE *iob)
858 {
859 m_getfld_state_t s;
860 int c;
861 char text[MAX_DELIMITER_SIZE];
862 char from[] = "From ";
863 char *cp;
864 char *delimstr;
865 unsigned int i;
866
867 enter_getfld (gstate, iob);
868 s = *gstate;
869
870 /*
871 * Figure out what the message delimiter string is for this
872 * maildrop. (This used to be part of m_Eom but I didn't like
873 * the idea of an "if" statement that could only succeed on the
874 * first call to m_Eom getting executed on each call, i.e., at
875 * every newline in the message).
876 *
877 * If the first line of the maildrop is a Unix "From " line, we
878 * say the style is MBOX and eat the rest of the line. Otherwise
879 * we say the style is MMDF and look for the delimiter string
880 * specified when nmh was built (or from the mts.conf file).
881 */
882
883 s->msg_style = MS_UNKNOWN;
884
885 for (i = 0, cp = text; i < sizeof text; ++i, ++cp) {
886 if ((c = Getc (s)) == EOF) {
887 *cp = '\0';
888 break;
889 }
890 *cp = c;
891 }
892
893 if (i == sizeof from-1 && strncmp (text, "From ", sizeof from-1) == 0) {
894 s->msg_style = MS_MBOX;
895 delimstr = "\nFrom ";
896 while ((c = Getc(s)) != EOF && c != '\n')
897 ;
898 } else {
899 /* not a Unix style maildrop */
900 s->readpos -= s->bytes_read;
901 s->bytes_read = 0;
902 delimstr = MMDF_DELIM;
903 s->msg_style = MS_MMDF;
904 }
905
906 /* "\nFrom \0" 7 "\001\001\001\001\n\0" 6
907 * | |
908 * delimstr c=6 delimstr c=5
909 */
910 c = strlen (delimstr);
911 s->fdelim = mh_xmalloc (c + 3); /* \0, \n, delimstr, \0 */
912 *s->fdelim++ = '\0';
913 *s->fdelim = '\n';
914 s->fdelimlen = c + 1;
915 s->msg_delim = s->fdelim+1;
916 strcpy (s->msg_delim, delimstr);
917 s->edelim = s->msg_delim+1;
918 s->edelimlen = c - 1;
919 s->delimend = s->msg_delim + s->edelimlen;
920 if (s->edelimlen <= 1)
921 adios (NULL, "maildrop delimiter must be at least 2 bytes");
922
923 /*
924 * build a Boyer-Moore end-position map for the matcher in m_getfld.
925 * N.B. - we don't match just the first char (since it's the newline
926 * separator) or the last char (since the matchc would have found it
927 * if it was a real delim).
928 */
929 s->pat_map = (char **) mh_xcalloc (256, sizeof(char *));
930
931 for (cp = s->fdelim + 1; cp < s->delimend; cp++ )
932 s->pat_map[(unsigned char)*cp] = cp;
933
934 if (s->msg_style == MS_MMDF) {
935 /* flush extra msg hdrs */
936 while ((c = Getc(s)) != EOF && eom (c, s))
937 ;
938 if (c != EOF)
939 Ungetc(s);
940 }
941
942 leave_getfld (s);
943 }
944
945
946 /* m_unknown() with the existing iob. */
947 void m_unknown2(m_getfld_state_t *gstate)
948 {
949 if (!*gstate)
950 adios(NULL, "m_unknown2 without gstate");
951
952 m_unknown(gstate, (*gstate)->iob);
953 }
954
955
956 /*
957 * test for msg delimiter string
958 */
959
960 static int
961 m_Eom (m_getfld_state_t s)
962 {
963 int i;
964 char text[MAX_DELIMITER_SIZE];
965 char *cp;
966 int adjust = 1;
967
968 for (i = 0, cp = text; i < s->edelimlen; ++i, ++cp) {
969 int c2;
970
971 if ((c2 = Getc (s)) == EOF) {
972 *cp = '\0';
973 break;
974 }
975 *cp = c2;
976 }
977
978 if (i != s->edelimlen ||
979 strncmp (text, (char *)s->edelim, s->edelimlen)) {
980 if (i == 0 && s->msg_style == MS_MBOX) {
981 /* the final newline in the (brain damaged) unix-format
982 * maildrop is part of the delimiter - delete it.
983 */
984 return 1;
985 }
986
987 if (i <= 2 && s->msg_style == MS_MBOX &&
988 i != s->edelimlen && ! strncmp(text, s->fdelim, i)) {
989 /* If all or part of fdelim appeared at the end of the file,
990 back up even more so that the bytes are included in the
991 message. */
992 adjust = 2;
993 }
994
995 /* Did not find delimiter, so restore the read position.
996 Note that on input, a character had already been read
997 with Getc(). It will be unget by m_getfld () on return. */
998 s->readpos -= s->bytes_read - adjust;
999 s->bytes_read = adjust;
1000 return 0;
1001 }
1002
1003 if (s->msg_style == MS_MBOX) {
1004 int c;
1005 while ((c = Getc(s)) != EOF && c != '\n')
1006 ;
1007 }
1008
1009 return 1;
1010 }