]> diplodocus.org Git - nmh/blob - sbr/m_getfld.c
Various IMAP protocol improvements
[nmh] / sbr / m_getfld.c
1 /* m_getfld.c -- read/parse a message
2 *
3 * This code is Copyright (c) 2002, by the authors of nmh. See the
4 * COPYRIGHT file in the root directory of the nmh distribution for
5 * complete copyright information.
6 */
7
8 #include <h/mh.h>
9 #include <h/mts.h>
10 #include <h/utils.h>
11 #include <inttypes.h>
12
13 /*
14 Purpose
15 =======
16 Reads an Internet message (RFC 5322), or one or more messages
17 stored in a maildrop in mbox (RFC 4155) or MMDF format, from a file
18 stream. Each call to m_getfld() reads one header field, or a
19 portion of the body, in sequence.
20
21 Inputs
22 ======
23 gstate: opaque parse state
24 bufsz: maximum number of characters to load into buf
25 iob: input file stream
26
27 Outputs
28 =======
29 name: header field name (array of size NAMESZ=999)
30 buf: either a header field body or message body
31 bufsz: number of characters loaded into buf
32 (return value): message parse state on return from function
33
34 Functions
35 =========
36 void m_getfld_state_destroy (m_getfld_state_t *gstate): destroys
37 the parse state pointed to by the gstate argument.
38
39 void m_getfld_state_reset (m_getfld_state_t *gstate): resets the parse
40 state to FLD.
41
42 void m_unknown(m_getfld_state_t *gstate, FILE *iob): Determines the message delimiter string
43 for the maildrop. Called by inc and scan when reading from a
44 maildrop file.
45
46 State variables
47 ===============
48 m_getfld() retains state internally between calls in the
49 m_getfld_state_t variable. These are used for detecting the end of
50 each message when reading maildrops:
51
52 char **pat_map
53 char *fdelim
54 char *delimend
55 int fdelimlen
56 char *edelim
57 int edelimlen
58 char *msg_delim
59 int msg_style
60
61 Usage
62 =====
63 m_getfld_state_t gstate;
64
65 gstate = m_getfld_state_init(mailfp);
66 Perhaps m_getfld_track_filepos2(&gstate);
67 ...
68 state = m_getfld2(&gstate, ...);
69 ...Repeat until finished with mailfp.
70 m_getfld_state_destroy (&gstate);
71
72 The state is retained internally by gstate. To reset its state to FLD:
73 m_getfld_state_reset (&gstate);
74 */
75
76 /* The following describes the old implementation. The high-level
77 structure hasn't changed, but some of the details have. I'm
78 leaving this as-is, though, for posterity.
79 */
80
81 /* This module has a long and checkered history. First, it didn't burst
82 maildrops correctly because it considered two CTRL-A:s in a row to be
83 an inter-message delimiter. It really is four CTRL-A:s followed by a
84 newline. Unfortunately, MMDF will convert this delimiter *inside* a
85 message to a CTRL-B followed by three CTRL-A:s and a newline. This
86 caused the old version of m_getfld() to declare eom prematurely. The
87 fix was a lot slower than
88
89 c == '\001' && peekc (iob) == '\001'
90
91 but it worked, and to increase generality, MBOX style maildrops could
92 be parsed as well. Unfortunately the speed issue finally caught up with
93 us since this routine is at the very heart of MH.
94
95 To speed things up considerably, the routine Eom() was made an auxiliary
96 function called by the macro eom(). Unless we are bursting a maildrop,
97 the eom() macro returns false saying we aren't at the end of the
98 message.
99
100 The next thing to do is to read the mts.conf file and initialize
101 delimiter[] and delimlen accordingly...
102
103 After mhl was made a built-in in msh, m_getfld() worked just fine
104 (using m_unknown() at startup). Until one day: a message which was
105 the result of a bursting was shown. Then, since the burst boundaries
106 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
107 Very sad. The solution: introduce m_eomsbr(). This hook gets called
108 after the end of each line (since testing for eom involves an fseek()).
109 This worked fine, until one day: a message with no body portion arrived.
110 Then the
111
112 while (eom (c = Getc (iob), iob))
113 continue;
114
115 loop caused m_getfld() to return FMTERR. So, that logic was changed to
116 check for (*eom_action) and act accordingly.
117
118 This worked fine, until one day: someone didn't use four CTRL:A's as
119 their delimiters. So, the bullet got bit and we read mts.h and
120 continue to struggle on. It's not that bad though, since the only time
121 the code gets executed is when inc (or msh) calls it, and both of these
122 have already called mts_init().
123
124 ------------------------
125 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
126
127 This routine was accounting for 60% of the cpu time used by most mh
128 programs. I spent a bit of time tuning and it now accounts for <10%
129 of the time used. Like any heavily tuned routine, it's a bit
130 complex and you want to be sure you understand everything that it's
131 doing before you start hacking on it. Let me try to emphasize
132 that: every line in this atrocity depends on every other line,
133 sometimes in subtle ways. You should understand it all, in detail,
134 before trying to change any part. If you do change it, test the
135 result thoroughly (I use a hand-constructed test file that exercises
136 all the ways a header name, header body, header continuation,
137 header-body separator, body line and body eom can align themselves
138 with respect to a buffer boundary). "Minor" bugs in this routine
139 result in garbaged or lost mail.
140
141 If you hack on this and slow it down, I, my children and my
142 children's children will curse you.
143
144 This routine gets used on three different types of files: normal,
145 single msg files, "packed" unix or mmdf mailboxs (when used by inc)
146 and packed, directoried bulletin board files (when used by msh).
147 The biggest impact of different file types is in "eom" testing. The
148 code has been carefully organized to test for eom at appropriate
149 times and at no other times (since the check is quite expensive).
150 I have tried to arrange things so that the eom check need only be
151 done on entry to this routine. Since an eom can only occur after a
152 newline, this is easy to manage for header fields. For the msg
153 body, we try to efficiently search the input buffer to see if
154 it contains the eom delimiter. If it does, we take up to the
155 delimiter, otherwise we take everything in the buffer. (The change
156 to the body eom/copy processing produced the most noticeable
157 performance difference, particularly for "inc" and "show".)
158
159 There are three qualitatively different things this routine busts
160 out of a message: field names, field text and msg bodies. Field
161 names are typically short (~8 char) and the loop that extracts them
162 might terminate on a colon, newline or max width. I considered
163 using a Vax "scanc" to locate the end of the field followed by a
164 "memmove" but the routine call overhead on a Vax is too large for this
165 to work on short names. If Berkeley ever makes "inline" part of the
166 C optimiser (so things like "scanc" turn into inline instructions) a
167 change here would be worthwhile.
168
169 Field text is typically 60 - 100 characters so there's (barely)
170 a win in doing a routine call to something that does a "locc"
171 followed by a "bmove". About 30% of the fields have continuations
172 (usually the 822 "received:" lines) and each continuation generates
173 another routine call. "Inline" would be a big win here, as well.
174
175 Messages, as of this writing, seem to come in two flavors: small
176 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
177 so message bodies average at least a few hundred characters.
178 Assuming your system uses reasonably sized stdio buffers (1K or
179 more), this routine should be able to remove the body in large
180 (>500 byte) chunks. This makes the cost of a call to "memmove"
181 small but there is a premium on checking for the eom in packed
182 maildrops. The eom pattern is always a simple string so we can
183 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
184 instruction). Some thought went into recognizing the start of
185 an eom that has been split across two buffers.
186
187 This routine wants to deal with large chunks of data so, rather
188 than "getc" into a local buffer, it uses stdio's buffer. If
189 you try to use it on a non-buffered file, you'll get what you
190 deserve. This routine "knows" that struct FILEs have a _ptr
191 and a _cnt to describe the current state of the buffer and
192 it knows that _filbuf ignores the _ptr & _cnt and simply fills
193 the buffer. If stdio on your system doesn't work this way, you
194 may have to make small changes in this routine.
195
196 This routine also "knows" that an EOF indication on a stream is
197 "sticky" (i.e., you will keep getting EOF until you reposition the
198 stream). If your system doesn't work this way it is broken and you
199 should complain to the vendor. As a consequence of the sticky
200 EOF, this routine will never return any kind of EOF status when
201 there is data in "name" or "buf").
202 */
203
204 /*
205 * static prototypes
206 */
207 static void Ungetc(m_getfld_state_t s);
208 static int m_Eom (m_getfld_state_t);
209
210 #define eom(c,s) (s->msg_style != MS_DEFAULT && \
211 ((c) == *s->msg_delim && m_Eom(s)))
212
213 /*
214 * Maildrop styles
215 */
216 #define MS_DEFAULT 0 /* default (one msg per file) */
217 #define MS_UNKNOWN 1 /* type not known yet */
218 #define MS_MBOX 2 /* Unix-style "from" lines */
219 #define MS_MMDF 3 /* string MMDF_DELIM */
220
221 /* This replaces the old approach, with its direct access to stdio
222 * internals. It uses one fread() to load a buffer that we manage.
223 *
224 * MSG_INPUT_SIZE is the size of the buffer.
225 * MAX_DELIMITER_SIZE is the maximum size of the delimiter used to
226 * separate messages in a maildrop, such as mbox "From ".
227 *
228 * Some of the tests in the test suite assume a MSG_INPUT_SIZE
229 * of 8192.
230 */
231 #define MSG_INPUT_SIZE NMH_BUFSIZ
232 #define MAX_DELIMITER_SIZE 5
233
234 struct m_getfld_state {
235 /* The file to read from; I/O block. Caller keeps passing it after
236 * initialisation due to historic interface so it keeps getting
237 * updated, presumably to the same value. */
238 FILE *iob;
239
240 /* Holds content of iob. */
241 char msg_buf[2 * MSG_INPUT_SIZE + MAX_DELIMITER_SIZE];
242 /* Points to the next byte to read from msg_buf. */
243 char *readpos;
244 /* Points to just after the last valid byte in msg_buf. If readpos
245 * equals end then msg_buf is empty. */
246 char *end;
247
248 /* Whether the caller intends to ftell(3)/fseek(3) iob's position,
249 * and thus whether m_getfld() needs to detect that and compensate. */
250 int track_filepos;
251 /* Position in iob given what's been consumed ready for returning to
252 * the caller. Further than this may have been read into msg_buf. */
253 off_t total_bytes_read;
254 /* Bytes of iob consumed during this call. */
255 off_t bytes_read;
256 /* What fseeko(3) tells us iob's position is having just explicitly
257 * set it to total_bytes_read. Surely always the same? */
258 off_t last_caller_pos;
259 /* Saved position in iob from filling msg_buf, prior to returning. */
260 off_t last_internal_pos;
261
262 /* One of the MS_* macros tracking the type of iob's content and
263 * thus if it's a single email, or several with delimeters. Default
264 * is MS_DEFAULT. */
265 int msg_style;
266
267 /* The message delimeter if iob has multiple emails, else NULL. For
268 * MS_MBOX it's the string that separates two emails, "\nFrom ",
269 * i.e. the terminating blank line of the previous email, and the
270 * starting From_ line of the next, but for MS_MMDF it's
271 * "\001\001\001\001\n" that may start or terminate an email. */
272 char *msg_delim;
273 /* The last non-NUL char of msg_delim. */
274 char *delimend;
275 /* When searching for msg_delim after an email, it's only of
276 * interest at the start of the line, i.e. when preceded by a
277 * linefeed. fdelim points to msg_delim[-1] that contains '\n' so
278 * it can be used as the needle. */
279 char *fdelim;
280 /* strlen(fdelim). */
281 int fdelimlen;
282 /* The second char of msg_delim. Used when the first char has
283 * already been matched to test the rest. */
284 char *edelim;
285 /* strlen(edelim). */
286 int edelimlen;
287 /* The relationship between all of these pointers and lengths for
288 * the two possible msg_delim values.
289 *
290 * "\0\n\nFrom \0" 9 "\0\n\001\001\001\001\n\0" 8
291 * | || | | | | |
292 * | || s->delimend | | | s->delimend
293 * | || | | |
294 * | |s->edelim s->edelimlen=5 | | s->edelim s->edelimlen=4
295 * | | | |
296 * | s->msg_delim | s->msg_delim
297 * | |
298 * s->fdelim s->fdelimlen=7 s->fdelim s->fdelimlen=6
299 */
300
301 /* Maps all the bytes of msg_delim, apart from the last two,
302 * including the NUL, onto the last position in msg_delim where they
303 * occur. Bytes not present are NULL. */
304 char **pat_map;
305
306 /* The parser's current state. Also returned to the caller, amongst
307 * other possible values, to indicate the token consumed. One of
308 * FLD, FLDPLUS, BODY, or FILEEOF. */
309 int state;
310 };
311
312 m_getfld_state_t m_getfld_state_init(FILE *iob)
313 {
314 m_getfld_state_t s;
315
316 NEW(s);
317 s->readpos = s->end = s->msg_buf;
318 s->bytes_read = s->total_bytes_read = 0;
319 s->last_caller_pos = s->last_internal_pos = 0;
320 s->iob = iob;
321 s->pat_map = NULL;
322 s->msg_style = MS_DEFAULT;
323 s->msg_delim = "";
324 s->fdelim = s->delimend = s->edelim = NULL;
325 s->fdelimlen = s->edelimlen = 0;
326 s->state = FLD;
327 s->track_filepos = 0;
328
329 return s;
330 }
331
332 /* scan() needs to force an initial state of FLD for each message. */
333 void
334 m_getfld_state_reset (m_getfld_state_t *gstate)
335 {
336 if (*gstate) {
337 (*gstate)->state = FLD;
338 }
339 }
340
341 /* If the caller interleaves ftell*()/fseek*() calls with m_getfld()
342 calls, m_getfld() must keep track of the file position. The caller
343 must use this function to inform m_getfld(). */
344 void
345 m_getfld_track_filepos (m_getfld_state_t *gstate, FILE *iob)
346 {
347 if (! *gstate) {
348 *gstate = m_getfld_state_init(iob);
349 }
350
351 (*gstate)->track_filepos = 1;
352 }
353
354 /* m_getfld_track_filepos() with the existing iob. */
355 void
356 m_getfld_track_filepos2(m_getfld_state_t *gstate)
357 {
358 if (!*gstate)
359 die("m_getfld_track_filepos2 without gstate");
360
361 m_getfld_track_filepos(gstate, (*gstate)->iob);
362 }
363
364 void
365 m_getfld_state_destroy (m_getfld_state_t *gstate)
366 {
367 m_getfld_state_t s = *gstate;
368
369 if (s) {
370 if (s->fdelim) {
371 free (s->fdelim-1);
372 free (s->pat_map);
373 }
374 free (s);
375 *gstate = 0;
376 }
377 }
378
379 /*
380 Summary of file and message input buffer positions:
381
382 input file -------------------------------------------EOF
383 | |
384 last_caller_pos last_internal_pos
385
386
387 msg_buf --------------------EOF
388 | | |
389 msg_buf readpos end
390
391 |<>|=retained characters, difference
392 between last_internal_pos and
393 first readpos value after reading
394 in new chunk in read_more()
395
396 When returning from m_getfld()/m_unknown():
397 1) Save the internal file position in last_internal_pos. That's the
398 m_getfld() position reference in the input file.
399 2) Set file stream position so that callers can use ftell().
400
401 When entering m_getfld()/m_unknown():
402 Check to see if the call had changed the file position. If so,
403 adjust the internal position reference accordingly. If not, restore
404 the internal file position from last_internal_pos.
405 */
406
407
408 static void
409 enter_getfld (m_getfld_state_t *gstate, FILE *iob)
410 {
411 m_getfld_state_t s;
412 off_t pos;
413 off_t pos_movement;
414
415 if (! *gstate) {
416 *gstate = m_getfld_state_init(iob);
417 }
418 s = *gstate;
419 s->bytes_read = 0;
420
421 /* This is ugly and no longer necessary, but is retained just in
422 case it's needed again. The parser used to open the input file
423 multiple times, so we had to always use the FILE * that's
424 passed to m_getfld(). Now the parser inits a new
425 m_getfld_state for each file. See comment below about the
426 readpos shift code being currently unused. */
427 s->iob = iob;
428
429 if (!s->track_filepos)
430 return;
431
432 if ((pos = ftello(iob)) == -1)
433 adios("getfld's iob", "failed to get offset on entry");
434 if (pos == 0 && s->last_internal_pos == 0)
435 return;
436
437 if (s->last_internal_pos == 0) {
438 s->total_bytes_read = pos;
439 return;
440 }
441
442 pos_movement = pos - s->last_caller_pos; /* Can be < 0. */
443 if (pos_movement == 0) {
444 pos = s->last_internal_pos;
445 } else {
446 /* The current file stream position differs from the
447 last one, so caller must have called ftell/o().
448 Or, this is the first call and the file position
449 was not at 0. */
450
451 if (s->readpos + pos_movement >= s->msg_buf &&
452 s->readpos + pos_movement < s->end) {
453 /* This is currently unused. It could be used by
454 parse_mime() if it was changed to use a global
455 m_getfld_state. */
456 /* We can shift readpos and remain within the
457 bounds of msg_buf. */
458 s->readpos += pos_movement;
459 s->total_bytes_read += pos_movement;
460 pos = s->last_internal_pos;
461 } else {
462 off_t off;
463 size_t num_read;
464
465 /* This seek skips past an integral number of
466 chunks of size MSG_INPUT_SIZE. */
467 off = pos / MSG_INPUT_SIZE * MSG_INPUT_SIZE;
468 if (fseeko(iob, off, SEEK_SET) == -1)
469 adios("getfld's iob", "failed to set offset to skip: "
470 "%" PRIdMAX, (intmax_t)off);
471 num_read = fread (s->msg_buf, 1, MSG_INPUT_SIZE, iob);
472 s->readpos = s->msg_buf + pos % MSG_INPUT_SIZE;
473 s->end = s->msg_buf + num_read;
474 s->total_bytes_read = pos;
475 }
476 }
477
478 if (fseeko(iob, pos, SEEK_SET) == -1)
479 adios("getfld's iob", "failed to set offset on entry: %" PRIdMAX,
480 (intmax_t)pos);
481 }
482
483 static void
484 leave_getfld (m_getfld_state_t s)
485 {
486 s->total_bytes_read += s->bytes_read;
487
488 if (s->track_filepos) {
489 /* Save the internal file position that we use for the input buffer. */
490 if ((s->last_internal_pos = ftello(s->iob)) == -1)
491 adios("getfld's iob", "failed to get offset before seek");
492
493 /* Set file stream position so that callers can use ftell(). */
494 if (fseeko(s->iob, s->total_bytes_read, SEEK_SET) == -1)
495 adios("getfld's iob", "failed to set offset: %" PRIdMAX,
496 (intmax_t)s->total_bytes_read);
497
498 s->last_caller_pos = s->total_bytes_read;
499 }
500 }
501
502 static size_t
503 read_more (m_getfld_state_t s)
504 {
505 /* Retain at least edelimlen characters that have already been read,
506 if at least edelimlen have been read, so that we can back up to them
507 in m_Eom(). */
508 ssize_t retain = s->end - s->msg_buf < s->edelimlen ? 0 : s->edelimlen;
509 size_t num_read;
510
511 if (retain > 0) {
512 if (retain < s->end - s->readpos)
513 retain = s->end - s->readpos;
514 assert (retain <= s->readpos - s->msg_buf);
515
516 /* Move what we want to retain at end of the buffer to the beginning. */
517 memmove (s->msg_buf, s->readpos - retain, retain);
518 }
519
520 s->readpos = s->msg_buf + retain;
521 num_read = fread (s->readpos, 1, MSG_INPUT_SIZE, s->iob);
522 s->end = s->readpos + num_read;
523
524 return num_read;
525 }
526
527 /* Return the next character consumed from the input, fetching more of
528 * the input for the buffer if required, or EOF on end of file. */
529 static int
530 Getc (m_getfld_state_t s)
531 {
532 if ((s->end - s->readpos < 1 && read_more (s) == 0) ||
533 s->readpos >= s->end)
534 return EOF;
535
536 s->bytes_read++;
537 return (unsigned char)*s->readpos++;
538 }
539
540 /* Return the next character that Getc() would return, which may be EOF. */
541 static int
542 Peek (m_getfld_state_t s)
543 {
544 int c;
545
546 c = Getc(s);
547 if (c != EOF)
548 Ungetc(s);
549
550 return c;
551 }
552
553 /* If there's room, undo the consumption of one character from msg_buf,
554 * rewinding so it's read next, else die. */
555 static void
556 Ungetc(m_getfld_state_t s)
557 {
558 if (s->readpos == s->msg_buf)
559 die("Ungetc() at start of message buffer.");
560
561 s->readpos--;
562 s->bytes_read--;
563 }
564
565
566 int
567 m_getfld (m_getfld_state_t *gstate, char name[NAMESZ], char *buf, int *bufsz,
568 FILE *iob)
569 {
570 m_getfld_state_t s;
571 char *cp;
572 int max, n, c;
573
574 enter_getfld (gstate, iob);
575 s = *gstate;
576
577 if ((c = Getc(s)) == EOF) {
578 *bufsz = *buf = 0;
579 leave_getfld (s);
580 return s->state = FILEEOF;
581 }
582 if (eom (c, s)) {
583 /* flush null messages */
584 while ((c = Getc(s)) != EOF && eom (c, s))
585 ;
586
587 if (c != EOF)
588 Ungetc(s);
589 *bufsz = *buf = 0;
590 leave_getfld (s);
591 return s->state = FILEEOF;
592 }
593
594 switch (s->state) {
595 case FLD:
596 if (c == '\n' || c == '-') {
597 /* we hit the header/body separator */
598 while (c != '\n' && (c = Getc(s)) != EOF)
599 ;
600
601 if (c == EOF || (c = Getc(s)) == EOF || eom (c, s)) {
602 /* flush null messages */
603 while ((c = Getc(s)) != EOF && eom (c, s))
604 ;
605 if (c != EOF)
606 Ungetc(s);
607 *bufsz = *buf = 0;
608 leave_getfld (s);
609 return s->state = FILEEOF;
610 }
611 s->state = BODY;
612 goto body;
613 }
614 /*
615 * get the name of this component. take characters up
616 * to a ':', a newline or NAMESZ-1 characters, whichever
617 * comes first.
618 */
619 cp = name;
620 max = NAMESZ - 1;
621 /* Get the field name. The first time through the loop,
622 this copies out the first character, which was loaded
623 into c prior to loop entry. Initialize n to 1 to
624 account for that. */
625 for (n = 1;
626 c != ':' && c != '\n' && c != EOF && n < max;
627 ++n, c = Getc (s)) {
628 *cp++ = c;
629 }
630
631 /* Check for next character, which is either the space after
632 the ':' or the first folded whitespace. */
633 {
634 int next_char;
635 if (c == EOF || (next_char = Peek (s)) == EOF) {
636 *bufsz = *cp = *buf = 0;
637 inform("eof encountered in field \"%s\"", name);
638 leave_getfld (s);
639 return s->state = FMTERR;
640 }
641 }
642
643 /* If c isn't ':' here, something went wrong. Possibilities are:
644 * . hit a newline (error)
645 * . got more than namesz chars. (error)
646 */
647 if (c == ':') {
648 /* Finished header name, fall through to FLDPLUS below. */
649 } else if (c == '\n') {
650 /* We hit the end of the line without seeing ':' to
651 * terminate the field name. This is usually (always?)
652 * spam. But, blowing up is lame, especially when
653 * scan(1)ing a folder with such messages. Pretend such
654 * lines are the first of the body (at least mutt also
655 * handles it this way). */
656
657 /* See if buf can hold this line, since we were assuming
658 * we had a buffer of NAMESZ, not bufsz. */
659 /* + 1 for the newline */
660 if (*bufsz < n + 1) {
661 /* No, it can't. Oh well, guess we'll blow up. */
662 *bufsz = *cp = *buf = 0;
663 inform("eol encountered in field \"%s\"", name);
664 s->state = FMTERR;
665 break;
666 }
667 memcpy (buf, name, n - 1);
668 buf[n - 1] = '\n';
669 buf[n] = '\0';
670 /* Indicate this wasn't a header field using a character
671 that can't appear in a header field. */
672 name[0] = ':';
673 /* The last character read was '\n'. s->bytes_read
674 (and n) include that, but it was not put into the
675 name array in the for loop above. So subtract 1. */
676 *bufsz = --s->bytes_read; /* == n - 1 */
677 leave_getfld (s);
678 return s->state = BODY;
679 }
680 if (max <= n) {
681 /* By design, the loop above discards the last character
682 it had read. It's in c, use it. */
683 *cp++ = c;
684 *bufsz = *cp = *buf = 0;
685 inform("field name \"%s\" exceeds %d bytes", name,
686 NAMESZ - 2);
687 s->state = LENERR;
688 break;
689 }
690
691 /* Trim any trailing spaces from the end of name. */
692 while (isspace ((unsigned char) *--cp) && cp >= name) continue;
693 *++cp = 0;
694 /* readpos points to the first character of the field body. */
695 /* FALLTHRU */
696
697 case FLDPLUS: {
698 /*
699 * get (more of) the text of a field. Take
700 * characters up to the end of this field (newline
701 * followed by non-blank) or bufsz-1 characters.
702 */
703 cp = buf;
704 max = *bufsz-1;
705 n = 0;
706 for (bool finished = false; !finished; ) {
707 while (c != '\n' && c != EOF && n++ < max) {
708 if ((c = Getc (s)) != EOF)
709 *cp++ = c;
710 }
711
712 if (c != EOF)
713 c = Peek (s);
714 if (max < n) {
715 /* The dest buffer is full. Need to back the read
716 pointer up by one because when m_getfld() is
717 reentered, it will read a character. Then
718 we'll jump right to the FLDPLUS handling code,
719 which will not store that character, but
720 instead move on to the next one. */
721 if (s->readpos > s->msg_buf) {
722 --s->readpos;
723 --s->bytes_read;
724 }
725 s->state = FLDPLUS;
726 finished = true;
727 } else if (c != ' ' && c != '\t') {
728 /* The next character is not folded whitespace, so
729 prepare to move on to the next field. It's OK
730 if c is EOF, it will be handled on the next
731 call to m_getfld (). */
732 s->state = FLD;
733 finished = true;
734 } else {
735 /* Folded header field, continues on the next line. */
736 }
737 }
738 *bufsz = s->bytes_read;
739 break;
740 }
741
742 body:
743 case BODY: {
744 /*
745 * get the message body up to bufsz characters or the
746 * end of the message.
747 */
748 char *bp;
749
750 name[0] = '\0';
751 max = *bufsz-1;
752 /* Back up and store the current position. */
753 bp = --s->readpos;
754 c = min(s->end - s->readpos, max);
755 if (s->msg_style != MS_DEFAULT && c > 1) {
756 /*
757 * packed maildrop - only take up to the (possible)
758 * start of the next message. This "matchc" should
759 * probably be a Boyer-Moore matcher for non-vaxen,
760 * particularly since we have the alignment table
761 * all built for the end-of-buffer test (next).
762 * But our vax timings indicate that the "matchc"
763 * instruction is 50% faster than a carefully coded
764 * B.M. matcher for most strings. (So much for elegant
765 * algorithms vs. brute force.) Since I (currently)
766 * run MH on a vax, we use the matchc instruction. --vj
767 */
768 char *ep;
769
770 if ((ep = memmem(bp, c, s->fdelim, s->fdelimlen)))
771 /* Plus one to nab the '\n' that starts fdelim as
772 * that ends the previous line; it isn't part of
773 * msg_delim. */
774 c = ep - bp + 1;
775 else {
776 /*
777 * There's no delim in the buffer but there may be
778 * a partial one at the end. If so, we want to leave
779 * it so the "eom" check on the next call picks it up.
780 * Use a modified Boyer-Moore matcher to make this
781 * check relatively cheap. The first "if" figures
782 * out what position in the pattern matches the last
783 * character in the buffer. The inner "while" matches
784 * the pattern against the buffer, backwards starting
785 * at that position. Note that unless the buffer
786 * ends with one of the characters in the pattern
787 * (excluding the first and last), we do only one test.
788 */
789 char *sp;
790
791 ep = bp + c - 1; /* The last byte. */
792 if ((sp = s->pat_map[(unsigned char) *ep])) {
793 do {
794 /* This if() is true unless (a) the buffer is too
795 * small to contain this delimiter prefix, or
796 * (b) it contains exactly enough chars for the
797 * delimiter prefix.
798 * For case (a) obviously we aren't going to match.
799 * For case (b), if the buffer really contained exactly
800 * a delim prefix, then the m_eom call at entry
801 * should have found it. Thus it's not a delim
802 * and we know we won't get a match.
803 */
804 if (((sp - s->fdelim) + 2) <= c) {
805 cp = sp;
806 /* Unfortunately although fdelim has a preceding NUL
807 * we can't use this as a sentinel in case the buffer
808 * contains a NUL in exactly the wrong place (this
809 * would cause us to run off the front of fdelim).
810 */
811 while (*--ep == *--cp)
812 if (cp < s->fdelim)
813 break;
814 if (cp < s->fdelim) {
815 /* we matched the entire delim prefix,
816 * so only take the buffer up to there.
817 * we know ep >= bp -- check above prevents underrun
818 */
819 c = (ep - bp) + 2;
820 break;
821 }
822 }
823 /* try matching one less char of delim string */
824 ep = bp + c - 1;
825 } while (--sp > s->fdelim);
826 }
827 }
828 }
829 memcpy( buf, bp, c );
830 /* Advance the current position to reflect the copy out.
831 c is less than or equal to the number of bytes remaining
832 in the read buffer, so will not overrun it. */
833 s->readpos += c;
834 cp = buf + c;
835 /* Subtract 1 from c because the first character was read by
836 Getc(), and therefore already accounted for in s->bytes_read. */
837 s->bytes_read += c - 1;
838 *bufsz = s->bytes_read;
839 break;
840 }
841
842 default:
843 die("m_getfld() called with bogus state of %d", s->state);
844 }
845
846 *cp = 0;
847 leave_getfld (s);
848
849 return s->state;
850 }
851
852
853 /* m_getfld() with the existing iob. */
854 int
855 m_getfld2(m_getfld_state_t *gstate, char name[NAMESZ], char *buf, int *bufsz)
856 {
857 if (!*gstate)
858 die("m_getfld2 without gstate");
859
860 return m_getfld(gstate, name, buf, bufsz, (*gstate)->iob);
861 }
862
863
864 void
865 m_unknown(m_getfld_state_t *gstate, FILE *iob)
866 {
867 m_getfld_state_t s;
868 int c;
869 char text[MAX_DELIMITER_SIZE];
870 char from[] = "From ";
871 char *cp;
872 char *delimstr;
873 unsigned int i;
874
875 enter_getfld (gstate, iob);
876 s = *gstate;
877
878 /*
879 * Figure out what the message delimiter string is for this
880 * maildrop. (This used to be part of m_Eom but I didn't like
881 * the idea of an "if" statement that could only succeed on the
882 * first call to m_Eom getting executed on each call, i.e., at
883 * every newline in the message).
884 *
885 * If the first line of the maildrop is a Unix "From " line, we
886 * say the style is MBOX and eat the rest of the line. Otherwise
887 * we say the style is MMDF and look for the delimiter string
888 * specified when nmh was built (or from the mts.conf file).
889 */
890
891 s->msg_style = MS_UNKNOWN;
892
893 for (i = 0, cp = text; i < sizeof text; ++i, ++cp) {
894 if ((c = Getc (s)) == EOF) {
895 *cp = '\0';
896 break;
897 }
898 *cp = c;
899 }
900
901 if (i == sizeof from-1 && strncmp (text, "From ", sizeof from-1) == 0) {
902 s->msg_style = MS_MBOX;
903 delimstr = "\nFrom ";
904 while ((c = Getc(s)) != EOF && c != '\n')
905 ;
906 } else {
907 /* not a Unix style maildrop */
908 s->readpos -= s->bytes_read;
909 s->bytes_read = 0;
910 delimstr = MMDF_DELIM;
911 s->msg_style = MS_MMDF;
912 }
913
914 /* "\nFrom \0" 7 "\001\001\001\001\n\0" 6
915 * | |
916 * delimstr c=6 delimstr c=5
917 */
918 c = strlen (delimstr);
919 s->fdelim = mh_xmalloc (c + 3); /* \0, \n, delimstr, \0 */
920 *s->fdelim++ = '\0';
921 *s->fdelim = '\n';
922 s->fdelimlen = c + 1;
923 s->msg_delim = s->fdelim+1;
924 strcpy (s->msg_delim, delimstr);
925 s->edelim = s->msg_delim+1;
926 s->edelimlen = c - 1;
927 s->delimend = s->msg_delim + s->edelimlen;
928 if (s->edelimlen <= 1)
929 die("maildrop delimiter must be at least 2 bytes");
930
931 /*
932 * build a Boyer-Moore end-position map for the matcher in m_getfld.
933 * N.B. - we don't match just the first char (since it's the newline
934 * separator) or the last char (since the matchc would have found it
935 * if it was a real delim).
936 */
937 s->pat_map = mh_xcalloc (256, sizeof(char *));
938
939 for (cp = s->fdelim + 1; cp < s->delimend; cp++ )
940 s->pat_map[(unsigned char)*cp] = cp;
941
942 if (s->msg_style == MS_MMDF) {
943 /* flush extra msg hdrs */
944 while ((c = Getc(s)) != EOF && eom (c, s))
945 ;
946 if (c != EOF)
947 Ungetc(s);
948 }
949
950 leave_getfld (s);
951 }
952
953
954 /* m_unknown() with the existing iob. */
955 void
956 m_unknown2(m_getfld_state_t *gstate)
957 {
958 if (!*gstate)
959 die("m_unknown2 without gstate");
960
961 m_unknown(gstate, (*gstate)->iob);
962 }
963
964
965 /*
966 * test for msg delimiter string
967 */
968
969 static int
970 m_Eom (m_getfld_state_t s)
971 {
972 int i;
973 char text[MAX_DELIMITER_SIZE];
974 char *cp;
975 int adjust = 1;
976
977 for (i = 0, cp = text; i < s->edelimlen; ++i, ++cp) {
978 int c2;
979
980 if ((c2 = Getc (s)) == EOF) {
981 *cp = '\0';
982 break;
983 }
984 *cp = c2;
985 }
986
987 if (i != s->edelimlen ||
988 strncmp (text, (char *)s->edelim, s->edelimlen)) {
989 if (i == 0 && s->msg_style == MS_MBOX) {
990 /* the final newline in the (brain damaged) unix-format
991 * maildrop is part of the delimiter - delete it.
992 */
993 return 1;
994 }
995
996 if (i <= 2 && s->msg_style == MS_MBOX &&
997 i != s->edelimlen && ! strncmp(text, s->fdelim, i)) {
998 /* If all or part of fdelim appeared at the end of the file,
999 back up even more so that the bytes are included in the
1000 message. */
1001 adjust = 2;
1002 }
1003
1004 /* Did not find delimiter, so restore the read position.
1005 Note that on input, a character had already been read
1006 with Getc(). It will be unget by m_getfld () on return. */
1007 s->readpos -= s->bytes_read - adjust;
1008 s->bytes_read = adjust;
1009 return 0;
1010 }
1011
1012 if (s->msg_style == MS_MBOX) {
1013 int c;
1014 while ((c = Getc(s)) != EOF && c != '\n')
1015 ;
1016 }
1017
1018 return 1;
1019 }