diplodocus.org Git - nmh/blob - sbr/m_getfld.c

   1 /* m_getfld.c -- read/parse a message
   2  *
   3  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   4  * COPYRIGHT file in the root directory of the nmh distribution for
   5  * complete copyright information.
   6  */
   7
   8 #include "h/mh.h"
   9 #include "m_getfld.h"
  10 #include "error.h"
  11 #include "h/mts.h"
  12 #include "h/utils.h"
  13 #include <inttypes.h>
  14
  15 /*
  16    Purpose
  17    =======
  18    Reads an Internet message (RFC 5322), or one or more messages
  19    stored in a maildrop in mbox (RFC 4155) or MMDF format, from a file
  20    stream.  Each call to m_getfld() reads one header field, or a
  21    portion of the body, in sequence.
  22
  23    Inputs
  24    ======
  25    gstate:  opaque parse state
  26    bufsz:  maximum number of characters to load into buf
  27    iob:  input file stream
  28
  29    Outputs
  30    =======
  31    name:  header field name (array of size NAMESZ=999)
  32    buf:  either a header field body or message body
  33    bufsz:  number of characters loaded into buf
  34    (return value):  message parse state on return from function
  35
  36    Functions
  37    =========
  38    void m_getfld_state_destroy (m_getfld_state_t *gstate): destroys
  39    the parse state pointed to by the gstate argument.
  40
  41    m_getfld_state_reset (m_getfld_state_t *gstate): resets the parse
  42    state to FLD.
  43
  44    void m_unknown(FILE *iob):  Determines the message delimiter string
  45    for the maildrop.  Called by inc and scan when reading from a
  46    maildrop file.
  47
  48    State variables
  49    ===============
  50    m_getfld() retains state internally between calls in the
  51    m_getfld_state_t variable.  These are used for detecting the end of
  52    each message when reading maildrops:
  53
  54      char **pat_map
  55      char *fdelim
  56      char *delimend
  57      int fdelimlen
  58      char *edelim
  59      int edelimlen
  60      char *msg_delim
  61      int msg_style
  62
  63    Usage
  64    =====
  65    m_getfld_state_t gstate;
  66
  67    gstate = m_getfld_state_init(mailfp);
  68    Perhaps m_getfld_track_filepos2(&gstate);
  69    ...
  70       state = m_getfld2(&gstate, ...);
  71       ...Repeat until finished with mailfp.
  72    m_getfld_state_destroy (&gstate);
  73
  74    The state is retained internally by gstate.  To reset its state to FLD:
  75    m_getfld_state_reset (&gstate);
  76 */
  77
  78 /* The following described the old implementation.  The high-level
  79    structure hasn't changed, but some of the details have.  I'm
  80    leaving this as-is, though, for posterity.
  81  */
  82
  83 /* This module has a long and checkered history.  First, it didn't burst
  84    maildrops correctly because it considered two CTRL-A:s in a row to be
  85    an inter-message delimiter.  It really is four CTRL-A:s followed by a
  86    newline.  Unfortunately, MMDF will convert this delimiter *inside* a
  87    message to a CTRL-B followed by three CTRL-A:s and a newline.  This
  88    caused the old version of m_getfld() to declare eom prematurely.  The
  89    fix was a lot slower than
  90
  91                 c == '\001' && peekc (iob) == '\001'
  92
  93    but it worked, and to increase generality, MBOX style maildrops could
  94    be parsed as well.  Unfortunately the speed issue finally caught up with
  95    us since this routine is at the very heart of MH.
  96
  97    To speed things up considerably, the routine Eom() was made an auxiliary
  98    function called by the macro eom().  Unless we are bursting a maildrop,
  99    the eom() macro returns false saying we aren't at the end of the
 100    message.
 101
 102    The next thing to do is to read the mts.conf file and initialize
 103    delimiter[] and delimlen accordingly...
 104
 105    After mhl was made a built-in in msh, m_getfld() worked just fine
 106    (using m_unknown() at startup).  Until one day: a message which was
 107    the result of a bursting was shown. Then, since the burst boundaries
  108    aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
 109    Very sad.  The solution: introduce m_eomsbr().  This hook gets called
 110    after the end of each line (since testing for eom involves an fseek()).
 111    This worked fine, until one day: a message with no body portion arrived.
 112    Then the
 113
 114                    while (eom (c = Getc (iob), iob))
 115                         continue;
 116
 117    loop caused m_getfld() to return FMTERR.  So, that logic was changed to
 118    check for (*eom_action) and act accordingly.
 119
 120    This worked fine, until one day: someone didn't use four CTRL:A's as
 121    their delimiters.  So, the bullet got bit and we read mts.h and
 122    continue to struggle on.  It's not that bad though, since the only time
 123    the code gets executed is when inc (or msh) calls it, and both of these
 124    have already called mts_init().
 125
 126    ------------------------
 127    (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
 128
 129    This routine was accounting for 60% of the cpu time used by most mh
 130    programs.  I spent a bit of time tuning and it now accounts for <10%
 131    of the time used.  Like any heavily tuned routine, it's a bit
 132    complex and you want to be sure you understand everything that it's
 133    doing before you start hacking on it.  Let me try to emphasize
 134    that:  every line in this atrocity depends on every other line,
 135    sometimes in subtle ways.  You should understand it all, in detail,
 136    before trying to change any part.  If you do change it, test the
 137    result thoroughly (I use a hand-constructed test file that exercises
 138    all the ways a header name, header body, header continuation,
 139    header-body separator, body line and body eom can align themselves
 140    with respect to a buffer boundary).  "Minor" bugs in this routine
 141    result in garbaged or lost mail.
 142
 143    If you hack on this and slow it down, I, my children and my
 144    children's children will curse you.
 145
 146    This routine gets used on three different types of files: normal,
 147    single msg files, "packed" unix or mmdf mailboxs (when used by inc)
 148    and packed, directoried bulletin board files (when used by msh).
 149    The biggest impact of different file types is in "eom" testing.  The
 150    code has been carefully organized to test for eom at appropriate
 151    times and at no other times (since the check is quite expensive).
 152    I have tried to arrange things so that the eom check need only be
 153    done on entry to this routine.  Since an eom can only occur after a
 154    newline, this is easy to manage for header fields.  For the msg
 155    body, we try to efficiently search the input buffer to see if
 156    contains the eom delimiter.  If it does, we take up to the
 157    delimiter, otherwise we take everything in the buffer.  (The change
 158    to the body eom/copy processing produced the most noticeable
 159    performance difference, particularly for "inc" and "show".)
 160
 161    There are three qualitatively different things this routine busts
 162    out of a message: field names, field text and msg bodies.  Field
 163    names are typically short (~8 char) and the loop that extracts them
 164    might terminate on a colon, newline or max width.  I considered
 165    using a Vax "scanc" to locate the end of the field followed by a
 166    "memmove" but the routine call overhead on a Vax is too large for this
 167    to work on short names.  If Berkeley ever makes "inline" part of the
 168    C optimiser (so things like "scanc" turn into inline instructions) a
 169    change here would be worthwhile.
 170
 171    Field text is typically 60 - 100 characters so there's (barely)
 172    a win in doing a routine call to something that does a "locc"
 173    followed by a "bmove".  About 30% of the fields have continuations
 174    (usually the 822 "received:" lines) and each continuation generates
 175    another routine call.  "Inline" would be a big win here, as well.
 176
 177    Messages, as of this writing, seem to come in two flavors: small
 178    (~1K) and long (>2K).  Most messages have 400 - 600 bytes of headers
 179    so message bodies average at least a few hundred characters.
 180    Assuming your system uses reasonably sized stdio buffers (1K or
 181    more), this routine should be able to remove the body in large
  182    (>500 byte) chunks.  This makes the cost of a call to "memmove"
 183    small but there is a premium on checking for the eom in packed
 184    maildrops.  The eom pattern is always a simple string so we can
 185    construct an efficient pattern matcher for it (e.g., a Vax "matchc"
 186    instruction).  Some thought went into recognizing the start of
 187    an eom that has been split across two buffers.
 188
 189    This routine wants to deal with large chunks of data so, rather
 190    than "getc" into a local buffer, it uses stdio's buffer.  If
 191    you try to use it on a non-buffered file, you'll get what you
 192    deserve.  This routine "knows" that struct FILEs have a _ptr
 193    and a _cnt to describe the current state of the buffer and
 194    it knows that _filbuf ignores the _ptr & _cnt and simply fills
 195    the buffer.  If stdio on your system doesn't work this way, you
 196    may have to make small changes in this routine.
 197
 198    This routine also "knows" that an EOF indication on a stream is
 199    "sticky" (i.e., you will keep getting EOF until you reposition the
 200    stream).  If your system doesn't work this way it is broken and you
 201    should complain to the vendor.  As a consequence of the sticky
 202    EOF, this routine will never return any kind of EOF status when
  203    there is data in "name" or "buf".
 204   */
 205
 206 /*
 207  * static prototypes
 208  */
 209 static void Ungetc(m_getfld_state_t s);
 210 static int m_Eom (m_getfld_state_t);
 211
/* Evaluates to non-zero when the input is not MS_DEFAULT style, the
 * character c equals the first byte of s->msg_delim, and m_Eom(s)
 * then returns non-zero.  The tests short-circuit in that order, so
 * m_Eom() is only called once the cheap comparisons have passed.
 * Note that s is expanded without parentheses and more than once, so
 * pass a plain identifier with no side effects. */
#define eom(c,s)        (s->msg_style != MS_DEFAULT && \
                         ((c) == *s->msg_delim && m_Eom(s)))
 214
 215 /*
 216  * Maildrop styles
 217  */
 218 #define MS_DEFAULT      0       /* default (one msg per file) */
 219 #define MS_UNKNOWN      1       /* type not known yet         */
 220 #define MS_MBOX         2       /* Unix-style "from" lines    */
 221 #define MS_MMDF         3       /* string MMDF_DELIM          */
 222
 223 /* This replaces the old approach, with its direct access to stdio
 224  * internals.  It uses one fread() to load a buffer that we manage.
 225  *
 226  * MSG_INPUT_SIZE is the size of the buffer.
 227  * MAX_DELIMITER_SIZE is the maximum size of the delimiter used to
 228  * separate messages in a maildrop, such as mbox "From ".
 229  *
 230  * Some of the tests in the test suite assume a MSG_INPUT_SIZE
 231  * of 8192.
 232  */
 233 #define MSG_INPUT_SIZE NMH_BUFSIZ
 234 #define MAX_DELIMITER_SIZE 5
 235
/* All of the state m_getfld() carries between calls: the input stream,
 * the buffer managed on top of it, file-position bookkeeping, the
 * maildrop delimiter description, and the parser state. */
struct m_getfld_state {
    /* The file to read from;  I/O block.  Caller keeps passing it after
     * initialisation due to historic interface so it keeps getting
     * updated, presumably to the same value. */
    FILE *iob;

    /* Holds content of iob. */
    char msg_buf[2 * MSG_INPUT_SIZE + MAX_DELIMITER_SIZE];
    /* Points to the next byte to read from msg_buf. */
    char *readpos;
    /* Points to just after the last valid byte in msg_buf.  If readpos
     * equals end then msg_buf is empty. */
    char *end;

    /* Whether the caller intends to ftell(3)/fseek(3) iob's position,
     * and thus whether m_getfld() needs to detect that and compensate. */
    int track_filepos;
    /* Position in iob given what's been consumed ready for returning to
     * the caller.  Further than this may have been read into msg_buf. */
    off_t total_bytes_read;
    /* Bytes of iob consumed during this call. */
    off_t bytes_read;
    /* What fseeko(3) tells us iob's position is having just explicitly
     * set it to total_bytes_read.  Surely always the same? */
    off_t last_caller_pos;
    /* Saved position in iob from filling msg_buf, prior to returning. */
    off_t last_internal_pos;

    /* One of the MS_* macros tracking the type of iob's content and
     * thus if it's a single email, or several with delimiters.  Default
     * is MS_DEFAULT. */
    int msg_style;

    /* The message delimiter if iob has multiple emails, else NULL.  For
     * MS_MBOX it's the string that separates two emails, "\nFrom ",
     * i.e. the terminating blank line of the previous email, and the
     * starting From_ line of the next, but for MS_MMDF it's
     * "\001\001\001\001\n" that may start or terminate an email. */
    char *msg_delim;
    /* The last non-NUL char of msg_delim. */
    char *delimend;
    /* When searching for msg_delim after an email, it's only of
     * interest at the start of the line, i.e. when preceded by a
     * linefeed.  fdelim points at msg_delim[-1], which contains '\n',
     * so it can be used as the needle. */
    char *fdelim;
    /* strlen(fdelim). */
    int fdelimlen;
    /* The second char of msg_delim.  Used when the first char has
     * already been matched to test the rest. */
    char *edelim;
    /* strlen(edelim). */
    int edelimlen;
    /* The relationship between all of these pointers and lengths for
     * the two possible msg_delim values.
     *
     *     "\0\n\nFrom \0"   9              "\0\n\001\001\001\001\n\0"   8
     *         | ||   |                         |   |   |         |
     *         | ||   s->delimend               |   |   |         s->delimend
     *         | ||                             |   |   |
     *         | |s->edelim  s->edelimlen=5     |   |   s->edelim  s->edelimlen=4
     *         | |                              |   |
     *         | s->msg_delim                   |   s->msg_delim
     *         |                                |
     *         s->fdelim  s->fdelimlen=7        s->fdelim  s->fdelimlen=6
     */

    /* Maps all the bytes of msg_delim, apart from the last two,
     * including the NUL, onto the last position in msg_delim where they
     * occur.  Bytes not present are NULL. */
    char **pat_map;

    /* The parser's current state.  Also returned to the caller, amongst
     * other possible values, to indicate the token consumed.  One of
     * FLD, FLDPLUS, BODY, or FILEEOF. */
    int state;
};
 313
 314 m_getfld_state_t m_getfld_state_init(FILE *iob)
 315 {
 316     m_getfld_state_t s;
 317
 318     NEW(s);
 319     s->readpos = s->end = s->msg_buf;
 320     s->bytes_read = s->total_bytes_read = 0;
 321     s->last_caller_pos = s->last_internal_pos = 0;
 322     s->iob = iob;
 323     s->pat_map = NULL;
 324     s->msg_style = MS_DEFAULT;
 325     s->msg_delim = "";
 326     s->fdelim = s->delimend = s->edelim = NULL;
 327     s->fdelimlen = s->edelimlen = 0;
 328     s->state = FLD;
 329     s->track_filepos = 0;
 330
 331     return s;
 332 }
 333
 334 /* scan() needs to force an initial state of FLD for each message. */
 335 void
 336 m_getfld_state_reset (m_getfld_state_t *gstate)
 337 {
 338     if (*gstate) {
 339         (*gstate)->state = FLD;
 340     }
 341 }
 342
 343 /* If the caller interleaves ftell*()/fseek*() calls with m_getfld()
 344    calls, m_getfld() must keep track of the file position.  The caller
 345    must use this function to inform m_getfld(). */
 346 void
 347 m_getfld_track_filepos (m_getfld_state_t *gstate, FILE *iob)
 348 {
 349     if (! *gstate) {
 350         *gstate = m_getfld_state_init(iob);
 351     }
 352
 353     (*gstate)->track_filepos = 1;
 354 }
 355
 356 /* m_getfld_track_filepos() with the existing iob. */
 357 void
 358 m_getfld_track_filepos2(m_getfld_state_t *gstate)
 359 {
 360     if (!*gstate)
 361         die("m_getfld_track_filepos2 without gstate");
 362
 363     m_getfld_track_filepos(gstate, (*gstate)->iob);
 364 }
 365
 366 void
 367 m_getfld_state_destroy (m_getfld_state_t *gstate)
 368 {
 369     m_getfld_state_t s = *gstate;
 370
 371     if (s) {
 372         if (s->fdelim) {
 373             free (s->fdelim-1);
 374             free (s->pat_map);
 375         }
 376         free (s);
 377         *gstate = 0;
 378     }
 379 }
 380
 381 /*
 382   Summary of file and message input buffer positions:
 383
 384   input file      -------------------------------------------EOF
 385                                  |              |
 386                           last_caller_pos  last_internal_pos
 387
 388
 389   msg_buf                   --------------------EOF
 390                             |         |         |
 391                          msg_buf   readpos     end
 392
 393                             |<>|=retained characters, difference
 394                                  between last_internal_pos and
 395                                  first readpos value after reading
 396                                  in new chunk in read_more()
 397
 398   When returning from m_getfld()/m_unknown():
 399   1) Save the internal file position in last_internal_pos.  That's the
 400      m_getfld() position reference in the input file.
 401   2) Set file stream position so that callers can use ftell().
 402
 403   When entering m_getfld()/m_unknown():
 404   Check to see if the call had changed the file position.  If so,
 405   adjust the internal position reference accordingly.  If not, restore
 406   the internal file position from last_internal_pos.
 407 */
 408
 409
/* Called on entry to the parser.  Creates the state from iob if *gstate
 * is null, zeroes the per-call bytes_read counter, and stores iob in
 * the state.  When file position tracking is on, it then compares
 * iob's current offset with the one recorded in last_caller_pos and
 * either restores the internal offset (last_internal_pos), shifts
 * readpos within msg_buf, or reloads msg_buf from the file.
 * ftello()/fseeko() failures are reported through adios(). */
static void
enter_getfld (m_getfld_state_t *gstate, FILE *iob)
{
    m_getfld_state_t s;
    off_t pos;
    off_t pos_movement;

    if (! *gstate) {
        *gstate = m_getfld_state_init(iob);
    }
    s = *gstate;
    s->bytes_read = 0;

    /* This is ugly and no longer necessary, but is retained just in
       case it's needed again.  The parser used to open the input file
       multiple times, so we had to always use the FILE * that's
       passed to m_getfld().  Now the parser inits a new
       m_getfld_state for each file.  See comment below about the
       readpos shift code being currently unused. */
    s->iob = iob;

    /* Without tracking, the stream offset is never inspected or moved
       here. */
    if (!s->track_filepos)
        return;

    if ((pos = ftello(iob)) == -1)
        adios("getfld's iob", "failed to get offset on entry");
    /* Stream is at offset 0 and no internal offset has been recorded:
       nothing to reconcile. */
    if (pos == 0 && s->last_internal_pos == 0)
        return;

    /* No internal offset recorded but the stream is not at 0: adopt
       the stream's offset as the count of bytes already consumed. */
    if (s->last_internal_pos == 0) {
        s->total_bytes_read = pos;
        return;
    }

    pos_movement = pos - s->last_caller_pos; /* Can be < 0. */
    if (pos_movement == 0) {
        /* The stream is where it was left for the caller, so go back
           to the offset up to which msg_buf was filled. */
        pos = s->last_internal_pos;
    } else {
        /* The current file stream position differs from the
           last one, so the caller must have repositioned the
           stream, e.g. with fseek/o().
           Or, this is the first call and the file position
           was not at 0. */

        if (s->readpos + pos_movement >= s->msg_buf  &&
            s->readpos + pos_movement < s->end) {
            /* This is currently unused.  It could be used by
               parse_mime() if it was changed to use a global
               m_getfld_state. */
            /* We can shift readpos and remain within the
               bounds of msg_buf. */
            s->readpos += pos_movement;
            s->total_bytes_read += pos_movement;
            pos = s->last_internal_pos;
        } else {
            off_t off;
            size_t num_read;

            /* This seek skips past an integral number of
               chunks of size MSG_INPUT_SIZE. */
            off = pos / MSG_INPUT_SIZE * MSG_INPUT_SIZE;
            if (fseeko(iob, off, SEEK_SET) == -1)
                adios("getfld's iob", "failed to set offset to skip: "
                    "%" PRIdMAX, (intmax_t)off);
            /* Reload msg_buf with the chunk containing pos and point
               readpos at pos's offset within that chunk.
               NOTE(review): if fread() returns fewer than
               pos % MSG_INPUT_SIZE bytes, readpos ends up beyond
               end -- confirm this cannot happen in practice.
               NOTE(review): pos is unchanged in this branch, so the
               fseeko() after the if/else leaves iob at pos rather
               than at off + num_read -- confirm that is intended. */
            num_read = fread (s->msg_buf, 1, MSG_INPUT_SIZE, iob);
            s->readpos = s->msg_buf  +  pos % MSG_INPUT_SIZE;
            s->end = s->msg_buf + num_read;
            s->total_bytes_read = pos;
        }
    }

    /* Leave the stream at the offset chosen above. */
    if (fseeko(iob, pos, SEEK_SET) == -1)
        adios("getfld's iob", "failed to set offset on entry: %" PRIdMAX,
            (intmax_t)pos);
}
 484
 485 static void
 486 leave_getfld (m_getfld_state_t s)
 487 {
 488     s->total_bytes_read += s->bytes_read;
 489
 490     if (s->track_filepos) {
 491         /* Save the internal file position that we use for the input buffer. */
 492         if ((s->last_internal_pos = ftello(s->iob)) == -1)
 493             adios("getfld's iob", "failed to get offset before seek");
 494
 495         /* Set file stream position so that callers can use ftell(). */
 496         if (fseeko(s->iob, s->total_bytes_read, SEEK_SET) == -1)
 497             adios("getfld's iob", "failed to set offset: %" PRIdMAX,
 498                 (intmax_t)s->total_bytes_read);
 499
 500         s->last_caller_pos = s->total_bytes_read;
 501     }
 502 }
 503
 504 static size_t
 505 read_more (m_getfld_state_t s)
 506 {
 507     /* Retain at least edelimlen characters that have already been read,
 508        if at least edelimlen have been read, so that we can back up to them
 509        in m_Eom(). */
 510     ssize_t retain = s->end - s->msg_buf < s->edelimlen ? 0 : s->edelimlen;
 511     size_t num_read;
 512
 513     if (retain > 0) {
 514         if (retain < s->end - s->readpos)
 515             retain = s->end - s->readpos;
 516         assert (retain <= s->readpos - s->msg_buf);
 517
 518         /* Move what we want to retain at end of the buffer to the beginning. */
 519         memmove (s->msg_buf, s->readpos - retain, retain);
 520     }
 521
 522     s->readpos = s->msg_buf + retain;
 523     num_read = fread (s->readpos, 1, MSG_INPUT_SIZE, s->iob);
 524     s->end = s->readpos + num_read;
 525
 526     return num_read;
 527 }
 528
 529 /* Return the next character consumed from the input, fetching more of
 530  * the input for the buffer if required, or EOF on end of file. */
 531 static int
 532 Getc (m_getfld_state_t s)
 533 {
 534     if ((s->end - s->readpos < 1 && read_more (s) == 0) ||
 535         s->readpos >= s->end)
 536         return EOF;
 537
 538     s->bytes_read++;
 539     return (unsigned char)*s->readpos++;
 540 }
 541
 542 /* Return the next character that Getc() would return, which may be EOF. */
 543 static int
 544 Peek (m_getfld_state_t s)
 545 {
 546     int c;
 547
 548     c = Getc(s);
 549     if (c != EOF)
 550         Ungetc(s);
 551
 552     return c;
 553 }
 554
 555 /* If there's room, undo the consumption of one character from msg_buf,
 556  * rewinding so it's read next, else die. */
 557 static void
 558 Ungetc(m_getfld_state_t s)
 559 {
 560     if (s->readpos == s->msg_buf)
 561         die("Ungetc() at start of message buffer.");
 562
 563     s->readpos--;
 564     s->bytes_read--;
 565 }
 566
 567
 568 int
 569 m_getfld (m_getfld_state_t *gstate, char name[NAMESZ], char *buf, int *bufsz,
 570           FILE *iob)
 571 {
 572     m_getfld_state_t s;
 573     char *cp;
 574     int max, n, c;
 575
 576     enter_getfld (gstate, iob);
 577     s = *gstate;
 578
 579     if ((c = Getc(s)) == EOF) {
 580         *bufsz = *buf = 0;
 581         leave_getfld (s);
 582         return s->state = FILEEOF;
 583     }
 584     if (eom (c, s)) {
 585         /* flush null messages */
 586         while ((c = Getc(s)) != EOF && eom (c, s))
 587             ;
 588
 589         if (c != EOF)
 590             Ungetc(s);
 591         *bufsz = *buf = 0;
 592         leave_getfld (s);
 593         return s->state = FILEEOF;
 594     }
 595
 596     switch (s->state) {
 597         case FLD:
 598             if (c == '\n' || c == '-') {
 599                 /* we hit the header/body separator */
 600                 while (c != '\n' && (c = Getc(s)) != EOF)
 601                     ;
 602
 603                 if (c == EOF || (c = Getc(s)) == EOF || eom (c, s)) {
 604                     /* flush null messages */
 605                     while ((c = Getc(s)) != EOF && eom (c, s))
 606                         ;
 607                     if (c != EOF)
 608                         Ungetc(s);
 609                     *bufsz = *buf = 0;
 610                     leave_getfld (s);
 611                     return s->state = FILEEOF;
 612                 }
 613                 s->state = BODY;
 614                 goto body;
 615             }
 616             /*
 617              * get the name of this component.  take characters up
 618              * to a ':', a newline or NAMESZ-1 characters, whichever
 619              * comes first.
 620              */
 621             cp = name;
 622             max = NAMESZ - 1;
 623             /* Get the field name.  The first time through the loop,
 624                this copies out the first character, which was loaded
 625                into c prior to loop entry.  Initialize n to 1 to
 626                account for that. */
 627             for (n = 1;
 628                  c != ':'  &&  c != '\n'  &&  c != EOF  &&  n < max;
 629                  ++n, c = Getc (s)) {
 630                 *cp++ = c;
 631             }
 632
 633             /* Check for next character, which is either the space after
 634                the ':' or the first folded whitespace. */
 635             {
 636                 int next_char;
 637                 if (c == EOF  ||  (next_char = Peek (s)) == EOF) {
 638                     *bufsz = *cp = *buf = 0;
 639                     inform("eof encountered in field \"%s\"", name);
 640                     leave_getfld (s);
 641                     return s->state = FMTERR;
 642                 }
 643             }
 644
 645             /* If c isn't ':' here, something went wrong.  Possibilities are:
 646              *  . hit a newline (error)
 647              *  . got more than namesz chars. (error)
 648              */
 649             if (c == ':') {
 650                 /* Finished header name, fall through to FLDPLUS below. */
 651             } else if (c == '\n') {
 652                 /* We hit the end of the line without seeing ':' to
 653                  * terminate the field name.  This is usually (always?)
 654                  * spam.  But, blowing up is lame, especially when
 655                  * scan(1)ing a folder with such messages.  Pretend such
 656                  * lines are the first of the body (at least mutt also
 657                  * handles it this way). */
 658
 659                 /* See if buf can hold this line, since we were assuming
 660                  * we had a buffer of NAMESZ, not bufsz. */
 661                 /* + 1 for the newline */
 662                 if (*bufsz < n + 1) {
 663                     /* No, it can't.  Oh well, guess we'll blow up. */
 664                     *bufsz = *cp = *buf = 0;
 665                     inform("eol encountered in field \"%s\"", name);
 666                     s->state = FMTERR;
 667                     break;
 668                 }
 669                 memcpy (buf, name, n - 1);
 670                 buf[n - 1] = '\n';
 671                 buf[n] = '\0';
 672                 /* Indicate this wasn't a header field using a character
 673                    that can't appear in a header field. */
 674                 name[0] = ':';
 675                 /* The last character read was '\n'.  s->bytes_read
 676                    (and n) include that, but it was not put into the
 677                    name array in the for loop above.  So subtract 1. */
 678                 *bufsz = --s->bytes_read;  /* == n - 1 */
 679                 leave_getfld (s);
 680                 return s->state = BODY;
 681             }
 682             if (max <= n) {
 683                 /* By design, the loop above discards the last character
 684                    it had read.  It's in c, use it. */
 685                 *cp++ = c;
 686                 *bufsz = *cp = *buf = 0;
 687                 inform("field name \"%s\" exceeds %d bytes", name,
 688                         NAMESZ - 2);
 689                 s->state = LENERR;
 690                 break;
 691             }
 692
 693             /* Trim any trailing spaces from the end of name. */
 694             while (isspace ((unsigned char) *--cp) && cp >= name) continue;
 695             *++cp = 0;
 696             /* readpos points to the first character of the field body. */
 697             /* FALLTHRU */
 698
 699         case FLDPLUS: {
 700             /*
 701              * get (more of) the text of a field.  Take
 702              * characters up to the end of this field (newline
 703              * followed by non-blank) or bufsz-1 characters.
 704              */
 705             cp = buf;
 706             max = *bufsz-1;
 707             n = 0;
 708             for (bool finished = false; !finished; ) {
 709                 while (c != '\n'  &&  c != EOF  &&  n++ < max) {
 710                     if ((c = Getc (s)) != EOF)
 711                         *cp++ = c;
 712                 }
 713
 714                 if (c != EOF)
 715                     c = Peek (s);
 716                 if (max < n) {
 717                     /* The dest buffer is full.  Need to back the read
 718                        pointer up by one because when m_getfld() is
 719                        reentered, it will read a character.  Then
 720                        we'll jump right to the FLDPLUS handling code,
 721                        which will not store that character, but
 722                        instead move on to the next one. */
 723                     if (s->readpos > s->msg_buf) {
 724                         --s->readpos;
 725                         --s->bytes_read;
 726                     }
 727                     s->state = FLDPLUS;
 728                     finished = true;
 729                 } else if (c != ' '  &&  c != '\t') {
 730                     /* The next character is not folded whitespace, so
 731                        prepare to move on to the next field.  It's OK
 732                        if c is EOF, it will be handled on the next
 733                        call to m_getfld (). */
 734                     s->state = FLD;
 735                     finished = true;
 736                 } else {
 737                     /* Folded header field, continues on the next line. */
 738                 }
 739             }
 740             *bufsz = s->bytes_read;
 741             break;
 742         }
 743
 744         body:
 745         case BODY: {
 746             /*
 747              * get the message body up to bufsz characters or the
 748              * end of the message.
 749              */
 750             char *bp;
 751
 752             name[0] = '\0';
 753             max = *bufsz-1;
 754             /* Back up and store the current position. */
 755             bp = --s->readpos;
 756             c = min(s->end - s->readpos, max);
 757             if (s->msg_style != MS_DEFAULT && c > 1) {
 758                 /*
 759                  * packed maildrop - only take up to the (possible)
 760                  * start of the next message.  This "matchc" should
 761                  * probably be a Boyer-Moore matcher for non-vaxen,
 762                  * particularly since we have the alignment table
 763                  * all built for the end-of-buffer test (next).
 764                  * But our vax timings indicate that the "matchc"
 765                  * instruction is 50% faster than a carefully coded
 766                  * B.M. matcher for most strings.  (So much for elegant
 767                  * algorithms vs. brute force.)  Since I (currently)
 768                  * run MH on a vax, we use the matchc instruction. --vj
 769                  */
 770                 char *ep;
 771
 772                 if ((ep = memmem(bp, c, s->fdelim, s->fdelimlen)))
 773                     /* Plus one to nab the '\n' that starts fdelim as
 774                      * that ends the previous line;  it isn't part of
 775                      * msg_delim. */
 776                     c = ep - bp + 1;
 777                 else {
 778                     /*
 779                      * There's no delim in the buffer but there may be
 780                      * a partial one at the end.  If so, we want to leave
 781                      * it so the "eom" check on the next call picks it up.
 782                      * Use a modified Boyer-Moore matcher to make this
 783                      * check relatively cheap.  The first "if" figures
 784                      * out what position in the pattern matches the last
 785                      * character in the buffer.  The inner "while" matches
 786                      * the pattern against the buffer, backwards starting
 787                      * at that position.  Note that unless the buffer
 788                      * ends with one of the characters in the pattern
 789                      * (excluding the first and last), we do only one test.
 790                      */
 791                     char *sp;
 792
 793                     ep = bp + c - 1; /* The last byte. */
 794                     if ((sp = s->pat_map[(unsigned char) *ep])) {
 795                         do {
 796                             /* This if() is true unless (a) the buffer is too
 797                              * small to contain this delimiter prefix, or
 798                              * (b) it contains exactly enough chars for the
 799                              * delimiter prefix.
 800                              * For case (a) obviously we aren't going to match.
 801                              * For case (b), if the buffer really contained exactly
 802                              * a delim prefix, then the m_eom call at entry
 803                              * should have found it.  Thus it's not a delim
 804                              * and we know we won't get a match.
 805                              */
 806                             if (((sp - s->fdelim) + 2) <= c) {
 807                                 cp = sp;
 808                                 /* Unfortunately although fdelim has a preceding NUL
 809                                  * we can't use this as a sentinel in case the buffer
 810                                  * contains a NUL in exactly the wrong place (this
 811                                  * would cause us to run off the front of fdelim).
 812                                  */
 813                                 while (*--ep == *--cp)
 814                                     if (cp < s->fdelim)
 815                                         break;
 816                                 if (cp < s->fdelim) {
 817                                     /* we matched the entire delim prefix,
 818                                      * so only take the buffer up to there.
 819                                      * we know ep >= bp -- check above prevents underrun
 820                                      */
 821                                     c = (ep - bp) + 2;
 822                                     break;
 823                                 }
 824                             }
 825                             /* try matching one less char of delim string */
 826                             ep = bp + c - 1;
 827                         } while (--sp > s->fdelim);
 828                     }
 829                 }
 830             }
 831             memcpy( buf, bp, c );
 832             /* Advance the current position to reflect the copy out.
 833                c is less than or equal to the number of bytes remaining
 834                in the read buffer, so will not overrun it. */
 835             s->readpos += c;
 836             cp = buf + c;
 837             /* Subtract 1 from c because the first character was read by
 838                Getc(), and therefore already accounted for in s->bytes_read. */
 839             s->bytes_read += c - 1;
 840             *bufsz = s->bytes_read;
 841             break;
 842         }
 843
 844         default:
 845             die("m_getfld() called with bogus state of %d", s->state);
 846     }
 847
 848     *cp = 0;
 849     leave_getfld (s);
 850
 851     return s->state;
 852 }
 853
 854
 855 /* m_getfld() with the existing iob. */
 856 int
 857 m_getfld2(m_getfld_state_t *gstate, char name[NAMESZ], char *buf, int *bufsz)
 858 {
 859     if (!*gstate)
 860         die("m_getfld2 without gstate");
 861
 862     return m_getfld(gstate, name, buf, bufsz, (*gstate)->iob);
 863 }
 864
 865
 866 void
 867 m_unknown(m_getfld_state_t *gstate, FILE *iob)
 868 {
 869     m_getfld_state_t s;
 870     int c;
 871     char text[MAX_DELIMITER_SIZE];
 872     char from[] = "From ";
 873     char *cp;
 874     char *delimstr;
 875     unsigned int i;
 876
 877     enter_getfld (gstate, iob);
 878     s = *gstate;
 879
 880 /*
 881  * Figure out what the message delimiter string is for this
 882  * maildrop.  (This used to be part of m_Eom but I didn't like
 883  * the idea of an "if" statement that could only succeed on the
 884  * first call to m_Eom getting executed on each call, i.e., at
 885  * every newline in the message).
 886  *
 887  * If the first line of the maildrop is a Unix "From " line, we
 888  * say the style is MBOX and eat the rest of the line.  Otherwise
 889  * we say the style is MMDF and look for the delimiter string
 890  * specified when nmh was built (or from the mts.conf file).
 891  */
 892
 893     s->msg_style = MS_UNKNOWN;
 894
 895     for (i = 0, cp = text; i < sizeof text; ++i, ++cp) {
 896         if ((c = Getc (s)) == EOF) {
 897             *cp = '\0';
 898             break;
 899         }
 900         *cp = c;
 901     }
 902
 903     if (i == sizeof from-1  &&  strncmp (text, "From ", sizeof from-1) == 0) {
 904         s->msg_style = MS_MBOX;
 905         delimstr = "\nFrom ";
 906         while ((c = Getc(s)) != EOF && c != '\n')
 907             ;
 908     } else {
 909         /* not a Unix style maildrop */
 910         s->readpos -= s->bytes_read;
 911         s->bytes_read = 0;
 912         delimstr = MMDF_DELIM;
 913         s->msg_style = MS_MMDF;
 914     }
 915
 916     /*     "\nFrom \0"   7                  "\001\001\001\001\n\0"  6
 917      *       |                                  |
 918      *       delimstr   c=6                     delimstr   c=5
 919      */
 920     c = strlen (delimstr);
 921     s->fdelim = mh_xmalloc (c + 3); /* \0, \n, delimstr, \0 */
 922     *s->fdelim++ = '\0';
 923     *s->fdelim = '\n';
 924     s->fdelimlen = c + 1;
 925     s->msg_delim = s->fdelim+1;
 926     strcpy (s->msg_delim, delimstr);
 927     s->edelim = s->msg_delim+1;
 928     s->edelimlen = c - 1;
 929     s->delimend = s->msg_delim + s->edelimlen;
 930     if (s->edelimlen <= 1)
 931         die("maildrop delimiter must be at least 2 bytes");
 932
 933     /*
 934      * build a Boyer-Moore end-position map for the matcher in m_getfld.
 935      * N.B. - we don't match just the first char (since it's the newline
 936      * separator) or the last char (since the matchc would have found it
 937      * if it was a real delim).
 938      */
 939     s->pat_map = mh_xcalloc (256, sizeof(char *));
 940
 941     for (cp = s->fdelim + 1; cp < s->delimend; cp++ )
 942         s->pat_map[(unsigned char)*cp] = cp;
 943
 944     if (s->msg_style == MS_MMDF) {
 945         /* flush extra msg hdrs */
 946         while ((c = Getc(s)) != EOF && eom (c, s))
 947             ;
 948         if (c != EOF)
 949             Ungetc(s);
 950     }
 951
 952     leave_getfld (s);
 953 }
 954
 955
 956 /* m_unknown() with the existing iob. */
 957 void
 958 m_unknown2(m_getfld_state_t *gstate)
 959 {
 960     if (!*gstate)
 961         die("m_unknown2 without gstate");
 962
 963     m_unknown(gstate, (*gstate)->iob);
 964 }
 965
 966
 967 /*
 968  * test for msg delimiter string
 969  */
 970
 971 static int
 972 m_Eom (m_getfld_state_t s)
 973 {
 974     int i;
 975     char text[MAX_DELIMITER_SIZE];
 976     char *cp;
 977     int adjust = 1;
 978
 979     for (i = 0, cp = text; i < s->edelimlen; ++i, ++cp) {
 980         int c2;
 981
 982         if ((c2 = Getc (s)) == EOF) {
 983             *cp = '\0';
 984             break;
 985         }
 986         *cp = c2;
 987     }
 988
 989     if (i != s->edelimlen  ||
 990         strncmp (text, (char *)s->edelim, s->edelimlen)) {
 991         if (i == 0 && s->msg_style == MS_MBOX) {
 992             /* the final newline in the (brain damaged) unix-format
 993              * maildrop is part of the delimiter - delete it.
 994              */
 995             return 1;
 996         }
 997
 998         if (i <= 2  &&  s->msg_style == MS_MBOX  &&
 999             i != s->edelimlen  &&  ! strncmp(text, s->fdelim, i)) {
1000             /* If all or part of fdelim appeared at the end of the file,
1001                back up even more so that the bytes are included in the
1002                message. */
1003             adjust = 2;
1004         }
1005
1006         /* Did not find delimiter, so restore the read position.
1007            Note that on input, a character had already been read
1008            with Getc().  It will be unget by m_getfld () on return. */
1009         s->readpos -= s->bytes_read - adjust;
1010         s->bytes_read = adjust;
1011         return 0;
1012     }
1013
1014     if (s->msg_style == MS_MBOX) {
1015         int c;
1016         while ((c = Getc(s)) != EOF && c != '\n')
1017             ;
1018     }
1019
1020     return 1;
1021 }