-
-/*
- * m_getfld.c -- read/parse a message
+/* m_getfld.c -- read/parse a message
*
* This code is Copyright (c) 2002, by the authors of nmh. See the
* COPYRIGHT file in the root directory of the nmh distribution for
be parsed as well. Unfortunately the speed issue finally caught up with
us since this routine is at the very heart of MH.
- To speed things up considerably, the routine Eom() was made an auxilary
+ To speed things up considerably, the routine Eom() was made an auxiliary
function called by the macro eom(). Unless we are bursting a maildrop,
the eom() macro returns FALSE saying we aren't at the end of the
message.
names are typically short (~8 char) and the loop that extracts them
might terminate on a colon, newline or max width. I considered
using a Vax "scanc" to locate the end of the field followed by a
- "bcopy" but the routine call overhead on a Vax is too large for this
+ "memmove" but the routine call overhead on a Vax is too large for this
to work on short names. If Berkeley ever makes "inline" part of the
C optimiser (so things like "scanc" turn into inline instructions) a
change here would be worthwhile.
so message bodies average at least a few hundred characters.
Assuming your system uses reasonably sized stdio buffers (1K or
more), this routine should be able to remove the body in large
- (>500 byte) chunks. The makes the cost of a call to "bcopy"
+ (>500 byte) chunks. This makes the cost of a call to "memmove"
small but there is a premium on checking for the eom in packed
maildrops. The eom pattern is always a simple string so we can
construct an efficient pattern matcher for it (e.g., a Vax "matchc"
/*
* static prototypes
*/
-struct m_getfld_state;
static int m_Eom (m_getfld_state_t);
-static char *matchc(int, char *, int, char *);
#define eom(c,s) (s->msg_style != MS_DEFAULT && \
((c) == *s->msg_delim && m_Eom(s)))
char *end; /* One past the last character read in. */
/* The following support tracking of the read position in the
input file stream so that callers can interleave m_getfld()
- calls with ftell() and fseek(). ytes_read replaces the old
+ calls with ftell() and fseek(). bytes_read replaces the old
m_getfld() msg_count global. last_caller_pos is stored when
leaving m_getfld()/m_unknown(), then checked on the next entry.
last_internal_pos is used to remember the position used
* The "full" delimiter string for a packed maildrop consists
* of a newline followed by the actual delimiter. E.g., the
* full string for a Unix maildrop would be: "\n\nFrom ".
- * "Fdelim" points to the start of the full string and is used
+ * "fdelim" points to the start of the full string and is used
* in the BODY case of the main routine to search the buffer for
* a possible eom. Msg_delim points to the first character of
- * the actual delim. string (i.e., fdelim+1). Edelim
+ * the actual delim. string (i.e., fdelim+1). edelim
* points to the 2nd character of actual delimiter string. It
* is used in m_Eom because the first character of the string
* has been read and matched before m_Eom is called.
m_getfld_state_init (m_getfld_state_t *gstate, FILE *iob) {
m_getfld_state_t s;
- s = *gstate = (m_getfld_state_t) mh_xmalloc(sizeof (struct m_getfld_state));
+ NEW(s);
+ *gstate = s;
s->readpos = s->end = s->msg_buf;
s->bytes_read = s->total_bytes_read = 0;
s->last_caller_pos = s->last_internal_pos = 0;
static void
enter_getfld (m_getfld_state_t *gstate, FILE *iob) {
m_getfld_state_t s;
- off_t pos = ftello (iob);
+ off_t pos;
+ off_t pos_movement;
if (! *gstate) {
m_getfld_state_init (gstate, iob);
readpos shift code being currently unused. */
s->iob = iob;
- if (s->track_filepos && (pos != 0 || s->last_internal_pos != 0)) {
- if (s->last_internal_pos == 0) {
- s->total_bytes_read = pos;
- } else {
- off_t pos_movement = pos - s->last_caller_pos; /* Can be < 0. */
-
- if (pos_movement == 0) {
- pos = s->last_internal_pos;
- } else {
- /* The current file stream position differs from the
- last one, so caller must have called ftell/o().
- Or, this is the first call and the file position
- was not at 0. */
-
- if (s->readpos + pos_movement >= s->msg_buf &&
- s->readpos + pos_movement < s->end) {
- /* This is currently unused. It could be used by
- parse_mime() if it was changed to use a global
- m_getfld_state. */
- /* We can shift readpos and remain within the
- bounds of msg_buf. */
- s->readpos += pos_movement;
- s->total_bytes_read += pos_movement;
- pos = s->last_internal_pos;
- } else {
- size_t num_read;
-
- /* This seek skips past an integral number of
- chunks of size MSG_INPUT_SIZE. */
- fseeko (iob, pos/MSG_INPUT_SIZE * MSG_INPUT_SIZE, SEEK_SET);
- num_read = fread (s->msg_buf, 1, MSG_INPUT_SIZE, iob);
- s->readpos = s->msg_buf + pos % MSG_INPUT_SIZE;
- s->end = s->msg_buf + num_read;
- s->total_bytes_read = pos;
- }
- }
+ if (!s->track_filepos)
+ return;
- fseeko (iob, pos, SEEK_SET);
- }
+ pos = ftello(iob);
+ if (pos == 0 && s->last_internal_pos == 0)
+ return;
+
+ if (s->last_internal_pos == 0) {
+ s->total_bytes_read = pos;
+ return;
}
+
+ pos_movement = pos - s->last_caller_pos; /* Can be < 0. */
+ if (pos_movement == 0) {
+ pos = s->last_internal_pos;
+ } else {
+ /* The current file stream position differs from the
+ last one, so the caller must have moved it, e.g. with fseek/o().
+ Or, this is the first call and the file position
+ was not at 0. */
+
+ if (s->readpos + pos_movement >= s->msg_buf &&
+ s->readpos + pos_movement < s->end) {
+ /* This is currently unused. It could be used by
+ parse_mime() if it was changed to use a global
+ m_getfld_state. */
+ /* We can shift readpos and remain within the
+ bounds of msg_buf. */
+ s->readpos += pos_movement;
+ s->total_bytes_read += pos_movement;
+ pos = s->last_internal_pos;
+ } else {
+ size_t num_read;
+
+ /* This seek skips past an integral number of
+ chunks of size MSG_INPUT_SIZE. */
+ fseeko (iob, pos/MSG_INPUT_SIZE * MSG_INPUT_SIZE, SEEK_SET);
+ num_read = fread (s->msg_buf, 1, MSG_INPUT_SIZE, iob);
+ s->readpos = s->msg_buf + pos % MSG_INPUT_SIZE;
+ s->end = s->msg_buf + num_read;
+ s->total_bytes_read = pos;
+ }
+ }
+
+ fseeko (iob, pos, SEEK_SET);
}
static void
ssize_t retain = s->edelimlen;
size_t num_read;
- if (retain < s->end - s->readpos) retain = s->end - s->readpos;
+ if (retain < s->end - s->readpos)
+ retain = s->end - s->readpos;
assert (retain <= s->readpos - s->msg_buf);
/* Move what we want to retain at end of the buffer to the beginning. */
but EOF is typically 0xffffffff. */
static int
Getc (m_getfld_state_t s) {
- if (s->end - s->readpos < 1) {
- if (read_more (s) == 0) {
- /* Pretend that we read a character. That's what stdio does. */
- ++s->readpos;
- return EOF;
- }
- }
+ if ((s->end - s->readpos < 1 && read_more (s) == 0) || /* nothing unread in msg_buf and read_more() returned 0 */
+ s->readpos >= s->end) /* or still nothing unread even after a refill attempt */
+ return EOF; /* neither readpos nor bytes_read is touched by Getc on this path */
- ++s->bytes_read;
- return s->readpos < s->end ? (unsigned char) *s->readpos++ : EOF;
+ s->bytes_read++; /* count the byte that is about to be consumed */
+ return (unsigned char)*s->readpos++; /* unsigned char cast: the returned byte value is never negative */
}
static int
Peek (m_getfld_state_t s) {
- if (s->end - s->readpos < 1) {
- if (read_more (s) == 0) {
- /* Pretend that we read a character. That's what stdio does. */
- ++s->readpos;
- return EOF;
- }
+ if (s->end - s->readpos < 1 && read_more (s) == 0) { /* nothing unread in msg_buf and read_more() returned 0 */
+ return EOF; /* Peek itself leaves readpos where it was */
}
-
return s->readpos < s->end ? (unsigned char) *s->readpos : EOF;
}
Ungetc (int c, m_getfld_state_t s) {
if (s->readpos == s->msg_buf) {
return EOF;
- } else {
- --s->bytes_read;
- return *--s->readpos = (unsigned char) c;
}
+ --s->bytes_read;
+ return *--s->readpos = (unsigned char) c;
}
FILE *iob)
{
m_getfld_state_t s;
- register char *cp;
- register int max, n, c;
+ char *cp;
+ int max, n, c;
enter_getfld (gstate, iob);
s = *gstate;
- if ((c = Getc(s)) < 0) {
+ if ((c = Getc(s)) == EOF) {
*bufsz = *buf = 0;
leave_getfld (s);
return s->state = FILEEOF;
}
if (eom (c, s)) {
/* flush null messages */
- while ((c = Getc(s)) >= 0 && eom (c, s))
+ while ((c = Getc(s)) != EOF && eom (c, s))
;
- if (c >= 0)
+ if (c != EOF)
Ungetc(c, s);
*bufsz = *buf = 0;
leave_getfld (s);
case FLD:
if (c == '\n' || c == '-') {
/* we hit the header/body separator */
- while (c != '\n' && (c = Getc(s)) >= 0) continue;
+ while (c != '\n' && (c = Getc(s)) != EOF)
+ ;
- if (c < 0 || (c = Getc(s)) < 0 || eom (c, s)) {
+ if (c == EOF || (c = Getc(s)) == EOF || eom (c, s)) {
/* flush null messages */
- while ((c = Getc(s)) >= 0 && eom (c, s))
+ while ((c = Getc(s)) != EOF && eom (c, s))
;
- if (c >= 0)
+ if (c != EOF)
Ungetc(c, s);
*bufsz = *buf = 0;
leave_getfld (s);
int next_char;
if (c == EOF || (next_char = Peek (s)) == EOF) {
*bufsz = *cp = *buf = 0;
- advise (NULL, "eof encountered in field \"%s\"", name);
+ inform("eof encountered in field \"%s\"", name);
leave_getfld (s);
return s->state = FMTERR;
}
if (*bufsz < n + 1) {
/* No, it can't. Oh well, guess we'll blow up. */
*bufsz = *cp = *buf = 0;
- advise (NULL, "eol encountered in field \"%s\"", name);
+ inform("eol encountered in field \"%s\"", name);
s->state = FMTERR;
break;
}
memcpy (buf, name, n - 1);
buf[n - 1] = '\n';
buf[n] = '\0';
+ /* Indicate this wasn't a header field using a character
+ that can't appear in a header field. */
+ name[0] = ':';
/* The last character read was '\n'. s->bytes_read
(and n) include that, but it was not put into the
name array in the for loop above. So subtract 1. */
*bufsz = --s->bytes_read; /* == n - 1 */
leave_getfld (s);
return s->state = BODY;
- } else if (max <= n) {
+ }
+ if (max <= n) {
/* By design, the loop above discards the last character
it had read. It's in c, use it. */
*cp++ = c;
*bufsz = *cp = *buf = 0;
- advise (NULL, "field name \"%s\" exceeds %d bytes", name,
+ inform("field name \"%s\" exceeds %d bytes", name,
NAMESZ - 2);
s->state = LENERR;
break;
while (isspace ((unsigned char) *--cp) && cp >= name) continue;
*++cp = 0;
/* readpos points to the first character of the field body. */
- /* fall through */
+ /* FALLTHRU */
case FLDPLUS: {
/*
n = 0;
for (finished = 0; ! finished; ) {
while (c != '\n' && c != EOF && n++ < max) {
- if ((c = Getc (s)) != EOF) { *cp++ = c; }
+ if ((c = Getc (s)) != EOF)
+ *cp++ = c;
}
- if (c != EOF) c = Peek (s);
+ if (c != EOF)
+ c = Peek (s);
if (max < n) {
/* The dest buffer is full. Need to back the read
pointer up by one because when m_getfld() is
*/
char *bp;
+ name[0] = '\0';
max = *bufsz-1;
/* Back up and store the current position. */
bp = --s->readpos;
- c = s->end - s->readpos < max ? s->end - s->readpos : max;
+ c = min(s->end - s->readpos, max);
if (s->msg_style != MS_DEFAULT && c > 1) {
/*
* packed maildrop - only take up to the (possible)
*/
char *ep;
- if ((ep = matchc( s->fdelimlen, s->fdelim, c, bp )))
+ if ((ep = memmem(bp, c, s->fdelim, s->fdelimlen)))
c = ep - bp + 1;
else {
/*
m_unknown(m_getfld_state_t *gstate, FILE *iob)
{
m_getfld_state_t s;
- register int c;
+ int c;
char text[MAX_DELIMITER_SIZE];
char from[] = "From ";
- register char *cp;
- register char *delimstr;
+ char *cp;
+ char *delimstr;
unsigned int i;
enter_getfld (gstate, iob);
s = *gstate;
/*
- * Figure out what the message delimitter string is for this
+ * Figure out what the message delimiter string is for this
* maildrop. (This used to be part of m_Eom but I didn't like
* the idea of an "if" statement that could only succeed on the
* first call to m_Eom getting executed on each call, i.e., at
if ((c = Getc (s)) == EOF) {
*cp = '\0';
break;
- } else {
- *cp = c;
}
+ *cp = c;
}
if (i == sizeof from-1 && strncmp (text, "From ", sizeof from-1) == 0) {
s->msg_style = MS_MBOX;
delimstr = "\nFrom ";
- while ((c = Getc (s)) != '\n' && c >= 0) continue;
+ while ((c = Getc(s)) != EOF && c != '\n')
+ ;
} else {
/* not a Unix style maildrop */
s->readpos -= s->bytes_read;
delimstr = mmdlm2;
s->msg_style = MS_MMDF;
}
+
c = strlen (delimstr);
- s->fdelim = mh_xmalloc (c + 3);
+ s->fdelim = mh_xmalloc (c + 3); /* \0, \n, delimstr, \0 */
*s->fdelim++ = '\0';
*s->fdelim = '\n';
- s->msg_delim = s->fdelim+1;
- s->edelim = s->msg_delim+1;
s->fdelimlen = c + 1;
- s->edelimlen = c - 1; /* == strlen (delimstr) */
+ s->msg_delim = s->fdelim+1;
strcpy (s->msg_delim, delimstr);
+ s->edelim = s->msg_delim+1;
+ s->edelimlen = c - 1;
s->delimend = s->msg_delim + s->edelimlen;
if (s->edelimlen <= 1)
adios (NULL, "maildrop delimiter must be at least 2 bytes");
+
+ /* The malloc'd block that starts at s->fdelim-1 now holds a copy of
+ * the delimstr string constant, and several pointers refer into it.
+ * Left: MS_MBOX.  Right: MS_MMDF.  Each marker sits under the last
+ * character of the escape sequence (or the character) it points to.
+ *
+ *     "\nFrom \0"   7                       "\001\001\001\001\n\0"   6
+ *       |                                       |
+ *       delimstr   c=6                          delimstr   c=5
+ *
+ *     "\0\n\nFrom \0"   9                   "\0\n\001\001\001\001\n\0"   8
+ *         | ||   |                              |   |   |         |
+ *         | ||   s->delimend                    |   |   |         s->delimend
+ *         | ||                                  |   |   |
+ *         | |s->edelim  s->edelimlen=5          |   |   s->edelim  s->edelimlen=4
+ *         | |                                   |   |
+ *         | s->msg_delim                        |   s->msg_delim
+ *         |                                     |
+ *         s->fdelim  s->fdelimlen=7             s->fdelim  s->fdelimlen=6
+ */
+
/*
* build a Boyer-Moore end-position map for the matcher in m_getfld.
* N.B. - we don't match just the first char (since it's the newline
if (s->msg_style == MS_MMDF) {
/* flush extra msg hdrs */
- while ((c = Getc(s)) >= 0 && eom (c, s))
+ while ((c = Getc(s)) != EOF && eom (c, s))
;
- if (c >= 0)
+ if (c != EOF)
Ungetc(c, s);
}
static int
m_Eom (m_getfld_state_t s)
{
- register int i;
+ int i;
char text[MAX_DELIMITER_SIZE];
char *cp;
if ((c2 = Getc (s)) == EOF) {
*cp = '\0';
break;
- } else {
- *cp = c2;
}
+ *cp = c2;
}
if (i != s->edelimlen ||
strncmp (text, (char *)s->edelim, s->edelimlen)) {
if (i == 0 && s->msg_style == MS_MBOX)
/* the final newline in the (brain damaged) unix-format
- * maildrop is part of the delimitter - delete it.
+ * maildrop is part of the delimiter - delete it.
*/
return 1;
if (s->msg_style == MS_MBOX) {
int c;
- while ((c = Getc (s)) != '\n')
- if (c < 0)
- break;
+ while ((c = Getc(s)) != EOF && c != '\n')
+ ;
}
return 1;
}
-
-
-static char *
-matchc(int patln, char *pat, int strln, char *str)
-{
- register char *es = str + strln - patln;
- register char *sp;
- register char *pp;
- register char *ep = pat + patln;
- register char pc = *pat++;
-
- for(;;) {
- while (pc != *str++)
- if (str > es)
- return 0;
- if (str > es+1)
- return 0;
- sp = str; pp = pat;
- while (pp < ep && *sp++ == *pp)
- pp++;
- if (pp >= ep)
- return --str;
- }
-}