X-Git-Url: https://diplodocus.org/git/nmh/blobdiff_plain/e6a9609ee92dd8d01ae2524ea193bbcd88b5be73..d2520ac7054ad75d60342606bf13c821305d958c:/uip/mhparse.c diff --git a/uip/mhparse.c b/uip/mhparse.c index 50510856..59f8b221 100644 --- a/uip/mhparse.c +++ b/uip/mhparse.c @@ -16,6 +16,9 @@ #include #include #include +#ifdef HAVE_ICONV +# include +#endif /* HAVE_ICONV */ extern int debugsw; @@ -135,6 +138,8 @@ static int readDigest (CT, char *); static int get_leftover_mp_content (CT, int); static int InitURL (CT); static int openURL (CT, char **); +static int parse_header_attrs (const char *, const char *, char **, PM *, + PM *, char **); static size_t param_len(PM, int, size_t, int *, int *, size_t *); static size_t encode_param(PM, char *, size_t, size_t, size_t, int); static size_t normal_param(PM, char *, size_t, size_t, size_t); @@ -592,88 +597,6 @@ add_header (CT ct, char *name, char *value) } -/* Make sure that buf contains at least one appearance of name, - followed by =. If not, insert both name and value, just after - first semicolon, if any. Note that name should not contain a - trailing =. And quotes will be added around the value. Typical - usage: make sure that a Content-Disposition header contains - filename="foo". If it doesn't and value does, use value from - that. */ -static char * -incl_name_value (char *buf, char *name, char *value) { - char *newbuf = buf; - - /* Assume that name is non-null. */ - if (buf && value) { - char *name_plus_equal = concat (name, "=", NULL); - - if (! strstr (buf, name_plus_equal)) { - char *insertion; - char *cp, *prefix, *suffix; - - /* Trim trailing space, esp. newline. */ - for (cp = &buf[strlen (buf) - 1]; - cp >= buf && isspace ((unsigned char) *cp); - --cp) { - *cp = '\0'; - } - - insertion = concat ("; ", name, "=", "\"", value, "\"", NULL); - - /* Insert at first semicolon, if any. If none, append to - end. */ - prefix = add (buf, NULL); - if ((cp = strchr (prefix, ';'))) { - suffix = concat (cp, NULL); - *cp = '\0'; - newbuf = concat (prefix, insertion, suffix, "\n", NULL); - free (suffix); - } else { - /* Append to end. */ - newbuf = concat (buf, insertion, "\n", NULL); - } - - free (prefix); - free (insertion); - free (buf); - } - - free (name_plus_equal); - } - - return newbuf; -} - -/* Extract just name_suffix="foo", if any, from value. If there isn't - one, return the entire value. Note that, for example, a name_suffix - of name will match filename="foo", and return foo. */ -static char * -extract_name_value (char *name_suffix, char *value) { - char *extracted_name_value = value; - char *name_suffix_plus_quote = concat (name_suffix, "=\"", NULL); - char *name_suffix_equals = strstr (value, name_suffix_plus_quote); - char *cp; - - free (name_suffix_plus_quote); - if (name_suffix_equals) { - char *name_suffix_begin; - - /* Find first \". */ - for (cp = name_suffix_equals; *cp != '"'; ++cp) /* empty */; - name_suffix_begin = ++cp; - /* Find second \". */ - for (; *cp != '"'; ++cp) /* empty */; - - extracted_name_value = mh_xmalloc (cp - name_suffix_begin + 1); - memcpy (extracted_name_value, - name_suffix_begin, - cp - name_suffix_begin); - extracted_name_value[cp - name_suffix_begin] = '\0'; - } - - return extracted_name_value; -} - /* * Parse Content-Type line and (if `magic' is non-zero) mhbuild composition * directives. Fills in the information of the CTinfo structure. @@ -912,14 +835,11 @@ magic_skip: have a *filename=, extract it from the magic contents. The r1bindex call skips any leading directory components. */ - if (ct->c_dispo) - ct->c_dispo = - incl_name_value (ct->c_dispo, - "filename", - r1bindex (extract_name_value ("name", - ci-> - ci_magic), - '/')); + if (ct->c_dispo_type && + !get_param(ct->c_dispo_first, "filename", '_', 1)) { + add_param(&ct->c_dispo_first, &ct->c_dispo_last, "filename", + r1bindex(ci->ci_magic, '/'), 0); + } } else advise (NULL, @@ -2298,7 +2218,7 @@ open7Bit (CT ct, char **file) fprintf (ce->ce_fp, "%s: %s/%s", TYPE_FIELD, ci->ci_type, ci->ci_subtype); len += strlen (TYPE_FIELD) + 2 + strlen (ci->ci_type) + 1 + strlen (ci->ci_subtype); - buffer = output_params(len, ci->ci_first_pm, &len); + buffer = output_params(len, ci->ci_first_pm, &len, 0); if (buffer) { fputs (buffer, ce->ce_fp); @@ -3306,16 +3226,49 @@ get_ce_method (const char *method) { return NULL; } -int +/* + * Parse a series of MIME attributes (or parameters) given a header as + * input. + * + * Arguments include: + * + * filename - Name of input file (for error messages) + * fieldname - Name of field being processed + * headerp - Pointer to pointer of the beginning of the MIME attributes. + * Updated to point to end of attributes when finished. + * param_head - Pointer to head of parameter list + * param_tail - Pointer to tail of parameter list + * commentp - Pointer to header comment pointer (may be NULL) + * + * Returns OK if parsing was successful, NOTOK if parsing failed, and + * DONE to indicate a benign error (minor parsing error, but the program + * should continue). + */ + +static int parse_header_attrs (const char *filename, const char *fieldname, char **header_attrp, PM *param_head, PM *param_tail, char **commentp) { char *cp = *header_attrp; PM pm; + struct sectlist { + char *value; + int index; + int len; + struct sectlist *next; + } *sp, *sp2; + struct parmlist { + char *name; + char *charset; + char *lang; + struct sectlist *sechead; + struct parmlist *next; + } *pp, *pp2, *phead = NULL; while (*cp == ';') { - char *dp, *vp, *up, c; + char *dp, *vp, *up, *nameptr, *valptr, *charset = NULL, *lang = NULL; + int encoded = 0, partial = 0, len = 0, index = 0; cp++; while (isspace ((unsigned char) *cp)) @@ -3349,60 +3302,297 @@ parse_header_attrs (const char *filename, const char *fieldname, return NOTOK; } - pm = mh_xmalloc(sizeof(*pm)); - memset(pm, 0, sizeof(*pm)); + /* + * To handle RFC 2231, we have to deal with the following extensions: + * + * name*=encoded-value + * name*=part-N-of-a-parameter-value + * name**=encoded-part-N-of-a-parameter-value + * + * So the rule is: + * If there's a * right before the equal sign, it's encoded. + * If there's a * and one or more digits, then it's section N. + * + * Remember we can have one or the other, or both. cp points to + * beginning of name, up points past the last character in the + * parameter name. + */ + + for (vp = cp; vp < up; vp++) { + if (*vp == '*' && vp < up - 1) { + partial = 1; + continue; + } else if (*vp == '*' && vp == up - 1) { + encoded = 1; + } else if (partial) { + if (isdigit((unsigned char) *vp)) + index = *vp - '0' + index * 10; + else { + advise (NULL, "invalid parameter index in message %s's " + "%s: field\n%*s(parameter %s)", filename, + fieldname, strlen(invo_name) + 2, "", cp); + return NOTOK; + } + } else { + len++; + } + } + + /* + * Break out the parameter name and value sections and allocate + * memory for each. + */ + + nameptr = mh_xmalloc(len + 1); + strncpy(nameptr, cp, len); + nameptr[len] = '\0'; - /* This is all mega-bozo and needs cleanup */ - vp = (pm->pm_name = add (cp, NULL)) + (up - cp); - *vp = '\0'; for (dp++; isspace ((unsigned char) *dp);) dp++; - /* Now store the attribute value. */ + if (encoded) { + /* + * Single quotes delimit the character set and language tag. + * They are required on the first section (or a complete + * parameter). + */ + if (index == 0) { + vp = dp; + while (*vp != '\'' && !isspace((unsigned char) *vp) && + *vp != '\0') + vp++; + if (*vp == '\'') { + if (vp != dp) { + len = vp - dp; + charset = mh_xmalloc(len + 1); + strncpy(charset, dp, len); + charset[len] = '\0'; + } else { + charset = NULL; + } + vp++; + } else { + advise(NULL, "missing charset in message %s's %s: " + "field\n%*s(parameter %s)", filename, fieldname, + strlen(invo_name) + 2, "", nameptr); + free(nameptr); + return NOTOK; + } + dp = vp; + + while (*vp != '\'' && !isspace((unsigned char) *vp) && + *vp != '\0') + vp++; + + if (*vp == '\'') { + if (vp != dp) { + len = vp - dp; + lang = mh_xmalloc(len + 1); + strncpy(lang, dp, len); + lang[len] = '\0'; + } else { + lang = NULL; + } + vp++; + } else { + advise(NULL, "missing language tag in message %s's %s: " + "field\n%*s(parameter %s)", filename, fieldname, + strlen(invo_name) + 2, "", nameptr); + free(nameptr); + if (charset) + free(charset); + return NOTOK; + } + + dp = vp; + } + + /* + * At this point vp should be pointing at the beginning + * of the encoded value/section. Continue until we reach + * the end or get whitespace. But first, calculate the + * length so we can allocate the correct buffer size. + */ + + for (vp = dp, len = 0; istoken(*vp); vp++) { + if (*vp == '%') { + if (*(vp + 1) == '\0' || + !isxdigit((unsigned char) *(vp + 1)) || + *(vp + 2) == '\0' || + !isxdigit((unsigned char) *(vp + 2))) { + advise(NULL, "invalid encoded sequence in message " + "%s's %s: field\n%*s(parameter %s)", + filename, fieldname, strlen(invo_name) + 2, + "", nameptr); + free(nameptr); + if (charset) + free(charset); + if (lang) + free(lang); + return NOTOK; + } + vp += 2; + } + len++; + } + + up = valptr = mh_xmalloc(len + 1); + + for (vp = dp; istoken(*vp); vp++) { + if (*vp == '%') { + *up++ = decode_qp(*(vp + 1), *(vp + 2)); + vp += 2; + } else { + *up++ = *vp; + } + } - vp = pm->pm_name + (dp - cp); + *up = '\0'; + cp = vp; + } else { + /* + * A "normal" string. If it's got a leading quote, then we + * strip the quotes out. Otherwise go until we reach the end + * or get whitespace. Note we scan it twice; once to get the + * length, then the second time copies it into the destination + * buffer. + */ - if (*dp == '"') { - for (cp = ++dp, dp = vp;;) { - switch (c = *cp++) { + len = 0; + + if (*dp == '"') { + for (cp = dp + 1;;) { + switch (*cp++) { case '\0': bad_quote: advise (NULL, "invalid quoted-string in message %s's %s: " "field\n%*s(parameter %s)", filename, fieldname, strlen(invo_name) + 2, "", - pm->pm_name); + nameptr); + free(nameptr); + if (charset) + free(charset); + if (lang) + free(lang); return NOTOK; + case '"': + break; case '\\': - *dp++ = c; - if ((c = *cp++) == '\0') + if (*++cp == '\0') goto bad_quote; - /* else fall... */ - + /* FALL THROUGH */ default: - *dp++ = c; + len++; continue; + } + break; + } - case '"': - *dp = '\0'; + } else { + for (cp = dp; istoken (*cp); cp++) { + len++; + } + } + + valptr = mh_xmalloc(len + 1); + + if (*dp == '"') { + int i; + for (cp = dp + 1, vp = valptr, i = 0; i < len; i++) { + if (*cp == '\\') { + cp++; + } + *vp++ = *cp++; + } + cp++; + } else { + strncpy(valptr, cp = dp, len); + cp += len; + } + + valptr[len] = '\0'; + } + + /* + * If 'partial' is set, we don't allocate a parameter now. We + * put it on the parameter linked list to be reassembled later. + * + * "phead" points to a list of all parameters we need to reassemble. + * Each parameter has a list of sections. We insert the sections in + * order. + */ + + if (partial) { + for (pp = phead; pp != NULL; pp = pp->next) { + if (strcasecmp(nameptr, pp->name) == 0) + break; + } + + if (pp == NULL) { + pp = mh_xmalloc(sizeof(*pp)); + memset(pp, 0, sizeof(*pp)); + pp->name = nameptr; + pp->next = phead; + phead = pp; + } + + /* + * Insert this into the section linked list + */ + + sp = mh_xmalloc(sizeof(*sp)); + memset(sp, 0, sizeof(*sp)); + sp->value = valptr; + sp->index = index; + sp->len = len; + + if (pp->sechead == NULL || pp->sechead->index > index) { + sp->next = pp->sechead; + pp->sechead = sp; + } else { + for (sp2 = pp->sechead; sp2 != NULL; sp2 = sp2->next) { + if (sp2->index == sp->index) { + advise (NULL, "duplicate index (%d) in message " + "%s's %s: field\n%*s(parameter %s)", sp->index, + filename, fieldname, strlen(invo_name) + 2, "", + nameptr); + return NOTOK; + } + if (sp2->index < sp->index && + (sp2->next == NULL || sp2->next->index > sp->index)) { + sp->next = sp2->next; + sp2->next = sp; break; + } } - break; + + if (sp2 == NULL) { + advise(NULL, "Internal error: cannot insert partial " + "param in message %s's %s: field\n%*s(parameter %s)", + filename, fieldname, strlen(invo_name) + 2, "", + nameptr); + return NOTOK; + } + } + + /* + * Save our charset and lang tags. + */ + + if (index == 0 && encoded) { + if (pp->charset) + free(pp->charset); + pp->charset = charset; + if (pp->lang) + free(pp->lang); + pp->lang = lang; } } else { - for (cp = dp, dp = vp; istoken (*cp); cp++, dp++) - continue; - *dp = '\0'; - } - pm->pm_value = getcpy(vp); - if (!*vp) { - advise (NULL, - "invalid parameter in message %s's %s: " - "field\n%*s(parameter %s)", - filename, fieldname, strlen(invo_name) + 2, "", - pm->pm_name); - return NOTOK; + pm = add_param(param_head, param_tail, nameptr, valptr, 1); + pm->pm_charset = charset; + pm->pm_lang = lang; } while (isspace ((unsigned char) *cp)) @@ -3412,20 +3602,70 @@ bad_quote: get_comment (filename, fieldname, &cp, commentp) == NOTOK) { return NOTOK; } + } - if (*param_head == NULL) { - *param_head = pm; - *param_tail = pm; - } else { - (*param_tail)->pm_next = pm; - *param_tail = pm; + /* + * Now that we're done, reassemble all of the partial parameters. + */ + + for (pp = phead; pp != NULL; ) { + char *p, *q; + size_t tlen = 0; + int pindex = 0; + for (sp = pp->sechead; sp != NULL; sp = sp->next) { + if (sp->index != pindex++) { + advise(NULL, "missing section %d for parameter in " + "message %s's %s: field\n%*s(parameter %s)", pindex - 1, + filename, fieldname, strlen(invo_name) + 2, "", + pp->name); + return NOTOK; + } + tlen += sp->len; } + + p = q = mh_xmalloc(tlen + 1); + for (sp = pp->sechead; sp != NULL; ) { + memcpy(q, sp->value, sp->len); + q += sp->len; + free(sp->value); + sp2 = sp->next; + free(sp); + sp = sp2; + } + + p[tlen] = '\0'; + + pm = add_param(param_head, param_tail, pp->name, p, 1); + pm->pm_charset = pp->charset; + pm->pm_lang = pp->lang; + pp2 = pp->next; + free(pp); + pp = pp2; } *header_attrp = cp; return OK; } +/* + * Return the charset for a particular content type. Return pointer is + * only valid until the next call to content_charset(). + */ + +char * +content_charset (CT ct) { + static char *ret_charset = NULL; + + if (ret_charset != NULL) { + free(ret_charset); + } + + ret_charset = get_param(ct->c_ctinfo.ci_first_pm, "charset", '?', 0); + + return ret_charset ? ret_charset : "US-ASCII"; +} + + /* * Create a string based on a list of output parameters. Assume that this * parameter string will be appended to an existing header, so start out @@ -3433,7 +3673,7 @@ bad_quote: */ char * -output_params(size_t initialwidth, PM params, int *offsetout) +output_params(size_t initialwidth, PM params, int *offsetout, int external) { char *paramout = NULL; char line[CPERLIN * 2], *q; @@ -3446,6 +3686,9 @@ output_params(size_t initialwidth, PM params, int *offsetout) valoff = 0; q = line; + if (external && strcasecmp(params->pm_name, "body") == 0) + continue; + if (strlen(params->pm_name) > CPERLIN) { advise(NULL, "Parameter name \"%s\" is too long", params->pm_name); if (paramout) @@ -3618,17 +3861,17 @@ param_len(PM pm, int index, size_t valueoff, int *encode, int *cont, * section. * - There are 8-bit characters within N bytes of our section start. * N is calculated based on the number of bytes it would take to - * reach CPERLIN - 1. Specifically: + * reach CPERLIN. Specifically: * 8 (starting tab) + * strlen(param name) + * 4 ('* for section marker, '=', opening/closing '"') * strlen (index) * is the number of bytes used by everything that isn't part of the - * value. So that gets subtracted from CPERLIN - 1. + * value. So that gets subtracted from CPERLIN. */ snprintf(indexchar, sizeof(indexchar), "%d", index); - maxfit = CPERLIN - (13 + len + strlen(indexchar)); + maxfit = CPERLIN - (12 + len + strlen(indexchar)); if ((eightbit && index == 0) || contains8bit(start, start + maxfit)) { *encode = 1; } @@ -3655,11 +3898,15 @@ param_len(PM pm, int index, size_t valueoff, int *encode, int *cont, pm->pm_lang = getcpy(NULL); /* Default to a blank lang tag */ len++; /* For the encoding marker */ + maxfit--; if (index == 0) { - len += strlen(pm->pm_charset) + strlen(pm->pm_lang) + 2; + int enclen = strlen(pm->pm_charset) + strlen(pm->pm_lang) + 2; + len += enclen; + maxfit-= enclen; } else { /* - * We know we definitely need to include an index. + * We know we definitely need to include an index. maxfit already + * includes the section marker. */ len += strlen(indexchar); } @@ -3826,14 +4073,14 @@ normal_param(PM pm, char *output, size_t len, size_t valuelen, */ PM -add_param(PM *first, PM *last, const char *name, const char *value) +add_param(PM *first, PM *last, char *name, char *value, int nocopy) { PM pm = mh_xmalloc(sizeof(*pm)); memset(pm, 0, sizeof(*pm)); - pm->pm_name = getcpy(name); - pm->pm_value = getcpy(value); + pm->pm_name = nocopy ? name : getcpy(name); + pm->pm_value = nocopy ? value : getcpy(value); if (*first) { (*last)->pm_next = pm; @@ -3845,3 +4092,165 @@ add_param(PM *first, PM *last, const char *name, const char *value) return pm; } + +/* + * Either replace a current parameter with a new value, or add the parameter + * to the parameter linked list. + */ + +PM +replace_param(PM *first, PM *last, char *name, char *value, int nocopy) +{ + PM pm; + + for (pm = *first; pm != NULL; pm = pm->pm_next) { + if (strcasecmp(name, pm->pm_name) == 0) { + /* + * If nocopy is set, it's assumed that we own both name + * and value. We don't need name, so we discard it now. + */ + if (nocopy) + free(name); + free(pm->pm_value); + pm->pm_value = nocopy ? value : getcpy(value); + return pm; + } + } + + return add_param(first, last, name, value, nocopy); +} + +/* + * Retrieve a parameter value from a parameter linked list. If the parameter + * value needs converted to the local character set, do that now. + */ + +char * +get_param(PM first, const char *name, char replace, int fetchonly) +{ + while (first != NULL) { + if (strcasecmp(name, first->pm_name) == 0) { + if (fetchonly) + return first->pm_value; + else + return getcpy(get_param_value(first, replace)); + } + first = first->pm_next; + } + + return NULL; +} + +/* + * Return a parameter value, converting to the local character set if + * necessary + */ + +char *get_param_value(PM pm, char replace) +{ + static char buffer[4096]; /* I hope no parameters are larger */ + size_t bufsize = sizeof(buffer); +#ifdef HAVE_ICONV + size_t inbytes; + int utf8; + iconv_t cd; + ICONV_CONST char *p; +#else /* HAVE_ICONV */ + char *p; +#endif /* HAVE_ICONV */ + + char *q; + + /* + * If we don't have a character set indicated, it's assumed to be + * US-ASCII. If it matches our character set, we don't need to convert + * anything. + */ + + if (!pm->pm_charset || check_charset(pm->pm_charset, + strlen(pm->pm_charset))) { + return pm->pm_value; + } + + /* + * In this case, we need to convert. If we have iconv support, use + * that. Otherwise, go through and simply replace every non-ASCII + * character with the substitution character. + */ + +#ifdef HAVE_ICONV + q = buffer; + bufsize = sizeof(buffer); + utf8 = strcasecmp(pm->pm_charset, "UTF-8") == 0; + + cd = iconv_open(get_charset(), pm->pm_charset); + if (cd == (iconv_t) -1) { + goto noiconv; + } + + inbytes = strlen(pm->pm_value); + p = pm->pm_value; + + while (inbytes) { + if (iconv(cd, &p, &inbytes, &q, &bufsize) == (size_t)-1) { + if (errno != EILSEQ) { + iconv_close(cd); + goto noiconv; + } + /* + * Reset shift state, substitute our character, + * try to restart conversion. + */ + + iconv(cd, NULL, NULL, &q, &bufsize); + + if (bufsize == 0) { + iconv_close(cd); + goto noiconv; + } + *q++ = replace; + bufsize--; + if (bufsize == 0) { + iconv_close(cd); + goto noiconv; + } + if (utf8) { + for (++p, --inbytes; + inbytes > 0 && (((unsigned char) *q) & 0xc0) == 0x80; + ++p, --inbytes) + continue; + } else { + p++; + inbytes--; + } + } + } + + iconv_close(cd); + + if (bufsize == 0) + q--; + *q = '\0'; + + return buffer; + +noiconv: +#endif /* HAVE_ICONV */ + + /* + * Take everything non-ASCII and substituite the replacement character + */ + + q = buffer; + bufsize = sizeof(buffer); + for (p = pm->pm_value; *p != '\0' && bufsize > 1; p++, q++, bufsize--) { + if (isascii((unsigned char) *p) && !iscntrl((unsigned char) *p)) + *q = *p; + else + *q = replace; + } + + *q = '\0'; + + return buffer; +}