X-Git-Url: https://diplodocus.org/git/nmh/blobdiff_plain/9ccd16a6994b628e7b105fc5205bb680f874b16a..cf57870921b26703aad420c6741c524b33736ff1:/sbr/encode_rfc2047.c diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c index 04a74f1a..d1ccd2e9 100644 --- a/sbr/encode_rfc2047.c +++ b/sbr/encode_rfc2047.c @@ -56,6 +56,7 @@ static int field_encode_quoted(const char *, char **, const char *, int, static int field_encode_base64(const char *, char **, const char *); static int scanstring(const char *, int *, int *, int *); static int utf8len(const char *); +static int pref_encoding(int, int, int); /* * Encode a message header using RFC 2047 encoding. We make the assumption @@ -124,14 +125,14 @@ encode_rfc2047(const char *name, char **value, int encoding, * On the encoding we choose, and the specifics of encoding: * * - If a specified encoding is passed in, we use that. - * - If more than 50% of the characters are high-bit, we use base64 - * and encode the whole field as one atom (possibly split). - * - Otherwise, we use quoted-printable. + * - Otherwise, pick which encoding is shorter. + * + * We don't quite handle continuation right here, but it should be + * pretty close. */ if (encoding == CE_UNKNOWN) - encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ? - CE_BASE64 : CE_QUOTED; + encoding = pref_encoding(asciicount, qpspecialcount, eightbitcount); unfold_header(value, asciicount + eightbitcount); @@ -160,7 +161,7 @@ field_encode_quoted(const char *name, char **value, const char *charset, { int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1; int charsetlen = strlen(charset), utf8; - char *output = NULL, *p, *q; + char *output = NULL, *p, *q = NULL; /* * Right now we just encode the whole thing. Maybe later on we'll @@ -280,15 +281,24 @@ field_encode_quoted(const char *name, char **value, const char *charset, * allow for the encoded output. 
*/ if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) { - newline = 1; + newline = 1; } } } - strcat(q, "?="); + if (q == NULL) { + /* This should never happen, but just in case. Found by + clang static analyzer. */ + admonish (NULL, "null output encoding for %s", *value); + return 1; + } + *q++ = '?'; + *q++ = '='; if (prefixlen) - strcat(q, "\n"); + *q++ = '\n'; + + *q = '\0'; free(*value); @@ -309,7 +319,7 @@ field_encode_base64(const char *name, char **value, const char *charset) { int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset); int outlen = 0, numencode, curlen; - char *output = NULL, *p = *value, *q = NULL, *linestart; + char *output = NULL, *p = *value, *q = NULL, *linestart = NULL; /* * Skip over any leading white space. @@ -513,8 +523,8 @@ unfold_header(char **value, int len) * This has the side effect of stripping off the final newline * for the header; we put it back in the encoding routine. */ - while (is_fws(*q++)) - ; + while (is_fws(*q)) + q++; if (*q == '\0') break; @@ -539,10 +549,14 @@ static int field_encode_address(const char *name, char **value, int encoding, const char *charset) { - int prefixlen = strlen(name) + 2, column = prefixlen, groupflag, errflag; - int eightbitchars; - char *mp, *output = NULL; + int prefixlen = strlen(name) + 2, column = prefixlen, groupflag; + int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0; + size_t len; + char *mp, *cp = NULL, *output = NULL; + char *tmpbuf = NULL; + size_t tmpbufsize = 0; struct mailname *mn; + char errbuf[BUFSIZ]; /* * Because these are addresses, we need to handle them individually. @@ -552,23 +566,242 @@ field_encode_address(const char *name, char **value, int encoding, */ /* - * The output headers always have to start with a space first. + * The output headers always have to start with a space first; this + * is just the way the API works right now. 
*/ output = add(" ", output); - for (groupflag = 0; mp = getname(*value); ) { - if ((mn = getm(mp, NULL, 0, AD_HOST, NULL)) == NULL) { + for (groupflag = 0; (mp = getname(*value)); ) { + if ((mn = getm(mp, NULL, 0, errbuf, sizeof(errbuf))) == NULL) { + advise(NULL, "%s: %s", errbuf, mp); errflag++; continue; } + reformat = 0; + /* * We only care if the phrase (m_pers) or any trailing comment * (m_note) have 8-bit characters. If doing q-p, we also need - * to encode anything marked as qspecial(). + * to encode anything marked as qspecial(). Unquote it first + * so the specialchars count is right. + */ + + if (! mn->m_pers) + goto check_note; + + if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) { + tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1); + } + + unquote_string(mn->m_pers, tmpbuf); + + if (scanstring(tmpbuf, &asciichars, &eightbitchars, + &specialchars)) { + /* + * If we have 8-bit characters, encode it. + */ + + if (encoding == CE_UNKNOWN) + encoding = pref_encoding(asciichars, specialchars, + eightbitchars); + + /* + * This is okay, because the output of unquote_string will be either + * equal or shorter than the original. + */ + + strcpy(mn->m_pers, tmpbuf); + + switch (encoding) { + + case CE_BASE64: + if (field_encode_base64(NULL, &mn->m_pers, charset)) { + errflag++; + goto out; + } + break; + + case CE_QUOTED: + if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars, + eightbitchars + specialchars, 1)) { + errflag++; + goto out; + } + break; + + default: + advise(NULL, "Internal error: unknown RFC-2047 encoding type"); + errflag++; + goto out; + } + + reformat++; + } + + check_note: + + /* + * The "note" field is generally a comment at the end of the address, + * at least as how it's implemented here. Notes are always surrounded + * by parentheses (since they're comments). Strip them out and + * then put them back when we format the final field; the parentheses + * themselves do not get encoded. + */ + + if (! 
mn->m_note) + goto do_reformat; + + if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) { + tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1); + } + + if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') { + advise(NULL, "Internal error: Invalid note field \"%s\"", + mn->m_note); + errflag++; + goto out; + } + + strncpy(tmpbuf, mn->m_note + 1, len - 1); + tmpbuf[len - 2] = '\0'; + + if (scanstring(tmpbuf, &asciichars, &eightbitchars, + &specialchars)) { + /* + * If we have 8-bit characters, encode it. + */ + + if (encoding == CE_UNKNOWN) + encoding = pref_encoding(asciichars, specialchars, + eightbitchars); + + switch (encoding) { + + case CE_BASE64: + if (field_encode_base64(NULL, &tmpbuf, charset)) { + errflag++; + goto out; + } + break; + + case CE_QUOTED: + if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars, + eightbitchars + specialchars, 1)) { + errflag++; + goto out; + } + break; + + default: + advise(NULL, "Internal error: unknown RFC-2047 encoding type"); + errflag++; + goto out; + } + + reformat++; + + /* + * Make sure the size of tmpbuf is correct (it always gets + * reallocated in the above functions). + */ + + tmpbufsize = strlen(tmpbuf) + 1; + + /* + * Put the note field back surrounded by parentheses. + */ + + mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2); + + snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf); + } + +do_reformat: + + /* + * So, some explanation is in order. + * + * We know we need to rewrite at least one address in the header, + * otherwise we wouldn't be here. If we had to reformat this + * particular address, then run it through adrformat(). Otherwise + * we can use m_text directly. */ + + /* + * If we were in a group but are no longer, make sure we add a + * semicolon (which needs to be FIRST, as it needs to be at the end + * of the last address). + */ + + if (groupflag && ! 
mn->m_ingrp) { + output = add(";", output); + column += 1; + } + + groupflag = mn->m_ingrp; + + if (mn->m_gname) { + cp = mh_xstrdup(mn->m_gname); + } + + if (reformat) { + cp = add(adrformat(mn), cp); + } else { + cp = add(mn->m_text, cp); + } + + len = strlen(cp); + + /* + * If we're not at the beginning of the line, add a comma and + * either a space or a newline plus prefixlen spaces of indent (which needs prefixlen + 3 bytes of tmpbuf, counting the NUL). + */ + + if (column != prefixlen) { + if (len + column + 2 > OUTPUTLINELEN) { + + if ((size_t) (prefixlen + 3) > tmpbufsize) + tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3); + + snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, ""); + output = add(tmpbuf, output); + } else { + output = add(", ", output); + column += 2; + } + } + + /* + * Finally add the address + */ + + output = add(cp, output); + column += len; + free(cp); + cp = NULL; + } + + /* + * Just in case we're at the end of a list + */ + + if (groupflag) { + output = add(";", output); } + + output = add("\n", output); + + free(*value); + *value = output; + output = NULL; + +out: + mh_xfree(tmpbuf); + mh_xfree(output); + + return errflag > 0; } /* @@ -585,13 +818,51 @@ scanstring(const char *string, int *asciilen, int *eightbitchars, for (; *string != '\0'; string++) { if ((isascii((unsigned char) *string))) { - (*asciilen++); - if (!qphrasevalid((unsigned char) *string)) + (*asciilen)++; + /* + * So, a space is not a valid phrase character, but we're counting + * an exception here, because in q-p a space can be directly + * encoded as an underscore. + */ + if (!qphrasevalid((unsigned char) *string) && *string != ' ') (*specialchars)++; } else { (*eightbitchars)++; } } - return eightbitchars > 0; + return *eightbitchars > 0; +} + +/* + * This function is to be used to decide which encoding algorithm we should + * use if one is not given. Basically, we pick whichever one is the shorter + * of the two. 
+ * + * Arguments are: + * + * ascii - Number of ASCII characters in to-be-encoded string. 
+ * specials - Number of ASCII characters in to-be-encoded string that + * still require encoding under quoted-printable. Note that + * these are included in the "ascii" total. + * eightbits - Number of eight-bit characters in the to-be-encoded string. + * + * Returns one of CE_BASE64 or CE_QUOTED. + */ + +static int +pref_encoding(int ascii, int specials, int eightbits) +{ + /* + * The length of the q-p encoding is: + * + * ascii - specials + (specials + eightbits) * 3. + * + * The length of the base64 encoding is: + * + * base64len(ascii + eightbits) (See macro for details) + */ + + return base64len(ascii + eightbits) < (ascii - specials + + (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED; }