X-Git-Url: https://diplodocus.org/git/nmh/blobdiff_plain/f63b238c3fa2af9db08be8ec72e0e62e6a3842ad..2d82da244d02a79d994b9020099cfb627a8b32ed:/sbr/encode_rfc2047.c diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c index 0f040bc9..537a275e 100644 --- a/sbr/encode_rfc2047.c +++ b/sbr/encode_rfc2047.c @@ -8,6 +8,7 @@ #include #include +#include #include /* @@ -35,10 +36,27 @@ static char *address_headers[] = { * Macros we use for parsing headers */ -#define is_fws(c) (c == '\t' || c == ' ') +#define is_fws(c) (c == '\t' || c == ' ' || c == '\n') + +#define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \ + (c >= 'a' && c <= 'z') || \ + c == '!' || c == '*' || c == '+' || c == '-' || \ + c == '/' || c == '=' || c == '_') +#define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_') + +#define base64len(n) ((((n) + 2) / 3 ) * 4) /* String len to base64 len */ +#define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */ + +#define ENCODELINELIMIT 76 static void unfold_header(char **, int); static int field_encode_address(const char *, char **, int, const char *); +static int field_encode_quoted(const char *, char **, const char *, int, + int, int); +static int field_encode_base64(const char *, char **, const char *); +static int scanstring(const char *, int *, int *, int *); +static int utf8len(const char *); +static int pref_encoding(int, int, int); /* * Encode a message header using RFC 2047 encoding. We make the assumption @@ -50,7 +68,7 @@ int encode_rfc2047(const char *name, char **value, int encoding, const char *charset) { - int i, count = 0, len; + int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0; char *p; /* @@ -58,11 +76,15 @@ encode_rfc2047(const char *name, char **value, int encoding, */ for (p = *value; *p != '\0'; p++) { - if (! 
isascii((int) *p)) - count++; + if (isascii((unsigned char) *p)) { + asciicount++; + if (qpspecial((unsigned char) *p)) + qpspecialcount++; + } else + eightbitcount++; } - if (count == 0) + if (eightbitcount == 0) return 0; /* @@ -103,42 +125,376 @@ encode_rfc2047(const char *name, char **value, int encoding, * On the encoding we choose, and the specifics of encoding: * * - If a specified encoding is passed in, we use that. - * - If more than 50% of the characters are high-bit, we use base64 - * and encode the whole field as one atom (possibly split). - * Otherwise, we use quoted-printable. - * - If more than 10% of the characters are high-bit, then we encode - * the entire header as one (possibly split) atom. Otherwise, - * take each atom as they come and encode it on a per-atom basis. + * - Otherwise, pick which encoding is shorter. + * + * We don't quite handle continuation right here, but it should be + * pretty close. */ - len = strlen(*value); - if (encoding == CE_UNKNOWN) - encoding = (count * 10 / len > 5) ? CE_BASE64 : CE_QUOTED; + encoding = pref_encoding(asciicount, qpspecialcount, eightbitcount); + + unfold_header(value, asciicount + eightbitcount); switch (encoding) { case CE_BASE64: - return field_encode_base64(value, charset, len, NULL); + return field_encode_base64(name, value, charset); case CE_QUOTED: - if (count * 100 / len > 10) { - return field_encode_quoted(value, charset, len, NULL); + return field_encode_quoted(name, value, charset, asciicount, + eightbitcount + qpspecialcount, 0); + + default: + advise(NULL, "Internal error: unknown RFC-2047 encoding type"); + return 1; + } +} + +/* + * Encode our specified header (or field) using quoted-printable + */ + +static int +field_encode_quoted(const char *name, char **value, const char *charset, + int ascii, int encoded, int phraserules) +{ + int prefixlen = name ? 
strlen(name) + 2: 0, outlen = 0, column, newline = 1; + int charsetlen = strlen(charset), utf8; + char *output = NULL, *p, *q = NULL; + + /* + * Right now we just encode the whole thing. Maybe later on we'll + * only encode things on a per-atom basis. + */ + + p = *value; + + column = prefixlen + 2; /* Header name plus ": " */ + + utf8 = strcasecmp(charset, "UTF-8") == 0; + + while (*p != '\0') { + /* + * Start a new line, if it's time + */ + if (newline) { + /* + * If it's the start of the header, we don't need to pad it + * + * The length of the output string is ... + * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL + * + * plus 1 for every ASCII character and 3 for every eight bit + * or special character (eight bit characters are written as =XX). + * + */ + + int tokenlen; + + outlen += 9 + charsetlen + ascii + 3 * encoded; + + /* + * If output is set, then we're continuing the header. Otherwise + * do the initial allocation. + */ + + if (output) { + int curlen = q - output, i; + outlen += prefixlen + 1; /* Header plus \n ": " */ + output = mh_xrealloc(output, outlen); + q = output + curlen; + *q++ = '?'; + *q++ = '='; + *q++ = '\n'; + for (i = 0; i < prefixlen; i++) + *q++ = ' '; + } else { + /* + * A bit of a hack here; the header can contain multiple + * spaces (probably at least one) until we get to the + * actual text. Copy until we get to a non-space. + */ + output = mh_xmalloc(outlen); + q = output; + while (is_fws(*p)) + *q++ = *p++; + } + + tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset); + q += tokenlen; + column = prefixlen + tokenlen; + newline = 0; + } + + /* + * Process each character, encoding if necessary + * + * Note that we have a different set of rules if we're processing + * RFC 5322 'phrase' (something you'd see in an address header). + */ + + column++; + + if (*p == ' ') { + *q++ = '_'; + ascii--; + } else if (isascii((unsigned char) *p) && + (phraserules ? 
qphrasevalid((unsigned char) *p) : + !qpspecial((unsigned char) *p))) { + *q++ = *p; + ascii--; + } else { + snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p); + q += 3; + column += 2; /* column already incremented by 1 above */ + encoded--; + } + + p++; + + /* + * We're not allowed more than ENCODELINELIMIT characters per line, + * so reserve some room for the final ?=. + * + * If prefixlen == 0, we haven't been passed in a header name, so + * don't ever wrap the field (we're likely doing an address). + */ + + if (prefixlen == 0) + continue; + + if (column >= ENCODELINELIMIT - 2) { + newline = 1; + } else if (utf8) { + /* + * Okay, this is a bit weird, but to explain a bit more ... + * + * RFC 2047 prohibits the splitting of multibyte characters + * across encoded words. Right now we only handle the case + * of UTF-8, the most common multibyte encoding. + * + * p is now pointing at the next input character. If we're + * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the + * length of the complete character, then trigger a newline + * now. Note that we check the length * 3 since we have to + * allow for the encoded output. + */ + if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) { + newline = 1; + } + } + } + + *q++ = '?'; + *q++ = '='; + + if (prefixlen) + *q++ = '\n'; + + *q = '\0'; + + free(*value); + + *value = output; + + return 0; +} + +/* + * Encode our specified header (or field) using base64. + * + * This is a little easier since every character gets encoded, we can + * calculate the line wrap up front. + */ + +static int +field_encode_base64(const char *name, char **value, const char *charset) +{ + int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset); + int outlen = 0, numencode, curlen; + char *output = NULL, *p = *value, *q = NULL, *linestart = NULL; + + /* + * Skip over any leading white space. 
+ */ + + while (*p == ' ' || *p == '\t') + p++; + + /* + * If we had a zero-length prefix, then just encode the whole field + * as-is, without line wrapping. Otherwise keep emitting wrapped lines + * for as long as what is left would not fit on the current line. + * + * The added length we need is =? + charset + ?B? ... ?= + * + * That's 7 + strlen(charset) + 2 (for \n NUL). + */ + + while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen + + prefixlen) > ENCODELINELIMIT)) { + + /* + * Our very first time, don't pad the line in the front + * + * Note ENCODELINELIMIT is + 2 because of \n \0 + */ + + + if (! output) { + outlen += ENCODELINELIMIT + 2; + output = q = mh_xmalloc(outlen); + linestart = q - prefixlen; /* Yes, this is intentional */ } else { + int curstart = linestart - output; + curlen = q - output; + + outlen += ENCODELINELIMIT + 2; + output = mh_xrealloc(output, outlen); + q = output + curlen; + linestart = output + curstart; + } + + /* + * We should have enough space now, so prepend the encoding markers + * and character set information. The leading space is intentional. + */ + + q += snprintf(q, outlen - (q - output), " =?%s?B?", charset); + + /* + * Find out how much room we have left on the line and see how + * many characters we can stuff in. The start of our line + * is marked by "linestart", so use that to figure out how + * many characters are left out of ENCODELINELIMIT. Reserve + * 2 characters for the end markers and calculate how many + * characters we can fit into that space given the base64 + * encoding expansion. + * + * NOTE(review): the error returns in this loop (here and below) leave + * 'output' allocated; nothing frees it on those paths. + */ + + numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2); + + if (numencode <= 0) { + advise(NULL, "Internal error: tried to encode %d characters " + "in base64", numencode); + return 1; + } + + /* + * RFC 2047 prohibits spanning multibyte characters across tokens. 
+ * Right now we only check for UTF-8. + * + * So note the key here ... we want to make sure the character BEYOND + * our last character is not a continuation byte. 
If it's the start + * of a new multibyte character or a single-byte character, that's ok. + */ + + if (strcasecmp(charset, "UTF-8") == 0) { /* - * Break it down by atoms. + * p points to the start of our current buffer, so p + numencode + * is one past the last character to encode */ - unfold_header(value, len); + while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80)) + numencode--; + + if (numencode == 0) { + advise(NULL, "Internal error: could not find start of " + "UTF-8 character when base64 encoding header"); + return 1; + } } - default: - advise(NULL, "Internal error: unknown RFC-2047 encoding type"); + + if (writeBase64raw((unsigned char *) p, numencode, + (unsigned char *) q) != OK) { + advise(NULL, "Internal error: base64 encoding of header failed"); + return 1; + } + + p += numencode; + q += base64len(numencode); + + /* + * This will point us at the beginning of the new line (trust me). + */ + + linestart = q + 3; + + /* + * What's going on here? Well, we know we're continuing to the next + * line, so we want to add continuation padding. We also add the + * trailing marker for the RFC 2047 token at this time as well. + * This uses a trick of snprintf(); we tell it to print a zero-length + * string, but pad it out to prefixlen - 1 characters; that ends + * up always printing out the requested number of spaces. We use + * prefixlen - 1 because we always add a space on the starting + * token marker; this makes things work out correctly for the first + * line, which should have a space between the ':' and the start + * of the token. + * + * It's okay if you don't follow all of that. + */ + + q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, ""); + } + + /* + * We're here if there is either no prefix, or we can fit it in less + * than ENCODELINELIMIT characters. Encode the whole thing. 
+ */ + + outlen += prefixlen + 9 + charsetlen + base64len(strlen(p)); + curlen = q - output; + + output = mh_xrealloc(output, outlen); + q = output + curlen; + + q += snprintf(q, outlen - (q - output), "%s=?%s?B?", + prefixlen ? " " : "", charset); + + if (writeBase64raw((unsigned char *) p, strlen(p), + (unsigned char *) q) != OK) { + advise(NULL, "Internal error: base64 encoding of header failed"); return 1; } + strcat(q, "?="); + + if (prefixlen) + strcat(q, "\n"); + + free(*value); + + *value = output; + return 0; } +/* + * Calculate the length, in bytes, of the UTF-8 character starting at p. + * + * If p is at the end of the string, at an ASCII byte, or at a + * continuation byte (the middle of a multibyte character), return 0. + */ + +static int +utf8len(const char *p) +{ + int len = 1; + + if (*p == '\0') + return 0; + + if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80) + return 0; + + p++; + while ((((unsigned char) *p++) & 0xc0) == 0x80) + len++; + + return len; +} + /* * "Unfold" a header, making it a single line (without continuation) * @@ -157,6 +513,9 @@ unfold_header(char **value, int len) /* * When we get a newline, skip to the next non-whitespace * character and add a space to replace all of the whitespace + * + * This has the side effect of stripping off the final newline + * for the header; we put it back in the encoding routine. */ while (is_fws(*q)) q++; @@ -174,3 +533,333 @@ unfold_header(char **value, int len) free(*value); *value = str; } + +/* + * Encode a header containing addresses. This means we have to parse + * each address and only encode the display-name or comment field. 
+ */ + +static int +field_encode_address(const char *name, char **value, int encoding, + const char *charset) +{ + int prefixlen = strlen(name) + 2, column = prefixlen, groupflag; + int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0; + size_t len; + char *mp, *cp = NULL, *output = NULL; + char *tmpbuf = NULL; + size_t tmpbufsize = 0; + struct mailname *mn; + char errbuf[BUFSIZ]; + + /* + * Because these are addresses, we need to handle them individually. + * + * Break them down and process them one by one. This means we have to + * rewrite the whole header, but that's unavoidable. + */ + + /* + * The output headers always have to start with a space first; this + * is just the way the API works right now. + */ + + output = add(" ", output); + + for (groupflag = 0; (mp = getname(*value)); ) { + if ((mn = getm(mp, NULL, 0, errbuf, sizeof(errbuf))) == NULL) { + advise(mp, "%s", errbuf); + errflag++; + continue; + } + + reformat = 0; + + /* + * We only care if the phrase (m_pers) or any trailing comment + * (m_note) have 8-bit characters. If doing q-p, we also need + * to encode anything marked as qspecial(). Unquote it first + * so the specialchars count is right. + */ + + if (! mn->m_pers) + goto check_note; + + if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) { + tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1); + } + + unquote_string(mn->m_pers, tmpbuf); + + if (scanstring(tmpbuf, &asciichars, &eightbitchars, + &specialchars)) { + /* + * If we have 8-bit characters, encode it. + */ + + if (encoding == CE_UNKNOWN) + encoding = pref_encoding(asciichars, specialchars, + eightbitchars); + + /* + * This is okay, because the output of unquote_string will be either + * equal or shorter than the original. 
+ */ + + strcpy(mn->m_pers, tmpbuf); + + switch (encoding) { + + case CE_BASE64: + if (field_encode_base64(NULL, &mn->m_pers, charset)) { + errflag++; + goto out; + } + break; + + case CE_QUOTED: + if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars, + eightbitchars + specialchars, 1)) { + errflag++; + goto out; + } + break; + + default: + advise(NULL, "Internal error: unknown RFC-2047 encoding type"); + errflag++; + goto out; + } + + reformat++; + } + + check_note: + + /* + * The "note" field is generally a comment at the end of the address, + * at least as how it's implemented here. Notes are always surrounded + * by parenthesis (since they're comments). Strip them out and + * then put them back when we format the final field, but they do + * not get encoded. + */ + + if (! mn->m_note) + goto do_reformat; + + if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) { + tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1); + } + + if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') { + advise(NULL, "Internal error: Invalid note field \"%s\"", + mn->m_note); + errflag++; + goto out; + } + + strncpy(tmpbuf, mn->m_note + 1, len - 1); + tmpbuf[len - 2] = '\0'; + + if (scanstring(tmpbuf, &asciichars, &eightbitchars, + &specialchars)) { + /* + * If we have 8-bit characters, encode it. + */ + + if (encoding == CE_UNKNOWN) + encoding = pref_encoding(asciichars, specialchars, + eightbitchars); + + switch (encoding) { + + case CE_BASE64: + if (field_encode_base64(NULL, &tmpbuf, charset)) { + errflag++; + goto out; + } + break; + + case CE_QUOTED: + if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars, + eightbitchars + specialchars, 1)) { + errflag++; + goto out; + } + break; + + default: + advise(NULL, "Internal error: unknown RFC-2047 encoding type"); + errflag++; + goto out; + } + + reformat++; + + /* + * Make sure the size of tmpbuf is correct (it always gets + * reallocated in the above functions). 
+ */ + + tmpbufsize = strlen(tmpbuf) + 1; + + /* + * Put the note field back surrounded by parenthesis. + */ + + mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2); + + snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf); + } + +do_reformat: + + /* + * So, some explanation is in order. + * + * We know we need to rewrite at least one address in the header, + * otherwise we wouldn't be here. If we had to reformat this + * particular address, then run it through adrformat(). Otherwise + * we can use m_text directly. + */ + + /* + * If we were in a group but are no longer, make sure we add a + * semicolon (which needs to be FIRST, as it needs to be at the end + * of the last address). + */ + + if (groupflag && ! mn->m_ingrp) { + output = add(";", output); + column += 1; + } + + groupflag = mn->m_ingrp; + + if (mn->m_gname) { + cp = add(mn->m_gname, NULL); + } + + if (reformat) { + cp = add(adrformat(mn), cp); + } else { + cp = add(mn->m_text, cp); + } + + len = strlen(cp); + + /* + * If we're not at the beginning of the line, add a command and + * either a space or a newline. 
+ */ + + if (column != prefixlen) { + if (len + column + 2 > OUTPUTLINELEN) { + + if ((size_t) (prefixlen + 3) < tmpbufsize) + tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3); + + snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, ""); + output = add(tmpbuf, output); + } else { + output = add(", ", output); + column += 2; + } + } + + /* + * Finally add the address + */ + + output = add(cp, output); + column += len; + free(cp); + cp = NULL; + } + + /* + * Just in case we're at the end of a list + */ + + if (groupflag) { + output = add(";", output); + } + + output = add("\n", output); + + free(*value); + *value = output; + output = NULL; + +out: + + if (tmpbuf) + free(tmpbuf); + if (output) + free(output); + + return errflag > 0; +} + +/* + * Scan a string, check for characters that need to be encoded + */ + +static int +scanstring(const char *string, int *asciilen, int *eightbitchars, + int *specialchars) +{ + *asciilen = 0; + *eightbitchars = 0; + *specialchars = 0; + + for (; *string != '\0'; string++) { + if ((isascii((unsigned char) *string))) { + (*asciilen)++; + /* + * So, a space is not a valid phrase character, but we're counting + * an exception here, because in q-p a space can be directly + * encoded as an underscore. + */ + if (!qphrasevalid((unsigned char) *string) && *string != ' ') + (*specialchars)++; + } else { + (*eightbitchars)++; + } + } + + return *eightbitchars > 0; +} + +/* + * This function is to be used to decide which encoding algorithm we should + * use if one is not given. Basically, we pick whichever one is the shorter + * of the two. + * + * Arguments are: + * + * ascii - Number of ASCII characters in to-be-encoded string. + * specials - Number of ASCII characters in to-be-encoded string that + * still require encoding under quoted-printable. Note that + * these are included in the "ascii" total. + * eightbit - Eight-bit characters in the to-be-encoded string. + * + * Returns one of CE_BASE64 or CE_QUOTED. 
+ */ + +static int +pref_encoding(int ascii, int specials, int eightbits) +{ + /* + * The length of the q-p encoding is: + * + * ascii - specials + (specials + eightbits) * 3. + * + * The length of the base64 encoding is: + * + * base64len(ascii + eightbits) (See macro for details) + */ + + return base64len(ascii + eightbits) < (ascii - specials + + (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED; +}