#include <h/mh.h>
#include <h/mhparse.h>
+#include <h/addrsbr.h>
#include <h/utils.h>
/*
* Macros we use for parsing headers
*/
-#define is_fws(c) (c == '\t' || c == ' ')
+/*
+ * Folding white space: tab, space, or newline.
+ *
+ * Note for all of the character macros below: the argument is evaluated
+ * more than once, so do not pass an expression with side effects.
+ */
+#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')
-#define qpspecial(c) (c < ' ' || c == '=' && c == '?' && c == '_')
+
+/*
+ * True if the character may be copied LITERALLY into a Q-encoded word
+ * that appears inside a 'phrase' (e.g. the display name of an address).
+ *
+ * RFC 2047 section 5 (3) restricts the characters of such an encoded-word
+ * to letters, digits, "!", "*", "+", "-", "/", "=" and "_".  The last two
+ * are only permitted there as the escape introducer and as the
+ * representation of a space; a literal '=' or '_' in the input still has
+ * to be written as =3D / =5F, so they are deliberately NOT listed here.
+ */
+#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
+                         ((c) >= 'A' && (c) <= 'Z') || \
+                         ((c) >= 'a' && (c) <= 'z') || \
+                         (c) == '!' || (c) == '*' || (c) == '+' || \
+                         (c) == '-' || (c) == '/')
+
+/*
+ * True if an ASCII character must be hex-encoded in an unstructured
+ * (non-phrase) Q-encoded word: control characters, '=', '?' and '_'.
+ */
+#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')
+
+#define base64len(n) ((((n) + 2) / 3 ) * 4) /* String len to base64 len */
+#define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */
#define ENCODELINELIMIT 76
static void unfold_header(char **, int);
static int field_encode_address(const char *, char **, int, const char *);
-static int field_encode_quoted(const char *, char **, const char *, int, int);
+static int field_encode_quoted(const char *, char **, const char *, int,
+ int, int);
+static int field_encode_base64(const char *, char **, const char *);
+static int scanstring(const char *, int *, int *, int *);
static int utf8len(const char *);
+static int pref_encoding(int, int, int);
/*
* Encode a message header using RFC 2047 encoding. We make the assumption
*/
for (p = *value; *p != '\0'; p++) {
- if (isascii((int) *p)) {
+ if (isascii((unsigned char) *p)) {
asciicount++;
- if (qpspecial(*p))
+ if (qpspecial((unsigned char) *p))
qpspecialcount++;
} else
eightbitcount++;
* On the encoding we choose, and the specifics of encoding:
*
* - If a specified encoding is passed in, we use that.
- * - If more than 50% of the characters are high-bit, we use base64
- * and encode the whole field as one atom (possibly split).
- * - Otherwise, we use quoted-printable.
+ * - Otherwise, pick which encoding is shorter.
+ *
+ * We don't quite handle continuation right here, but it should be
+ * pretty close.
*/
if (encoding == CE_UNKNOWN)
- encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ?
- CE_BASE64 : CE_QUOTED;
+ encoding = pref_encoding(asciicount, qpspecialcount, eightbitcount);
unfold_header(value, asciicount + eightbitcount);
switch (encoding) {
-#if 0
case CE_BASE64:
- return field_encode_base64(name, value, encoding, charset);
-#endif
+ return field_encode_base64(name, value, charset);
case CE_QUOTED:
return field_encode_quoted(name, value, charset, asciicount,
- eightbitcount + qpspecialcount);
+ eightbitcount + qpspecialcount, 0);
default:
advise(NULL, "Internal error: unknown RFC-2047 encoding type");
}
/*
- * Encode our specified header using quoted-printable
+ * Encode our specified header (or field) using quoted-printable
*/
static int
field_encode_quoted(const char *name, char **value, const char *charset,
- int ascii, int encoded)
+ int ascii, int encoded, int phraserules)
{
int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
int charsetlen = strlen(charset), utf8;
- char *output = NULL, *p, *q;
+ char *output = NULL, *p, *q = NULL;
/*
* Right now we just encode the whole thing. Maybe later on we'll
* If it's the start of the header, we don't need to pad it
*
* The length of the output string is ...
- * =?charset?Q?...?= so that's 7+strlen(charset) + 1 for NUL
+ * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL
*
* plus 1 for every ASCII character and 3 for every eight bit
* or special character (eight bit characters are written as =XX).
*
*/
- outlen += 8 + charsetlen + ascii + 3 * encoded;
+ int tokenlen;
+
+ outlen += 9 + charsetlen + ascii + 3 * encoded;
+
+ /*
+ * If output is set, then we're continuing the header. Otherwise
+ * do the initial allocation.
+ */
+
if (output) {
int curlen = q - output, i;
outlen += prefixlen + 1; /* Header plus \n ": " */
for (i = 0; i < prefixlen; i++)
*q++ = ' ';
} else {
+ /*
+ * A bit of a hack here; the header can contain multiple
+ * spaces (probably at least one) until we get to the
+ * actual text. Copy until we get to a non-space.
+ */
output = mh_xmalloc(outlen);
q = output;
+ while (is_fws(*p))
+ *q++ = *p++;
}
- q += snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
- column = prefixlen;
+ tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
+ q += tokenlen;
+ column = prefixlen + tokenlen;
newline = 0;
}
/*
* Process each character, encoding if necessary
+ *
+ * Note that we have a different set of rules if we're processing
+ * RFC 5322 'phrase' (something you'd see in an address header).
*/
column++;
if (*p == ' ') {
*q++ = '_';
ascii--;
- } else if (!qpspecial(*p)) {
+ } else if (isascii((unsigned char) *p) &&
+ (phraserules ? qphrasevalid((unsigned char) *p) :
+ !qpspecial((unsigned char) *p))) {
*q++ = *p;
ascii--;
} else {
- snprintf(q, outlen - (q - output), "=%02X", (unsigned int) *p);
+ snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
q += 3;
- column += 3;
+ column += 2; /* column already incremented by 1 above */
encoded--;
}
}
}
+ *q++ = '?';
+ *q++ = '=';
+
+ if (prefixlen)
+ *q++ = '\n';
+
+ *q = '\0';
+
+ free(*value);
+
+ *value = output;
+
+ return 0;
+}
+
+/*
+ * Encode our specified header (or field) using base64.
+ *
+ * This is a little easier since every character gets encoded, we can
+ * calculate the line wrap up front.
+ */
+
+static int
+field_encode_base64(const char *name, char **value, const char *charset)
+{
+ int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
+ int outlen = 0, numencode, curlen;
+ char *output = NULL, *p = *value, *q = NULL, *linestart = NULL;
+
+ /*
+ * Skip over any leading white space.
+ */
+
+ while (*p == ' ' || *p == '\t')
+ p++;
+
+ /*
+ * If we had a zero-length prefix, then just encode the whole field
+ * as-is, without line wrapping. Note that in addition to the encoded
+ * text itself we need room for the RFC 2047 encoded-word markers.
+ *
+ * The added length we need is =? + charset + ?B? ... ?=
+ *
+ * That's 7 + strlen(charset) + 2 (for \n NUL).
+ */
+
+ while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen +
+ prefixlen) > ENCODELINELIMIT)) {
+
+ /*
+ * Our very first time, don't pad the line in the front
+ *
+ * Note ENCODELINELIMIT is + 2 because of \n \0
+ */
+
+
+ if (! output) {
+ outlen += ENCODELINELIMIT + 2;
+ output = q = mh_xmalloc(outlen);
+ linestart = q - prefixlen; /* Yes, this is intentional */
+ } else {
+ int curstart = linestart - output;
+ curlen = q - output;
+
+ outlen += ENCODELINELIMIT + 2;
+ output = mh_xrealloc(output, outlen);
+ q = output + curlen;
+ linestart = output + curstart;
+ }
+
+ /*
+ * We should have enough space now, so prepend the encoding markers
+ * and character set information. The leading space is intentional.
+ */
+
+ q += snprintf(q, outlen - (q - output), " =?%s?B?", charset);
+
+ /*
+ * Find out how much room we have left on the line and see how
+ * many characters we can stuff in. The start of our line
+ * is marked by "linestart", so use that to figure out how
+ * many characters are left out of ENCODELINELIMIT. Reserve
+ * 2 characters for the end markers and calculate how many
+ * characters we can fit into that space given the base64
+ * encoding expansion.
+ */
+
+ numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
+
+ if (numencode <= 0) {
+ advise(NULL, "Internal error: tried to encode %d characters "
+ "in base64", numencode);
+ return 1;
+ }
+
+ /*
+ * RFC 2047 prohibits spanning multibyte characters across tokens.
+ * Right now we only check for UTF-8.
+ *
+ * So note the key here ... we want to make sure the character BEYOND
+ * our last character is not a continuation byte. If it's the start
+ * of a new multibyte character or a single-byte character, that's ok.
+ */
+
+ if (strcasecmp(charset, "UTF-8") == 0) {
+ /*
+ * p points to the start of our current buffer, so p + numencode
+ * is one past the last character to encode
+ */
+
+ while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
+ numencode--;
+
+ if (numencode == 0) {
+ advise(NULL, "Internal error: could not find start of "
+ "UTF-8 character when base64 encoding header");
+ return 1;
+ }
+ }
+
+ if (writeBase64raw((unsigned char *) p, numencode,
+ (unsigned char *) q) != OK) {
+ advise(NULL, "Internal error: base64 encoding of header failed");
+ return 1;
+ }
+
+ p += numencode;
+ q += base64len(numencode);
+
+ /*
+ * This will point us at the beginning of the new line (trust me).
+ */
+
+ linestart = q + 3;
+
+ /*
+ * What's going on here? Well, we know we're continuing to the next
+ * line, so we want to add continuation padding. We also add the
+ * trailing marker for the RFC 2047 token at this time as well.
+ * This uses a trick of snprintf(); we tell it to print a zero-length
+ * string, but pad it out to prefixlen - 1 characters; that ends
+ * up always printing out the requested number of spaces. We use
+ * prefixlen - 1 because we always add a space on the starting
+ * token marker; this makes things work out correctly for the first
+ * line, which should have a space between the ':' and the start
+ * of the token.
+ *
+ * It's okay if you don't follow all of that.
+ */
+
+ q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, "");
+ }
+
+ /*
+ * We're here if there is either no prefix, or we can fit it in less
+ * than ENCODELINELIMIT characters. Encode the whole thing.
+ */
+
+ outlen += prefixlen + 9 + charsetlen + base64len(strlen(p));
+ curlen = q - output;
+
+ output = mh_xrealloc(output, outlen);
+ q = output + curlen;
+
+ q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
+ prefixlen ? " " : "", charset);
+
+ if (writeBase64raw((unsigned char *) p, strlen(p),
+ (unsigned char *) q) != OK) {
+ advise(NULL, "Internal error: base64 encoding of header failed");
+ return 1;
+ }
+
strcat(q, "?=");
+ if (prefixlen)
+ strcat(q, "\n");
+
free(*value);
*value = output;
if (*p == '\0')
return 0;
- if (isascii((int) *p) || (*((unsigned char *) p) & 0xc0) == 0x80)
+ if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
return 0;
p++;
- while ((*((unsigned char *) p++) & 0xc0) == 0x80)
+ while ((((unsigned char) *p++) & 0xc0) == 0x80)
len++;
return len;
/*
* When we get a newline, skip to the next non-whitespace
* character and add a space to replace all of the whitespace
+ *
+ * This has the side effect of stripping off the final newline
+ * for the header; we put it back in the encoding routine.
*/
while (is_fws(*q))
q++;
*value = str;
}
+/*
+ * Encode a header containing addresses.  Each address is parsed
+ * individually and only its display-name (m_pers) and trailing comment
+ * (m_note) are RFC 2047 encoded; everything else is copied through.
+ *
+ * Arguments are:
+ *
+ * name     - Header field name.  Only its length is used, to compute the
+ *            indent of continuation lines (strlen(name) + 2).
+ * value    - In/out header body.  When the loop over the addresses runs
+ *            to completion the old string is freed and replaced by the
+ *            newly built one.
+ * encoding - CE_BASE64, CE_QUOTED, or CE_UNKNOWN to pick the shorter of
+ *            the two.  Once a choice has been made for one field it is
+ *            kept for the remainder of the header.
+ * charset  - Character set name placed in each encoded-word.
+ *
+ * Returns 0 on success and 1 on error.  An address that cannot be parsed
+ * is reported and left out of the rewritten header; an encoding failure
+ * aborts immediately and leaves *value untouched.
+ */
+
static int
field_encode_address(const char *name, char **value, int encoding,
const char *charset)
{
-return 0;
+    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
+    int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
+    size_t len;
+    char *mp, *cp = NULL, *output = NULL;
+    char *tmpbuf = NULL;
+    size_t tmpbufsize = 0;
+    struct mailname *mn;
+    char errbuf[BUFSIZ];
+
+    /*
+     * Because these are addresses, we need to handle them individually.
+     *
+     * Break them down and process them one by one.  This means we have to
+     * rewrite the whole header, but that's unavoidable.
+     */
+
+    /*
+     * The output headers always have to start with a space first; this
+     * is just the way the API works right now.
+     */
+
+    output = add(" ", output);
+
+    for (groupflag = 0; (mp = getname(*value)); ) {
+        if ((mn = getm(mp, NULL, 0, errbuf, sizeof(errbuf))) == NULL) {
+            advise(NULL, "%s: %s", errbuf, mp);
+            errflag++;
+            continue;
+        }
+
+        reformat = 0;
+
+        /*
+         * We only care if the phrase (m_pers) or any trailing comment
+         * (m_note) have 8-bit characters.  If doing q-p, we also need
+         * to encode anything that is not qphrasevalid().  Unquote it
+         * first so the specialchars count is right.
+         */
+
+        if (! mn->m_pers)
+            goto check_note;
+
+        if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
+            tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+        }
+
+        unquote_string(mn->m_pers, tmpbuf);
+
+        if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+                       &specialchars)) {
+            /*
+             * If we have 8-bit characters, encode it.
+             */
+
+            if (encoding == CE_UNKNOWN)
+                encoding = pref_encoding(asciichars, specialchars,
+                                         eightbitchars);
+
+            /*
+             * This is okay, because the output of unquote_string will be
+             * either equal or shorter than the original.
+             */
+
+            strcpy(mn->m_pers, tmpbuf);
+
+            switch (encoding) {
+
+            case CE_BASE64:
+                if (field_encode_base64(NULL, &mn->m_pers, charset)) {
+                    errflag++;
+                    goto out;
+                }
+                break;
+
+            case CE_QUOTED:
+                if (field_encode_quoted(NULL, &mn->m_pers, charset,
+                                        asciichars,
+                                        eightbitchars + specialchars, 1)) {
+                    errflag++;
+                    goto out;
+                }
+                break;
+
+            default:
+                advise(NULL, "Internal error: unknown RFC-2047 encoding type");
+                errflag++;
+                goto out;
+            }
+
+            reformat++;
+        }
+
+check_note:
+
+        /*
+         * The "note" field is generally a comment at the end of the
+         * address, at least as how it's implemented here.  Notes are
+         * always surrounded by parentheses (since they're comments).
+         * Strip them out and then put them back when we format the final
+         * field; the parentheses themselves do not get encoded.
+         */
+
+        if (! mn->m_note)
+            goto do_reformat;
+
+        if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
+            tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+        }
+
+        /*
+         * The first test fails on an empty string, so past this check
+         * len is at least 2 and "len - 2" below cannot underflow.
+         */
+
+        if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
+            advise(NULL, "Internal error: Invalid note field \"%s\"",
+                   mn->m_note);
+            errflag++;
+            goto out;
+        }
+
+        /* Copy only the text between the parentheses */
+        strncpy(tmpbuf, mn->m_note + 1, len - 2);
+        tmpbuf[len - 2] = '\0';
+
+        if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+                       &specialchars)) {
+            /*
+             * If we have 8-bit characters, encode it.
+             */
+
+            if (encoding == CE_UNKNOWN)
+                encoding = pref_encoding(asciichars, specialchars,
+                                         eightbitchars);
+
+            switch (encoding) {
+
+            case CE_BASE64:
+                if (field_encode_base64(NULL, &tmpbuf, charset)) {
+                    errflag++;
+                    goto out;
+                }
+                break;
+
+            case CE_QUOTED:
+                if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
+                                        eightbitchars + specialchars, 1)) {
+                    errflag++;
+                    goto out;
+                }
+                break;
+
+            default:
+                advise(NULL, "Internal error: unknown RFC-2047 encoding type");
+                errflag++;
+                goto out;
+            }
+
+            reformat++;
+
+            /*
+             * Make sure the size of tmpbuf is correct (it always gets
+             * reallocated in the above functions).
+             */
+
+            tmpbufsize = strlen(tmpbuf) + 1;
+
+            /*
+             * Put the note field back surrounded by parentheses.
+             */
+
+            mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
+
+            snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
+        }
+
+do_reformat:
+
+        /*
+         * So, some explanation is in order.
+         *
+         * We know we need to rewrite at least one address in the header,
+         * otherwise we wouldn't be here.  If we had to reformat this
+         * particular address, then run it through adrformat().  Otherwise
+         * we can use m_text directly.
+         */
+
+        /*
+         * If we were in a group but are no longer, make sure we add a
+         * semicolon (which needs to be FIRST, as it needs to be at the end
+         * of the last address).
+         */
+
+        if (groupflag && ! mn->m_ingrp) {
+            output = add(";", output);
+            column += 1;
+        }
+
+        groupflag = mn->m_ingrp;
+
+        if (mn->m_gname) {
+            cp = add(mn->m_gname, NULL);
+        }
+
+        if (reformat) {
+            cp = add(adrformat(mn), cp);
+        } else {
+            cp = add(mn->m_text, cp);
+        }
+
+        len = strlen(cp);
+
+        /*
+         * If we're not at the beginning of the line, add a comma and
+         * either a space or a newline.
+         */
+
+        if (column != prefixlen) {
+            if (len + column + 2 > OUTPUTLINELEN) {
+
+                /*
+                 * The separator is ",\n" followed by prefixlen spaces and
+                 * the terminating NUL, i.e. prefixlen + 3 bytes.  Grow
+                 * tmpbuf when it is smaller than that (it may still be
+                 * NULL if no address so far had a name or a note).
+                 */
+
+                if ((size_t) (prefixlen + 3) > tmpbufsize)
+                    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
+
+                column = prefixlen;
+                snprintf(tmpbuf, tmpbufsize, ",\n%*s", prefixlen, "");
+                output = add(tmpbuf, output);
+            } else {
+                output = add(", ", output);
+                column += 2;
+            }
+        }
+
+        /*
+         * Finally add the address
+         */
+
+        output = add(cp, output);
+        column += len;
+        free(cp);
+        cp = NULL;
+
+        /*
+         * NOTE(review): the structure returned by getm() is never
+         * released, here or on the "goto out" paths above.  Confirm
+         * which routine is meant to free a struct mailname and call it.
+         */
+    }
+
+    /*
+     * Just in case we're at the end of a list
+     */
+
+    if (groupflag) {
+        output = add(";", output);
+    }
+
+    output = add("\n", output);
+
+    free(*value);
+    *value = output;
+    output = NULL;
+
+out:
+
+    /* free(NULL) is a no-op, so no guards are needed */
+    free(tmpbuf);
+    free(output);
+
+    return errflag > 0;
+}
+
+/*
+ * Classify every byte of a NUL-terminated string so the caller can decide
+ * whether, and how, it has to be encoded.
+ *
+ * On return:
+ *
+ * *asciilen      - Number of ASCII bytes (the special ones included).
+ * *eightbitchars - Number of non-ASCII bytes.
+ * *specialchars  - Number of ASCII bytes, other than space, for which
+ *                  qphrasevalid() is false.
+ *
+ * Returns nonzero if at least one non-ASCII byte was found.
+ */
+
+static int
+scanstring(const char *string, int *asciilen, int *eightbitchars,
+           int *specialchars)
+{
+    const unsigned char *cursor = (const unsigned char *) string;
+
+    *asciilen = 0;
+    *eightbitchars = 0;
+    *specialchars = 0;
+
+    while (*cursor != '\0') {
+        unsigned char ch = *cursor++;
+
+        if (!isascii(ch)) {
+            ++*eightbitchars;
+            continue;
+        }
+
+        ++*asciilen;
+
+        /*
+         * A space is not a valid phrase character, but it is exempted
+         * from the specials count because q-p can write it directly as
+         * an underscore.
+         */
+        if (ch != ' ' && !qphrasevalid(ch))
+            ++*specialchars;
+    }
+
+    return *eightbitchars > 0;
+}
+
+/*
+ * Choose an encoding when the caller did not request one: whichever of
+ * base64 and quoted-printable produces the shorter encoded text.
+ *
+ * Arguments are:
+ *
+ * ascii     - Number of ASCII characters in the to-be-encoded string.
+ * specials  - How many of those ASCII characters still need encoding
+ *             under quoted-printable.  These are included in "ascii".
+ * eightbits - Number of eight-bit characters in the to-be-encoded string.
+ *
+ * Returns CE_BASE64 when base64 is strictly shorter, otherwise CE_QUOTED.
+ */
+
+static int
+pref_encoding(int ascii, int specials, int eightbits)
+{
+    /*
+     * q-p: one output character for each plain ASCII character and
+     * three (=XX) for every special or eight-bit character.
+     */
+    int qp_length = ascii - specials + (specials + eightbits) * 3;
+
+    /* base64: every input character is encoded; see base64len() */
+    int b64_length = base64len(ascii + eightbits);
+
+    if (b64_length < qp_length)
+        return CE_BASE64;
+
+    return CE_QUOTED;
}