X-Git-Url: https://diplodocus.org/git/nmh/blobdiff_plain/c05412e6b606d064d54c6d2a13f511cdc34d71f1..a1e07630c435cd9fb50aea6563e5835efdc13e03:/sbr/encode_rfc2047.c

diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c
index 33ed5d87..75b5808f 100644
--- a/sbr/encode_rfc2047.c
+++ b/sbr/encode_rfc2047.c
@@ -1,5 +1,4 @@
-/*
- * Routines to encode message headers using RFC 2047-encoding.
+/* encode_rfc2047.c -- encode message headers using RFC 2047 encoding.
  *
  * This code is Copyright (c) 2002, by the authors of nmh.  See the
  * COPYRIGHT file in the root directory of the nmh distribution for
@@ -8,7 +7,9 @@
 
 #include <h/mh.h>
 #include <h/mhparse.h>
+#include <h/addrsbr.h>
 #include <h/utils.h>
+#include "unquote.h"
 
 /*
  * List of headers that contain addresses and as a result require special
@@ -37,14 +38,25 @@ static char *address_headers[] = {
 
 #define is_fws(c) (c == '\t' || c == ' ' || c == '\n')
 
+#define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
+			 (c >= 'a' && c <= 'z') || \
+			 c == '!' || c == '*' || c == '+' || c == '-' || \
+			 c == '/' || c == '=' || c == '_')
 #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
 
+#define base64len(n) ((((n) + 2) / 3 ) * 4)	/* String len to base64 len */
+#define strbase64(n) ((n) / 4 * 3)		/* Chars that fit in base64 */
+
 #define ENCODELINELIMIT	76
 
 static void unfold_header(char **, int);
 static int field_encode_address(const char *, char **, int, const char *);
-static int field_encode_quoted(const char *, char **, const char *, int, int);
+static int field_encode_quoted(const char *, char **, const char *, int,
+			       int, int);
+static int field_encode_base64(const char *, char **, const char *);
+static int scanstring(const char *, int *, int *, int *);
 static int utf8len(const char *);
+static int pref_encoding(int, int, int);
 
 /*
  * Encode a message header using RFC 2047 encoding.  We make the assumption
@@ -95,7 +107,7 @@ encode_rfc2047(const char *name, char **value, int encoding,
     	charset = write_charset_8bit();
 
     if (strcasecmp(charset, "US-ASCII") == 0) {
-    	advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
+    	inform("Cannot use US-ASCII with 8 bit characters in header");
 	return 1;
     }
 
@@ -113,45 +125,43 @@ encode_rfc2047(const char *name, char **value, int encoding,
      * On the encoding we choose, and the specifics of encoding:
      *
      * - If a specified encoding is passed in, we use that.
-     * - If more than 50% of the characters are high-bit, we use base64
-     *   and encode the whole field as one atom (possibly split).
-     * - Otherwise, we use quoted-printable.
+     * - Otherwise, pick which encoding is shorter.
+     *
+     * We don't quite handle continuation right here, but it should be
+     * pretty close.
      */
 
     if (encoding == CE_UNKNOWN)
-    	encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ?
-						CE_BASE64 : CE_QUOTED;
+        encoding = pref_encoding(asciicount, qpspecialcount, eightbitcount);
 
     unfold_header(value, asciicount + eightbitcount);
 
     switch (encoding) {
 
-#if 0
     case CE_BASE64:
-    	return field_encode_base64(name, value, encoding, charset);
-#endif
+    	return field_encode_base64(name, value, charset);
 
     case CE_QUOTED:
 	return field_encode_quoted(name, value, charset, asciicount,
-				   eightbitcount + qpspecialcount);
+				   eightbitcount + qpspecialcount, 0);
 
     default:
-    	advise(NULL, "Internal error: unknown RFC-2047 encoding type");
+    	inform("Internal error: unknown RFC-2047 encoding type");
 	return 1;
     }
 }
 
 /*
- * Encode our specified header using quoted-printable
+ * Encode our specified header (or field) using quoted-printable
  */
 
 static int
 field_encode_quoted(const char *name, char **value, const char *charset,
-		    int ascii, int encoded)
+		    int ascii, int encoded, int phraserules)
 {
     int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
     int charsetlen = strlen(charset), utf8;
-    char *output = NULL, *p, *q;
+    char *output = NULL, *p, *q = NULL;
 
     /*
      * Right now we just encode the whole thing.  Maybe later on we'll
@@ -219,6 +229,9 @@ field_encode_quoted(const char *name, char **value, const char *charset,
 
 	/*
 	 * Process each character, encoding if necessary
+	 *
+	 * Note that we have a different set of rules if we're processing
+	 * RFC 5322 'phrase' (something you'd see in an address header).
 	 */
 
 	column++;
@@ -226,7 +239,9 @@ field_encode_quoted(const char *name, char **value, const char *charset,
 	if (*p == ' ') {
 	    *q++ = '_';
 	    ascii--;
-	} else if (isascii((int) *p) && !qpspecial((int) *p)) {
+	} else if (isascii((unsigned char) *p) &&
+		   (phraserules ? qphrasevalid((unsigned char) *p) :
+		   			!qpspecial((unsigned char) *p))) {
 	    *q++ = *p;
 	    ascii--;
 	} else {
@@ -266,12 +281,193 @@ field_encode_quoted(const char *name, char **value, const char *charset,
 	     * allow for the encoded output.
 	     */
 	    if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
-	    	newline = 1;
+		newline = 1;
 	    }
 	}
     }
 
-    strcat(q, "?=\n");
+    if (q == NULL) {
+	/* This should never happen, but just in case.  Found by
+	   clang static analyzer. */
+	inform("null output encoding for %s, continuing...", *value);
+	return 1;
+    }
+    *q++ = '?';
+    *q++ = '=';
+
+    if (prefixlen)
+	*q++ = '\n';
+
+    *q = '\0';
+
+    free(*value);
+
+    *value = output;
+
+    return 0;
+}
+
+/*
+ * Encode our specified header (or field) using base64.
+ *
+ * This is a little easier since every character gets encoded, we can
+ * calculate the line wrap up front.
+ */
+
+static int
+field_encode_base64(const char *name, char **value, const char *charset)
+{
+    int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
+    int outlen = 0, numencode, curlen;
+    char *output = NULL, *p = *value, *q = NULL, *linestart = NULL;
+
+    /*
+     * Skip over any leading white space.
+     */
+
+    while (*p == ' ' || *p == '\t')
+    	p++;
+
+    /*
+     * If we had a zero-length prefix, then just encode the whole field
+     * as-is, without line wrapping.  Note that in addition to the encoded
+     * text, we also need room for the RFC 2047 markers.
+     * The added length we need is =? + charset + ?B? ... ?=
+     *
+     * That's 7 + strlen(charset) + 2 (for \n NUL).
+     */
+
+    while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen +
+    			  prefixlen) > ENCODELINELIMIT)) {
+
+	/*
+	 * Our very first time, don't pad the line in the front
+	 *
+	 * Note ENCODELINELIMIT is + 2 because of \n \0
+	 */
+
+
+	if (! output) {
+	    outlen += ENCODELINELIMIT + 2;
+	    output = q = mh_xmalloc(outlen);
+	    linestart = q - prefixlen;	/* Yes, this is intentional */
+	} else {
+	    int curstart = linestart - output;
+	    curlen = q - output;
+
+	    outlen += ENCODELINELIMIT + 2;
+	    output = mh_xrealloc(output, outlen);
+	    q = output + curlen;
+	    linestart = output + curstart;
+	}
+
+	/*
+	 * We should have enough space now, so prepend the encoding markers
+	 * and character set information.  The leading space is intentional.
+	 */
+
+	q += snprintf(q, outlen - (q - output), " =?%s?B?", charset);
+
+	/*
+         * Find out how much room we have left on the line and see how
+         * many characters we can stuff in.  The start of our line
+         * is marked by "linestart", so use that to figure out how
+         * many characters are left out of ENCODELINELIMIT.  Reserve
+         * 2 characters for the end markers and calculate how many
+         * characters we can fit into that space given the base64
+         * encoding expansion.
+	 */
+
+	numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
+
+	if (numencode <= 0) {
+	    inform("Internal error: tried to encode %d characters "
+	    	   "in base64", numencode);
+	    return 1;
+	}
+
+	/*
+	 * RFC 2047 prohibits spanning multibyte characters across tokens.
+	 * Right now we only check for UTF-8.
+	 *
+	 * So note the key here ... we want to make sure the character BEYOND
+	 * our last character is not a continuation byte.  If it's the start
+	 * of a new multibyte character or a single-byte character, that's ok.
+	 */
+
+	if (strcasecmp(charset, "UTF-8") == 0) {
+	    /*
+	     * p points to the start of our current buffer, so p + numencode
+	     * is one past the last character to encode
+	     */
+
+	    while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
+	    	numencode--;
+
+	    if (numencode == 0) {
+	    	inform("Internal error: could not find start of "
+		       "UTF-8 character when base64 encoding header");
+		return 1;
+	    }
+	}
+
+	if (writeBase64raw((unsigned char *) p, numencode,
+			   (unsigned char *) q) != OK) {
+	    inform("Internal error: base64 encoding of header failed");
+	    return 1;
+	}
+
+	p += numencode;
+	q += base64len(numencode);
+
+	/*
+	 * This will point us at the beginning of the new line (trust me).
+	 */
+
+	linestart = q + 3;
+
+	/*
+	 * What's going on here?  Well, we know we're continuing to the next
+	 * line, so we want to add continuation padding.  We also add the
+	 * trailing marker for the RFC 2047 token at this time as well.
+	 * This uses a trick of snprintf(); we tell it to print a zero-length
+	 * string, but pad it out to prefixlen - 1 characters; that ends
+	 * up always printing out the requested number of spaces.  We use
+	 * prefixlen - 1 because we always add a space on the starting
+	 * token marker; this makes things work out correctly for the first
+	 * line, which should have a space between the ':' and the start
+	 * of the token.
+	 *
+	 * It's okay if you don't follow all of that.
+	 */
+
+	q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, "");
+    }
+
+    /*
+     * We're here if there is either no prefix, or we can fit it in less
+     * than ENCODELINELIMIT characters.  Encode the whole thing.
+     */
+
+    outlen += prefixlen + 9 + charsetlen + base64len(strlen(p));
+    curlen = q - output;
+
+    output = mh_xrealloc(output, outlen);
+    q = output + curlen;
+
+    q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
+    		  prefixlen ? " " : "", charset);
+
+    if (writeBase64raw((unsigned char *) p, strlen(p),
+    		       (unsigned char *) q) != OK) {
+	inform("Internal error: base64 encoding of header failed");
+	return 1;
+    }
+
+    strcat(q, "?=");
+
+    if (prefixlen)
+    	strcat(q, "\n");
 
     free(*value);
 
@@ -327,8 +523,8 @@ unfold_header(char **value, int len)
 	     * This has the side effect of stripping off the final newline
 	     * for the header; we put it back in the encoding routine.
 	     */
-	    while (is_fws(*q++))
-	    	;
+	    while (is_fws(*q))
+	    	q++;
 	    if (*q == '\0')
 	    	break;
 
@@ -344,9 +540,329 @@ unfold_header(char **value, int len)
     *value = str;
 }
 
+/*
+ * Encode a header containing addresses.  This means we have to parse
+ * each address and only encode the display-name or comment field.
+ * Returns 0 on success, 1 if any address failed to parse or encode.
+ */
+
 static int
 field_encode_address(const char *name, char **value, int encoding,
 		     const char *charset)
 {
-    return 0;
+    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
+    int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
+    size_t len;
+    char *mp, *cp = NULL, *output = NULL;
+    char *tmpbuf = NULL;
+    size_t tmpbufsize = 0;
+    struct mailname *mn;
+    char errbuf[BUFSIZ];
+
+    /*
+     * Because these are addresses, we need to handle them individually.
+     *
+     * Break them down and process them one by one.  This means we have to
+     * rewrite the whole header, but that's unavoidable.
+     *
+     * The output headers always have to start with a space first; this
+     * is just the way the API works right now.
+     */
+
+    output = add(" ", output);
+
+    for (groupflag = 0; (mp = getname(*value)); ) {
+    	if ((mn = getm(mp, NULL, 0, errbuf, sizeof(errbuf))) == NULL) {
+	    inform("%s: %s", errbuf, mp);
+	    errflag++;
+	    continue;
+	}
+
+	reformat = 0;
+
+	/*
+	 * We only care if the phrase (m_pers) or any trailing comment
+	 * (m_note) have 8-bit characters.  If doing q-p, we also need
+	 * to encode anything that is not qphrasevalid().  Unquote it first
+	 * so the specialchars count is right.
+	 */
+
+	if (! mn->m_pers)
+	    goto check_note;
+
+	if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
+	    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+	}
+
+	unquote_string(mn->m_pers, tmpbuf);
+
+	if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+		       &specialchars)) {
+	    /*
+	     * If we have 8-bit characters, encode it.
+	     */
+
+	    if (encoding == CE_UNKNOWN)
+	    	encoding = pref_encoding(asciichars, specialchars,
+					 eightbitchars);
+
+	    /*
+	     * This is okay, because the output of unquote_string will be either
+	     * equal or shorter than the original.
+	     */
+
+	    strcpy(mn->m_pers, tmpbuf);
+
+	    switch (encoding) {
+
+	    case CE_BASE64:
+	    	if (field_encode_base64(NULL, &mn->m_pers, charset)) {
+		    errflag++;
+		    goto out;
+		}
+		break;
+
+	    case CE_QUOTED:
+	    	if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
+					eightbitchars + specialchars, 1)) {
+		    errflag++;
+		    goto out;
+		}
+		break;
+
+	    default:
+		inform("Internal error: unknown RFC-2047 encoding type");
+		errflag++;
+		goto out;
+	    }
+
+	    reformat++;
+	}
+
+	check_note:
+
+	/*
+	 * The "note" field is generally a comment at the end of the address,
+	 * at least as how it's implemented here.  Notes are always surrounded
+	 * by parentheses (since they're comments).  Strip them out and
+	 * then put them back when we format the final field, but they do
+	 * not get encoded.
+	 */
+
+	if (! mn->m_note)
+	    goto do_reformat;
+
+	if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
+	    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+	}
+
+	if (len < 2 || mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
+	    inform("Internal error: Invalid note field \"%s\"",
+	    	   mn->m_note);
+	    errflag++;
+	    goto out;
+	}
+
+	strncpy(tmpbuf, mn->m_note + 1, len - 1);
+	tmpbuf[len - 2] = '\0';
+
+	if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+		       &specialchars)) {
+	    /*
+	     * If we have 8-bit characters, encode it.
+	     */
+
+	    if (encoding == CE_UNKNOWN)
+	    	encoding = pref_encoding(asciichars, specialchars,
+					 eightbitchars);
+
+	    switch (encoding) {
+
+	    case CE_BASE64:
+	    	if (field_encode_base64(NULL, &tmpbuf, charset)) {
+		    errflag++;
+		    goto out;
+		}
+		break;
+
+	    case CE_QUOTED:
+	    	if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
+					eightbitchars + specialchars, 1)) {
+		    errflag++;
+		    goto out;
+		}
+		break;
+
+	    default:
+		inform("Internal error: unknown RFC-2047 encoding type");
+		errflag++;
+		goto out;
+	    }
+
+	    reformat++;
+
+	    /*
+	     * Make sure the size of tmpbuf is correct (it always gets
+	     * reallocated in the above functions).
+	     */
+
+	    tmpbufsize = strlen(tmpbuf) + 1;
+
+	    /*
+	     * Put the note field back surrounded by parentheses.
+	     */
+
+	    mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
+
+	    snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
+	}
+
+do_reformat:
+
+	/*
+	 * So, some explanation is in order.
+	 *
+	 * We know we need to rewrite at least one address in the header,
+	 * otherwise we wouldn't be here.  If we had to reformat this
+	 * particular address, then run it through adrformat().  Otherwise
+	 * we can use m_text directly.
+	 */
+
+	/*
+	 * If we were in a group but are no longer, make sure we add a
+	 * semicolon (which needs to be FIRST, as it needs to be at the end
+	 * of the last address).
+	 */
+
+	if (groupflag && ! mn->m_ingrp) {
+	    output = add(";", output);
+	    column++;
+	}
+
+	groupflag = mn->m_ingrp;
+
+	if (mn->m_gname) {
+	    cp = mh_xstrdup(mn->m_gname);
+	}
+
+	if (reformat) {
+	    cp = add(adrformat(mn), cp);
+	} else {
+	    cp = add(mn->m_text, cp);
+	}
+
+	len = strlen(cp);
+
+	/*
+	 * If we're not at the beginning of the line, add a comma and
+	 * either a space or a newline.
+	 */
+
+	if (column != prefixlen) {
+	    if (len + column + 2 > OUTPUTLINELEN) {
+
+	    	/* Grow tmpbuf if it cannot hold ",\n" + padding + NUL. */
+	    	if ((size_t) (prefixlen + 3) > tmpbufsize)
+		    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
+
+		snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
+		output = add(tmpbuf, output);
+	    } else {
+	    	output = add(", ", output);
+		column += 2;
+	    }
+	}
+
+	/*
+	 * Finally add the address
+	 */
+
+	output = add(cp, output);
+	column += len;
+	free(cp);
+	cp = NULL;
+    }
+
+    /*
+     * Just in case we're at the end of a list
+     */
+
+    if (groupflag) {
+	output = add(";", output);
+    }
+
+    output = add("\n", output);
+
+    free(*value);
+    *value = output;
+    output = NULL;
+
+out:
+    mh_xfree(tmpbuf);
+    mh_xfree(output);
+
+    return errflag > 0;
+}
+
+/*
+ * Scan a string, check for characters that need to be encoded
+ */
+
+static int
+scanstring(const char *string, int *asciilen, int *eightbitchars,
+	   int *specialchars)
+{
+    const char *s;
+    int plain = 0, high = 0, special = 0;
+
+    /* Classify every byte up to the terminating NUL. */
+    for (s = string; *s != '\0'; s++) {
+	unsigned char ch = (unsigned char) *s;
+	if (!isascii(ch)) {
+	    high++;
+	    continue;
+	}
+	plain++;
+	/* A space is exempt: q-p can encode it directly as an underscore. */
+	if (ch != ' ' && !qphrasevalid(ch))
+	    special++;
+    }
+
+    /* Publish the tallies; nonzero return means 8-bit data was seen. */
+    *asciilen = plain;
+    *eightbitchars = high;
+    *specialchars = special;
+    return high > 0;
+}
+
+/*
+ * This function is to be used to decide which encoding algorithm we should
+ * use if one is not given.  Basically, we pick whichever one is the shorter
+ * of the two.
+ *
+ * Arguments are:
+ *
+ * ascii	- Number of ASCII characters in to-be-encoded string.
+ * specials	- Number of ASCII characters in to-be-encoded string that
+ *		  still require encoding under quoted-printable.  Note that
+ *		  these are included in the "ascii" total.
+ * eightbits	- Eight-bit characters in the to-be-encoded string.
+ *
+ * Returns one of CE_BASE64 or CE_QUOTED.
+ */
+
+static int
+pref_encoding(int ascii, int specials, int eightbits)
+{
+    /*
+     * Quoted-printable: one byte per ASCII character that is not special,
+     * three bytes for every special or eight-bit character.
+     */
+    int qplen = (ascii - specials) + 3 * (specials + eightbits);
+
+    /* Base64: every character is encoded (see the base64len macro). */
+    int b64len = base64len(ascii + eightbits);
+
+    if (b64len < qplen)
+	return CE_BASE64;
+    return CE_QUOTED;
 }