From: Ken Hornstein <kenh@pobox.com>
Date: Tue, 29 Oct 2013 19:58:35 +0000 (-0400)
Subject: A hopefully-functional quoted-printable encoder
X-Git-Url: https://diplodocus.org/git/nmh/commitdiff_plain/9647352681e9508c01c8fcd94f188c8e0ab7e284?hp=f63b238c3fa2af9db08be8ec72e0e62e6a3842ad

A hopefully-functional quoted-printable encoder
---

diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c
index 0f040bc9..28b592f0 100644
--- a/sbr/encode_rfc2047.c
+++ b/sbr/encode_rfc2047.c
@@ -37,8 +37,15 @@ static char *address_headers[] = {
 
 #define is_fws(c) (c == '\t' || c == ' ')
 
+/* Bytes needing =XX in a "Q" encoded-word; evaluates c more than once */
+#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')
+#define ENCODELINELIMIT	76
+
 static void unfold_header(char **, int);
 static int field_encode_address(const char *, char **, int, const char *);
+static int field_encode_quoted(const char *, char **, int, const char *,
+			       int, int);
+static int utf8len(const char *);
 
 /*
  * Encode a message header using RFC 2047 encoding.  We make the assumption
@@ -50,7 +57,7 @@ int
 encode_rfc2047(const char *name, char **value, int encoding,
 	       const char *charset)
 {
-    int i, count = 0, len;
+    int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
     char *p;
 
     /*
@@ -58,11 +65,15 @@ encode_rfc2047(const char *name, char **value, int encoding,
      */
 
     for (p = *value; *p != '\0'; p++) {
-	if (! isascii((int) *p))
-	count++;
+	if (isascii((int) *p)) {
+	    asciicount++;
+	    if (qpspecial(*p))
+	    	qpspecialcount++;
+	} else
+	    eightbitcount++;
     }
 
-    if (count == 0)
+    if (eightbitcount == 0)
     	return 0;
 
     /*
@@ -105,40 +116,179 @@ encode_rfc2047(const char *name, char **value, int encoding,
      * - If a specified encoding is passed in, we use that.
      * - If more than 50% of the characters are high-bit, we use base64
      *   and encode the whole field as one atom (possibly split).
-     *   Otherwise, we use quoted-printable.
-     * - If more than 10% of the characters are high-bit, then we encode
-     *   the entire header as one (possibly split) atom.  Otherwise,
-     *   take each atom as they come and encode it on a per-atom basis.
+     * - Otherwise, we use quoted-printable.
      */
 
-    len = strlen(*value);
-
     if (encoding == CE_UNKNOWN)
-    	encoding = (count * 10 / len > 5) ? CE_BASE64 : CE_QUOTED;
+    	encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ?
+						CE_BASE64 : CE_QUOTED;
+
+    unfold_header(value, asciicount + eightbitcount);
 
     switch (encoding) {
 
+#if 0
     case CE_BASE64:
-    	return field_encode_base64(value, charset, len, NULL);
+    	return field_encode_base64(name, value, encoding, charset);
+#endif
 
     case CE_QUOTED:
-    	if (count * 100 / len > 10) {
-	    return field_encode_quoted(value, charset, len, NULL);
-	} else {
-	    /*
-	     * Break it down by atoms.
-	     */
+	return field_encode_quoted(name, value, encoding, charset, asciicount,
+				   eightbitcount + qpspecialcount);
 
-	    unfold_header(value, len);
-	}
     default:
     	advise(NULL, "Internal error: unknown RFC-2047 encoding type");
 	return 1;
     }
+}
+
+/*
+ * Encode a whole header value as RFC 2047 "Q" encoded-words, replacing
+ * *value (which is freed) with a newly allocated string.
+ *
+ * name     - header name, used only for its length; if NULL the output
+ *            is never wrapped onto continuation lines
+ * encoding - currently unused
+ * charset  - charset label written into every encoded-word
+ * ascii    - count of ASCII bytes in *value
+ * encoded  - count of bytes in *value that get written as =XX
+ *
+ * ascii and encoded are only used to size the output buffer.  Returns 0.
+ */
+
+static int
+field_encode_quoted(const char *name, char **value, int encoding,
+		    const char *charset, int ascii, int encoded)
+{
+    int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column = 0;
+    int charsetlen = strlen(charset), utf8, newline = 1;
+    char *output = NULL, *p = *value, *q = NULL;
+
+    utf8 = strcasecmp(charset, "UTF-8") == 0;
+
+    while (*p != '\0') {
+    	/*
+	 * Start a new line, if it's time
+	 */
+    	if (newline) {
+	    /*
+	     * Each encoded-word costs "=?charset?Q?" plus "?=" (7 +
+	     * strlen(charset)) plus 1 for the NUL, and the remaining input
+	     * needs 1 byte per plain character and 3 per =XX escape.
+	     * This over-allocates on continuation lines, which is harmless.
+	     */
+
+	    outlen += 8 + charsetlen + ascii + 3 * encoded;
+	    if (output) {
+	        int curlen = q - output, i;
+		outlen += prefixlen + 1;	/* "\n" plus indent */
+		output = mh_xrealloc(output, outlen);
+		q = output + curlen;
+		*q++ = '?';
+		*q++ = '=';
+		*q++ = '\n';
+		for (i = 0; i < prefixlen; i++)
+		    *q++ = ' ';
+	    } else {
+	    	output = mh_xmalloc(outlen);
+		q = output;
+	    }
+
+	    q += snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
+	    column = prefixlen + 5 + charsetlen;	/* + "=?charset?Q?" */
+	    newline = 0;
+	}
+
+	/*
+	 * Process each character, encoding if necessary.  The high-bit
+	 * test is explicit so it works whether or not char is signed.
+	 */
+
+	column++;
+
+	if (*p == ' ') {
+	    *q++ = '_';
+	    ascii--;
+	} else if (isascii((unsigned char) *p) && !qpspecial(*p)) {
+	    *q++ = *p;
+	    ascii--;
+	} else {
+	    /* Cast via unsigned char so high-bit bytes print as two digits */
+	    snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
+	    q += 3;			/* '=' plus two hex digits */
+	    column += 2;
+	    encoded--;
+	}
+
+	p++;
+
+	/*
+	 * We're not allowed more than ENCODELINELIMIT characters per line,
+	 * so reserve some room for the final ?=.
+	 *
+	 * If prefixlen == 0, we haven't been passed in a header name, so
+	 * don't ever wrap the field (we're likely doing an address).
+	 */
+
+	if (prefixlen == 0)
+	    continue;
+
+	if (column >= ENCODELINELIMIT - 2) {
+	    newline = 1;
+	} else if (utf8) {
+	    /*
+	     * RFC 2047 prohibits the splitting of multibyte characters
+	     * across encoded words.  Right now we only handle the case
+	     * of UTF-8, the most common multibyte encoding.
+	     *
+	     * p now points at the next input character.  Every byte of a
+	     * multibyte character is written as =XX (3 columns), so if the
+	     * complete character would not fit, trigger a newline now.
+	     */
+	    if (column + 3 * utf8len(p) > ENCODELINELIMIT - 2) {
+	    	newline = 1;
+	    }
+	}
+    }
+
+    if (output == NULL)
+	return 0;		/* Empty input: leave *value untouched */
+
+    /* q is not NUL-terminated here, so copy rather than strcat() */
+    strcpy(q, "?=");
+
+    free(*value);
+
+    *value = output;
 
     return 0;
 }
 
+/*
+ * Return the byte length of the UTF-8 character starting at p.
+ *
+ * Yields 0 at the end of the string, and 1 for an ASCII byte or for a
+ * continuation byte (p points into the middle of a multibyte character).
+ */
+
+static int
+utf8len(const char *p)
+{
+    const unsigned char *s = (const unsigned char *) p;
+    int n;
+
+    if (*s == '\0')
+	return 0;
+    if (isascii((int) *p) || (*s & 0xc0) == 0x80)
+	return 1;
+
+    /* Lead byte: count it plus every continuation byte that follows */
+    for (n = 1; (s[n] & 0xc0) == 0x80; n++)
+	continue;
+
+    return n;
+}
+
 /*
  * "Unfold" a header, making it a single line (without continuation)
  *
@@ -174,3 +324,10 @@ unfold_header(char **value, int len)
     free(*value);
     *value = str;
 }
+
+static int
+field_encode_address(const char *name, char **value, int encoding,
+		     const char *charset)
+{
+    return 0;	/* Stub: ignores every argument, leaves *value alone */
+}