Fix up the unquote test, with help from Lyndon and Ralph.

[nmh] / sbr / encode_rfc2047.c
diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c

index fcb1c10457921825cd18da2d0d0415413a28f103..04a74f1af878a883dddad4b7e8e6e5e82547c905 100644 (file)
--- a/sbr/encode_rfc2047.c
+++ b/sbr/encode_rfc2047.c
@@ -8,6 +8,7 @@
  
  #include <h/mh.h>
  #include <h/mhparse.h>
  
  #include <h/mh.h>
  #include <h/mhparse.h>
+#include <h/addrsbr.h>
  #include <h/utils.h>
  
  /*
  #include <h/utils.h>
  
  /*
@@ -43,15 +44,17 @@ static char *address_headers[] = {
                          c == '/' || c == '=' || c == '_')
  #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
  
                          c == '/' || c == '=' || c == '_')
  #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
  
-#define base64len(n) (((n + 2) / 3 ) * 4)      /* String len to base64 len */
-#define strbase64(n) (n * 3 / 4)               /* Chars that fit in base64 */
+#define base64len(n) ((((n) + 2) / 3 ) * 4)    /* String len to base64 len */
+#define strbase64(n) ((n) / 4 * 3)             /* Chars that fit in base64 */
  
  #define ENCODELINELIMIT        76
  
  static void unfold_header(char **, int);
  static int field_encode_address(const char *, char **, int, const char *);
  
  #define ENCODELINELIMIT        76
  
  static void unfold_header(char **, int);
  static int field_encode_address(const char *, char **, int, const char *);
-static int field_encode_quoted(const char *, char **, const char *, int, int);
+static int field_encode_quoted(const char *, char **, const char *, int,
+                              int, int);
  static int field_encode_base64(const char *, char **, const char *);
  static int field_encode_base64(const char *, char **, const char *);
+static int scanstring(const char *, int *, int *, int *);
  static int utf8len(const char *);
  
  /*
  static int utf8len(const char *);
  
  /*
@@ -139,7 +142,7 @@ encode_rfc2047(const char *name, char **value, int encoding,
  
      case CE_QUOTED:
         return field_encode_quoted(name, value, charset, asciicount,
  
      case CE_QUOTED:
         return field_encode_quoted(name, value, charset, asciicount,
-                                  eightbitcount + qpspecialcount);
+                                  eightbitcount + qpspecialcount, 0);
  
      default:
         advise(NULL, "Internal error: unknown RFC-2047 encoding type");
  
      default:
         advise(NULL, "Internal error: unknown RFC-2047 encoding type");
@@ -153,7 +156,7 @@ encode_rfc2047(const char *name, char **value, int encoding,
  
  static int
  field_encode_quoted(const char *name, char **value, const char *charset,
  
  static int
  field_encode_quoted(const char *name, char **value, const char *charset,
-                   int ascii, int encoded)
+                   int ascii, int encoded, int phraserules)
  {
      int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
      int charsetlen = strlen(charset), utf8;
  {
      int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
      int charsetlen = strlen(charset), utf8;
@@ -225,6 +228,9 @@ field_encode_quoted(const char *name, char **value, const char *charset,
  
         /*
          * Process each character, encoding if necessary
  
         /*
          * Process each character, encoding if necessary
+        *
+        * Note that we have a different set of rules if we're processing
+        * RFC 5322 'phrase' (something you'd see in an address header).
          */
  
         column++;
          */
  
         column++;
@@ -232,7 +238,9 @@ field_encode_quoted(const char *name, char **value, const char *charset,
         if (*p == ' ') {
             *q++ = '_';
             ascii--;
         if (*p == ' ') {
             *q++ = '_';
             ascii--;
-       } else if (isascii((int) *p) && !qpspecial((int) *p)) {
+       } else if (isascii((unsigned char) *p) &&
+                  (phraserules ? qphrasevalid((unsigned char) *p) :
+                                       !qpspecial((unsigned char) *p))) {
             *q++ = *p;
             ascii--;
         } else {
             *q++ = *p;
             ascii--;
         } else {
@@ -277,7 +285,10 @@ field_encode_quoted(const char *name, char **value, const char *charset,
         }
      }
  
         }
      }
  
-    strcat(q, "?=\n");
+    strcat(q, "?=");
+
+    if (prefixlen)
+       strcat(q, "\n");
  
      free(*value);
  
  
      free(*value);
  
@@ -297,8 +308,15 @@ static int
  field_encode_base64(const char *name, char **value, const char *charset)
  {
      int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
  field_encode_base64(const char *name, char **value, const char *charset)
  {
      int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
-    int outlen = 0, numencode;
-    char *output = NULL, *p = *value, *q, *linestart;
+    int outlen = 0, numencode, curlen;
+    char *output = NULL, *p = *value, *q = NULL, *linestart;
+
+    /*
+     * Skip over any leading white space.
+     */
+
+    while (*p == ' ' || *p == '\t')
+       p++;
  
      /*
       * If we had a zero-length prefix, then just encode the whole field
  
      /*
       * If we had a zero-length prefix, then just encode the whole field
@@ -314,36 +332,40 @@ field_encode_base64(const char *name, char **value, const char *charset)
  
         /*
          * Our very first time, don't pad the line in the front
  
         /*
          * Our very first time, don't pad the line in the front
+        *
+        * Note ENCODELINELIMIT is + 2 because of \n \0
          */
  
  
         if (! output) {
          */
  
  
         if (! output) {
-           outlen += ENCODELINELIMIT - prefixlen + 1;
+           outlen += ENCODELINELIMIT + 2;
             output = q = mh_xmalloc(outlen);
             linestart = q - prefixlen;  /* Yes, this is intentional */
         } else {
             output = q = mh_xmalloc(outlen);
             linestart = q - prefixlen;  /* Yes, this is intentional */
         } else {
-           int curlen = q - output;
+           int curstart = linestart - output;
+           curlen = q - output;
  
  
-           outlen += ENCODELINELIMIT + 1;
+           outlen += ENCODELINELIMIT + 2;
             output = mh_xrealloc(output, outlen);
             output = mh_xrealloc(output, outlen);
-           linestart = q = output + curlen;
-           q += snprintf(q, outlen - (q - output), "%*s", prefixlen, "");
+           q = output + curlen;
+           linestart = output + curstart;
         }
  
         /*
          * We should have enough space now, so prepend the encoding markers
         }
  
         /*
          * We should have enough space now, so prepend the encoding markers
-        * and character set information
+        * and character set information.  The leading space is intentional.
          */
  
          */
  
-       q += snprintf(q, outlen - (q - output), "=?%s?B?", charset);
+       q += snprintf(q, outlen - (q - output), " =?%s?B?", charset);
  
         /*
  
         /*
-        * Find out how much room we have left on the line and see how many
-        * characters we can stuff in.  The start of our line is marked
-        * by "linestart", so use that to figure out how many characters
-        * are left out of ENCODELINELIMIT.  Reserve 2 characters for the
-        * end markers, and calculate how many characters we can fit into
-        * that space given the base64 encoding expansion.
+         * Find out how much room we have left on the line and see how
+         * many characters we can stuff in.  The start of our line
+         * is marked by "linestart", so use that to figure out how
+         * many characters are left out of ENCODELINELIMIT.  Reserve
+         * 2 characters for the end markers and calculate how many
+         * characters we can fit into that space given the base64
+         * encoding expansion.
          */
  
         numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
          */
  
         numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
@@ -357,9 +379,90 @@ field_encode_base64(const char *name, char **value, const char *charset)
         /*
          * RFC 2047 prohibits spanning multibyte characters across tokens.
          * Right now we only check for UTF-8.
         /*
          * RFC 2047 prohibits spanning multibyte characters across tokens.
          * Right now we only check for UTF-8.
+        *
+        * So note the key here ... we want to make sure the character BEYOND
+        * our last character is not a continuation byte.  If it's the start
+        * of a new multibyte character or a single-byte character, that's ok.
          */
          */
+
+       if (strcasecmp(charset, "UTF-8") == 0) {
+           /*
+            * p points to the start of our current buffer, so p + numencode
+            * is one past the last character to encode
+            */
+
+           while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
+               numencode--;
+
+           if (numencode == 0) {
+               advise(NULL, "Internal error: could not find start of "
+                      "UTF-8 character when base64 encoding header");
+               return 1;
+           }
+       }
+
+       if (writeBase64raw((unsigned char *) p, numencode,
+                          (unsigned char *) q) != OK) {
+           advise(NULL, "Internal error: base64 encoding of header failed");
+           return 1;
+       }
+
+       p += numencode;
+       q += base64len(numencode);
+
+       /*
+        * This will point us at the beginning of the new line (trust me).
+        */
+
+       linestart = q + 3;
+
+       /*
+        * What's going on here?  Well, we know we're continuing to the next
+        * line, so we want to add continuation padding.  We also add the
+        * trailing marker for the RFC 2047 token at this time as well.
+        * This uses a trick of snprintf(); we tell it to print a zero-length
+        * string, but pad it out to prefixlen - 1 characters; that ends
+        * up always printing out the requested number of spaces.  We use
+        * prefixlen - 1 because we always add a space on the starting
+        * token marker; this makes things work out correctly for the first
+        * line, which should have a space between the ':' and the start
+        * of the token.
+        *
+        * It's okay if you don't follow all of that.
+        */
+
+       q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, "");
+    }
+
+    /*
+     * We're here if there is either no prefix, or we can fit it in less
+     * than ENCODELINELIMIT characters.  Encode the whole thing.
+     */
+
+    outlen += prefixlen + 9 + charsetlen + base64len(strlen(p));
+    curlen = q - output;
+
+    output = mh_xrealloc(output, outlen);
+    q = output + curlen;
+
+    q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
+                 prefixlen ? " " : "", charset);
+
+    if (writeBase64raw((unsigned char *) p, strlen(p),
+                      (unsigned char *) q) != OK) {
+       advise(NULL, "Internal error: base64 encoding of header failed");
+       return 1;
      }
  
      }
  
+    strcat(q, "?=");
+
+    if (prefixlen)
+       strcat(q, "\n");
+
+    free(*value);
+
+    *value = output;
+
      return 0;
  }
  
      return 0;
  }
  
@@ -427,9 +530,68 @@ unfold_header(char **value, int len)
      *value = str;
  }
  
      *value = str;
  }
  
+/*
+ * Encode a header containing addresses.  This means we have to parse
+ * each address and only encode the display-name or comment field.
+ */
+
  static int
  field_encode_address(const char *name, char **value, int encoding,
                      const char *charset)
  {
  static int
  field_encode_address(const char *name, char **value, int encoding,
                      const char *charset)
  {
-    return 0;
+    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag, errflag;
+    int eightbitchars;
+    char *mp, *output = NULL;
+    struct mailname *mn;
+
+    /*
+     * Because these are addresses, we need to handle them individually.
+     *
+     * Break them down and process them one by one.  This means we have to
+     * rewrite the whole header, but that's unavoidable.
+     */
+
+    /*
+     * The output headers always have to start with a space first.
+     */
+
+    output = add(" ", output);
+
+    for (groupflag = 0; mp = getname(*value); ) {
+       if ((mn = getm(mp, NULL, 0, AD_HOST, NULL)) == NULL) {
+           errflag++;
+           continue;
+       }
+
+       /*
+        * We only care if the phrase (m_pers) or any trailing comment
+        * (m_note) have 8-bit characters.  If doing q-p, we also need
+        * to encode anything marked as qspecial().
+        */
+    }
+}
+
+/*
+ * Scan a string and count the characters that affect RFC 2047 encoding.
+ *
+ * On return:
+ *   *asciilen      - number of bytes for which isascii() is true
+ *   *eightbitchars - number of bytes for which isascii() is false
+ *   *specialchars  - number of ASCII bytes rejected by qphrasevalid()
+ *
+ * Returns non-zero if the string contains at least one non-ASCII byte,
+ * zero otherwise.
+ */
+
+static int
+scanstring(const char *string, int *asciilen, int *eightbitchars,
+          int *specialchars)
+{
+    *asciilen = 0;
+    *eightbitchars = 0;
+    *specialchars = 0;
+
+    for (; *string != '\0'; string++) {
+       if (isascii((unsigned char) *string)) {
+           /* Parenthesized so the count is bumped, not the pointer */
+           (*asciilen)++;
+           if (!qphrasevalid((unsigned char) *string))
+               (*specialchars)++;
+       } else {
+           (*eightbitchars)++;
+       }
+    }
+
+    /* Test the count itself, not the address of the counter */
+    return *eightbitchars > 0;
  }
  }