Fixed file descriptor leak in mhfixmsg when run on multiple input files.

[nmh] / sbr / encode_rfc2047.c
diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c

index 04a74f1af878a883dddad4b7e8e6e5e82547c905..bb45b5c10f396411bbca82ae9426cee1388f33f8 100644 (file)
--- a/sbr/encode_rfc2047.c
+++ b/sbr/encode_rfc2047.c
@@ -1,15 +1,19 @@
-/*
- * Routines to encode message headers using RFC 2047-encoding.
+/* encode_rfc2047.c -- encode message headers using RFC 2047 encoding.
   *
   * This code is Copyright (c) 2002, by the authors of nmh.  See the
   * COPYRIGHT file in the root directory of the nmh distribution for
   * complete copyright information.
   */
  
-#include <h/mh.h>
-#include <h/mhparse.h>
-#include <h/addrsbr.h>
-#include <h/utils.h>
+#include "h/mh.h"
+#include "encode_rfc2047.h"
+#include "check_charset.h"
+#include "error.h"
+#include "h/mhparse.h"
+#include "h/addrsbr.h"
+#include "h/utils.h"
+#include "base64.h"
+#include "unquote.h"
  
  /*
   * List of headers that contain addresses and as a result require special
@@ -56,6 +60,7 @@ static int field_encode_quoted(const char *, char **, const char *, int,
  static int field_encode_base64(const char *, char **, const char *);
  static int scanstring(const char *, int *, int *, int *);
  static int utf8len(const char *);
+static int pref_encoding(int, int, int);
  
  /*
   * Encode a message header using RFC 2047 encoding.  We make the assumption
@@ -78,13 +83,13 @@ encode_rfc2047(const char *name, char **value, int encoding,
         if (isascii((unsigned char) *p)) {
             asciicount++;
             if (qpspecial((unsigned char) *p))
-               qpspecialcount++;
+               qpspecialcount++;
         } else
             eightbitcount++;
      }
  
      if (eightbitcount == 0)
-       return 0;
+       return 0;
  
      /*
       * Some rules from RFC 2047:
@@ -103,10 +108,10 @@ encode_rfc2047(const char *name, char **value, int encoding,
       */
  
      if (charset == NULL)
-       charset = write_charset_8bit();
+       charset = write_charset_8bit();
  
      if (strcasecmp(charset, "US-ASCII") == 0) {
-       advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
+       inform("Cannot use US-ASCII with 8 bit characters in header");
         return 1;
      }
  
@@ -116,7 +121,7 @@ encode_rfc2047(const char *name, char **value, int encoding,
       */
  
      for (i = 0; address_headers[i]; i++) {
-       if (strcasecmp(name, address_headers[i]) == 0)
+       if (strcasecmp(name, address_headers[i]) == 0)
             return field_encode_address(name, value, encoding, charset);
      }
  
@@ -124,28 +129,28 @@ encode_rfc2047(const char *name, char **value, int encoding,
       * On the encoding we choose, and the specifics of encoding:
       *
       * - If a specified encoding is passed in, we use that.
-     * - If more than 50% of the characters are high-bit, we use base64
-     *   and encode the whole field as one atom (possibly split).
-     * - Otherwise, we use quoted-printable.
+     * - Otherwise, pick which encoding is shorter.
+     *
+     * We don't quite handle continuation right here, but it should be
+     * pretty close.
       */
  
      if (encoding == CE_UNKNOWN)
-       encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ?
-                                               CE_BASE64 : CE_QUOTED;
+        encoding = pref_encoding(asciicount, qpspecialcount, eightbitcount);
  
      unfold_header(value, asciicount + eightbitcount);
  
      switch (encoding) {
  
      case CE_BASE64:
-       return field_encode_base64(name, value, charset);
+       return field_encode_base64(name, value, charset);
  
      case CE_QUOTED:
         return field_encode_quoted(name, value, charset, asciicount,
                                    eightbitcount + qpspecialcount, 0);
  
      default:
-       advise(NULL, "Internal error: unknown RFC-2047 encoding type");
+       inform("Internal error: unknown RFC-2047 encoding type");
         return 1;
      }
  }
@@ -158,9 +163,9 @@ static int
  field_encode_quoted(const char *name, char **value, const char *charset,
                     int ascii, int encoded, int phraserules)
  {
-    int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
+    int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column;
      int charsetlen = strlen(charset), utf8;
-    char *output = NULL, *p, *q;
+    char *output = NULL, *p, *q = NULL;
  
      /*
       * Right now we just encode the whole thing.  Maybe later on we'll
@@ -173,11 +178,12 @@ field_encode_quoted(const char *name, char **value, const char *charset,
  
      utf8 = strcasecmp(charset, "UTF-8") == 0;
  
+    bool newline = true;
      while (*p != '\0') {
-       /*
+       /*
          * Start a new line, if it's time
          */
-       if (newline) {
+       if (newline) {
             /*
              * If it's the start of the header, we don't need to pad it
              *
@@ -209,12 +215,12 @@ field_encode_quoted(const char *name, char **value, const char *charset,
                 for (i = 0; i < prefixlen; i++)
                     *q++ = ' ';
             } else {
-               /*
+               /*
                  * A bit of a hack here; the header can contain multiple
                  * spaces (probably at least one) until we get to the
                  * actual text.  Copy until we get to a non-space.
                  */
-               output = mh_xmalloc(outlen);
+               output = mh_xmalloc(outlen);
                 q = output;
                 while (is_fws(*p))
                     *q++ = *p++;
@@ -223,7 +229,7 @@ field_encode_quoted(const char *name, char **value, const char *charset,
             tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
             q += tokenlen;
             column = prefixlen + tokenlen;
-           newline = 0;
+           newline = false;
         }
  
         /*
@@ -240,7 +246,7 @@ field_encode_quoted(const char *name, char **value, const char *charset,
             ascii--;
         } else if (isascii((unsigned char) *p) &&
                    (phraserules ? qphrasevalid((unsigned char) *p) :
-                                       !qpspecial((unsigned char) *p))) {
+                                       !qpspecial((unsigned char) *p))) {
             *q++ = *p;
             ascii--;
         } else {
@@ -264,7 +270,7 @@ field_encode_quoted(const char *name, char **value, const char *charset,
             continue;
  
         if (column >= ENCODELINELIMIT - 2) {
-           newline = 1;
+           newline = true;
         } else if (utf8) {
             /*
              * Okay, this is a bit weird, but to explain a bit more ...
@@ -280,15 +286,24 @@ field_encode_quoted(const char *name, char **value, const char *charset,
              * allow for the encoded output.
              */
             if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
-               newline = 1;
+               newline = true;
             }
         }
      }
  
-    strcat(q, "?=");
+    if (q == NULL) {
+       /* This should never happen, but just in case.  Found by
+          clang static analyzer. */
+       inform("null output encoding for %s, continuing...", *value);
+       return 1;
+    }
+    *q++ = '?';
+    *q++ = '=';
  
      if (prefixlen)
-       strcat(q, "\n");
+       *q++ = '\n';
+
+    *q = '\0';
  
      free(*value);
  
@@ -309,14 +324,14 @@ field_encode_base64(const char *name, char **value, const char *charset)
  {
      int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
      int outlen = 0, numencode, curlen;
-    char *output = NULL, *p = *value, *q = NULL, *linestart;
+    char *output = NULL, *p = *value, *q = NULL, *linestart = NULL;
  
      /*
       * Skip over any leading white space.
       */
  
      while (*p == ' ' || *p == '\t')
-       p++;
+       p++;
  
      /*
       * If we had a zero-length prefix, then just encode the whole field
@@ -328,7 +343,7 @@ field_encode_base64(const char *name, char **value, const char *charset)
       */
  
      while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen +
-                         prefixlen) > ENCODELINELIMIT)) {
+                         prefixlen) > ENCODELINELIMIT)) {
  
         /*
          * Our very first time, don't pad the line in the front
@@ -371,8 +386,8 @@ field_encode_base64(const char *name, char **value, const char *charset)
         numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
  
         if (numencode <= 0) {
-           advise(NULL, "Internal error: tried to encode %d characters "
-                  "in base64", numencode);
+           inform("Internal error: tried to encode %d characters "
+                  "in base64", numencode);
             return 1;
         }
  
@@ -392,10 +407,10 @@ field_encode_base64(const char *name, char **value, const char *charset)
              */
  
             while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
-               numencode--;
+               numencode--;
  
             if (numencode == 0) {
-               advise(NULL, "Internal error: could not find start of "
+               inform("Internal error: could not find start of "
                        "UTF-8 character when base64 encoding header");
                 return 1;
             }
@@ -403,7 +418,7 @@ field_encode_base64(const char *name, char **value, const char *charset)
  
         if (writeBase64raw((unsigned char *) p, numencode,
                            (unsigned char *) q) != OK) {
-           advise(NULL, "Internal error: base64 encoding of header failed");
+           inform("Internal error: base64 encoding of header failed");
             return 1;
         }
  
@@ -446,18 +461,18 @@ field_encode_base64(const char *name, char **value, const char *charset)
      q = output + curlen;
  
      q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
-                 prefixlen ? " " : "", charset);
+                 prefixlen ? " " : "", charset);
  
      if (writeBase64raw((unsigned char *) p, strlen(p),
-                      (unsigned char *) q) != OK) {
-       advise(NULL, "Internal error: base64 encoding of header failed");
+                      (unsigned char *) q) != OK) {
+       inform("Internal error: base64 encoding of header failed");
         return 1;
      }
  
      strcat(q, "?=");
  
      if (prefixlen)
-       strcat(q, "\n");
+       strcat(q, "\n");
  
      free(*value);
  
@@ -479,14 +494,14 @@ utf8len(const char *p)
      int len = 1;
  
      if (*p == '\0')
-       return 0;
+       return 0;
  
      if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
-       return 0;
+       return 0;
  
      p++;
      while ((((unsigned char) *p++) & 0xc0) == 0x80)
-       len++;
+       len++;
  
      return len;
  }
@@ -505,7 +520,7 @@ unfold_header(char **value, int len)
      char *p = str, *q = *value;
  
      while (*q != '\0') {
-       if (*q == '\n') {
+       if (*q == '\n') {
             /*
              * When we get a newline, skip to the next non-whitespace
              * character and add a space to replace all of the whitespace
@@ -513,10 +528,10 @@ unfold_header(char **value, int len)
              * This has the side effect of stripping off the final newline
              * for the header; we put it back in the encoding routine.
              */
-           while (is_fws(*q++))
-               ;
+           while (is_fws(*q))
+               q++;
             if (*q == '\0')
-               break;
+               break;
  
             *p++ = ' ';
         } else {
@@ -539,10 +554,18 @@ static int
  field_encode_address(const char *name, char **value, int encoding,
                      const char *charset)
  {
-    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag, errflag;
+    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
+    int asciichars;
+    int specialchars;
      int eightbitchars;
-    char *mp, *output = NULL;
+    bool reformat = false;
+    bool errflag = false;
+    size_t len;
+    char *mp, *cp = NULL, *output = NULL;
+    char *tmpbuf = NULL;
+    size_t tmpbufsize = 0;
      struct mailname *mn;
+    char errbuf[BUFSIZ];
  
      /*
       * Because these are addresses, we need to handle them individually.
@@ -552,23 +575,242 @@ field_encode_address(const char *name, char **value, int encoding,
       */
  
      /*
-     * The output headers always have to start with a space first.
+     * The output headers always have to start with a space first; this
+     * is just the way the API works right now.
       */
  
      output = add(" ", output);
  
-    for (groupflag = 0; mp = getname(*value); ) {
-       if ((mn = getm(mp, NULL, 0, AD_HOST, NULL)) == NULL) {
-           errflag++;
+    for (groupflag = 0; (mp = getname(*value)); ) {
+       if ((mn = getm(mp, NULL, 0, errbuf, sizeof(errbuf))) == NULL) {
+           inform("%s: %s", errbuf, mp);
+           errflag = true;
             continue;
         }
  
+       reformat = false;
+
         /*
          * We only care if the phrase (m_pers) or any trailing comment
          * (m_note) have 8-bit characters.  If doing q-p, we also need
-        * to encode anything marked as qspecial().
+        * to encode anything marked as qspecial().  Unquote it first
+        * so the specialchars count is right.
+        */
+
+       if (! mn->m_pers)
+           goto check_note;
+
+       if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
+           tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+       }
+
+       unquote_string(mn->m_pers, tmpbuf);
+
+       if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+                      &specialchars)) {
+           /*
+            * If we have 8-bit characters, encode it.
+            */
+
+           if (encoding == CE_UNKNOWN)
+               encoding = pref_encoding(asciichars, specialchars,
+                                        eightbitchars);
+
+           /*
+            * This is okay, because the output of unquote_string will be either
+            * equal or shorter than the original.
+            */
+
+           strcpy(mn->m_pers, tmpbuf);
+
+           switch (encoding) {
+
+           case CE_BASE64:
+               if (field_encode_base64(NULL, &mn->m_pers, charset)) {
+                   errflag = true;
+                   goto out;
+               }
+               break;
+
+           case CE_QUOTED:
+               if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
+                                       eightbitchars + specialchars, 1)) {
+                   errflag = true;
+                   goto out;
+               }
+               break;
+
+           default:
+               inform("Internal error: unknown RFC-2047 encoding type");
+               errflag = true;
+               goto out;
+           }
+
+           reformat = true;
+       }
+
+       check_note:
+
+       /*
+        * The "note" field is generally a comment at the end of the address,
+        * at least as how it's implemented here.  Notes are always surrounded
+        * by parentheses (since they're comments).  Strip them out and
+        * then put them back when we format the final field, but they do
+        * not get encoded.
+        */
+
+       if (! mn->m_note)
+           goto do_reformat;
+
+       if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
+           tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+       }
+
+       if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
+           inform("Internal error: Invalid note field \"%s\"",
+                  mn->m_note);
+           errflag = true;
+           goto out;
+       }
+
+       strncpy(tmpbuf, mn->m_note + 1, len - 1);
+       tmpbuf[len - 2] = '\0';
+
+       if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+                      &specialchars)) {
+           /*
+            * If we have 8-bit characters, encode it.
+            */
+
+           if (encoding == CE_UNKNOWN)
+               encoding = pref_encoding(asciichars, specialchars,
+                                        eightbitchars);
+
+           switch (encoding) {
+
+           case CE_BASE64:
+               if (field_encode_base64(NULL, &tmpbuf, charset)) {
+                   errflag = true;
+                   goto out;
+               }
+               break;
+
+           case CE_QUOTED:
+               if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
+                                       eightbitchars + specialchars, 1)) {
+                   errflag = true;
+                   goto out;
+               }
+               break;
+
+           default:
+               inform("Internal error: unknown RFC-2047 encoding type");
+               errflag = true;
+               goto out;
+           }
+
+           reformat = true;
+
+           /*
+            * Make sure the size of tmpbuf is correct (it always gets
+            * reallocated in the above functions).
+            */
+
+           tmpbufsize = strlen(tmpbuf) + 1;
+
+           /*
+            * Put the note field back surrounded by parentheses.
+            */
+
+           mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
+
+           snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
+       }
+
+do_reformat:
+
+       /*
+        * So, some explanation is in order.
+        *
+        * We know we need to rewrite at least one address in the header,
+        * otherwise we wouldn't be here.  If we had to reformat this
+        * particular address, then run it through adrformat().  Otherwise
+        * we can use m_text directly.
          */
+
+       /*
+        * If we were in a group but are no longer, make sure we add a
+        * semicolon (which needs to be FIRST, as it needs to be at the end
+        * of the last address).
+        */
+
+       if (groupflag && ! mn->m_ingrp) {
+           output = add(";", output);
+           column++;
+       }
+
+       groupflag = mn->m_ingrp;
+
+       if (mn->m_gname) {
+           cp = mh_xstrdup(mn->m_gname);
+       }
+
+       if (reformat) {
+           cp = add(adrformat(mn), cp);
+       } else {
+           cp = add(mn->m_text, cp);
+       }
+
+       len = strlen(cp);
+
+       /*
+        * If we're not at the beginning of the line, add a comma and
+        * either a space or a newline.
+        */
+
+       if (column != prefixlen) {
+           if (len + column + 2 > OUTPUTLINELEN) {
+
+               if ((size_t) (prefixlen + 3) < tmpbufsize)
+                   tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
+
+               snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
+               output = add(tmpbuf, output);
+           } else {
+               output = add(", ", output);
+               column += 2;
+           }
+       }
+
+       /*
+        * Finally add the address
+        */
+
+       output = add(cp, output);
+       column += len;
+       free(cp);
+       cp = NULL;
+    }
+
+    /*
+     * Just in case we're at the end of a list
+     */
+
+    if (groupflag) {
+       output = add(";", output);
      }
+
+    output = add("\n", output);
+
+    free(*value);
+    *value = output;
+    output = NULL;
+
+out:
+    free(tmpbuf);
+    free(output);
+
+    return errflag;
  }
  
  /*
@@ -584,14 +826,52 @@ scanstring(const char *string, int *asciilen, int *eightbitchars,
      *specialchars = 0;
  
      for (; *string != '\0'; string++) {
-       if ((isascii((unsigned char) *string))) {
-           (*asciilen++);
-           if (!qphrasevalid((unsigned char) *string))
-               (*specialchars)++;
+       if ((isascii((unsigned char) *string))) {
+           (*asciilen)++;
+           /*
+            * So, a space is not a valid phrase character, but we're counting
+            * an exception here, because in q-p a space can be directly
+            * encoded as an underscore.
+            */
+           if (!qphrasevalid((unsigned char) *string) && *string != ' ')
+               (*specialchars)++;
         } else {
             (*eightbitchars)++;
         }
      }
  
-    return eightbitchars > 0;
+    return *eightbitchars > 0;
+}
+
+/*
+ * This function is to be used to decide which encoding algorithm we should
+ * use if one is not given.  Basically, we pick whichever one is the shorter
+ * of the two.
+ *
+ * Arguments are:
+ *
+ * ascii       - Number of ASCII characters in to-be-encoded string.
+ * specials    - Number of ASCII characters in to-be-encoded string that
+ *               still require encoding under quoted-printable.  Note that
+ *               these are included in the "ascii" total.
+ * eightbits   - Number of eight-bit characters in the to-be-encoded string.
+ *
+ * Returns one of CE_BASE64 or CE_QUOTED.
+ */
+
+static int
+pref_encoding(int ascii, int specials, int eightbits)
+{
+    /*
+     * The length of the q-p encoding is:
+     *
+     * ascii - specials + (specials + eightbits) * 3.
+     *
+     * The length of the base64 encoding is:
+     *
+     * base64len(ascii + eightbits)    (See macro for details)
+     */
+
+    return base64len(ascii + eightbits) < (ascii - specials +
+                       (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;
  }