diplodocus.org Git - nmh/blob - sbr/encode_rfc2047.c

   1 /*
   2  * Routines to encode message headers using RFC 2047-encoding.
   3  *
   4  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   5  * COPYRIGHT file in the root directory of the nmh distribution for
   6  * complete copyright information.
   7  */
   8
   9 #include <h/mh.h>
  10 #include <h/mhparse.h>
  11 #include <h/addrsbr.h>
  12 #include <h/utils.h>
  13
  14 /*
  15  * List of headers that contain addresses and as a result require special
  16  * handling
  17  */
  18
  19 static char *address_headers[] = {
  20     "To",
  21     "From",
  22     "cc",
  23     "Bcc",
  24     "Reply-To",
  25     "Sender",
  26     "Resent-To",
  27     "Resent-From",
  28     "Resent-cc",
  29     "Resent-Bcc",
  30     "Resent-Reply-To",
  31     "Resent-Sender",
  32     NULL,
  33 };
  34
  35 /*
  36  * Macros we use for parsing headers
  37  */
  38
  39 #define is_fws(c) (c == '\t' || c == ' ' || c == '\n')
  40
  41 #define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
  42                          (c >= 'a' && c <= 'z') || \
  43                          c == '!' || c == '*' || c == '+' || c == '-' || \
  44                          c == '/' || c == '=' || c == '_')
  45 #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
  46
  47 #define base64len(n) ((((n) + 2) / 3 ) * 4)     /* String len to base64 len */
  48 #define strbase64(n) ((n) / 4 * 3)              /* Chars that fit in base64 */
  49
  50 #define ENCODELINELIMIT 76
  51
  52 static void unfold_header(char **, int);
  53 static int field_encode_address(const char *, char **, int, const char *);
  54 static int field_encode_quoted(const char *, char **, const char *, int,
  55                                int, int);
  56 static int field_encode_base64(const char *, char **, const char *);
  57 static int scanstring(const char *, int *, int *, int *);
  58 static int utf8len(const char *);
  59
  60 /*
  61  * Encode a message header using RFC 2047 encoding.  We make the assumption
  62  * that all characters < 128 are ASCII and as a consequence don't need any
  63  * encoding.
  64  */
  65
  66 int
  67 encode_rfc2047(const char *name, char **value, int encoding,
  68                const char *charset)
  69 {
  70     int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
  71     char *p;
  72
  73     /*
  74      * First, check to see if we even need to encode the header
  75      */
  76
  77     for (p = *value; *p != '\0'; p++) {
  78         if (isascii((unsigned char) *p)) {
  79             asciicount++;
  80             if (qpspecial((unsigned char) *p))
  81                 qpspecialcount++;
  82         } else
  83             eightbitcount++;
  84     }
  85
  86     if (eightbitcount == 0)
  87         return 0;
  88
  89     /*
  90      * Some rules from RFC 2047:
  91      *
  92      * - Encoded words cannot be more than 75 characters long
  93      * - Multiple "long" encoded words must be on new lines.
  94      *
  95      * Also, we're not permitted to encode email addresses, so
  96      * we need to actually _parse_ email addresses and only encode
  97      * the right bits.
  98      */
  99
 100     /*
 101      * If charset was NULL, then get the value from the locale.  But
 102      * we reject it if it returns US-ASCII
 103      */
 104
 105     if (charset == NULL)
 106         charset = write_charset_8bit();
 107
 108     if (strcasecmp(charset, "US-ASCII") == 0) {
 109         advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
 110         return 1;
 111     }
 112
 113     /*
 114      * If we have an address header, then we need to parse the addresses
 115      * and only encode the names or comments.  Otherwise, handle it normally.
 116      */
 117
 118     for (i = 0; address_headers[i]; i++) {
 119         if (strcasecmp(name, address_headers[i]) == 0)
 120             return field_encode_address(name, value, encoding, charset);
 121     }
 122
 123     /*
 124      * On the encoding we choose, and the specifics of encoding:
 125      *
 126      * - If a specified encoding is passed in, we use that.
 127      * - If more than 50% of the characters are high-bit, we use base64
 128      *   and encode the whole field as one atom (possibly split).
 129      * - Otherwise, we use quoted-printable.
 130      */
 131
 132     if (encoding == CE_UNKNOWN)
 133         encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ?
 134                                                 CE_BASE64 : CE_QUOTED;
 135
 136     unfold_header(value, asciicount + eightbitcount);
 137
 138     switch (encoding) {
 139
 140     case CE_BASE64:
 141         return field_encode_base64(name, value, charset);
 142
 143     case CE_QUOTED:
 144         return field_encode_quoted(name, value, charset, asciicount,
 145                                    eightbitcount + qpspecialcount, 0);
 146
 147     default:
 148         advise(NULL, "Internal error: unknown RFC-2047 encoding type");
 149         return 1;
 150     }
 151 }
 152
 153 /*
 154  * Encode our specified header (or field) using quoted-printable
 155  */
 156
 157 static int
 158 field_encode_quoted(const char *name, char **value, const char *charset,
 159                     int ascii, int encoded, int phraserules)
 160 {
 161     int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
 162     int charsetlen = strlen(charset), utf8;
 163     char *output = NULL, *p, *q;
 164
 165     /*
 166      * Right now we just encode the whole thing.  Maybe later on we'll
 167      * only encode things on a per-atom basis.
 168      */
 169
 170     p = *value;
 171
 172     column = prefixlen + 2;     /* Header name plus ": " */
 173
 174     utf8 = strcasecmp(charset, "UTF-8") == 0;
 175
 176     while (*p != '\0') {
 177         /*
 178          * Start a new line, if it's time
 179          */
 180         if (newline) {
 181             /*
 182              * If it's the start of the header, we don't need to pad it
 183              *
 184              * The length of the output string is ...
 185              * =?charset?Q?...?=  so that's 7+strlen(charset) + 2 for \n NUL
 186              *
 187              * plus 1 for every ASCII character and 3 for every eight bit
 188              * or special character (eight bit characters are written as =XX).
 189              *
 190              */
 191
 192             int tokenlen;
 193
 194             outlen += 9 + charsetlen + ascii + 3 * encoded;
 195
 196             /*
 197              * If output is set, then we're continuing the header.  Otherwise
 198              * do the initial allocation.
 199              */
 200
 201             if (output) {
 202                 int curlen = q - output, i;
 203                 outlen += prefixlen + 1;        /* Header plus \n ": " */
 204                 output = mh_xrealloc(output, outlen);
 205                 q = output + curlen;
 206                 *q++ = '?';
 207                 *q++ = '=';
 208                 *q++ = '\n';
 209                 for (i = 0; i < prefixlen; i++)
 210                     *q++ = ' ';
 211             } else {
 212                 /*
 213                  * A bit of a hack here; the header can contain multiple
 214                  * spaces (probably at least one) until we get to the
 215                  * actual text.  Copy until we get to a non-space.
 216                  */
 217                 output = mh_xmalloc(outlen);
 218                 q = output;
 219                 while (is_fws(*p))
 220                     *q++ = *p++;
 221             }
 222
 223             tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
 224             q += tokenlen;
 225             column = prefixlen + tokenlen;
 226             newline = 0;
 227         }
 228
 229         /*
 230          * Process each character, encoding if necessary
 231          *
 232          * Note that we have a different set of rules if we're processing
 233          * RFC 5322 'phrase' (something you'd see in an address header).
 234          */
 235
 236         column++;
 237
 238         if (*p == ' ') {
 239             *q++ = '_';
 240             ascii--;
 241         } else if (isascii((unsigned char) *p) &&
 242                    (phraserules ? qphrasevalid((unsigned char) *p) :
 243                                         !qpspecial((unsigned char) *p))) {
 244             *q++ = *p;
 245             ascii--;
 246         } else {
 247             snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
 248             q += 3;
 249             column += 2;        /* column already incremented by 1 above */
 250             encoded--;
 251         }
 252
 253         p++;
 254
 255         /*
 256          * We're not allowed more than ENCODELINELIMIT characters per line,
 257          * so reserve some room for the final ?=.
 258          *
 259          * If prefixlen == 0, we haven't been passed in a header name, so
 260          * don't ever wrap the field (we're likely doing an address).
 261          */
 262
 263         if (prefixlen == 0)
 264             continue;
 265
 266         if (column >= ENCODELINELIMIT - 2) {
 267             newline = 1;
 268         } else if (utf8) {
 269             /*
 270              * Okay, this is a bit weird, but to explain a bit more ...
 271              *
 272              * RFC 2047 prohibits the splitting of multibyte characters
 273              * across encoded words.  Right now we only handle the case
 274              * of UTF-8, the most common multibyte encoding.
 275              *
 276              * p is now pointing at the next input character.  If we're
 277              * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
 278              * length of the complete character, then trigger a newline
 279              * now.  Note that we check the length * 3 since we have to
 280              * allow for the encoded output.
 281              */
 282             if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
 283                 newline = 1;
 284             }
 285         }
 286     }
 287
 288     strcat(q, "?=");
 289
 290     if (prefixlen)
 291         strcat(q, "\n");
 292
 293     free(*value);
 294
 295     *value = output;
 296
 297     return 0;
 298 }
 299
 300 /*
 301  * Encode our specified header (or field) using base64.
 302  *
 303  * This is a little easier since every character gets encoded, we can
 304  * calculate the line wrap up front.
 305  */
 306
 307 static int
 308 field_encode_base64(const char *name, char **value, const char *charset)
 309 {
 310     int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
 311     int outlen = 0, numencode, curlen;
 312     char *output = NULL, *p = *value, *q = NULL, *linestart;
 313
 314     /*
 315      * Skip over any leading white space.
 316      */
 317
 318     while (*p == ' ' || *p == '\t')
 319         p++;
 320
 321     /*
 322      * If we had a zero-length prefix, then just encode the whole field
 323      * as-is, without line wrapping.  Note that in addition to the encoding
 324      *
 325      * The added length we need is =? + charset + ?B? ... ?=
 326      *
 327      * That's 7 + strlen(charset) + 2 (for \n NUL).
 328      */
 329
 330     while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen +
 331                           prefixlen) > ENCODELINELIMIT)) {
 332
 333         /*
 334          * Our very first time, don't pad the line in the front
 335          *
 336          * Note ENCODELINELIMIT is + 2 because of \n \0
 337          */
 338
 339
 340         if (! output) {
 341             outlen += ENCODELINELIMIT + 2;
 342             output = q = mh_xmalloc(outlen);
 343             linestart = q - prefixlen;  /* Yes, this is intentional */
 344         } else {
 345             int curstart = linestart - output;
 346             curlen = q - output;
 347
 348             outlen += ENCODELINELIMIT + 2;
 349             output = mh_xrealloc(output, outlen);
 350             q = output + curlen;
 351             linestart = output + curstart;
 352         }
 353
 354         /*
 355          * We should have enough space now, so prepend the encoding markers
 356          * and character set information.  The leading space is intentional.
 357          */
 358
 359         q += snprintf(q, outlen - (q - output), " =?%s?B?", charset);
 360
 361         /*
 362          * Find out how much room we have left on the line and see how
 363          * many characters we can stuff in.  The start of our line
 364          * is marked by "linestart", so use that to figure out how
 365          * many characters are left out of ENCODELINELIMIT.  Reserve
 366          * 2 characters for the end markers and calculate how many
 367          * characters we can fit into that space given the base64
 368          * encoding expansion.
 369          */
 370
 371         numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
 372
 373         if (numencode <= 0) {
 374             advise(NULL, "Internal error: tried to encode %d characters "
 375                    "in base64", numencode);
 376             return 1;
 377         }
 378
 379         /*
 380          * RFC 2047 prohibits spanning multibyte characters across tokens.
 381          * Right now we only check for UTF-8.
 382          *
 383          * So note the key here ... we want to make sure the character BEYOND
 384          * our last character is not a continuation byte.  If it's the start
 385          * of a new multibyte character or a single-byte character, that's ok.
 386          */
 387
 388         if (strcasecmp(charset, "UTF-8") == 0) {
 389             /*
 390              * p points to the start of our current buffer, so p + numencode
 391              * is one past the last character to encode
 392              */
 393
 394             while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
 395                 numencode--;
 396
 397             if (numencode == 0) {
 398                 advise(NULL, "Internal error: could not find start of "
 399                        "UTF-8 character when base64 encoding header");
 400                 return 1;
 401             }
 402         }
 403
 404         if (writeBase64raw((unsigned char *) p, numencode,
 405                            (unsigned char *) q) != OK) {
 406             advise(NULL, "Internal error: base64 encoding of header failed");
 407             return 1;
 408         }
 409
 410         p += numencode;
 411         q += base64len(numencode);
 412
 413         /*
 414          * This will point us at the beginning of the new line (trust me).
 415          */
 416
 417         linestart = q + 3;
 418
 419         /*
 420          * What's going on here?  Well, we know we're continuing to the next
 421          * line, so we want to add continuation padding.  We also add the
 422          * trailing marker for the RFC 2047 token at this time as well.
 423          * This uses a trick of snprintf(); we tell it to print a zero-length
 424          * string, but pad it out to prefixlen - 1 characters; that ends
 425          * up always printing out the requested number of spaces.  We use
 426          * prefixlen - 1 because we always add a space on the starting
 427          * token marker; this makes things work out correctly for the first
 428          * line, which should have a space between the ':' and the start
 429          * of the token.
 430          *
 431          * It's okay if you don't follow all of that.
 432          */
 433
 434         q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, "");
 435     }
 436
 437     /*
 438      * We're here if there is either no prefix, or we can fit it in less
 439      * than ENCODELINELIMIT characters.  Encode the whole thing.
 440      */
 441
 442     outlen += prefixlen + 9 + charsetlen + base64len(strlen(p));
 443     curlen = q - output;
 444
 445     output = mh_xrealloc(output, outlen);
 446     q = output + curlen;
 447
 448     q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
 449                   prefixlen ? " " : "", charset);
 450
 451     if (writeBase64raw((unsigned char *) p, strlen(p),
 452                        (unsigned char *) q) != OK) {
 453         advise(NULL, "Internal error: base64 encoding of header failed");
 454         return 1;
 455     }
 456
 457     strcat(q, "?=");
 458
 459     if (prefixlen)
 460         strcat(q, "\n");
 461
 462     free(*value);
 463
 464     *value = output;
 465
 466     return 0;
 467 }
 468
 469 /*
 470  * Calculate the length of a UTF-8 character.
 471  *
 472  * If it's not a UTF-8 character (or we're in the middle of a multibyte
 473  * character) then simply return 0.
 474  */
 475
 476 static int
 477 utf8len(const char *p)
 478 {
 479     int len = 1;
 480
 481     if (*p == '\0')
 482         return 0;
 483
 484     if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
 485         return 0;
 486
 487     p++;
 488     while ((((unsigned char) *p++) & 0xc0) == 0x80)
 489         len++;
 490
 491     return len;
 492 }
 493
 494 /*
 495  * "Unfold" a header, making it a single line (without continuation)
 496  *
 497  * We cheat a bit here; we never make the string longer, so using the
 498  * original length here is fine.
 499  */
 500
 501 static void
 502 unfold_header(char **value, int len)
 503 {
 504     char *str = mh_xmalloc(len + 1);
 505     char *p = str, *q = *value;
 506
 507     while (*q != '\0') {
 508         if (*q == '\n') {
 509             /*
 510              * When we get a newline, skip to the next non-whitespace
 511              * character and add a space to replace all of the whitespace
 512              *
 513              * This has the side effect of stripping off the final newline
 514              * for the header; we put it back in the encoding routine.
 515              */
 516             while (is_fws(*q++))
 517                 ;
 518             if (*q == '\0')
 519                 break;
 520
 521             *p++ = ' ';
 522         } else {
 523             *p++ = *q++;
 524         }
 525     }
 526
 527     *p = '\0';
 528
 529     free(*value);
 530     *value = str;
 531 }
 532
 533 /*
 534  * Decode a header containing addresses.  This means we have to parse
 535  * each address and only encode the display-name or comment field.
 536  */
 537
 538 static int
 539 field_encode_address(const char *name, char **value, int encoding,
 540                      const char *charset)
 541 {
 542     int prefixlen = strlen(name) + 2, column = prefixlen, groupflag, errflag;
 543     int eightbitchars;
 544     char *mp, *output = NULL;
 545     struct mailname *mn;
 546
 547     /*
 548      * Because these are addresses, we need to handle them individually.
 549      *
 550      * Break them down and process them one by one.  This means we have to
 551      * rewrite the whole header, but that's unavoidable.
 552      */
 553
 554     /*
 555      * The output headers always have to start with a space first.
 556      */
 557
 558     output = add(" ", output);
 559
 560     for (groupflag = 0; mp = getname(*value); ) {
 561         if ((mn = getm(mp, NULL, 0, AD_HOST, NULL)) == NULL) {
 562             errflag++;
 563             continue;
 564         }
 565
 566         /*
 567          * We only care if the phrase (m_pers) or any trailing comment
 568          * (m_note) have 8-bit characters.  If doing q-p, we also need
 569          * to encode anything marked as qspecial().
 570          */
 571     }
 572 }
 573
 574 /*
 575  * Scan a string, check for characters that need to be encoded
 576  */
 577
 578 static int
 579 scanstring(const char *string, int *asciilen, int *eightbitchars,
 580            int *specialchars)
 581 {
 582     *asciilen = 0;
 583     *eightbitchars = 0;
 584     *specialchars = 0;
 585
 586     for (; *string != '\0'; string++) {
 587         if ((isascii((unsigned char) *string))) {
 588             (*asciilen++);
 589             if (!qphrasevalid((unsigned char) *string))
 590                 (*specialchars)++;
 591         } else {
 592             (*eightbitchars)++;
 593         }
 594     }
 595
 596     return eightbitchars > 0;
 597 }