]> diplodocus.org Git - nmh/blob - sbr/encode_rfc2047.c
Fix up the unquote test, with help from Lyndon and Ralph.
[nmh] / sbr / encode_rfc2047.c
1 /*
2 * Routines to encode message headers using RFC 2047-encoding.
3 *
4 * This code is Copyright (c) 2002, by the authors of nmh. See the
5 * COPYRIGHT file in the root directory of the nmh distribution for
6 * complete copyright information.
7 */
8
9 #include <h/mh.h>
10 #include <h/mhparse.h>
11 #include <h/addrsbr.h>
12 #include <h/utils.h>
13
/*
 * List of headers that contain addresses and as a result require special
 * handling.
 *
 * The list is terminated by a NULL entry.  Header names are compared
 * against it without regard to case, so the capitalization used here is
 * not significant.  The table is never modified, so both the strings and
 * the array of pointers are const.
 */

static const char *const address_headers[] = {
    "To",
    "From",
    "cc",
    "Bcc",
    "Reply-To",
    "Sender",
    "Resent-To",
    "Resent-From",
    "Resent-cc",
    "Resent-Bcc",
    "Resent-Reply-To",
    "Resent-Sender",
    NULL,
};
34
/*
 * Macros we use for parsing headers
 *
 * Every use of a macro argument is parenthesized so that an expression
 * argument is grouped the way the caller expects.  Note that is_fws(),
 * qphrasevalid() and qpspecial() still evaluate their argument more than
 * once, so never pass an expression with side effects (such as *p++).
 */

/* True for the whitespace characters that can appear in a folded header */
#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')

/* True for characters that may appear unencoded when phrase rules apply */
#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
                         ((c) >= 'A' && (c) <= 'Z') || \
                         ((c) >= 'a' && (c) <= 'z') || \
                         (c) == '!' || (c) == '*' || (c) == '+' || \
                         (c) == '-' || (c) == '/' || (c) == '=' || \
                         (c) == '_')

/* True for ASCII characters that must be written as =XX in Q encoding */
#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')

#define base64len(n) ((((n) + 2) / 3) * 4) /* String len to base64 len */
#define strbase64(n) ((n) / 4 * 3)         /* Chars that fit in base64 */

/* Column limit used when deciding where to wrap an encoded header line */
#define ENCODELINELIMIT 76
51
/*
 * Forward declarations of the file-local helpers defined further down.
 */
static void unfold_header(char **value, int len);
static int field_encode_address(const char *name, char **value, int encoding,
                                const char *charset);
static int field_encode_quoted(const char *name, char **value,
                               const char *charset, int ascii, int encoded,
                               int phraserules);
static int field_encode_base64(const char *name, char **value,
                               const char *charset);
static int scanstring(const char *string, int *asciilen, int *eightbitchars,
                      int *specialchars);
static int utf8len(const char *p);
59
60 /*
61 * Encode a message header using RFC 2047 encoding. We make the assumption
62 * that all characters < 128 are ASCII and as a consequence don't need any
63 * encoding.
64 */
65
66 int
67 encode_rfc2047(const char *name, char **value, int encoding,
68 const char *charset)
69 {
70 int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
71 char *p;
72
73 /*
74 * First, check to see if we even need to encode the header
75 */
76
77 for (p = *value; *p != '\0'; p++) {
78 if (isascii((unsigned char) *p)) {
79 asciicount++;
80 if (qpspecial((unsigned char) *p))
81 qpspecialcount++;
82 } else
83 eightbitcount++;
84 }
85
86 if (eightbitcount == 0)
87 return 0;
88
89 /*
90 * Some rules from RFC 2047:
91 *
92 * - Encoded words cannot be more than 75 characters long
93 * - Multiple "long" encoded words must be on new lines.
94 *
95 * Also, we're not permitted to encode email addresses, so
96 * we need to actually _parse_ email addresses and only encode
97 * the right bits.
98 */
99
100 /*
101 * If charset was NULL, then get the value from the locale. But
102 * we reject it if it returns US-ASCII
103 */
104
105 if (charset == NULL)
106 charset = write_charset_8bit();
107
108 if (strcasecmp(charset, "US-ASCII") == 0) {
109 advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
110 return 1;
111 }
112
113 /*
114 * If we have an address header, then we need to parse the addresses
115 * and only encode the names or comments. Otherwise, handle it normally.
116 */
117
118 for (i = 0; address_headers[i]; i++) {
119 if (strcasecmp(name, address_headers[i]) == 0)
120 return field_encode_address(name, value, encoding, charset);
121 }
122
123 /*
124 * On the encoding we choose, and the specifics of encoding:
125 *
126 * - If a specified encoding is passed in, we use that.
127 * - If more than 50% of the characters are high-bit, we use base64
128 * and encode the whole field as one atom (possibly split).
129 * - Otherwise, we use quoted-printable.
130 */
131
132 if (encoding == CE_UNKNOWN)
133 encoding = (eightbitcount * 10 / (asciicount + eightbitcount) > 5) ?
134 CE_BASE64 : CE_QUOTED;
135
136 unfold_header(value, asciicount + eightbitcount);
137
138 switch (encoding) {
139
140 case CE_BASE64:
141 return field_encode_base64(name, value, charset);
142
143 case CE_QUOTED:
144 return field_encode_quoted(name, value, charset, asciicount,
145 eightbitcount + qpspecialcount, 0);
146
147 default:
148 advise(NULL, "Internal error: unknown RFC-2047 encoding type");
149 return 1;
150 }
151 }
152
153 /*
154 * Encode our specified header (or field) using quoted-printable
155 */
156
157 static int
158 field_encode_quoted(const char *name, char **value, const char *charset,
159 int ascii, int encoded, int phraserules)
160 {
161 int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
162 int charsetlen = strlen(charset), utf8;
163 char *output = NULL, *p, *q;
164
165 /*
166 * Right now we just encode the whole thing. Maybe later on we'll
167 * only encode things on a per-atom basis.
168 */
169
170 p = *value;
171
172 column = prefixlen + 2; /* Header name plus ": " */
173
174 utf8 = strcasecmp(charset, "UTF-8") == 0;
175
176 while (*p != '\0') {
177 /*
178 * Start a new line, if it's time
179 */
180 if (newline) {
181 /*
182 * If it's the start of the header, we don't need to pad it
183 *
184 * The length of the output string is ...
185 * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL
186 *
187 * plus 1 for every ASCII character and 3 for every eight bit
188 * or special character (eight bit characters are written as =XX).
189 *
190 */
191
192 int tokenlen;
193
194 outlen += 9 + charsetlen + ascii + 3 * encoded;
195
196 /*
197 * If output is set, then we're continuing the header. Otherwise
198 * do the initial allocation.
199 */
200
201 if (output) {
202 int curlen = q - output, i;
203 outlen += prefixlen + 1; /* Header plus \n ": " */
204 output = mh_xrealloc(output, outlen);
205 q = output + curlen;
206 *q++ = '?';
207 *q++ = '=';
208 *q++ = '\n';
209 for (i = 0; i < prefixlen; i++)
210 *q++ = ' ';
211 } else {
212 /*
213 * A bit of a hack here; the header can contain multiple
214 * spaces (probably at least one) until we get to the
215 * actual text. Copy until we get to a non-space.
216 */
217 output = mh_xmalloc(outlen);
218 q = output;
219 while (is_fws(*p))
220 *q++ = *p++;
221 }
222
223 tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
224 q += tokenlen;
225 column = prefixlen + tokenlen;
226 newline = 0;
227 }
228
229 /*
230 * Process each character, encoding if necessary
231 *
232 * Note that we have a different set of rules if we're processing
233 * RFC 5322 'phrase' (something you'd see in an address header).
234 */
235
236 column++;
237
238 if (*p == ' ') {
239 *q++ = '_';
240 ascii--;
241 } else if (isascii((unsigned char) *p) &&
242 (phraserules ? qphrasevalid((unsigned char) *p) :
243 !qpspecial((unsigned char) *p))) {
244 *q++ = *p;
245 ascii--;
246 } else {
247 snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
248 q += 3;
249 column += 2; /* column already incremented by 1 above */
250 encoded--;
251 }
252
253 p++;
254
255 /*
256 * We're not allowed more than ENCODELINELIMIT characters per line,
257 * so reserve some room for the final ?=.
258 *
259 * If prefixlen == 0, we haven't been passed in a header name, so
260 * don't ever wrap the field (we're likely doing an address).
261 */
262
263 if (prefixlen == 0)
264 continue;
265
266 if (column >= ENCODELINELIMIT - 2) {
267 newline = 1;
268 } else if (utf8) {
269 /*
270 * Okay, this is a bit weird, but to explain a bit more ...
271 *
272 * RFC 2047 prohibits the splitting of multibyte characters
273 * across encoded words. Right now we only handle the case
274 * of UTF-8, the most common multibyte encoding.
275 *
276 * p is now pointing at the next input character. If we're
277 * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
278 * length of the complete character, then trigger a newline
279 * now. Note that we check the length * 3 since we have to
280 * allow for the encoded output.
281 */
282 if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
283 newline = 1;
284 }
285 }
286 }
287
288 strcat(q, "?=");
289
290 if (prefixlen)
291 strcat(q, "\n");
292
293 free(*value);
294
295 *value = output;
296
297 return 0;
298 }
299
300 /*
301 * Encode our specified header (or field) using base64.
302 *
303 * This is a little easier since every character gets encoded, we can
304 * calculate the line wrap up front.
305 */
306
307 static int
308 field_encode_base64(const char *name, char **value, const char *charset)
309 {
310 int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
311 int outlen = 0, numencode, curlen;
312 char *output = NULL, *p = *value, *q = NULL, *linestart;
313
314 /*
315 * Skip over any leading white space.
316 */
317
318 while (*p == ' ' || *p == '\t')
319 p++;
320
321 /*
322 * If we had a zero-length prefix, then just encode the whole field
323 * as-is, without line wrapping. Note that in addition to the encoding
324 *
325 * The added length we need is =? + charset + ?B? ... ?=
326 *
327 * That's 7 + strlen(charset) + 2 (for \n NUL).
328 */
329
330 while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen +
331 prefixlen) > ENCODELINELIMIT)) {
332
333 /*
334 * Our very first time, don't pad the line in the front
335 *
336 * Note ENCODELINELIMIT is + 2 because of \n \0
337 */
338
339
340 if (! output) {
341 outlen += ENCODELINELIMIT + 2;
342 output = q = mh_xmalloc(outlen);
343 linestart = q - prefixlen; /* Yes, this is intentional */
344 } else {
345 int curstart = linestart - output;
346 curlen = q - output;
347
348 outlen += ENCODELINELIMIT + 2;
349 output = mh_xrealloc(output, outlen);
350 q = output + curlen;
351 linestart = output + curstart;
352 }
353
354 /*
355 * We should have enough space now, so prepend the encoding markers
356 * and character set information. The leading space is intentional.
357 */
358
359 q += snprintf(q, outlen - (q - output), " =?%s?B?", charset);
360
361 /*
362 * Find out how much room we have left on the line and see how
363 * many characters we can stuff in. The start of our line
364 * is marked by "linestart", so use that to figure out how
365 * many characters are left out of ENCODELINELIMIT. Reserve
366 * 2 characters for the end markers and calculate how many
367 * characters we can fit into that space given the base64
368 * encoding expansion.
369 */
370
371 numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);
372
373 if (numencode <= 0) {
374 advise(NULL, "Internal error: tried to encode %d characters "
375 "in base64", numencode);
376 return 1;
377 }
378
379 /*
380 * RFC 2047 prohibits spanning multibyte characters across tokens.
381 * Right now we only check for UTF-8.
382 *
383 * So note the key here ... we want to make sure the character BEYOND
384 * our last character is not a continuation byte. If it's the start
385 * of a new multibyte character or a single-byte character, that's ok.
386 */
387
388 if (strcasecmp(charset, "UTF-8") == 0) {
389 /*
390 * p points to the start of our current buffer, so p + numencode
391 * is one past the last character to encode
392 */
393
394 while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
395 numencode--;
396
397 if (numencode == 0) {
398 advise(NULL, "Internal error: could not find start of "
399 "UTF-8 character when base64 encoding header");
400 return 1;
401 }
402 }
403
404 if (writeBase64raw((unsigned char *) p, numencode,
405 (unsigned char *) q) != OK) {
406 advise(NULL, "Internal error: base64 encoding of header failed");
407 return 1;
408 }
409
410 p += numencode;
411 q += base64len(numencode);
412
413 /*
414 * This will point us at the beginning of the new line (trust me).
415 */
416
417 linestart = q + 3;
418
419 /*
420 * What's going on here? Well, we know we're continuing to the next
421 * line, so we want to add continuation padding. We also add the
422 * trailing marker for the RFC 2047 token at this time as well.
423 * This uses a trick of snprintf(); we tell it to print a zero-length
424 * string, but pad it out to prefixlen - 1 characters; that ends
425 * up always printing out the requested number of spaces. We use
426 * prefixlen - 1 because we always add a space on the starting
427 * token marker; this makes things work out correctly for the first
428 * line, which should have a space between the ':' and the start
429 * of the token.
430 *
431 * It's okay if you don't follow all of that.
432 */
433
434 q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, "");
435 }
436
437 /*
438 * We're here if there is either no prefix, or we can fit it in less
439 * than ENCODELINELIMIT characters. Encode the whole thing.
440 */
441
442 outlen += prefixlen + 9 + charsetlen + base64len(strlen(p));
443 curlen = q - output;
444
445 output = mh_xrealloc(output, outlen);
446 q = output + curlen;
447
448 q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
449 prefixlen ? " " : "", charset);
450
451 if (writeBase64raw((unsigned char *) p, strlen(p),
452 (unsigned char *) q) != OK) {
453 advise(NULL, "Internal error: base64 encoding of header failed");
454 return 1;
455 }
456
457 strcat(q, "?=");
458
459 if (prefixlen)
460 strcat(q, "\n");
461
462 free(*value);
463
464 *value = output;
465
466 return 0;
467 }
468
/*
 * Calculate the length of a UTF-8 multibyte character.
 *
 * p points at the first byte to examine.  If that byte starts a multibyte
 * character, the return value is 1 plus the number of continuation bytes
 * (bytes of the form 10xxxxxx) that immediately follow it.  The count is
 * not checked against the length announced by the lead byte.
 *
 * If p points at the end of the string, at an ASCII character, or into the
 * middle of a multibyte character, then simply return 0.
 */

static int
utf8len(const char *p)
{
    const unsigned char *s = (const unsigned char *) p;
    int len = 1;

    /*
     * Anything below 0x80 is a NUL or a plain ASCII byte; 10xxxxxx is a
     * continuation byte.  The test is spelled out rather than using
     * isascii(), which is not part of ISO C.
     */

    if (*s < 0x80 || (*s & 0xc0) == 0x80)
        return 0;

    for (s++; (*s & 0xc0) == 0x80; s++)
        len++;

    return len;
}
493
/*
 * "Unfold" a header, making it a single line (without continuation)
 *
 * value - Pointer to the header text.  The old string is free()d and
 *         *value is replaced by a newly allocated, unfolded copy.
 * len   - Length of the original string; must be at least strlen(*value).
 *
 * We cheat a bit here; we never make the string longer, so using the
 * original length here is fine.
 */

static void
unfold_header(char **value, int len)
{
    char *str = mh_xmalloc(len + 1);
    char *p = str, *q = *value;

    while (*q != '\0') {
        if (*q == '\n') {
            /*
             * When we get a newline, skip to the next non-whitespace
             * character and add a space to replace all of the whitespace
             *
             * This has the side effect of stripping off the final newline
             * for the header; we put it back in the encoding routine.
             *
             * The increment is deliberately kept out of the is_fws()
             * argument: the macro evaluates its argument more than once.
             */
            while (is_fws(*q))
                q++;

            if (*q == '\0')
                break;

            *p++ = ' ';
        } else {
            *p++ = *q++;
        }
    }

    *p = '\0';

    free(*value);
    *value = str;
}
532
533 /*
534 * Decode a header containing addresses. This means we have to parse
535 * each address and only encode the display-name or comment field.
536 */
537
538 static int
539 field_encode_address(const char *name, char **value, int encoding,
540 const char *charset)
541 {
542 int prefixlen = strlen(name) + 2, column = prefixlen, groupflag, errflag;
543 int eightbitchars;
544 char *mp, *output = NULL;
545 struct mailname *mn;
546
547 /*
548 * Because these are addresses, we need to handle them individually.
549 *
550 * Break them down and process them one by one. This means we have to
551 * rewrite the whole header, but that's unavoidable.
552 */
553
554 /*
555 * The output headers always have to start with a space first.
556 */
557
558 output = add(" ", output);
559
560 for (groupflag = 0; mp = getname(*value); ) {
561 if ((mn = getm(mp, NULL, 0, AD_HOST, NULL)) == NULL) {
562 errflag++;
563 continue;
564 }
565
566 /*
567 * We only care if the phrase (m_pers) or any trailing comment
568 * (m_note) have 8-bit characters. If doing q-p, we also need
569 * to encode anything marked as qspecial().
570 */
571 }
572 }
573
/*
 * Scan a string, check for characters that need to be encoded
 *
 * string        - NUL-terminated text to examine.
 * asciilen      - Set to the number of ASCII characters in the string.
 * eightbitchars - Set to the number of non-ASCII (8-bit) characters.
 * specialchars  - Set to the number of ASCII characters that fail the
 *                 qphrasevalid() test.  These are also included in
 *                 *asciilen.
 *
 * Returns non-zero if the string contains at least one 8-bit character,
 * and zero otherwise.
 */

static int
scanstring(const char *string, int *asciilen, int *eightbitchars,
           int *specialchars)
{
    *asciilen = 0;
    *eightbitchars = 0;
    *specialchars = 0;

    for (; *string != '\0'; string++) {
        if (isascii((unsigned char) *string)) {
            /* Parenthesized: we bump the counter, not the pointer */
            (*asciilen)++;
            if (!qphrasevalid((unsigned char) *string))
                (*specialchars)++;
        } else {
            (*eightbitchars)++;
        }
    }

    return *eightbitchars > 0;
}