]>
diplodocus.org Git - nmh/blob - sbr/encode_rfc2047.c
2 * Routines to encode message headers using RFC 2047-encoding.
4 * This code is Copyright (c) 2002, by the authors of nmh. See the
5 * COPYRIGHT file in the root directory of the nmh distribution for
6 * complete copyright information.
10 #include <h/mhparse.h>
11 #include <h/addrsbr.h>
15 * List of headers that contain addresses and as a result require special
19 static char *address_headers
[] = {
36 * Macros we use for parsing headers
/*
 * Character-class and sizing helpers used while encoding headers.
 *
 * Every macro argument is fully parenthesized so that an expression
 * argument (for example a conditional expression) is classified as a
 * whole instead of being torn apart by operator precedence.  The
 * character-class macros still evaluate their argument more than once,
 * so never pass an expression with side effects (e.g. *p++).
 */

/* True if c is a tab, a space or a newline. */
#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')

/*
 * True if c is an ASCII digit, an ASCII letter, or one of
 * ! * + - / = _
 *
 * NOTE(review): '=' and '_' have a special meaning inside a "Q" encoded
 * word (RFC 2047 section 4.2); confirm that callers really want them
 * classified as valid here.
 */
#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
			 ((c) >= 'A' && (c) <= 'Z') || \
			 ((c) >= 'a' && (c) <= 'z') || \
			 (c) == '!' || (c) == '*' || (c) == '+' || \
			 (c) == '-' || (c) == '/' || (c) == '=' || \
			 (c) == '_')

/* True if c sorts below space, or is one of = ? _ */
#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')

/* Length of the base64 text produced for n input bytes (with padding). */
#define base64len(n) ((((n) + 2) / 3) * 4)

/* Number of input bytes whose base64 form fits in n output characters. */
#define strbase64(n) (((n) / 4) * 3)

/* Maximum number of characters we allow on one encoded output line. */
#define ENCODELINELIMIT 76
52 static void unfold_header(char **, int);
53 static int field_encode_address(const char *, char **, int, const char *);
54 static int field_encode_quoted(const char *, char **, const char *, int,
56 static int field_encode_base64(const char *, char **, const char *);
57 static int scanstring(const char *, int *, int *, int *);
58 static int utf8len(const char *);
61 * Encode a message header using RFC 2047 encoding. We make the assumption
62 * that all characters < 128 are ASCII and as a consequence don't need any
67 encode_rfc2047(const char *name
, char **value
, int encoding
,
70 int i
, asciicount
= 0, eightbitcount
= 0, qpspecialcount
= 0;
74 * First, check to see if we even need to encode the header
77 for (p
= *value
; *p
!= '\0'; p
++) {
78 if (isascii((unsigned char) *p
)) {
80 if (qpspecial((unsigned char) *p
))
86 if (eightbitcount
== 0)
90 * Some rules from RFC 2047:
92 * - Encoded words cannot be more than 75 characters long
93 * - Multiple "long" encoded words must be on new lines.
95 * Also, we're not permitted to encode email addresses, so
96 * we need to actually _parse_ email addresses and only encode
101 * If charset was NULL, then get the value from the locale. But
102 * we reject it if it returns US-ASCII
106 charset
= write_charset_8bit();
108 if (strcasecmp(charset
, "US-ASCII") == 0) {
109 advise(NULL
, "Cannot use US-ASCII with 8 bit characters in header");
114 * If we have an address header, then we need to parse the addresses
115 * and only encode the names or comments. Otherwise, handle it normally.
118 for (i
= 0; address_headers
[i
]; i
++) {
119 if (strcasecmp(name
, address_headers
[i
]) == 0)
120 return field_encode_address(name
, value
, encoding
, charset
);
124 * On the encoding we choose, and the specifics of encoding:
126 * - If a specified encoding is passed in, we use that.
127 * - If more than 50% of the characters are high-bit, we use base64
128 * and encode the whole field as one atom (possibly split).
129 * - Otherwise, we use quoted-printable.
132 if (encoding
== CE_UNKNOWN
)
133 encoding
= (eightbitcount
* 10 / (asciicount
+ eightbitcount
) > 5) ?
134 CE_BASE64
: CE_QUOTED
;
136 unfold_header(value
, asciicount
+ eightbitcount
);
141 return field_encode_base64(name
, value
, charset
);
144 return field_encode_quoted(name
, value
, charset
, asciicount
,
145 eightbitcount
+ qpspecialcount
, 0);
148 advise(NULL
, "Internal error: unknown RFC-2047 encoding type");
154 * Encode our specified header (or field) using quoted-printable
158 field_encode_quoted(const char *name
, char **value
, const char *charset
,
159 int ascii
, int encoded
, int phraserules
)
161 int prefixlen
= name
? strlen(name
) + 2: 0, outlen
= 0, column
, newline
= 1;
162 int charsetlen
= strlen(charset
), utf8
;
163 char *output
= NULL
, *p
, *q
;
166 * Right now we just encode the whole thing. Maybe later on we'll
167 * only encode things on a per-atom basis.
172 column
= prefixlen
+ 2; /* Header name plus ": " */
174 utf8
= strcasecmp(charset
, "UTF-8") == 0;
178 * Start a new line, if it's time
182 * If it's the start of the header, we don't need to pad it
184 * The length of the output string is ...
185 * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL
187 * plus 1 for every ASCII character and 3 for every eight bit
188 * or special character (eight bit characters are written as =XX).
194 outlen
+= 9 + charsetlen
+ ascii
+ 3 * encoded
;
197 * If output is set, then we're continuing the header. Otherwise
198 * do the initial allocation.
202 int curlen
= q
- output
, i
;
203 outlen
+= prefixlen
+ 1; /* Header plus \n ": " */
204 output
= mh_xrealloc(output
, outlen
);
209 for (i
= 0; i
< prefixlen
; i
++)
213 * A bit of a hack here; the header can contain multiple
214 * spaces (probably at least one) until we get to the
215 * actual text. Copy until we get to a non-space.
217 output
= mh_xmalloc(outlen
);
223 tokenlen
= snprintf(q
, outlen
- (q
- output
), "=?%s?Q?", charset
);
225 column
= prefixlen
+ tokenlen
;
230 * Process each character, encoding if necessary
232 * Note that we have a different set of rules if we're processing
233 * RFC 5322 'phrase' (something you'd see in an address header).
241 } else if (isascii((unsigned char) *p
) &&
242 (phraserules
? qphrasevalid((unsigned char) *p
) :
243 !qpspecial((unsigned char) *p
))) {
247 snprintf(q
, outlen
- (q
- output
), "=%02X", (unsigned char) *p
);
249 column
+= 2; /* column already incremented by 1 above */
256 * We're not allowed more than ENCODELINELIMIT characters per line,
257 * so reserve some room for the final ?=.
259 * If prefixlen == 0, we haven't been passed in a header name, so
260 * don't ever wrap the field (we're likely doing an address).
266 if (column
>= ENCODELINELIMIT
- 2) {
270 * Okay, this is a bit weird, but to explain a bit more ...
272 * RFC 2047 prohibits the splitting of multibyte characters
273 * across encoded words. Right now we only handle the case
274 * of UTF-8, the most common multibyte encoding.
276 * p is now pointing at the next input character. If we're
277 * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
278 * length of the complete character, then trigger a newline
279 * now. Note that we check the length * 3 since we have to
280 * allow for the encoded output.
282 if (column
+ (utf8len(p
) * 3) > ENCODELINELIMIT
- 2) {
301 * Encode our specified header (or field) using base64.
303 * This is a little easier since every character gets encoded, we can
304 * calculate the line wrap up front.
308 field_encode_base64(const char *name
, char **value
, const char *charset
)
310 int prefixlen
= name
? strlen(name
) + 2 : 0, charsetlen
= strlen(charset
);
311 int outlen
= 0, numencode
, curlen
;
312 char *output
= NULL
, *p
= *value
, *q
= NULL
, *linestart
;
315 * Skip over any leading white space.
318 while (*p
== ' ' || *p
== '\t')
322 * If we had a zero-length prefix, then just encode the whole field
323 * as-is, without line wrapping. Note that in addition to the encoding
325 * The added length we need is =? + charset + ?B? ... ?=
327 * That's 7 + strlen(charset) + 2 (for \n NUL).
330 while (prefixlen
&& ((base64len(strlen(p
)) + 7 + charsetlen
+
331 prefixlen
) > ENCODELINELIMIT
)) {
334 * Our very first time, don't pad the line in the front
336 * Note that we allocate ENCODELINELIMIT + 2; the extra 2 is for \n \0
341 outlen
+= ENCODELINELIMIT
+ 2;
342 output
= q
= mh_xmalloc(outlen
);
343 linestart
= q
- prefixlen
; /* Yes, this is intentional */
345 int curstart
= linestart
- output
;
348 outlen
+= ENCODELINELIMIT
+ 2;
349 output
= mh_xrealloc(output
, outlen
);
351 linestart
= output
+ curstart
;
355 * We should have enough space now, so prepend the encoding markers
356 * and character set information. The leading space is intentional.
359 q
+= snprintf(q
, outlen
- (q
- output
), " =?%s?B?", charset
);
362 * Find out how much room we have left on the line and see how
363 * many characters we can stuff in. The start of our line
364 * is marked by "linestart", so use that to figure out how
365 * many characters are left out of ENCODELINELIMIT. Reserve
366 * 2 characters for the end markers and calculate how many
367 * characters we can fit into that space given the base64
368 * encoding expansion.
371 numencode
= strbase64(ENCODELINELIMIT
- (q
- linestart
) - 2);
373 if (numencode
<= 0) {
374 advise(NULL
, "Internal error: tried to encode %d characters "
375 "in base64", numencode
);
380 * RFC 2047 prohibits spanning multibyte characters across tokens.
381 * Right now we only check for UTF-8.
383 * So note the key here ... we want to make sure the character BEYOND
384 * our last character is not a continuation byte. If it's the start
385 * of a new multibyte character or a single-byte character, that's ok.
388 if (strcasecmp(charset
, "UTF-8") == 0) {
390 * p points to the start of our current buffer, so p + numencode
391 * is one past the last character to encode
394 while (numencode
> 0 && ((*(p
+ numencode
) & 0xc0) == 0x80))
397 if (numencode
== 0) {
398 advise(NULL
, "Internal error: could not find start of "
399 "UTF-8 character when base64 encoding header");
404 if (writeBase64raw((unsigned char *) p
, numencode
,
405 (unsigned char *) q
) != OK
) {
406 advise(NULL
, "Internal error: base64 encoding of header failed");
411 q
+= base64len(numencode
);
414 * This will point us at the beginning of the new line (trust me).
420 * What's going on here? Well, we know we're continuing to the next
421 * line, so we want to add continuation padding. We also add the
422 * trailing marker for the RFC 2047 token at this time as well.
423 * This uses a trick of snprintf(); we tell it to print a zero-length
424 * string, but pad it out to prefixlen - 1 characters; that ends
425 * up always printing out the requested number of spaces. We use
426 * prefixlen - 1 because we always add a space on the starting
427 * token marker; this makes things work out correctly for the first
428 * line, which should have a space between the ':' and the start
431 * It's okay if you don't follow all of that.
434 q
+= snprintf(q
, outlen
- (q
- output
), "?=\n%*s", prefixlen
- 1, "");
438 * We're here if there is either no prefix, or we can fit it in less
439 * than ENCODELINELIMIT characters. Encode the whole thing.
442 outlen
+= prefixlen
+ 9 + charsetlen
+ base64len(strlen(p
));
445 output
= mh_xrealloc(output
, outlen
);
448 q
+= snprintf(q
, outlen
- (q
- output
), "%s=?%s?B?",
449 prefixlen
? " " : "", charset
);
451 if (writeBase64raw((unsigned char *) p
, strlen(p
),
452 (unsigned char *) q
) != OK
) {
453 advise(NULL
, "Internal error: base64 encoding of header failed");
470 * Calculate the length of a UTF-8 character.
472 * If it's not a UTF-8 character (or we're in the middle of a multibyte
473 * character) then simply return 0.
477 utf8len(const char *p
)
484 if (isascii((unsigned char) *p
) || (((unsigned char) *p
) & 0xc0) == 0x80)
488 while ((((unsigned char) *p
++) & 0xc0) == 0x80)
495 * "Unfold" a header, making it a single line (without continuation)
497 * We cheat a bit here; we never make the string longer, so using the
498 * original length here is fine.
502 unfold_header(char **value
, int len
)
504 char *str
= mh_xmalloc(len
+ 1);
505 char *p
= str
, *q
= *value
;
510 * When we get a newline, skip to the next non-whitespace
511 * character and add a space to replace all of the whitespace
513 * This has the side effect of stripping off the final newline
514 * for the header; we put it back in the encoding routine.
534 * Encode a header containing addresses. This means we have to parse
535 * each address and only encode the display-name or comment field.
539 field_encode_address(const char *name
, char **value
, int encoding
,
542 int prefixlen
= strlen(name
) + 2, column
= prefixlen
, groupflag
, errflag
;
544 char *mp
, *output
= NULL
;
548 * Because these are addresses, we need to handle them individually.
550 * Break them down and process them one by one. This means we have to
551 * rewrite the whole header, but that's unavoidable.
555 * The output headers always have to start with a space first.
558 output
= add(" ", output
);
560 for (groupflag
= 0; mp
= getname(*value
); ) {
561 if ((mn
= getm(mp
, NULL
, 0, AD_HOST
, NULL
)) == NULL
) {
567 * We only care if the phrase (m_pers) or any trailing comment
568 * (m_note) have 8-bit characters. If doing q-p, we also need
569 * to encode anything marked as qspecial().
575 * Scan a string, check for characters that need to be encoded
579 scanstring(const char *string
, int *asciilen
, int *eightbitchars
,
586 for (; *string
!= '\0'; string
++) {
587 if ((isascii((unsigned char) *string
))) {
589 if (!qphrasevalid((unsigned char) *string
))
596 return eightbitchars
> 0;