]>
diplodocus.org Git - nmh/blob - sbr/encode_rfc2047.c
2 * Routines to encode message headers using RFC 2047-encoding.
4 * This code is Copyright (c) 2002, by the authors of nmh. See the
5 * COPYRIGHT file in the root directory of the nmh distribution for
6 * complete copyright information.
10 #include <h/mhparse.h>
11 #include <h/addrsbr.h>
15 * List of headers that contain addresses and as a result require special
19 static char *address_headers
[] = {
36 * Macros we use for parsing headers
/*
 * Character-class helpers used while parsing and encoding headers.
 *
 * Every use of the argument is parenthesized so that an argument
 * expression containing lower-precedence operators (|, &, ?:, ...) is
 * classified as a whole.  The argument is still evaluated more than
 * once, so it must not have side effects.  Callers in this file pass
 * a value that has been cast to unsigned char.
 */

/* True for a tab, a space or a newline. */
#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')

/*
 * True for the characters that are copied through unencoded when
 * quoted-printable encoding is done with the 'phrase' rules
 * (digits, ASCII letters and ! * + - / = _).
 */
#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
			 ((c) >= 'A' && (c) <= 'Z') || \
			 ((c) >= 'a' && (c) <= 'z') || \
			 (c) == '!' || (c) == '*' || (c) == '+' || \
			 (c) == '-' || (c) == '/' || (c) == '=' || \
			 (c) == '_')

/*
 * True for characters below space, and for '=', '?' and '_': ASCII
 * characters that still have to be written as =XX when the normal
 * (non-phrase) quoted-printable rules are in effect.
 */
#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')
47 #define base64len(n) ((((n) + 2) / 3 ) * 4) /* String len to base64 len */
48 #define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */
50 #define ENCODELINELIMIT 76
52 static void unfold_header(char **, int);
53 static int field_encode_address(const char *, char **, int, const char *);
54 static int field_encode_quoted(const char *, char **, const char *, int,
56 static int field_encode_base64(const char *, char **, const char *);
57 static int scanstring(const char *, int *, int *, int *);
58 static int utf8len(const char *);
59 static int pref_encoding(int, int, int);
62 * Encode a message header using RFC 2047 encoding. We make the assumption
63 * that all characters < 128 are ASCII and as a consequence don't need any
68 encode_rfc2047(const char *name
, char **value
, int encoding
,
71 int i
, asciicount
= 0, eightbitcount
= 0, qpspecialcount
= 0;
75 * First, check to see if we even need to encode the header
78 for (p
= *value
; *p
!= '\0'; p
++) {
79 if (isascii((unsigned char) *p
)) {
81 if (qpspecial((unsigned char) *p
))
87 if (eightbitcount
== 0)
91 * Some rules from RFC 2047:
93 * - Encoded words cannot be more than 75 characters long
94 * - Multiple "long" encoded words must be on new lines.
96 * Also, we're not permitted to encode email addresses, so
97 * we need to actually _parse_ email addresses and only encode
102 * If charset was NULL, then get the value from the locale. But
103 * we reject it if it returns US-ASCII
107 charset
= write_charset_8bit();
109 if (strcasecmp(charset
, "US-ASCII") == 0) {
110 advise(NULL
, "Cannot use US-ASCII with 8 bit characters in header");
115 * If we have an address header, then we need to parse the addresses
116 * and only encode the names or comments. Otherwise, handle it normally.
119 for (i
= 0; address_headers
[i
]; i
++) {
120 if (strcasecmp(name
, address_headers
[i
]) == 0)
121 return field_encode_address(name
, value
, encoding
, charset
);
125 * On the encoding we choose, and the specifics of encoding:
127 * - If a specified encoding is passed in, we use that.
128 * - Otherwise, pick which encoding is shorter.
130 * We don't quite handle continuation right here, but it should be
134 if (encoding
== CE_UNKNOWN
)
135 encoding
= pref_encoding(asciicount
, qpspecialcount
, eightbitcount
);
137 unfold_header(value
, asciicount
+ eightbitcount
);
142 return field_encode_base64(name
, value
, charset
);
145 return field_encode_quoted(name
, value
, charset
, asciicount
,
146 eightbitcount
+ qpspecialcount
, 0);
149 advise(NULL
, "Internal error: unknown RFC-2047 encoding type");
155 * Encode our specified header (or field) using quoted-printable
159 field_encode_quoted(const char *name
, char **value
, const char *charset
,
160 int ascii
, int encoded
, int phraserules
)
162 int prefixlen
= name
? strlen(name
) + 2: 0, outlen
= 0, column
, newline
= 1;
163 int charsetlen
= strlen(charset
), utf8
;
164 char *output
= NULL
, *p
, *q
= NULL
;
167 * Right now we just encode the whole thing. Maybe later on we'll
168 * only encode things on a per-atom basis.
173 column
= prefixlen
+ 2; /* Header name plus ": " */
175 utf8
= strcasecmp(charset
, "UTF-8") == 0;
179 * Start a new line, if it's time
183 * If it's the start of the header, we don't need to pad it
185 * The length of the output string is ...
186 * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL
188 * plus 1 for every ASCII character and 3 for every eight bit
189 * or special character (eight bit characters are written as =XX).
195 outlen
+= 9 + charsetlen
+ ascii
+ 3 * encoded
;
198 * If output is set, then we're continuing the header. Otherwise
199 * do the initial allocation.
203 int curlen
= q
- output
, i
;
204 outlen
+= prefixlen
+ 1; /* Header plus \n ": " */
205 output
= mh_xrealloc(output
, outlen
);
210 for (i
= 0; i
< prefixlen
; i
++)
214 * A bit of a hack here; the header can contain multiple
215 * spaces (probably at least one) until we get to the
216 * actual text. Copy until we get to a non-space.
218 output
= mh_xmalloc(outlen
);
224 tokenlen
= snprintf(q
, outlen
- (q
- output
), "=?%s?Q?", charset
);
226 column
= prefixlen
+ tokenlen
;
231 * Process each character, encoding if necessary
233 * Note that we have a different set of rules if we're processing
234 * RFC 5322 'phrase' (something you'd see in an address header).
242 } else if (isascii((unsigned char) *p
) &&
243 (phraserules
? qphrasevalid((unsigned char) *p
) :
244 !qpspecial((unsigned char) *p
))) {
248 snprintf(q
, outlen
- (q
- output
), "=%02X", (unsigned char) *p
);
250 column
+= 2; /* column already incremented by 1 above */
257 * We're not allowed more than ENCODELINELIMIT characters per line,
258 * so reserve some room for the final ?=.
260 * If prefixlen == 0, we haven't been passed in a header name, so
261 * don't ever wrap the field (we're likely doing an address).
267 if (column
>= ENCODELINELIMIT
- 2) {
271 * Okay, this is a bit weird, but to explain a bit more ...
273 * RFC 2047 prohibits the splitting of multibyte characters
274 * across encoded words. Right now we only handle the case
275 * of UTF-8, the most common multibyte encoding.
277 * p is now pointing at the next input character. If we're
278 * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
279 * length of the complete character, then trigger a newline
280 * now. Note that we check the length * 3 since we have to
281 * allow for the encoded output.
283 if (column
+ (utf8len(p
) * 3) > ENCODELINELIMIT
- 2) {
290 /* This should never happen, but just in case. Found by
291 clang static analyzer. */
292 admonish (NULL
, "null output encoding for %s", *value
);
311 * Encode our specified header (or field) using base64.
313 * This is a little easier since every character gets encoded, we can
314 * calculate the line wrap up front.
318 field_encode_base64(const char *name
, char **value
, const char *charset
)
320 int prefixlen
= name
? strlen(name
) + 2 : 0, charsetlen
= strlen(charset
);
321 int outlen
= 0, numencode
, curlen
;
322 char *output
= NULL
, *p
= *value
, *q
= NULL
, *linestart
= NULL
;
325 * Skip over any leading white space.
328 while (*p
== ' ' || *p
== '\t')
332 * If we had a zero-length prefix, then just encode the whole field
333 * as-is, without line wrapping. Note that in addition to the encoding
335 * The added length we need is =? + charset + ?B? ... ?=
337 * That's 7 + strlen(charset) + 2 (for \n NUL).
340 while (prefixlen
&& ((base64len(strlen(p
)) + 7 + charsetlen
+
341 prefixlen
) > ENCODELINELIMIT
)) {
344 * Our very first time, don't pad the line in the front
346 * Note ENCODELINELIMIT is + 2 because of \n \0
351 outlen
+= ENCODELINELIMIT
+ 2;
352 output
= q
= mh_xmalloc(outlen
);
353 linestart
= q
- prefixlen
; /* Yes, this is intentional */
355 int curstart
= linestart
- output
;
358 outlen
+= ENCODELINELIMIT
+ 2;
359 output
= mh_xrealloc(output
, outlen
);
361 linestart
= output
+ curstart
;
365 * We should have enough space now, so prepend the encoding markers
366 * and character set information. The leading space is intentional.
369 q
+= snprintf(q
, outlen
- (q
- output
), " =?%s?B?", charset
);
372 * Find out how much room we have left on the line and see how
373 * many characters we can stuff in. The start of our line
374 * is marked by "linestart", so use that to figure out how
375 * many characters are left out of ENCODELINELIMIT. Reserve
376 * 2 characters for the end markers and calculate how many
377 * characters we can fit into that space given the base64
378 * encoding expansion.
381 numencode
= strbase64(ENCODELINELIMIT
- (q
- linestart
) - 2);
383 if (numencode
<= 0) {
384 advise(NULL
, "Internal error: tried to encode %d characters "
385 "in base64", numencode
);
390 * RFC 2047 prohibits spanning multibyte characters across tokens.
391 * Right now we only check for UTF-8.
393 * So note the key here ... we want to make sure the character BEYOND
394 * our last character is not a continuation byte. If it's the start
395 * of a new multibyte character or a single-byte character, that's ok.
398 if (strcasecmp(charset
, "UTF-8") == 0) {
400 * p points to the start of our current buffer, so p + numencode
401 * is one past the last character to encode
404 while (numencode
> 0 && ((*(p
+ numencode
) & 0xc0) == 0x80))
407 if (numencode
== 0) {
408 advise(NULL
, "Internal error: could not find start of "
409 "UTF-8 character when base64 encoding header");
414 if (writeBase64raw((unsigned char *) p
, numencode
,
415 (unsigned char *) q
) != OK
) {
416 advise(NULL
, "Internal error: base64 encoding of header failed");
421 q
+= base64len(numencode
);
424 * This will point us at the beginning of the new line (trust me).
430 * What's going on here? Well, we know we're continuing to the next
431 * line, so we want to add continuation padding. We also add the
432 * trailing marker for the RFC 2047 token at this time as well.
433 * This uses a trick of snprintf(); we tell it to print a zero-length
434 * string, but pad it out to prefixlen - 1 characters; that ends
435 * up always printing out the requested number of spaces. We use
436 * prefixlen - 1 because we always add a space on the starting
437 * token marker; this makes things work out correctly for the first
438 * line, which should have a space between the ':' and the start
441 * It's okay if you don't follow all of that.
444 q
+= snprintf(q
, outlen
- (q
- output
), "?=\n%*s", prefixlen
- 1, "");
448 * We're here if there is either no prefix, or we can fit it in less
449 * than ENCODELINELIMIT characters. Encode the whole thing.
452 outlen
+= prefixlen
+ 9 + charsetlen
+ base64len(strlen(p
));
455 output
= mh_xrealloc(output
, outlen
);
458 q
+= snprintf(q
, outlen
- (q
- output
), "%s=?%s?B?",
459 prefixlen
? " " : "", charset
);
461 if (writeBase64raw((unsigned char *) p
, strlen(p
),
462 (unsigned char *) q
) != OK
) {
463 advise(NULL
, "Internal error: base64 encoding of header failed");
480 * Calculate the length of a UTF-8 character.
482 * If it's not a UTF-8 character (or we're in the middle of a multibyte
483 * character) then simply return 0.
487 utf8len(const char *p
)
494 if (isascii((unsigned char) *p
) || (((unsigned char) *p
) & 0xc0) == 0x80)
498 while ((((unsigned char) *p
++) & 0xc0) == 0x80)
505 * "Unfold" a header, making it a single line (without continuation)
507 * We cheat a bit here; we never make the string longer, so using the
508 * original length here is fine.
512 unfold_header(char **value
, int len
)
514 char *str
= mh_xmalloc(len
+ 1);
515 char *p
= str
, *q
= *value
;
520 * When we get a newline, skip to the next non-whitespace
521 * character and add a space to replace all of the whitespace
523 * This has the side effect of stripping off the final newline
524 * for the header; we put it back in the encoding routine.
544 * Encode a header containing addresses. This means we have to parse
545 * each address and only encode the display-name or comment field.
549 field_encode_address(const char *name
, char **value
, int encoding
,
552 int prefixlen
= strlen(name
) + 2, column
= prefixlen
, groupflag
;
553 int asciichars
, specialchars
, eightbitchars
, reformat
= 0, errflag
= 0;
555 char *mp
, *cp
= NULL
, *output
= NULL
;
557 size_t tmpbufsize
= 0;
562 * Because these are addresses, we need to handle them individually.
564 * Break them down and process them one by one. This means we have to
565 * rewrite the whole header, but that's unavoidable.
569 * The output headers always have to start with a space first; this
570 * is just the way the API works right now.
573 output
= add(" ", output
);
575 for (groupflag
= 0; (mp
= getname(*value
)); ) {
576 if ((mn
= getm(mp
, NULL
, 0, errbuf
, sizeof(errbuf
))) == NULL
) {
577 advise(NULL
, "%s: %s", errbuf
, mp
);
585 * We only care if the phrase (m_pers) or any trailing comment
586 * (m_note) have 8-bit characters. If doing q-p, we also need
587 * to encode anything marked as qspecial(). Unquote it first
588 * so the specialchars count is right.
594 if ((len
= strlen(mn
->m_pers
)) + 1 > tmpbufsize
) {
595 tmpbuf
= mh_xrealloc(tmpbuf
, tmpbufsize
= len
+ 1);
598 unquote_string(mn
->m_pers
, tmpbuf
);
600 if (scanstring(tmpbuf
, &asciichars
, &eightbitchars
,
603 * If we have 8-bit characters, encode it.
606 if (encoding
== CE_UNKNOWN
)
607 encoding
= pref_encoding(asciichars
, specialchars
,
611 * This is okay, because the output of unquote_string will be either
612 * equal or shorter than the original.
615 strcpy(mn
->m_pers
, tmpbuf
);
620 if (field_encode_base64(NULL
, &mn
->m_pers
, charset
)) {
627 if (field_encode_quoted(NULL
, &mn
->m_pers
, charset
, asciichars
,
628 eightbitchars
+ specialchars
, 1)) {
635 advise(NULL
, "Internal error: unknown RFC-2047 encoding type");
646 * The "note" field is generally a comment at the end of the address,
647 * at least as how it's implemented here. Notes are always surrounded
648 * by parenthesis (since they're comments). Strip them out and
649 * then put them back when we format the final field, but they do
656 if ((len
= strlen(mn
->m_note
)) + 1 > tmpbufsize
) {
657 tmpbuf
= mh_xrealloc(tmpbuf
, tmpbufsize
= len
+ 1);
660 if (mn
->m_note
[0] != '(' || mn
->m_note
[len
- 1] != ')') {
661 advise(NULL
, "Internal error: Invalid note field \"%s\"",
667 strncpy(tmpbuf
, mn
->m_note
+ 1, len
- 1);
668 tmpbuf
[len
- 2] = '\0';
670 if (scanstring(tmpbuf
, &asciichars
, &eightbitchars
,
673 * If we have 8-bit characters, encode it.
676 if (encoding
== CE_UNKNOWN
)
677 encoding
= pref_encoding(asciichars
, specialchars
,
683 if (field_encode_base64(NULL
, &tmpbuf
, charset
)) {
690 if (field_encode_quoted(NULL
, &tmpbuf
, charset
, asciichars
,
691 eightbitchars
+ specialchars
, 1)) {
698 advise(NULL
, "Internal error: unknown RFC-2047 encoding type");
706 * Make sure the size of tmpbuf is correct (it always gets
707 * reallocated in the above functions).
710 tmpbufsize
= strlen(tmpbuf
) + 1;
713 * Put the note field back surrounded by parentheses.
716 mn
->m_note
= mh_xrealloc(mn
->m_note
, tmpbufsize
+ 2);
718 snprintf(mn
->m_note
, tmpbufsize
+ 2, "(%s)", tmpbuf
);
724 * So, some explanation is in order.
726 * We know we need to rewrite at least one address in the header,
727 * otherwise we wouldn't be here. If we had to reformat this
728 * particular address, then run it through adrformat(). Otherwise
729 * we can use m_text directly.
733 * If we were in a group but are no longer, make sure we add a
734 * semicolon (which needs to be FIRST, as it needs to be at the end
735 * of the last address).
738 if (groupflag
&& ! mn
->m_ingrp
) {
739 output
= add(";", output
);
743 groupflag
= mn
->m_ingrp
;
746 cp
= add(mn
->m_gname
, NULL
);
750 cp
= add(adrformat(mn
), cp
);
752 cp
= add(mn
->m_text
, cp
);
758 * If we're not at the beginning of the line, add a comma and
759 * either a space or a newline.
762 if (column
!= prefixlen
) {
763 if (len
+ column
+ 2 > OUTPUTLINELEN
) {
765 if ((size_t) (prefixlen
+ 3) < tmpbufsize
)
766 tmpbuf
= mh_xrealloc(tmpbuf
, tmpbufsize
= prefixlen
+ 3);
768 snprintf(tmpbuf
, tmpbufsize
, ",\n%*s", column
= prefixlen
, "");
769 output
= add(tmpbuf
, output
);
771 output
= add(", ", output
);
777 * Finally add the address
780 output
= add(cp
, output
);
787 * Just in case we're at the end of a list
791 output
= add(";", output
);
794 output
= add("\n", output
);
811 * Scan a string, check for characters that need to be encoded
815 scanstring(const char *string
, int *asciilen
, int *eightbitchars
,
822 for (; *string
!= '\0'; string
++) {
823 if ((isascii((unsigned char) *string
))) {
826 * So, a space is not a valid phrase character, but we're counting
827 * an exception here, because in q-p a space can be directly
828 * encoded as an underscore.
830 if (!qphrasevalid((unsigned char) *string
) && *string
!= ' ')
837 return *eightbitchars
> 0;
841 * This function is to be used to decide which encoding algorithm we should
842 * use if one is not given. Basically, we pick whichever one is the shorter
847 * ascii - Number of ASCII characters in to-be-encoded string.
848 * specials - Number of ASCII characters in to-be-encoded string that
849 * still require encoding under quoted-printable. Note that
850 * these are included in the "ascii" total.
851 * eightbit - Eight-bit characters in the to-be-encoded string.
853 * Returns one of CE_BASE64 or CE_QUOTED.
857 pref_encoding(int ascii
, int specials
, int eightbits
)
860 * The length of the q-p encoding is:
862 * ascii - specials + (specials + eightbits) * 3.
864 * The length of the base64 encoding is:
866 * base64len(ascii + eightbits) (See macro for details)
869 return base64len(ascii
+ eightbits
) < (ascii
- specials
+
870 (specials
+ eightbits
) * 3) ? CE_BASE64
: CE_QUOTED
;