]> diplodocus.org Git - nmh/blob - sbr/fmt_rfc2047.c
Alter mh-chart(7)'s NAME to be lowercase.
[nmh] / sbr / fmt_rfc2047.c
1
2 /*
3 * fmt_rfc2047.c -- decode RFC-2047 header format
4 *
5 * This code is Copyright (c) 2002, by the authors of nmh. See the
6 * COPYRIGHT file in the root directory of the nmh distribution for
7 * complete copyright information.
8 */
9
10 #include <h/mh.h>
11 #include <h/utils.h>
12 #ifdef HAVE_ICONV
13 # include <iconv.h>
14 #endif
15
/*
 * Map a byte value to the value of the hexadecimal digit it represents
 * ('0'-'9' => 0-9, 'A'-'F' and 'a'-'f' => 10-15), or -1 if the byte is
 * not a hexadecimal digit.  The table has one entry for every possible
 * unsigned char value, so it may be indexed by any byte without a range
 * check.  It is never modified, hence const.
 */
static const signed char hexindex[256] = {
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
    -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
};
34
/*
 * Map a 7-bit character to its value in the base64 alphabet
 * ('A'-'Z' => 0-25, 'a'-'z' => 26-51, '0'-'9' => 52-61, '+' => 62,
 * '/' => 63), or -1 if the character is not part of the alphabet.
 * The table is never modified, hence const.
 */
static const signed char index_64[128] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
};

/*
 * Look up the base64 value of character c.  Bytes above 127 fall outside
 * the table and yield -1, as does any character not in the alphabet
 * (including the '=' padding character).  The argument is evaluated
 * more than once, so it must not have side effects.
 */
#define char64(c) (((unsigned char) (c) > 127) ? -1 : index_64[(unsigned char) (c)])
47
48 /*
49 * Decode two quoted-pair characters
50 */
51
52 int
53 decode_qp (unsigned char byte1, unsigned char byte2)
54 {
55 if (hexindex[byte1] == -1 || hexindex[byte2] == -1)
56 return -1;
57 return (hexindex[byte1] << 4 | hexindex[byte2]);
58 }
59
/* Check if character is linear whitespace */
#define is_lws(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')

/*
 * Add character to the destination buffer, and bomb out if it fills up.
 * dstlen counts the bytes still available; the last one is always kept
 * back for the terminating NUL, so the jump is taken as soon as it would
 * be consumed.
 */
#define ADDCHR(C) do { *q++ = (C); dstlen--; if (!dstlen) goto buffull; } while (0)

/*
 * ADDCHR2 is for adding characters when q is or might be convbuf:
 * in this case on buffer-full we want to run iconv before returning.
 */
#ifdef HAVE_ICONV
# define ADDCHR2(C) do { *q++ = (C); dstlen--; if (!dstlen) goto iconvbuffull; } while (0)
#else
# define ADDCHR2(C) ADDCHR(C)
#endif

/*
 * Decode the string as a RFC-2047 header field.
 *
 * str     NUL-terminated header text.  It is modified temporarily while
 *         a charset name is looked up, but is restored before returning.
 * dst     buffer receiving the decoded, NUL-terminated text.
 * dstlen  size of dst in bytes, including room for the NUL.
 *
 * Returns non-zero if at least one encoded word was decoded, 0 otherwise.
 * When str is NULL, contains no '=' at all, or dst is NULL / dstlen is 0,
 * the function returns 0 WITHOUT writing to dst.  If dst is too small the
 * output is truncated but still NUL-terminated.
 */

int
decode_rfc2047 (char *str, char *dst, size_t dstlen)
{
    char *p, *q, *pp;
    char *dstend;                /* one past the last byte of dst */
    char *startofmime, *endofmime, *endofcharset;
    int c, quoted_printable;
    int encoding_found = 0;      /* did we decode anything?                */
    int between_encodings = 0;   /* are we between two encodings?          */
    int equals_pending = 0;      /* is there a '=' pending?                */
    int whitespace = 0;          /* how much whitespace between encodings? */
#ifdef HAVE_ICONV
    int use_iconv = 0;           /* are we converting encoding with iconv? */
    iconv_t cd = NULL;
    int fromutf8 = 0;
    char *saveq = NULL, *convbuf = NULL;
    size_t savedstlen = 0;
#endif

    /*
     * Without a destination there is nothing we can do; a zero dstlen
     * would wrap the unsigned counter in ADDCHR and disable all bounds
     * checking.
     */
    if (!str || !dst || !dstlen)
        return 0;

    /*
     * Do a quick and dirty check for the '=' character.
     * This should quickly eliminate many cases.
     */
    if (!strchr (str, '='))
        return 0;

    dstend = dst + dstlen;

    for (p = str, q = dst; *p; p++) {

        /* reset iconv */
#ifdef HAVE_ICONV
        if (use_iconv) {
            iconv_close(cd);
            use_iconv = 0;
        }
#endif
        /*
         * If we had an '=' character pending from
         * last iteration, then add it first.
         */
        if (equals_pending) {
            ADDCHR('=');
            equals_pending = 0;
            between_encodings = 0;  /* we have added non-whitespace text */
        }

        if (*p != '=') {
            /* count linear whitespace while between encodings */
            if (between_encodings && is_lws(*p))
                whitespace++;
            else
                between_encodings = 0;  /* we have added non-whitespace text */
            ADDCHR(*p);
            continue;
        }

        equals_pending = 1;  /* we have a '=' pending */

        /* Check for initial =? */
        if (p[1] == '?' && p[2]) {
            startofmime = p + 2;

            /* Scan ahead for the next '?' character */
            for (pp = startofmime; *pp && *pp != '?'; pp++)
                ;

            if (!*pp)
                continue;

            /*
             * RFC 2231 specifies that language information can appear
             * in a charset specification like so:
             *
             *    =?us-ascii*en?Q?Foo?=
             *
             * Right now we don't use language information, so ignore it.
             */
            for (endofcharset = startofmime;
                 endofcharset < pp && *endofcharset != '*';
                 endofcharset++)
                ;

            /* Check if character set can be handled natively */
            if (!check_charset(startofmime, endofcharset - startofmime)) {
#ifdef HAVE_ICONV
                /*
                 * .. it can't.  We'll use iconv then.  The charset name
                 * is NUL-terminated in place for the lookup; put back
                 * whatever was there ('?' or the '*' of a language tag)
                 * so the caller's string is left intact.
                 */
                char savedchar = *endofcharset;

                *endofcharset = '\0';
                cd = iconv_open(get_charset(), startofmime);
                fromutf8 = !strcasecmp(startofmime, "UTF-8");
                *endofcharset = savedchar;
                if (cd == (iconv_t)-1)
                    continue;
                use_iconv = 1;
#else
                continue;
#endif
            }

            startofmime = pp + 1;

            /* Check for valid encoding type */
            if (*startofmime != 'B' && *startofmime != 'b' &&
                *startofmime != 'Q' && *startofmime != 'q')
                continue;

            /* Is encoding quoted printable or base64? */
            quoted_printable = (*startofmime == 'Q' || *startofmime == 'q');
            startofmime++;

            /* Check for next '?' character */
            if (*startofmime != '?')
                continue;
            startofmime++;

            /*
             * Scan ahead for the ending ?=
             *
             * While doing this, we will also check if encoded
             * word has any embedded linear whitespace.
             */
            endofmime = NULL;
            for (pp = startofmime; *pp && *(pp+1); pp++) {
                if (is_lws(*pp)) {
                    break;
                } else if (*pp == '?' && pp[1] == '=') {
                    endofmime = pp;
                    break;
                }
            }
            if (is_lws(*pp) || endofmime == NULL)
                continue;

            /*
             * We've found an encoded word, so we can drop
             * the '=' that was pending
             */
            equals_pending = 0;

            /*
             * If we are between two encoded words separated only by
             * linear whitespace, then we ignore the whitespace.
             * We will roll back the buffer the number of whitespace
             * characters we've seen since last encoded word.
             */
            if (between_encodings) {
                q -= whitespace;
                dstlen += whitespace;
            }

#ifdef HAVE_ICONV
            /*
             * empty encoded text.  This ensures that we don't
             * malloc 0 bytes but skip on to the end
             */
            if (endofmime == startofmime && use_iconv) {
                use_iconv = 0;
                iconv_close(cd);
            }

            /*
             * When converting, decode into a scratch buffer first.  The
             * decoded form is never longer than the encoded text, so
             * the encoded length is a sufficient size.
             */
            if (use_iconv) {
                saveq = q;
                savedstlen = dstlen;
                q = convbuf = (char *) mh_xmalloc(endofmime - startofmime);
            }
#endif

            /* Now decode the text */
            if (quoted_printable) {
                for (pp = startofmime; pp < endofmime; pp++) {
                    if (*pp == '=') {
                        /*
                         * pp[1] and pp[2] are always readable: at worst
                         * they are the "?=" terminator, which is not hex.
                         */
                        c = decode_qp (pp[1], pp[2]);
                        if (c == -1)
                            continue;
                        /*
                         * Embedded NULs are dropped.  Everything else
                         * must go through the bounds-checked macro.
                         */
                        if (c != 0)
                            ADDCHR2(c);
                        pp += 2;
                    } else if (*pp == '_') {
                        ADDCHR2(' ');
                    } else {
                        ADDCHR2(*pp);
                    }
                }
            } else {
                /* base64 */
                int c1, c2, c3, c4;
                c1 = c2 = c3 = c4 = -1;

                pp = startofmime;
                while (pp < endofmime) {
                    /* 6 + 2 bits */
                    while ((pp < endofmime) &&
                           ((c1 = char64(*pp)) == -1)) {
                        pp++;
                    }
                    if (pp < endofmime) {
                        pp++;
                    }
                    while ((pp < endofmime) &&
                           ((c2 = char64(*pp)) == -1)) {
                        pp++;
                    }
                    if (pp < endofmime && c1 != -1 && c2 != -1) {
                        ADDCHR2((c1 << 2) | (c2 >> 4));
                        pp++;
                    }
                    /* 4 + 4 bits */
                    while ((pp < endofmime) &&
                           ((c3 = char64(*pp)) == -1)) {
                        pp++;
                    }
                    if (pp < endofmime && c2 != -1 && c3 != -1) {
                        ADDCHR2(((c2 & 0xF) << 4) | (c3 >> 2));
                        pp++;
                    }
                    /* 2 + 6 bits */
                    while ((pp < endofmime) &&
                           ((c4 = char64(*pp)) == -1)) {
                        pp++;
                    }
                    if (pp < endofmime && c3 != -1 && c4 != -1) {
                        ADDCHR2(((c3 & 0x3) << 6) | (c4));
                        pp++;
                    }
                }
            }

#ifdef HAVE_ICONV
        iconvbuffull:
            /*
             * NB that the string at convbuf is not necessarily NUL
             * terminated here: q points to the first byte after the
             * valid part.
             */
            /* Convert to native character set */
            if (use_iconv) {
                size_t inbytes = q - convbuf;
                ICONV_CONST char *start = convbuf;

                while (inbytes) {
                    if (iconv(cd, &start, &inbytes, &saveq, &savedstlen) ==
                            (size_t)-1) {
                        if (errno != EILSEQ)
                            break;
                        /*
                         * character couldn't be converted. we output a `?'
                         * and try to carry on which won't work if
                         * either encoding was stateful
                         */
                        iconv (cd, 0, 0, &saveq, &savedstlen);
                        if (!savedstlen)
                            break;
                        *saveq++ = '?';
                        savedstlen--;
                        if (!savedstlen)
                            break;
                        /* skip to next input character */
                        if (fromutf8) {
                            for (++start, --inbytes;
                                 start < q && (*start & 192) == 128;
                                 ++start, --inbytes)
                                continue;
                        } else
                            start++, inbytes--;
                        if (start >= q)
                            break;
                    }
                }

                /* The scratch buffer is finished with on every path. */
                free(convbuf);
                convbuf = NULL;
                q = saveq;

                /*
                 * Stop now if (1) we hit the end of the buffer trying to do
                 * MIME decoding and have just iconv-converted a partial
                 * string or (2) our iconv-conversion hit the end of the
                 * buffer.
                 */
                if (!dstlen || !savedstlen)
                    goto buffull;
                dstlen = savedstlen;
            } else if (!dstlen) {
                /*
                 * Decoded straight into dst and filled it: we must stop
                 * here, otherwise the loop would carry on writing with
                 * an exhausted (and soon wrapped) counter.
                 */
                goto buffull;
            }
#endif

            /*
             * Now that we are done decoding this particular
             * encoded word, advance string to trailing '='.
             */
            p = endofmime + 1;

            encoding_found = 1;     /* we found (at least 1) encoded word */
            between_encodings = 1;  /* we have just decoded something */
            whitespace = 0;         /* re-initialize amount of whitespace */
        }
    }
#ifdef HAVE_ICONV
    if (use_iconv)
        iconv_close(cd);
#endif

    /* If an equals was pending at end of string, add it now. */
    if (equals_pending)
        ADDCHR('=');
    *q = '\0';

    return encoding_found;

buffull:
#ifdef HAVE_ICONV
    /* Don't leak the conversion descriptor when bailing out mid-word. */
    if (use_iconv)
        iconv_close(cd);
#endif
    /*
     * q is normally just off the end of the buffer here, in which case we
     * rewind one byte to NUL terminate.  After an iconv conversion it may
     * still be inside the buffer; then nothing needs to be dropped.
     */
    if (q >= dstend)
        q = dstend - 1;
    *q = '\0';
    return encoding_found;
}