]> diplodocus.org Git - nmh/blob - sbr/icalendar.l
mhlsbr.c: Don't strchr(3) non-string NUL-less buffer.
[nmh] / sbr / icalendar.l
1 /* icalendar.l -- icalendar (RFC 5545) scanner
2 *
3 * This code is Copyright (c) 2014, by the authors of nmh. See the
4 * COPYRIGHT file in the root directory of the nmh distribution for
5 * complete copyright information.
6 */
7
8 /* See porting notes at end of this file. */
9
10 %{
11 #include "h/mh.h"
12 #include "h/icalendar.h"
13 #include "sbr/icalparse.h"
14 #include "sbr/base64.h"
15
16 static char *unfold (char *, size_t *);
17 static void destroy_icallex(void);
18 %}
19
20 /*
21 * These flex options aren't used:
22 * 8bit not needed
23 * case-insensitive not needed
24 * align not used because this isn't performance critical
25 */
26 %option outfile="lex.yy.c" prefix="ical"
27 %option perf-report warn
28 %option never-interactive noinput noyywrap
29
30 /*
31 * From RFC 5545 § 3.1.
32 */
33 name {iana-token}|{x-name}
34 iana-token ({ALPHA}|{DIGIT}|-)+
35 x-name X-({vendorid}-)?({ALPHA}|{DIGIT}|-)+
36 vendorid ({ALPHA}|{DIGIT}){3,}
37 param-name {iana-token}|{x-name}
38 param-value {paramtext}|{quoted-string}
39 paramtext {SAFE-CHAR}*
40 value {VALUE-CHAR}*
41 quoted-string {DQUOTE}{QSAFE-CHAR}*{DQUOTE}
42 QSAFE-CHAR {WSP}|[\x21\x23-\x7E]|{NON-US-ASCII}
43 SAFE-CHAR {WSP}|[\x21\x23-\x2B\x2D-\x39\x3C-\x7E]|{NON-US-ASCII}
44 VALUE-CHAR {WSP}|[\x21-\x7E]|{NON-US-ASCII}
/* The following is a short-cut definition that admits more
   than the UNICODE characters permitted by RFC 5545. */
47 NON-US-ASCII [\x80-\xF8]{2,4}
48 /* The following excludes HTAB, unlike {CTL}. */
49 CONTROL [\x00-\x08\x0A-\x1F\x7F]
50 EQUAL =
51 /* Solaris lex requires that the , be escaped. */
52 COMMA \,
53 /*
54 * From RFC 5545 § 2.1.
55 */
56 COLON :
57 SEMICOLON ;
58
59 /*
60 * From RFC 5545 § 3.3.11.
61 */
62 text ({TSAFE-CHAR}|:|{DQUOTE}|{ESCAPED-CHAR})*
63 ESCAPED-CHAR \\\\|\\;|\\,|\\N|\\n
TSAFE-CHAR           {WSP}|[\x21\x23-\x2B\x2D-\x39\x3C-\x5B\x5D-\x7E]|{NON-US-ASCII}
65
66 /*
67 * Core rules (definitions) from RFC 5234 Appendix B.1.
68 */
69 ALPHA [\x41-\x5A\x61-\x7A]
70 BIT [01]
71 CHAR [\x01-\x7F]
72 CR \x0D
73 /* Variance from RFC 5234: the {CR} is required in
74 CRLF, but it is optional below to support Unix
75 filesystem convention. */
76 CRLF ({CR}?{LF})+
77 CTL [\x00-\x1F\x7F]
78 DIGIT [\x30-\x39]
79 DQUOTE \x22
80 HEXDIG {DIGIT}|[A-F]
81 HTAB \x09
82 LF \x0A
83 LWSP ({WSP}|({CRLF}{WSP}))*
84 OCTET [\x00-\xFF]
85 SP \x20
86 VCHAR [\x21-\x7E]
87 WSP {SP}|{HTAB}
88
89 /*
90 * Our definitions.
91 */
92 fold {CRLF}{WSP}
93 folded-name {name}({fold}+{iana-token})+
94 folded-param-name {param-name}({fold}+{iana-token})+
/* A quoted string may be folded at more than one position. */
folded-quoted-string {DQUOTE}{QSAFE-CHAR}*({fold}+{QSAFE-CHAR}*)+{DQUOTE}
96 folded-param-value {paramtext}({fold}{paramtext}*)+|{folded-quoted-string}
97 folded-value {VALUE-CHAR}*({fold}{VALUE-CHAR}*)+
98
99 %s s_name s_colon s_value s_semicolon s_param_name s_equal s_comma
100
101 %%
102
<INITIAL>{CRLF} {
    /* Newlines seen before the start of a contentline are discarded;
       no token is returned for them. */
}
107
<INITIAL>{folded-name} {
    /* A name that was split across lines.  Under flex 2.5.4 icalleng
       is an int rather than a size_t, so unfold() is handed a size_t
       copy and the result is stored back afterwards. */
    size_t unfolded_len = icalleng;

    unfold (icaltext, &unfolded_len);
    icalleng = unfolded_len;

    /* Stack op: yy_push_state (s_name).  Imaginary stack: s_name */
    BEGIN (s_name);
    icallval = strdup (icaltext);
    return ICAL_NAME;
}
121
<INITIAL>{name} {
    /* An unfolded name: hand the parser its own copy of the text. */
    /* Stack op: yy_push_state (s_name).  Imaginary stack: s_name */
    BEGIN (s_name);
    icallval = strdup (icaltext);
    return ICAL_NAME;
}
129
<s_name>{COLON} {
    /* Stack ops: yy_pop_state () leaves INITIAL, then
       yy_push_state (s_colon).  Imaginary stack: s_colon */
    BEGIN (s_colon);
    /* A single character isn't worth a strdup; point at the
       scanner's text instead. */
    icallval = icaltext;
    return ICAL_COLON;
}
139
<s_colon>{folded-value} {
    /* A value that was split across lines.  Under flex 2.5.4 icalleng
       is an int rather than a size_t, so unfold() is handed a size_t
       copy and the result is stored back afterwards. */
    size_t unfolded_len = icalleng;

    unfold (icaltext, &unfolded_len);
    icalleng = unfolded_len;

    /* Stack ops: yy_pop_state () leaves INITIAL, then
       yy_push_state (s_value).  Imaginary stack: s_value */
    BEGIN (s_value);
    icallval = strdup (icaltext);
    return ICAL_VALUE;
}
154
<s_colon>{value} {
    /* An unfolded value: hand the parser its own copy of the text. */
    /* Stack ops: yy_pop_state () leaves INITIAL, then
       yy_push_state (s_value).  Imaginary stack: s_value */
    BEGIN (s_value);
    icallval = strdup (icaltext);
    return ICAL_VALUE;
}
163
<s_name>{SEMICOLON} {
    /* Stack op: yy_push_state (s_semicolon).
       Imaginary stack: s_name, s_semicolon */
    BEGIN (s_semicolon);
    /* A single character isn't worth a strdup; point at the
       scanner's text instead. */
    icallval = icaltext;
    return ICAL_SEMICOLON;
}
172
<s_semicolon>{folded-param-name} {
    /* A parameter name that was split across lines.  Under flex 2.5.4
       icalleng is an int rather than a size_t, so unfold() is handed a
       size_t copy and the result is stored back afterwards. */
    size_t unfolded_len = icalleng;

    unfold (icaltext, &unfolded_len);
    icalleng = unfolded_len;

    /* Stack ops: yy_pop_state () leaves s_name, then
       yy_push_state (s_param_name).
       Imaginary stack: s_name, s_param_name */
    BEGIN (s_param_name);
    icallval = strdup (icaltext);
    return ICAL_PARAM_NAME;
}
187
<s_semicolon>{param-name} {
    /* An unfolded parameter name: hand the parser its own copy. */
    /* Stack ops: yy_pop_state () leaves s_name, then
       yy_push_state (s_param_name).
       Imaginary stack: s_name, s_param_name */
    BEGIN (s_param_name);
    icallval = strdup (icaltext);
    return ICAL_PARAM_NAME;
}
196
<s_param_name>{EQUAL} {
    /* Stack ops: yy_pop_state () leaves s_name, then
       yy_push_state (s_equal).  Imaginary stack: s_name, s_equal */
    BEGIN (s_equal);
    /* A single character isn't worth a strdup; point at the
       scanner's text instead. */
    icallval = icaltext;
    return ICAL_EQUAL;
}
206
<s_equal,s_comma>{folded-param-value} {
    /* A parameter value that was split across lines.  Under flex 2.5.4
       icalleng is an int rather than a size_t, so unfold() is handed a
       size_t copy and the result is stored back afterwards. */
    size_t unfolded_len = icalleng;

    unfold (icaltext, &unfolded_len);
    icalleng = unfolded_len;

    /* Stack op: yy_pop_state ().  Imaginary stack: s_name */
    BEGIN (s_name);
    icallval = strdup (icaltext);
    return ICAL_PARAM_VALUE;
}
220
<s_equal,s_comma>{param-value} {
    /* An unfolded parameter value: hand the parser its own copy. */
    /* Stack op: yy_pop_state ().  Imaginary stack: s_name */
    BEGIN (s_name);
    icallval = strdup (icaltext);
    return ICAL_PARAM_VALUE;
}
228
<s_name>{COMMA} {
    /* Stack op: yy_push_state (s_comma).
       Imaginary stack: s_name, s_comma */
    BEGIN (s_comma);
    /* A single character isn't worth a strdup; point at the
       scanner's text instead. */
    icallval = icaltext;
    return ICAL_COMMA;
}
237
<s_value>{CRLF} {
    /* Newlines are only turned into a token in this start condition,
       which ensures that they all appear where they are expected. */
    /* Stack op: yy_pop_state ().  Imaginary stack: INITIAL */
    BEGIN (INITIAL);
    icallval = icaltext;
    return ICAL_CRLF;
}
246
<s_colon>{CRLF} {
    /* A newline directly after the colon means the value is empty. */
    icallval = strdup ("");
    /* Stack ops: yy_pop_state () leaves INITIAL, then
       yy_push_state (s_value).  Imaginary stack: s_value */
    BEGIN (s_value);
    /* Return a newline to the input so that it is scanned again, this
       time in the start condition that handles it. */
    unput ('\n');
    return ICAL_VALUE;
}
258
. {
    /* By default, flex will just pass unmatched text.  Catch it
       instead, by appending it to the "unexpected" buffer of the most
       recent contentline.  The text is dropped if there is no
       contentline to attach it to. */
    /* NOTE(review): vevents.last is presumably NULL until the parser
       has created the first event, e.g. when the input starts with an
       unexpected character -- verify against the parser.  The check
       below is harmless either way. */
    contentline *clines = vevents.last ? vevents.last->contentlines : NULL;
    contentline *cline = clines ? clines->last : NULL;

    if (cline) {
        if (cline->unexpected == NULL) {
            cline->unexpected = charstring_create (0);
        }
        charstring_append_cstring (cline->unexpected, icaltext);
    }
}
271
<INITIAL><<EOF>> {
    /* End of input while in the INITIAL start condition: release the
       scanner's memory and stop.  The unqualified <<EOF>> rule deals
       with end of input in every other start condition. */
    destroy_icallex ();
    yyterminate ();
}
278
<<EOF>> {
    /* The input ended right after a token, without a final newline.
       That does not conform to RFC 5545 § 3.1, which requires every
       contentline to end with a CRLF.  Assume the token ended a
       contentline and be liberal in what we accept: fake the newline
       here, and switch to INITIAL so that the next call terminates. */
    BEGIN (INITIAL);
    return ICAL_CRLF;
}
288
289 %%
290
/*
 * Remove, in place, every occurrence of eol (eol_len bytes long)
 * that is immediately followed by a SP or HTAB from the first *leng
 * bytes of text.  Removal is repeated until no such sequence remains,
 * so a sequence that only comes into being because an earlier one
 * was removed is squashed as well.  On return text is
 * null-terminated and *leng holds its new length.
 *
 * This is a single pass that copies each byte at most once.  After
 * each byte is copied, only a sequence ending at that byte can be
 * new, so checking the tail of the output is sufficient.
 */
static void
squash_folds (char *text, size_t *leng, const char *eol, size_t eol_len) {
    const size_t fold_len = eol_len + 1;
    size_t in;
    size_t out = 0;

    for (in = 0; in < *leng; ++in) {
        const char c = text[in];

        /* out never exceeds in, so this never overwrites unread input. */
        text[out++] = c;
        if ((c == ' ' || c == '\t') && out >= fold_len &&
            memcmp (text + out - fold_len, eol, eol_len) == 0) {
            /* The output now ends with a fold sequence: drop it. */
            out -= fold_len;
        }
    }

    text[out] = '\0';
    *leng = out;
}

/*
 * Unfold text in place by removing its fold sequences: first every
 * CR-LF-WSP, then every LF-WSP, where WSP is a SP or HTAB.
 *
 * text must be a writable, null-terminated string and *leng its
 * length.  *leng is updated to the length of the unfolded string.
 * It's legal to shorten text and modify leng because we don't use
 * yymore().
 *
 * Returns text.
 */
static char *
unfold (char *text, size_t *leng) {
    /* First squash any CR-LF-WSP sequences. */
    squash_folds (text, leng, "\r\n", 2);

    /* Then squash any LF-WSP sequences. */
    squash_folds (text, leng, "\n", 1);

    return text;
}
315
316
/*
 * Free the scanner's memory by calling the destroy function that
 * modern versions of flex generate.  Older versions of flex don't
 * provide it, and nothing at all happens if the scanner was built
 * with something other than flex.
 */
static void
destroy_icallex (void) {
#if defined (FLEX_SCANNER) && defined (YY_FLEX_SUBMINOR_VERSION)
    /* Hack: YY_FLEX_SUBMINOR_VERSION was added to flex (flex.skl
       v. 2.163) after yylex_destroy() was, so its presence implies
       that the destroy function exists. */
    icallex_destroy ();
#endif /* FLEX_SCANNER && YY_FLEX_SUBMINOR_VERSION */
}
332
333 /*
334 * See comment in h/icalendar.h about having to provide these
335 * because flex 2.5.4 doesn't.
336 */
337 void
338 icalset_inputfile (FILE *file) {
339 yyin = file;
340 }
341
342 void
343 icalset_outputfile (FILE *file) {
344 yyout = file;
345 }
346
347 /*
348 * Porting notes
349 * -------------
350 * POSIX lex only supports an entry point name of yylex(). nmh
351 * programs can contain multiple scanners (see sbr/dtimep.l), so
352 * nmh requires the use of flex to build them.
353 * In addition, if there is a need to port this to Solaris lex:
354 * - Use the lex -e or -w option.
355 * - Comment out all of the %options.
356 * - Comment out the <<EOF>> rule.
357 * - The start condition and pattern must be on the same line.
358 * - Comments must be inside rules, not just before them.
 * - Don't use the start condition stack.  In the code above, BEGINs
 *   are used instead, and the contents of an imaginary start condition
 *   stack are shown after each.  The stack operations are also shown
 *   in comments.
363 */