]> diplodocus.org Git - nmh/blob - sbr/icalendar.l
mhbuildsbr.c: Flip logic, moving goto to then-block; no need for else.
[nmh] / sbr / icalendar.l
1 /* icalendar.l -- icalendar (RFC 5545) scanner
2 *
3 * This code is Copyright (c) 2014, by the authors of nmh. See the
4 * COPYRIGHT file in the root directory of the nmh distribution for
5 * complete copyright information.
6 */
7
8 /* See porting notes at end of this file. */
9
10 %{
11 #include "h/mh.h"
12 #include "h/icalendar.h"
13 #include "sbr/icalparse.h"
14
15 static char *unfold (char *, size_t *);
16 static void destroy_icallex(void);
17 %}
18
19 /*
20 * These flex options aren't used:
21 * 8bit not needed
22 * case-insensitive not needed
23 * align not used because this isn't performance critical
24 */
25 %option outfile="lex.yy.c" prefix="ical"
26 %option perf-report warn
27 %option never-interactive noinput noyywrap
28
/*
 * Content line grammar, from RFC 5545 § 3.1.
 */
name                 {iana-token}|{x-name}
iana-token           ({ALPHA}|{DIGIT}|-)+
x-name               X-({vendorid}-)?({ALPHA}|{DIGIT}|-)+
vendorid             ({ALPHA}|{DIGIT}){3,}
param-name           {iana-token}|{x-name}
param-value          {paramtext}|{quoted-string}
paramtext            {SAFE-CHAR}*
value                {VALUE-CHAR}*
quoted-string        {DQUOTE}{QSAFE-CHAR}*{DQUOTE}
QSAFE-CHAR           {WSP}|[\x21\x23-\x7E]|{NON-US-ASCII}
SAFE-CHAR            {WSP}|[\x21\x23-\x2B\x2D-\x39\x3C-\x7E]|{NON-US-ASCII}
VALUE-CHAR           {WSP}|[\x21-\x7E]|{NON-US-ASCII}
                     /* The following is a short-cut definition that admits
                        more than the UNICODE characters permitted by
                        RFC 5545. */
NON-US-ASCII         [\x80-\xF8]{2,4}
                     /* The following excludes HTAB, unlike {CTL}. */
CONTROL              [\x00-\x08\x0A-\x1F\x7F]
EQUAL                =
                     /* Solaris lex requires that the , be escaped. */
COMMA                \,
/*
 * Separators, from RFC 5545 § 2.1.
 */
COLON                :
SEMICOLON            ;
57
/*
 * TEXT value grammar, from RFC 5545 § 3.3.11.
 *
 * TSAFE-CHAR is any character allowed in text without escaping:
 * whitespace, printable ASCII other than DQUOTE, "," ":" ";" and "\",
 * and non-ASCII characters.
 */
text                 ({TSAFE-CHAR}|:|{DQUOTE}|{ESCAPED-CHAR})*
ESCAPED-CHAR         \\\\|\\;|\\,|\\N|\\n
TSAFE-CHAR           {WSP}|[\x21\x23-\x2B\x2D-\x39\x3C-\x5B\x5D-\x7E]|{NON-US-ASCII}
64
/*
 * Core rules (definitions), from RFC 5234 Appendix B.1.
 */
ALPHA                [\x41-\x5A\x61-\x7A]
BIT                  [01]
CHAR                 [\x01-\x7F]
CR                   \x0D
                     /* Variance from RFC 5234:  that RFC requires the
                        {CR} in CRLF.  It is optional here to support
                        the Unix filesystem convention, and a run of
                        consecutive newlines is taken as one CRLF. */
CRLF                 ({CR}?{LF})+
CTL                  [\x00-\x1F\x7F]
DIGIT                [\x30-\x39]
DQUOTE               \x22
HEXDIG               {DIGIT}|[A-F]
HTAB                 \x09
LF                   \x0A
LWSP                 ({WSP}|({CRLF}{WSP}))*
OCTET                [\x00-\xFF]
SP                   \x20
VCHAR                [\x21-\x7E]
WSP                  {SP}|{HTAB}
87
/*
 * Our definitions.
 *
 * A fold is a line break followed by one whitespace character.  Each
 * folded-* pattern is the counterpart of an unfolded pattern that
 * contains at least one fold.
 */
fold                 {CRLF}{WSP}
folded-name          {name}({fold}+{iana-token})+
folded-param-name    {param-name}({fold}+{iana-token})+
folded-quoted-string {DQUOTE}{QSAFE-CHAR}*{fold}+{QSAFE-CHAR}*{DQUOTE}
folded-param-value   {paramtext}({fold}{paramtext}*)+|{folded-quoted-string}
folded-value         {VALUE-CHAR}*({fold}{VALUE-CHAR}*)+
97
98 %s s_name s_colon s_value s_semicolon s_param_name s_equal s_comma
99
100 %%
101
<INITIAL>
{CRLF} {
    /* Newlines ahead of a content line produce no token:  discard them. */
}
106
<INITIAL>
{folded-name} {
    /* unfold() updates the length through a size_t *, but flex 2.5.4
       defines icalleng as an int, so work on a copy and store it back. */
    size_t unfolded_len = icalleng;

    /* unfold() edits icaltext in place and returns it. */
    icallval = strdup (unfold (icaltext, &unfolded_len));
    icalleng = unfolded_len;

    /* yy_push_state (s_name);         * s_name */
    BEGIN (s_name);                    /* s_name */
    return ICAL_NAME;
}
120
<INITIAL>
{name} {
    /* A name with no folds in it:  copy the matched text as the
       token value. */
    icallval = strdup (icaltext);

    /* yy_push_state (s_name);         * s_name */
    BEGIN (s_name);                    /* s_name */
    return ICAL_NAME;
}
128
<s_name>
{COLON} {
    /* The token is a single character, so icallval just points at
       the scanner's text; there is no need to strdup it. */
    icallval = icaltext;

    /* yy_pop_state ();                * INITIAL */
    /* yy_push_state (s_colon);        * s_colon */
    BEGIN (s_colon);                   /* s_colon */
    return ICAL_COLON;
}
138
<s_colon>
{folded-value} {
    /* unfold() updates the length through a size_t *, but flex 2.5.4
       defines icalleng as an int, so work on a copy and store it back. */
    size_t unfolded_len = icalleng;

    /* unfold() edits icaltext in place and returns it. */
    icallval = strdup (unfold (icaltext, &unfolded_len));
    icalleng = unfolded_len;

    /* yy_pop_state ();                * INITIAL */
    /* yy_push_state (s_value);        * s_value */
    BEGIN (s_value);                   /* s_value */
    return ICAL_VALUE;
}
153
<s_colon>
{value} {
    /* A value with no folds in it:  copy the matched text as the
       token value. */
    icallval = strdup (icaltext);

    /* yy_pop_state ();                * INITIAL */
    /* yy_push_state (s_value);        * s_value */
    BEGIN (s_value);                   /* s_value */
    return ICAL_VALUE;
}
162
<s_name>
{SEMICOLON} {
    /* The token is a single character, so icallval just points at
       the scanner's text; there is no need to strdup it. */
    icallval = icaltext;

    /* yy_push_state (s_semicolon);    * s_name, s_semicolon */
    BEGIN (s_semicolon);               /* s_name, s_semicolon */
    return ICAL_SEMICOLON;
}
171
<s_semicolon>
{folded-param-name} {
    /* unfold() updates the length through a size_t *, but flex 2.5.4
       defines icalleng as an int, so work on a copy and store it back. */
    size_t unfolded_len = icalleng;

    /* unfold() edits icaltext in place and returns it. */
    icallval = strdup (unfold (icaltext, &unfolded_len));
    icalleng = unfolded_len;

    /* yy_pop_state ();                * s_name */
    /* yy_push_state (s_param_name);   * s_name, s_param_name */
    BEGIN (s_param_name);              /* s_name, s_param_name */
    return ICAL_PARAM_NAME;
}
186
<s_semicolon>
{param-name} {
    /* A parameter name with no folds in it:  copy the matched text
       as the token value. */
    icallval = strdup (icaltext);

    /* yy_pop_state ();                * s_name */
    /* yy_push_state (s_param_name);   * s_name, s_param_name */
    BEGIN (s_param_name);              /* s_name, s_param_name */
    return ICAL_PARAM_NAME;
}
195
<s_param_name>
{EQUAL} {
    /* The token is a single character, so icallval just points at
       the scanner's text; there is no need to strdup it. */
    icallval = icaltext;

    /* yy_pop_state ();                * s_name */
    /* yy_push_state (s_equal);        * s_name, s_equal */
    BEGIN (s_equal);                   /* s_name, s_equal */
    return ICAL_EQUAL;
}
205
<s_equal,s_comma>
{folded-param-value} {
    /* unfold() updates the length through a size_t *, but flex 2.5.4
       defines icalleng as an int, so work on a copy and store it back. */
    size_t unfolded_len = icalleng;

    /* unfold() edits icaltext in place and returns it. */
    icallval = strdup (unfold (icaltext, &unfolded_len));
    icalleng = unfolded_len;

    /* yy_pop_state ();                * s_name */
    BEGIN (s_name);                    /* s_name */
    return ICAL_PARAM_VALUE;
}
219
<s_equal,s_comma>
{param-value} {
    /* A parameter value with no folds in it:  copy the matched text
       as the token value. */
    icallval = strdup (icaltext);

    /* yy_pop_state ();                * s_name */
    BEGIN (s_name);                    /* s_name */
    return ICAL_PARAM_VALUE;
}
227
<s_name>
{COMMA} {
    /* The token is a single character, so icallval just points at
       the scanner's text; there is no need to strdup it. */
    icallval = icaltext;

    /* yy_push_state (s_comma);        * s_name, s_comma */
    BEGIN (s_comma);                   /* s_name, s_comma */
    return ICAL_COMMA;
}
236
<s_value>
{CRLF} {
    /* This rule is tied to the s_value start condition so that a
       newline is only accepted where one is expected, i.e., after
       a value. */
    icallval = icaltext;

    /* yy_pop_state ();                * INITIAL */
    BEGIN (INITIAL);                   /* INITIAL */
    return ICAL_CRLF;
}
245
<s_colon>
{CRLF} {
    /* A newline straight after the colon:  the value is null, so
       report an empty string for it. */
    icallval = strdup ("");

    /* yy_pop_state ();                * INITIAL */
    /* yy_push_state (s_value);        * s_value */
    BEGIN (s_value);                   /* s_value */

    /* The newline still has to be reported.  Push one back so that
       it gets handled in the proper state, s_value. */
    unput ('\n');
    return ICAL_VALUE;
}
257
. {
    /* By default, flex would just pass unmatched text through.  Catch
       it instead, and collect it in the "unexpected" string of the
       most recent content line, if there is one.  Otherwise the text
       is dropped. */
    /* NOTE(review): vevents.last is dereferenced without a check --
       confirm that callers always set it before scanning starts. */
    contentline *const lines = vevents.last->contentlines;
    contentline *const current = lines ? lines->last : NULL;

    if (current) {
        if (! current->unexpected) {
            /* First unexpected text on this line:  create the string. */
            current->unexpected = charstring_create (0);
        }
        charstring_append_cstring (current->unexpected, icaltext);
    }
}
270
<INITIAL>
<<EOF>> {
    /* End of input between content lines:  clean up the scanner and
       stop.  See next rule for when start state is not INITIAL. */
    destroy_icallex ();
    yyterminate ();
}
277
<<EOF>> {
    /* End of input in any start state other than INITIAL, i.e., there
       is no final newline after a token.  Such input does not conform
       to RFC 5545 § 3.1, which requires that each contentline end with
       a CRLF.  (Assume that the token is at the end of a contentline.)
       Be liberal in what we accept:  fake the newline here, and set
       the start state to INITIAL so that the next call terminates. */
    BEGIN (INITIAL);
    return ICAL_CRLF;
}
287
288 %%
289
/*
 * Remove every occurrence of eol (eol_len characters long) that is
 * immediately followed by a space or horizontal tab from the
 * null-terminated string text, compacting it in place.  Returns the
 * number of characters removed.
 *
 * The string is rebuilt from left to right.  Whenever the characters
 * kept so far end with a fold sequence, that sequence is dropped
 * again.  The result is the same as repeatedly deleting the first
 * fold sequence found, including the case where a deletion brings the
 * pieces of another fold sequence together, but each character is
 * visited only once.
 */
static size_t
squash_folds (char *text, const char *eol, size_t eol_len) {
    const size_t fold_len = eol_len + 1;
    const char *src = text;
    char *dst = text;
    size_t removed = 0;

    while (*src) {
        const char c = *src++;

        *dst++ = c;
        if ((c == ' '  ||  c == '\t')  &&
            (size_t) (dst - text) >= fold_len  &&
            memcmp (dst - fold_len, eol, eol_len) == 0) {
            /* The kept text ends with eol + WSP:  discard the fold. */
            dst -= fold_len;
            removed += fold_len;
        }
    }
    *dst = '\0';

    return removed;
}

/*
 * Unfold text in place by removing each line break that is followed by
 * a space or tab, together with that whitespace character.
 *
 * text is a null-terminated string and *leng its length on entry; the
 * callers in this file pass the scanner's token text and length.  On
 * return *leng has been reduced by the number of characters removed.
 * Returns text.
 *
 * NOTE(review): a fold made of several newlines, which the scanner's
 * CRLF definition admits, only loses its last newline here -- confirm
 * that the remaining ones are meant to stay in the token.
 */
static char *
unfold (char *text, size_t *leng) {
    /* It's legal to shorten text and modify leng (because we don't
       use yymore()). */

    /* First squash any CR-LF-WSP sequences. */
    *leng -= squash_folds (text, "\r\n", 2);

    /* Then squash any LF-WSP sequences. */
    *leng -= squash_folds (text, "\n", 1);

    return text;
}
314
315
/*
 * Clean up the scanner's memory by calling the function that modern
 * versions of flex provide for that.  Older versions don't have it,
 * and a scanner built with something other than flex doesn't either,
 * so in those cases this does nothing.
 */
static void
destroy_icallex (void)
{
#if defined (FLEX_SCANNER)  &&  defined (YY_FLEX_SUBMINOR_VERSION)
    /* Hack:  rely on the fact that the YY_FLEX_SUBMINOR_VERSION
       #define was added to flex (flex.skl v. 2.163) after
       yylex_destroy() was added. */
    icallex_destroy ();
#endif /* FLEX_SCANNER && YY_FLEX_SUBMINOR_VERSION */
}
331
332 /*
333 * See comment in h/icalendar.h about having to provide these
334 * because flex 2.5.4 doesn't.
335 */
336 void
337 icalset_inputfile (FILE *file) {
338 yyin = file;
339 }
340
341 void
342 icalset_outputfile (FILE *file) {
343 yyout = file;
344 }
345
346 /*
347 * Porting notes
348 * -------------
349 * POSIX lex only supports an entry point name of yylex(). nmh
350 * programs can contain multiple scanners (see sbr/dtimep.l), so
351 * nmh requires the use of flex to build them.
352 * In addition, if there is a need to port this to Solaris lex:
353 * - Use the lex -e or -w option.
354 * - Comment out all of the %options.
355 * - Comment out the <<EOF>> rule.
356 * - The start condition and pattern must be on the same line.
357 * - Comments must be inside rules, not just before them.
 * - Don't use the start condition stack.  In the code above, BEGINs
 *   are used instead, and the contents of an imaginary start condition
 *   stack are shown after each.  The stack operations are also shown
 *   in comments.
362 */