From: Ken Hornstein <kenh@pobox.com>
Date: Fri, 29 Aug 2014 02:29:21 +0000 (-0400)
Subject: Very very rough cut at trying to parse email addresses with Bison.
X-Git-Url: https://diplodocus.org/git/nmh/commitdiff_plain/60f8bfdaf9d304bdac5bd1d450160c86599b7b45?ds=inline;hp=-c

Very very rough cut at trying to parse email addresses with Bison.
---

60f8bfdaf9d304bdac5bd1d450160c86599b7b45
diff --git a/.gitignore b/.gitignore
index 5b326498..2992ad6d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,9 @@ a.out.dSYM/
 /sbr/*.a
 /sbr/ctype-checked.*
 /sbr/sigmsg.h
+/sbr/addrparse.tab.c
+/sbr/addrparse.tab.h
+/sbr/addrparse.output
 /test/testdir/
 /uip/ali
 /uip/anno
diff --git a/sbr/addrparse.y b/sbr/addrparse.y
new file mode 100644
index 00000000..7121545d
--- /dev/null
+++ b/sbr/addrparse.y
@@ -0,0 +1,211 @@
+%name-prefix "addr"
+
+/*
+ * Comments on these tokens:
+ *
+ * ATEXT is defined in RFC 5322 as:
+ *	ALPHA / DIGIT /
+ *	'!' / '#' / '$' / '%' / '&' / ''' / '*' / '+' / '-' / '/' /
+ *      '=' / '?' / '^' / '_' / '`' / '{' / '|' / '}' / '~'
+ *
+ * All printable ASCII characters except for spaces and specials
+ *
+ * QSTRING is a quoted string, which is printable ASCII characters except
+ * for \ or the quote character surrounded by quotes.  Use \ for quoting
+ * \ and the quote character.
+ *
+ * FWS is folding white space, which is defined as SP (\040), HTAB (\011),
+ * and NL (\012).  Technically CR (\015) is part of that, but traditionally
+ * Unix format files don't have that character.
+ *
+ * COMMENT is a comment string, which is printable ASCII characters except
+ * for '(', ')', and '\'.  Uses same quoting rules as QSTRING.  To make
+ * the grammar slightly less conflict-happy, COMMENT must include any FWS
+ * before or after it (simply have it eaten in the lexer).
+ *
+ * Everything else is a SPECIAL, which is returned directly.  These are
+ * defined in RFC 5322 as:
+ *
+ *	'(' / ')' / '<' / '>' / '[' / ']' / ':' / ';' / '@' / '\' / ',' / '.' /
+ *	'"'
+ *
+ * Technically we don't return all of these; we handle () in comments, " in
+ * quoted string handling, and \ in those handlers.
+ */
+
+%token ATEXT QSTRING FWS COMMENT
+
+%%
+
+/*
+ * A list of addresses; the main entry point to the parser
+ */
+address_list:	/* nothing */ | address	/* first address needs no leading ',' */
+	| address_list ',' address
+	;
+
+/*
+ * A single address; can be a single mailbox, or a group address
+ */
+
+address:
+	mailbox
+	| group
+	;
+
+/*
+ * A traditional single mailbox.  Either in Name <user@name> or just a bare
+ * email address with no angle brackets.
+ */
+
+mailbox:
+	name_addr
+	| addr_spec
+	;
+
+/*
+ * An email address, with the angle brackets.  Optionally contains a display
+ * name in the front.  The RFC says "display-name", but display-name is
+ * defined as a phrase, so we just use that.
+ */
+
+name_addr:
+	phrase angle_addr
+	| angle_addr
+	;
+
+angle_addr:
+	cfws '<' addr_spec '>' cfws
+	| cfws '<' addr_spec '>'
+	| '<' addr_spec '>' cfws
+	| '<' addr_spec '>'
+	;
+
+/*
+ * The group list syntax.  The group list is allowed to be empty or be
+ * spaces, so we define group_list as either being a mailbox list or
+ * just being CFWS.  mailbox_list can be empty, so that can handle the
+ * case of nothing being between the ':' and the ';'
+ */
+group:
+	phrase ':' group_list ';' cfws
+	| phrase ':' group_list ';'
+	| phrase ':' ';'
+	;
+
+group_list:
+	mailbox_list
+	| cfws
+	;
+
+mailbox_list:	/* nothing */ | mailbox	/* first mailbox needs no leading ',' */
+	| mailbox_list ',' mailbox
+	;
+
+addr_spec:
+	local_part '@' domain
+	;
+
+local_part:
+	dot_atom
+	| quoted_string
+	;
+
+domain:
+	dot_atom
+	| domain_literal
+	;
+
+domain_literal:
+	cfws '[' dtext_fws ']' cfws
+	| cfws '[' dtext_fws ']'
+	| '[' dtext_fws ']' cfws
+	| '[' dtext_fws ']'
+	;
+
+/*
+ * It was hard to make a definition of dtext and domain-literal that
+ * exactly matched the RFC.  This was the best I could come up with.
+ */
+ 
+dtext_fws: /* nothing */
+	| FWS ATEXT FWS
+	| FWS ATEXT
+	| ATEXT FWS
+	| dtext_fws FWS ATEXT FWS
+	| dtext_fws FWS ATEXT
+	| dtext_fws ATEXT FWS
+	| dtext_fws ATEXT
+	;
+
+phrase:
+	word
+	| phrase word
+	| obs_phrase
+	;
+
+/*
+ * obs-phrase is basically the same as "phrase", but after the first word
+ * you're allowed to have a '.'.  I believe this is correct.
+ */
+
+obs_phrase:
+	word obs_phrase_list
+	;
+
+obs_phrase_list:	/* one or more words and/or '.' in any order, e.g. "Q. Public" */
+	word
+	| '.'
+	| obs_phrase_list word | obs_phrase_list '.'
+	;
+
+word:
+	atom
+	| quoted_string
+	;
+
+/*
+ * This makes sure any comments and white space before/after the quoted string
+ * get eaten.
+ */
+quoted_string:
+	cfws QSTRING cfws
+	| QSTRING cfws
+	| cfws QSTRING
+	| QSTRING
+	;
+
+atom:
+	cfws ATEXT cfws
+	| cfws ATEXT
+	| ATEXT cfws
+	| ATEXT
+	;
+
+/*
+ * Making dot-atom work was a little confusing; I finally handled it by
+ * defining "dot_atom_text" as having two or more ATEXTs separated by
+ * '.', and defining dot_atom as allowing a single atom.
+ */
+dot_atom:
+	atom
+	| cfws dot_atom_text cfws
+	| cfws dot_atom_text
+	| dot_atom_text cfws
+	| dot_atom_text
+	;
+
+dot_atom_text:
+	ATEXT '.' ATEXT
+	| dot_atom_text '.' ATEXT
+	;
+
+/*
+ * As mentioned above, technically the CFWS definition in the RFC allows
+ * FWS before and after the comment.  The lexer is responsible for eating
+ * the FWS before/after comments.
+ */
+cfws:
+	COMMENT
+	| FWS
+	;