]> diplodocus.org Git - nmh/blob - test/scan/test-scan-multibyte
Updates for 1.7 release.
[nmh] / test / scan / test-scan-multibyte
#!/bin/sh
############################################################
#
# Test scan to see if multibyte support (UTF-8 locale) works
#
# Other tests will get the normal ASCII case, so all we care
# about here is UTF-8 encoded headers (RFC 2047).
#
# Note that this file should be edited via a UTF-8 aware
# editor, since UTF-8 characters are in it.
#
############################################################

set -e

# When the caller did not provide MH_OBJ_DIR, derive it from the
# location of this script (two directory levels up) and export it.
if [ -z "${MH_OBJ_DIR}" ]; then
    srcdir="$(dirname "$0")/../.."
    MH_OBJ_DIR="$(cd "$srcdir" && pwd)"
    export MH_OBJ_DIR
fi

# Source the common test support file.
. "$MH_OBJ_DIR/test/common.sh"

setup_test

# Skip the whole test unless MULTIBYTE_ENABLED is 1.
if [ "${MULTIBYTE_ENABLED}" -ne 1 ]; then
    test_skip "configure did not detect multibyte support"
fi

# Candidate spellings of the UTF-8 locale name this test needs.
require_locale en_US.UTF-8 en_US.UTF8 en_US.utf-8 en_US.utf8

#
# Create a test message with RFC 2047 headers we can scan
#
# The Subject header of this message contains an "n" with a Combining
# Diaeresis (U+0308).  There are different interpretations of this character
# with respect to wcwidth() (which is supposed to return the column width of
# a character).  We use a test program to determine what the output width of
# U+0308 is and adjust our test output appropriately.
#
# True Spın̈al Tap fans will note that David st Hubbins was born in Squatney,
# London, England, and thus having his name language-tagged with "cy" is almost
# certainly incorrect.  But in his own words: "Here lies David st Hubbins,
# and why not?".
#
# The second "*" in the To line is just to exercise the parser a bit.
#

# The delimiter is quoted: the message text is literal and needs no expansion.
cat <<'EOF' > "${MH_TEST_DIR}/Mail/inbox/11"
From: David =?utf-8*cy?q?=EF=AC=86?= Hubbins <hubbins@example.com>
To: Sir Denis =?utf-8*?q?Eton=E2=80=93Hogg? <sirdenis@example.com>
Date: Friday, 2 Mar 1984 00:00:00
Subject: =?utf-8?q?Sp=C4=B1n=CC=88al_Tap_=E2=86=92_Tap_into_America!?=

Things are looking great!
EOF

# Ask the getcwidth test program for the display width of the test string
# (a rightwards arrow followed by "n" plus U+0308).  The expected scan
# output chosen later depends on this value.
#
# This script runs under "set -e": a failing command substitution in a bare
# assignment would end the shell before a separate "$?" check could print
# anything.  Handling the failure on the assignment itself keeps the
# diagnostic reachable.
width=$("${MH_OBJ_DIR}/test/getcwidth" "→n̈") || {
    echo "getcwidth failed to run"
    exit 1
}

# Files that hold the expected and the actual scan output; the shell's
# process id keeps the names specific to this run.
expected="$MH_TEST_DIR/$$.expected"
actual="$MH_TEST_DIR/$$.actual"


start_test 'RFC 2047 headers'
# The scan line is cut off one character earlier or later depending on the
# width reported for the test string, so pick the matching expectation.
if [ "$width" -eq 3 ]; then
    cat > "$expected" <<EOF
11 03/02 David st Hubbins Spın̈al Tap → Tap into America!<<Things are looki
EOF
elif [ "$width" -eq 2 ]; then
    cat > "$expected" <<EOF
11 03/02 David st Hubbins Spın̈al Tap → Tap into America!<<Things are lookin
EOF
else
    echo "Unsupported width for UTF-8 test string: $width"
    exit 1
fi

# The redirection target is quoted, as it is everywhere else in this script,
# so a test directory containing whitespace cannot break the redirection.
run_prog scan -width 80 +inbox 11 > "$actual" || exit 1
check "$expected" "$actual"


#
# Check decoding with an invalid multibyte sequence.  We skip this test
# if we don't have iconv support, since it requires converting from one
# character set to another.  Be sure we created the test file, though, because
# it's required for the test right after it.
#

start_test 'invalid multibyte sequence'
# The message path comes from "mhpath new"; it is quoted so that a path
# containing whitespace is still a single redirection target.
#
# The Subject is folded across two lines.  The continuation line must begin
# with whitespace to stay part of the header; the expected output below
# ("... Start the year right") relies on both encoded words being decoded.
cat >"$(mhpath new)" <<EOF
From: Test12 <test12@example.com>
To: Some User <user@example.com>
Date: Mon, 31 Dec 2012 00:00:00
Message-Id: 12@test.nmh
Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?=
 =?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?=

This message has an encoded Subject with an invalid character for
single-byte character sets, but it (U+2019) is valid UTF-8.
EOF

if [ "$ICONV_ENABLED" -eq 1 ]; then
    # In the C locale the apostrophe is expected to show up as "?".
    cat >"$expected" <<EOF
12 12/31 Test12 2013 New Year?s Deals! Start the year right
EOF

    # Don't use run_prog here because it loses the environment setting.
    LC_ALL=C scan -width 74 last >"$actual"
    check "$expected" "$actual"
fi

#
# Find out the width of our Unicode apostrophe (U+2019).  Some implementations
# say it has a width of 2, but that seems totally bizarre to me.
#
# This script runs under "set -e": a failing command substitution in a bare
# assignment would end the shell before a separate "$?" check could print
# anything, so the failure is handled on the assignment itself.
#

width=$("${MH_OBJ_DIR}/test/getcwidth" U+2019) || {
    echo "getcwidth failed to run"
    exit 1
}

# check scan width with a valid multibyte sequence
start_test 'scan width with a valid multibyte sequence'
if [ "$width" -eq 1 ]; then
    cat >"$expected" <<EOF
12 12/31 Test12 2013 New Year’s Deals! Start the year right
EOF
elif [ "$width" -eq 2 ]; then
    cat >"$expected" <<EOF
12 12/31 Test12 2013 New Year’s Deals! Start the year righ
EOF
else
    echo "Unsupported width for U+2019: $width"
    # Stop here, as the earlier width check does.  Falling through would
    # compare the scan output against a stale $expected written by a
    # previous test.
    exit 1
fi

run_prog scan -width 74 last >"$actual"
check "$expected" "$actual"


# This check only runs when ICONV_ENABLED is 1.
if [ "$ICONV_ENABLED" -eq 1 ]; then
    start_test 'encoded single quote'

    # Message 13: its Subject is a base64 encoded word labelled iso-8859-1.
    cat <<'EOF' >"${MH_TEST_DIR}/Mail/inbox/13"
From: <sender@example.com>
Subject: =?iso-8859-1?B?kgo=?=
Date: Mon, 13 Jan 2014 14:18:33 -0600

The Subject: is an encoded single quote, 0x92. cpstripped() didn't
properly count it when decoding, which could be seen with:

scan -format '%(decode{subject})%{body}'

The scan listing was two characters too long.
EOF

    # The scan line we expect for that message at width 80.
    cat <<'EOF' >"$expected"
13 01/13 sender@example.co <<The Subject: is an encoded single quote, 0x92.
EOF

    run_prog scan -width 80 last >"$actual"
    check "$expected" "$actual"
fi


start_test 'insufficient room for multicolumn character'
#### This multibyte character requires 2 columns for display, but
#### only 1 is available.  cpstripped() used to get this wrong.

# The expected output is a single empty line.
cat >"$expected" <<EOF

EOF

# The message path comes from "mhpath new"; it is quoted so that a path
# containing whitespace is still a single redirection target.
cat >"$(mhpath new)" <<EOF
Mime-Version: 1.0

在 Should not see any of this text
EOF

run_prog scan -format '%{body}' -width 1 last >"$actual"
check "$expected" "$actual"


# Wrap up the test run and exit with the value of $failed.
# NOTE(review): $failed is not assigned in this file; presumably it is
# maintained by the sourced test helpers -- confirm.
finish_test
exit ${failed}