#!/bin/sh ############################################################ # # Test scan to see if multibyte support (UTF-8 locale) works # # Other tests will get the normal ASCII case, so all we care # about here is UTF-8 encoded headers (RFC 2047). # # Note that this file should be edited via a UTF-8 aware # editor, since UTF-8 characters are in it. # ############################################################ set -e if test -z "${MH_OBJ_DIR}"; then srcdir=`dirname "$0"`/../.. MH_OBJ_DIR=`cd "$srcdir" && pwd`; export MH_OBJ_DIR fi . "$MH_OBJ_DIR/test/common.sh" setup_test if test "${MULTIBYTE_ENABLED}" -ne 1; then test_skip "configure did not detect multibyte support" fi LC_ALL=en_US.UTF-8; export LC_ALL # # Create a test message with RFC 2047 headers we can scan # # The Subject header in this message contains an "n" with a Combining Diaeresis # (U+0308). There are different interpretations of this character with respect # to wcwidth() (which is supposed to return the column width of a character). # We use a test program to determine what the output width of U+0308 is # and adjust our test output appropriately. # # True Spın̈al Tap fans will note that David st Hubbins was born in Squatney, # London, England, and thus having his name language-tagged with "cy" is almost # certainly incorrect. But in his own words: "Here lies David st Hubbins, # and why not?". # # The second "*" in the To line is just to exercise the parser a bit. # cat > "${MH_TEST_DIR}/Mail/inbox/11" < To: Sir Denis =?utf-8*?q?Eton=E2=80=93Hogg? Date: Friday, 2 Mar 1984 00:00:00 Subject: =?utf-8?q?Sp=C4=B1n=CC=88al_Tap_=E2=86=92_Tap_into_America!?= Things are looking great! EOF width=`${MH_OBJ_DIR}/test/getcwidth "→n̈"` if test $? 
-ne 0; then echo "getcwidth failed to run" exit 1 fi expected="$MH_TEST_DIR/$$.expected" actual="$MH_TEST_DIR/$$.actual" if test "$width" -eq 3; then cat > "$expected" < "$expected" < $actual || exit 1 check "$expected" "$actual" # # Check decoding with an invalid multibyte sequence. We skip this test # if we don't have iconv support, since it requires converting from one # character set to another. Be sure we created the test file, though, because # it's required for the test right after it. # cat >`mhpath new` < To: Some User Date: Mon, 31 Dec 2012 00:00:00 Message-Id: 12@test.nmh Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?= =?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?= This message has an encoded Subject with an invalid character for single-byte character sets, but it (U+2019) is valid UTF-8. EOF if test "$ICONV_ENABLED" -eq 1; then cat >"$expected" <"$actual" check "$expected" "$actual" fi # # Find out the width of our Unicode apostrophe (U+2019). Some implementations # say it has a width of 2, but that seems totally bizarre to me. # width=`${MH_OBJ_DIR}/test/getcwidth U+2019` if test $? -ne 0; then echo "getcwidth failed to run" exit 1 fi # check scan width with a valid multibyte sequence if test "$width" -eq 1; then cat >"$expected" <"$expected" <"$actual" check "$expected" "$actual" if test "$ICONV_ENABLED" -eq 1; then cat >"$expected" <"${MH_TEST_DIR}/Mail/inbox/13" < Subject: =?iso-8859-1?B?kgo=?= Date: Mon, 13 Jan 2014 14:18:33 -0600 The Subject: is an encoded single quote, 0x92. cpstripped() didn't properly count it when decoding, which could be seen with: scan -format '%(decode{subject})%{body}' The scan listing was two characters too long. EOF run_prog scan -width 80 last >"$actual" check "$expected" "$actual" fi exit $failed # NOTE(review): this copy appears to have lost its line breaks and the here-document operators/bodies feeding each "cat >" (the expected scan listings and some message headers are not present) - confirm against the pristine script before running or restyling.