From: David Levine Date: Fri, 4 Jan 2013 03:59:44 +0000 (-0600) Subject: In cpstripped() and cptrimmed(), if a multibyte character is X-Git-Url: https://diplodocus.org/git/nmh/commitdiff_plain/71eaed755250fdd6ac66d1139e59a2cc7ad3980e?ds=sidebyside;hp=--cc In cpstripped() and cptrimmed(), if a multibyte character is found, only count it as taking up one character in the destination buffer (if it has enough room). This way, scan(1)'s output won't be jagged if there are any subjects, for example, that have multibyte characters. Added a test for this to test-scan-multibyte, and moved the test for an invalid multibyte sequence from test-scan to it. --- 71eaed755250fdd6ac66d1139e59a2cc7ad3980e diff --git a/sbr/fmt_scan.c b/sbr/fmt_scan.c index ac4d23af..0d9e1e90 100644 --- a/sbr/fmt_scan.c +++ b/sbr/fmt_scan.c @@ -118,7 +118,8 @@ cpnumber(char **dest, int num, unsigned int wid, char fill, size_t n) { * no more than n bytes are copied */ static void -cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) { +cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n, + size_t max) { int remaining; /* remaining output width available */ int c, ljust; int end; /* number of input bytes remaining in str */ @@ -129,7 +130,8 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) { #endif char *sp; /* current position in source string */ char *cp = *dest; /* current position in destination string */ - char *ep = cp + n; /* end of destination buffer */ + char *ep = cp + n; /* end of destination buffer based on desired width */ + char *epmax = cp + max; /* true end of destination buffer */ int prevCtrl = 1; /* get alignment */ @@ -144,6 +146,13 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) { while (*sp && remaining > 0 && end > 0) { #ifdef MULTIBYTE_SUPPORT char_len = mbtowc(&wide_char, sp, end); + + /* Account for multibyte characters taking only one character's + width of output. 
*/ + if (char_len > 1 && epmax - ep >= char_len - 1) { + ep += char_len - 1; + } + if (char_len <= 0 || (cp + char_len > ep)) break; @@ -208,7 +217,7 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) { } static void -cpstripped (char **dest, char *end, char *str) +cpstripped (char **dest, char *end, char *max, char *str) { int prevCtrl = 1; /* This is 1 so we strip out leading spaces */ int len; @@ -235,6 +244,12 @@ cpstripped (char **dest, char *end, char *str) #ifdef MULTIBYTE_SUPPORT char_len = mbtowc(&wide_char, str, len); + /* Account for multibyte characters taking only one character's + width of output. */ + if (char_len > 1 && max - end >= char_len - 1) { + end += char_len - 1; + } + if (char_len <= 0 || *dest + char_len > end) break; @@ -373,10 +388,11 @@ fmt_scan (struct format *format, char *scanl, size_t max, int width, int *dat) switch (fmt->f_type) { case FT_COMP: - cpstripped (&cp, ep, fmt->f_comp->c_text); + cpstripped (&cp, ep, scanl + max - 1, fmt->f_comp->c_text); break; case FT_COMPF: - cptrimmed (&cp, fmt->f_comp->c_text, fmt->f_width, fmt->f_fill, ep - cp); + cptrimmed (&cp, fmt->f_comp->c_text, fmt->f_width, fmt->f_fill, + ep - cp, scanl - cp + max - 1); break; case FT_LIT: @@ -399,10 +415,11 @@ fmt_scan (struct format *format, char *scanl, size_t max, int width, int *dat) break; case FT_STR: - cpstripped (&cp, ep, str); + cpstripped (&cp, ep, scanl + max - 1, str); break; case FT_STRF: - cptrimmed (&cp, str, fmt->f_width, fmt->f_fill, ep - cp); + cptrimmed (&cp, str, fmt->f_width, fmt->f_fill, ep - cp, + scanl - cp + max - 1); break; case FT_STRLIT: sp = str; @@ -926,7 +943,7 @@ fmt_scan (struct format *format, char *scanl, size_t max, int width, int *dat) *cp++ = ' '; } } - cpstripped (&cp, ep, lp); + cpstripped (&cp, ep, scanl + max - 1, lp); } break; diff --git a/test/scan/test-scan b/test/scan/test-scan index 7fd4b7bb..632c54ba 100755 --- a/test/scan/test-scan +++ b/test/scan/test-scan @@ -53,24 +53,5 @@ mark 
-sequence unseen 10 scan -form scan.highlighted -width 80 >"$actual" || exit 1 check "$expected" "$actual" -# check decoding with an invalid multibyte sequence. -cat >"$expected" <`mhpath new` < -To: Some User -Date: Mon, 31 Dec 2012 00:00:00 -Message-Id: 11@test.nmh -Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?= - =?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?= - -This message has an encoded Subject with an invalid character for the -ISO-8859-1 character set. -EOF -LC_CTYPE=ISO-8859-1 MM_CHARSET=ISO-8859-1 scan -width 75 last >"$actual" -check "$expected" "$actual" - exit $failed diff --git a/test/scan/test-scan-multibyte b/test/scan/test-scan-multibyte index ce4ec695..e877cf2d 100755 --- a/test/scan/test-scan-multibyte +++ b/test/scan/test-scan-multibyte @@ -41,15 +41,43 @@ Subject: =?utf-8?q?Spin=CC=88al_Tap_=E2=86=92_Tap_into_America!?= Things are looking great! EOF -expected=$MH_TEST_DIR/$$.expected -actual=$MH_TEST_DIR/$$.actual +expected="$MH_TEST_DIR/$$.expected" +actual="$MH_TEST_DIR/$$.actual" -cat > $expected < "$expected" < $actual || exit 1 +check "$expected" "$actual" + +# check decoding with an invalid multibyte sequence +cat >`mhpath new` < +To: Some User +Date: Mon, 31 Dec 2012 00:00:00 +Message-Id: 12@test.nmh +Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?= + =?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?= + +This message has an encoded Subject with an invalid character for the +ISO-8859-1 character set, but it (U+2019) is valid UTF-8. +EOF + +cat >"$expected" <"$actual" check "$expected" "$actual" +# check scan width with a valid multibyte sequence +cat >"$expected" <"$actual" +check "$expected" "$actual" + + exit $failed