found, only count it as taking up one character in the
destination buffer (if it has enough room). This way,
scan(1)'s output won't be jagged if there are any subjects,
for example, that have multibyte characters.
Added a test for this to test-scan-multibyte, and moved the
test for an invalid multibyte sequence from test-scan to it.
* no more than n bytes are copied
*/
static void
* no more than n bytes are copied
*/
static void
-cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) {
+cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n,
+ size_t max) {
int remaining; /* remaining output width available */
int c, ljust;
int end; /* number of input bytes remaining in str */
int remaining; /* remaining output width available */
int c, ljust;
int end; /* number of input bytes remaining in str */
#endif
char *sp; /* current position in source string */
char *cp = *dest; /* current position in destination string */
#endif
char *sp; /* current position in source string */
char *cp = *dest; /* current position in destination string */
- char *ep = cp + n; /* end of destination buffer */
+ char *ep = cp + n; /* end of destination buffer based on desired width */
+ char *epmax = cp + max; /* true end of destination buffer */
int prevCtrl = 1;
/* get alignment */
int prevCtrl = 1;
/* get alignment */
while (*sp && remaining > 0 && end > 0) {
#ifdef MULTIBYTE_SUPPORT
char_len = mbtowc(&wide_char, sp, end);
while (*sp && remaining > 0 && end > 0) {
#ifdef MULTIBYTE_SUPPORT
char_len = mbtowc(&wide_char, sp, end);
+
+ /* Account for multibyte characters taking only one character's
+ width of output. */
+ if (char_len > 1 && epmax - ep >= char_len - 1) {
+ ep += char_len - 1;
+ }
+
if (char_len <= 0 || (cp + char_len > ep))
break;
if (char_len <= 0 || (cp + char_len > ep))
break;
-cpstripped (char **dest, char *end, char *str)
+cpstripped (char **dest, char *end, char *max, char *str)
{
int prevCtrl = 1; /* This is 1 so we strip out leading spaces */
int len;
{
int prevCtrl = 1; /* This is 1 so we strip out leading spaces */
int len;
#ifdef MULTIBYTE_SUPPORT
char_len = mbtowc(&wide_char, str, len);
#ifdef MULTIBYTE_SUPPORT
char_len = mbtowc(&wide_char, str, len);
+ /* Account for multibyte characters taking only one character's
+ width of output. */
+ if (char_len > 1 && max - end >= char_len - 1) {
+ end += char_len - 1;
+ }
+
if (char_len <= 0 || *dest + char_len > end)
break;
if (char_len <= 0 || *dest + char_len > end)
break;
switch (fmt->f_type) {
case FT_COMP:
switch (fmt->f_type) {
case FT_COMP:
- cpstripped (&cp, ep, fmt->f_comp->c_text);
+ cpstripped (&cp, ep, scanl + max - 1, fmt->f_comp->c_text);
- cptrimmed (&cp, fmt->f_comp->c_text, fmt->f_width, fmt->f_fill, ep - cp);
+ cptrimmed (&cp, fmt->f_comp->c_text, fmt->f_width, fmt->f_fill,
+ ep - cp, scanl - cp + max - 1);
- cpstripped (&cp, ep, str);
+ cpstripped (&cp, ep, scanl + max - 1, str);
- cptrimmed (&cp, str, fmt->f_width, fmt->f_fill, ep - cp);
+ cptrimmed (&cp, str, fmt->f_width, fmt->f_fill, ep - cp,
+ scanl - cp + max - 1);
break;
case FT_STRLIT:
sp = str;
break;
case FT_STRLIT:
sp = str;
- cpstripped (&cp, ep, lp);
+ cpstripped (&cp, ep, scanl + max - 1, lp);
scan -form scan.highlighted -width 80 >"$actual" || exit 1
check "$expected" "$actual"
scan -form scan.highlighted -width 80 >"$actual" || exit 1
check "$expected" "$actual"
-# check decoding with an invalid multibyte sequence.
-cat >"$expected" <<EOF
- 11 12/31 Test11 2013 New Year?s Deals! Start the year right
-EOF
-
-cat >`mhpath new` <<EOF
-From: Test11 <test11@example.com>
-To: Some User <user@example.com>
-Date: Mon, 31 Dec 2012 00:00:00
-Message-Id: 11@test.nmh
-Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?=
- =?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?=
-
-This message has an encoded Subject with an invalid character for the
-ISO-8859-1 character set.
-EOF
-LC_CTYPE=ISO-8859-1 MM_CHARSET=ISO-8859-1 scan -width 75 last >"$actual"
-check "$expected" "$actual"
-
Things are looking great!
EOF
Things are looking great!
EOF
-expected=$MH_TEST_DIR/$$.expected
-actual=$MH_TEST_DIR/$$.actual
+expected="$MH_TEST_DIR/$$.expected"
+actual="$MH_TEST_DIR/$$.actual"
11 03/02 David st Hubbins Spin̈al Tap → Tap into America!<<Things are
EOF
scan -width 80 +inbox 11 > $actual || exit 1
11 03/02 David st Hubbins Spin̈al Tap → Tap into America!<<Things are
EOF
scan -width 80 +inbox 11 > $actual || exit 1
+check "$expected" "$actual"
+
+# check decoding with an invalid multibyte sequence
+cat >`mhpath new` <<EOF
+From: Test12 <test12@example.com>
+To: Some User <user@example.com>
+Date: Mon, 31 Dec 2012 00:00:00
+Message-Id: 12@test.nmh
+Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?=
+ =?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?=
+
+This message has an encoded Subject with an invalid character for the
+ISO-8859-1 character set, but it (U+2019) is valid UTF-8.
+EOF
+
+cat >"$expected" <<EOF
+ 12 12/31 Test12 2013 New Year?s Deals! Start the year right
+EOF
+LC_CTYPE=ISO-8859-1 MM_CHARSET=ISO-8859-1 scan -width 75 last >"$actual"
check "$expected" "$actual"
check "$expected" "$actual"
+# check scan width with a valid multibyte sequence
+cat >"$expected" <<EOF
+ 12 12/31 Test12 2013 New Year’s Deals! Start the year right
+EOF
+
+LC_CTYPE=en_US.UTF-8 MM_CHARSET=UTF-8 scan -width 75 last >"$actual"
+check "$expected" "$actual"
+
+