From: David Levine <levinedl@acm.org>
Date: Fri, 4 Jan 2013 03:59:44 +0000 (-0600)
Subject: In cpstripped() and cptrimmed(), if a multibyte character is
X-Git-Url: https://diplodocus.org/git/nmh/commitdiff_plain/71eaed755250fdd6ac66d1139e59a2cc7ad3980e?ds=sidebyside;hp=--cc

In cpstripped() and cptrimmed(), if a multibyte character is
found, only count it as taking up one character in the
destination buffer (if it has enough room).  This way,
scan(1)'s output won't be jagged if there are any subjects,
for example, that have multibyte characters.

Added a test for this to test-scan-multibyte, and moved the
test for an invalid multibyte sequence from test-scan to it.
---

71eaed755250fdd6ac66d1139e59a2cc7ad3980e
diff --git a/sbr/fmt_scan.c b/sbr/fmt_scan.c
index ac4d23af..0d9e1e90 100644
--- a/sbr/fmt_scan.c
+++ b/sbr/fmt_scan.c
@@ -118,7 +118,8 @@ cpnumber(char **dest, int num, unsigned int wid, char fill, size_t n) {
  * no more than n bytes are copied
  */
 static void
-cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) {
+cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n,
+	  size_t max) {
     int remaining;     /* remaining output width available */
     int c, ljust;
     int end;           /* number of input bytes remaining in str */
@@ -129,7 +130,8 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) {
 #endif
     char *sp;          /* current position in source string */
     char *cp = *dest;  /* current position in destination string */
-    char *ep = cp + n; /* end of destination buffer */
+    char *ep = cp + n; /* end of destination buffer based on desired width */
+    char *epmax = cp + max; /* true end of destination buffer */
     int prevCtrl = 1;
 
     /* get alignment */
@@ -144,6 +146,13 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) {
 	while (*sp && remaining > 0 && end > 0) {
 #ifdef MULTIBYTE_SUPPORT
 	    char_len = mbtowc(&wide_char, sp, end);
+
+	    /* Account for multibyte characters taking only one character's
+	       width of output. */
+	    if (char_len > 1  &&  epmax - ep >= char_len - 1) {
+		ep += char_len - 1;
+	    }
+
 	    if (char_len <= 0 || (cp + char_len > ep))
 		break;
 
@@ -208,7 +217,7 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) {
 }
 
 static void
-cpstripped (char **dest, char *end, char *str)
+cpstripped (char **dest, char *end, char *max, char *str)
 {
     int prevCtrl = 1;	/* This is 1 so we strip out leading spaces */
     int len;
@@ -235,6 +244,12 @@ cpstripped (char **dest, char *end, char *str)
 #ifdef MULTIBYTE_SUPPORT
     	char_len = mbtowc(&wide_char, str, len);
 
+	/* Account for multibyte characters taking only one character's
+	   width of output. */
+	if (char_len > 1  &&  max - end >= char_len - 1) {
+	    end += char_len - 1;
+	}
+
 	if (char_len <= 0 || *dest + char_len > end)
 	    break;
 
@@ -373,10 +388,11 @@ fmt_scan (struct format *format, char *scanl, size_t max, int width, int *dat)
 	switch (fmt->f_type) {
 
 	case FT_COMP:
-	    cpstripped (&cp, ep, fmt->f_comp->c_text);
+	    cpstripped (&cp, ep, scanl + max - 1, fmt->f_comp->c_text);
 	    break;
 	case FT_COMPF:
-	    cptrimmed (&cp, fmt->f_comp->c_text, fmt->f_width, fmt->f_fill, ep - cp);
+	    cptrimmed (&cp, fmt->f_comp->c_text, fmt->f_width, fmt->f_fill,
+		       ep - cp, scanl - cp + max - 1);
 	    break;
 
 	case FT_LIT:
@@ -399,10 +415,11 @@ fmt_scan (struct format *format, char *scanl, size_t max, int width, int *dat)
 	    break;
 
 	case FT_STR:
-	    cpstripped (&cp, ep, str);
+	    cpstripped (&cp, ep, scanl + max - 1, str);
 	    break;
 	case FT_STRF:
-	    cptrimmed (&cp, str, fmt->f_width, fmt->f_fill, ep - cp);
+	    cptrimmed (&cp, str, fmt->f_width, fmt->f_fill, ep - cp,
+		       scanl - cp + max - 1);
 	    break;
 	case FT_STRLIT:
 	    sp = str;
@@ -926,7 +943,7 @@ fmt_scan (struct format *format, char *scanl, size_t max, int width, int *dat)
 			*cp++ = ' ';
 		}
 	    }
-	    cpstripped (&cp, ep, lp);
+	    cpstripped (&cp, ep, scanl + max - 1, lp);
 	    }
 	    break;
 
diff --git a/test/scan/test-scan b/test/scan/test-scan
index 7fd4b7bb..632c54ba 100755
--- a/test/scan/test-scan
+++ b/test/scan/test-scan
@@ -53,24 +53,5 @@ mark -sequence unseen 10
 scan -form scan.highlighted -width 80 >"$actual" || exit 1
 check "$expected" "$actual"
 
-# check decoding with an invalid multibyte sequence.
-cat >"$expected" <<EOF
-  11  12/31 Test11             2013 New Year?s Deals! Start the year right
-EOF
-
-cat >`mhpath new` <<EOF
-From: Test11 <test11@example.com>
-To: Some User <user@example.com>
-Date: Mon, 31 Dec 2012 00:00:00
-Message-Id: 11@test.nmh
-Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?=
-	=?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?=
-
-This message has an encoded Subject with an invalid character for the
-ISO-8859-1 character set.
-EOF
-LC_CTYPE=ISO-8859-1 MM_CHARSET=ISO-8859-1 scan -width 75 last >"$actual"
-check "$expected" "$actual"
-
 
 exit $failed
diff --git a/test/scan/test-scan-multibyte b/test/scan/test-scan-multibyte
index ce4ec695..e877cf2d 100755
--- a/test/scan/test-scan-multibyte
+++ b/test/scan/test-scan-multibyte
@@ -41,15 +41,43 @@ Subject: =?utf-8?q?Spin=CC=88al_Tap_=E2=86=92_Tap_into_America!?=
 Things are looking great!
 EOF
 
-expected=$MH_TEST_DIR/$$.expected
-actual=$MH_TEST_DIR/$$.actual
+expected="$MH_TEST_DIR/$$.expected"
+actual="$MH_TEST_DIR/$$.actual"
 
-cat > $expected <<EOF
+cat > "$expected" <<EOF
   11  03/02 David ﬆ Hubbins    Spin̈al Tap → Tap into America!<<Things are
 EOF
 
 scan -width 80 +inbox 11 > $actual || exit 1
+check "$expected" "$actual"
+
+# check decoding with an invalid multibyte sequence
+cat >`mhpath new` <<EOF
+From: Test12 <test12@example.com>
+To: Some User <user@example.com>
+Date: Mon, 31 Dec 2012 00:00:00
+Message-Id: 12@test.nmh
+Subject: =?UTF-8?B?MjAxMyBOZXcgWWVhcuKAmXMgRGVhbHMhIFN0YXJ0IHRoZSB5ZWFy?=
+	=?UTF-8?B?IHJpZ2h0IHdpdGggYmlnIHNhdmluZ3M=?=
+
+This message has an encoded Subject with an invalid character for the
+ISO-8859-1 character set, but it (U+2019) is valid UTF-8.
+EOF
+
+cat >"$expected" <<EOF
+  12  12/31 Test12             2013 New Year?s Deals! Start the year right
+EOF
 
+LC_CTYPE=ISO-8859-1 MM_CHARSET=ISO-8859-1 scan -width 75 last >"$actual"
 check "$expected" "$actual"
 
+# check scan width with a valid multibyte sequence
+cat >"$expected" <<EOF
+  12  12/31 Test12             2013 New Year’s Deals! Start the year right
+EOF
+
+LC_CTYPE=en_US.UTF-8 MM_CHARSET=UTF-8 scan -width 75 last >"$actual"
+check "$expected" "$actual"
+
+
 exit $failed