From: Ken Hornstein <kenh@pobox.com>
Date: Sat, 2 Feb 2013 05:33:51 +0000 (-0500)
Subject: Rework multibyte test so it will calculate the width of all of the
X-Git-Url: https://diplodocus.org/git/nmh/commitdiff_plain/eee44c467bf31f198c73a51cab2f7e9d5a2ad47c?ds=inline;hp=49e9f226f20711b8504584f32e0bb8323739fab8

Rework multibyte test so it will calculate the width of all of the
Unicode characters we are using.
---

diff --git a/test/getcwidth.c b/test/getcwidth.c
index f2a2ac69..59ae33e3 100644
--- a/test/getcwidth.c
+++ b/test/getcwidth.c
@@ -1,7 +1,7 @@
 /*
- * getcwidth - Get the OS's idea of the width of a combining character
+ * getcwidth - Get the OS's idea of the width of Unicode codepoints
  *
- * This code is Copyright (c) 2012, by the authors of nmh.  See the
+ * This code is Copyright (c) 2013, by the authors of nmh.  See the
  * COPYRIGHT file in the root directory of the nmh distribution for
  * complete copyright information.
  */
@@ -20,91 +20,127 @@
 #include <wchar.h>
 #endif
 
+#ifdef MULTIBYTE_SUPPORT
 static void usage(char *);
 static void dumpwidth(void);
+static void getwidth(const char *);
+#endif /* MULTIBYTE_SUPPORT */
 
 int
 main(int argc, char *argv[])
 {
+#ifndef MULTIBYTE_SUPPORT
+	fprintf(stderr, "Nmh was not configured with multibyte support\n");
+	exit(1);
+#else /* MULTIBYTE_SUPPORT */
 	wchar_t c;
-	int charlen;
-	char *p;
-
-	/*
-	 * This is the UTF-8 for "n" + U+0308 (Combining Diaeresis)
-	 */
-
-	unsigned char string[] = "n\xcc\x88";
+	int i;
 
 	setlocale(LC_ALL, "");
 
-	if (argc > 2)
+	if (argc < 2)
 		usage(argv[0]);
 
-	if (argc == 2) {
-		if (strcmp(argv[1], "--dump") == 0) {
+	if (strcmp(argv[1], "--dump") == 0) {
+		if (argc == 2) {
 			dumpwidth();
 			exit(0);
 		} else {
-			usage(argv[0]);
+			fprintf(stderr, "--dump cannot be combined with "
+				"other arguments\n");
+			exit(1);
 		}
 	}
 
-#ifndef MULTIBYTE_SUPPORT
-	fprintf(stderr, "Nmh was not configured with multibyte support\n");
-	exit(1);
-#else /* MULTIBYTE_SUPPORT */
 	/*
-	 * It's not clear to me that we can just call mbtowc() with a
-	 * combining character; just to be safe, feed it in a base
-	 * character first.
+	 * Process each argument.  If it begins with "U+", then try to
+	 * convert it to a Unicode codepoint.  Otherwise, take each
+	 * string and get the total width
 	 */
 
-	mbtowc(NULL, NULL, 0);
-
-	charlen = mbtowc(&c, (char *) string, strlen((char *) string));
-
-	if (charlen != 1) {
-		fprintf(stderr, "We expected a beginning character length "
-			"of 1, got %d instead\n", charlen);
-		exit(1);
-	}
-
-	p = (char *) (string + charlen);
-
-	charlen = mbtowc(&c, p, strlen(p));
-
-	if (charlen != 2) {
-		fprintf(stderr, "We expected a multibyte character length "
-			"of 2, got %d instead\n", charlen);
-		fprintf(stderr, "Are you using a UTF-8 locale?\n");
-		exit(1);
+	for (i = 1; i < argc; i++) {
+		if (strncmp(argv[i], "U+", 2) == 0) {
+			/*
+			 * We're making a big assumption here that
+			 * wchar_t represents a Unicode codepoint.
+			 * That technically isn't valid unless the
+			 * C compiler defines __STDC_ISO_10646__, but
+			 * we're going to assume now that it works.
+			 */
+			errno = 0;
+			c = strtoul(argv[i] + 2, NULL, 16);
+			if (errno) {
+				fprintf(stderr, "Codepoint %s invalid\n",
+					argv[i]);
+				continue;
+			}
+			printf("%d\n", wcwidth(c));
+		} else {
+			getwidth(argv[i]);
+		}
 	}
 
-	printf("%d\n", wcwidth(c));
-
 	exit(0);
-#endif /* MULTIBYTE_SUPPORT */
 }
 
 static void
 usage(char *argv0)
 {
 	fprintf(stderr, "Usage: %s [--dump]\n", argv0);
-	fprintf(stderr, "Returns the column width of a UTF-8 combining "
-		"multibyte character\n");
+	fprintf(stderr, "       %s U+XXXX [...]\n", argv0);
+	fprintf(stderr, "       %s utf-8-sequence [...]\n", argv0);
+	fprintf(stderr, "Returns the column width of a Unicode codepoint "
+		"or UTF-8 character sequence\n");
 	fprintf(stderr, "\t--dump\tDump complete width table\n");
 
 	exit(1);
 }
 
+static void
+getwidth(const char *string)
+{
+	wchar_t c;
+	int charlen, charleft = strlen(string);
+	int length = 0;
+
+	/*
+	 * In theory we should be able to use wcswidth(), but since we're
+	 * testing out how the format libraries behave we'll do it a character
+	 * at a time.
+	 */
+
+	mbtowc(NULL, NULL, 0);
+
+	while (charleft > 0) {
+		int clen;
+
+		charlen = mbtowc(&c, string, charleft);
+
+		if (charlen == 0)
+			break;
+
+		if (charlen < 0) {
+			fprintf(stderr, "Unable to convert string \"%s\"\n",
+				string);
+			return;
+		}
+
+		if ((clen = wcwidth(c)) < 0) {
+			fprintf(stderr, "U+%04X non-printable\n", c);
+			return;
+		}
+
+		length += clen;
+		string += charlen;
+		charleft -= charlen;
+	}
+
+	printf("%d\n", length);
+}
+
 static void
 dumpwidth(void)
 {
-#ifndef MULTIBYTE_SUPPORT
-	fprintf(stderr, "Nmh was not configured with multibyte support\n");
-	exit(1);
-#else /* MULTIBYTE_SUPPORT */
 	wchar_t wc, low;
 	int width, lastwidth;
 
@@ -120,5 +156,5 @@ dumpwidth(void)
 	width = wcwidth(wc - 1);
 	if (width == lastwidth)
 		printf("%04X - %04X = %d\n", low, wc - 1, width);
-#endif /* MULTIBYTE_SUPPORT */
 }
+#endif /* MULTIBYTE_SUPPORT */
diff --git a/test/scan/test-scan-multibyte b/test/scan/test-scan-multibyte
index 3acd1ca1..92900a90 100755
--- a/test/scan/test-scan-multibyte
+++ b/test/scan/test-scan-multibyte
@@ -47,7 +47,7 @@ Subject: =?utf-8?q?Sp=C4=B1n=CC=88al_Tap_=E2=86=92_Tap_into_America!?=
 Things are looking great!
 EOF
 
-width=`${MH_OBJ_DIR}/test/getcwidth`
+width=`${MH_OBJ_DIR}/test/getcwidth "ï¬ânÌ"`
 if test $? -ne 0; then
     echo "getcwidth failed to run"
     exit 1
@@ -56,28 +56,21 @@ fi
 expected="$MH_TEST_DIR/$$.expected"
 actual="$MH_TEST_DIR/$$.actual"
 
-if test "$width" -eq 1; then
+if test "$width" -eq 4; then
 cat > "$expected" <<EOF
   11  03/02 David ï¬ Hubbins    SpÄ±nÌal Tap â Tap into America!<<Things are look
 EOF
-elif test "$width" -eq 0; then
+elif test "$width" -eq 3; then
 cat > "$expected" <<EOF
   11  03/02 David ï¬ Hubbins    SpÄ±nÌal Tap â Tap into America!<<Things are looki
 EOF
 else
-    echo "Unsupported width for combining diaeresis: $width"
+    echo "Unsupported width for UTF-8 test string: $width"
     exit 1
 fi
 
 scan -width 80 +inbox 11 > $actual || exit 1
-set -x
-oldfailed="${failed:-0}"
 check "$expected" "$actual"
-if test "$oldfailed" -ne "${failed:-0}"; then
-	echo "Complete UTF-8 width table for BMP"
-	${MH_OBJ_DIR}/test/getcwidth --dump
-fi
-set +x
 
 #
 # Check decoding with an invalid multibyte sequence.  We skip this test