Rework multibyte test so it will calculate the width of all of the multibyte characters in the test string

author Ken Hornstein <kenh@pobox.com>

Sat, 2 Feb 2013 05:33:51 +0000 (00:33 -0500)

committer Ken Hornstein <kenh@pobox.com>

Sat, 2 Feb 2013 05:33:51 +0000 (00:33 -0500)
author Ken Hornstein <kenh@pobox.com>
Sat, 2 Feb 2013 05:33:51 +0000 (00:33 -0500)
committer Ken Hornstein <kenh@pobox.com>
Sat, 2 Feb 2013 05:33:51 +0000 (00:33 -0500)
diff --git a/test/getcwidth.c b/test/getcwidth.c

index f2a2ac69e5207dfd9375644f402d42f282613f44..59ae33e3cbeab0329c862ebc0fea974028e82430 100644 (file)
--- a/test/getcwidth.c
+++ b/test/getcwidth.c
@@ -1,7 +1,7 @@
  /*
- * getcwidth - Get the OS's idea of the width of a combining character
+ * getcwidth - Get the OS's idea of the width of Unicode codepoints
   *
- * This code is Copyright (c) 2012, by the authors of nmh.  See the
+ * This code is Copyright (c) 2013, by the authors of nmh.  See the
   * COPYRIGHT file in the root directory of the nmh distribution for
   * complete copyright information.
   */
@@ -20,91 +20,127 @@
  #include <wchar.h>
  #endif
  
+#ifdef MULTIBYTE_SUPPORT
  static void usage(char *);
  static void dumpwidth(void);
+static void getwidth(const char *);
+#endif /* MULTIBYTE_SUPPORT */
  
  int
  main(int argc, char *argv[])
  {
+#ifndef MULTIBYTE_SUPPORT
+       fprintf(stderr, "Nmh was not configured with multibyte support\n");
+       exit(1);
+#else /* MULTIBYTE_SUPPORT */
         wchar_t c;
-       int charlen;
-       char *p;
-
-       /*
-        * This is the UTF-8 for "n" + U+0308 (Combining Diaeresis)
-        */
-
-       unsigned char string[] = "n\xcc\x88";
+       int i;
  
         setlocale(LC_ALL, "");
  
-       if (argc > 2)
+       if (argc < 2)
                 usage(argv[0]);
  
-       if (argc == 2) {
-               if (strcmp(argv[1], "--dump") == 0) {
+       if (strcmp(argv[1], "--dump") == 0) {
+               if (argc == 2) {
                         dumpwidth();
                         exit(0);
                 } else {
-                       usage(argv[0]);
+                       fprintf(stderr, "--dump cannot be combined with "
+                               "other arguments\n");
+                       exit(1);
                 }
         }
  
-#ifndef MULTIBYTE_SUPPORT
-       fprintf(stderr, "Nmh was not configured with multibyte support\n");
-       exit(1);
-#else /* MULTIBYTE_SUPPORT */
         /*
-        * It's not clear to me that we can just call mbtowc() with a
-        * combining character; just to be safe, feed it in a base
-        * character first.
+        * Process each argument.  If it begins with "U+", then parse the
+        * hex digits that follow as a Unicode codepoint and print its width.
+        * Otherwise, print the total width of the multibyte string.
          */
  
-       mbtowc(NULL, NULL, 0);
-
-       charlen = mbtowc(&c, (char *) string, strlen((char *) string));
-
-       if (charlen != 1) {
-               fprintf(stderr, "We expected a beginning character length "
-                       "of 1, got %d instead\n", charlen);
-               exit(1);
-       }
-
-       p = (char *) (string + charlen);
-
-       charlen = mbtowc(&c, p, strlen(p));
-
-       if (charlen != 2) {
-               fprintf(stderr, "We expected a multibyte character length "
-                       "of 2, got %d instead\n", charlen);
-               fprintf(stderr, "Are you using a UTF-8 locale?\n");
-               exit(1);
+       for (i = 1; i < argc; i++) {
+               if (strncmp(argv[i], "U+", 2) == 0) {
+                       /*
+                        * We're making a big assumption here that
+                        * wchar_t represents a Unicode codepoint.
+                        * That technically isn't valid unless the
+                        * C compiler defines __STDC_ISO_10646__, but
+                        * we're going to assume for now that it works.
+                        */
+                       errno = 0;
+                       c = strtoul(argv[i] + 2, NULL, 16);
+                       if (errno) {
+                               fprintf(stderr, "Codepoint %s invalid\n",
+                                       argv[i]);
+                               continue;
+                       }
+                       printf("%d\n", wcwidth(c));
+               } else {
+                       getwidth(argv[i]);
+               }
         }
  
-       printf("%d\n", wcwidth(c));
-
         exit(0);
-#endif /* MULTIBYTE_SUPPORT */
  }
  
  static void
  usage(char *argv0)
  {
         fprintf(stderr, "Usage: %s [--dump]\n", argv0);
-       fprintf(stderr, "Returns the column width of a UTF-8 combining "
-               "multibyte character\n");
+       fprintf(stderr, "       %s U+XXXX [...]\n", argv0);
+       fprintf(stderr, "       %s utf-8-sequence [...]\n", argv0);
+       fprintf(stderr, "Returns the column width of a Unicode codepoint "
+               "or UTF-8 character sequence\n");
         fprintf(stderr, "\t--dump\tDump complete width table\n");
  
         exit(1);
  }
  
+static void
+getwidth(const char *string)
+{
+       wchar_t c;
+       int charlen, charleft = strlen(string);
+       int length = 0;
+
+       /*
+        * In theory we should be able to use wcswidth(), but since we're
+        * testing out how the format libraries behave we'll do it a character
+        * at a time.
+        */
+
+       mbtowc(NULL, NULL, 0);
+
+       while (charleft > 0) {
+               int clen;
+
+               charlen = mbtowc(&c, string, charleft);
+
+               if (charlen == 0)
+                       break;
+
+               if (charlen < 0) {
+                       fprintf(stderr, "Unable to convert string \"%s\"\n",
+                               string);
+                       return;
+               }
+
+               if ((clen = wcwidth(c)) < 0) {
+                       fprintf(stderr, "U+%04X non-printable\n", c);
+                       return;
+               }
+
+               length += clen;
+               string += charlen;
+               charleft -= charlen;
+       }
+
+       printf("%d\n", length);
+}
+
  static void
  dumpwidth(void)
  {
-#ifndef MULTIBYTE_SUPPORT
-       fprintf(stderr, "Nmh was not configured with multibyte support\n");
-       exit(1);
-#else /* MULTIBYTE_SUPPORT */
         wchar_t wc, low;
         int width, lastwidth;
  
@@ -120,5 +156,5 @@ dumpwidth(void)
         width = wcwidth(wc - 1);
         if (width == lastwidth)
                 printf("%04X - %04X = %d\n", low, wc - 1, width);
-#endif /* MULTIBYTE_SUPPORT */
  }
+#endif /* MULTIBYTE_SUPPORT */
diff --git a/test/scan/test-scan-multibyte b/test/scan/test-scan-multibyte

index 3acd1ca123cfe78705ee9eb312d18732bb7b0c6a..92900a907169da649eaf2e2859544329e67e5c90 100755 (executable)
--- a/test/scan/test-scan-multibyte
+++ b/test/scan/test-scan-multibyte
@@ -47,7 +47,7 @@ Subject: =?utf-8?q?Sp=C4=B1n=CC=88al_Tap_=E2=86=92_Tap_into_America!?=
  Things are looking great!
  EOF
  
-width=`${MH_OBJ_DIR}/test/getcwidth`
+width=`${MH_OBJ_DIR}/test/getcwidth "ﬆ→n̈"`
  if test $? -ne 0; then
      echo "getcwidth failed to run"
      exit 1
@@ -56,28 +56,21 @@ fi
  expected="$MH_TEST_DIR/$$.expected"
  actual="$MH_TEST_DIR/$$.actual"
  
-if test "$width" -eq 1; then
+if test "$width" -eq 4; then
  cat > "$expected" <<EOF
    11  03/02 David ﬆ Hubbins    Spın̈al Tap → Tap into America!<<Things are look
  EOF
-elif test "$width" -eq 0; then
+elif test "$width" -eq 3; then
  cat > "$expected" <<EOF
    11  03/02 David ﬆ Hubbins    Spın̈al Tap → Tap into America!<<Things are looki
  EOF
  else
-    echo "Unsupported width for combining diaeresis: $width"
+    echo "Unsupported width for UTF-8 test string: $width"
      exit 1
  fi
  
  scan -width 80 +inbox 11 > $actual || exit 1
-set -x
-oldfailed="${failed:-0}"
  check "$expected" "$actual"
-if test "$oldfailed" -ne "${failed:-0}"; then
-       echo "Complete UTF-8 width table for BMP"
-       ${MH_OBJ_DIR}/test/getcwidth --dump
-fi
-set +x
  
  #
  # Check decoding with an invalid multibyte sequence.  We skip this test
author	Ken Hornstein <kenh@pobox.com>
	Sat, 2 Feb 2013 05:33:51 +0000 (00:33 -0500)
committer	Ken Hornstein <kenh@pobox.com>
	Sat, 2 Feb 2013 05:33:51 +0000 (00:33 -0500)
test/getcwidth.c		patch | blob | history
test/scan/test-scan-multibyte		patch | blob | history