diplodocus.org Git - nmh/blob - test/getcwidth.c

   1 /*
   2  * getcwidth - Get the OS's idea of the width of Unicode codepoints
   3  *
   4  * This code is Copyright (c) 2013, by the authors of nmh.  See the
   5  * COPYRIGHT file in the root directory of the nmh distribution for
   6  * complete copyright information.
   7  */
   8
   9 #include <stdio.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <errno.h>
  13
  14 #ifdef HAVE_CONFIG_H
  15 #include <config.h>
  16 #endif
  17
  18 #ifdef MULTIBYTE_SUPPORT
  19 #include <locale.h>
  20 #include <wchar.h>
  21 #include <wctype.h>
  22 #endif
  23
  24 #ifdef MULTIBYTE_SUPPORT
  25 static void usage(char *);
  26 static void dumpwidth(void);
  27 static void dumpctype(void);
  28 static void getwidth(const char *);
  29 #endif /* MULTIBYTE_SUPPORT */
  30
  31 int
  32 main(int argc, char *argv[])
  33 {
  34 #ifndef MULTIBYTE_SUPPORT
  35         (void) argc;
  36         (void) argv;
  37         fprintf(stderr, "Nmh was not configured with multibyte support\n");
  38         exit(1);
  39 #else /* MULTIBYTE_SUPPORT */
  40         wchar_t c;
  41         int i;
  42
  43         if (! setlocale(LC_ALL, "")) {
  44                 fprintf(stderr, "setlocale failed, check your LC_ALL, "
  45                     "LC_CTYPE, and LANG environment variables\n");
  46         }
  47
  48         if (argc < 2)
  49                 usage(argv[0]);
  50
  51         if (strcmp(argv[1], "--dump") == 0) {
  52                 if (argc == 2) {
  53                         dumpwidth();
  54                         exit(0);
  55                 } else {
  56                         fprintf(stderr, "--dump cannot be combined with "
  57                                 "other arguments\n");
  58                         exit(1);
  59                 }
  60         }
  61
  62         if (strcmp(argv[1], "--ctype") == 0) {
  63                 if (argc != 2) {
  64                         fprintf(stderr, "--ctype cannot be combined with other arguments\n");
  65                         exit(1);
  66                 }
  67                 dumpctype();
  68                 exit(0);
  69         }
  70
  71         /*
  72          * Process each argument.  If it begins with "U+", then try to
  73          * convert it to a Unicode codepoint.  Otherwise, take each
  74          * string and get the total width
  75          */
  76
  77         for (i = 1; i < argc; i++) {
  78                 if (strncmp(argv[i], "U+", 2) == 0) {
  79                         /*
  80                          * We're making a big assumption here that
  81                          * wchar_t represents a Unicode codepoint.
  82                          * That technically isn't valid unless the
  83                          * C compiler defines __STDC_ISO_10646__, but
  84                          * we're going to assume now that it works.
  85                          */
  86                         errno = 0;
  87                         c = strtoul(argv[i] + 2, NULL, 16);
  88                         if (errno) {
  89                                 fprintf(stderr, "Codepoint %s invalid\n",
  90                                         argv[i]);
  91                                 continue;
  92                         }
  93                         printf("%d\n", wcwidth(c));
  94                 } else {
  95                         getwidth(argv[i]);
  96                 }
  97         }
  98
  99         exit(0);
 100 }
 101
 102 static void
 103 usage(char *argv0)
 104 {
 105         fprintf(stderr, "Usage: %s [--dump]\n", argv0);
 106         fprintf(stderr, "       %s U+XXXX [...]\n", argv0);
 107         fprintf(stderr, "       %s utf-8-sequence [...]\n", argv0);
 108         fprintf(stderr, "Returns the column width of a Unicode codepoint "
 109                 "or UTF-8 character sequence\n");
 110         fprintf(stderr, "\t--dump\tDump complete width table\n");
 111
 112         exit(1);
 113 }
 114
 115 static void
 116 getwidth(const char *string)
 117 {
 118         wchar_t c;
 119         int charlen, charleft = strlen(string);
 120         int length = 0;
 121
 122         /*
 123          * In theory we should be able to use wcswidth(), but since we're
 124          * testing out how the format libraries behave we'll do it a character
 125          * at a time.
 126          */
 127
 128         if (mbtowc(NULL, NULL, 0)) {}
 129
 130         while (charleft > 0) {
 131                 int clen;
 132
 133                 charlen = mbtowc(&c, string, charleft);
 134
 135                 if (charlen == 0)
 136                         break;
 137
 138                 if (charlen < 0) {
 139                         fprintf(stderr, "Unable to convert string \"%s\"\n",
 140                                 string);
 141                         return;
 142                 }
 143
 144                 if ((clen = wcwidth(c)) < 0) {
 145                         fprintf(stderr, "U+%04lX non-printable\n",
 146                                 (unsigned long int) c);
 147                         return;
 148                 }
 149
 150                 length += clen;
 151                 string += charlen;
 152                 charleft -= charlen;
 153         }
 154
 155         printf("%d\n", length);
 156 }
 157
 158 typedef struct {
 159         wchar_t min, max;
 160 } unicode_range;
 161
 162 static unicode_range range[] = {
 163         /* https://en.wikipedia.org/wiki/Unicode#Code_point_planes_and_blocks */
 164         {  L'\x0000',  L'\xffff' },
 165         { L'\x10000', L'\x14fff' },
 166         { L'\x16000', L'\x18fff' },
 167         { L'\x1b000', L'\x1bfff' },
 168         { L'\x1d000', L'\x1ffff' },
 169         { L'\x20000', L'\x2ffff' },
 170         { L'\xe0000', L'\xe0fff' },
 171         { L'\0', L'\0' }, /* Terminates list. */
 172 };
 173
 174 static void
 175 dumpwidth(void)
 176 {
 177         unicode_range *r;
 178         int first;
 179         wchar_t wc, start;
 180         int width, lastwidth;
 181
 182         for (r = range; r->max; r++) {
 183                 first = 1;
 184                 for (wc = r->min; wc <= r->max; wc++) {
 185                         width = wcwidth(wc);
 186                         if (first) {
 187                                 start = wc;
 188                                 lastwidth = width;
 189                                 first = 0;
 190                                 continue;
 191                         }
 192                         if (width != lastwidth) {
 193                                 printf("%04lX - %04lX = %d\n", (unsigned long)start,
 194                                            (unsigned long int)wc - 1, lastwidth);
 195                                 start = wc;
 196                                 lastwidth = width;
 197                         }
 198                         if (wc == r->max)
 199                                 printf("%04lX - %04lX = %d\n", (unsigned long)start,
 200                                            (unsigned long int)wc, lastwidth);
 201                 }
 202         }
 203 }
 204
 205 static void
 206 dumpctype(void)
 207 {
 208         unicode_range *r;
 209         wchar_t wc;
 210
 211         for (r = range; r->max; r++) {
 212                 for (wc = r->min; wc <= r->max; wc++) {
 213                         printf("%6x  %2d  %c%c%c%c%c%c%c%c%c%c%c%c\n",
 214                                 wc, wcwidth(wc),
 215                                 iswcntrl(wc) ? 'c' : '-',
 216                                 iswprint(wc) ? 'p' : '-',
 217                                 iswgraph(wc) ? 'g' : '-',
 218                                 iswalpha(wc) ? 'a' : '-',
 219                                 iswupper(wc) ? 'u' : '-',
 220                                 iswlower(wc) ? 'l' : '-',
 221                                 iswdigit(wc) ? 'd' : '-',
 222                                 iswxdigit(wc) ? 'x' : '-',
 223                                 iswalnum(wc) ? 'N' : '-',
 224                                 iswpunct(wc) ? '@' : '-',
 225                                 iswspace(wc) ? 's' : '-',
 226                                 iswblank(wc) ? 'b' : '-');
 227                 }
 228         }
 229 #endif /* MULTIBYTE_SUPPORT */
 230 }