diplodocus.org Git - nmh/blob - test/getcwidth.c

   1 /*
   2  * getcwidth - Get the OS's idea of the width of Unicode codepoints
   3  *
   4  * This code is Copyright (c) 2013, by the authors of nmh.  See the
   5  * COPYRIGHT file in the root directory of the nmh distribution for
   6  * complete copyright information.
   7  */
   8
   9 #include <stdio.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <errno.h>
  13
  14 #ifdef HAVE_CONFIG_H
  15 #include <config.h>
  16 #endif
  17
  18 #ifdef MULTIBYTE_SUPPORT
  19 #include <locale.h>
  20 #include <wchar.h>
  21 #include <wctype.h>
  22 #endif
  23
  24 #ifdef MULTIBYTE_SUPPORT
  25 static void usage(char *);
  26 static void dumpwidth(void);
  27 static void dumpctype(void);
  28 static void getwidth(const char *);
  29 #endif /* MULTIBYTE_SUPPORT */
  30
  31 int
  32 main(int argc, char *argv[])
  33 {
  34 #ifndef MULTIBYTE_SUPPORT
  35         (void) argc;
  36         (void) argv;
  37         fprintf(stderr, "Nmh was not configured with multibyte support\n");
  38         exit(1);
  39 #else /* MULTIBYTE_SUPPORT */
  40         wchar_t c;
  41         int i;
  42
  43         if (! setlocale(LC_ALL, "")) {
  44                 fprintf(stderr, "setlocale failed, check your LC_ALL, "
  45                     "LC_CTYPE, and LANG environment variables\n");
  46         }
  47
  48         if (argc < 2)
  49                 usage(argv[0]);
  50
  51         if (strcmp(argv[1], "--dump") == 0) {
  52                 if (argc == 2) {
  53                         dumpwidth();
  54                         exit(0);
  55                 } else {
  56                         fprintf(stderr, "--dump cannot be combined with "
  57                                 "other arguments\n");
  58                         exit(1);
  59                 }
  60         }
  61
  62         if (strcmp(argv[1], "--ctype") == 0) {
  63                 if (argc != 2) {
  64                         fprintf(stderr, "--ctype cannot be combined with other arguments\n");
  65                         exit(1);
  66                 }
  67                 dumpctype();
  68                 exit(0);
  69         }
  70
  71         /*
  72          * Process each argument.  If it begins with "U+", then try to
  73          * convert it to a Unicode codepoint.  Otherwise, take each
  74          * string and get the total width
  75          */
  76
  77         for (i = 1; i < argc; i++) {
  78                 if (strncmp(argv[i], "U+", 2) == 0) {
  79                         /*
  80                          * We're making a big assumption here that
  81                          * wchar_t represents a Unicode codepoint.
  82                          * That technically isn't valid unless the
  83                          * C compiler defines __STDC_ISO_10646__, but
  84                          * we're going to assume now that it works.
  85                          */
  86                         errno = 0;
  87                         c = strtoul(argv[i] + 2, NULL, 16);
  88                         if (errno) {
  89                                 fprintf(stderr, "Codepoint %s invalid\n",
  90                                         argv[i]);
  91                                 continue;
  92                         }
  93                         printf("%d\n", wcwidth(c));
  94                 } else {
  95                         getwidth(argv[i]);
  96                 }
  97         }
  98
  99         exit(0);
 100 }
 101
 102 static void
 103 usage(char *argv0)
 104 {
 105         fprintf(stderr, "Usage: %s [--dump]\n", argv0);
 106         fprintf(stderr, "       %s [--ctype]\n", argv0);
 107         fprintf(stderr, "       %s U+XXXX [...]\n", argv0);
 108         fprintf(stderr, "       %s utf-8-sequence [...]\n", argv0);
 109         fprintf(stderr, "Returns the column width of a Unicode codepoint "
 110                 "or UTF-8 character sequence\n");
 111         fprintf(stderr, "\t--dump\tDump complete width table\n");
 112         fprintf(stderr, "\t--ctype\tPrint wctype(3) table.\n");
 113
 114         exit(1);
 115 }
 116
 117 static void
 118 getwidth(const char *string)
 119 {
 120         wchar_t c;
 121         int charlen, charleft = strlen(string);
 122         int length = 0;
 123
 124         /*
 125          * In theory we should be able to use wcswidth(), but since we're
 126          * testing out how the format libraries behave we'll do it a character
 127          * at a time.
 128          */
 129
 130         if (mbtowc(NULL, NULL, 0)) {}
 131
 132         while (charleft > 0) {
 133                 int clen;
 134
 135                 charlen = mbtowc(&c, string, charleft);
 136
 137                 if (charlen == 0)
 138                         break;
 139
 140                 if (charlen < 0) {
 141                         fprintf(stderr, "Unable to convert string \"%s\"\n",
 142                                 string);
 143                         return;
 144                 }
 145
 146                 if ((clen = wcwidth(c)) < 0) {
 147                         fprintf(stderr, "U+%04lX non-printable\n",
 148                                 (unsigned long int) c);
 149                         return;
 150                 }
 151
 152                 length += clen;
 153                 string += charlen;
 154                 charleft -= charlen;
 155         }
 156
 157         printf("%d\n", length);
 158 }
 159
 160 typedef struct {
 161         wchar_t min, max;
 162 } unicode_range;
 163
 164 static unicode_range range[] = {
 165         /* https://en.wikipedia.org/wiki/Unicode#Code_point_planes_and_blocks */
 166         {  L'\x0000',    L'\xff' },
 167 #if WCHAR_MAX >= 0xffff
 168         {  L'\x0100',  L'\xffff' },
 169 #if WCHAR_MAX >= 0xfffff
 170         { L'\x10000', L'\x14fff' },
 171         { L'\x16000', L'\x18fff' },
 172         { L'\x1b000', L'\x1bfff' },
 173         { L'\x1d000', L'\x1ffff' },
 174         { L'\x20000', L'\x2ffff' },
 175         { L'\xe0000', L'\xe0fff' },
 176 #endif
 177 #endif
 178         { L'\0', L'\0' }, /* Terminates list. */
 179 };
 180
 181 static void
 182 dumpwidth(void)
 183 {
 184         unicode_range *r;
 185         int first;
 186         wchar_t wc, start;
 187         int width, lastwidth;
 188
 189         for (r = range; r->max; r++) {
 190                 first = 1;
 191                 for (wc = r->min; wc <= r->max; wc++) {
 192                         width = wcwidth(wc);
 193                         if (first) {
 194                                 start = wc;
 195                                 lastwidth = width;
 196                                 first = 0;
 197                                 continue;
 198                         }
 199                         if (width != lastwidth) {
 200                                 printf("%04lX - %04lX = %d\n", (unsigned long)start,
 201                                            (unsigned long int)wc - 1, lastwidth);
 202                                 start = wc;
 203                                 lastwidth = width;
 204                         }
 205                         if (wc == r->max) {
 206                                 printf("%04lX - %04lX = %d\n", (unsigned long)start,
 207                                            (unsigned long int)wc, lastwidth);
 208                 /* wchar_t can be a 16-bit unsigned short. */
 209                 break;
 210             }
 211                 }
 212         }
 213 }
 214
 215 static void
 216 dumpctype(void)
 217 {
 218         unicode_range *r;
 219         wchar_t wc;
 220
 221         for (r = range; r->max; r++) {
 222                 for (wc = r->min; wc <= r->max; wc++) {
 223                         printf("%6x  %2d  %c%c%c%c%c%c%c%c%c%c%c%c\n",
 224                                 wc, wcwidth(wc),
 225                                 iswcntrl(wc) ? 'c' : '-',
 226                                 iswprint(wc) ? 'p' : '-',
 227                                 iswgraph(wc) ? 'g' : '-',
 228                                 iswalpha(wc) ? 'a' : '-',
 229                                 iswupper(wc) ? 'u' : '-',
 230                                 iswlower(wc) ? 'l' : '-',
 231                                 iswdigit(wc) ? 'd' : '-',
 232                                 iswxdigit(wc) ? 'x' : '-',
 233                                 iswalnum(wc) ? 'N' : '-',
 234                                 iswpunct(wc) ? '@' : '-',
 235                                 iswspace(wc) ? 's' : '-',
 236                                 iswblank(wc) ? 'b' : '-');
 237
 238             if (wc == r->max)
 239                 /* wchar_t can be a 16-bit unsigned short. */
 240                 break;
 241                 }
 242         }
 243 #endif /* MULTIBYTE_SUPPORT */
 244 }