diplodocus.org Git - nmh/blob - test/getcwidth.c

   1 /* getcwidth - Get the OS's idea of the width of Unicode codepoints
   2  *
   3  * This code is Copyright (c) 2013, by the authors of nmh.  See the
   4  * COPYRIGHT file in the root directory of the nmh distribution for
   5  * complete copyright information.
   6  */
   7
   8 #include <stdio.h>
   9 #include <stdlib.h>
  10 #include <string.h>
  11 #include <errno.h>
  12
  13 #ifdef HAVE_CONFIG_H
  14 #include <config.h>
  15 #endif
  16
  17 #ifdef MULTIBYTE_SUPPORT
  18 #include <locale.h>
  19 #include <wchar.h>
  20 #include <wctype.h>
  21 #endif
  22
  23 #ifdef MULTIBYTE_SUPPORT
  24 static void usage(char *);
  25 static void dumpwidth(void);
  26 static void dumpctype(void);
  27 static void getwidth(const char *);
  28 #endif /* MULTIBYTE_SUPPORT */
  29
  30 int
  31 main(int argc, char *argv[])
  32 {
  33 #ifndef MULTIBYTE_SUPPORT
  34         (void) argc;
  35         (void) argv;
  36         fprintf(stderr, "Nmh was not configured with multibyte support\n");
  37         exit(1);
  38 #else /* MULTIBYTE_SUPPORT */
  39         wchar_t c;
  40         int i;
  41
  42         if (! setlocale(LC_ALL, "")) {
  43                 fprintf(stderr, "setlocale failed, check your LC_ALL, "
  44                     "LC_CTYPE, and LANG environment variables\n");
  45         }
  46
  47         if (argc < 2)
  48                 usage(argv[0]);
  49
  50         if (strcmp(argv[1], "--dump") == 0) {
  51                 if (argc == 2) {
  52                         dumpwidth();
  53                         exit(0);
  54                 } else {
  55                         fprintf(stderr, "--dump cannot be combined with "
  56                                 "other arguments\n");
  57                         exit(1);
  58                 }
  59         }
  60
  61         if (strcmp(argv[1], "--ctype") == 0) {
  62                 if (argc != 2) {
  63                         fprintf(stderr, "--ctype cannot be combined with other arguments\n");
  64                         exit(1);
  65                 }
  66                 dumpctype();
  67                 exit(0);
  68         }
  69
  70         /*
  71          * Process each argument.  If it begins with "U+", then try to
  72          * convert it to a Unicode codepoint.  Otherwise, take each
  73          * string and get the total width
  74          */
  75
  76         for (i = 1; i < argc; i++) {
  77                 if (strncmp(argv[i], "U+", 2) == 0) {
  78                         /*
  79                          * We're making a big assumption here that
  80                          * wchar_t represents a Unicode codepoint.
  81                          * That technically isn't valid unless the
  82                          * C compiler defines __STDC_ISO_10646__, but
  83                          * we're going to assume now that it works.
  84                          */
  85                         errno = 0;
  86                         c = strtoul(argv[i] + 2, NULL, 16);
  87                         if (errno) {
  88                                 fprintf(stderr, "Codepoint %s invalid\n",
  89                                         argv[i]);
  90                                 continue;
  91                         }
  92                         printf("%d\n", wcwidth(c));
  93                 } else {
  94                         getwidth(argv[i]);
  95                 }
  96         }
  97
  98         exit(0);
  99 }
 100
 101 static void
 102 usage(char *argv0)
 103 {
 104         fprintf(stderr, "Usage: %s [--dump]\n", argv0);
 105         fprintf(stderr, "       %s [--ctype]\n", argv0);
 106         fprintf(stderr, "       %s U+XXXX [...]\n", argv0);
 107         fprintf(stderr, "       %s utf-8-sequence [...]\n", argv0);
 108         fprintf(stderr, "Returns the column width of a Unicode codepoint "
 109                 "or UTF-8 character sequence\n");
 110         fprintf(stderr, "\t--dump\tDump complete width table\n");
 111         fprintf(stderr, "\t--ctype\tPrint wctype(3) table.\n");
 112
 113         exit(1);
 114 }
 115
 116 static void
 117 getwidth(const char *string)
 118 {
 119         wchar_t c;
 120         int charlen, charleft = strlen(string);
 121         int length = 0;
 122
 123         /*
 124          * In theory we should be able to use wcswidth(), but since we're
 125          * testing out how the format libraries behave we'll do it a character
 126          * at a time.
 127          */
 128
 129         if (mbtowc(NULL, NULL, 0)) {}
 130
 131         while (charleft > 0) {
 132                 int clen;
 133
 134                 charlen = mbtowc(&c, string, charleft);
 135
 136                 if (charlen == 0)
 137                         break;
 138
 139                 if (charlen < 0) {
 140                         fprintf(stderr, "Unable to convert string \"%s\"\n",
 141                                 string);
 142                         return;
 143                 }
 144
 145                 if ((clen = wcwidth(c)) < 0) {
 146                         fprintf(stderr, "U+%04lX non-printable\n",
 147                                 (unsigned long int) c);
 148                         return;
 149                 }
 150
 151                 length += clen;
 152                 string += charlen;
 153                 charleft -= charlen;
 154         }
 155
 156         printf("%d\n", length);
 157 }
 158
 159 typedef struct {
 160         wchar_t min, max;
 161 } unicode_range;
 162
 163 static unicode_range range[] = {
 164         /* https://en.wikipedia.org/wiki/Unicode#Code_point_planes_and_blocks */
 165         {  L'\x0000',    L'\xff' },
 166 #if WCHAR_MAX >= 0xffff
 167         {  L'\x0100',  L'\xffff' },
 168 #if WCHAR_MAX >= 0xfffff
 169         { L'\x10000', L'\x14fff' },
 170         { L'\x16000', L'\x18fff' },
 171         { L'\x1b000', L'\x1bfff' },
 172         { L'\x1d000', L'\x1ffff' },
 173         { L'\x20000', L'\x2ffff' },
 174         { L'\xe0000', L'\xe0fff' },
 175 #endif
 176 #endif
 177         { L'\0', L'\0' }, /* Terminates list. */
 178 };
 179
 180 static void
 181 dumpwidth(void)
 182 {
 183         unicode_range *r;
 184         int first;
 185         wchar_t wc, start;
 186         int width, lastwidth;
 187
 188         for (r = range; r->max; r++) {
 189                 first = 1;
 190                 for (wc = r->min; wc <= r->max; wc++) {
 191                         width = wcwidth(wc);
 192                         if (first) {
 193                                 start = wc;
 194                                 lastwidth = width;
 195                                 first = 0;
 196                                 continue;
 197                         }
 198                         if (width != lastwidth) {
 199                                 printf("%04lX - %04lX = %d\n", (unsigned long)start,
 200                                            (unsigned long int)wc - 1, lastwidth);
 201                                 start = wc;
 202                                 lastwidth = width;
 203                         }
 204                         if (wc == r->max) {
 205                                 printf("%04lX - %04lX = %d\n", (unsigned long)start,
 206                                            (unsigned long int)wc, lastwidth);
 207                 /* wchar_t can be a 16-bit unsigned short. */
 208                 break;
 209             }
 210                 }
 211         }
 212 }
 213
 214 static void
 215 dumpctype(void)
 216 {
 217         unicode_range *r;
 218         wchar_t wc;
 219
 220         for (r = range; r->max; r++) {
 221                 for (wc = r->min; wc <= r->max; wc++) {
 222                         printf("%6x  %2d  %c%c%c%c%c%c%c%c%c%c%c%c\n",
 223                                 wc, wcwidth(wc),
 224                                 iswcntrl(wc) ? 'c' : '-',
 225                                 iswprint(wc) ? 'p' : '-',
 226                                 iswgraph(wc) ? 'g' : '-',
 227                                 iswalpha(wc) ? 'a' : '-',
 228                                 iswupper(wc) ? 'u' : '-',
 229                                 iswlower(wc) ? 'l' : '-',
 230                                 iswdigit(wc) ? 'd' : '-',
 231                                 iswxdigit(wc) ? 'x' : '-',
 232                                 iswalnum(wc) ? 'N' : '-',
 233                                 iswpunct(wc) ? '@' : '-',
 234                                 iswspace(wc) ? 's' : '-',
 235                                 iswblank(wc) ? 'b' : '-');
 236
 237             if (wc == r->max)
 238                 /* wchar_t can be a 16-bit unsigned short. */
 239                 break;
 240                 }
 241         }
 242 #endif /* MULTIBYTE_SUPPORT */
 243 }