]> diplodocus.org Git - nmh/blob - test/getcwidth.c
Fix invalid pointer arithmetic.
[nmh] / test / getcwidth.c
1 /*
2 * getcwidth - Get the OS's idea of the width of Unicode codepoints
3 *
4 * This code is Copyright (c) 2013, by the authors of nmh. See the
5 * COPYRIGHT file in the root directory of the nmh distribution for
6 * complete copyright information.
7 */
8
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13
14 #ifdef HAVE_CONFIG_H
15 #include <config.h>
16 #endif
17
18 #ifdef MULTIBYTE_SUPPORT
19 #include <locale.h>
20 #include <wchar.h>
21 #include <wctype.h>
22 #endif
23
#ifdef MULTIBYTE_SUPPORT
/*
 * Forward declarations for the file-local helpers, listed in the order
 * in which they are defined below.  They exist only when nmh was built
 * with multibyte support.
 */
static void usage(char *argv0);
static void getwidth(const char *string);
static void dumpwidth(void);
static void dumpctype(void);
#endif /* MULTIBYTE_SUPPORT */
30
31 int
32 main(int argc, char *argv[])
33 {
34 #ifndef MULTIBYTE_SUPPORT
35 (void) argc;
36 (void) argv;
37 fprintf(stderr, "Nmh was not configured with multibyte support\n");
38 exit(1);
39 #else /* MULTIBYTE_SUPPORT */
40 wchar_t c;
41 int i;
42
43 if (! setlocale(LC_ALL, "")) {
44 fprintf(stderr, "setlocale failed, check your LC_ALL, "
45 "LC_CTYPE, and LANG environment variables\n");
46 }
47
48 if (argc < 2)
49 usage(argv[0]);
50
51 if (strcmp(argv[1], "--dump") == 0) {
52 if (argc == 2) {
53 dumpwidth();
54 exit(0);
55 } else {
56 fprintf(stderr, "--dump cannot be combined with "
57 "other arguments\n");
58 exit(1);
59 }
60 }
61
62 if (strcmp(argv[1], "--ctype") == 0) {
63 if (argc != 2) {
64 fprintf(stderr, "--ctype cannot be combined with other arguments\n");
65 exit(1);
66 }
67 dumpctype();
68 exit(0);
69 }
70
71 /*
72 * Process each argument. If it begins with "U+", then try to
73 * convert it to a Unicode codepoint. Otherwise, take each
74 * string and get the total width
75 */
76
77 for (i = 1; i < argc; i++) {
78 if (strncmp(argv[i], "U+", 2) == 0) {
79 /*
80 * We're making a big assumption here that
81 * wchar_t represents a Unicode codepoint.
82 * That technically isn't valid unless the
83 * C compiler defines __STDC_ISO_10646__, but
84 * we're going to assume now that it works.
85 */
86 errno = 0;
87 c = strtoul(argv[i] + 2, NULL, 16);
88 if (errno) {
89 fprintf(stderr, "Codepoint %s invalid\n",
90 argv[i]);
91 continue;
92 }
93 printf("%d\n", wcwidth(c));
94 } else {
95 getwidth(argv[i]);
96 }
97 }
98
99 exit(0);
100 }
101
/*
 * usage - write the command synopsis to standard error and terminate
 * the program with exit status 1.
 *
 * argv0 is the program name substituted into each synopsis line.
 */
static void
usage(char *argv0)
{
    /* One synopsis line per invocation form. */
    fprintf(stderr,
            "Usage: %s [--dump]\n"
            " %s [--ctype]\n"
            " %s U+XXXX [...]\n"
            " %s utf-8-sequence [...]\n",
            argv0, argv0, argv0, argv0);

    /* The remaining text has no substitutions. */
    fputs("Returns the column width of a Unicode codepoint "
          "or UTF-8 character sequence\n", stderr);
    fputs("\t--dump\tDump complete width table\n", stderr);
    fputs("\t--ctype\tPrint wctype(3) table.\n", stderr);

    exit(1);
}
116
/*
 * getwidth - print the total column width of a multibyte string.
 *
 * The string is decoded one character at a time with mbtowc() and the
 * wcwidth() of every character is summed; the sum is printed on
 * standard output.  If a sequence cannot be decoded, or a character has
 * a negative width, a diagnostic goes to standard error and nothing is
 * printed on standard output.
 */
static void
getwidth(const char *string)
{
    const char *cursor = string;
    int remaining = strlen(string);
    int columns = 0;

    /*
     * In theory wcswidth() could do all of this in one call, but the
     * point is to see how the format libraries behave, so walk the
     * string a character at a time.
     */

    /* Put mbtowc() back into its initial shift state; result unused. */
    (void) mbtowc(NULL, NULL, 0);

    while (remaining > 0) {
        wchar_t wc;
        int cols;
        int consumed = mbtowc(&wc, cursor, remaining);

        if (consumed < 0) {
            /* The message shows the part that could not be converted. */
            fprintf(stderr, "Unable to convert string \"%s\"\n",
                    cursor);
            return;
        }

        if (consumed == 0) {
            break;
        }

        cols = wcwidth(wc);
        if (cols < 0) {
            fprintf(stderr, "U+%04lX non-printable\n",
                    (unsigned long int) wc);
            return;
        }

        columns += cols;
        cursor += consumed;
        remaining -= consumed;
    }

    printf("%d\n", columns);
}
159
/*
 * A span of wide-character values; the loops that walk a span visit
 * both endpoints.
 */
typedef struct {
    wchar_t min;    /* first value in the span */
    wchar_t max;    /* last value in the span */
} unicode_range;
163
164 static unicode_range range[] = {
165 /* https://en.wikipedia.org/wiki/Unicode#Code_point_planes_and_blocks */
166 { L'\x0000', L'\xff' },
167 #if WCHAR_MAX >= 0xffff
168 { L'\x0100', L'\xffff' },
169 #if WCHAR_MAX >= 0xfffff
170 { L'\x10000', L'\x14fff' },
171 { L'\x16000', L'\x18fff' },
172 { L'\x1b000', L'\x1bfff' },
173 { L'\x1d000', L'\x1ffff' },
174 { L'\x20000', L'\x2ffff' },
175 { L'\xe0000', L'\xe0fff' },
176 #endif
177 #endif
178 { L'\0', L'\0' }, /* Terminates list. */
179 };
180
181 static void
182 dumpwidth(void)
183 {
184 unicode_range *r;
185 int first;
186 wchar_t wc, start;
187 int width, lastwidth;
188
189 for (r = range; r->max; r++) {
190 first = 1;
191 for (wc = r->min; wc <= r->max; wc++) {
192 width = wcwidth(wc);
193 if (first) {
194 start = wc;
195 lastwidth = width;
196 first = 0;
197 continue;
198 }
199 if (width != lastwidth) {
200 printf("%04lX - %04lX = %d\n", (unsigned long)start,
201 (unsigned long int)wc - 1, lastwidth);
202 start = wc;
203 lastwidth = width;
204 }
205 if (wc == r->max) {
206 printf("%04lX - %04lX = %d\n", (unsigned long)start,
207 (unsigned long int)wc, lastwidth);
208 /* wchar_t can be a 16-bit unsigned short. */
209 break;
210 }
211 }
212 }
213 }
214
215 static void
216 dumpctype(void)
217 {
218 unicode_range *r;
219 wchar_t wc;
220
221 for (r = range; r->max; r++) {
222 for (wc = r->min; wc <= r->max; wc++) {
223 printf("%6x %2d %c%c%c%c%c%c%c%c%c%c%c%c\n",
224 wc, wcwidth(wc),
225 iswcntrl(wc) ? 'c' : '-',
226 iswprint(wc) ? 'p' : '-',
227 iswgraph(wc) ? 'g' : '-',
228 iswalpha(wc) ? 'a' : '-',
229 iswupper(wc) ? 'u' : '-',
230 iswlower(wc) ? 'l' : '-',
231 iswdigit(wc) ? 'd' : '-',
232 iswxdigit(wc) ? 'x' : '-',
233 iswalnum(wc) ? 'N' : '-',
234 iswpunct(wc) ? '@' : '-',
235 iswspace(wc) ? 's' : '-',
236 iswblank(wc) ? 'b' : '-');
237
238 if (wc == r->max)
239 /* wchar_t can be a 16-bit unsigned short. */
240 break;
241 }
242 }
243 #endif /* MULTIBYTE_SUPPORT */
244 }