1 /*
2  * csshow.c: display a character-set table on a terminal.
3  *
4  * Can display a selected block of Unicode, or the whole of any
5  * single-byte character set for which libcharset knows the
6  * translation.
7  *
8  * Intended mostly for quick-reference use - it might very well be
9  * quicker to type 'csshow U+0400' than to click around for ages in a
10  * browser finding the appropriate Unicode chart. But it also works
11  * well as a test of the specific font you've configured in your
12  * terminal window, of course.
13  *
14  * Possible extra features:
15  *  - configurable row len and table size.
16  *  - option to disambiguate the various classes of failure in the
17  *    output, e.g. if terminfo gives us control sequences to change
18  *    colours then we could colour the missing characters differently
19  *    depending on why they're missing.
20  *     + this mode probably implies that we must also display all
21  *       characters in the range, whether printable or not, because
22  *       the whole point might be to disambiguate the various causes
23  *       of undisplayability. (In particular, don't forget to turn off
24  *       the early exit when nothing in the range is printable at
25  *       all.)
26  *  - ability to display sub-blocks of multibyte encodings such as
27  *    EUCs. But that would need some thought about how to sensibly
28  *    index those tables.
29  */
30 
31 #define _XOPEN_SOURCE 500              /* for wcwidth and snprintf */
32 
33 #include <assert.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <locale.h>
38 #include <ctype.h>
39 
40 #ifndef HAVE_NO_WCWIDTH
41 #include <wchar.h>
42 #endif
43 
44 #ifndef HAVE_NO_WCTYPE
45 #include <wctype.h>
46 #endif
47 
48 #include "charset.h"
49 
/*
 * Usage message, written verbatim by help(). Both the characters and
 * the pointer are const, so the text cannot be modified or repointed
 * by accident.
 */
static const char *const helptext =
    "usage: csshow ( CHARSET | BASE-UNICODE-VALUE )\n"
    " e.g.: csshow Win1252\n"
    "       csshow U+2500\n"
    " also: csshow --help          display this help text\n";
56 
help(FILE * fp)57 static void help(FILE *fp)
58 {
59     fputs(helptext, fp);
60 }
61 
/*
 * Classification of one table cell. The ordering matters: every value
 * that is >= FIRST_PRINTABLE_VALUE denotes a character that will
 * actually be written to the terminal; everything below it is shown as
 * a blank cell.
 */
enum Trans {
    /* Not displayable, for one reason or another. */
    BAD_CHAR_IN_SOURCE_CHARSET,  /* no translation from source charset */
    BAD_CHAR_IN_OUTPUT_CHARSET,  /* cannot be encoded for the output */
    UNPRINTABLE_CHAR,            /* rejected by iswprint() / wcwidth() */

    /* Displayable, distinguished by how many columns they occupy. */
    COMBINING_CHAR,              /* zero columns wide */
    WIDE_PRINTABLE_CHAR,         /* two columns wide */
    NORMAL_PRINTABLE_CHAR,       /* one column wide */

    /* Alias marking the lowest displayable value. */
    FIRST_PRINTABLE_VALUE = COMBINING_CHAR
};
71 struct translated_char {
72     enum Trans type;
73     char buf[7]; /* maximum even theoretical UTF-8 code length, plus NUL */
74 };
75 
main(int argc,char ** argv)76 int main(int argc, char **argv)
77 {
78     int doing_opts = 1;
79     int source_charset = CS_ASCII, output_charset = CS_NONE;
80     unsigned long base = 0, size = 0x100, rowlen = 0x10;
81 
82     while (--argc > 0) {
83         const char *p = *++argv;
84         if (*p == '-' && doing_opts) {
85             if (!strcmp(p, "--")) {
86                 doing_opts = 0;
87             } else if (!strcmp(p, "--help")) {
88                 help(stdout);
89                 return 0;
90             } else {
91                 fprintf(stderr, "csshow: unrecognised option '%s'\n", p);
92                 return 1;
93             }
94         } else {
95             int cs;
96 
97             if (toupper((unsigned char)p[0]) == 'U' &&
98                 (p[1] == '-' || p[1] == '+')) {
99                 source_charset = CS_NONE; /* means just translate Unicode */
100                 base = strtoul(p+2, NULL, 16);
101             } else if ((cs = charset_from_localenc(p)) != CS_NONE) {
102                 if (!charset_is_single_byte(cs)) {
103                     fprintf(stderr, "csshow: cannot display multibyte"
104                             " charset %s\n", charset_to_localenc(cs));
105                     return 1;
106                 }
107                 source_charset = cs;
108                 base = 0;
109             } else {
110                 fprintf(stderr, "csshow: unrecognised argument '%s'\n", p);
111                 return 1;
112             }
113         }
114     }
115 
116 #ifndef HAVE_NO_WCTYPE
117     setlocale(LC_CTYPE, "");
118 #endif
119 
120     if (output_charset == CS_NONE)
121         output_charset = charset_from_locale();
122 
123     {
124         struct translated_char *trans;
125         const char *rowheadfmt;
126         int rowheadwidth, colwidth;
127         int printed_a_line, skipped_a_line;
128         unsigned long i, j;
129 
130         trans = malloc(size * sizeof(struct translated_char));
131         if (!trans) {
132             fprintf(stderr, "csshow: out of memory\n");
133             return 1;
134         }
135 
136         /*
137          * Initial loop figuring out what characters we have in our
138          * block, and in what way each of them is weird.
139          */
140         for (i = 0; i < size; i++) {
141             unsigned long charcode = base + i;
142             wchar_t wc;
143 
144             trans[i].buf[0] = '\0';
145 
146             if (source_charset == CS_NONE) {
147                 wc = charcode;
148             } else {
149                 char c = charcode;
150                 const char *cp = &c;
151                 int clen = 1;
152                 int error = 0;
153 
154                 int ret = charset_to_unicode(
155                     &cp, &clen, &wc, 1, source_charset, NULL, L"", 0);
156                 if (ret != 1) {
157                     trans[i].type = BAD_CHAR_IN_SOURCE_CHARSET;
158                     continue;
159                 }
160             }
161 
162             {
163                 const wchar_t *wcp = &wc;
164                 int wclen = 1;
165                 int error = 0;
166 
167                 int ret = charset_from_unicode(
168                     &wcp, &wclen, trans[i].buf, sizeof(trans[i].buf) - 1,
169                     output_charset, NULL, &error);
170 
171                 assert(ret < sizeof(trans[i].buf));
172                 trans[i].buf[ret] = '\0';
173 
174                 if (wclen != 0 || ret == 0 || error) {
175                     trans[i].type = BAD_CHAR_IN_OUTPUT_CHARSET;
176                     trans[i].buf[0] = '\0';
177                     continue;
178                 }
179             }
180 
181             /*
182              * OK, we have a Unicode character and a corresponding
183              * UTF-8 sequence. But it might still be something we have
184              * to take care over printing.
185              */
186 #ifndef HAVE_NO_WCTYPE
187             if (!iswprint(wc)) {
188                 trans[i].type = UNPRINTABLE_CHAR;
189                 trans[i].buf[0] = '\0';
190                 continue;
191             }
192 #endif
193             {
194 #ifndef HAVE_NO_WCWIDTH
195                 int width = wcwidth(wc);
196 #else
197                 int width = 1;
198 #endif
199 
200                 switch (width) {
201                   case 0:
202                     trans[i].type = COMBINING_CHAR;
203                     break;
204                   case 1:
205                     trans[i].type = NORMAL_PRINTABLE_CHAR;
206                     break;
207                   case 2:
208                     trans[i].type = WIDE_PRINTABLE_CHAR;
209                     break;
210                   default:
211                     /* If we somehow had wcwidth but not wctype, weird
212                      * returns from wcwidth give us a second way to
213                      * identify non-printable control characters. */
214                     trans[i].type = UNPRINTABLE_CHAR;
215                     trans[i].buf[0] = '\0';
216                     break;
217                 }
218             }
219         }
220 
221         /*
222          * Special case: if _nothing_ in our range is printable, we'll
223          * just terminate now.
224          */
225         for (i = 0; i < size; i++)
226             if (trans[i].type >= FIRST_PRINTABLE_VALUE)
227                 break;
228         if (i == size) {
229             fprintf(stderr, "csshow: nothing printable at all in this"
230                     " character range\n");
231             return 1;
232         }
233 
234         /*
235          * Now we can figure out whether there are any wide
236          * characters, in which case we should space out our table a
237          * bit more. (We might also have to do that if rowlen is
238          * large.)
239          */
240         {
241             char testbuf[64];
242             colwidth = snprintf(testbuf, sizeof(testbuf),
243                                 "%-2x", (unsigned)(rowlen-1));
244         }
245         if (colwidth < 3) {
246             for (i = 0; i < size; i++)
247                 if (trans[i].type == WIDE_PRINTABLE_CHAR)
248                     colwidth = 3;
249         }
250 
251         /*
252          * Work out the width of the heading column on the left.
253          */
254         if (source_charset == CS_NONE) {
255             rowheadfmt = "U+%-6.4X";
256         } else {
257             rowheadfmt = "%-4.2X";
258         }
259         {
260             char testbuf[64];
261             rowheadwidth = snprintf(testbuf, sizeof(testbuf),
262                                     rowheadfmt, (unsigned)(base + (size-1)));
263         }
264 
265         /* Heading line. */
266         printf("%*s", rowheadwidth, "");
267         for (i = 0; i < rowlen; i++)
268             printf("%-*X", colwidth, (unsigned)i);
269         printf("\n");
270 
271         printed_a_line = skipped_a_line = 0;
272 
273         for (j = 0; j < size; j += rowlen) {
274             /* See if we're skipping this row completely. */
275             int skip = 1;
276             for (i = 0; i < rowlen && j+i < size; i++)
277                 if (trans[j+i].type >= FIRST_PRINTABLE_VALUE)
278                     skip = 0;
279             if (skip) {
280                 skipped_a_line = 1;
281                 continue;
282             }
283 
284             /* We're printing this line, but we might need to print a
285              * blank line to indicate a previous skipped one. But we
286              * don't do that at the very start or end - we only
287              * indicate a skipped line between two that _were_
288              * printed. */
289             if (skipped_a_line && printed_a_line) {
290                 printf("\n");
291             }
292             skipped_a_line = 0;
293 
294             printed_a_line = 1;
295             printf(rowheadfmt, (unsigned)(base + j));;
296             for (i = 0; i < rowlen && j+i < size; i++) {
297                 int chars_left = colwidth;
298                 struct translated_char *chr = &trans[j+i];
299 
300                 switch (chr->type) {
301                   case COMBINING_CHAR:
302                     /* Print a space first, for the combining char to
303                      * safely combine with. */
304                     printf(" %s", chr->buf);
305                     chars_left--;
306                     break;
307                   case WIDE_PRINTABLE_CHAR:
308                     fputs(chr->buf, stdout);
309                     chars_left -= 2;
310                     break;
311                   case NORMAL_PRINTABLE_CHAR:
312                     fputs(chr->buf, stdout);
313                     chars_left--;
314                     break;
315                   default:
316                     /* Unprintable for one reason or another. */
317                     break;
318                 }
319 
320                 if (i+1 < rowlen && j+i+1 < size)
321                     printf("%*s", chars_left, "");
322             }
323             printf("\n");
324         }
325     }
326 
327     return 0;
328 }
329