1 /*
2  *
3  * Program to convert files from UTF-8 to ASCII, using the
4  * &#-escapes from XML to escape non-ASCII characters.
5  *
6  * Usage:
7  *
8  *   xml2asc
9  *
 * Reads from stdin and writes to stdout. Converts from UTF8 (with or
11  * without &#-escapes) to ASCII, inserting &#-escapes for all
12  * non-ASCII characters.
13  *
14  * Version: $Revision: 1.9 $ ($Date: 2017/11/24 10:14:49 $)
15  * Author: Bert Bos <bert@w3.org>
16  *
17  * Copyright © 1994-2011 World Wide Web Consortium
18  * See http://www.w3.org/Consortium/Legal/copyright-software
19  *
20  **/
21 #include "config.h"
22 #include <stdio.h>
23 #include <stdlib.h>
24 #if STDC_HEADERS
25 # include <string.h>
26 #else
27 # ifndef HAVE_STRCHR
28 #  define strchr index
29 #  define strrchr rindex
30 # endif
31 #endif
32 #include <ctype.h>
33 
/* Error sentinel returned by getUTF8() for malformed input. 2097152 is
   0x200000 (2^21): one more than the largest value a 4-byte UTF-8
   sequence can carry (3 + 6 + 6 + 6 payload bits), so it can never be
   mistaken for a decoded character. Note that this is larger than the
   largest Unicode code point, which is 0x10FFFF. */
#define NOT_A_CHAR 2097152	/* One more than the largest 21-bit value */

/* Number of malformed input sequences seen so far; incremented by
   xml2asc() and used by main() to derive the exit status. */
static int nerrors = 0;
37 
38 /* getUTF8 -- read UTF8 encoded char from stdin, return NOT_A_CHAR on error */
getUTF8()39 static long getUTF8()
40 {
41   int b;
42   long c;
43 
44   /* 0 = 0000  1 = 0001  2 = 0010  3 = 0011
45      4 = 0100  5 = 0101  6 = 0110  7 = 0111
46      8 = 1000  9 = 1001  A = 1010  B = 1011
47      C = 1100  D = 1101  E = 1110  F = 1111 */
48 
49   if ((b = getchar()) == EOF) return EOF;	/* EOF */
50   if ((b & 0x80) == 0) return b;		/* 0xxxxxxx = ASCII */
51   if ((b & 0xE0) == 0xC0) {			/* 110xxxxx + 10xxxxxx */
52     c = b & 0x1F;
53     if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
54     c = (c << 6) | (b & 0x3F);
55     return c <= 0x7F ? NOT_A_CHAR : c;
56   }
57   if ((b & 0xF0) == 0xE0) {			/* 1110xxxx + (2) */
58     c = b & 0x0F;
59     if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
60     c = (c << 6) | (b & 0x3F);
61     if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
62     c = (c << 6) | (b & 0x3F);
63     if (0xD800 <= c && c <= 0xDFFF) return NOT_A_CHAR; /* Surrogate pair */
64     return c <= 0x7FF ? NOT_A_CHAR : c;
65   }
66   if ((b & 0xF8) == 0xF0) {			/* 11110xxx + (3) */
67     c = b & 0x07;
68     if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
69     c = (c << 6) | (b & 0x3F);
70     if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
71     c = (c << 6) | (b & 0x3F);
72     if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
73     c = (c << 6) | (b & 0x3F);
74     return c <= 0xFFFF ? NOT_A_CHAR : c;
75   }
76   return NOT_A_CHAR;
77 }
78 
79 /* xml2asc -- copy stdin to stdout, converting UTF8 XML to ASCII XML */
xml2asc(void)80 static void xml2asc(void)
81 {
82   long c;
83 
84   while ((c = getUTF8()) != EOF) {
85     if (c == NOT_A_CHAR) nerrors++;
86     else if (c <= 127) putchar(c);
87     else printf("&#%ld;", c);
88   }
89 }
90 
91 /* Print usage message, then exit */
usage(char * progname)92 static void usage(char *progname)
93 {
94   fprintf(stderr, "Version %s\nUsage: %s <infile >outfile\n", VERSION, progname);
95   exit(1);
96 }
97 
98 /* main -- main body */
main(int argc,char * argv[])99 int main(int argc, char *argv[])
100 {
101   if (argc != 1) usage(argv[0]);
102   xml2asc();
103   return nerrors;
104 }
105