1 /*
2 *
3 * Program to convert files from UTF-8 to ASCII, using the
4 * &#-escapes from XML to escape non-ASCII characters.
5 *
6 * Usage:
7 *
8 * xml2asc
9 *
 * Reads from stdin and writes to stdout. Converts from UTF-8 (with or
11 * without &#-escapes) to ASCII, inserting &#-escapes for all
12 * non-ASCII characters.
13 *
14 * Version: $Revision: 1.9 $ ($Date: 2017/11/24 10:14:49 $)
15 * Author: Bert Bos <bert@w3.org>
16 *
17 * Copyright © 1994-2011 World Wide Web Consortium
18 * See http://www.w3.org/Consortium/Legal/copyright-software
19 *
20 **/
21 #include "config.h"
22 #include <stdio.h>
23 #include <stdlib.h>
24 #if STDC_HEADERS
25 # include <string.h>
26 #else
27 # ifndef HAVE_STRCHR
28 # define strchr index
29 # define strrchr rindex
30 # endif
31 #endif
32 #include <ctype.h>
33
/* Out-of-band value that getUTF8() returns for invalid input. It is
 * 0x200000, i.e., one more than the largest value that a 4-byte UTF-8
 * sequence can encode (0x1FFFFF), so it never collides with a decoded
 * character. */
#define NOT_A_CHAR 2097152

/* Number of invalid input sequences seen so far; incremented by
 * xml2asc() and used by main() to compute the exit status. */
static int nerrors = 0;
37
38 /* getUTF8 -- read UTF8 encoded char from stdin, return NOT_A_CHAR on error */
getUTF8()39 static long getUTF8()
40 {
41 int b;
42 long c;
43
44 /* 0 = 0000 1 = 0001 2 = 0010 3 = 0011
45 4 = 0100 5 = 0101 6 = 0110 7 = 0111
46 8 = 1000 9 = 1001 A = 1010 B = 1011
47 C = 1100 D = 1101 E = 1110 F = 1111 */
48
49 if ((b = getchar()) == EOF) return EOF; /* EOF */
50 if ((b & 0x80) == 0) return b; /* 0xxxxxxx = ASCII */
51 if ((b & 0xE0) == 0xC0) { /* 110xxxxx + 10xxxxxx */
52 c = b & 0x1F;
53 if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
54 c = (c << 6) | (b & 0x3F);
55 return c <= 0x7F ? NOT_A_CHAR : c;
56 }
57 if ((b & 0xF0) == 0xE0) { /* 1110xxxx + (2) */
58 c = b & 0x0F;
59 if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
60 c = (c << 6) | (b & 0x3F);
61 if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
62 c = (c << 6) | (b & 0x3F);
63 if (0xD800 <= c && c <= 0xDFFF) return NOT_A_CHAR; /* Surrogate pair */
64 return c <= 0x7FF ? NOT_A_CHAR : c;
65 }
66 if ((b & 0xF8) == 0xF0) { /* 11110xxx + (3) */
67 c = b & 0x07;
68 if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
69 c = (c << 6) | (b & 0x3F);
70 if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
71 c = (c << 6) | (b & 0x3F);
72 if ((b = getchar()) == EOF) {ungetc(EOF, stdin); return NOT_A_CHAR;}
73 c = (c << 6) | (b & 0x3F);
74 return c <= 0xFFFF ? NOT_A_CHAR : c;
75 }
76 return NOT_A_CHAR;
77 }
78
79 /* xml2asc -- copy stdin to stdout, converting UTF8 XML to ASCII XML */
xml2asc(void)80 static void xml2asc(void)
81 {
82 long c;
83
84 while ((c = getUTF8()) != EOF) {
85 if (c == NOT_A_CHAR) nerrors++;
86 else if (c <= 127) putchar(c);
87 else printf("&#%ld;", c);
88 }
89 }
90
91 /* Print usage message, then exit */
usage(char * progname)92 static void usage(char *progname)
93 {
94 fprintf(stderr, "Version %s\nUsage: %s <infile >outfile\n", VERSION, progname);
95 exit(1);
96 }
97
98 /* main -- main body */
main(int argc,char * argv[])99 int main(int argc, char *argv[])
100 {
101 if (argc != 1) usage(argv[0]);
102 xml2asc();
103 return nerrors;
104 }
105