1 /* unent -- expand HTML entities
2 *
3 * Author: Bert Bos
4 * Created: 10 Aug 2008
5 */
6
7 #include "config.h"
8 #include <stdio.h>
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <ctype.h>
12 #include "export.h"
13 #include "unent.e"
14
/* Command-line option flags; main() sets them from -b and -f. */
15 static int leave_builtin = 0; /* -b: leave references to & ' " < > unexpanded */
16 static int fix_ampersands = 0; /* -f: lone and unrecognized & are meant to be written as &amp; */
17
/* append_utf8 -- append the UTF-8 sequence for code n
 *
 * Writes the encoding of n to stdout, one byte per putchar() call.
 * Codes up to 0x1FFFFF use the 1- to 4-byte forms. Larger codes use
 * the 5- and 6-byte forms of the original UTF-8 definition, whose
 * lead bytes are 111110xx (0xF8) and 1111110x (0xFC); these long
 * forms lie outside the range of present-day UTF-8, which ends at
 * 0x10FFFF. A negative n is not treated specially: it satisfies the
 * first test and is passed to putchar() as is.
 */
static void append_utf8(const int n)
{
  if (n <= 0x7F) {
    putchar(n);                                 /* 0xxxxxxx */
  } else if (n <= 0x7FF) {
    putchar(0xC0 | (n >> 6));                   /* 110xxxxx */
    putchar(0x80 | (n & 0x3F));
  } else if (n <= 0xFFFF) {
    putchar(0xE0 | (n >> 12));                  /* 1110xxxx */
    putchar(0x80 | ((n >> 6) & 0x3F));
    putchar(0x80 | (n & 0x3F));
  } else if (n <= 0x1FFFFF) {
    putchar(0xF0 | (n >> 18));                  /* 11110xxx */
    putchar(0x80 | ((n >> 12) & 0x3F));
    putchar(0x80 | ((n >> 6) & 0x3F));
    putchar(0x80 | (n & 0x3F));
  } else if (n <= 0x3FFFFFF) {
    putchar(0xF8 | (n >> 24));                  /* 111110xx, not 0xF0 */
    putchar(0x80 | ((n >> 18) & 0x3F));
    putchar(0x80 | ((n >> 12) & 0x3F));
    putchar(0x80 | ((n >> 6) & 0x3F));
    putchar(0x80 | (n & 0x3F));
  } else {
    putchar(0xFC | (n >> 30));                  /* 1111110x, not 0xF0 */
    putchar(0x80 | ((n >> 24) & 0x3F));
    putchar(0x80 | ((n >> 18) & 0x3F));
    putchar(0x80 | ((n >> 12) & 0x3F));
    putchar(0x80 | ((n >> 6) & 0x3F));
    putchar(0x80 | (n & 0x3F));
  }
}
50
51
/* is_builtin -- true if c is one of the characters < > & " ' */
#define is_builtin(c) \
  ((c) == '<' || (c) == '>' || (c) == '&' || (c) == '"' || (c) == '\'')

/* hexval -- numeric value of hexadecimal digit c; c must be a hex digit,
 * the result for any other character is meaningless */
#define hexval(c) \
  ((c) > 'F' ? (c) - 'a' + 10 : (c) > '9' ? (c) - 'A' + 10 : (c) - '0')
54
55
56 /* expand -- print string, expanding entities to UTF-8 sequences */
expand(FILE * infile)57 static void expand(FILE *infile)
58 {
59 const struct _Entity *e;
60 int n, c;
61 char s[12]; /* Longest entity name has 8 characters */
62
63 while ((c = fgetc(infile)) != EOF) {
64 if (c != '&') { /* Literal character */
65 putchar(c);
66 } else if ((c = fgetc(infile)), isalnum(c)) { /* Named entity, e.g., < */
67 s[0] = c;
68 n = 1;
69 while ((c = fgetc(infile)), isalnum(c) && n < sizeof(s) - 1) s[n++] = c;
70 s[n] = '\0';
71 if (! (e = lookup_entity(s, n))) { /* Unknown entity */
72 if (fix_ampersands) fputs("&", stdout); else putchar('&');
73 fputs(s, stdout);
74 ungetc(c, infile);
75 } else {
76 if (leave_builtin && is_builtin(e->code)) printf("&%s;", s);
77 else append_utf8(e->code);
78 if (c != ';') ungetc(c, infile);
79 }
80 } else if (c == '#') { /* Numeric entity */
81 if ((c = fgetc(infile)), isdigit(c)) { /* Decimal entity, e.g., */
82 n = c - '0';
83 while ((c = fgetc(infile)), isdigit(c)) n = 10 * n + c - '0';
84 if (leave_builtin && is_builtin(n)) printf("&#%d;", n);
85 else append_utf8(n);
86 if (c != ';') ungetc(c, infile);
87 } else if (c == 'x') { /* Hexadecimal entity, e.g., 
 */
88 if ((c = fgetc(infile)), isxdigit(c)) {
89 n = hexval(c);
90 while ((c = fgetc(infile)), isxdigit(c)) n = 16 * n + hexval(c);
91 if (leave_builtin && is_builtin(n)) printf("&#x%x;", n);
92 else append_utf8(n);
93 if (c != ';') ungetc(c, infile);
94 } else { /* Invalid hexadecimal entity syntax */
95 if (fix_ampersands) fputs("&", stdout); else putchar('&');
96 printf("#x");
97 ungetc(c, infile);
98 }
99 } else { /* Invalid numerical entity */
100 if (fix_ampersands) fputs("&", stdout); else putchar('&');
101 putchar('#');
102 ungetc(c, infile);
103 }
104 } else { /* Neither a letter nor a '#' */
105 if (fix_ampersands) fputs("&", stdout); else putchar('&');
106 ungetc(c, infile);
107 }
108 }
109 /* SGML says also that a record-end (i.e., an end-of-line) may be
110 * used instead of a semicolon to end an entity reference. But the
111 * record-end is not suppressed in HTML and such an entity reference
112 * is invalid in XML, so we don't implement that rule here. Instead,
113 * the end-of-line is treated as any other character (other than
114 * semicolon) and left in the document.
115 */
116 }
117
118 static void usage(const char *prog)
119 #if __GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ >= 5
120 __attribute__((__noreturn__))
121 #endif
122 ;
123
124 /* usage -- print usage message and exit */
usage(const char * prog)125 static void usage(const char *prog)
126 {
127 fprintf(stderr, "Version %s\nUsage: %s [-b] [-f] [file]\n", VERSION, prog);
128 exit(2);
129 }
130
131 /* main -- read input, expand entities, write out again */
main(int argc,char * argv[])132 int main(int argc, char *argv[])
133 {
134 FILE *infile;
135 int c;
136
137 while ((c = getopt(argc, argv, "bf")) != -1)
138 switch (c) {
139 case 'b': leave_builtin = 1; break;
140 case 'f': fix_ampersands = 1; break;
141 default: usage(argv[0]);
142 }
143 if (optind == argc) infile = stdin;
144 else if (optind == argc - 1) infile = fopen(argv[optind], "r");
145 else usage(argv[0]);
146 if (infile == NULL) {perror(argv[optind]); exit(1);}
147
148 expand(infile);
149
150 fclose(infile);
151 return 0;
152 }
153