1 /* unent -- expand HTML entities
2  *
3  * Author: Bert Bos
4  * Created: 10 Aug 2008
5  */
6 
7 #include "config.h"
8 #include <stdio.h>
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <ctype.h>
12 #include "export.h"
13 #include "unent.e"
14 
/* Option flags. Both start out off, are switched on by main() while it
 * parses the command line, and are read by expand(). */
static int leave_builtin = 0;	/* -b: leave standard entities untouched */
static int fix_ampersands = 0;	/* -f: replace lone and unrecognized & by &amp; */
17 
/* append_utf8 -- append the UTF-8 sequence for code n
 *
 * Writes the encoding of n to stdout with putchar(). Codes up to
 * 0x7FFFFFFF are encoded, using the original 1- to 6-byte UTF-8
 * scheme; for codes up to 0x10FFFF (at most 4 bytes) this is the same
 * as present-day UTF-8. A negative n is not a code at all and is
 * written as U+FFFD (REPLACEMENT CHARACTER) instead.
 *
 * Assumes that int has at least 32 bits.
 */
static void append_utf8(const int n)
{
  int code = n < 0 ? 0xFFFD : n;
  int lead;			/* Marker bits of the first byte */
  int trail;			/* Number of continuation bytes */

  if (code <= 0x7F) {		/* ASCII is encoded as itself */
    putchar(code);
    return;
  }

  /* The first byte starts with as many 1-bits as the sequence has
   * bytes, followed by a 0-bit: 110xxxxx, 1110xxxx, 11110xxx,
   * 111110xx and 1111110x. The remaining bits hold the most
   * significant bits of the code.
   */
  if (code <= 0x7FF) {
    lead = 0xC0; trail = 1;
  } else if (code <= 0xFFFF) {
    lead = 0xE0; trail = 2;
  } else if (code <= 0x1FFFFF) {
    lead = 0xF0; trail = 3;
  } else if (code <= 0x3FFFFFF) {
    lead = 0xF8; trail = 4;
  } else {
    lead = 0xFC; trail = 5;
  }

  putchar(lead | (code >> (6 * trail)));

  /* Each continuation byte is 10xxxxxx and holds the next 6 bits */
  while (trail-- > 0)
    putchar(0x80 | ((code >> (6 * trail)) & 0x3F));
}
50 
51 
/* is_builtin -- true if code c is one of the characters & ' " < >
 * The argument is evaluated several times: do not pass an expression
 * with side effects. */
#define is_builtin(c) ((c)=='&'||(c)=='\''||(c)=='"'||(c)=='<'||(c)=='>')
/* hexval -- numeric value (0..15) of the hexadecimal digit c
 * c must be a hexadecimal digit (0-9, A-F or a-f); the macro does not
 * check this. It relies on the digits sorting before the uppercase
 * letters and those before the lowercase letters, as in ASCII. The
 * argument is evaluated several times: do not pass an expression with
 * side effects. */
#define hexval(c) ((c) <= '9' ? (c)-'0' : (c) <= 'F' ? 10+(c)-'A' : 10+(c)-'a')
54 
55 
56 /* expand -- print string, expanding entities to UTF-8 sequences */
expand(FILE * infile)57 static void expand(FILE *infile)
58 {
59   const struct _Entity *e;
60   int n, c;
61   char s[12];		    /* Longest entity name has 8 characters */
62 
63   while ((c = fgetc(infile)) != EOF) {
64     if (c != '&') {		/* Literal character */
65       putchar(c);
66     } else if ((c = fgetc(infile)), isalnum(c)) { /* Named entity, e.g., &lt; */
67       s[0] = c;
68       n = 1;
69       while ((c = fgetc(infile)), isalnum(c) && n < sizeof(s) - 1) s[n++] = c;
70       s[n] = '\0';
71       if (! (e = lookup_entity(s, n))) {	/* Unknown entity */
72 	if (fix_ampersands) fputs("&amp;", stdout); else putchar('&');
73 	fputs(s, stdout);
74 	ungetc(c, infile);
75       } else {
76 	if (leave_builtin && is_builtin(e->code)) printf("&%s;", s);
77 	else append_utf8(e->code);
78 	if (c != ';') ungetc(c, infile);
79       }
80     } else if (c == '#') {		     /* Numeric entity */
81       if ((c = fgetc(infile)), isdigit(c)) { /* Decimal entity, e.g., &#10; */
82 	n = c - '0';
83 	while ((c = fgetc(infile)), isdigit(c)) n = 10 * n + c - '0';
84 	if (leave_builtin && is_builtin(n)) printf("&#%d;", n);
85 	else append_utf8(n);
86 	if (c != ';') ungetc(c, infile);
87       } else if (c == 'x') {	/* Hexadecimal entity, e.g., &#x0A; */
88 	if ((c = fgetc(infile)), isxdigit(c)) {
89 	  n = hexval(c);
90 	  while ((c = fgetc(infile)), isxdigit(c)) n = 16 * n + hexval(c);
91 	  if (leave_builtin && is_builtin(n)) printf("&#x%x;", n);
92 	  else append_utf8(n);
93 	  if (c != ';') ungetc(c, infile);
94 	} else {	  	/* Invalid hexadecimal entity syntax */
95 	  if (fix_ampersands) fputs("&amp;", stdout); else putchar('&');
96 	  printf("#x");
97 	  ungetc(c, infile);
98 	}
99       } else {			/* Invalid numerical entity */
100 	if (fix_ampersands) fputs("&amp;", stdout); else putchar('&');
101 	putchar('#');
102 	ungetc(c, infile);
103       }
104     } else {			/* Neither a letter nor a '#' */
105       if (fix_ampersands) fputs("&amp;", stdout); else putchar('&');
106       ungetc(c, infile);
107     }
108   }
109   /* SGML says also that a record-end (i.e., an end-of-line) may be
110    * used instead of a semicolon to end an entity reference. But the
111    * record-end is not suppressed in HTML and such an entity reference
112    * is invalid in XML, so we don't implement that rule here. Instead,
113    * the end-of-line is treated as any other character (other than
114    * semicolon) and left in the document.
115    */
116 }
117 
/* Forward declaration of usage(). It exists to carry the noreturn
 * attribute, which is only added for GCC 2.5 and later. */
static void usage(const char *prog)
#if __GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ >= 5
  __attribute__((__noreturn__))
#endif
;

/* usage -- print usage message and exit
 *
 * Writes the version and a one-line synopsis to stderr, using prog as
 * the name of the program, and then exits with status 2. Never
 * returns. VERSION is not defined in this file; presumably it comes
 * from config.h.
 */
static void usage(const char *prog)
{
  fprintf(stderr, "Version %s\nUsage: %s [-b] [-f] [file]\n", VERSION, prog);
  exit(2);
}
130 
131 /* main -- read input, expand entities, write out again */
main(int argc,char * argv[])132 int main(int argc, char *argv[])
133 {
134   FILE *infile;
135   int c;
136 
137   while ((c = getopt(argc, argv, "bf")) != -1)
138     switch (c) {
139     case 'b': leave_builtin = 1; break;
140     case 'f': fix_ampersands = 1; break;
141     default: usage(argv[0]);
142     }
143   if (optind == argc) infile = stdin;
144   else if (optind == argc - 1) infile = fopen(argv[optind], "r");
145   else usage(argv[0]);
146   if (infile == NULL) {perror(argv[optind]); exit(1);}
147 
148   expand(infile);
149 
150   fclose(infile);
151   return 0;
152 }
153