1 /*
2 * Output all elements with a certain name and/or class.
3 * Input must be well-formed, since no HTML heuristics are applied.
4 *
5 * Copyright © 2000-2012 World Wide Web Consortium
6 * See http://www.w3.org/Consortium/Legal/copyright-software
7 *
8 * Author: Bert Bos <bert@w3.org>
9 * Created: 20 Aug 2000
10 * Version: $Id: hxextract.c,v 1.7 2017/11/24 09:50:25 bbos Exp $
11 */
12 #include "config.h"
13 #include <assert.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <time.h>
18 #include <stdbool.h>
19 #if STDC_HEADERS
20 # include <string.h>
21 #else
22 # ifndef HAVE_STRCHR
23 # define strchr index
24 # define strrchr rindex
25 # endif
26 # ifndef HAVE_STRSTR
27 # include "strstr.e"
28 # endif
29 #endif
30 #include "export.h"
31 #include "types.e"
32 #include "html.e"
33 #include "heap.e"
34 #include "scan.e"
35 #include "dict.e"
36 #include "openurl.e"
37 #include "class.e"
38
#define INDEX "index"			/* Class keyword that print_tag() leaves out of CLASS="... index..." */

#define MAXLINELEN 1024			/* Size of the line buffers used when reading a config file */

static bool xml = false;		/* If true, print_tag() ends empty tags with " />" */
static int copying = 0;			/* Nesting depth inside a matched element; 0 = not copying */
static string base = NULL;		/* URL prefix for generated href's; reset to NULL after each file */
static string endtext = "";		/* Text printed by main() after all input is processed (-e) */
static string targetelement = NULL;	/* Element name to extract; NULL matches any element */
static string targetclass = NULL;	/* Class to extract; NULL matches any class */
49
50
51 /* add_href -- add an "href" attribute to a list of attributes */
add_href(pairlist * attribs,const string base,const conststring id)52 static void add_href(pairlist *attribs, const string base, const conststring id)
53 {
54 string h = NULL;
55
56 pairlist_set(attribs, "href", strapp(&h, base, "#", id, NULL));
57 free(h);
58 }
59
60 /* handle_error -- called when a parse error occurred */
handle_error(void * unused,const string s,int lineno)61 static void handle_error(void *unused, const string s, int lineno)
62 {
63 fprintf(stderr, "%d: %s\n", lineno, s);
64 }
65
/* start -- parser callback, called before the first event is reported.
 * This program keeps no per-parse state, so it simply returns NULL. */
static void *start(void)
{
  return NULL;
}
68
/* end -- parser callback, called after the last event is reported.
 * Nothing needs to be cleaned up; the argument is ignored. */
static void end(void *unused)
{
  (void) unused;
}
71
72 /* handle_comment -- called after a comment is parsed */
handle_comment(void * unused,const string commenttext)73 static void handle_comment(void *unused, const string commenttext) {}
74
75 /* handle_text -- called after a text chunk is parsed */
handle_text(void * unused,const string text)76 static void handle_text(void *unused, const string text)
77 {
78 if (copying > 0) fputs(text, stdout);
79 }
80
81 /* handle_declaration -- called after a declaration is parsed */
handle_decl(void * unused,const string gi,const string fpi,const string url)82 static void handle_decl(void *unused, const string gi,
83 const string fpi, const string url) {}
84
85 /* handle_proc_instr -- called after a PI is parsed */
handle_pi(void * unused,const string pi_text)86 static void handle_pi(void *unused, const string pi_text) {}
87
88 /* print_tag -- print a start- or empty tag */
print_tag(const string name,pairlist attribs,bool empty)89 static void print_tag(const string name, pairlist attribs, bool empty)
90 {
91 pairlist a;
92 conststring t, h;
93
94 printf("<%s", name);
95 for (a = attribs; a != NULL; a = a->next) {
96 printf(" %s", a->name);
97 if (strcasecmp(a->name, "class") == 0 && (t = contains(a->value, INDEX))) {
98 /* Print value excluding INDEX */
99 printf("=\"");
100 for (h = a->value; h != t; h++) putchar(*h);
101 printf("%s\"", t + sizeof(INDEX) - 1);
102 } else {
103 if (a->value) printf("=\"%s\"", a->value);
104 }
105 }
106 printf((empty && xml) ? " />" : ">");
107 }
108
109 /* is_match check whether the element matches the target element and class */
is_match(const string name,pairlist attribs)110 static bool is_match(const string name, pairlist attribs)
111 {
112 return ((!targetelement || strcasecmp(name, targetelement) == 0)
113 && (!targetclass || has_class(attribs, targetclass)));
114 }
115
116 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * unused,const string name,pairlist attribs)117 static void handle_starttag(void *unused, const string name, pairlist attribs)
118 {
119 conststring id;
120
121 if (copying || is_match(name, attribs)) {
122 if (!copying && (id = pairlist_get(attribs, "id")))
123 add_href(&attribs, base, id);
124 if (!eq(name, "a") && !eq(name, "A")) print_tag(name, attribs, false);
125 copying++;
126 }
127 }
128
129 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * unused,const string name,pairlist attribs)130 static void handle_emptytag(void *unused, const string name, pairlist attribs)
131 {
132 conststring id;
133
134 if (copying || is_match(name, attribs)) {
135 if (!copying && (id = pairlist_get(attribs, "id")))
136 add_href(&attribs, base, id);
137 if (!eq(name, "a") && !eq(name, "A")) print_tag(name, attribs, true);
138 }
139 }
140
141 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * unused,const string name)142 static void handle_endtag(void *unused, const string name)
143 {
144 if (copying) {
145 if (!eq(name, "a") && !eq(name, "A")) printf("</%s>", name);
146 copying--;
147 }
148 }
149
150 /* process_configfile -- read @chapter lines from config file */
process_configfile(const string configfile)151 static void process_configfile(const string configfile)
152 {
153 char line[MAXLINELEN], chapter[MAXLINELEN];
154 FILE *f;
155
156 if (! (f = fopenurl(configfile, "r", NULL))) {perror(configfile); exit(2);}
157
158 /* ToDo: accept quoted file names with spaces in their name */
159 while (fgets(line, sizeof(line), f)) {
160 if (sscanf(line, " @chapter %s", chapter) == 1) {
161 if (!base) base = chapter;
162 yyin = fopenurl(chapter, "r", NULL);
163 if (yyin == NULL) {perror(chapter); exit(2);}
164 if (yyparse() != 0) exit(3);
165 fclose(yyin);
166 base = NULL;
167 }
168 }
169
170 fclose(f);
171 }
172
173 /* usage -- print usage message and exit */
usage(const string name)174 static void usage(const string name)
175 {
176 fprintf(stderr, "Usage: %s [-v] [-x] [-s text] [-e text] [-b base] element-or-class [-c configfile | file-or-URL]...\n",
177 name);
178 exit(1);
179 }
180
main(int argc,char * argv[])181 int main(int argc, char *argv[])
182 {
183 char *p;
184 int i;
185
186 /* Bind the parser callback routines to our handlers */
187 set_error_handler(handle_error);
188 set_start_handler(start);
189 set_end_handler(end);
190 set_comment_handler(handle_comment);
191 set_text_handler(handle_text);
192 set_decl_handler(handle_decl);
193 set_pi_handler(handle_pi);
194 set_starttag_handler(handle_starttag);
195 set_emptytag_handler(handle_emptytag);
196 set_endtag_handler(handle_endtag);
197
198 /* Loop over arguments; options may be in between file names */
199 for (i = 1; i < argc; i++) {
200 if (eq(argv[i], "-h") || eq(argv[i], "-?")) { /* Usage */
201 usage(argv[0]);
202 } else if (eq(argv[i], "-x")) { /* XML format */
203 xml = true;
204 } else if (eq(argv[i], "-s")) { /* Insert text at start */
205 printf("%s", argv[++i]);
206 } else if (eq(argv[i], "-e")) { /* Insert text at end */
207 endtext = argv[++i];
208 } else if (eq(argv[i], "-b")) { /* URL base */
209 base = argv[++i];
210 } else if (eq(argv[i], "-c")) { /* Config file */
211 process_configfile(argv[++i]);
212 } else if (eq(argv[i], "-v")) {
213 printf("Version: %s %s\n", PACKAGE, VERSION);
214 return 0;
215 } else if (eq(argv[i], "-")) { /* "-" = stdin */
216 if (!base) base = "";
217 yyin = stdin;
218 if (yyparse() != 0) exit(3);
219 base = NULL; /* Reset base */
220 } else if (targetelement || targetclass) { /* It's a file name or URL */
221 if (!base) base = argv[i];
222 yyin = fopenurl(argv[i], "r", NULL);
223 if (yyin == NULL) {perror(argv[i]); exit(2);}
224 if (yyparse() != 0) exit(3);
225 fclose(yyin);
226 base = NULL;
227 } else if (argv[i][0] == '.') { /* Class name */
228 targetclass = argv[i] + 1;
229 } else { /* Element name */
230 targetelement = argv[i];
231 if ((p = strchr(targetelement, '.'))) {
232 *p = '\0';
233 targetclass = p + 1;
234 }
235 }
236 }
237 if (!targetelement && !targetclass) usage(argv[0]);
238
239 printf("%s", endtext); /* Insert text at end */
240 return 0;
241 }
242