1 /*
2  * Output all elements with a certain name and/or class.
3  * Input must be well-formed, since no HTML heuristics are applied.
4  *
5  * Copyright © 2000-2012 World Wide Web Consortium
6  * See http://www.w3.org/Consortium/Legal/copyright-software
7  *
8  * Author: Bert Bos <bert@w3.org>
9  * Created: 20 Aug 2000
10  * Version: $Id: hxextract.c,v 1.7 2017/11/24 09:50:25 bbos Exp $
11  */
12 #include "config.h"
13 #include <assert.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <time.h>
18 #include <stdbool.h>
19 #if STDC_HEADERS
20 # include <string.h>
21 #else
22 # ifndef HAVE_STRCHR
23 #  define strchr index
24 #  define strrchr rindex
25 # endif
26 # ifndef HAVE_STRSTR
27 #  include "strstr.e"
28 # endif
29 #endif
30 #include "export.h"
31 #include "types.e"
32 #include "html.e"
33 #include "heap.e"
34 #include "scan.e"
35 #include "dict.e"
36 #include "openurl.e"
37 #include "class.e"
38 
39 #define INDEX "index"				/* CLASS="... index..." */
40 
41 #define MAXLINELEN 1024				/* In configfile */
42 
43 static bool xml = false;			/* Use <empty /> convention */
44 static int copying = 0;				/* Start by not copying */
45 static string base = NULL;			/* URL of each file */
46 static string endtext = "";			/* Text to insert at end */
47 static string targetelement = NULL;		/* Element to extract */
48 static string targetclass = NULL;		/* Class to extract */
49 
50 
51 /* add_href -- add an "href" attribute to a list of attributes */
add_href(pairlist * attribs,const string base,const conststring id)52 static void add_href(pairlist *attribs, const string base, const conststring id)
53 {
54   string h = NULL;
55 
56   pairlist_set(attribs, "href", strapp(&h, base, "#", id, NULL));
57   free(h);
58 }
59 
60 /* handle_error -- called when a parse error occurred */
handle_error(void * unused,const string s,int lineno)61 static void handle_error(void *unused, const string s, int lineno)
62 {
63   fprintf(stderr, "%d: %s\n", lineno, s);
64 }
65 
/* start -- parser callback, called before the first event is reported;
 * this program keeps no per-parse state, so it always returns NULL */
static void *start(void)
{
  return NULL;
}
68 
/* end -- parser callback, called after the last event is reported;
 * nothing needs to be cleaned up, so it does nothing */
static void end(void *unused)
{
  (void) unused;
}
71 
72 /* handle_comment -- called after a comment is parsed */
handle_comment(void * unused,const string commenttext)73 static void handle_comment(void *unused, const string commenttext) {}
74 
75 /* handle_text -- called after a text chunk is parsed */
handle_text(void * unused,const string text)76 static void handle_text(void *unused, const string text)
77 {
78   if (copying > 0) fputs(text, stdout);
79 }
80 
81 /* handle_declaration -- called after a declaration is parsed */
handle_decl(void * unused,const string gi,const string fpi,const string url)82 static void handle_decl(void *unused, const string gi,
83 			const string fpi, const string url) {}
84 
85 /* handle_proc_instr -- called after a PI is parsed */
handle_pi(void * unused,const string pi_text)86 static void handle_pi(void *unused, const string pi_text) {}
87 
88 /* print_tag -- print a start- or empty tag */
print_tag(const string name,pairlist attribs,bool empty)89 static void print_tag(const string name, pairlist attribs, bool empty)
90 {
91   pairlist a;
92   conststring t, h;
93 
94   printf("<%s", name);
95   for (a = attribs; a != NULL; a = a->next) {
96     printf(" %s", a->name);
97     if (strcasecmp(a->name, "class") == 0 && (t = contains(a->value, INDEX))) {
98       /* Print value excluding INDEX */
99       printf("=\"");
100       for (h = a->value; h != t; h++) putchar(*h);
101       printf("%s\"", t + sizeof(INDEX) - 1);
102     } else {
103       if (a->value) printf("=\"%s\"", a->value);
104     }
105   }
106   printf((empty && xml) ? " />" : ">");
107 }
108 
109 /* is_match check whether the element matches the target element and class */
is_match(const string name,pairlist attribs)110 static bool is_match(const string name, pairlist attribs)
111 {
112   return ((!targetelement || strcasecmp(name, targetelement) == 0)
113 	  && (!targetclass || has_class(attribs, targetclass)));
114 }
115 
116 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * unused,const string name,pairlist attribs)117 static void handle_starttag(void *unused, const string name, pairlist attribs)
118 {
119   conststring id;
120 
121   if (copying || is_match(name, attribs)) {
122     if (!copying && (id = pairlist_get(attribs, "id")))
123       add_href(&attribs, base, id);
124     if (!eq(name, "a") && !eq(name, "A")) print_tag(name, attribs, false);
125     copying++;
126   }
127 }
128 
129 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * unused,const string name,pairlist attribs)130 static void handle_emptytag(void *unused, const string name, pairlist attribs)
131 {
132   conststring id;
133 
134   if (copying || is_match(name, attribs)) {
135     if (!copying && (id = pairlist_get(attribs, "id")))
136       add_href(&attribs, base, id);
137     if (!eq(name, "a") && !eq(name, "A")) print_tag(name, attribs, true);
138   }
139 }
140 
141 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * unused,const string name)142 static void handle_endtag(void *unused, const string name)
143 {
144   if (copying) {
145     if (!eq(name, "a") && !eq(name, "A")) printf("</%s>", name);
146     copying--;
147   }
148 }
149 
150 /* process_configfile -- read @chapter lines from config file */
process_configfile(const string configfile)151 static void process_configfile(const string configfile)
152 {
153   char line[MAXLINELEN], chapter[MAXLINELEN];
154   FILE *f;
155 
156   if (! (f = fopenurl(configfile, "r", NULL))) {perror(configfile); exit(2);}
157 
158   /* ToDo: accept quoted file names with spaces in their name */
159   while (fgets(line, sizeof(line), f)) {
160     if (sscanf(line, " @chapter %s", chapter) == 1) {
161       if (!base) base = chapter;
162       yyin = fopenurl(chapter, "r", NULL);
163       if (yyin == NULL) {perror(chapter); exit(2);}
164       if (yyparse() != 0) exit(3);
165       fclose(yyin);
166       base = NULL;
167     }
168   }
169 
170   fclose(f);
171 }
172 
173 /* usage -- print usage message and exit */
usage(const string name)174 static void usage(const string name)
175 {
176   fprintf(stderr, "Usage: %s [-v] [-x] [-s text] [-e text] [-b base] element-or-class [-c configfile | file-or-URL]...\n",
177 	  name);
178   exit(1);
179 }
180 
main(int argc,char * argv[])181 int main(int argc, char *argv[])
182 {
183   char *p;
184   int i;
185 
186   /* Bind the parser callback routines to our handlers */
187   set_error_handler(handle_error);
188   set_start_handler(start);
189   set_end_handler(end);
190   set_comment_handler(handle_comment);
191   set_text_handler(handle_text);
192   set_decl_handler(handle_decl);
193   set_pi_handler(handle_pi);
194   set_starttag_handler(handle_starttag);
195   set_emptytag_handler(handle_emptytag);
196   set_endtag_handler(handle_endtag);
197 
198   /* Loop over arguments; options may be in between file names */
199   for (i = 1; i < argc; i++) {
200     if (eq(argv[i], "-h") || eq(argv[i], "-?")) { /* Usage */
201       usage(argv[0]);
202     } else if (eq(argv[i], "-x")) {		/* XML format */
203       xml = true;
204     } else if (eq(argv[i], "-s")) {		/* Insert text at start */
205       printf("%s", argv[++i]);
206     } else if (eq(argv[i], "-e")) {		/* Insert text at end */
207       endtext = argv[++i];
208     } else if (eq(argv[i], "-b")) {		/* URL base */
209       base = argv[++i];
210     } else if (eq(argv[i], "-c")) {		/* Config file */
211       process_configfile(argv[++i]);
212     } else if (eq(argv[i], "-v")) {
213       printf("Version: %s %s\n", PACKAGE, VERSION);
214       return 0;
215     } else if (eq(argv[i], "-")) {		/* "-" = stdin */
216       if (!base) base = "";
217       yyin = stdin;
218       if (yyparse() != 0) exit(3);
219       base = NULL;				/* Reset base */
220     } else if (targetelement || targetclass) {	/* It's a file name or URL */
221       if (!base) base = argv[i];
222       yyin = fopenurl(argv[i], "r", NULL);
223       if (yyin == NULL) {perror(argv[i]); exit(2);}
224       if (yyparse() != 0) exit(3);
225       fclose(yyin);
226       base = NULL;
227     } else if (argv[i][0] == '.') {		/* Class name */
228       targetclass = argv[i] + 1;
229     } else {					/* Element name */
230       targetelement = argv[i];
231       if ((p = strchr(targetelement, '.'))) {
232 	*p = '\0';
233 	targetclass = p + 1;
234       }
235     }
236   }
237   if (!targetelement && !targetclass) usage(argv[0]);
238 
239   printf("%s", endtext);			/* Insert text at end */
240   return 0;
241 }
242