1 /*
2  * List all links from the given document.
3  *
4  * Copyright © 1994-2000 World Wide Web Consortium
5  * See http://www.w3.org/Consortium/Legal/copyright-software
6  *
7  * Bert Bos <bert@w3.org>
8  * Created 31 July 1999
9  * $Id: hxwls.c,v 1.13 2019/08/28 19:14:34 bbos Exp $
10  */
11 #include "config.h"
12 #include <assert.h>
13 #ifdef HAVE_UNISTD_H
14 #  include <unistd.h>
15 #endif
16 #include <ctype.h>
17 #include <stdlib.h>
18 #include <stdio.h>
19 #include <stdbool.h>
20 #if STDC_HEADERS
21 # include <string.h>
22 #else
23 # ifndef HAVE_STRCHR
24 #  define strchr index
25 #  define strrchr rindex
26 # endif
27 # ifndef HAVE_STRDUP
28 #  include "strdup.e"
29 # endif
30 #endif
31 #include "export.h"
32 #include "heap.e"
33 #include "types.e"
34 #include "html.e"
35 #include "scan.e"
36 #include "dict.e"
37 #include "openurl.e"
38 #include "url.e"
39 #include "errexit.e"
40 #include "unent.e"
41 
/* Set by handle_error() when the parser reports an error; main() then
 * returns 1 instead of 0. */
static bool has_error = false;
/* Base against which links are made absolute. Set from option -b, else from
 * the file argument, and replaced by the href of a <base> element. Always
 * NULL or a strdup()'ed string. */
static string base = NULL;
/* Name of the input as given on the command line ("-" when no file argument
 * was given); printed as the first column in Tuple format. */
static string self;
/* Output format: Short by default, -l selects Long, -h HTML, -t Tuple */
static enum {Short, Long, HTML, Tuple} format = Short;	/* Option -l -h -t */
/* If true, output() does not make URLs absolute */
static bool relative = false;		/* Option -r */
/* If true, output() passes each URL through URL_s_to_ascii() */
static bool ascii = false;		/* Option -a */
48 
49 
/* append_utf8 -- add UTF-8 bytes for code n at s, return end of string
 *
 * Writes the encoding of n (1 to 6 bytes, using the original UTF-8 scheme
 * that covers codes up to 0x7FFFFFFF) starting at s and returns a pointer
 * just past the last byte written. No terminating '\0' is added. The caller
 * must make sure there is room for up to 6 bytes at s.
 *
 * Each sequence length has its own lead byte: 0xC0 (2 bytes), 0xE0 (3),
 * 0xF0 (4), 0xF8 (5) and 0xFC (6); continuation bytes are 0x80 plus 6 bits.
 */
static string append_utf8(string s, const unsigned long n)
{
  /* We assume s is long enough */
  if (n <= 0x7F) {				/* 1 byte:  0xxxxxxx */
    *(s++) = n;
  } else if (n <= 0x7FF) {			/* 2 bytes: 110xxxxx */
    *(s++) = 0xC0 | (n >> 6);
    *(s++) = 0x80 | (n & 0x3F);
  } else if (n <= 0xFFFF) {			/* 3 bytes: 1110xxxx */
    *(s++) = 0xE0 | (n >> 12);
    *(s++) = 0x80 | ((n >> 6) & 0x3F);
    *(s++) = 0x80 | (n & 0x3F);
  } else if (n <= 0x1FFFFF) {			/* 4 bytes: 11110xxx */
    *(s++) = 0xF0 | (n >> 18);
    *(s++) = 0x80 | ((n >> 12) & 0x3F);
    *(s++) = 0x80 | ((n >> 6) & 0x3F);
    *(s++) = 0x80 | (n & 0x3F);
  } else if (n <= 0x3FFFFFF) {			/* 5 bytes: 111110xx */
    *(s++) = 0xF8 | (n >> 24);
    *(s++) = 0x80 | ((n >> 18) & 0x3F);
    *(s++) = 0x80 | ((n >> 12) & 0x3F);
    *(s++) = 0x80 | ((n >> 6) & 0x3F);
    *(s++) = 0x80 | (n & 0x3F);
  } else {					/* 6 bytes: 1111110x */
    /* Only one payload bit fits in the lead byte; mask it so that a value
       above 0x7FFFFFFF cannot corrupt the length marker. */
    *(s++) = 0xFC | ((n >> 30) & 0x01);
    *(s++) = 0x80 | ((n >> 24) & 0x3F);
    *(s++) = 0x80 | ((n >> 18) & 0x3F);
    *(s++) = 0x80 | ((n >> 12) & 0x3F);
    *(s++) = 0x80 | ((n >> 6) & 0x3F);
    *(s++) = 0x80 | (n & 0x3F);
  }
  return s;
}
84 
85 
86 /* output -- print the link (lowercases rel argument) */
output(const conststring type,const conststring rel,conststring url)87 static void output(const conststring type, const conststring rel,
88 		   conststring url)
89 {
90   string h = NULL, q, r, rel1;
91   conststring p, s;
92   const struct _Entity *e;
93   unsigned long c;
94 
95   if (url) {					/* If we found a URL */
96 
97     /* Replace entities. */
98     h = newnstring(url, 2 * strlen(url));	/* Reserve sufficient space */
99     for (p = url, q = h; *p; p++) {
100       if (*p != '&') {
101 	*(q++) = *p;
102       } else if (*(p+1) == '#') {		/* Numeric entity */
103 	if (*(p+2) == 'x') c = strtoul(p + 3, &r, 16);
104 	else c = strtoul(p + 2, &r, 10);
105 	if (c > 0 && c <= 2147483647) q = append_utf8(q, c);
106 	p = *r == ';' ? r : r - 1;
107       } else {					/* Entity */
108 	for (s = p + 1; isalnum(*s); s++);
109 	if (!(e = lookup_entity(p+1, s - (p+1)))) *(q++) = '&'; /* Unknown */
110 	else {q = append_utf8(q, e->code); p = *s == ';' ? s : s -1;}
111       }
112     }
113     *q = '\0';
114     url = h;
115 
116     /* Make URL absolute */
117     if (! relative && base) {
118       h = URL_s_absolutize(base, url);
119       dispose(url);
120       url = h;
121     }
122     /* Convert IRI to URL, if requested */
123     if (ascii) {
124       h = URL_s_to_ascii(url);
125       dispose(url);
126       url = h;
127     }
128     rel1 = newstring(rel ? rel : "");
129     down(rel1);
130     switch (format) {
131       case HTML:
132 	printf("<li><a class=\"%s\" rel=\"%s\" href=\"%s\">%s</a></li>\n",
133 		 type, rel1, url, url);
134 	break;
135       case Long:
136 	printf("%s\t%s\t%s\n", type, rel1, url);
137 	break;
138       case Short:
139 	printf("%s\n", url);
140 	break;
141       case Tuple:
142 	printf("%s\t%s\t%s\t%s\n", self, type, rel1, url);
143 	break;
144       default:
145 	assert(!"Cannot happen!");
146     }
147     free(rel1);
148     free(h);
149   }
150 }
151 
152 
153 /* --------------- implements parser interface api------------------------- */
154 
155 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)156 void handle_error(void *clientdata, const string s, int lineno)
157 {
158   fprintf(stderr, "%d: %s\n", lineno, s);
159   has_error = true;
160 }
161 
162 /* start -- called before the first event is reported */
start(void)163 void* start(void)
164 {
165   if (format == HTML) {
166     printf("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"\n");
167     printf("  \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n");
168     printf("<html>\n");
169     printf("<head><title>Output of listlinks</title></head>\n");
170     printf("<body>\n");
171     printf("<ol>\n");
172   }
173   return NULL;
174 }
175 
176 /* end -- called after the last event is reported */
end(void * clientdata)177 void end(void *clientdata)
178 {
179   if (format == HTML) {
180     printf("</ol>\n");
181     printf("</body>\n");
182     printf("</html>\n");
183   }
184 }
185 
186 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)187 void handle_comment(void *clientdata, string commenttext)
188 {
189   free(commenttext);
190 }
191 
192 /* handle_text -- called after a text chunk is parsed */
handle_text(void * clientdata,string text)193 void handle_text(void *clientdata, string text)
194 {
195   /* There may be several consecutive calls to this routine. */
196   /* escape(text); */
197   free(text);
198 }
199 
200 /* handle_decl -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)201 void handle_decl(void *clientdata, string gi, string fpi, string url)
202 {
203   /* skip */
204   if (gi) free(gi);
205   if (fpi) free(fpi);
206   if (url) free(url);
207 }
208 
209 /* handle_pi -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)210 void handle_pi(void *clientdata, string pi_text)
211 {
212   if (pi_text) free(pi_text);
213 }
214 
215 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)216 void handle_starttag(void *clientdata, string name, pairlist attribs)
217 {
218   /* ToDo: print text of anchor, if available */
219   conststring h;
220 
221   if (strcasecmp(name, "base") == 0) {
222     h = pairlist_get(attribs, "href");
223     if (h) base = strdup(h);			/* Use as base from now on */
224     output("base", NULL, h);
225   } else if (strcasecmp(name, "link") == 0) {
226     output("link", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
227   } else if (strcasecmp(name, "a") == 0) {
228     output("a", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
229   } else if (strcasecmp(name, "img") == 0) {
230     output("img", NULL, pairlist_get(attribs, "src"));
231     output("img", "longdesc", pairlist_get(attribs, "longdesc"));
232     output("img", "srcset", pairlist_get(attribs, "srcset"));
233   } else if (strcasecmp(name, "input") == 0) {
234     output("input", "src", pairlist_get(attribs, "src"));
235   } else if (strcasecmp(name, "object") == 0) {
236     output("object", NULL,  pairlist_get(attribs, "data"));
237     output("object", "classid",  pairlist_get(attribs, "classid"));
238     output("object", "codebase",  pairlist_get(attribs, "codebase"));
239   } else if (strcasecmp(name, "area") == 0) {
240     output("area", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
241   } else if (strcasecmp(name, "ins") == 0) {
242     output("ins", NULL, pairlist_get(attribs, "cite"));
243   } else if (strcasecmp(name, "del") == 0) {
244     output("del", NULL, pairlist_get(attribs, "cite"));
245   } else if (strcasecmp(name, "q") == 0) {
246     output("q", NULL, pairlist_get(attribs, "cite"));
247   } else if (strcasecmp(name, "blockquote") == 0) {
248     output("bq", NULL, pairlist_get(attribs, "cite"));
249   } else if (strcasecmp(name, "form") == 0) {
250     output("form", pairlist_get(attribs, "method"), pairlist_get(attribs, "action"));
251   } else if (strcasecmp(name, "frame") == 0) {
252     output("frame", NULL, pairlist_get(attribs, "src"));
253   } else if (strcasecmp(name, "iframe") == 0) {
254     output("iframe", NULL, pairlist_get(attribs, "src"));
255   } else if (strcasecmp(name, "head") == 0) {
256     output("head", NULL, pairlist_get(attribs, "profile"));
257   } else if (strcasecmp(name, "script") == 0) {
258     output("script", NULL, pairlist_get(attribs, "src"));
259   } else if (strcasecmp(name, "body") == 0) {
260     output("body", NULL, pairlist_get(attribs, "background"));
261   } else if (strcasecmp(name, "video") == 0) {
262     output("video", NULL, pairlist_get(attribs, "src"));
263   } else if (strcasecmp(name, "audio") == 0) {
264     output("audio", NULL, pairlist_get(attribs, "src"));
265   } else if (strcasecmp(name, "source") == 0) {
266     output("source", "srcset", pairlist_get(attribs, "srcset"));
267     output("source", "src", pairlist_get(attribs, "src"));
268   }
269 
270   /* Free memory */
271   pairlist_delete(attribs);
272   free(name);
273 }
274 
275 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)276 void handle_emptytag(void *clientdata, string name, pairlist attribs)
277 {
278   handle_starttag(clientdata, name, attribs);
279 }
280 
281 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)282 void handle_endtag(void *clientdata, string name)
283 {
284   free(name);
285 }
286 
287 /* --------------------------------------------------------------------- */
288 
289 
290 /* usage -- print usage message and exit */
usage(string progname)291 static void usage(string progname)
292 {
293   fprintf(stderr,
294 	  "Version %s\nUsage: %s [-l] [-r] [-h] [-b base] [-t] [-a] [HTML-file]\n",
295 	  VERSION, progname);
296   exit(1);
297 }
298 
299 
main(int argc,char * argv[])300 int main(int argc, char *argv[])
301 {
302   int c, status = 200;
303 
304   /* Bind the parser callback routines to our handlers */
305   set_error_handler(handle_error);
306   set_start_handler(start);
307   set_end_handler(end);
308   set_comment_handler(handle_comment);
309   set_text_handler(handle_text);
310   set_decl_handler(handle_decl);
311   set_pi_handler(handle_pi);
312   set_starttag_handler(handle_starttag);
313   set_emptytag_handler(handle_emptytag);
314   set_endtag_handler(handle_endtag);
315 
316   /* Parse command line arguments */
317   while ((c = getopt(argc, argv, "lb:rhta")) != -1) {
318     switch (c) {
319       case 'l': format = Long; break;		/* Long listing */
320       case 'b': base = strdup(optarg); break;	/* Set base of URL */
321       case 'r': relative = true; break;		/* Do not make URLs absolute */
322       case 'h': format = HTML; break;		/* Output in HTML format */
323       case 't': format = Tuple; break;		/* Output as 4-tuples */
324       case 'a': ascii = true; break;		/* Convert IRIs to URLs */
325       default: usage(argv[0]);
326     }
327   }
328 
329   if (optind == argc) {
330     yyin = stdin;
331     self = "-";
332   } else if (optind == argc - 1) {
333     if (!base) base = strdup(argv[optind]);
334     if (eq(argv[optind], "-")) yyin = stdin;
335     else yyin = fopenurl(argv[optind], "r", &status);
336     self = argv[optind];
337   } else {
338     usage(argv[0]);
339   }
340 
341   if (yyin == NULL) {perror(argv[optind]); exit(1);}
342   if (status != 200) errexit("%s : %s\n", argv[optind], http_strerror(status));
343 
344   if (yyparse() != 0) exit(3);
345 
346   if (base) free(base);
347 
348   return has_error ? 1 : 0;
349 }
350