1 /*
2 * List all links from the given document.
3 *
4 * Copyright © 1994-2000 World Wide Web Consortium
5 * See http://www.w3.org/Consortium/Legal/copyright-software
6 *
7 * Bert Bos <bert@w3.org>
8 * Created 31 July 1999
9 * $Id: hxwls.c,v 1.13 2019/08/28 19:14:34 bbos Exp $
10 */
11 #include "config.h"
12 #include <assert.h>
13 #ifdef HAVE_UNISTD_H
14 # include <unistd.h>
15 #endif
16 #include <ctype.h>
17 #include <stdlib.h>
18 #include <stdio.h>
19 #include <stdbool.h>
20 #if STDC_HEADERS
21 # include <string.h>
22 #else
23 # ifndef HAVE_STRCHR
24 # define strchr index
25 # define strrchr rindex
26 # endif
27 # ifndef HAVE_STRDUP
28 # include "strdup.e"
29 # endif
30 #endif
31 #include "export.h"
32 #include "heap.e"
33 #include "types.e"
34 #include "html.e"
35 #include "scan.e"
36 #include "dict.e"
37 #include "openurl.e"
38 #include "url.e"
39 #include "errexit.e"
40 #include "unent.e"
41
42 static bool has_error = false;
43 static string base = NULL;
44 static string self;
45 static enum {Short, Long, HTML, Tuple} format = Short; /* Option -l -h -t */
46 static bool relative = false; /* Option -r */
47 static bool ascii = false; /* Option -a */
48
49
50 /* append_utf8 -- add UTF-8 bytes for code n at s, return end of string */
append_utf8(string s,const unsigned long n)51 static string append_utf8(string s, const unsigned long n)
52 {
53 /* We assume s is long enough */
54 if (n <= 0x7F) {
55 *(s++) = n;
56 } else if (n <= 0x7FF) {
57 *(s++) = 0xC0 | (n >> 6);
58 *(s++) = 0x80 | (n & 0x3F);
59 } else if (n <= 0xFFFF) {
60 *(s++) = 0xE0 | (n >> 12);
61 *(s++) = 0x80 | ((n >> 6) & 0x3F);
62 *(s++) = 0x80 | (n & 0x3F);
63 } else if (n <= 0x1FFFFF) {
64 *(s++) = 0xF0 | (n >> 18);
65 *(s++) = 0x80 | ((n >> 12) & 0x3F);
66 *(s++) = 0x80 | ((n >> 6) & 0x3F);
67 *(s++) = 0x80 | (n & 0x3F);
68 } else if (n <= 0x3FFFFFF) {
69 *(s++) = 0xF0 | (n >> 24);
70 *(s++) = 0x80 | ((n >> 18) & 0x3F);
71 *(s++) = 0x80 | ((n >> 12) & 0x3F);
72 *(s++) = 0x80 | ((n >> 6) & 0x3F);
73 *(s++) = 0x80 | (n & 0x3F);
74 } else {
75 *(s++) = 0xF0 | (n >> 30);
76 *(s++) = 0x80 | ((n >> 24) & 0x3F);
77 *(s++) = 0x80 | ((n >> 18) & 0x3F);
78 *(s++) = 0x80 | ((n >> 12) & 0x3F);
79 *(s++) = 0x80 | ((n >> 6) & 0x3F);
80 *(s++) = 0x80 | (n & 0x3F);
81 }
82 return s;
83 }
84
85
86 /* output -- print the link (lowercases rel argument) */
output(const conststring type,const conststring rel,conststring url)87 static void output(const conststring type, const conststring rel,
88 conststring url)
89 {
90 string h = NULL, q, r, rel1;
91 conststring p, s;
92 const struct _Entity *e;
93 unsigned long c;
94
95 if (url) { /* If we found a URL */
96
97 /* Replace entities. */
98 h = newnstring(url, 2 * strlen(url)); /* Reserve sufficient space */
99 for (p = url, q = h; *p; p++) {
100 if (*p != '&') {
101 *(q++) = *p;
102 } else if (*(p+1) == '#') { /* Numeric entity */
103 if (*(p+2) == 'x') c = strtoul(p + 3, &r, 16);
104 else c = strtoul(p + 2, &r, 10);
105 if (c > 0 && c <= 2147483647) q = append_utf8(q, c);
106 p = *r == ';' ? r : r - 1;
107 } else { /* Entity */
108 for (s = p + 1; isalnum(*s); s++);
109 if (!(e = lookup_entity(p+1, s - (p+1)))) *(q++) = '&'; /* Unknown */
110 else {q = append_utf8(q, e->code); p = *s == ';' ? s : s -1;}
111 }
112 }
113 *q = '\0';
114 url = h;
115
116 /* Make URL absolute */
117 if (! relative && base) {
118 h = URL_s_absolutize(base, url);
119 dispose(url);
120 url = h;
121 }
122 /* Convert IRI to URL, if requested */
123 if (ascii) {
124 h = URL_s_to_ascii(url);
125 dispose(url);
126 url = h;
127 }
128 rel1 = newstring(rel ? rel : "");
129 down(rel1);
130 switch (format) {
131 case HTML:
132 printf("<li><a class=\"%s\" rel=\"%s\" href=\"%s\">%s</a></li>\n",
133 type, rel1, url, url);
134 break;
135 case Long:
136 printf("%s\t%s\t%s\n", type, rel1, url);
137 break;
138 case Short:
139 printf("%s\n", url);
140 break;
141 case Tuple:
142 printf("%s\t%s\t%s\t%s\n", self, type, rel1, url);
143 break;
144 default:
145 assert(!"Cannot happen!");
146 }
147 free(rel1);
148 free(h);
149 }
150 }
151
152
153 /* --------------- implements parser interface api------------------------- */
154
155 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)156 void handle_error(void *clientdata, const string s, int lineno)
157 {
158 fprintf(stderr, "%d: %s\n", lineno, s);
159 has_error = true;
160 }
161
162 /* start -- called before the first event is reported */
start(void)163 void* start(void)
164 {
165 if (format == HTML) {
166 printf("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"\n");
167 printf(" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n");
168 printf("<html>\n");
169 printf("<head><title>Output of listlinks</title></head>\n");
170 printf("<body>\n");
171 printf("<ol>\n");
172 }
173 return NULL;
174 }
175
176 /* end -- called after the last event is reported */
end(void * clientdata)177 void end(void *clientdata)
178 {
179 if (format == HTML) {
180 printf("</ol>\n");
181 printf("</body>\n");
182 printf("</html>\n");
183 }
184 }
185
186 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)187 void handle_comment(void *clientdata, string commenttext)
188 {
189 free(commenttext);
190 }
191
192 /* handle_text -- called after a text chunk is parsed */
handle_text(void * clientdata,string text)193 void handle_text(void *clientdata, string text)
194 {
195 /* There may be several consecutive calls to this routine. */
196 /* escape(text); */
197 free(text);
198 }
199
200 /* handle_decl -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)201 void handle_decl(void *clientdata, string gi, string fpi, string url)
202 {
203 /* skip */
204 if (gi) free(gi);
205 if (fpi) free(fpi);
206 if (url) free(url);
207 }
208
209 /* handle_pi -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)210 void handle_pi(void *clientdata, string pi_text)
211 {
212 if (pi_text) free(pi_text);
213 }
214
215 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)216 void handle_starttag(void *clientdata, string name, pairlist attribs)
217 {
218 /* ToDo: print text of anchor, if available */
219 conststring h;
220
221 if (strcasecmp(name, "base") == 0) {
222 h = pairlist_get(attribs, "href");
223 if (h) base = strdup(h); /* Use as base from now on */
224 output("base", NULL, h);
225 } else if (strcasecmp(name, "link") == 0) {
226 output("link", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
227 } else if (strcasecmp(name, "a") == 0) {
228 output("a", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
229 } else if (strcasecmp(name, "img") == 0) {
230 output("img", NULL, pairlist_get(attribs, "src"));
231 output("img", "longdesc", pairlist_get(attribs, "longdesc"));
232 output("img", "srcset", pairlist_get(attribs, "srcset"));
233 } else if (strcasecmp(name, "input") == 0) {
234 output("input", "src", pairlist_get(attribs, "src"));
235 } else if (strcasecmp(name, "object") == 0) {
236 output("object", NULL, pairlist_get(attribs, "data"));
237 output("object", "classid", pairlist_get(attribs, "classid"));
238 output("object", "codebase", pairlist_get(attribs, "codebase"));
239 } else if (strcasecmp(name, "area") == 0) {
240 output("area", pairlist_get(attribs, "rel"), pairlist_get(attribs, "href"));
241 } else if (strcasecmp(name, "ins") == 0) {
242 output("ins", NULL, pairlist_get(attribs, "cite"));
243 } else if (strcasecmp(name, "del") == 0) {
244 output("del", NULL, pairlist_get(attribs, "cite"));
245 } else if (strcasecmp(name, "q") == 0) {
246 output("q", NULL, pairlist_get(attribs, "cite"));
247 } else if (strcasecmp(name, "blockquote") == 0) {
248 output("bq", NULL, pairlist_get(attribs, "cite"));
249 } else if (strcasecmp(name, "form") == 0) {
250 output("form", pairlist_get(attribs, "method"), pairlist_get(attribs, "action"));
251 } else if (strcasecmp(name, "frame") == 0) {
252 output("frame", NULL, pairlist_get(attribs, "src"));
253 } else if (strcasecmp(name, "iframe") == 0) {
254 output("iframe", NULL, pairlist_get(attribs, "src"));
255 } else if (strcasecmp(name, "head") == 0) {
256 output("head", NULL, pairlist_get(attribs, "profile"));
257 } else if (strcasecmp(name, "script") == 0) {
258 output("script", NULL, pairlist_get(attribs, "src"));
259 } else if (strcasecmp(name, "body") == 0) {
260 output("body", NULL, pairlist_get(attribs, "background"));
261 } else if (strcasecmp(name, "video") == 0) {
262 output("video", NULL, pairlist_get(attribs, "src"));
263 } else if (strcasecmp(name, "audio") == 0) {
264 output("audio", NULL, pairlist_get(attribs, "src"));
265 } else if (strcasecmp(name, "source") == 0) {
266 output("source", "srcset", pairlist_get(attribs, "srcset"));
267 output("source", "src", pairlist_get(attribs, "src"));
268 }
269
270 /* Free memory */
271 pairlist_delete(attribs);
272 free(name);
273 }
274
275 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)276 void handle_emptytag(void *clientdata, string name, pairlist attribs)
277 {
278 handle_starttag(clientdata, name, attribs);
279 }
280
281 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)282 void handle_endtag(void *clientdata, string name)
283 {
284 free(name);
285 }
286
287 /* --------------------------------------------------------------------- */
288
289
290 /* usage -- print usage message and exit */
usage(string progname)291 static void usage(string progname)
292 {
293 fprintf(stderr,
294 "Version %s\nUsage: %s [-l] [-r] [-h] [-b base] [-t] [-a] [HTML-file]\n",
295 VERSION, progname);
296 exit(1);
297 }
298
299
main(int argc,char * argv[])300 int main(int argc, char *argv[])
301 {
302 int c, status = 200;
303
304 /* Bind the parser callback routines to our handlers */
305 set_error_handler(handle_error);
306 set_start_handler(start);
307 set_end_handler(end);
308 set_comment_handler(handle_comment);
309 set_text_handler(handle_text);
310 set_decl_handler(handle_decl);
311 set_pi_handler(handle_pi);
312 set_starttag_handler(handle_starttag);
313 set_emptytag_handler(handle_emptytag);
314 set_endtag_handler(handle_endtag);
315
316 /* Parse command line arguments */
317 while ((c = getopt(argc, argv, "lb:rhta")) != -1) {
318 switch (c) {
319 case 'l': format = Long; break; /* Long listing */
320 case 'b': base = strdup(optarg); break; /* Set base of URL */
321 case 'r': relative = true; break; /* Do not make URLs absolute */
322 case 'h': format = HTML; break; /* Output in HTML format */
323 case 't': format = Tuple; break; /* Output as 4-tuples */
324 case 'a': ascii = true; break; /* Convert IRIs to URLs */
325 default: usage(argv[0]);
326 }
327 }
328
329 if (optind == argc) {
330 yyin = stdin;
331 self = "-";
332 } else if (optind == argc - 1) {
333 if (!base) base = strdup(argv[optind]);
334 if (eq(argv[optind], "-")) yyin = stdin;
335 else yyin = fopenurl(argv[optind], "r", &status);
336 self = argv[optind];
337 } else {
338 usage(argv[0]);
339 }
340
341 if (yyin == NULL) {perror(argv[optind]); exit(1);}
342 if (status != 200) errexit("%s : %s\n", argv[optind], http_strerror(status));
343
344 if (yyparse() != 0) exit(3);
345
346 if (base) free(base);
347
348 return has_error ? 1 : 0;
349 }
350