1 /* hxcopy -- copy an HTML file and update relative URLs at the same time
2 *
3 * Copy an HTML file with all URLs that were relative to OLDURL
4 * updated to be relative to NEWURL instead. (If the document has a
5 * BASE element, only that is updated.) OLDURL and NEWURL may
6 * themselves be relative (to the same base URL, which need not be
7 * mentioned).
8 *
9 * Part of HTML-XML-utils, see:
10 * http://www.w3.org/Tools/HTML-XML-utils/
11 *
12 * TO DO: Should it be an option whether URL references of the form
13 * "", "#foo" and "?bar" are replaced by "oldurl", "oldurl#foo" and
14 * "oldurl?bar"? (See adjust_url().)
15 *
16 * Created: 5 Dec 2008
17 * Author: Bert Bos <bert@w3.org>
18 *
19 * Copyright © 2008-2012 W3C
20 * See http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
21 */
22
23 #include "config.h"
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <assert.h>
27 #include <unistd.h>
28 #include <errno.h>
29 #include <stdbool.h>
30 #if HAVE_STRING_H
31 # include <string.h>
32 #endif
33 #if HAVE_STRINGS_H
34 # include <strings.h>
35 #endif
36 #include "export.h"
37 #include "heap.e"
38 #include "types.e"
39 #include "html.e"
40 #include "scan.e"
41 #include "url.e"
42 #include "dict.e"
43 #include "openurl.e"
44 #include "errexit.e"
45
46 #define same(a, b) ((a) ? ((b) && eq((a), (b))) : !(b))
47
48 static bool has_errors = false; /* Enconutered errors during parsing */
49 static FILE *out = NULL; /* Where to write output */
50 static bool has_base = false; /* Document has a <BASE> element */
51 static string newbase; /* Path from OLDURL to NEWURL */
52 static bool replace_self = false; /* Change link to self in link to old */
53
54
55 /* path_from_url_to_url -- compute URL that is path from one URL to another */
path_from_url_to_url(const conststring a,const conststring b)56 static string path_from_url_to_url(const conststring a, const conststring b)
57 {
58 URL p, q;
59 string s = NULL;
60 char cwd[4096];
61 int i, j;
62
63 if (!getcwd(cwd, sizeof(cwd) - 1)) return NULL; /* To do: handle long path */
64 strcat(cwd, "/");
65 s = URL_s_absolutize(cwd, a); p = URL_new(s); dispose(s);
66 s = URL_s_absolutize(cwd, b); q = URL_new(s); dispose(s);
67 if (p->proto && !q->proto) {
68 errno = EACCES; /* Path from remote to local not possible */
69 } else if (!same(p->proto, q->proto) ||
70 !same(p->user, q->user) ||
71 !same(p->password, q->password) ||
72 !same(p->machine, q->machine) ||
73 !same(p->port, q->port)) {
74 s = newstring(b); /* Just use the URL b */
75 } else {
76 /* Find the last '/' before which both paths are the same */
77 for (j = i = 0; p->path[i] && q->path[i] && p->path[i] == q->path[i]; i++)
78 if (p->path[i] == '/') j = i;
79
80 /* Construct path from a to b by descending a and climbing b */
81 for (i = j + 1; p->path[i]; i++)
82 if (p->path[i] == '/') strapp(&s, "../", NULL);
83 strapp(&s, q->path + j + 1, NULL);
84 }
85 URL_dispose(p);
86 URL_dispose(q);
87 return s;
88 }
89
90
91 /* adjust_url -- return a new URL relative to newurl instead of oldurl */
adjust_url(const conststring url)92 static conststring adjust_url(const conststring url)
93 {
94 if (!replace_self && (!url || !url[0] || url[0] == '#' || url[0] == '?'))
95 return url; /* Don't replace references to self */
96 else
97 return URL_s_absolutize(newbase, url);
98 }
99
100
101 /* attribute_is_url -- check if the attribute is URL-valued */
attribute_is_url(const conststring attrib)102 static bool attribute_is_url(const conststring attrib)
103 {
104 return strcasecmp(attrib, "href") == 0 ||
105 strcasecmp(attrib, "src") == 0 ||
106 strcasecmp(attrib, "action") == 0 ||
107 strcasecmp(attrib, "background") == 0 ||
108 strcasecmp(attrib, "cite") == 0 ||
109 strcasecmp(attrib, "classid") == 0 ||
110 strcasecmp(attrib, "codebase") == 0 ||
111 strcasecmp(attrib, "data") == 0 ||
112 strcasecmp(attrib, "longdesc") == 0 ||
113 strcasecmp(attrib, "profile") == 0 ||
114 strcasecmp(attrib, "usemap") == 0;
115 }
116
117
118 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)119 void handle_error(void *clientdata, const string s, int lineno)
120 {
121 fprintf(stderr, "%d: %s\n", lineno, s);
122 has_errors = true;
123 }
124
125
/* start -- called before the first event is reported
 *
 * Returns NULL: this program keeps no per-parse client data.
 */
void* start(void)
{
  void *clientdata = NULL;

  return clientdata;
}
131
132
/* end -- called after the last event is reported */
void end(void *clientdata)
{
  (void) clientdata;            /* Nothing to clean up */
}
138
139
140 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)141 void handle_comment(void *clientdata, string commenttext)
142 {
143 fprintf(out, "<!--%s-->", commenttext);
144 }
145
146
147 /* handle_text -- called after a text chunk is parsed */
handle_text(void * clientdata,string text)148 void handle_text(void *clientdata, string text)
149 {
150 fprintf(out, "%s", text);
151 }
152
153
154 /* handle_decl -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)155 void handle_decl(void *clientdata, string gi,
156 string fpi, string url)
157 {
158 fprintf(out, "<!DOCTYPE %s", gi);
159 if (fpi) fprintf(out, " PUBLIC \"%s\"", fpi);
160 if (url) fprintf(out, " %s\"%s\"", fpi ? "" : "SYSTEM ", url);
161 fprintf(out, ">");
162 }
163
164
165 /* handle_pi -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)166 void handle_pi(void *clientdata, string pi_text)
167 {
168 fprintf(out, "<?%s>", pi_text);
169 }
170
171
172 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)173 void handle_starttag(void *clientdata, string name, pairlist attribs)
174 {
175 conststring v;
176 pairlist p;
177
178 fprintf(out, "<%s", name);
179 for (p = attribs; p; p = p->next) {
180 fprintf(out, " %s", p->name);
181 if (!p->value) v = NULL;
182 else if (has_base) v = newstring(p->value); /* No need to adjust */
183 else if (attribute_is_url(p->name)) v = adjust_url(p->value);
184 else v = newstring(p->value); /* No need to adjust */
185 if (v) fprintf(out, "=\"%s\"", v);
186 dispose(v);
187 }
188 fprintf(out, ">");
189
190 /* If this is a <BASE> tag, no further adjustments are needed */
191 if (strcasecmp(name, "base") == 0) has_base = true;
192 }
193
194
195 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)196 void handle_emptytag(void *clientdata, string name, pairlist attribs)
197 {
198 conststring v;
199 pairlist p;
200
201 fprintf(out, "<%s", name);
202 for (p = attribs; p; p = p->next) {
203 fprintf(out, " %s", p->name);
204 if (!p->value) v = NULL;
205 else if (has_base) v = newstring(p->value); /* No need to adjust */
206 else if (attribute_is_url(p->name)) v = adjust_url(p->value);
207 else v = newstring(p->value); /* No need to adjust */
208 if (v) fprintf(out, "=\"%s\"", v);
209 dispose(v);
210 }
211 fprintf(out, " />");
212
213 /* If this is a <BASE> tag, no further adjustments are needed */
214 if (strcasecmp(name, "base") == 0) has_base = true;
215 }
216
217
218 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)219 void handle_endtag(void *clientdata, string name)
220 {
221 fprintf(out, "</%s>", name);
222 }
223
224
225 /* usage -- print usage message and exit */
usage(const conststring progname)226 static void usage(const conststring progname)
227 {
228 fprintf(stderr, "Usage: %s [-v] [-s] [-i old-URL] [-o new-URL] [URL [URL]]\n", progname);
229 exit(1);
230 }
231
232
main(int argc,char * argv[])233 int main(int argc, char *argv[])
234 {
235 int c, status = 200;
236 string oldurl = NULL, newurl = NULL;
237
238 /* Bind the parser callback routines to our handlers */
239 set_error_handler(handle_error);
240 set_start_handler(start);
241 set_end_handler(end);
242 set_comment_handler(handle_comment);
243 set_text_handler(handle_text);
244 set_decl_handler(handle_decl);
245 set_pi_handler(handle_pi);
246 set_starttag_handler(handle_starttag);
247 set_emptytag_handler(handle_emptytag);
248 set_endtag_handler(handle_endtag);
249
250 /* Parse command line */
251 while ((c = getopt(argc, argv, "i:o:sv")) != -1)
252 switch (c) {
253 case 'o': newurl = optarg; break;
254 case 'i': oldurl = optarg; break;
255 case 's': replace_self = true; break;
256 case 'v': printf("Version: %s %s\n", PACKAGE, VERSION); return 0;
257 default: usage(argv[0]);
258 }
259 if (argc > optind + 2) usage(argv[0]);
260 if (argc > optind + 1) out = fopenurl(argv[optind+1], "w", NULL);
261 else if (newurl) out = stdout;
262 else errexit("%s: option -o is required if output is to stdout\n", argv[0]);
263 if (!out) {perror(argv[optind+1]); exit(3);}
264 if (argc > optind) yyin = fopenurl(argv[optind], "r", &status);
265 else if (oldurl) yyin = stdin;
266 else errexit("%s: option -i is required if input is from stdin\n", argv[0]);
267 if (!yyin) {perror(argv[optind]); exit(2);}
268 if (status != 200) errexit("%s : %s\n", argv[1], http_strerror(status));
269 if (!oldurl) oldurl = argv[optind];
270 if (!newurl) newurl = argv[optind+1];
271 newbase = path_from_url_to_url(newurl, oldurl);
272 if (!newbase) errexit("%s: could not parse argument as a URL\n", argv[0]);
273 if (yyparse() != 0) exit(4);
274 return has_errors ? 1 : 0;
275 }
276