1 /* hxcopy -- copy an HTML file and update relative URLs at the same time
2  *
3  * Copy an HTML file with all URLs that were relative to OLDURL
4  * updated to be relative to NEWURL instead. (If the document has a
5  * BASE element, only that is updated.) OLDURL and NEWURL may
6  * themselves be relative (to the same base URL, which need not be
7  * mentioned).
8  *
9  * Part of HTML-XML-utils, see:
10  * http://www.w3.org/Tools/HTML-XML-utils/
11  *
12  * TO DO: Should it be an option whether URL references of the form
13  * "", "#foo" and "?bar" are replaced by "oldurl", "oldurl#foo" and
14  * "oldurl?bar"? (See adjust_url().)
15  *
16  * Created: 5 Dec 2008
17  * Author: Bert Bos <bert@w3.org>
18  *
19  * Copyright © 2008-2012 W3C
20  * See http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
21  */
22 
23 #include "config.h"
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <assert.h>
27 #include <unistd.h>
28 #include <errno.h>
29 #include <stdbool.h>
30 #if HAVE_STRING_H
31 #  include <string.h>
32 #endif
33 #if HAVE_STRINGS_H
34 #  include <strings.h>
35 #endif
36 #include "export.h"
37 #include "heap.e"
38 #include "types.e"
39 #include "html.e"
40 #include "scan.e"
41 #include "url.e"
42 #include "dict.e"
43 #include "openurl.e"
44 #include "errexit.e"
45 
46 #define same(a, b) ((a) ? ((b) && eq((a), (b))) : !(b))
47 
48 static bool has_errors = false;		/* Enconutered errors during parsing */
49 static FILE *out = NULL;		/* Where to write output */
50 static bool has_base = false;		/* Document has a <BASE> element */
51 static string newbase;			/* Path from OLDURL to NEWURL */
52 static bool replace_self = false;	/* Change link to self in link to old */
53 
54 
55 /* path_from_url_to_url -- compute URL that is path from one URL to another */
path_from_url_to_url(const conststring a,const conststring b)56 static string path_from_url_to_url(const conststring a, const conststring b)
57 {
58   URL p, q;
59   string s = NULL;
60   char cwd[4096];
61   int i, j;
62 
63   if (!getcwd(cwd, sizeof(cwd) - 1)) return NULL; /* To do: handle long path */
64   strcat(cwd, "/");
65   s = URL_s_absolutize(cwd, a); p = URL_new(s); dispose(s);
66   s = URL_s_absolutize(cwd, b); q = URL_new(s); dispose(s);
67   if (p->proto && !q->proto) {
68     errno = EACCES;		/* Path from remote to local not possible */
69   } else if (!same(p->proto, q->proto) ||
70       !same(p->user, q->user) ||
71       !same(p->password, q->password) ||
72       !same(p->machine, q->machine) ||
73       !same(p->port, q->port)) {
74     s = newstring(b);		/* Just use the URL b */
75   } else {
76     /* Find the last '/' before which both paths are the same */
77     for (j = i = 0; p->path[i] && q->path[i] && p->path[i] == q->path[i]; i++)
78       if (p->path[i] == '/') j = i;
79 
80     /* Construct path from a to b by descending a and climbing b */
81     for (i = j + 1; p->path[i]; i++)
82       if (p->path[i] == '/') strapp(&s, "../", NULL);
83     strapp(&s, q->path + j + 1, NULL);
84   }
85   URL_dispose(p);
86   URL_dispose(q);
87   return s;
88 }
89 
90 
91 /* adjust_url -- return a new URL relative to newurl instead of oldurl */
adjust_url(const conststring url)92 static conststring adjust_url(const conststring url)
93 {
94   if (!replace_self && (!url || !url[0] || url[0] == '#' || url[0] == '?'))
95     return url;			/* Don't replace references to self */
96   else
97     return URL_s_absolutize(newbase, url);
98 }
99 
100 
101 /* attribute_is_url -- check if the attribute is URL-valued */
attribute_is_url(const conststring attrib)102 static bool attribute_is_url(const conststring attrib)
103 {
104   return strcasecmp(attrib, "href") == 0 ||
105     strcasecmp(attrib, "src") == 0 ||
106     strcasecmp(attrib, "action") == 0 ||
107     strcasecmp(attrib, "background") == 0 ||
108     strcasecmp(attrib, "cite") == 0 ||
109     strcasecmp(attrib, "classid") == 0 ||
110     strcasecmp(attrib, "codebase") == 0 ||
111     strcasecmp(attrib, "data") == 0 ||
112     strcasecmp(attrib, "longdesc") == 0 ||
113     strcasecmp(attrib, "profile") == 0 ||
114     strcasecmp(attrib, "usemap") == 0;
115 }
116 
117 
118 /* handle_error -- called when a parse error occurred */
handle_error(void * clientdata,const string s,int lineno)119 void handle_error(void *clientdata, const string s, int lineno)
120 {
121   fprintf(stderr, "%d: %s\n", lineno, s);
122   has_errors = true;
123 }
124 
125 
/* start -- called before the first event is reported
 *
 * This program keeps no per-parse state, so the value returned (and
 * received by the other handlers as clientdata) is always NULL.
 */
void *start(void)
{
  void *no_clientdata = NULL;

  return no_clientdata;
}
131 
132 
/* end -- called after the last event is reported
 *
 * Nothing needs to be done here; clientdata is ignored.
 */
void end(void *clientdata)
{
  (void) clientdata;		/* Deliberately unused */
}
138 
139 
140 /* handle_comment -- called after a comment is parsed */
handle_comment(void * clientdata,string commenttext)141 void handle_comment(void *clientdata, string commenttext)
142 {
143   fprintf(out, "<!--%s-->", commenttext);
144 }
145 
146 
147 /* handle_text -- called after a text chunk is parsed */
handle_text(void * clientdata,string text)148 void handle_text(void *clientdata, string text)
149 {
150   fprintf(out, "%s", text);
151 }
152 
153 
154 /* handle_decl -- called after a declaration is parsed */
handle_decl(void * clientdata,string gi,string fpi,string url)155 void handle_decl(void *clientdata, string gi,
156 		 string fpi, string url)
157 {
158   fprintf(out, "<!DOCTYPE %s", gi);
159   if (fpi) fprintf(out, " PUBLIC \"%s\"", fpi);
160   if (url) fprintf(out, " %s\"%s\"", fpi ? "" : "SYSTEM ", url);
161   fprintf(out, ">");
162 }
163 
164 
165 /* handle_pi -- called after a PI is parsed */
handle_pi(void * clientdata,string pi_text)166 void handle_pi(void *clientdata, string pi_text)
167 {
168   fprintf(out, "<?%s>", pi_text);
169 }
170 
171 
172 /* handle_starttag -- called after a start tag is parsed */
handle_starttag(void * clientdata,string name,pairlist attribs)173 void handle_starttag(void *clientdata, string name, pairlist attribs)
174 {
175   conststring v;
176   pairlist p;
177 
178   fprintf(out, "<%s", name);
179   for (p = attribs; p; p = p->next) {
180     fprintf(out, " %s", p->name);
181     if (!p->value) v = NULL;
182     else if (has_base) v = newstring(p->value);	/* No need to adjust */
183     else if (attribute_is_url(p->name)) v = adjust_url(p->value);
184     else v = newstring(p->value);		/* No need to adjust */
185     if (v) fprintf(out, "=\"%s\"", v);
186     dispose(v);
187   }
188   fprintf(out, ">");
189 
190   /* If this is a <BASE> tag, no further adjustments are needed */
191   if (strcasecmp(name, "base") == 0) has_base = true;
192 }
193 
194 
195 /* handle_emptytag -- called after an empty tag is parsed */
handle_emptytag(void * clientdata,string name,pairlist attribs)196 void handle_emptytag(void *clientdata, string name, pairlist attribs)
197 {
198   conststring v;
199   pairlist p;
200 
201   fprintf(out, "<%s", name);
202   for (p = attribs; p; p = p->next) {
203     fprintf(out, " %s", p->name);
204     if (!p->value) v = NULL;
205     else if (has_base) v = newstring(p->value);	/* No need to adjust */
206     else if (attribute_is_url(p->name)) v = adjust_url(p->value);
207     else v = newstring(p->value);		/* No need to adjust */
208     if (v) fprintf(out, "=\"%s\"", v);
209     dispose(v);
210   }
211   fprintf(out, " />");
212 
213   /* If this is a <BASE> tag, no further adjustments are needed */
214   if (strcasecmp(name, "base") == 0) has_base = true;
215 }
216 
217 
218 /* handle_endtag -- called after an endtag is parsed (name may be "") */
handle_endtag(void * clientdata,string name)219 void handle_endtag(void *clientdata, string name)
220 {
221   fprintf(out, "</%s>", name);
222 }
223 
224 
225 /* usage -- print usage message and exit */
usage(const conststring progname)226 static void usage(const conststring progname)
227 {
228   fprintf(stderr, "Usage: %s [-v] [-s] [-i old-URL] [-o new-URL] [URL [URL]]\n", progname);
229   exit(1);
230 }
231 
232 
main(int argc,char * argv[])233 int main(int argc, char *argv[])
234 {
235   int c, status = 200;
236   string oldurl = NULL, newurl = NULL;
237 
238   /* Bind the parser callback routines to our handlers */
239   set_error_handler(handle_error);
240   set_start_handler(start);
241   set_end_handler(end);
242   set_comment_handler(handle_comment);
243   set_text_handler(handle_text);
244   set_decl_handler(handle_decl);
245   set_pi_handler(handle_pi);
246   set_starttag_handler(handle_starttag);
247   set_emptytag_handler(handle_emptytag);
248   set_endtag_handler(handle_endtag);
249 
250   /* Parse command line */
251   while ((c = getopt(argc, argv, "i:o:sv")) != -1)
252     switch (c) {
253     case 'o': newurl = optarg; break;
254     case 'i': oldurl = optarg; break;
255     case 's': replace_self = true; break;
256     case 'v': printf("Version: %s %s\n", PACKAGE, VERSION); return 0;
257     default: usage(argv[0]);
258     }
259   if (argc > optind + 2) usage(argv[0]);
260   if (argc > optind + 1) out =  fopenurl(argv[optind+1], "w", NULL);
261   else if (newurl) out = stdout;
262   else errexit("%s: option -o is required if output is to stdout\n", argv[0]);
263   if (!out) {perror(argv[optind+1]); exit(3);}
264   if (argc > optind) yyin = fopenurl(argv[optind], "r", &status);
265   else if (oldurl) yyin = stdin;
266   else errexit("%s: option -i is required if input is from stdin\n", argv[0]);
267   if (!yyin) {perror(argv[optind]); exit(2);}
268   if (status != 200) errexit("%s : %s\n", argv[1], http_strerror(status));
269   if (!oldurl) oldurl = argv[optind];
270   if (!newurl) newurl = argv[optind+1];
271   newbase = path_from_url_to_url(newurl, oldurl);
272   if (!newbase) errexit("%s: could not parse argument as a URL\n", argv[0]);
273   if (yyparse() != 0) exit(4);
274   return has_errors ? 1 : 0;
275 }
276