1 /* Make links relative after mirroring.
2 * For example, mirroring http://a.b/c/d.html gets "a.b/c/d.html",
3 * but the links in d.html need to be adjusted, so that in d.html,
4 * "/foo" becomes "../foo" */
5
6 #include <stralloc.h>
7 #include <buffer.h>
8 #include <errmsg.h>
9 #include <fmt.h>
10 #include <str.h>
11 #include <ctype.h>
12 #include <byte.h>
13 #include <scan.h>
14 #include <case.h>
15 #include <sys/stat.h>
16 #include <unistd.h>
17 #include <fcntl.h>
18 #include <sys/mman.h>
19 #include <utime.h>
20 #include <string.h>
21 #include "havealloca.h"
22
/* Turn the link stored in url into an absolute URL by resolving it
 * against baseurl, the URL of the document the link was found in.
 * For the comments, assume baseurl is "http://www.fefe.de/x/y.html".
 *
 * url is overwritten with the result.  The result is taken up to the
 * first NUL byte and is stored without a terminating NUL.
 * Returns 0 if memory ran out, otherwise the (nonzero) result of
 * stralloc_copys. */
static int canonicalize(stralloc* url,const char* baseurl) {
  size_t l=0;	/* how many leading bytes of baseurl go in front of url */
  char* dest;

  /* NUL terminate url so it can be used as a C string below.  From here
   * on url->len counts that NUL, so copying url->len bytes (not
   * url->len+1) moves the string including its terminator. */
  if (stralloc_0(url)==0) return 0;

  if (url->s[0]=='#') {
    /* "#bar" -> "http://www.fefe.de/x/y.html#bar" */
    l=str_chr(baseurl,'#');
  } else if (url->s[0]=='?') {
    /* "?bar" -> "http://www.fefe.de/x/y.html?bar" */
    while (baseurl[l] && baseurl[l]!='?' && baseurl[l]!='#') ++l;
  } else if (url->s[0]=='/') {
    l=str_chr(baseurl,':');
    if (url->s[1]=='/') {
      /* "//fnord.fefe.de/bla.html" -> "http://fnord.fefe.de/bla.html" */
      if (baseurl[l]==':') ++l;
    } else {
      /* "/bla.html" -> "http://www.fefe.de/bla.html" */
      if (baseurl[l]==':' && baseurl[l+1]=='/' && baseurl[l+2]=='/')
	l+=3;
      l+=str_chr(baseurl+l,'/');
    }
  } else if (strstr(url->s,"://")) {
    /* "http://foo/bar" -> "http://foo/bar" */
    l=0;
  } else {
    /* "z.html" -> "http://www.fefe.de/x/z.html"
     * Keep everything up to and including the last '/' of the path; a
     * '/' inside the query or fragment of baseurl does not count. */
    size_t k;
    for (k=0; baseurl[k] && baseurl[k]!='?' && baseurl[k]!='#'; ++k)
      if (baseurl[k]=='/') l=k+1;
  }

  /* l never exceeds strlen(baseurl), so this is exactly large enough.
   * NOTE(review): the size depends on the input; a huge link would use
   * a correspondingly large amount of stack. */
  dest=alloca(l+url->len);
  byte_copy(dest,l,baseurl);
  byte_copy(dest+l,url->len,url->s);
  return stralloc_copys(url,dest);
}
72
/* Map the whole file called filename into memory, read-only, and store
 * its fstat() data in *ss (so ss->st_size is the length of the mapping).
 * The file descriptor is closed again before returning.
 * Returns the start of the mapping, or 0 if open, fstat or mmap failed. */
static char* mmap_read_stat(const char* filename,struct stat* ss) {
  void* region=MAP_FAILED;
  int handle=open(filename,O_RDONLY);

  if (handle<0) return 0;
  if (fstat(handle,ss)==0)
    region=mmap(0,ss->st_size,PROT_READ,MAP_SHARED,handle,0);
  close(handle);

  if (region==MAP_FAILED) return 0;
  return region;
}
87
/* Check whether sa starts with the opening tag named in, i.e. with '<',
 * then the name (compared with case_equalb, presumably ignoring case),
 * then a blank, tab, newline, carriage return or '>'.
 * Returns 1 if so, 0 otherwise. */
static int stralloc_istag(stralloc* sa,const char* in) {
  size_t namelen=strlen(in);

  /* need room for '<', the name and the delimiter after it */
  if (sa->len<namelen+2) return 0;
  if (sa->s[0]!='<') return 0;
  if (!case_equalb(sa->s+1,namelen,in)) return 0;

  switch (sa->s[1+namelen]) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
  case '>':
    return 1;
  default:
    return 0;
  }
}
101
/* Find the attribute called wanted in the HTML tag held in tag and split
 * the tag around the attribute's value:
 *   before  everything up to and including the '=' after the name
 *   arg     the value, without surrounding double quotes
 *   after   everything behind the value (and behind its closing quote)
 * so that before + new value + after gives the tag with the value
 * replaced.  Only double quotes are treated as quoting; an unquoted
 * value ends at whitespace or '>'.
 * Returns 1 if the attribute was found, 0 if it was not found or if
 * memory ran out. */
static int extractparam(stralloc* tag,const char* wanted,stralloc* before,stralloc* arg,stralloc* after) {
  size_t l=strlen(wanted);
  char* x,* max,* y;
  /* shortest possible match is "<t " + wanted + "=" */
  if (tag->len<l+4) return 0;

  max=tag->s+tag->len;
  x=tag->s;
  if (*x != '<') return 0;
  ++x;
  /* skip the tag name and the whitespace after it; the casts keep
   * isspace() defined for bytes >= 0x80 where char is signed */
  for (; x<max && !isspace((unsigned char)*x); ++x) ;
  for (; x<max && isspace((unsigned char)*x); ++x) ;
  /* x<max-l guarantees that x[l] is still inside the tag */
  for (; x<max-l; ++x) {
    if (case_equalb(x,l,wanted) && x[l]=='=') {
      x+=l+1;
      if (stralloc_copyb(before,tag->s,x-tag->s)==0) return 0;
      if (x<max && *x=='"') {
	++x;
	y=x;
	for (; x<max && *x!='"'; ++x) ;
	if (stralloc_copyb(arg,y,x-y)==0) return 0;
	/* step over the closing quote, but only if there is one;
	 * otherwise x would end up behind max and max-x below would
	 * turn into a huge length */
	if (x<max) ++x;
      } else {
	y=x;
	for (; x<max && !isspace((unsigned char)*x) && *x!='>'; ++x) ;
	if (stralloc_copyb(arg,y,x-y)==0) return 0;
      }
      if (stralloc_copyb(after,x,max-x)==0) return 0;
      return 1;
    }
  }
  return 0;
}
135
/* Return the part of url behind a leading "scheme://" (and behind any
 * further slashes directly after it).  If url does not start with a
 * scheme, url itself is returned. */
static const char* skip_scheme(const char* url) {
  size_t i=0;
  while (url[i] && url[i]!=':' && url[i]!='/') ++i;
  if (url[i]==':' && url[i+1]=='/' && url[i+2]=='/') {
    url+=i+3;
    while (*url=='/') ++url;
  }
  return url;
}

/* If tag is an <a>, <link>, <img> or <frame> tag whose href/src
 * attribute is an absolute link ("/...", "http://..." or "https://..."),
 * and the link target exists as a file below the current directory
 * (as "host/path"), rewrite the attribute in tag into a link relative
 * to the document at baseurl.
 * Returns -1 if memory ran out, 0 otherwise (whether or not tag was
 * changed). */
static int mangleurl(stralloc* tag,const char* baseurl) {
  const char* x;
  const char* y;
  const char* attr;
  static stralloc before,arg,after,tmp;
  struct stat ss;
  size_t i;
  int needquote;

  if (stralloc_istag(tag,"a") || stralloc_istag(tag,"link"))
    attr="href";
  else if (stralloc_istag(tag,"img") || stralloc_istag(tag,"frame"))
    attr="src";
  else
    return 0;
  if (!extractparam(tag,attr,&before,&arg,&after)) return 0;

  if (!stralloc_starts(&arg,"/") &&
      !stralloc_starts(&arg,"http://") &&
      !stralloc_starts(&arg,"https://"))
    return 0;	/* url was already relative */
  if (canonicalize(&arg,baseurl)==0) return -1;

  /* NUL terminate arg.s without counting the NUL in arg.len */
  if (stralloc_0(&arg)==0) return -1;
  stralloc_chop(&arg);
  x=skip_scheme(arg.s);
  y=skip_scheme(baseurl);

  /* now x is something like
   *   "www.spiegel.de/img/0,1020,525770,00.jpg"
   * and y is something like
   *   "www.spiegel.de/panorama/0,1518,378421,00.html"
   * and we want to change x into "../img/0,1020,525770,00.jpg" */

  /* leave links alone whose target was not mirrored */
  if (stat(x,&ss)!=0) return 0;

  /* Strip the directories both paths have in common.  A component only
   * counts as a directory if it is followed by '/' in both paths;
   * otherwise it is the final name, and stepping over it would run
   * past the end of the string. */
  for (;;) {
    size_t j;
    i=str_chr(x,'/');
    j=str_chr(y,'/');
    if (i>0 && i==j && x[i]=='/' && y[j]=='/' && byte_equal(x,i,y)) {
      x+=i+1;
      y+=i+1;
      while (*x=='/') ++x;
      while (*y=='/') ++y;
    } else
      break;
  }

  /* one "../" for each directory left in the path of the document */
  stralloc_zero(&tmp);
  for (;;) {
    i=str_chr(y,'/');
    if (y[i]=='/') {
      y+=i+1;
      while (*y=='/') ++y;
      if (stralloc_cats(&tmp,"../")==0) return -1;
    } else
      break;
  }

  /* put the value in double quotes unless it consists only of
   * alphanumerics, '/', '_' and '.' */
  needquote=0;
  for (i=0; x[i]; ++i)
    if (!isalnum((unsigned char)x[i]) && x[i]!='/' && x[i]!='_' && x[i]!='.')
      needquote=1;
  if (needquote && stralloc_cats(&before,"\"")==0) return -1;
  if (stralloc_cat(&before,&tmp)==0 ||
      stralloc_cats(&before,x)==0) return -1;
  if (needquote && stralloc_cats(&before,"\"")==0) return -1;

  if (stralloc_cat(&before,&after)==0) return -1;
  if (stralloc_copy(tag,&before)==0) return -1;
  return 0;
}
206
207 /* usage: rellink "http://www.nytimes.com/2005/10/06/international/middleeast/06cnd-prexy.html?ex=1129262400&en=30e300dafe83d0fc&ei=5065&partner=MYWAY" downloaded-data.html */
main(int argc,char * argv[])208 int main(int argc,char* argv[]) {
209 char* baseurl;
210 char* map,* max,* x;
211 struct stat ss;
212 static stralloc sa;
213 if (argc!=3)
214 die(0,"usage: rellink http://base/url downloaded-data.html");
215 errmsg_iam("rellink");
216 baseurl=argv[1];
217
218 map=mmap_read_stat(argv[2],&ss);
219 if (map==0)
220 diesys(111,"open \"",argv[2],"\" failed");
221
222 max=map+ss.st_size;
223 for (x=map; x<max; ) {
224 stralloc tag;
225 /* copy non-tag */
226 for (; x<max && *x!='<'; ++x)
227 if (stralloc_append(&sa,x)==0)
228 nomem:
229 die(111,"out of memory");
230
231 if (x>=max) break;
232 stralloc_copys(&tag,"");
233
234 {
235 int indq,insq,ok;
236 indq=insq=ok=0;
237 for (; x<max; ++x) {
238 if (*x == '\'') insq^=1; else
239 if (*x == '"') indq^=1;
240 if (stralloc_append(&tag,x)==0) goto nomem;
241 if (*x == '>' && !insq && !indq) { ok=1; ++x; break; }
242 }
243 if (ok)
244 if (mangleurl(&tag,baseurl)==-1) goto nomem;
245 }
246 if (stralloc_cat(&sa,&tag)==0) goto nomem;
247 }
248 if (sa.len == ss.st_size && byte_equal(sa.s,ss.st_size,map)) return 0;
249 munmap(map,ss.st_size);
250 {
251 struct utimbuf utb;
252 int fd=open(argv[2],O_WRONLY|O_TRUNC,0600);
253 if (fd==-1) die(111,"open(\"",argv[2],"\")");
254 write(fd,sa.s,sa.len);
255 close(fd);
256 utb.actime=ss.st_atime;
257 utb.modtime=ss.st_mtime;
258 utime(argv[2],&utb);
259 }
260 return 0;
261 }
262