1 /***************************************************************************/
2 /* This code is part of WWW grabber called pavuk */
3 /* Copyright (c) 1997 - 2001 Stefan Ondrejicka */
4 /* Distributed under GPL 2 or later */
5 /***************************************************************************/
6
7 #include <unistd.h>
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <string.h>
11 #include <sys/types.h>
12 #include <sys/stat.h>
13 #include <fcntl.h>
14 #include <time.h>
15
16 #include "config.h"
17 #include "condition.h"
18 #include "mime.h"
19 #include "robots.h"
20 #include "url.h"
21 #include "tools.h"
22 #include "doc.h"
23 #include "abstract.h"
24 #include "tools.h"
25 #include "mode.h"
26 #include "times.h"
27 #include "errcode.h"
28 #include "gcinfo.h"
29 #include "gui_api.h"
30
/*
 * Per-entry locking helpers.  With HAVE_MT they lock/unlock the mutex
 * stored in the "lock" member of a robots entry; without HAVE_MT they
 * expand to nothing.
 */
#ifdef HAVE_MT
#define LOCK_ROBOTS_ENTRY(ent) mt_pthread_mutex_lock(&(ent)->lock, "robots entry")
#define UNLOCK_ROBOTS_ENTRY(ent) mt_pthread_mutex_unlock(&(ent)->lock, "robots entry")
#else
#define LOCK_ROBOTS_ENTRY(ent)
#define UNLOCK_ROBOTS_ENTRY(ent)
#endif

/* Forward declarations of helpers that are defined further down in this file. */
static char *get_robots(url * urlp);
static void parse_robots(char *, char *, char ***, char ***);

/*
 * NULL-terminated array of per-site robots entries (one per host/port
 * pair); stays NULL until robots_check() adds the first entry.
 */
static robotlim **robots = NULL;
43
/*
 * Find the longest pattern in the NULL-terminated list "pat" that is a
 * prefix of "str".
 *
 * Returns a pointer to the winning list element (not a copy), or NULL
 * when the list is NULL, empty, or contains no prefix of "str".  When
 * several matching patterns have the same length the earliest one wins.
 */
static char *get_max_match(char *str, char **pat)
{
  char *best = NULL;
  int best_len = -1;
  char **cur;

  if(!pat)
    return NULL;

  for(cur = pat; *cur; cur++)
  {
    size_t plen = strlen(*cur);

    /* not a prefix of str -> not a candidate */
    if(strncmp(*cur, str, plen) != 0)
      continue;

    if((int) plen > best_len)
    {
      best = *cur;
      best_len = (int) plen;
    }
  }

  return best;
}
65
66 /***************************************************/
67 /* kontrola ci URL splna podmienky pre WWW robotov */
68 /* FIXME: Translate me! */
69 /***************************************************/
robots_check(url * urlp)70 bool_t robots_check(url * urlp)
71 {
72 char *pom, *mdp, *map;
73 int i = 0;
74 int rv = TRUE;
75 robotlim *tmpr = NULL;
76 int dont_have = TRUE;
77
78 if((urlp->type != URLT_HTTP && urlp->type != URLT_HTTPS)
79 || !cfg.condition.allow_robots)
80 return TRUE;
81
82 LOCK_ROBOTS;
83 if(robots)
84 {
85 while(robots[i] && !(!strcmp(robots[i]->site, urlp->p.http.host) &&
86 (robots[i]->port == urlp->p.http.port)))
87 i++;
88 if(robots[i])
89 dont_have = FALSE;
90 }
91
92 if(dont_have)
93 {
94 tmpr = (robotlim *) _malloc(sizeof(robotlim));
95 tmpr->site = new_string(urlp->p.http.host);
96 tmpr->port = urlp->p.http.port;
97 tmpr->dpat = NULL;
98 tmpr->apat = NULL;
99 #ifdef HAVE_MT
100 pthread_mutex_init(&tmpr->lock, NULL);
101 #endif
102
103 robots = (robotlim **) _realloc(robots, (i + 2) * sizeof(robotlim *));
104 robots[i] = tmpr;
105 robots[i + 1] = NULL;
106
107 /* here is possible to cros enter/leave to critical */
108 /* sections because no chance that anyone else holds */
109 /* lock on robots[i]->lock when it is freshly created */
110 LOCK_ROBOTS_ENTRY(robots[i]);
111 }
112 UNLOCK_ROBOTS;
113
114 if(dont_have)
115 {
116
117 pom = get_robots(urlp);
118 if(pom)
119 {
120 parse_robots("pavuk", pom, &tmpr->dpat, &tmpr->apat);
121 _free(pom);
122 }
123 }
124 else
125 {
126 LOCK_ROBOTS_ENTRY(robots[i]);
127 }
128
129 mdp = get_max_match(urlp->p.http.document, robots[i]->dpat);
130 map = get_max_match(urlp->p.http.document, robots[i]->apat);
131 UNLOCK_ROBOTS_ENTRY(robots[i]);
132
133 if(map && mdp && (strlen(map) >= strlen(mdp)))
134 rv = TRUE;
135 else if(mdp)
136 rv = FALSE;
137
138 return rv;
139 }
140
141 /************************************************/
142 /* prenos suboru "robots.txt" pre dane URL */
143 /* FIXME: Translate me! */
144 /************************************************/
get_robots(url * urlp)145 static char *get_robots(url * urlp)
146 {
147 url *purl = _malloc(sizeof(url));
148 doc docu;
149 int rstat;
150 char *ret = NULL;
151 char *pom;
152 int nredir = 0, nreget = 0;
153 struct stat estat;
154 char *pp;
155 int f;
156 global_connection_info con_info;
157
158 #ifdef I_FACE
159 if(cfg.xi_face)
160 {
161 gui_set_status(gettext("transfering \"robots.txt\""));
162 }
163 #endif
164 xprintf(1, gettext("transfering \"robots.txt\"\n"));
165
166 memset(purl, '\0', sizeof(url));
167 purl->type = urlp->type;
168 purl->parent_url = NULL;
169 purl->status = URL_INLINE_OBJ; /*** required if -store_name option used ***/
170 purl->extension = NULL;
171 purl->local_name = NULL;
172
173 #ifdef HAVE_MT
174 pthread_mutex_init(&purl->lock, NULL);
175 #endif
176
177 #ifdef WITH_TREE
178 #ifdef I_FACE
179 purl->prop = NULL;
180 purl->tree_nfo = NULL;
181 #endif
182 #endif
183
184 purl->level = 0;
185 purl->p.http.user = new_string(urlp->p.http.user);
186 purl->p.http.password = new_string(urlp->p.http.password);
187 purl->p.http.host = new_string(urlp->p.http.host);
188 purl->p.http.port = urlp->p.http.port;
189 purl->p.http.document = new_string("/robots.txt");
190 purl->p.http.anchor_name = NULL;
191 purl->p.http.searchstr = NULL;
192
193 doc_init(&docu, purl);
194 docu.is_robot = TRUE;
195 docu.save_online = FALSE;
196 docu.report_size = FALSE;
197 docu.check_limits = FALSE;
198
199 if(cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)
200 {
201 pp = url_to_filename(purl, TRUE);
202 if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))
203 {
204 docu.dtime = estat.st_mtime;
205 }
206 }
207
208 init_global_connection_data(&con_info);
209
210 while((rstat = doc_download(&docu, TRUE, FALSE)) &&
211 ((nredir < cfg.nredir && docu.errcode == ERR_HTTP_REDIR) ||
212 (nreget < cfg.nreget && docu.errcode == ERR_HTTP_TRUNC)))
213 {
214 if(docu.errcode)
215 report_error(&docu, "robots.txt");
216
217 save_global_connection_data(&con_info, &docu);
218
219 nredir += docu.errcode == ERR_HTTP_REDIR;
220 nreget += docu.errcode == ERR_HTTP_TRUNC;
221
222 if(docu.errcode == ERR_HTTP_REDIR)
223 {
224 purl = docu.doc_url->moved_to;
225 pom = url_to_urlstr(purl, FALSE);
226 xprintf(1, gettext("Hmm: redirecting \"robots.txt\" to %s ???\n"), pom);
227 _free(pom);
228 free_deep_url(docu.doc_url);
229 _free(docu.doc_url) docu.doc_url = purl;
230 }
231
232 _free(docu.contents);
233 _free(docu.mime);
234 _free(docu.type_str);
235
236 doc_remove_lock(&docu);
237
238 if(cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)
239 {
240 pp = url_to_filename(purl, TRUE);
241 if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))
242 {
243 docu.dtime = estat.st_mtime;
244 }
245 }
246 restore_global_connection_data(&con_info, &docu);
247 }
248
249 if(docu.errcode)
250 report_error(&docu, "robots.txt");
251
252 save_global_connection_data(&con_info, &docu);
253 kill_global_connection_data(&con_info);
254
255 if(!rstat)
256 {
257 if(cfg.dumpfd < 0)
258 {
259 doc_store(&docu, TRUE);
260 }
261 ret = docu.contents;
262 }
263 else if(docu.errcode == ERR_HTTP_NFOUND || docu.errcode == ERR_HTTP_GONE)
264 {
265 pp = url_to_filename(purl, TRUE);
266
267 if(cfg.dumpfd < 0)
268 {
269 if((f =
270 open(pp, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,
271 S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR)) > 0)
272 close(f);
273 }
274 }
275 else
276 {
277 _free(docu.contents);
278 }
279
280 doc_remove_lock(&docu);
281
282 _free(docu.type_str);
283 _free(docu.mime);
284
285 if(purl && purl->moved_to)
286 {
287 free_deep_url(purl->moved_to);
288 free(purl->moved_to);
289 }
290 if(purl)
291 {
292 free_deep_url(purl);
293 free(purl);
294 }
295 return ret;
296 }
297
298 /*******************************/
299 /* analyza suboru "robots.txt" */
300 /* FIXME: Translate me! */
301 /*******************************/
302
parse_robots(char * agent,char * file,char *** dpat,char *** apat)303 static void parse_robots(char *agent, char *file, char ***dpat, char ***apat)
304 {
305 char *p, *p1, *p2;
306 bool_t is_me = FALSE;
307 int n_dret = 0, n_aret = 0;
308 bool_t last = 1;
309 int ilen;
310
311 *apat = NULL;
312 *dpat = NULL;
313
314 p = file;
315 while(*p)
316 {
317 ilen = strcspn(p, "\r\n");
318 if(*(p + ilen))
319 *(p + ilen) = '\0';
320 else
321 last = 0;
322
323 while(*p == ' ' || *p == '\t')
324 p++;
325
326 if(!*p)
327 {
328 is_me = FALSE;
329 }
330 else if(!strncasecmp("User-Agent: ", p, 12))
331 {
332 p2 = p + 12;
333 while(*p2 == ' ' || *p2 == '\t')
334 p2++;
335 p1 = p2 + strlen(p2);
336 while(*p1 == ' ' || *p1 == '\t')
337 {
338 *p1 = '\0';
339 p1--;
340 }
341
342 if(*p2 == '*')
343 is_me = TRUE;
344 else if(!strncmp(agent, p2, strlen(agent)))
345 is_me = TRUE;
346 }
347 else if(is_me && !strncasecmp("Disallow: ", p, 10))
348 {
349 p2 = p + 10;
350 while(*p2 == ' ' || *p2 == '\t')
351 p2++;
352 p1 = p2 + strlen(p2);
353 while(*p1 == ' ' || *p1 == '\t')
354 {
355 *p1 = '\0';
356 p1--;
357 }
358
359 if(*p2)
360 {
361 *dpat = (char **) _realloc(*dpat, (n_dret + 2) * sizeof(char *));
362 (*dpat)[n_dret + 1] = NULL;
363 (*dpat)[n_dret] = new_string(p2);
364 n_dret++;
365 }
366 }
367 else if(is_me && !strncasecmp("Allow: ", p, 7))
368 {
369 p2 = p + 7;
370 while(*p2 == ' ' || *p2 == '\t')
371 p2++;
372 p1 = p2 + strlen(p2);
373 while(*p1 == ' ' || *p1 == '\t')
374 {
375 *p1 = '\0';
376 p1--;
377 }
378
379 if(*p2)
380 {
381 *apat = (char **) _realloc(*apat, (n_aret + 2) * sizeof(char *));
382 (*apat)[n_aret + 1] = NULL;
383 (*apat)[n_aret] = new_string(p2);
384 n_aret++;
385 }
386 }
387
388 p += ilen + last;
389 p += strspn(p, "\n\r");
390 }
391 }
392
robots_do_cleanup(void)393 void robots_do_cleanup(void)
394 {
395 int i, j;
396
397 for(i = 0; robots && robots[i]; i++)
398 {
399 _free(robots[i]->site);
400 for(j = 0; robots[i]->apat && robots[i]->apat[j]; j++)
401 _free(robots[i]->apat[j]);
402 _free(robots[i]->apat);
403 for(j = 0; robots[i]->dpat && robots[i]->dpat[j]; j++)
404 _free(robots[i]->dpat[j]);
405 _free(robots[i]->dpat);
406 #ifdef HAVE_MT
407 pthread_mutex_destroy(&(robots[i]->lock));
408 #endif
409 _free(robots[i]);
410 }
411 _free(robots);
412 }
413