1 /***************************************************************************/
2 /*    This code is part of WWW grabber called pavuk                        */
3 /*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
4 /*    Distributed under GPL 2 or later                                     */
5 /***************************************************************************/
6 
7 #include <unistd.h>
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <string.h>
11 #include <sys/types.h>
12 #include <sys/stat.h>
13 #include <fcntl.h>
14 #include <time.h>
15 
16 #include "config.h"
17 #include "condition.h"
18 #include "mime.h"
19 #include "robots.h"
20 #include "url.h"
21 #include "tools.h"
22 #include "doc.h"
23 #include "abstract.h"
24 #include "tools.h"
25 #include "mode.h"
26 #include "times.h"
27 #include "errcode.h"
28 #include "gcinfo.h"
29 #include "gui_api.h"
30 
31 #ifdef HAVE_MT
32 #define LOCK_ROBOTS_ENTRY(ent)  mt_pthread_mutex_lock(&(ent)->lock, "robots entry")
33 #define UNLOCK_ROBOTS_ENTRY(ent) mt_pthread_mutex_unlock(&(ent)->lock, "robots entry")
34 #else
35 #define LOCK_ROBOTS_ENTRY(ent)
36 #define UNLOCK_ROBOTS_ENTRY(ent)
37 #endif
38 
39 static char *get_robots(url * urlp);
40 static void parse_robots(char *, char *, char ***, char ***);
41 
42 static robotlim **robots = NULL;
43 
/*
 * Find the longest pattern in `pat` that is a prefix of `str`.
 *
 * str - string to test; NULL is treated as "nothing matches"
 * pat - NULL-terminated array of prefix patterns; NULL means an empty list
 *
 * Returns a pointer to the winning element of `pat` (not a copy), or NULL
 * when no pattern is a prefix of `str`.  Among matching patterns of equal
 * length the one that appears first in the list is returned.
 */
static char *get_max_match(char *str, char **pat)
{
  char *best = NULL;
  size_t best_len = 0;

  if(!str || !pat)
    return NULL;

  for(; *pat; pat++)
  {
    /* measured once: the length drives both the prefix test and the ranking */
    size_t len = strlen(*pat);

    if(strncmp(*pat, str, len) != 0)
      continue;

    /* `!best` lets a zero-length pattern win when it is the first match */
    if(!best || len > best_len)
    {
      best = *pat;
      best_len = len;
    }
  }
  return best;
}
65 
/***************************************************/
/* check whether the URL satisfies the rules for   */
/* WWW robots                                      */
/***************************************************/
robots_check(url * urlp)70 bool_t robots_check(url * urlp)
71 {
72   char *pom, *mdp, *map;
73   int i = 0;
74   int rv = TRUE;
75   robotlim *tmpr = NULL;
76   int dont_have = TRUE;
77 
78   if((urlp->type != URLT_HTTP && urlp->type != URLT_HTTPS)
79     || !cfg.condition.allow_robots)
80     return TRUE;
81 
82   LOCK_ROBOTS;
83   if(robots)
84   {
85     while(robots[i] && !(!strcmp(robots[i]->site, urlp->p.http.host) &&
86         (robots[i]->port == urlp->p.http.port)))
87       i++;
88     if(robots[i])
89       dont_have = FALSE;
90   }
91 
92   if(dont_have)
93   {
94     tmpr = (robotlim *) _malloc(sizeof(robotlim));
95     tmpr->site = new_string(urlp->p.http.host);
96     tmpr->port = urlp->p.http.port;
97     tmpr->dpat = NULL;
98     tmpr->apat = NULL;
99 #ifdef HAVE_MT
100     pthread_mutex_init(&tmpr->lock, NULL);
101 #endif
102 
103     robots = (robotlim **) _realloc(robots, (i + 2) * sizeof(robotlim *));
104     robots[i] = tmpr;
105     robots[i + 1] = NULL;
106 
107     /* here is possible to cros enter/leave to critical   */
108     /* sections because no chance that anyone else holds  */
109     /* lock on robots[i]->lock when it is freshly created */
110     LOCK_ROBOTS_ENTRY(robots[i]);
111   }
112   UNLOCK_ROBOTS;
113 
114   if(dont_have)
115   {
116 
117     pom = get_robots(urlp);
118     if(pom)
119     {
120       parse_robots("pavuk", pom, &tmpr->dpat, &tmpr->apat);
121       _free(pom);
122     }
123   }
124   else
125   {
126     LOCK_ROBOTS_ENTRY(robots[i]);
127   }
128 
129   mdp = get_max_match(urlp->p.http.document, robots[i]->dpat);
130   map = get_max_match(urlp->p.http.document, robots[i]->apat);
131   UNLOCK_ROBOTS_ENTRY(robots[i]);
132 
133   if(map && mdp && (strlen(map) >= strlen(mdp)))
134     rv = TRUE;
135   else if(mdp)
136     rv = FALSE;
137 
138   return rv;
139 }
140 
/************************************************/
/* download "robots.txt" for the given URL      */
/************************************************/
/*
 * Download "/robots.txt" from the server addressed by `urlp`.
 *
 * A temporary URL is built from the type, user, password, host and port
 * of `urlp` and fetched with doc_download().  The fetch is repeated after
 * an HTTP redirect (at most cfg.nredir times) and after a truncated
 * transfer (at most cfg.nreget times).
 *
 * When the final answer is "not found" or "gone" and cfg.dumpfd < 0, an
 * empty file is created under the local name of the URL.
 *
 * Returns the downloaded contents on success (the caller releases them
 * with _free()), or NULL when the file could not be obtained.
 */
static char *get_robots(url * urlp)
{
  url *purl = _malloc(sizeof(url));
  doc docu;
  int rstat;
  char *ret = NULL;
  char *pom;
  int nredir = 0, nreget = 0;
  struct stat estat;
  char *pp;
  int f;
  global_connection_info con_info;

#ifdef I_FACE
  if(cfg.xi_face)
  {
    gui_set_status(gettext("transfering \"robots.txt\""));
  }
#endif
  xprintf(1, gettext("transfering \"robots.txt\"\n"));

  /* build the temporary URL describing <server>/robots.txt */
  memset(purl, '\0', sizeof(url));
  purl->type = urlp->type;
  purl->parent_url = NULL;
  purl->status = URL_INLINE_OBJ; /*** required if -store_name option used ***/
  purl->extension = NULL;
  purl->local_name = NULL;

#ifdef HAVE_MT
  pthread_mutex_init(&purl->lock, NULL);
#endif

#ifdef WITH_TREE
#ifdef I_FACE
  purl->prop = NULL;
  purl->tree_nfo = NULL;
#endif
#endif

  purl->level = 0;
  purl->p.http.user = new_string(urlp->p.http.user);
  purl->p.http.password = new_string(urlp->p.http.password);
  purl->p.http.host = new_string(urlp->p.http.host);
  purl->p.http.port = urlp->p.http.port;
  purl->p.http.document = new_string("/robots.txt");
  purl->p.http.anchor_name = NULL;
  purl->p.http.searchstr = NULL;

  doc_init(&docu, purl);
  docu.is_robot = TRUE;
  docu.save_online = FALSE;
  docu.report_size = FALSE;
  docu.check_limits = FALSE;

  /* in sync/mirror mode hand the modification time of an existing */
  /* local copy to the download through docu.dtime                 */
  if(cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)
  {
    pp = url_to_filename(purl, TRUE);
    if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))
    {
      docu.dtime = estat.st_mtime;
    }
  }

  init_global_connection_data(&con_info);

  /* retry while the failure is a redirect or a truncated transfer */
  /* and the matching retry budget is not exhausted                */
  while((rstat = doc_download(&docu, TRUE, FALSE)) &&
    ((nredir < cfg.nredir && docu.errcode == ERR_HTTP_REDIR) ||
      (nreget < cfg.nreget && docu.errcode == ERR_HTTP_TRUNC)))
  {
    if(docu.errcode)
      report_error(&docu, "robots.txt");

    save_global_connection_data(&con_info, &docu);

    nredir += docu.errcode == ERR_HTTP_REDIR;
    nreget += docu.errcode == ERR_HTTP_TRUNC;

    if(docu.errcode == ERR_HTTP_REDIR)
    {
      /* continue with the redirect target and drop the old URL */
      purl = docu.doc_url->moved_to;
      pom = url_to_urlstr(purl, FALSE);
      xprintf(1, gettext("Hmm: redirecting \"robots.txt\" to %s ???\n"), pom);
      _free(pom);
      free_deep_url(docu.doc_url);
      /* explicit terminator: the statement no longer relies on _free() */
      /* expanding to a brace-enclosed block                            */
      _free(docu.doc_url);
      docu.doc_url = purl;
    }

    _free(docu.contents);
    _free(docu.mime);
    _free(docu.type_str);

    doc_remove_lock(&docu);

    if(cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)
    {
      pp = url_to_filename(purl, TRUE);
      if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))
      {
        docu.dtime = estat.st_mtime;
      }
    }
    restore_global_connection_data(&con_info, &docu);
  }

  if(docu.errcode)
    report_error(&docu, "robots.txt");

  save_global_connection_data(&con_info, &docu);
  kill_global_connection_data(&con_info);

  if(!rstat)
  {
    if(cfg.dumpfd < 0)
    {
      doc_store(&docu, TRUE);
    }
    /* ownership of the buffer passes to the caller */
    ret = docu.contents;
  }
  else if(docu.errcode == ERR_HTTP_NFOUND || docu.errcode == ERR_HTTP_GONE)
  {
    pp = url_to_filename(purl, TRUE);

    if(cfg.dumpfd < 0)
    {
      /* create an empty local file; 0 is a valid descriptor, so only */
      /* a negative result from open() means failure                  */
      if((f =
          open(pp, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,
            S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR)) >= 0)
        close(f);
    }
    /* nothing is returned on this path, so release whatever body */
    /* came with the error reply, as the generic error path does  */
    _free(docu.contents);
  }
  else
  {
    _free(docu.contents);
  }

  doc_remove_lock(&docu);

  _free(docu.type_str);
  _free(docu.mime);

  if(purl && purl->moved_to)
  {
    free_deep_url(purl->moved_to);
    free(purl->moved_to);
  }
  if(purl)
  {
    free_deep_url(purl);
    free(purl);
  }
  return ret;
}
297 
/*******************************/
/* parse the "robots.txt" file */
/*******************************/
302 
parse_robots(char * agent,char * file,char *** dpat,char *** apat)303 static void parse_robots(char *agent, char *file, char ***dpat, char ***apat)
304 {
305   char *p, *p1, *p2;
306   bool_t is_me = FALSE;
307   int n_dret = 0, n_aret = 0;
308   bool_t last = 1;
309   int ilen;
310 
311   *apat = NULL;
312   *dpat = NULL;
313 
314   p = file;
315   while(*p)
316   {
317     ilen = strcspn(p, "\r\n");
318     if(*(p + ilen))
319       *(p + ilen) = '\0';
320     else
321       last = 0;
322 
323     while(*p == ' ' || *p == '\t')
324       p++;
325 
326     if(!*p)
327     {
328       is_me = FALSE;
329     }
330     else if(!strncasecmp("User-Agent: ", p, 12))
331     {
332       p2 = p + 12;
333       while(*p2 == ' ' || *p2 == '\t')
334         p2++;
335       p1 = p2 + strlen(p2);
336       while(*p1 == ' ' || *p1 == '\t')
337       {
338         *p1 = '\0';
339         p1--;
340       }
341 
342       if(*p2 == '*')
343         is_me = TRUE;
344       else if(!strncmp(agent, p2, strlen(agent)))
345         is_me = TRUE;
346     }
347     else if(is_me && !strncasecmp("Disallow: ", p, 10))
348     {
349       p2 = p + 10;
350       while(*p2 == ' ' || *p2 == '\t')
351         p2++;
352       p1 = p2 + strlen(p2);
353       while(*p1 == ' ' || *p1 == '\t')
354       {
355         *p1 = '\0';
356         p1--;
357       }
358 
359       if(*p2)
360       {
361         *dpat = (char **) _realloc(*dpat, (n_dret + 2) * sizeof(char *));
362         (*dpat)[n_dret + 1] = NULL;
363         (*dpat)[n_dret] = new_string(p2);
364         n_dret++;
365       }
366     }
367     else if(is_me && !strncasecmp("Allow: ", p, 7))
368     {
369       p2 = p + 7;
370       while(*p2 == ' ' || *p2 == '\t')
371         p2++;
372       p1 = p2 + strlen(p2);
373       while(*p1 == ' ' || *p1 == '\t')
374       {
375         *p1 = '\0';
376         p1--;
377       }
378 
379       if(*p2)
380       {
381         *apat = (char **) _realloc(*apat, (n_aret + 2) * sizeof(char *));
382         (*apat)[n_aret + 1] = NULL;
383         (*apat)[n_aret] = new_string(p2);
384         n_aret++;
385       }
386     }
387 
388     p += ilen + last;
389     p += strspn(p, "\n\r");
390   }
391 }
392 
robots_do_cleanup(void)393 void robots_do_cleanup(void)
394 {
395   int i, j;
396 
397   for(i = 0; robots && robots[i]; i++)
398   {
399     _free(robots[i]->site);
400     for(j = 0; robots[i]->apat && robots[i]->apat[j]; j++)
401       _free(robots[i]->apat[j]);
402     _free(robots[i]->apat);
403     for(j = 0; robots[i]->dpat && robots[i]->dpat[j]; j++)
404       _free(robots[i]->dpat[j]);
405     _free(robots[i]->dpat);
406 #ifdef HAVE_MT
407     pthread_mutex_destroy(&(robots[i]->lock));
408 #endif
409     _free(robots[i]);
410   }
411   _free(robots);
412 }
413