1 // Larbin
2 // Sebastien Ailleret
3 // 08-02-00 -> 06-01-02
4 
5 #include <unistd.h>
6 #include <errno.h>
7 #include <cstring>
8 #include <assert.h>
9 #include <sys/socket.h>
10 #include <netinet/in.h>
11 #include <arpa/inet.h>
12 #include <ctype.h>
13 
14 #include "options.h"
15 
16 #include "types.h"
17 #include "utils/Fifo.h"
18 #include "utils/debug.h"
19 #include "utils/text.h"
20 #include "utils/connexion.h"
21 #include "interf/output.h"
22 #include "fetch/site.h"
23 
24 
25 /////////////////////////////////////////////////////////
26 // functions used for the 2 types of sites
27 /////////////////////////////////////////////////////////
28 
/** Destination address shared by the connections opened in this file.
 * initSite sets the family once ; getFds writes the ip and the port
 * before each connect.
 */
static sockaddr_in stataddr;

/** One-time initialisation : only IPv4 addresses are used */
void initSite () {
  stataddr.sin_family = AF_INET;
}
34 
35 /** connect to this addr using connection conn
36  * return the state of the socket
37  */
getFds(Connexion * conn,struct in_addr * addr,uint port)38 static char getFds (Connexion *conn, struct in_addr *addr, uint port) {
39   memcpy (&stataddr.sin_addr,
40           addr,
41           sizeof (struct in_addr));
42   stataddr.sin_port = htons(port);
43   int fd = socket(AF_INET, SOCK_STREAM, 0);
44   if (fd < 0)
45     return emptyC;
46   else
47     global::verifMax(fd);
48   conn->socket = fd;
49   for (;;) {
50     fcntl(fd, F_SETFL, O_NONBLOCK);
51     struct sockaddr_in *theaddr;
52     if (global::proxyAddr != NULL)
53       theaddr = global::proxyAddr;
54     else
55       theaddr = &stataddr;
56     if (connect(fd, (struct sockaddr*) theaddr,
57                 sizeof (struct sockaddr_in)) == 0) {
58       // success
59       return writeC;
60     } else if (errno == EINPROGRESS) {
61       // would block
62       return connectingC;
63     } else {
64       // error
65       (void) close(fd);
66       return emptyC;
67     }
68   }
69 }
70 
71 
72 ///////////////////////////////////////////////////////////
73 // class NamedSite
74 ///////////////////////////////////////////////////////////
75 
76 /** Constructor : initiate fields used by the program
77  */
NamedSite()78 NamedSite::NamedSite () {
79   name[0] = 0;
80   nburls = 0;
81   inFifo = 0; outFifo = 0;
82   isInFifo = false;
83   dnsState = waitDns;
84   cname = NULL;
85 }
86 
87 /** Destructor : This one is never used
88  */
~NamedSite()89 NamedSite::~NamedSite () {
90   assert(false);
91 }
92 
93 /* Management of the Fifo */
putInFifo(url * u)94 void NamedSite::putInFifo(url *u) {
95   fifo[inFifo] = u;
96   inFifo = (inFifo + 1) % maxUrlsBySite;
97   assert(inFifo!=outFifo);
98 }
99 
getInFifo()100 url *NamedSite::getInFifo() {
101   assert (inFifo != outFifo);
102   url *tmp = fifo[outFifo];
103   outFifo = (outFifo + 1) % maxUrlsBySite;
104   return tmp;
105 }
106 
fifoLength()107 short NamedSite::fifoLength() {
108   return (inFifo + maxUrlsBySite - outFifo) % maxUrlsBySite;
109 }
110 
111 /* Put an url in the fifo if their are not too many */
putGenericUrl(url * u,int limit,bool prio)112 void NamedSite::putGenericUrl(url *u, int limit, bool prio) {
113   if (nburls > maxUrlsBySite-limit) {
114 	// Already enough Urls in memory for this Site
115     // first check if it can already be forgotten
116     if (!strcmp(name, u->getHost())) {
117       if (dnsState == errorDns) {
118         nburls++;
119         forgetUrl(u, noDNS);
120         return;
121       }
122       if (dnsState == noConnDns) {
123         nburls++;
124         forgetUrl(u, noConnection);
125         return;
126       }
127       if (u->getPort() == port
128           && dnsState == doneDns && !testRobots(u->getFile())) {
129         nburls++;
130         forgetUrl(u, forbiddenRobots);
131         return;
132       }
133     }
134     // else put it back in URLsDisk
135     refUrl();
136     global::inter->getOne();
137     if (prio) {
138       global::URLsPriorityWait->put(u);
139     } else {
140       global::URLsDiskWait->put(u);
141     }
142   } else {
143     nburls++;
144     if (dnsState == waitDns
145         || strcmp(name, u->getHost())
146         || port != u->getPort()
147         || global::now > dnsTimeout) {
148       // dns not done or other site
149       putInFifo(u);
150       addNamedUrl();
151       // Put Site in fifo if not yet in
152       if (!isInFifo) {
153         isInFifo = true;
154         global::dnsSites->put(this);
155       }
156     } else switch (dnsState) {
157     case doneDns:
158       transfer(u);
159       break;
160     case errorDns:
161       forgetUrl(u, noDNS);
162       break;
163     default: // noConnDns
164       forgetUrl(u, noConnection);
165     }
166   }
167 }
168 
169 /** Init a new dns query
170  */
newQuery()171 void NamedSite::newQuery () {
172   // Update our stats
173   newId();
174   if (global::proxyAddr != NULL) {
175     // we use a proxy, no need to get the sockaddr
176     // give anything for going on
177     siteSeen();
178     siteDNS();
179     // Get the robots.txt
180     dnsOK();
181   } else if (isdigit(name[0])) {
182     // the name already in numbers-and-dots notation
183 	siteSeen();
184 	if (inet_aton(name, &addr)) {
185 	  // Yes, it is in numbers-and-dots notation
186 	  siteDNS();
187 	  // Get the robots.txt
188 	  dnsOK();
189 	} else {
190 	  // No, it isn't : this site is a non sense
191       dnsState = errorDns;
192 	  dnsErr();
193 	}
194   } else {
195     // submit an adns query
196     global::nbDnsCalls++;
197     adns_query quer = NULL;
198     adns_submit(global::ads, name,
199                 (adns_rrtype) adns_r_addr,
200                 (adns_queryflags) 0,
201                 this, &quer);
202   }
203 }
204 
205 /** The dns query ended with success
206  * assert there is a freeConn
207  */
dnsAns(adns_answer * ans)208 void NamedSite::dnsAns (adns_answer *ans) {
209   if (ans->status == adns_s_prohibitedcname) {
210     if (cname == NULL) {
211       // try to find ip for cname of cname
212       cname = newString(ans->cname);
213       global::nbDnsCalls++;
214       adns_query quer = NULL;
215       adns_submit(global::ads, cname,
216                   (adns_rrtype) adns_r_addr,
217                   (adns_queryflags) 0,
218                   this, &quer);
219     } else {
220       // dns chains too long => dns error
221       // cf nslookup or host for more information
222       siteSeen();
223       delete [] cname; cname = NULL;
224       dnsState = errorDns;
225       dnsErr();
226     }
227   } else {
228     siteSeen();
229     if (cname != NULL) { delete [] cname; cname = NULL; }
230     if (ans->status != adns_s_ok) {
231       // No addr inet
232       dnsState = errorDns;
233       dnsErr();
234     } else {
235       siteDNS();
236       // compute the new addr
237       memcpy (&addr,
238               &ans->rrs.addr->addr.inet.sin_addr,
239               sizeof (struct in_addr));
240       // Get the robots.txt
241       dnsOK();
242     }
243   }
244 }
245 
246 /** we've got a good dns answer
247  * get the robots.txt
248  * assert there is a freeConn
249  */
dnsOK()250 void NamedSite::dnsOK () {
251   Connexion *conn = global::freeConns->get();
252   char res = getFds(conn, &addr, port);
253   if (res != emptyC) {
254     conn->timeout = timeoutPage;
255     if (global::proxyAddr != NULL) {
256       // use a proxy
257       conn->request.addString("GET http://");
258       conn->request.addString(name);
259       char tmp[15];
260       sprintf(tmp, ":%u", port);
261       conn->request.addString(tmp);
262       conn->request.addString("/robots.txt HTTP/1.0\r\nHost: ");
263     } else {
264       // direct connection
265       conn->request.addString("GET /robots.txt HTTP/1.0\r\nHost: ");
266     }
267     conn->request.addString(name);
268     conn->request.addString(global::headersRobots);
269     conn->parser = new robots(this, conn);
270     conn->pos = 0;
271     conn->err = success;
272     conn->state = res;
273   } else {
274     // Unable to get a socket
275     global::freeConns->put(conn);
276     dnsState = noConnDns;
277     dnsErr();
278   }
279 }
280 
281 /** Cannot get the inet addr
282  * dnsState must have been set properly before the call
283  */
dnsErr()284 void NamedSite::dnsErr () {
285   FetchError theErr;
286   if (dnsState == errorDns) {
287     theErr = noDNS;
288   } else {
289     theErr = noConnection;
290   }
291   int ss = fifoLength();
292   // scan the queue
293   for (int i=0; i<ss; i++) {
294     url *u = getInFifo();
295     if (!strcmp(name, u->getHost())) {
296       delNamedUrl();
297       forgetUrl(u, theErr);
298     } else { // different name
299       putInFifo(u);
300     }
301   }
302   // where should now lie this site
303   if (inFifo==outFifo) {
304     isInFifo = false;
305   } else {
306     global::dnsSites->put(this);
307   }
308 }
309 
310 /** test if a file can be fetched thanks to the robots.txt */
testRobots(char * file)311 bool NamedSite::testRobots(char *file) {
312   uint pos = forbidden.getLength();
313   for (uint i=0; i<pos; i++) {
314     if (robotsMatch(forbidden[i], file))
315       return false;
316   }
317   return true;
318 }
319 
320 /** Delete the old identity of the site */
newId()321 void NamedSite::newId () {
322   // ip expires or new name or just new port
323   // Change the identity of this site
324 #ifndef NDEBUG
325   if (name[0] == 0) {
326     addsite();
327   }
328 #endif // NDEBUG
329   url *u = fifo[outFifo];
330   strcpy(name, u->getHost());
331   port = u->getPort();
332   dnsTimeout = global::now + dnsValidTime;
333   dnsState = waitDns;
334 }
335 
336 /** we got the robots.txt,
337  * compute ipHashCode
338  * transfer what must be in IPSites
339  */
robotsResult(FetchError res)340 void NamedSite::robotsResult (FetchError res) {
341   bool ok = res != noConnection;
342   if (ok) {
343     dnsState = doneDns;
344     // compute ip hashcode
345     if (global::proxyAddr == NULL) {
346       ipHash=0;
347       char *s = (char *) &addr;
348       for (uint i=0; i<sizeof(struct in_addr); i++) {
349         ipHash = ipHash*31 + s[i];
350       }
351     } else {
352       // no ip and need to avoid rapidFire => use hostHashCode
353       ipHash = this - global::namedSiteList;
354     }
355     ipHash %= IPSiteListSize;
356   } else {
357     dnsState = noConnDns;
358   }
359   int ss = fifoLength();
360   // scan the queue
361   for (int i=0; i<ss; i++) {
362     url *u = getInFifo();
363     if (!strcmp(name, u->getHost())) {
364       delNamedUrl();
365       if (ok) {
366         if (port == u->getPort()) {
367           transfer(u);
368         } else {
369           putInFifo(u);
370         }
371       } else {
372         forgetUrl(u, noConnection);
373       }
374     } else {
375       putInFifo(u);
376     }
377   }
378   // where should now lie this site
379   if (inFifo==outFifo) {
380     isInFifo = false;
381   } else {
382     global::dnsSites->put(this);
383   }
384 }
385 
transfer(url * u)386 void NamedSite::transfer (url *u) {
387   if (testRobots(u->getFile())) {
388     if (global::proxyAddr == NULL) {
389       memcpy (&u->addr, &addr, sizeof (struct in_addr));
390     }
391     global::IPSiteList[ipHash].putUrl(u);
392   } else {
393     forgetUrl(u, forbiddenRobots);
394   }
395 }
396 
forgetUrl(url * u,FetchError reason)397 void NamedSite::forgetUrl (url *u, FetchError reason) {
398   urls();
399   fetchFail(u, reason);
400   answers(reason);
401   nburls--;
402   delete u;
403   global::inter->getOne();
404 }
405 
406 ///////////////////////////////////////////////////////////
407 // class IPSite
408 ///////////////////////////////////////////////////////////
409 
410 /** Constructor : initiate fields used by the program
411  */
IPSite()412 IPSite::IPSite () {
413   lastAccess = 0;
414   isInFifo = false;
415 }
416 
417 /** Destructor : This one is never used
418  */
~IPSite()419 IPSite::~IPSite () {
420   assert(false);
421 }
422 
423 /** Put an prioritarian url in the fifo
424  * Up to now, it's very naive
425  * because we have no memory of priority inside the url
426  */
putUrl(url * u)427 void IPSite::putUrl (url *u) {
428   // All right, put this url inside at the end of the queue
429   tab.put(u);
430   addIPUrl();
431   // Put Site in fifo if not yet in
432   if (!isInFifo) {
433 #ifndef NDEBUG
434     if (lastAccess == 0) addipsite();
435 #endif // NDEBUG
436     isInFifo = true;
437     if (lastAccess + global::waitDuration <= global::now
438         && global::freeConns->isNonEmpty()) {
439       fetch();
440     } else {
441       global::okSites->put(this);
442     }
443   }
444 }
445 
446 /** Get an url from the fifo and do some stats
447  */
getUrl()448 inline url *IPSite::getUrl () {
449   url *u = tab.get();
450   delIPUrl();
451   urls();
452   global::namedSiteList[u->hostHashCode()].nburls--;
453   global::inter->getOne();
454 #if defined(SPECIFICSEARCH) && !defined(NOSTATS)
455   if (privilegedExts[0] != NULL && matchPrivExt(u->getFile())) {
456     extensionTreated();
457   }
458 #endif
459   return u;
460 }
461 
462 /** fetch the first page in the fifo okSites
463  * there must be at least one element in freeConns !!!
464  * return expected time for next call (0 means now is OK)
465  * This function always put the IPSite in fifo before returning
466  *   (or set isInFifo to false if empty)
467  */
fetch()468 int IPSite::fetch () {
469   if (tab.isEmpty()) {
470 	// no more url to read
471 	// This is possible because this function can be called recursively
472 	isInFifo = false;
473     return 0;
474   } else {
475     int next_call = lastAccess + global::waitDuration;
476     if (next_call > global::now) {
477       global::okSites->rePut(this);
478       return next_call;
479     } else {
480       Connexion *conn = global::freeConns->get();
481       url *u = getUrl();
482       // We're allowed to fetch this one
483       // open the socket and write the request
484       char res = getFds(conn, &(u->addr), u->getPort());
485       if (res != emptyC) {
486         lastAccess = global::now;
487         conn->timeout = timeoutPage;
488         conn->request.addString("GET ");
489         if (global::proxyAddr != NULL) {
490           char *tmp = u->getUrl();
491           conn->request.addString(tmp);
492         } else {
493           conn->request.addString(u->getFile());
494         }
495         conn->request.addString(" HTTP/1.0\r\nHost: ");
496         conn->request.addString(u->getHost());
497 #ifdef COOKIES
498         if (u->cookie != NULL) {
499           conn->request.addString("\r\nCookie: ");
500           conn->request.addString(u->cookie);
501         }
502 #endif // COOKIES
503         conn->request.addString(global::headers);
504         conn->parser = new html (u, conn);
505         conn->pos = 0;
506         conn->err = success;
507         conn->state = res;
508         if (tab.isEmpty()) {
509           isInFifo = false;
510         } else {
511           global::okSites->put(this);
512         }
513         return 0;
514       } else {
515         // Unable to connect
516         fetchFail(u, noConnection);
517         answers(noConnection);
518         delete u;
519         global::freeConns->put(conn);
520         return fetch();
521       }
522     }
523   }
524 }
525