1 // Larbin
2 // Sebastien Ailleret
3 // 08-02-00 -> 06-01-02
4
5 #include <unistd.h>
6 #include <errno.h>
7 #include <cstring>
8 #include <assert.h>
9 #include <sys/socket.h>
10 #include <netinet/in.h>
11 #include <arpa/inet.h>
12 #include <ctype.h>
13
14 #include "options.h"
15
16 #include "types.h"
17 #include "utils/Fifo.h"
18 #include "utils/debug.h"
19 #include "utils/text.h"
20 #include "utils/connexion.h"
21 #include "interf/output.h"
22 #include "fetch/site.h"
23
24
25 /////////////////////////////////////////////////////////
26 // functions used for the 2 types of sites
27 /////////////////////////////////////////////////////////
28
// Scratch IPv4 socket address shared by the connection code of this file;
// getFds() rewrites its address and port before every use.
static sockaddr_in stataddr;

/** One-time setup: mark the shared socket address as an IPv4 address. */
void initSite () {
  stataddr.sin_family = AF_INET;
}
34
35 /** connect to this addr using connection conn
36 * return the state of the socket
37 */
getFds(Connexion * conn,struct in_addr * addr,uint port)38 static char getFds (Connexion *conn, struct in_addr *addr, uint port) {
39 memcpy (&stataddr.sin_addr,
40 addr,
41 sizeof (struct in_addr));
42 stataddr.sin_port = htons(port);
43 int fd = socket(AF_INET, SOCK_STREAM, 0);
44 if (fd < 0)
45 return emptyC;
46 else
47 global::verifMax(fd);
48 conn->socket = fd;
49 for (;;) {
50 fcntl(fd, F_SETFL, O_NONBLOCK);
51 struct sockaddr_in *theaddr;
52 if (global::proxyAddr != NULL)
53 theaddr = global::proxyAddr;
54 else
55 theaddr = &stataddr;
56 if (connect(fd, (struct sockaddr*) theaddr,
57 sizeof (struct sockaddr_in)) == 0) {
58 // success
59 return writeC;
60 } else if (errno == EINPROGRESS) {
61 // would block
62 return connectingC;
63 } else {
64 // error
65 (void) close(fd);
66 return emptyC;
67 }
68 }
69 }
70
71
72 ///////////////////////////////////////////////////////////
73 // class NamedSite
74 ///////////////////////////////////////////////////////////
75
76 /** Constructor : initiate fields used by the program
77 */
NamedSite()78 NamedSite::NamedSite () {
79 name[0] = 0;
80 nburls = 0;
81 inFifo = 0; outFifo = 0;
82 isInFifo = false;
83 dnsState = waitDns;
84 cname = NULL;
85 }
86
87 /** Destructor : This one is never used
88 */
~NamedSite()89 NamedSite::~NamedSite () {
90 assert(false);
91 }
92
93 /* Management of the Fifo */
putInFifo(url * u)94 void NamedSite::putInFifo(url *u) {
95 fifo[inFifo] = u;
96 inFifo = (inFifo + 1) % maxUrlsBySite;
97 assert(inFifo!=outFifo);
98 }
99
getInFifo()100 url *NamedSite::getInFifo() {
101 assert (inFifo != outFifo);
102 url *tmp = fifo[outFifo];
103 outFifo = (outFifo + 1) % maxUrlsBySite;
104 return tmp;
105 }
106
fifoLength()107 short NamedSite::fifoLength() {
108 return (inFifo + maxUrlsBySite - outFifo) % maxUrlsBySite;
109 }
110
111 /* Put an url in the fifo if their are not too many */
putGenericUrl(url * u,int limit,bool prio)112 void NamedSite::putGenericUrl(url *u, int limit, bool prio) {
113 if (nburls > maxUrlsBySite-limit) {
114 // Already enough Urls in memory for this Site
115 // first check if it can already be forgotten
116 if (!strcmp(name, u->getHost())) {
117 if (dnsState == errorDns) {
118 nburls++;
119 forgetUrl(u, noDNS);
120 return;
121 }
122 if (dnsState == noConnDns) {
123 nburls++;
124 forgetUrl(u, noConnection);
125 return;
126 }
127 if (u->getPort() == port
128 && dnsState == doneDns && !testRobots(u->getFile())) {
129 nburls++;
130 forgetUrl(u, forbiddenRobots);
131 return;
132 }
133 }
134 // else put it back in URLsDisk
135 refUrl();
136 global::inter->getOne();
137 if (prio) {
138 global::URLsPriorityWait->put(u);
139 } else {
140 global::URLsDiskWait->put(u);
141 }
142 } else {
143 nburls++;
144 if (dnsState == waitDns
145 || strcmp(name, u->getHost())
146 || port != u->getPort()
147 || global::now > dnsTimeout) {
148 // dns not done or other site
149 putInFifo(u);
150 addNamedUrl();
151 // Put Site in fifo if not yet in
152 if (!isInFifo) {
153 isInFifo = true;
154 global::dnsSites->put(this);
155 }
156 } else switch (dnsState) {
157 case doneDns:
158 transfer(u);
159 break;
160 case errorDns:
161 forgetUrl(u, noDNS);
162 break;
163 default: // noConnDns
164 forgetUrl(u, noConnection);
165 }
166 }
167 }
168
169 /** Init a new dns query
170 */
newQuery()171 void NamedSite::newQuery () {
172 // Update our stats
173 newId();
174 if (global::proxyAddr != NULL) {
175 // we use a proxy, no need to get the sockaddr
176 // give anything for going on
177 siteSeen();
178 siteDNS();
179 // Get the robots.txt
180 dnsOK();
181 } else if (isdigit(name[0])) {
182 // the name already in numbers-and-dots notation
183 siteSeen();
184 if (inet_aton(name, &addr)) {
185 // Yes, it is in numbers-and-dots notation
186 siteDNS();
187 // Get the robots.txt
188 dnsOK();
189 } else {
190 // No, it isn't : this site is a non sense
191 dnsState = errorDns;
192 dnsErr();
193 }
194 } else {
195 // submit an adns query
196 global::nbDnsCalls++;
197 adns_query quer = NULL;
198 adns_submit(global::ads, name,
199 (adns_rrtype) adns_r_addr,
200 (adns_queryflags) 0,
201 this, &quer);
202 }
203 }
204
205 /** The dns query ended with success
206 * assert there is a freeConn
207 */
dnsAns(adns_answer * ans)208 void NamedSite::dnsAns (adns_answer *ans) {
209 if (ans->status == adns_s_prohibitedcname) {
210 if (cname == NULL) {
211 // try to find ip for cname of cname
212 cname = newString(ans->cname);
213 global::nbDnsCalls++;
214 adns_query quer = NULL;
215 adns_submit(global::ads, cname,
216 (adns_rrtype) adns_r_addr,
217 (adns_queryflags) 0,
218 this, &quer);
219 } else {
220 // dns chains too long => dns error
221 // cf nslookup or host for more information
222 siteSeen();
223 delete [] cname; cname = NULL;
224 dnsState = errorDns;
225 dnsErr();
226 }
227 } else {
228 siteSeen();
229 if (cname != NULL) { delete [] cname; cname = NULL; }
230 if (ans->status != adns_s_ok) {
231 // No addr inet
232 dnsState = errorDns;
233 dnsErr();
234 } else {
235 siteDNS();
236 // compute the new addr
237 memcpy (&addr,
238 &ans->rrs.addr->addr.inet.sin_addr,
239 sizeof (struct in_addr));
240 // Get the robots.txt
241 dnsOK();
242 }
243 }
244 }
245
246 /** we've got a good dns answer
247 * get the robots.txt
248 * assert there is a freeConn
249 */
dnsOK()250 void NamedSite::dnsOK () {
251 Connexion *conn = global::freeConns->get();
252 char res = getFds(conn, &addr, port);
253 if (res != emptyC) {
254 conn->timeout = timeoutPage;
255 if (global::proxyAddr != NULL) {
256 // use a proxy
257 conn->request.addString("GET http://");
258 conn->request.addString(name);
259 char tmp[15];
260 sprintf(tmp, ":%u", port);
261 conn->request.addString(tmp);
262 conn->request.addString("/robots.txt HTTP/1.0\r\nHost: ");
263 } else {
264 // direct connection
265 conn->request.addString("GET /robots.txt HTTP/1.0\r\nHost: ");
266 }
267 conn->request.addString(name);
268 conn->request.addString(global::headersRobots);
269 conn->parser = new robots(this, conn);
270 conn->pos = 0;
271 conn->err = success;
272 conn->state = res;
273 } else {
274 // Unable to get a socket
275 global::freeConns->put(conn);
276 dnsState = noConnDns;
277 dnsErr();
278 }
279 }
280
281 /** Cannot get the inet addr
282 * dnsState must have been set properly before the call
283 */
dnsErr()284 void NamedSite::dnsErr () {
285 FetchError theErr;
286 if (dnsState == errorDns) {
287 theErr = noDNS;
288 } else {
289 theErr = noConnection;
290 }
291 int ss = fifoLength();
292 // scan the queue
293 for (int i=0; i<ss; i++) {
294 url *u = getInFifo();
295 if (!strcmp(name, u->getHost())) {
296 delNamedUrl();
297 forgetUrl(u, theErr);
298 } else { // different name
299 putInFifo(u);
300 }
301 }
302 // where should now lie this site
303 if (inFifo==outFifo) {
304 isInFifo = false;
305 } else {
306 global::dnsSites->put(this);
307 }
308 }
309
310 /** test if a file can be fetched thanks to the robots.txt */
testRobots(char * file)311 bool NamedSite::testRobots(char *file) {
312 uint pos = forbidden.getLength();
313 for (uint i=0; i<pos; i++) {
314 if (robotsMatch(forbidden[i], file))
315 return false;
316 }
317 return true;
318 }
319
320 /** Delete the old identity of the site */
newId()321 void NamedSite::newId () {
322 // ip expires or new name or just new port
323 // Change the identity of this site
324 #ifndef NDEBUG
325 if (name[0] == 0) {
326 addsite();
327 }
328 #endif // NDEBUG
329 url *u = fifo[outFifo];
330 strcpy(name, u->getHost());
331 port = u->getPort();
332 dnsTimeout = global::now + dnsValidTime;
333 dnsState = waitDns;
334 }
335
336 /** we got the robots.txt,
337 * compute ipHashCode
338 * transfer what must be in IPSites
339 */
robotsResult(FetchError res)340 void NamedSite::robotsResult (FetchError res) {
341 bool ok = res != noConnection;
342 if (ok) {
343 dnsState = doneDns;
344 // compute ip hashcode
345 if (global::proxyAddr == NULL) {
346 ipHash=0;
347 char *s = (char *) &addr;
348 for (uint i=0; i<sizeof(struct in_addr); i++) {
349 ipHash = ipHash*31 + s[i];
350 }
351 } else {
352 // no ip and need to avoid rapidFire => use hostHashCode
353 ipHash = this - global::namedSiteList;
354 }
355 ipHash %= IPSiteListSize;
356 } else {
357 dnsState = noConnDns;
358 }
359 int ss = fifoLength();
360 // scan the queue
361 for (int i=0; i<ss; i++) {
362 url *u = getInFifo();
363 if (!strcmp(name, u->getHost())) {
364 delNamedUrl();
365 if (ok) {
366 if (port == u->getPort()) {
367 transfer(u);
368 } else {
369 putInFifo(u);
370 }
371 } else {
372 forgetUrl(u, noConnection);
373 }
374 } else {
375 putInFifo(u);
376 }
377 }
378 // where should now lie this site
379 if (inFifo==outFifo) {
380 isInFifo = false;
381 } else {
382 global::dnsSites->put(this);
383 }
384 }
385
transfer(url * u)386 void NamedSite::transfer (url *u) {
387 if (testRobots(u->getFile())) {
388 if (global::proxyAddr == NULL) {
389 memcpy (&u->addr, &addr, sizeof (struct in_addr));
390 }
391 global::IPSiteList[ipHash].putUrl(u);
392 } else {
393 forgetUrl(u, forbiddenRobots);
394 }
395 }
396
forgetUrl(url * u,FetchError reason)397 void NamedSite::forgetUrl (url *u, FetchError reason) {
398 urls();
399 fetchFail(u, reason);
400 answers(reason);
401 nburls--;
402 delete u;
403 global::inter->getOne();
404 }
405
406 ///////////////////////////////////////////////////////////
407 // class IPSite
408 ///////////////////////////////////////////////////////////
409
410 /** Constructor : initiate fields used by the program
411 */
IPSite()412 IPSite::IPSite () {
413 lastAccess = 0;
414 isInFifo = false;
415 }
416
417 /** Destructor : This one is never used
418 */
~IPSite()419 IPSite::~IPSite () {
420 assert(false);
421 }
422
423 /** Put an prioritarian url in the fifo
424 * Up to now, it's very naive
425 * because we have no memory of priority inside the url
426 */
putUrl(url * u)427 void IPSite::putUrl (url *u) {
428 // All right, put this url inside at the end of the queue
429 tab.put(u);
430 addIPUrl();
431 // Put Site in fifo if not yet in
432 if (!isInFifo) {
433 #ifndef NDEBUG
434 if (lastAccess == 0) addipsite();
435 #endif // NDEBUG
436 isInFifo = true;
437 if (lastAccess + global::waitDuration <= global::now
438 && global::freeConns->isNonEmpty()) {
439 fetch();
440 } else {
441 global::okSites->put(this);
442 }
443 }
444 }
445
446 /** Get an url from the fifo and do some stats
447 */
getUrl()448 inline url *IPSite::getUrl () {
449 url *u = tab.get();
450 delIPUrl();
451 urls();
452 global::namedSiteList[u->hostHashCode()].nburls--;
453 global::inter->getOne();
454 #if defined(SPECIFICSEARCH) && !defined(NOSTATS)
455 if (privilegedExts[0] != NULL && matchPrivExt(u->getFile())) {
456 extensionTreated();
457 }
458 #endif
459 return u;
460 }
461
462 /** fetch the first page in the fifo okSites
463 * there must be at least one element in freeConns !!!
464 * return expected time for next call (0 means now is OK)
465 * This function always put the IPSite in fifo before returning
466 * (or set isInFifo to false if empty)
467 */
fetch()468 int IPSite::fetch () {
469 if (tab.isEmpty()) {
470 // no more url to read
471 // This is possible because this function can be called recursively
472 isInFifo = false;
473 return 0;
474 } else {
475 int next_call = lastAccess + global::waitDuration;
476 if (next_call > global::now) {
477 global::okSites->rePut(this);
478 return next_call;
479 } else {
480 Connexion *conn = global::freeConns->get();
481 url *u = getUrl();
482 // We're allowed to fetch this one
483 // open the socket and write the request
484 char res = getFds(conn, &(u->addr), u->getPort());
485 if (res != emptyC) {
486 lastAccess = global::now;
487 conn->timeout = timeoutPage;
488 conn->request.addString("GET ");
489 if (global::proxyAddr != NULL) {
490 char *tmp = u->getUrl();
491 conn->request.addString(tmp);
492 } else {
493 conn->request.addString(u->getFile());
494 }
495 conn->request.addString(" HTTP/1.0\r\nHost: ");
496 conn->request.addString(u->getHost());
497 #ifdef COOKIES
498 if (u->cookie != NULL) {
499 conn->request.addString("\r\nCookie: ");
500 conn->request.addString(u->cookie);
501 }
502 #endif // COOKIES
503 conn->request.addString(global::headers);
504 conn->parser = new html (u, conn);
505 conn->pos = 0;
506 conn->err = success;
507 conn->state = res;
508 if (tab.isEmpty()) {
509 isInFifo = false;
510 } else {
511 global::okSites->put(this);
512 }
513 return 0;
514 } else {
515 // Unable to connect
516 fetchFail(u, noConnection);
517 answers(noConnection);
518 delete u;
519 global::freeConns->put(conn);
520 return fetch();
521 }
522 }
523 }
524 }
525