1 // Larbin
2 // Sebastien Ailleret
3 // 14-12-99 -> 19-03-02
4 
5 #include <unistd.h>
6 #include <iostream>
7 #include <assert.h>
8 #include <cstring>
9 #include <ctype.h>
10 #include <errno.h>
11 #include <sys/types.h>
12 #include <sys/socket.h>
13 
14 #include "options.h"
15 
16 #include "types.h"
17 #include "global.h"
18 #include "utils/text.h"
19 #include "utils/url.h"
20 #include "utils/mystring.h"
21 #include "utils/Vector.h"
22 #include "fetch/site.h"
23 #include "fetch/file.h"
24 #include "fetch/fetchOpen.h"
25 #include "fetch/checker.h"
26 
27 #include "utils/debug.h"
28 
// Values taken by html::state while a page is being received
// ANSWER : the status line has not been parsed yet (initial state)
#define ANSWER 0
// HEADERS : status code started with '2', header lines are being read
#define HEADERS 1
// HEADERS30X : status code started with '3', we only look for a location
#define HEADERS30X 2
// HTML : headers are over, the content will be parsed as html
#define HTML 3
// SPECIFIC : headers are over, the content goes to the specific code
#define SPECIFIC 4

// Values of the action argument of html::parseContent
// LINK : the value of the attribute is a new url to manage
#define LINK 0
// BASE : the value of the attribute replaces the base of the page
#define BASE 1
37 
38 
39 /***********************************
40  * implementation of file
41  ***********************************/
42 
file(Connexion * conn)43 file::file (Connexion *conn) {
44   buffer = conn->buffer;
45   pos = 0;
46   posParse = buffer;
47 }
48 
/** Nothing to release : buffer comes from the Connexion given to the
 * constructor and is not deleted here
 */
file::~file () {
}
51 
52 /***********************************
53  * implementation of robots
54  ***********************************/
55 
56 /** Constructor
57  */
robots(NamedSite * server,Connexion * conn)58 robots::robots (NamedSite *server, Connexion *conn) : file(conn) {
59   newPars();
60   this->server = server;
61   answerCode = false;
62   isRobots = true;
63 }
64 
/** Destructor
 * Calls delPars(), the counterpart of the newPars() of the constructor
 */
robots::~robots () {
  delPars();
  // server is not deleted on purpose
  // it belongs to someone else
}
72 
/** end of the input for this file
 * Nothing is done here : the robots.txt is analysed by parse
 * @return always 0
 */
int robots::endInput () {
  return 0;
}
78 
79 /** input and parse headers
80  */
inputHeaders(int size)81 int robots::inputHeaders (int size) {
82   pos += size;
83   if (!answerCode && pos > 12) {
84     if (buffer[9] == '2') {
85       answerCode = true;
86     } else {
87       errno = err40X;
88       return 1;
89     }
90   }
91   if (pos > maxRobotsSize) {
92 	// no more input, forget the end of this file
93 	errno = tooBig;
94 	return 1;
95   } else {
96 	return 0;
97   }
98 }
99 
100 /** parse the robots.txt
101  */
parse(bool isError)102 void robots::parse (bool isError) {
103   if (answerCode && parseHeaders()) {
104     siteRobots();
105     buffer[pos] = 0;
106 	if (isError) {
107 	  // The file could be incomplete, delete last token
108 	  // We could have Disallow / instead of Disallow /blabla
109 	  for (uint i=pos-1; i>0 && !isspace(buffer[i]); i--) {
110 		buffer[i] = ' ';
111 	  }
112 	}
113     parseRobots();
114   }
115 }
116 
117 /** test http headers
118  * return true if OK, false otherwise
119  */
parseHeaders()120 bool robots::parseHeaders () {
121   for(posParse = buffer+9; posParse[3] != 0; posParse++) {
122     if ((posParse[0] == '\n' &&
123          (posParse[1] == '\n'
124           || posParse[2] == '\n'))
125         || (posParse[0] == '\r' &&
126             (posParse[1] == '\r'
127              || posParse[2] == '\r'))) {
128       return true;
129     }
130   }
131   return false;
132 }
133 
/** try to understand the file
 * Tokens are read from posParse with nextToken (':' is given as
 * separator). The paths following the "disallow" tokens of the records
 * which concern us are normalized and stored in server->forbidden.
 * The parsing is over at the first user-agent token which follows a
 * record matching global::userAgent.
 */
void robots::parseRobots () {
  robotsOK();
#ifndef NOSTATS
  // becomes false at the first token we do not understand
  bool goodfile = true;
#endif // NOSTATS
  server->forbidden.recycle();
  uint items = 0; // size of server->forbidden
  // state
  // 0 : not concerned
  // 1 : weakly concerned
  // 2 : strongly concerned
  // we start weakly concerned : disallow tokens met before any
  // user-agent token are taken into account
  int state = 1;
  char *tok = nextToken(&posParse, ':');
  while (tok != NULL) {
	if (!strcasecmp(tok, "useragent") || !strcasecmp(tok, "user-agent")) {
	  if (state == 2) {
		// end of strong concern record => the end for us
		return;
	  } else {
		state = 0;
		// what is the new state ?
		// read the agent names up to the next keyword
		tok = nextToken(&posParse, ':');
		while (tok != NULL
			   && strcasecmp(tok, "useragent")
			   && strcasecmp(tok, "user-agent")
			   && strcasecmp(tok, "disallow")) {
          if (caseContain(tok, global::userAgent)) {
            state = 2;
          } else if (state == 0 && !strcmp(tok, "*")) {
            // "*" never downgrades a record which already names us
            state = 1;
          }
		  tok = nextToken(&posParse, ':');
		}
	  }
	  if (state) {
		// delete old forbidden : we've got a better record than older ones
		server->forbidden.recycle();
		items = 0;
	  } else {
        // forget this record
        // (its disallow tokens are swallowed here, up to the next record)
        while (tok != NULL
			   && strcasecmp(tok, "useragent")
			   && strcasecmp(tok, "user-agent")) {
          tok = nextToken(&posParse, ':');
        }
      }
	} else if (!strcasecmp(tok, "disallow")) {
      tok = nextToken(&posParse, ':');
      while (tok != NULL
             && strcasecmp(tok, "useragent")
             && strcasecmp(tok, "user-agent")
             && strcasecmp(tok, "disallow")) {
        // add nextToken to forbidden
        // (tokens beyond maxRobotsItem are read but not stored)
        if (items++ < maxRobotsItem) {
          // make this token a good token
          if (tok[0] == '*') { // * is not correct, / disallows everything
            tok[0] = '/';
          } else if (tok[0] != '/') {
            // prepend a '/' by overwriting the char just before the token
            // NOTE(review): assumes this char is a separator inside
            // buffer - confirm against nextToken
            tok--;
            tok[0] = '/';
          }
          if (fileNormalize(tok)) {
            server->forbidden.addElement(newString(tok));
          }
        }
        tok = nextToken(&posParse, ':');
      }
	} else {
      // neither user-agent nor disallow : skip this token
#ifndef NOSTATS
	  if (goodfile) {
        // robotsOKdec() is called at most once per file
        robotsOKdec();
		goodfile = false;
	  }
#endif // NOSTATS
	  tok = nextToken(&posParse, ':');
	}
  }
}
214 
215 
216 /*************************************
217  * implementation of html
218  *************************************/
219 
220 
221 /////////////////////////////////////////
#ifdef SPECIFICSEARCH

// the code handling specific pages is textually included here
#include "fetch/specbuf.cc"

// These macros forward to the specific code only when the page is in
// SPECIFIC state. Beware : they expand to bare if / return statements,
// they must only be used as full statements inside methods of html
#define _newSpec() if (state==SPECIFIC) newSpec()
#define _destructSpec() if (state==SPECIFIC) destructSpec()
#define _endOfInput() if (state==SPECIFIC) return endOfInput()
// body of html::getPage : always returns
#define _getContent() \
  if (state==SPECIFIC) return getContent(); \
  else return contentStart
// body of html::getLength : always returns
#define _getSize() \
  if (state==SPECIFIC) return getSize(); \
  else return (buffer + pos - contentStart)

///////////////////////////////////////
#else // not a SPECIFICSEARCH

// nothing to initialize without SPECIFICSEARCH
void initSpecific () { }

// Without SPECIFICSEARCH the hooks do nothing : pipeSpec() is 0 and
// the content of a page is always what follows contentStart in buffer
#define constrSpec() ((void) 0)
#define _newSpec() ((void) 0)
#define pipeSpec() 0
#define _endOfInput() ((void) 0)
#define _destructSpec() ((void) 0)
#define _getContent() return contentStart
#define _getSize() return (buffer + pos - contentStart)

#endif // SPECIFICSEARCH
250 /////////////////////////////////////////
251 
// notCgiChar(c) is false for the chars '?', '=' and '*' when
// CGILEVEL >= 1, and is always true otherwise
// The argument is parenthesized so that any expression can be given;
// beware that it is evaluated up to 3 times when CGILEVEL >= 1
#if CGILEVEL >= 1
#define notCgiChar(c) ((c)!='?' && (c)!='=' && (c)!='*')
#else
#define notCgiChar(c) true
#endif // CGILEVEL
257 
258 /** Constructor
259  */
html(url * here,Connexion * conn)260 html::html (url *here, Connexion *conn) : file(conn) {
261   newPars();
262   this->here = here;
263   base = here->giveBase();
264   state = ANSWER;
265   isInteresting = false;
266   constrSpec();
267   pages();
268   isRobots = false;
269 }
270 
/** Destructor
 * here and base are deleted with the page
 * (base can be NULL after a bad BASE tag, see parseContent)
 */
html::~html () {
  _destructSpec();
  delPars();
  delete here;
  delete base;
}
279 
/* get the content of the page
 * _getContent() expands to the return statement(s) of this function :
 * contentStart, or getContent() for a page in SPECIFIC state when
 * SPECIFICSEARCH is defined
 */
char *html::getPage () {
  _getContent();
}
284 
/* get the size of the content of the page
 * _getSize() expands to the return statement(s) of this function :
 * buffer + pos - contentStart, or getSize() for a page in SPECIFIC
 * state when SPECIFICSEARCH is defined
 */
int html::getLength () {
  _getSize();
}
288 
289 /* manage a new url : verify and send it */
manageUrl(url * nouv,bool isRedir)290 void html::manageUrl (url *nouv, bool isRedir) {
291   if (nouv->isValid()
292       && filter1(nouv->getHost(), nouv->getFile())
293       && (global::externalLinks || isRedir
294           || !strcmp(nouv->getHost(), this->here->getHost()))) {
295     // The extension is not stupid (gz, pdf...)
296 #ifdef LINKS_INFO
297     links.addElement(nouv->giveUrl());
298 #endif // LINKS_INFO
299     if (nouv->initOK(here)) {
300       check(nouv);
301     } else {
302       // this url is forbidden for errno reason (set by initOK)
303       answers(errno);
304       delete nouv;
305     }
306   } else {
307     // The extension is stupid
308     delete nouv;
309   }
310 }
311 
312 /**********************************************/
313 /* This part manages command line and headers */
314 /**********************************************/
315 
316 /** a string is arriving, treat it only up to the end of headers
317  * return 0 usually, 1 if no more input and set errno accordingly
318  */
inputHeaders(int size)319 int html::inputHeaders (int size) {
320   pos += size;
321   buffer[pos] = 0;
322   char *posn;
323   while (posParse < buffer + pos) {
324     switch (state) {
325     case ANSWER:
326       posn = strchr(posParse, '\n');
327       if (posn != NULL) {
328         posParse = posn;
329         if (parseCmdline ()) {
330           return 1;
331         }
332         area = ++posParse;
333       } else {
334         return 0;
335       }
336       break;
337     case HEADERS:
338     case HEADERS30X:
339       posn = strchr(posParse, '\n');
340       if (posn != NULL) {
341         posParse = posn;
342         int tmp;
343         if (state == HEADERS)
344           tmp = parseHeader();
345         else tmp = parseHeader30X();
346         if (tmp) {
347           return 1;
348         }
349         area = ++posParse;
350       } else {
351         return 0;
352       }
353       break;
354     case SPECIFIC:
355       return pipeSpec();
356     default:
357       return 0;
358     }
359   }
360   return 0;
361 }
362 
363 /** parse the answer code line */
parseCmdline()364 int html::parseCmdline () {
365   if (posParse - buffer >= 12) {
366     switch (buffer[9]) {
367     case '2':
368       state = HEADERS;
369       break;
370     case '3':
371       state = HEADERS30X;
372       break;
373     default:
374       errno = err40X;
375       return 1;
376     }
377   } else {
378     errno = earlyStop;
379     return 1;
380   }
381   return 0;
382 }
383 
384 /** parse a line of header
385  * @return 0 if OK, 1 if we don't want to read the file
386  */
parseHeader()387 int html::parseHeader () {
388   if (posParse - area < 2) {
389 	// end of http headers
390 #ifndef FOLLOW_LINKS
391     state = SPECIFIC;
392 #elif defined(SPECIFICSEARCH)
393     if (isInteresting) {
394       state = SPECIFIC;
395     } else {
396       state = HTML;
397     }
398 #else // not a SPECIFICSEARCH
399     state = HTML;
400 #endif // SPECIFICSEARCH
401     contentStart = posParse + 1;
402     *(posParse-1) = 0;
403     _newSpec();
404   } else {
405     *posParse = 0;
406     here->addCookie(area);
407     *posParse = '\n';
408     if (verifType ()) return 1;
409     if (verifLength()) return 1;
410   }
411   return 0;
412 }
413 
414 /** function called by parseHeader
415  * parse content-type
416  * return 1 (and set errno) if bad type, 0 otherwise
417  * can toggle isInteresting
418  */
419 #define errorType() errno=badType; return 1
420 
421 #ifdef ANYTYPE
422 #define checkType() return 0
423 #elif defined(IMAGES)
424 #define checkType() if (startWithIgnoreCase("image", area+14)) { \
425     return 0; \
426   } else { errorType (); }
427 #else
428 #define checkType() errorType()
429 #endif
430 
verifType()431 int html::verifType () {
432   if (startWithIgnoreCase("content-type: ", area)) {
433     // Let's read the type of this doc
434     if (!startWithIgnoreCase("text/html", area+14)) {
435 #ifdef SPECIFICSEARCH
436       if (matchContentType(area+14)) {
437         interestingSeen();
438         isInteresting = true;
439       } else {
440         checkType();
441       }
442 #else // SPECIFICSEARCH
443       checkType();
444 #endif // SPECIFICSEARCH
445     }
446   }
447   return 0;
448 }
449 
450 /** function called by parseHeader
451  * parse content-length
452  * return 1 (and set errno) if too long file, 0 otherwise
453  */
verifLength()454 int html::verifLength () {
455 #ifndef SPECIFICSEARCH
456   if (startWithIgnoreCase("content-length: ", area)) {
457     int len = 0;
458     char *p = area+16;
459     while (*p >= '0' && *p <= '9') {
460       len = len*10 + *p -'0';
461       p++;
462     }
463     if (len > maxPageSize) {
464       errno = tooBig;
465       return 1;
466     }
467   }
468 #endif // SPECIFICSEARCH
469   return 0;
470 }
471 
472 /** parse a line of header (ans 30X) => just look for location
473  * @return 0 if OK, 1 if we don't want to read the file
474  */
parseHeader30X()475 int html::parseHeader30X () {
476   if (posParse - area < 2) {
477 	// end of http headers without location => err40X
478     errno = err40X;
479     return 1;
480   } else {
481 	if (startWithIgnoreCase("location: ", area)) {
482       int i=10;
483       while (area[i]!=' ' && area[i]!='\n' && area[i]!='\r'
484              && notCgiChar(area[i])) {
485         i++;
486       }
487       if (notCgiChar(area[i])) {
488         area[i] = 0; // end of url
489         // read the location (do not decrease depth)
490         url *nouv = new url(area+10, here->getDepth(), base);
491 #ifdef URL_TAGS
492         nouv->tag = here->tag;
493 #endif // URL_TAGS
494         manageUrl(nouv, true);
495         // we do not need more headers
496       }
497       errno = err30X;
498       return 1;
499 	}
500   }
501   return 0;
502 }
503 
504 /*********************************************/
505 /* This part manages the content of the file */
506 /*********************************************/
507 
508 /** file download is complete, parse the file (headers already done)
509  * return 0 usually, 1 if there was an error
510  */
endInput()511 int html::endInput () {
512   if (state <= HEADERS) {
513     errno = earlyStop;
514     return 1;
515   }
516   if (state == HEADERS30X) {
517     errno = err40X;
518     return 1;
519   }
520 #ifdef NO_DUP
521   if (!global::hDuplicate->testSet(posParse)) {
522     errno = duplicate;
523     return 1;
524   }
525 #endif // NO_DUP
526   buffer[pos] = 0;
527   _endOfInput();
528   // now parse the html
529   parseHtml();
530   return 0;
531 }
532 
533 /* parse an html page */
parseHtml()534 void html::parseHtml () {
535   while ((posParse=strchr(posParse, '<')) != NULL) {
536     if (posParse[1] == '!') {
537       if (posParse[2] == '-' && posParse[3] == '-') {
538         posParse += 4;
539         parseComment();
540       } else {
541         // nothing...
542         posParse += 2;
543       }
544     } else {
545       posParse++;
546       parseTag();
547     }
548   }
549 }
550 
551 /* skip a comment */
parseComment()552 void html::parseComment() {
553   while ((posParse=strchr(posParse, '-')) != NULL) {
554     if (posParse[1] == '-' && posParse[2] == '>') {
555       posParse += 3;
556       return;
557     } else {
558       posParse++;
559     }
560   }
561   posParse = buffer+pos;
562 }
563 
/* macros used by the following functions */
// move posParse after spaces, newlines, carriage returns and tabs
#define skipSpace() \
  while (*posParse == ' ' || *posParse == '\n' \
         || *posParse == '\r' || *posParse == '\t') { \
    posParse++; \
  }
// move posParse to the next blank char, '>' or 0
#define skipText() \
  while (*posParse != ' ' && *posParse != '\n' && *posParse != '>' \
         && *posParse != '\r' && *posParse != '\t' && *posParse != 0) { \
    posParse++; \
  }
// go to the beginning of the next word (two statements : use it as a
// full statement inside a block)
#define nextWord() skipText(); skipSpace()
// compare posParse[i] with c after setting bit 32 of the char of the
// page : ignores the case when c is a lower case letter
#define thisCharIs(i, c) (c == (posParse[i]|32))
// helper of parseTag : if test t holds, remember parameter p and
// action a and skip the i chars of the name of the tag; otherwise
// skip one char and RETURN from the calling function
#define isTag(t, p, a, i) if (t) { \
      param = p; \
      action = a; \
      posParse += i; \
    } else { \
      posParse++; \
      return; \
    }
585 
586 /** Try to understand this tag */
parseTag()587 void html::parseTag () {
588   skipSpace();
589   char *param=NULL; // what parameter are we looking for
590   int action=-1;
591   // read the name of the tag
592   if (thisCharIs(0, 'a')) { // a href
593     param = "href";
594     action = LINK;
595     posParse++;
596   } else if (thisCharIs(0, 'l')) {
597     isTag(thisCharIs(1, 'i') && thisCharIs(2, 'n') && thisCharIs(3, 'k'),
598           "href", LINK, 4);
599   } else if (thisCharIs(0, 'b')) { // base href
600     isTag(thisCharIs(1, 'a') && thisCharIs(2, 's') && thisCharIs(3, 'e'),
601           "href", BASE, 4);
602   } else if (thisCharIs(0, 'f')) { // frame src
603     isTag(thisCharIs(1, 'r') && thisCharIs(2, 'a')
604           && thisCharIs(3, 'm') && thisCharIs(4, 'e'),
605           "src", LINK, 5);
606 #ifdef IMAGES
607   } else if (thisCharIs(0, 'i')) { // img src
608     isTag(thisCharIs(1, 'm') && thisCharIs(2, 'g'), "src", LINK, 3);
609 #endif // IMAGES
610   } else {
611     return;
612   }
613   // now find the parameter
614   assert(param != NULL);
615   skipSpace();
616   for (;;) {
617     int i=0;
618     while (param[i]!=0 && thisCharIs(i, param[i])) i++;
619     posParse += i;
620     if (posParse[i]=='>' || posParse[i]==0) return;
621     if (param[i]==0) {
622       parseContent(action);
623       return;
624     } else {
625       // not the good parameter
626       nextWord();
627     }
628   }
629 }
630 
631 /** read the content of an interesting tag */
parseContent(int action)632 void html::parseContent (int action) {
633   posParse++;
634   while (*posParse==' ' || *posParse=='=') posParse++;
635   if (*posParse=='\"' || *posParse=='\'') posParse++;
636   area = posParse;
637   char *endItem = area + maxUrlSize;
638   if (endItem > buffer + pos) endItem = buffer + pos;
639   while (posParse < endItem && *posParse!='\"' && *posParse!='\''
640          && *posParse!='\n' && *posParse!=' ' && *posParse!='>'
641          && *posParse!='\r' && *posParse!='\t' && notCgiChar(*posParse)) {
642     if (*posParse == '\\') *posParse = '/';    // Bye Bye DOS !
643     posParse++;
644   }
645   if (posParse == buffer + pos) {
646     // end of file => content may be truncated => forget it
647     return;
648   } else if (posParse < endItem && notCgiChar(*posParse)) {
649     // compute this url (not too long and not cgi)
650     char oldchar = *posParse;
651     *posParse = 0;
652     switch (action) {
653     case LINK:
654       // try to understand this new link
655       manageUrl(new url(area, here->getDepth()-1, base), false);
656       break;
657     case BASE:
658       // This page has a BASE HREF tag
659       {
660         uint end = posParse - area - 1;
661         while (end > 7 && area[end] != '/') end--; // 7 because http://
662         if (end > 7) { // this base looks good
663           end++;
664           char tmp = area[end];
665           area[end] = 0;
666           url *tmpbase = new url(area, 0, (url *) NULL);
667           area[end] = tmp;
668           delete base;
669           if (tmpbase->isValid()) {
670             base = tmpbase;
671           } else {
672             delete tmpbase;
673             base = NULL;
674           }
675         }
676       }
677       break;
678     default: assert(false);
679     }
680     *posParse = oldchar;
681   }
682   posParse++;
683 }
684