1 // Larbin
2 // Sebastien Ailleret
3 // 14-12-99 -> 19-03-02
4
5 #include <unistd.h>
6 #include <iostream>
7 #include <assert.h>
8 #include <cstring>
9 #include <ctype.h>
10 #include <errno.h>
11 #include <sys/types.h>
12 #include <sys/socket.h>
13
14 #include "options.h"
15
16 #include "types.h"
17 #include "global.h"
18 #include "utils/text.h"
19 #include "utils/url.h"
20 #include "utils/mystring.h"
21 #include "utils/Vector.h"
22 #include "fetch/site.h"
23 #include "fetch/file.h"
24 #include "fetch/fetchOpen.h"
25 #include "fetch/checker.h"
26
27 #include "utils/debug.h"
28
/* Parsing states of an html object (values taken by html::state).
 * The numeric order matters: html::endInput tests "state <= HEADERS". */
enum {
  ANSWER     = 0,  // waiting for the status line (see html::parseCmdline)
  HEADERS    = 1,  // reading the headers of a 2xx answer
  HEADERS30X = 2,  // reading the headers of a 3xx answer
  HTML       = 3,  // headers done, content is kept for html parsing
  SPECIFIC   = 4   // headers done, content goes to the specific-search hooks
};

/* Actions handed to html::parseContent */
enum {
  LINK = 0,  // the attribute value is a link to follow
  BASE = 1   // the attribute value is a new base url
};
37
38
39 /***********************************
40 * implementation of file
41 ***********************************/
42
file(Connexion * conn)43 file::file (Connexion *conn) {
44 buffer = conn->buffer;
45 pos = 0;
46 posParse = buffer;
47 }
48
~file()49 file::~file () {
50 }
51
52 /***********************************
53 * implementation of robots
54 ***********************************/
55
56 /** Constructor
57 */
robots(NamedSite * server,Connexion * conn)58 robots::robots (NamedSite *server, Connexion *conn) : file(conn) {
59 newPars();
60 this->server = server;
61 answerCode = false;
62 isRobots = true;
63 }
64
65 /** Destructor
66 */
~robots()67 robots::~robots () {
68 delPars();
69 // server is not deleted on purpose
70 // it belongs to someone else
71 }
72
73 /** we get some more chars of this file
74 */
endInput()75 int robots::endInput () {
76 return 0;
77 }
78
79 /** input and parse headers
80 */
inputHeaders(int size)81 int robots::inputHeaders (int size) {
82 pos += size;
83 if (!answerCode && pos > 12) {
84 if (buffer[9] == '2') {
85 answerCode = true;
86 } else {
87 errno = err40X;
88 return 1;
89 }
90 }
91 if (pos > maxRobotsSize) {
92 // no more input, forget the end of this file
93 errno = tooBig;
94 return 1;
95 } else {
96 return 0;
97 }
98 }
99
100 /** parse the robots.txt
101 */
parse(bool isError)102 void robots::parse (bool isError) {
103 if (answerCode && parseHeaders()) {
104 siteRobots();
105 buffer[pos] = 0;
106 if (isError) {
107 // The file could be incomplete, delete last token
108 // We could have Disallow / instead of Disallow /blabla
109 for (uint i=pos-1; i>0 && !isspace(buffer[i]); i--) {
110 buffer[i] = ' ';
111 }
112 }
113 parseRobots();
114 }
115 }
116
117 /** test http headers
118 * return true if OK, false otherwise
119 */
parseHeaders()120 bool robots::parseHeaders () {
121 for(posParse = buffer+9; posParse[3] != 0; posParse++) {
122 if ((posParse[0] == '\n' &&
123 (posParse[1] == '\n'
124 || posParse[2] == '\n'))
125 || (posParse[0] == '\r' &&
126 (posParse[1] == '\r'
127 || posParse[2] == '\r'))) {
128 return true;
129 }
130 }
131 return false;
132 }
133
134 /** try to understand the file
135 */
parseRobots()136 void robots::parseRobots () {
137 robotsOK();
138 #ifndef NOSTATS
139 bool goodfile = true;
140 #endif // NOSTATS
141 server->forbidden.recycle();
142 uint items = 0; // size of server->forbidden
143 // state
144 // 0 : not concerned
145 // 1 : weakly concerned
146 // 2 : strongly concerned
147 int state = 1;
148 char *tok = nextToken(&posParse, ':');
149 while (tok != NULL) {
150 if (!strcasecmp(tok, "useragent") || !strcasecmp(tok, "user-agent")) {
151 if (state == 2) {
152 // end of strong concern record => the end for us
153 return;
154 } else {
155 state = 0;
156 // what is the new state ?
157 tok = nextToken(&posParse, ':');
158 while (tok != NULL
159 && strcasecmp(tok, "useragent")
160 && strcasecmp(tok, "user-agent")
161 && strcasecmp(tok, "disallow")) {
162 if (caseContain(tok, global::userAgent)) {
163 state = 2;
164 } else if (state == 0 && !strcmp(tok, "*")) {
165 state = 1;
166 }
167 tok = nextToken(&posParse, ':');
168 }
169 }
170 if (state) {
171 // delete old forbidden : we've got a better record than older ones
172 server->forbidden.recycle();
173 items = 0;
174 } else {
175 // forget this record
176 while (tok != NULL
177 && strcasecmp(tok, "useragent")
178 && strcasecmp(tok, "user-agent")) {
179 tok = nextToken(&posParse, ':');
180 }
181 }
182 } else if (!strcasecmp(tok, "disallow")) {
183 tok = nextToken(&posParse, ':');
184 while (tok != NULL
185 && strcasecmp(tok, "useragent")
186 && strcasecmp(tok, "user-agent")
187 && strcasecmp(tok, "disallow")) {
188 // add nextToken to forbidden
189 if (items++ < maxRobotsItem) {
190 // make this token a good token
191 if (tok[0] == '*') { // * is not correct, / disallows everything
192 tok[0] = '/';
193 } else if (tok[0] != '/') {
194 tok--;
195 tok[0] = '/';
196 }
197 if (fileNormalize(tok)) {
198 server->forbidden.addElement(newString(tok));
199 }
200 }
201 tok = nextToken(&posParse, ':');
202 }
203 } else {
204 #ifndef NOSTATS
205 if (goodfile) {
206 robotsOKdec();
207 goodfile = false;
208 }
209 #endif // NOSTATS
210 tok = nextToken(&posParse, ':');
211 }
212 }
213 }
214
215
216 /*************************************
217 * implementation of html
218 *************************************/
219
220
/////////////////////////////////////////
// Hooks used by html for the specific search feature.
// With SPECIFICSEARCH they call the functions of fetch/specbuf.cc when
// the state is SPECIFIC; without it they expand to nothing (or to the
// plain html behaviour).
#ifdef SPECIFICSEARCH

#include "fetch/specbuf.cc"

// The conditional hooks are wrapped in do { } while (0) so that each one
// is a single statement, whatever if/else surrounds the place it is used
#define _newSpec() \
  do { if (state==SPECIFIC) newSpec(); } while (0)
#define _destructSpec() \
  do { if (state==SPECIFIC) destructSpec(); } while (0)
#define _endOfInput() \
  do { if (state==SPECIFIC) return endOfInput(); } while (0)
// The two following macros are the whole body of a function
#define _getContent() \
  if (state==SPECIFIC) return getContent(); \
  else return contentStart
#define _getSize() \
  if (state==SPECIFIC) return getSize(); \
  else return (buffer + pos - contentStart)

///////////////////////////////////////
#else // not a SPECIFICSEARCH

// nothing to initialize without specific search
void initSpecific () { }

#define constrSpec() ((void) 0)
#define _newSpec() ((void) 0)
#define pipeSpec() 0
#define _endOfInput() ((void) 0)
#define _destructSpec() ((void) 0)
#define _getContent() return contentStart
#define _getSize() return (buffer + pos - contentStart)

#endif // SPECIFICSEARCH
/////////////////////////////////////////
251
// notCgiChar(c) : false if c is one of the chars ? = * and CGILEVEL >= 1,
// always true otherwise. The argument is parenthesized so that any
// expression can be given.
#if CGILEVEL >= 1
#define notCgiChar(c) ((c)!='?' && (c)!='=' && (c)!='*')
#else
#define notCgiChar(c) true
#endif // CGILEVEL
257
258 /** Constructor
259 */
html(url * here,Connexion * conn)260 html::html (url *here, Connexion *conn) : file(conn) {
261 newPars();
262 this->here = here;
263 base = here->giveBase();
264 state = ANSWER;
265 isInteresting = false;
266 constrSpec();
267 pages();
268 isRobots = false;
269 }
270
271 /** Destructor
272 */
~html()273 html::~html () {
274 _destructSpec();
275 delPars();
276 delete here;
277 delete base;
278 }
279
280 /* get the content of the page */
getPage()281 char *html::getPage () {
282 _getContent();
283 }
284
getLength()285 int html::getLength () {
286 _getSize();
287 }
288
289 /* manage a new url : verify and send it */
manageUrl(url * nouv,bool isRedir)290 void html::manageUrl (url *nouv, bool isRedir) {
291 if (nouv->isValid()
292 && filter1(nouv->getHost(), nouv->getFile())
293 && (global::externalLinks || isRedir
294 || !strcmp(nouv->getHost(), this->here->getHost()))) {
295 // The extension is not stupid (gz, pdf...)
296 #ifdef LINKS_INFO
297 links.addElement(nouv->giveUrl());
298 #endif // LINKS_INFO
299 if (nouv->initOK(here)) {
300 check(nouv);
301 } else {
302 // this url is forbidden for errno reason (set by initOK)
303 answers(errno);
304 delete nouv;
305 }
306 } else {
307 // The extension is stupid
308 delete nouv;
309 }
310 }
311
312 /**********************************************/
313 /* This part manages command line and headers */
314 /**********************************************/
315
316 /** a string is arriving, treat it only up to the end of headers
317 * return 0 usually, 1 if no more input and set errno accordingly
318 */
inputHeaders(int size)319 int html::inputHeaders (int size) {
320 pos += size;
321 buffer[pos] = 0;
322 char *posn;
323 while (posParse < buffer + pos) {
324 switch (state) {
325 case ANSWER:
326 posn = strchr(posParse, '\n');
327 if (posn != NULL) {
328 posParse = posn;
329 if (parseCmdline ()) {
330 return 1;
331 }
332 area = ++posParse;
333 } else {
334 return 0;
335 }
336 break;
337 case HEADERS:
338 case HEADERS30X:
339 posn = strchr(posParse, '\n');
340 if (posn != NULL) {
341 posParse = posn;
342 int tmp;
343 if (state == HEADERS)
344 tmp = parseHeader();
345 else tmp = parseHeader30X();
346 if (tmp) {
347 return 1;
348 }
349 area = ++posParse;
350 } else {
351 return 0;
352 }
353 break;
354 case SPECIFIC:
355 return pipeSpec();
356 default:
357 return 0;
358 }
359 }
360 return 0;
361 }
362
363 /** parse the answer code line */
parseCmdline()364 int html::parseCmdline () {
365 if (posParse - buffer >= 12) {
366 switch (buffer[9]) {
367 case '2':
368 state = HEADERS;
369 break;
370 case '3':
371 state = HEADERS30X;
372 break;
373 default:
374 errno = err40X;
375 return 1;
376 }
377 } else {
378 errno = earlyStop;
379 return 1;
380 }
381 return 0;
382 }
383
384 /** parse a line of header
385 * @return 0 if OK, 1 if we don't want to read the file
386 */
parseHeader()387 int html::parseHeader () {
388 if (posParse - area < 2) {
389 // end of http headers
390 #ifndef FOLLOW_LINKS
391 state = SPECIFIC;
392 #elif defined(SPECIFICSEARCH)
393 if (isInteresting) {
394 state = SPECIFIC;
395 } else {
396 state = HTML;
397 }
398 #else // not a SPECIFICSEARCH
399 state = HTML;
400 #endif // SPECIFICSEARCH
401 contentStart = posParse + 1;
402 *(posParse-1) = 0;
403 _newSpec();
404 } else {
405 *posParse = 0;
406 here->addCookie(area);
407 *posParse = '\n';
408 if (verifType ()) return 1;
409 if (verifLength()) return 1;
410 }
411 return 0;
412 }
413
414 /** function called by parseHeader
415 * parse content-type
416 * return 1 (and set errno) if bad type, 0 otherwise
417 * can toggle isInteresting
418 */
419 #define errorType() errno=badType; return 1
420
421 #ifdef ANYTYPE
422 #define checkType() return 0
423 #elif defined(IMAGES)
424 #define checkType() if (startWithIgnoreCase("image", area+14)) { \
425 return 0; \
426 } else { errorType (); }
427 #else
428 #define checkType() errorType()
429 #endif
430
verifType()431 int html::verifType () {
432 if (startWithIgnoreCase("content-type: ", area)) {
433 // Let's read the type of this doc
434 if (!startWithIgnoreCase("text/html", area+14)) {
435 #ifdef SPECIFICSEARCH
436 if (matchContentType(area+14)) {
437 interestingSeen();
438 isInteresting = true;
439 } else {
440 checkType();
441 }
442 #else // SPECIFICSEARCH
443 checkType();
444 #endif // SPECIFICSEARCH
445 }
446 }
447 return 0;
448 }
449
450 /** function called by parseHeader
451 * parse content-length
452 * return 1 (and set errno) if too long file, 0 otherwise
453 */
verifLength()454 int html::verifLength () {
455 #ifndef SPECIFICSEARCH
456 if (startWithIgnoreCase("content-length: ", area)) {
457 int len = 0;
458 char *p = area+16;
459 while (*p >= '0' && *p <= '9') {
460 len = len*10 + *p -'0';
461 p++;
462 }
463 if (len > maxPageSize) {
464 errno = tooBig;
465 return 1;
466 }
467 }
468 #endif // SPECIFICSEARCH
469 return 0;
470 }
471
472 /** parse a line of header (ans 30X) => just look for location
473 * @return 0 if OK, 1 if we don't want to read the file
474 */
parseHeader30X()475 int html::parseHeader30X () {
476 if (posParse - area < 2) {
477 // end of http headers without location => err40X
478 errno = err40X;
479 return 1;
480 } else {
481 if (startWithIgnoreCase("location: ", area)) {
482 int i=10;
483 while (area[i]!=' ' && area[i]!='\n' && area[i]!='\r'
484 && notCgiChar(area[i])) {
485 i++;
486 }
487 if (notCgiChar(area[i])) {
488 area[i] = 0; // end of url
489 // read the location (do not decrease depth)
490 url *nouv = new url(area+10, here->getDepth(), base);
491 #ifdef URL_TAGS
492 nouv->tag = here->tag;
493 #endif // URL_TAGS
494 manageUrl(nouv, true);
495 // we do not need more headers
496 }
497 errno = err30X;
498 return 1;
499 }
500 }
501 return 0;
502 }
503
504 /*********************************************/
505 /* This part manages the content of the file */
506 /*********************************************/
507
508 /** file download is complete, parse the file (headers already done)
509 * return 0 usually, 1 if there was an error
510 */
endInput()511 int html::endInput () {
512 if (state <= HEADERS) {
513 errno = earlyStop;
514 return 1;
515 }
516 if (state == HEADERS30X) {
517 errno = err40X;
518 return 1;
519 }
520 #ifdef NO_DUP
521 if (!global::hDuplicate->testSet(posParse)) {
522 errno = duplicate;
523 return 1;
524 }
525 #endif // NO_DUP
526 buffer[pos] = 0;
527 _endOfInput();
528 // now parse the html
529 parseHtml();
530 return 0;
531 }
532
533 /* parse an html page */
parseHtml()534 void html::parseHtml () {
535 while ((posParse=strchr(posParse, '<')) != NULL) {
536 if (posParse[1] == '!') {
537 if (posParse[2] == '-' && posParse[3] == '-') {
538 posParse += 4;
539 parseComment();
540 } else {
541 // nothing...
542 posParse += 2;
543 }
544 } else {
545 posParse++;
546 parseTag();
547 }
548 }
549 }
550
551 /* skip a comment */
parseComment()552 void html::parseComment() {
553 while ((posParse=strchr(posParse, '-')) != NULL) {
554 if (posParse[1] == '-' && posParse[2] == '>') {
555 posParse += 3;
556 return;
557 } else {
558 posParse++;
559 }
560 }
561 posParse = buffer+pos;
562 }
563
/* macros used by the following functions
 * They all work on posParse. Those which hold statements are wrapped in
 * do { } while (0), so that each one is exactly one statement wherever
 * it is used, and the arguments are parenthesized.
 *
 * skipSpace()      : go to the first char which is not ' ' \n \r \t
 * skipText()       : go to the first ' ' \n \r \t '>' or end of string
 * nextWord()       : skipText then skipSpace
 * thisCharIs(i, c) : is posParse[i] the lower case letter c, in lower or
 *                    upper case (posParse[i] is ORed with 32)
 * isTag(t, p, a, i): if t is true, set param to p, action to a and skip
 *                    i chars ; otherwise skip one char and return from
 *                    the calling function
 */
#define skipSpace() \
  do { \
    while (*posParse == ' ' || *posParse == '\n' \
           || *posParse == '\r' || *posParse == '\t') { \
      posParse++; \
    } \
  } while (0)
#define skipText() \
  do { \
    while (*posParse != ' ' && *posParse != '\n' && *posParse != '>' \
           && *posParse != '\r' && *posParse != '\t' && *posParse != 0) { \
      posParse++; \
    } \
  } while (0)
#define nextWord() do { skipText(); skipSpace(); } while (0)
#define thisCharIs(i, c) ((c) == (posParse[(i)]|32))
#define isTag(t, p, a, i) \
  do { \
    if (t) { \
      param = (p); \
      action = (a); \
      posParse += (i); \
    } else { \
      posParse++; \
      return; \
    } \
  } while (0)
586 /** Try to understand this tag */
parseTag()587 void html::parseTag () {
588 skipSpace();
589 char *param=NULL; // what parameter are we looking for
590 int action=-1;
591 // read the name of the tag
592 if (thisCharIs(0, 'a')) { // a href
593 param = "href";
594 action = LINK;
595 posParse++;
596 } else if (thisCharIs(0, 'l')) {
597 isTag(thisCharIs(1, 'i') && thisCharIs(2, 'n') && thisCharIs(3, 'k'),
598 "href", LINK, 4);
599 } else if (thisCharIs(0, 'b')) { // base href
600 isTag(thisCharIs(1, 'a') && thisCharIs(2, 's') && thisCharIs(3, 'e'),
601 "href", BASE, 4);
602 } else if (thisCharIs(0, 'f')) { // frame src
603 isTag(thisCharIs(1, 'r') && thisCharIs(2, 'a')
604 && thisCharIs(3, 'm') && thisCharIs(4, 'e'),
605 "src", LINK, 5);
606 #ifdef IMAGES
607 } else if (thisCharIs(0, 'i')) { // img src
608 isTag(thisCharIs(1, 'm') && thisCharIs(2, 'g'), "src", LINK, 3);
609 #endif // IMAGES
610 } else {
611 return;
612 }
613 // now find the parameter
614 assert(param != NULL);
615 skipSpace();
616 for (;;) {
617 int i=0;
618 while (param[i]!=0 && thisCharIs(i, param[i])) i++;
619 posParse += i;
620 if (posParse[i]=='>' || posParse[i]==0) return;
621 if (param[i]==0) {
622 parseContent(action);
623 return;
624 } else {
625 // not the good parameter
626 nextWord();
627 }
628 }
629 }
630
631 /** read the content of an interesting tag */
parseContent(int action)632 void html::parseContent (int action) {
633 posParse++;
634 while (*posParse==' ' || *posParse=='=') posParse++;
635 if (*posParse=='\"' || *posParse=='\'') posParse++;
636 area = posParse;
637 char *endItem = area + maxUrlSize;
638 if (endItem > buffer + pos) endItem = buffer + pos;
639 while (posParse < endItem && *posParse!='\"' && *posParse!='\''
640 && *posParse!='\n' && *posParse!=' ' && *posParse!='>'
641 && *posParse!='\r' && *posParse!='\t' && notCgiChar(*posParse)) {
642 if (*posParse == '\\') *posParse = '/'; // Bye Bye DOS !
643 posParse++;
644 }
645 if (posParse == buffer + pos) {
646 // end of file => content may be truncated => forget it
647 return;
648 } else if (posParse < endItem && notCgiChar(*posParse)) {
649 // compute this url (not too long and not cgi)
650 char oldchar = *posParse;
651 *posParse = 0;
652 switch (action) {
653 case LINK:
654 // try to understand this new link
655 manageUrl(new url(area, here->getDepth()-1, base), false);
656 break;
657 case BASE:
658 // This page has a BASE HREF tag
659 {
660 uint end = posParse - area - 1;
661 while (end > 7 && area[end] != '/') end--; // 7 because http://
662 if (end > 7) { // this base looks good
663 end++;
664 char tmp = area[end];
665 area[end] = 0;
666 url *tmpbase = new url(area, 0, (url *) NULL);
667 area[end] = tmp;
668 delete base;
669 if (tmpbase->isValid()) {
670 base = tmpbase;
671 } else {
672 delete tmpbase;
673 base = NULL;
674 }
675 }
676 }
677 break;
678 default: assert(false);
679 }
680 *posParse = oldchar;
681 }
682 posParse++;
683 }
684