1 /*
2 ** @(#) $Id$
3 **
4 ** W3C Webbot can be found at "http://www.w3.org/Robot/"
5 **
**  Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
7 ** Institute of Technology, Institut National de Recherche en
8 ** Informatique et en Automatique, Keio University). All Rights
9 ** Reserved. This program is distributed under the W3C's Software
10 ** Intellectual Property License. This program is distributed in the hope
11 ** that it will be useful, but WITHOUT ANY WARRANTY; without even the
12 ** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13 ** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14 ** details.
15 **
16 ** Authors:
17 ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
18 ** BR Bob Racko
19 ** JP John Punin
20 **
21 ** History:
22 ** Dec 04 95 First version
23 ** Oct 1998 Split into separate files
24 */
25
26 #include "HTRobMan.h"
27 #include "HTQueue.h"
28 #include "HTAncMan.h"
29
30 #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
31 #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
32
33 PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};
34
35 /*
36 ** Some sorting algorithms
37 */
38 PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
39
40 /*
**  The callbacks that we need from the libwww HTML parser
42 */
43 PRIVATE HText_new RHText_new;
44 PRIVATE HText_delete RHText_delete;
45 PRIVATE HText_foundLink RHText_foundLink;
46
47 /* ------------------------------------------------------------------------- */
48
49 /* Create a "HyperDoc" object
50 ** --------------------------
51 ** A HyperDoc object contains information about whether we have already
52 ** started checking the anchor and the depth in our search
53 */
HyperDoc_new(Robot * mr,HTParentAnchor * anchor,int depth)54 PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
55 {
56 HyperDoc * hd;
57 if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
58 HT_OUTOFMEM("HyperDoc_new");
59 hd->depth = depth;
60 hd->hits = 1;
61
62 hd->code = NO_CODE;
63 hd->index = ++mr->cindex;
64
65 /* Bind the HyperDoc object together with the Anchor Object */
66 hd->anchor = anchor;
67 HTAnchor_setDocument(anchor, (void *) hd);
68
69 /* Add this HyperDoc object to our list */
70 if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
71 HTList_addObject(mr->hyperdoc, (void *) hd);
72 return hd;
73 }
74
75 /* Delete a "HyperDoc" object
76 ** --------------------------
77 */
HyperDoc_delete(HyperDoc * hd)78 PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
79 {
80 if (hd) {
81 HT_FREE (hd);
82 return YES;
83 }
84 return NO;
85 }
86
87 /*
88 ** Sort the anchor array and log reference count
89 */
calculate_hits(Robot * mr,HTArray * array)90 PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
91 {
92 if (mr && array) {
93 HTLog * log = HTLog_open(mr->hitfile, YES, YES);
94 if (log) {
95 void ** data = NULL;
96 HTParentAnchor * anchor = NULL;
97 HTArray_sort(array, HitSort);
98 anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
99 while (anchor) {
100 char * uri = HTAnchor_address((HTAnchor *) anchor);
101 HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
102 if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
103 HT_FREE(uri);
104 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
105 }
106 }
107 HTLog_close(log);
108 return YES;
109 }
110 return NO;
111 }
112
HitSort(const void * a,const void * b)113 PRIVATE int HitSort (const void * a, const void * b)
114 {
115 HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
116 HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
117 if (aa && bb) return (bb->hits - aa->hits);
118 return bb - aa;
119 }
120
121 /*
122 ** Sort the anchor array and log link relations
123 */
calculate_linkRelations(Robot * mr,HTArray * array)124 PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
125 {
126 if (mr && array) {
127 HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
128 void ** data = NULL;
129 HTParentAnchor * anchor = NULL;
130 anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
131 while (anchor) {
132
133 /*
134 ** If we have a specific link relation to look for then do this.
135 ** Otherwise look for all link relations.
136 */
137 if (mr->relation) {
138 HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
139 if (link) {
140 HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
141 char * src_uri = HTAnchor_address((HTAnchor *) anchor);
142 char * dest_uri = HTAnchor_address((HTAnchor *) dest);
143 if (src_uri && dest_uri) {
144 #ifdef HT_MYSQL
145 if (mr->sqllog) {
146 HTSQLLog_addLinkRelationship (mr->sqllog,
147 src_uri, dest_uri,
148 HTAtom_name(mr->relation),
149 NULL);
150 }
151 #endif
152 if (log) {
153 HTFormat format = HTAnchor_format(dest);
154 HTLog_addText(log, "%s %s %s --> %s\n",
155 HTAtom_name(mr->relation),
156 format != WWW_UNKNOWN ?
157 HTAtom_name(format) : "<unknown>",
158 src_uri, dest_uri);
159 }
160
161 /* Cleanup */
162 HT_FREE(src_uri);
163 HT_FREE(dest_uri);
164 }
165 }
166 } else {
167 HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
168 HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
169 char * src_uri = HTAnchor_address((HTAnchor *) anchor);
170 HTLinkType linktype;
171
172 /* First look in the main link */
173 if (link && (linktype = HTLink_type(link))) {
174 HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
175 char * dest_uri = HTAnchor_address((HTAnchor *) dest);
176 if (src_uri && dest_uri) {
177 #ifdef HT_MYSQL
178 if (mr->sqllog) {
179 HTSQLLog_addLinkRelationship (mr->sqllog,
180 src_uri, dest_uri,
181 HTAtom_name(linktype),
182 NULL);
183 }
184 #endif
185 if (log) {
186 HTFormat format = HTAnchor_format(dest);
187 HTLog_addText(log, "%s %s %s --> %s\n",
188 HTAtom_name(linktype),
189 format != WWW_UNKNOWN ?
190 HTAtom_name(format) : "<unknown>",
191 src_uri, dest_uri);
192 }
193 }
194 HT_FREE(dest_uri);
195 }
196
197 /* and then in any sublinks */
198 if (sublinks) {
199 HTLink * pres;
200 while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
201 if ((linktype = HTLink_type(pres))) {
202 HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
203 char * dest_uri = HTAnchor_address((HTAnchor *) dest);
204 if (src_uri && dest_uri) {
205 #ifdef HT_MYSQL
206 if (mr->sqllog) {
207 HTSQLLog_addLinkRelationship (mr->sqllog,
208 src_uri, dest_uri,
209 HTAtom_name(linktype),
210 NULL);
211 }
212 #endif
213 if (log) {
214 HTFormat format = HTAnchor_format(dest);
215 HTLog_addText(log, "%s %s %s --> %s\n",
216 HTAtom_name(linktype),
217 format != WWW_UNKNOWN ?
218 HTAtom_name(format) : "<unknown>",
219 src_uri, dest_uri);
220 }
221 HT_FREE(dest_uri);
222 }
223 }
224 }
225 }
226
227 /* Cleanup */
228 HT_FREE(src_uri);
229 }
230 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
231 }
232 if (log) HTLog_close(log);
233 return YES;
234 }
235 return NO;
236 }
237
238 /*
239 ** Sort the anchor array and log last modified date
240 */
calculate_lm(Robot * mr,HTArray * array)241 PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
242 {
243 if (mr && array) {
244 HTLog * log = HTLog_open(mr->lmfile, YES, YES);
245 if (log) {
246 void ** data = NULL;
247 HTParentAnchor * anchor = NULL;
248 HTArray_sort(array, LastModifiedSort);
249 anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
250 while (anchor) {
251 char * uri = HTAnchor_address((HTAnchor *) anchor);
252 time_t lm = HTAnchor_lastModified(anchor);
253 if (uri && lm > 0)
254 HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
255 HT_FREE(uri);
256 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
257 }
258 }
259 HTLog_close(log);
260 return YES;
261 }
262 return NO;
263 }
264
LastModifiedSort(const void * a,const void * b)265 PRIVATE int LastModifiedSort (const void * a, const void * b)
266 {
267 time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
268 time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
269 return bb - aa;
270 }
271
272 /*
273 ** Sort the anchor array and log the document title
274 */
calculate_title(Robot * mr,HTArray * array)275 PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
276 {
277 if (mr && array) {
278 HTLog * log = HTLog_open(mr->titlefile, YES, YES);
279 if (log) {
280 void ** data = NULL;
281 HTParentAnchor * anchor = NULL;
282 HTArray_sort(array, TitleSort);
283 anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
284 while (anchor) {
285 char * uri = HTAnchor_address((HTAnchor *) anchor);
286 const char * title = HTAnchor_title(anchor);
287 HTCharset charset = HTAnchor_charset(anchor);
288 if (uri) HTLog_addText(log, "%s `%s\' %s\n",
289 charset ? HTAtom_name(charset) : "<none>",
290 title ? title : "<none>",
291 uri);
292 HT_FREE(uri);
293 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
294 }
295 }
296 HTLog_close(log);
297 return YES;
298 }
299 return NO;
300 }
301
TitleSort(const void * a,const void * b)302 PRIVATE int TitleSort (const void * a, const void * b)
303 {
304 const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
305 const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
306 return strcasecomp(bb?bb:"", aa?aa:"");
307 }
308
309 /*
310 ** Calculate distributions for media types. The same mechanism
311 ** can be used for other characteristics with relatively
312 ** few outcomes.
313 */
mediatype_distribution(HTArray * array)314 PRIVATE HTList * mediatype_distribution (HTArray * array)
315 {
316 if (array) {
317 HTList * mt = HTList_new();
318 MetaDist * pres = NULL;
319 void ** data = NULL;
320 HTParentAnchor * anchor = NULL;
321 anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
322 while (anchor) {
323 HTFormat format = HTAnchor_format(anchor);
324 if (format && format != WWW_UNKNOWN) {
325 HTList * cur = mt;
326
327 /* If found then increase counter */
328 while ((pres = (MetaDist *) HTList_nextObject(cur))) {
329 if (pres->name == format) {
330 pres->hits++;
331 break;
332 }
333 }
334
335 /* If not found then add new format to list */
336 if (!pres) {
337 if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
338 HT_OUTOFMEM("mediatype_distribution");
339 pres->name = format;
340 pres->hits = 1;
341 HTList_addObject(mt, pres);
342 HTList_insertionSort(mt, FormatSort);
343 }
344 }
345
346 /* Find next anchor in array */
347 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
348 }
349 return mt;
350 }
351 return NULL;
352 }
353
354 /*
355 ** Calculate distributions for charsets. The same mechanism
356 ** can be used for other characteristics with relatively
357 ** few outcomes.
358 */
charset_distribution(HTArray * array)359 PRIVATE HTList * charset_distribution (HTArray * array)
360 {
361 if (array) {
362 HTList * cs = HTList_new();
363 MetaDist * pres = NULL;
364 void ** data = NULL;
365 HTParentAnchor * anchor = NULL;
366 anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
367 while (anchor) {
368 HTCharset charset = HTAnchor_charset(anchor);
369 if (charset) {
370 HTList * cur = cs;
371
372 /* If found then increase counter */
373 while ((pres = (MetaDist *) HTList_nextObject(cur))) {
374 if (pres->name == charset) {
375 pres->hits++;
376 break;
377 }
378 }
379
380 /* If not found then add new format to list */
381 if (!pres) {
382 if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
383 HT_OUTOFMEM("charset_distribution");
384 pres->name = charset;
385 pres->hits = 1;
386 HTList_addObject(cs, pres);
387 HTList_insertionSort(cs, FormatSort);
388 }
389 }
390
391 /* Find next anchor in array */
392 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
393 }
394 return cs;
395 }
396 return NULL;
397 }
398
FormatSort(const void * a,const void * b)399 PRIVATE int FormatSort (const void * a, const void * b)
400 {
401 MetaDist * aa = (MetaDist *) a;
402 MetaDist * bb = (MetaDist *) b;
403 return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
404 }
405
log_meta_distribution(const char * logfile,HTList * distribution)406 PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
407 {
408 if (logfile && distribution) {
409 HTLog * log = HTLog_open(logfile, YES, YES);
410 if (log) {
411 HTList * cur = distribution;
412 MetaDist * pres;
413 while ((pres = (MetaDist *) HTList_nextObject(cur))) {
414 if (pres->name) {
415 HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
416 }
417 }
418 HTLog_close(log);
419 }
420 }
421 return NO;
422 }
423
delete_meta_distribution(HTList * distribution)424 PRIVATE BOOL delete_meta_distribution (HTList * distribution)
425 {
426 if (distribution) {
427 HTList * cur = distribution;
428 MetaDist * pres;
429 while ((pres = (MetaDist *) HTList_nextObject(cur)))
430 HT_FREE(pres);
431 HTList_delete(distribution);
432 return YES;
433 }
434 return NO;
435 }
436
437
438 /* Statistics
439 ** ----------
440 ** Calculates a bunch of statistics for the anchors traversed
441 */
calculate_statistics(Robot * mr)442 PRIVATE BOOL calculate_statistics (Robot * mr)
443 {
444 long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
445 if (!mr) return NO;
446
447 /* Calculate efficiency */
448 if (mr->time > 0) {
449 ms_t t = HTGetTimeInMillis() - mr->time;
450 if (t > 0) {
451 double loadfactor = (mr->get_bytes / (t * 0.001));
452 double reqprsec = (total_docs / (t * 0.001));
453 double secs = t / 1000.0;
454 char bytes[50];
455 if (SHOW_REAL_QUIET(mr))
456 HTPrint("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
457 total_docs, secs, reqprsec);
458
459 HTNumToStr(mr->get_bytes, bytes, 50);
460 if (SHOW_REAL_QUIET(mr))
461 HTPrint("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
462 mr->get_docs, bytes, loadfactor);
463
464 HTNumToStr(mr->head_bytes, bytes, 50);
465 if (SHOW_REAL_QUIET(mr))
466 HTPrint("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
467 mr->head_docs, bytes);
468 }
469 }
470
471 /* Create an array of existing anchors */
472 if (total_docs > 1) {
473 HTArray * array = HTAnchor_getArray(total_docs);
474 if (array) {
475
476 /* Distributions */
477 if (mr->flags & MR_DISTRIBUTIONS) {
478 if (SHOW_REAL_QUIET(mr)) HTPrint("\nDistributions:\n");
479 }
480
481 /* Sort after hit counts */
482 if (mr->hitfile) {
483 if (SHOW_REAL_QUIET(mr))
484 HTPrint("\tLogged hit count distribution in file `%s\'\n",
485 mr->hitfile);
486 calculate_hits(mr, array);
487 }
488
489 /* Sort after link relations */
490 #ifdef HT_MYSQL
491 if (mr->relfile || mr->sqllog) {
492 #else
493 if (mr->relfile) {
494 #endif
495 if (mr->relfile && SHOW_REAL_QUIET(mr))
496 HTPrint("\tLogged link relationship distribution in file `%s\'\n",
497 mr->relfile);
498 calculate_linkRelations(mr, array);
499 }
500
501 /* Sort after modified date */
502 if (mr->lmfile) {
503 if (SHOW_REAL_QUIET(mr))
504 HTPrint("\tLogged last modified distribution in file `%s\'\n",
505 mr->lmfile);
506 calculate_lm(mr, array);
507 }
508
509 /* Sort after title */
510 if (mr->titlefile) {
511 if (SHOW_REAL_QUIET(mr))
512 HTPrint("\tLogged title distribution in file `%s\'\n",
513 mr->titlefile);
514 calculate_title(mr, array);
515 }
516
517 /* Find mediatype distribution */
518 if (mr->mtfile) {
519 HTList * mtdist = mediatype_distribution(array);
520 if (mtdist) {
521 if (SHOW_REAL_QUIET(mr))
522 HTPrint("\tLogged media type distribution in file `%s\'\n",
523 mr->mtfile);
524 log_meta_distribution(mr->mtfile, mtdist);
525 delete_meta_distribution(mtdist);
526 }
527 }
528
529 /* Find charset distribution */
530 if (mr->charsetfile) {
531 HTList * charsetdist = charset_distribution(array);
532 if (charsetdist) {
533 if (SHOW_REAL_QUIET(mr))
534 HTPrint("\tLogged charset distribution in file `%s\'\n",
535 mr->charsetfile);
536 log_meta_distribution(mr->charsetfile, charsetdist);
537 delete_meta_distribution(charsetdist);
538 }
539 }
540
541 /* Add as may other stats here as you like */
542 /* ... */
543
544 /* Delete the array */
545 HTArray_delete(array);
546 }
547 }
548 return YES;
549 }
550
551 PRIVATE HTParentAnchor *
552 get_last_parent(HTParentAnchor *anchor)
553 {
554 HTAnchor *anc;
555 HTList *sources = anchor->sources;
556
557 while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL)
558 {
559 HTParentAnchor *panchor = HTAnchor_parent(anc);
560 return panchor;
561 }
562 return NULL;
563 }
564
565 PRIVATE HTLink *
566 HTLink_find_type(HTAnchor * src, HTAnchor * dest, char *linktype)
567 {
568 if(src && dest && linktype)
569 {
570 HTLink * link = HTAnchor_mainLink(src);
571 HTList * sublinks = HTAnchor_subLinks(src);
572 HTLinkType type = (HTLinkType)HTAtom_caseFor(linktype);
573 HTAnchor *sdest = HTLink_destination(link);
574 if (link && sdest == dest && type == HTLink_type(link))
575 return link;
576 else if (sublinks) {
577 while ((link = (HTLink *) HTList_nextObject (sublinks))) {
578 sdest = HTLink_destination(link);
579 if (sdest == dest && HTLink_type(link) == type)
580 return link;
581
582 }
583 }
584 }
585 return NULL;
586 }
587
588 PRIVATE void
589 update_incoming_links(HTParentAnchor *anchor, HTParentAnchor *nanchor)
590 {
591 if(anchor && nanchor) {
592 HTAnchor *anc;
593 HTList *sources = anchor->sources;
594 while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) {
595 HTParentAnchor *panchor = HTAnchor_parent(anc);
596 if((HTLink_find((HTAnchor *)panchor,(HTAnchor *)anchor)) &&
597 (!HTLink_find_type((HTAnchor *)panchor,
598 (HTAnchor *)nanchor,"redirection"))) {
599 HTLink_add((HTAnchor *)panchor,(HTAnchor *)nanchor,
600 (HTLinkType) HTAtom_caseFor("redirection"),
601 METHOD_HEAD);
602 }
603 }
604 }
605 }
606
607 PRIVATE void
608 update_hyperdoc(HyperDoc *hd,HTRequest *request)
609 {
610 if(hd && request) {
611 HTParentAnchor *anchor = hd->anchor;
612 HTParentAnchor *nanchor = HTRequest_anchor(request);
613 HTParentAnchor *parent = HTRequest_parent(request);
614 HyperDoc *nhd = HTAnchor_document(nanchor);
615
616 char *tit = (char *) HTAnchor_title(nanchor);
617
618 if(nhd && tit)
619 StrAllocCopy(nhd->title,tit);
620
621 if (anchor != nanchor) {
622 if(nhd) { /* The redirected anchor has a Hyperdoc */
623 if(nhd != hd) {
624 hd->code = REDIR_CODE;
625
626 HTAnchor_setDocument(anchor,(void *)nhd);
627
628 if(!HTLink_find_type((HTAnchor *)parent,
629 (HTAnchor *)nanchor,"redirection")) {
630 HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor,
631 (HTLinkType) HTAtom_caseFor("redirection"),
632 METHOD_HEAD);
633 }
634 }
635 } else { /* The redirected anchor does not have a Hyperdoc */
636 hd->anchor = nanchor;
637 HTAnchor_setDocument(nanchor,(void *) hd);
638
639 if(!HTLink_find_type((HTAnchor *)parent,(HTAnchor *)nanchor,
640 "redirection")) {
641 HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor,
642 (HTLinkType) HTAtom_caseFor("redirection") ,
643 METHOD_HEAD);
644 }
645 }
646 update_incoming_links(anchor,nanchor);
647 }
648 }
649 }
650
651 PRIVATE void
652 set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
653 {
654 HTList * cur = HTRequest_error(request);
655 HTError *pres;
656 Finger * finger = (Finger *) HTRequest_context(request);
657 Robot * mr = finger->robot;
658
659 while((pres = (HTError *) HTList_nextObject(cur)) != NULL) {
660 int code =HTErrors[HTError_index(pres)].code;
661
662 hd->code = code;
663
664 if((mr->flags & MR_REDIR) && code >= 200 && code < 300 )
665 update_hyperdoc(hd,request);
666 }
667 }
668
669 #if 0
670 PRIVATE int
671 test_for_blank_spaces(char *uri)
672 {
673 char *ptr = uri;
674 for(;*ptr!='\0';ptr++)
675 if(*ptr == ' ')
676 return 1;
677 return 0;
678 }
679 #endif
680
681 /* Create a Command Line Object
682 ** ----------------------------
683 */
684 PUBLIC Robot * Robot_new (void)
685 {
686 Robot * me;
687 if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
688 HT_OUTOFMEM("Robot_new");
689 me->hyperdoc = HTList_new();
690 me->htext = HTList_new();
691 me->timer = DEFAULT_TIMEOUT*MILLIES;
692 me->waits = 0;
693 me->cwd = HTGetCurrentDirectoryURL();
694 me->output = OUTPUT;
695 me->cnt = 0;
696 me->ndoc = -1;
697 me->fingers = HTList_new();
698
699 /* This is new */
700 me->queue = HTQueue_new();
701 me->cq = 0;
702 me->furl = NULL;
703
704 return me;
705 }
706
707 /* Delete a Command Line Object
708 ** ----------------------------
709 */
710 PRIVATE BOOL Robot_delete (Robot * mr)
711 {
712 if (mr) {
713 HTList_delete(mr->fingers);
714
715 /* Calculate statistics */
716 calculate_statistics(mr);
717
718 if (mr->hyperdoc) {
719 HTList * cur = mr->hyperdoc;
720 HyperDoc * pres;
721 while ((pres = (HyperDoc *) HTList_nextObject(cur)))
722 HyperDoc_delete(pres);
723 HTList_delete(mr->hyperdoc);
724 }
725 if (mr->htext) {
726 HTList * cur = mr->htext;
727 HText * pres;
728 while ((pres = (HText *) HTList_nextObject(cur)))
729 RHText_delete(pres);
730 HTList_delete(mr->htext);
731 }
732
733 /* Close all the log files */
734 if (mr->flags & MR_LOGGING) {
735 if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n");
736 }
737
738 if (mr->log) {
739 if (SHOW_REAL_QUIET(mr))
740 HTPrint("\tLogged %5d entries in general log file `%s\'\n",
741 HTLog_accessCount(mr->log), mr->logfile);
742 HTLog_close(mr->log);
743 }
744 if (mr->ref) {
745 if (SHOW_REAL_QUIET(mr))
746 HTPrint("\tLogged %5d entries in referer log file `%s\'\n",
747 HTLog_accessCount(mr->ref), mr->reffile);
748 HTLog_close(mr->ref);
749 }
750 if (mr->reject) {
751 if (SHOW_REAL_QUIET(mr))
752 HTPrint("\tLogged %5d entries in rejected log file `%s\'\n",
753 HTLog_accessCount(mr->reject), mr->rejectfile);
754 HTLog_close(mr->reject);
755 }
756 if (mr->notfound) {
757 if (SHOW_REAL_QUIET(mr))
758 HTPrint("\tLogged %5d entries in not found log file `%s\'\n",
759 HTLog_accessCount(mr->notfound), mr->notfoundfile);
760 HTLog_close(mr->notfound);
761 }
762 if (mr->conneg) {
763 if (SHOW_REAL_QUIET(mr))
764 HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n",
765 HTLog_accessCount(mr->conneg), mr->connegfile);
766 HTLog_close(mr->conneg);
767 }
768 if (mr->noalttag) {
769 if (SHOW_REAL_QUIET(mr))
770 HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n",
771 HTLog_accessCount(mr->noalttag), mr->noalttagfile);
772 HTLog_close(mr->noalttag);
773 }
774
775 if (mr->output && mr->output != STDOUT) fclose(mr->output);
776
777 if (mr->flags & MR_TIME) {
778 time_t local = time(NULL);
779 if (SHOW_REAL_QUIET(mr))
780 HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
781 }
782
783 /* This is new */
784 HT_FREE(mr->cdepth);
785 HT_FREE(mr->furl);
786
787 #ifdef HT_POSIX_REGEX
788 if (mr->include) {
789 regfree(mr->include);
790 HT_FREE(mr->include);
791 }
792 if (mr->exclude) {
793 regfree(mr->exclude);
794 HT_FREE(mr->exclude);
795 }
796 if (mr->exc_robot) {
797 regfree(mr->exc_robot);
798 HT_FREE(mr->exc_robot);
799 }
800 if (mr->check) {
801 regfree(mr->check);
802 HT_FREE(mr->check);
803 }
804 #endif
805
806 #ifdef HT_MYSQL
807 if (mr->sqllog) {
808 HTSQLLog_close(mr->sqllog);
809 mr->sqllog = NULL;
810 }
811 #endif
812
813 if (mr->queue) HTQueue_delete(mr->queue);
814 HT_FREE(mr->cwd);
815 HT_FREE(mr->prefix);
816 HT_FREE(mr->img_prefix);
817 HT_FREE(mr);
818 return YES;
819 }
820 return NO;
821 }
822
823 /*
824 ** This function creates a new finger object and initializes it with a new request
825 */
826 PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
827 {
828 Finger * me;
829 HTRequest * request = HTRequest_new();
830 if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
831 HT_OUTOFMEM("Finger_new");
832 me->robot = robot;
833 me->request = request;
834 me->dest = dest;
835 HTList_addObject(robot->fingers, (void *)me);
836
837 /* Set the context for this request */
838 HTRequest_setContext (request, me);
839
840 /* Check the various flags to customize the request */
841 if (robot->flags & MR_PREEMPTIVE)
842 HTRequest_setPreemptive(request, YES);
843 if (robot->flags & MR_VALIDATE)
844 HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
845 if (robot->flags & MR_END_VALIDATE)
846 HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
847
848 /* We wanna make sure that we are sending a Host header (default) */
849 HTRequest_addRqHd(request, HT_C_HOST);
850
851 /* Set the method for this request */
852 HTRequest_setMethod(request, method);
853 robot->cnt++;
854 return me;
855 }
856
857 PRIVATE int Finger_delete (Finger * me)
858 {
859 HTList_removeObject(me->robot->fingers, (void *)me);
860
861 /* Done with one more */
862 me->robot->cnt--;
863
864 /* See if we don't need to keep all the metadata around in the anchors */
865 if (!(me->robot->flags & MR_KEEP_META))
866 HTAnchor_clearHeader(HTRequest_anchor(me->request));
867
868 /*
869 ** If we are down at one request then flush the output buffer
870 */
871 if (me->request) {
872 if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
873 HTRequest_delete(me->request);
874 me->request = NULL;
875 }
876
877 /*
878 ** Delete the request and free myself
879 */
880 HT_FREE(me);
881 return YES;
882 }
883
884 PRIVATE BOOL check_constraints(Robot * mr, char *prefix, char *uri)
885 {
886 BOOL match = YES;
887 /* Check for prefix match */
888 if (prefix) {
889 match = HTStrMatch(prefix, uri) ? YES : NO;
890 }
891
892 #ifdef HT_POSIX_REGEX
893 /* Check for any regular expression */
894 if (match && mr->include) {
895 match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
896 }
897 if (match && mr->exc_robot) {
898 match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
899 }
900 if (match && mr->exclude) {
901 match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
902 }
903
904 #endif
905 return match;
906 }
907
908 /*
909 ** Cleanup and make sure we close all connections including the persistent
910 ** ones
911 */
912 PUBLIC void Cleanup (Robot * me, int status)
913 {
914 /*
915 ** First we clean up the robot itself and calculate the various
916 ** statistics. This can actually take some time as a lot of data
917 ** has to be manipulated
918 */
919 Robot_delete(me);
920
921 /*
922 ** Then we shut down libwww
923 */
924 HTProfile_delete();
925
926 #ifdef HT_MEMLOG
927 HTMemLog_close();
928 #endif
929
930 #ifdef VMS
931 exit(status ? status : 1);
932 #else
933 exit(status ? status : 0);
934 #endif
935 }
936
937 #ifdef HT_POSIX_REGEX
938 PRIVATE char * get_regerror (int errcode, regex_t * compiled)
939 {
940 size_t length = regerror (errcode, compiled, NULL, 0);
941 char * str = NULL;
942 if ((str = (char *) HT_MALLOC(length+1)) == NULL)
943 HT_OUTOFMEM("get_regerror");
944 (void) regerror (errcode, compiled, str, length);
945 return str;
946 }
947
948 PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
949 {
950 regex_t * regex = NULL;
951 if (regex_str && *regex_str) {
952 int status;
953 if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
954 HT_OUTOFMEM("get_regtype");
955 if ((status = regcomp(regex, regex_str, cflags))) {
956 char * err_msg = get_regerror(status, regex);
957 if (SHOW_REAL_QUIET(mr))
958 HTPrint("Regular expression error: %s\n", err_msg);
959 HT_FREE(err_msg);
960 Cleanup(mr, -1);
961 }
962 }
963 return regex;
964 }
965 #endif
966
967 PUBLIC void VersionInfo (void)
968 {
969 HTPrint("\nW3C OpenSource Software");
970 HTPrint("\n-----------------------\n\n");
971 HTPrint("\tWebbot version %s\n", APP_VERSION);
972 HTPrint("\tusing the W3C libwww library version %s.\n\n",HTLib_version());
973 HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE);
974 HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
975 HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
976 HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
977 HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
978 }
979
980 /* redirection_handler
981 ** -------------------
982 ** If we are set up to handle redirections then handle it here.
983 */
984 PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,
985 void * param, int status)
986 {
987 Finger * finger = (Finger *) HTRequest_context(request);
988 Robot * mr = finger->robot;
989 HTParentAnchor * me = HTRequest_anchor(request);
990 HTAnchor * redirection = HTResponse_redirection(response);
991 HTParentAnchor * redirection_parent = HTAnchor_parent(redirection);
992 HyperDoc * redirection_hd = HTAnchor_document(redirection_parent);
993 char * uri = NULL;
994 char * redirection_parent_addr = NULL;
995 BOOL match = YES;
996 BOOL check = NO;
997
998 /* In case we didn't get any redirection destination */
999 if (!redirection) return HT_OK;
1000
1001 /* Get the addresses */
1002 uri = HTAnchor_address((HTAnchor *) me);
1003 redirection_parent_addr = HTAnchor_address((HTAnchor *) redirection_parent);
1004 if (SHOW_QUIET(mr))
1005 HTPrint("Robot....... Checking redirecting from `%s\' to `%s\'\n",
1006 uri, redirection_parent_addr);
1007
1008 /* Log the event */
1009 #ifdef HT_MYSQL
1010 if (mr->sqllog && redirection_parent_addr)
1011 HTSQLLog_addLinkRelationship(mr->sqllog, redirection_parent_addr,
1012 uri, "redirection", NULL);
1013 #endif
1014
1015 /* Check our constraints matcher */
1016 match = check_constraints(mr,mr->prefix, redirection_parent_addr);
1017
1018 #ifdef HT_POSIX_REGEX
1019 /* See if we should do a HEAD or a GET on this URI */
1020 if (match && mr->check) {
1021 check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1022 }
1023 #endif
1024
1025 /*
1026 ** If we already have a HyperDoc for the redirected anchor
1027 ** then update it
1028 */
1029 if (match) {
1030 if ((redirection_hd = HTAnchor_document(redirection_parent)) != NULL) {
1031 if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1032 redirection_hd->hits++;
1033 HT_FREE(redirection_parent_addr);
1034 HT_FREE(uri);
1035 return HT_OK;
1036 }
1037
1038 /* Now call the default libwww handler for actually carrying it out */
1039 if (mr->redir_code==0 || mr->redir_code==status) {
1040 HyperDoc * me_hd = HTAnchor_document(me);
1041 HyperDoc *nhd = HyperDoc_new(mr, redirection_parent, me_hd->depth);
1042
1043 if(mr->flags & MR_BFS) {
1044 nhd->method = METHOD_HEAD;
1045 HTQueue_append(mr->queue, (void *) nhd);
1046 (mr->cq)++;
1047 }
1048
1049 if (check) {
1050 if (SHOW_QUIET(mr)) HTPrint("Checking redirection using HEAD\n");
1051 HTRequest_setMethod(request, METHOD_HEAD);
1052 }
1053 HT_FREE(redirection_parent_addr);
1054 HT_FREE(uri);
1055 return HTRedirectFilter(request, response, param, status);
1056 }
1057 } else {
1058 if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1059 #ifdef HT_MYSQL
1060 if (mr->reject || mr->sqllog)
1061 #else
1062 if (mr->reject)
1063 #endif
1064 {
1065 if (mr->reject && redirection_parent_addr)
1066 HTLog_addText(mr->reject, "%s --> %s\n", redirection_parent_addr, uri);
1067 }
1068 }
1069
1070 /* Just fall through */
1071 HT_FREE(redirection_parent_addr);
1072 HT_FREE(uri);
1073 return HT_OK;
1074 }
1075
1076 /* terminate_handler
1077 ** -----------------
1078 ** This function is registered to handle the result of the request.
1079 ** If no more requests are pending then terminate program
1080 */
1081 PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
1082 void * param, int status)
1083 {
1084 Finger * finger = (Finger *) HTRequest_context(request);
1085 Robot * mr = finger->robot;
1086 if (SHOW_QUIET(mr)) HTPrint("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1087
1088 #ifdef HT_MYSQL
1089 if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1090 #endif
1091
1092 /* Check if negotiated resource and whether we should log that*/
1093 if (mr->conneg) {
1094 HTAssocList * cur = HTResponse_variant(response);
1095 if (cur) {
1096 BOOL first = YES;
1097 HTChunk * buffer = HTChunk_new(128);
1098 char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1099 HTAssoc * pres;
1100 HTChunk_puts(buffer, uri);
1101 while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1102 char * value = HTAssoc_value(pres);
1103 if (first) {
1104 HTChunk_puts(buffer, "\t(");
1105 first = NO;
1106 } else
1107 HTChunk_puts(buffer, ", ");
1108
1109 /* Output the name */
1110 HTChunk_puts(buffer, HTAssoc_name(pres));
1111
1112 /* Only output the value if not empty string */
1113 if (value && *value) {
1114 HTChunk_puts(buffer, "=");
1115 HTChunk_puts(buffer, value);
1116 }
1117 }
1118 if (!first) HTChunk_puts(buffer, ")");
1119 HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1120 HTChunk_delete(buffer);
1121 HT_FREE(uri);
1122 }
1123 }
1124
1125 /* Count the amount of body data that we have read */
1126 if (HTRequest_method(request) == METHOD_GET) {
1127 int length = HTAnchor_length(HTRequest_anchor(request));
1128 if (length > 0) mr->get_bytes += length;
1129 mr->get_docs++;
1130 } else if (HTRequest_method(request) == METHOD_HEAD) {
1131 int length = HTAnchor_length(HTRequest_anchor(request));
1132 if (length > 0) mr->head_bytes += length;
1133 mr->head_docs++;
1134 } else {
1135 mr->other_docs++;
1136 }
1137
1138 if (!(mr->flags & MR_BFS)) {
1139
1140 #if 0
1141 HyperDoc * hd = HTAnchor_document(finger->dest);
1142 if (hd) set_error_state_hyperdoc(hd,request);
1143 #endif
1144
1145 /* Delete this thread */
1146 Finger_delete(finger);
1147
1148 /* Should we stop? */
1149 if (mr->cnt <= 0) {
1150 if (SHOW_QUIET(mr)) HTPrint(" Everything is finished...\n");
1151 Cleanup(mr, 0); /* No way back from here */
1152 }
1153 }
1154
1155 if (SHOW_QUIET(mr)) HTPrint(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1156 return HT_OK;
1157
1158 }
1159
1160 PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response,
1161 void * param, int status)
1162 {
1163 Finger * finger = (Finger *) HTRequest_context(request);
1164 Robot * mr = finger->robot;
1165 HTParentAnchor * dest = finger->dest;
1166 HyperDoc * hd = HTAnchor_document(dest);
1167 int depth = (hd ? hd->depth : -1);
1168
1169 if (hd) set_error_state_hyperdoc(hd,request);
1170
1171 if(hd && (HTRequest_method(request)== METHOD_HEAD) &&
1172 (depth < mr->depth))
1173 {
1174 hd->method = METHOD_GET;
1175 HTQueue_append(mr->queue, (void *)hd); (mr->cq)++;
1176 }
1177
1178 Finger_delete(finger);
1179
1180 if(!(mr->flags & MR_PREEMPTIVE))
1181 Serving_queue(mr);
1182
1183 return HT_OK;
1184 }
1185
1186 PUBLIC void Serving_queue(Robot *mr)
1187 {
1188 BOOL abort = NO;
1189 Finger *nfinger;
1190
1191 while(!abort)
1192 {
1193 if(!HTQueue_isEmpty(mr->queue))
1194 {
1195 HTRequest *newreq;
1196
1197 HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue);
1198
1199 if(nhd)
1200 {
1201 char *uri = HTAnchor_address((HTAnchor *)nhd->anchor);
1202 HTQueue_dequeue(mr->queue); (mr->cq)--;
1203
1204 nfinger = Finger_new(mr, nhd->anchor, nhd->method);
1205
1206 newreq = nfinger->request;
1207
1208 if(SHOW_QUIET(mr)) HTPrint("Request from QUEUE %s\n",uri);
1209 HT_FREE(uri);
1210 if(SHOW_QUIET(mr)) HTPrint("%d elements in queue \n", mr->cq);
1211
1212 HTRequest_setParent(newreq,get_last_parent(nhd->anchor));
1213
1214 /* @@@ Should be done using a timer and not sleep! @@@ */
1215 #if 0
1216 if(mr->waits)
1217 sleep(mr->waits);
1218 #endif
1219
1220 if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES)
1221 {
1222 if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
1223 Finger_delete(nfinger);
1224 }
1225 }
1226 else
1227 abort = YES;
1228 }
1229 else
1230 abort = YES;
1231 }
1232
1233 if(SHOW_QUIET(mr)) HTPrint("Queue size: %d \n", mr->cq);
1234
1235 if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE)))
1236 {
1237 if(mr->cnt > 0)
1238 if(SHOW_QUIET(mr)) HTPrint("%d requests were not served\n", mr->cnt);
1239
1240 if (SHOW_QUIET(mr)) HTPrint(" Everything is finished...\n");
1241 Cleanup(mr, 0); /* No way back from here */
1242 }
1243 }
1244
1245 /* ------------------------------------------------------------------------- */
1246 /* HTEXT INTERFACE */
1247 /* ------------------------------------------------------------------------- */
1248
1249 PUBLIC BOOL Robot_registerHTMLParser (void)
1250 {
1251 HText_registerCDCallback(RHText_new, RHText_delete);
1252 HText_registerLinkCallback(RHText_foundLink);
1253 return YES;
1254 }
1255
1256 PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,
1257 HTStream * stream)
1258 {
1259 HText * me;
1260 Finger * finger = (Finger *) HTRequest_context(request);
1261 Robot * mr = finger->robot;
1262 char * robots = NULL;
1263
1264 if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1265 HT_OUTOFMEM("RHText_new");
1266
1267 /* Bind the HText object together with the Request Object */
1268 me->request = request;
1269 me->follow = YES;
1270
1271 /* Check to see if we have any meta tags */
1272 if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) {
1273 char * strval = NULL;
1274 char * ptr = NULL;
1275 char * token = NULL;
1276 StrAllocCopy(strval, robots);
1277 ptr = strval;
1278 while ((token = HTNextField(&ptr)) != NULL) {
1279 if (!strcasecomp(token, "nofollow")) {
1280 me->follow = NO;
1281 break;
1282 }
1283 }
1284 HT_FREE(strval);
1285 }
1286
1287 /* Add this HyperDoc object to our list */
1288 if (!mr->htext) mr->htext = HTList_new();
1289 HTList_addObject(mr->htext, (void *) me);
1290 return me;
1291 }
1292
1293 PRIVATE BOOL RHText_delete (HText * me) {
1294 if (me) {
1295 HT_FREE(me);
1296 return YES;
1297 }
1298 return NO;
1299 }
1300
1301 PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor)
1302 {
1303 if (text && anchor) {
1304 Finger * finger = (Finger *) HTRequest_context(text->request);
1305 Robot * mr = finger->robot;
1306 HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1307 HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1308 char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1309 HyperDoc * hd = HTAnchor_document(dest_parent);
1310 HTParentAnchor * referer = HTRequest_anchor(text->request);
1311 BOOL match = text->follow;
1312 BOOL check = NO;
1313
1314 /* These are new variables */
1315 HyperDoc * nhd = NULL;
1316 BOOL follow = YES;
1317
1318 /* These three variables were moved */
1319 /*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
1320 HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
1321 HyperDoc * last_doc = HTAnchor_document(last_anchor);
1322 int depth = last_doc ? last_doc->depth+1 : 0;
1323
1324 if (!uri) return;
1325 if (SHOW_QUIET(mr)) HTPrint("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");
1326
1327 if (hd) {
1328 if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1329 hd->hits++;
1330 #ifdef HT_MYSQL
1331 if (mr->sqllog) {
1332 char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1333 if (ref_addr) {
1334 HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1335 "referer", NULL);
1336 HT_FREE(ref_addr);
1337 }
1338 }
1339 #endif
1340 HT_FREE(uri);
1341 return;
1342 }
1343
1344 /* Check our constraints matcher */
1345 match = check_constraints(mr,mr->prefix, uri);
1346
1347 #ifdef HT_POSIX_REGEX
1348 /* See if we should do a HEAD or a GET on this URI */
1349 if (match && mr->check) {
1350 check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1351 }
1352 #endif
1353
1354 #if 0
1355 /* This is already checked in HTParse.c */
1356 if(uri && test_for_blank_spaces(uri))
1357 follow = NO;
1358 else
1359 #endif
1360 if (mr->ndoc == 0) /* Number of Documents is reached */
1361 follow = NO;
1362
1363 /* Test whether we already have a hyperdoc for this document */
1364 if (!hd && dest_parent) {
1365 nhd = HyperDoc_new(mr, dest_parent, depth);
1366 mr->cdepth[depth]++;
1367 }
1368
1369 /* Test whether we already have a hyperdoc for this document */
1370 if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
1371 if (mr->flags & MR_BFS) {
1372 nhd->method = METHOD_HEAD;
1373 HTQueue_enqueue(mr->queue, (void *) nhd);
1374 (mr->cq)++;
1375 if(mr->ndoc > 0) mr->ndoc--;
1376 } else {
1377 Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1378 HTRequest * newreq = newfinger->request;
1379 HTRequest_setParent(newreq, referer);
1380 nhd->method = METHOD_GET;
1381
1382 if (check || depth >= mr->depth) {
1383 if (SHOW_QUIET(mr)) HTPrint("loading at depth %d using HEAD\n", depth);
1384 HTRequest_setMethod(newreq, METHOD_HEAD);
1385 nhd->method = METHOD_HEAD;
1386
1387 } else {
1388 if (SHOW_QUIET(mr)) HTPrint("loading at depth %d\n", depth);
1389 }
1390 if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1391 if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
1392 Finger_delete(newfinger);
1393 }
1394 }
1395
1396 } else {
1397 if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1398 #ifdef HT_MYSQL
1399 if (mr->reject || mr->sqllog) {
1400 #else
1401 if (mr->reject) {
1402 #endif
1403 if (referer) {
1404 char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1405 if (mr->reject && ref_addr)
1406 HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1407 #ifdef HT_MYSQL
1408 if (mr->sqllog && mr->sqlexternals && ref_addr)
1409 HTSQLLog_addLinkRelationship(mr->sqllog,
1410 ref_addr, uri,
1411 "referer", NULL);
1412 #endif
1413
1414 HT_FREE(ref_addr);
1415 }
1416 }
1417 }
1418 HT_FREE(uri);
1419 }
1420 }
1421
1422 PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,
1423 const char *alt, const char * align, BOOL isMap)
1424 {
1425 if (text && anchor) {
1426 Finger * finger = (Finger *) HTRequest_context(text->request);
1427 Robot * mr = finger->robot;
1428
1429 if (mr->flags & MR_IMG) {
1430 HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1431 HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1432 char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1433 HyperDoc * hd = HTAnchor_document(dest_parent);
1434 HTParentAnchor * referer = HTRequest_anchor(text->request);
1435 BOOL match = YES;
1436
1437 if (!uri) return;
1438 if (hd) {
1439 if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1440 hd->hits++;
1441 #ifdef HT_MYSQL
1442 if (mr->sqllog) {
1443 char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1444 if (ref_addr) {
1445 HTSQLLog_addLinkRelationship(mr->sqllog,
1446 ref_addr, uri,
1447 "image", alt);
1448 HT_FREE(ref_addr);
1449 }
1450 }
1451 #endif
1452 HT_FREE(uri);
1453 return;
1454 }
1455
1456 /* Check our constraints matcher */
1457 match = check_constraints(mr, mr->img_prefix, uri);
1458
1459 /* Test whether we already have a hyperdoc for this document */
1460 if (match && dest) {
1461 Finger * newfinger = Finger_new(mr, dest_parent,
1462 mr->flags & MR_SAVE ?
1463 METHOD_GET : METHOD_HEAD);
1464 HTRequest * newreq = newfinger->request;
1465 HyperDoc_new(mr, dest_parent, 1);
1466 HTRequest_setParent(newreq, referer);
1467
1468 /* Check whether we should report missing ALT tags */
1469 if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1470 if (referer) {
1471 char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1472 if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1473 HT_FREE(ref_addr);
1474 }
1475 }
1476
1477 if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking Image `%s\'\n", uri);
1478 if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1479 if (SHOW_QUIET(mr)) HTPrint("Robot....... Image not tested!\n");
1480 Finger_delete(newfinger);
1481 }
1482 } else {
1483 if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1484 #ifdef HT_MYSQL
1485 if (mr->reject || mr->sqllog) {
1486 #else
1487 if (mr->reject) {
1488 #endif
1489 if (referer) {
1490 char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1491 if (mr->reject && ref_addr)
1492 HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1493 #ifdef HT_MYSQL
1494 if (mr->sqllog && mr->sqlexternals && ref_addr)
1495 HTSQLLog_addLinkRelationship(mr->sqllog,
1496 ref_addr, uri,
1497 "image", alt);
1498 #endif
1499
1500 HT_FREE(ref_addr);
1501 }
1502 }
1503 }
1504 HT_FREE(uri);
1505 }
1506 }
1507 }
1508
1509 PRIVATE void RHText_foundLink (HText * text,
1510 int element_number, int attribute_number,
1511 HTChildAnchor * anchor,
1512 const BOOL * present, const char ** value)
1513 {
1514 if (text && anchor) {
1515 Finger * finger = (Finger *) HTRequest_context(text->request);
1516 Robot * mr = finger->robot;
1517 if (SHOW_QUIET(mr))
1518 HTPrint("Robot....... Received element %d, attribute %d with anchor %p\n",
1519 element_number, attribute_number, anchor);
1520 if ((element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) ||
1521 (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND) ||
1522 (element_number==HTML_INPUT && attribute_number==HTML_INPUT_SRC))
1523 RHText_foundImage(text, anchor, NULL, NULL, NO);
1524 else
1525 RHText_foundAnchor(text, anchor);
1526 }
1527 }
1528
1529 PUBLIC char * get_robots_txt(char * uri)
1530 {
1531 char *str = NULL;
1532 HTChunk * chunk;
1533 HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
1534 HTRequest *request = HTRequest_new();
1535 HTRequest_setOutputFormat(request, WWW_SOURCE);
1536 HTRequest_setPreemptive(request, YES);
1537 HTRequest_setMethod(request, METHOD_GET);
1538 chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
1539 str = HTChunk_toCString(chunk);
1540 HTRequest_delete(request);
1541 return str;
1542 }
1543
1544
1545