1 /*
2 **	@(#) $Id$
3 **
4 **	W3C Webbot can be found at "http://www.w3.org/Robot/"
5 **
6 **	Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
7 **	Institute of Technology, Institut National de Recherche en
8 **	Informatique et en Automatique, Keio University). All Rights
9 **	Reserved. This program is distributed under the W3C's Software
10 **	Intellectual Property License. This program is distributed in the hope
11 **	that it will be useful, but WITHOUT ANY WARRANTY; without even the
12 **	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13 **	PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14 **	details.
15 **
16 **  Authors:
17 **	HFN		Henrik Frystyk Nielsen, (frystyk@w3.org)
18 **	BR		Bob Racko
19 **	JP		John Punin
20 **
21 **  History:
22 **	Dec 04 95	First version
23 **	Oct 1998	Split into separate files
24 */
25 
26 #include "HTRobMan.h"
27 #include "HTQueue.h"
28 #include "HTAncMan.h"
29 
/*
**  Verbosity predicates. Each one is true when a robot `mr' is given and
**  the corresponding quiet flag is NOT set in its flags.
*/
#define SHOW_QUIET(mr)		((mr) != NULL && ((mr)->flags & MR_QUIET) == 0)
#define SHOW_REAL_QUIET(mr)	((mr) != NULL && ((mr)->flags & MR_REAL_QUIET) == 0)
32 
/*
**  Table of libwww error messages, filled from the English initializer.
**  set_error_state_hyperdoc() indexes it with HTError_index() to obtain the
**  code of an error.
*/
PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};
34 
35 /*
36 **  Some sorting algorithms
37 */
38 PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
39 
40 /*
41 **  The callbacks that we need from the libwww HTML parser
42 */
43 PRIVATE HText_new	RHText_new;
44 PRIVATE HText_delete	RHText_delete;
45 PRIVATE HText_foundLink	RHText_foundLink;
46 
47 /* ------------------------------------------------------------------------- */
48 
49 /*	Create a "HyperDoc" object
50 **	--------------------------
51 **	A HyperDoc object contains information about whether we have already
52 **	started checking the anchor and the depth in our search
53 */
HyperDoc_new(Robot * mr,HTParentAnchor * anchor,int depth)54 PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
55 {
56     HyperDoc * hd;
57     if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
58 	HT_OUTOFMEM("HyperDoc_new");
59     hd->depth = depth;
60     hd->hits = 1;
61 
62     hd->code = NO_CODE;
63     hd->index = ++mr->cindex;
64 
65     /* Bind the HyperDoc object together with the Anchor Object */
66     hd->anchor = anchor;
67     HTAnchor_setDocument(anchor, (void *) hd);
68 
69     /* Add this HyperDoc object to our list */
70     if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
71     HTList_addObject(mr->hyperdoc, (void *) hd);
72     return hd;
73 }
74 
75 /*	Delete a "HyperDoc" object
76 **	--------------------------
77 */
HyperDoc_delete(HyperDoc * hd)78 PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
79 {
80     if (hd) {
81 	HT_FREE (hd);
82 	return YES;
83     }
84     return NO;
85 }
86 
87 /*
88 **  Sort the anchor array and log reference count
89 */
calculate_hits(Robot * mr,HTArray * array)90 PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
91 {
92     if (mr && array) {
93         HTLog * log = HTLog_open(mr->hitfile, YES, YES);
94         if (log) {
95             void ** data = NULL;
96             HTParentAnchor * anchor = NULL;
97             HTArray_sort(array, HitSort);
98             anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
99 	    while (anchor) {
100                 char * uri = HTAnchor_address((HTAnchor *) anchor);
101                 HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
102                 if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
103                 HT_FREE(uri);
104                 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
105             }
106 	}
107         HTLog_close(log);
108         return YES;
109     }
110     return NO;
111 }
112 
HitSort(const void * a,const void * b)113 PRIVATE int HitSort (const void * a, const void * b)
114 {
115     HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
116     HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
117     if (aa && bb) return (bb->hits - aa->hits);
118     return bb - aa;
119 }
120 
121 /*
122 **  Sort the anchor array and log link relations
123 */
calculate_linkRelations(Robot * mr,HTArray * array)124 PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
125 {
126     if (mr && array) {
127         HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
128 	void ** data = NULL;
129 	HTParentAnchor * anchor = NULL;
130 	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
131 	while (anchor) {
132 
133 	    /*
134 	    **  If we have a specific link relation to look for then do this.
135 	    **  Otherwise look for all link relations.
136 	    */
137 	    if (mr->relation) {
138 		HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
139 		if (link) {
140 		    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
141 		    char * src_uri = HTAnchor_address((HTAnchor *) anchor);
142 		    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
143 		    if (src_uri && dest_uri) {
144 #ifdef HT_MYSQL
145 			if (mr->sqllog) {
146 			    HTSQLLog_addLinkRelationship (mr->sqllog,
147 							  src_uri, dest_uri,
148 							  HTAtom_name(mr->relation),
149 							  NULL);
150 			}
151 #endif
152 			if (log) {
153 			    HTFormat format = HTAnchor_format(dest);
154 			    HTLog_addText(log, "%s %s %s --> %s\n",
155 					  HTAtom_name(mr->relation),
156 					  format != WWW_UNKNOWN ?
157 					  HTAtom_name(format) : "<unknown>",
158 					  src_uri, dest_uri);
159 			}
160 
161 			/* Cleanup */
162 			HT_FREE(src_uri);
163 			HT_FREE(dest_uri);
164 		    }
165 		}
166 	    } else {
167 		HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
168 		HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
169 		char * src_uri = HTAnchor_address((HTAnchor *) anchor);
170 		HTLinkType linktype;
171 
172 		/* First look in the main link */
173 		if (link && (linktype = HTLink_type(link))) {
174 		    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
175 		    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
176 		    if (src_uri && dest_uri) {
177 #ifdef HT_MYSQL
178 			if (mr->sqllog) {
179 			    HTSQLLog_addLinkRelationship (mr->sqllog,
180 							  src_uri, dest_uri,
181 							  HTAtom_name(linktype),
182 							  NULL);
183 			}
184 #endif
185 			if (log) {
186 			    HTFormat format = HTAnchor_format(dest);
187 			    HTLog_addText(log, "%s %s %s --> %s\n",
188 					  HTAtom_name(linktype),
189 					  format != WWW_UNKNOWN ?
190 					  HTAtom_name(format) : "<unknown>",
191 					  src_uri, dest_uri);
192 			}
193 		    }
194 		    HT_FREE(dest_uri);
195 		}
196 
197 		/* and then in any sublinks */
198 		if (sublinks) {
199 		    HTLink * pres;
200 		    while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
201 			if ((linktype = HTLink_type(pres))) {
202 			    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
203 			    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
204 			    if (src_uri && dest_uri) {
205 #ifdef HT_MYSQL
206 				if (mr->sqllog) {
207 				    HTSQLLog_addLinkRelationship (mr->sqllog,
208 								  src_uri, dest_uri,
209 								  HTAtom_name(linktype),
210 								  NULL);
211 				}
212 #endif
213 				if (log) {
214 				    HTFormat format = HTAnchor_format(dest);
215 				    HTLog_addText(log, "%s %s %s --> %s\n",
216 						  HTAtom_name(linktype),
217 						  format != WWW_UNKNOWN ?
218 						  HTAtom_name(format) : "<unknown>",
219 						  src_uri, dest_uri);
220 				}
221 				HT_FREE(dest_uri);
222 			    }
223 			}
224 		    }
225 		}
226 
227 		/* Cleanup */
228 		HT_FREE(src_uri);
229 	    }
230 	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
231 	}
232         if (log) HTLog_close(log);
233         return YES;
234     }
235     return NO;
236 }
237 
238 /*
239 **  Sort the anchor array and log last modified date
240 */
calculate_lm(Robot * mr,HTArray * array)241 PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
242 {
243     if (mr && array) {
244         HTLog * log = HTLog_open(mr->lmfile, YES, YES);
245         if (log) {
246             void ** data = NULL;
247             HTParentAnchor * anchor = NULL;
248             HTArray_sort(array, LastModifiedSort);
249             anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
250 	    while (anchor) {
251                 char * uri = HTAnchor_address((HTAnchor *) anchor);
252                 time_t lm = HTAnchor_lastModified(anchor);
253                 if (uri && lm > 0)
254 		    HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
255                 HT_FREE(uri);
256                 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
257             }
258 	}
259         HTLog_close(log);
260         return YES;
261     }
262     return NO;
263 }
264 
LastModifiedSort(const void * a,const void * b)265 PRIVATE int LastModifiedSort (const void * a, const void * b)
266 {
267     time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
268     time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
269     return bb - aa;
270 }
271 
272 /*
273 **  Sort the anchor array and log the document title
274 */
calculate_title(Robot * mr,HTArray * array)275 PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
276 {
277     if (mr && array) {
278         HTLog * log = HTLog_open(mr->titlefile, YES, YES);
279         if (log) {
280             void ** data = NULL;
281             HTParentAnchor * anchor = NULL;
282             HTArray_sort(array, TitleSort);
283             anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
284 	    while (anchor) {
285                 char * uri = HTAnchor_address((HTAnchor *) anchor);
286                 const char * title = HTAnchor_title(anchor);
287 		HTCharset charset = HTAnchor_charset(anchor);
288                 if (uri) HTLog_addText(log, "%s `%s\' %s\n",
289 				       charset ? HTAtom_name(charset) : "<none>",
290 				       title ? title : "<none>",
291 				       uri);
292                 HT_FREE(uri);
293                 anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
294             }
295 	}
296         HTLog_close(log);
297         return YES;
298     }
299     return NO;
300 }
301 
TitleSort(const void * a,const void * b)302 PRIVATE int TitleSort (const void * a, const void * b)
303 {
304     const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
305     const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
306     return strcasecomp(bb?bb:"", aa?aa:"");
307 }
308 
309 /*
310 **  Calculate distributions for media types. The same mechanism
311 **  can be used for other characteristics with relatively
312 **  few outcomes.
313 */
mediatype_distribution(HTArray * array)314 PRIVATE HTList * mediatype_distribution (HTArray * array)
315 {
316     if (array) {
317 	HTList * mt = HTList_new();
318 	MetaDist * pres = NULL;
319 	void ** data = NULL;
320 	HTParentAnchor * anchor = NULL;
321 	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
322 	while (anchor) {
323 	    HTFormat format = HTAnchor_format(anchor);
324 	    if (format && format != WWW_UNKNOWN) {
325 		HTList * cur = mt;
326 
327 		/* If found then increase counter */
328 		while ((pres = (MetaDist *) HTList_nextObject(cur))) {
329 		    if (pres->name == format) {
330 			pres->hits++;
331 			break;
332 		    }
333 		}
334 
335 		/* If not found then add new format to list */
336 		if (!pres) {
337                     if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
338         	         HT_OUTOFMEM("mediatype_distribution");
339 		    pres->name = format;
340 		    pres->hits = 1;
341 		    HTList_addObject(mt, pres);
342 		    HTList_insertionSort(mt, FormatSort);
343 		}
344 	    }
345 
346 	    /* Find next anchor in array */
347 	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
348 	}
349 	return mt;
350     }
351     return NULL;
352 }
353 
354 /*
355 **  Calculate distributions for charsets. The same mechanism
356 **  can be used for other characteristics with relatively
357 **  few outcomes.
358 */
charset_distribution(HTArray * array)359 PRIVATE HTList * charset_distribution (HTArray * array)
360 {
361     if (array) {
362 	HTList * cs = HTList_new();
363 	MetaDist * pres = NULL;
364 	void ** data = NULL;
365 	HTParentAnchor * anchor = NULL;
366 	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
367 	while (anchor) {
368 	    HTCharset charset = HTAnchor_charset(anchor);
369 	    if (charset) {
370 		HTList * cur = cs;
371 
372 		/* If found then increase counter */
373 		while ((pres = (MetaDist *) HTList_nextObject(cur))) {
374 		    if (pres->name == charset) {
375 			pres->hits++;
376 			break;
377 		    }
378 		}
379 
380 		/* If not found then add new format to list */
381 		if (!pres) {
382                     if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
383         	         HT_OUTOFMEM("charset_distribution");
384 		    pres->name = charset;
385 		    pres->hits = 1;
386 		    HTList_addObject(cs, pres);
387 		    HTList_insertionSort(cs, FormatSort);
388 		}
389 	    }
390 
391 	    /* Find next anchor in array */
392 	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
393 	}
394 	return cs;
395     }
396     return NULL;
397 }
398 
FormatSort(const void * a,const void * b)399 PRIVATE int FormatSort (const void * a, const void * b)
400 {
401     MetaDist * aa = (MetaDist *) a;
402     MetaDist * bb = (MetaDist *) b;
403     return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
404 }
405 
log_meta_distribution(const char * logfile,HTList * distribution)406 PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
407 {
408     if (logfile && distribution) {
409         HTLog * log = HTLog_open(logfile, YES, YES);
410 	if (log) {
411 	    HTList * cur = distribution;
412 	    MetaDist * pres;
413 	    while ((pres = (MetaDist *) HTList_nextObject(cur))) {
414 		if (pres->name) {
415 		    HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
416 		}
417 	    }
418 	    HTLog_close(log);
419 	}
420     }
421     return NO;
422 }
423 
delete_meta_distribution(HTList * distribution)424 PRIVATE BOOL delete_meta_distribution (HTList * distribution)
425 {
426     if (distribution) {
427 	HTList * cur = distribution;
428 	MetaDist * pres;
429 	while ((pres = (MetaDist *) HTList_nextObject(cur)))
430 	    HT_FREE(pres);
431 	HTList_delete(distribution);
432 	return YES;
433     }
434     return NO;
435 }
436 
437 
438 /*	Statistics
439 **	----------
440 **	Calculates a bunch of statistics for the anchors traversed
441 */
calculate_statistics(Robot * mr)442 PRIVATE BOOL calculate_statistics (Robot * mr)
443 {
444     long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
445     if (!mr) return NO;
446 
447     /* Calculate efficiency */
448     if (mr->time > 0) {
449 	ms_t t = HTGetTimeInMillis() - mr->time;
450 	if (t > 0) {
451 	    double loadfactor = (mr->get_bytes / (t * 0.001));
452 	    double reqprsec = (total_docs / (t * 0.001));
453 	    double secs = t / 1000.0;
454             char bytes[50];
455 	    if (SHOW_REAL_QUIET(mr))
456 		HTPrint("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
457 			total_docs, secs, reqprsec);
458 
459             HTNumToStr(mr->get_bytes, bytes, 50);
460 	    if (SHOW_REAL_QUIET(mr))
461 		HTPrint("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
462 			mr->get_docs, bytes, loadfactor);
463 
464             HTNumToStr(mr->head_bytes, bytes, 50);
465 	    if (SHOW_REAL_QUIET(mr))
466 		HTPrint("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
467 			mr->head_docs, bytes);
468 	}
469     }
470 
471     /* Create an array of existing anchors */
472     if (total_docs > 1) {
473 	HTArray * array = HTAnchor_getArray(total_docs);
474         if (array) {
475 
476 	    /* Distributions */
477 	    if (mr->flags & MR_DISTRIBUTIONS) {
478 		if (SHOW_REAL_QUIET(mr)) HTPrint("\nDistributions:\n");
479 	    }
480 
481             /* Sort after hit counts */
482             if (mr->hitfile) {
483 		if (SHOW_REAL_QUIET(mr))
484 		    HTPrint("\tLogged hit count distribution in file `%s\'\n",
485 			    mr->hitfile);
486 		calculate_hits(mr, array);
487 	    }
488 
489             /* Sort after link relations */
490 #ifdef HT_MYSQL
491             if (mr->relfile || mr->sqllog) {
492 #else
493             if (mr->relfile) {
494 #endif
495 		if (mr->relfile && SHOW_REAL_QUIET(mr))
496 		    HTPrint("\tLogged link relationship distribution in file `%s\'\n",
497 			    mr->relfile);
498 		calculate_linkRelations(mr, array);
499 	    }
500 
501             /* Sort after modified date */
502             if (mr->lmfile) {
503 		if (SHOW_REAL_QUIET(mr))
504 		    HTPrint("\tLogged last modified distribution in file `%s\'\n",
505 			    mr->lmfile);
506 		calculate_lm(mr, array);
507 	    }
508 
509             /* Sort after title */
510             if (mr->titlefile) {
511 		if (SHOW_REAL_QUIET(mr))
512 		    HTPrint("\tLogged title distribution in file `%s\'\n",
513 			    mr->titlefile);
514 		calculate_title(mr, array);
515 	    }
516 
517             /* Find mediatype distribution */
518 	    if (mr->mtfile) {
519 		HTList * mtdist = mediatype_distribution(array);
520 		if (mtdist) {
521 		    if (SHOW_REAL_QUIET(mr))
522 			HTPrint("\tLogged media type distribution in file `%s\'\n",
523 				mr->mtfile);
524 		    log_meta_distribution(mr->mtfile, mtdist);
525 		    delete_meta_distribution(mtdist);
526 		}
527 	    }
528 
529             /* Find charset distribution */
530 	    if (mr->charsetfile) {
531 		HTList * charsetdist = charset_distribution(array);
532 		if (charsetdist) {
533 		    if (SHOW_REAL_QUIET(mr))
534 			HTPrint("\tLogged charset distribution in file `%s\'\n",
535 				mr->charsetfile);
536 		    log_meta_distribution(mr->charsetfile, charsetdist);
537 		    delete_meta_distribution(charsetdist);
538 		}
539 	    }
540 
541             /* Add as may other stats here as you like */
542 	    /* ... */
543 
544 	    /* Delete the array */
545             HTArray_delete(array);
546         }
547     }
548     return YES;
549 }
550 
551 PRIVATE HTParentAnchor *
552 get_last_parent(HTParentAnchor *anchor)
553 {
554   HTAnchor *anc;
555   HTList *sources = anchor->sources;
556 
557   while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL)
558     {
559       HTParentAnchor *panchor = HTAnchor_parent(anc);
560       return panchor;
561     }
562   return NULL;
563 }
564 
565 PRIVATE HTLink *
566 HTLink_find_type(HTAnchor * src, HTAnchor * dest, char *linktype)
567 {
568     if(src && dest && linktype)
569     {
570 	HTLink * link = HTAnchor_mainLink(src);
571 	HTList * sublinks = HTAnchor_subLinks(src);
572 	HTLinkType type = (HTLinkType)HTAtom_caseFor(linktype);
573 	HTAnchor *sdest = HTLink_destination(link);
574 	if (link && sdest == dest && type == HTLink_type(link))
575 	    return link;
576 	else if (sublinks) {
577 	    while ((link = (HTLink *) HTList_nextObject (sublinks))) {
578 		sdest = HTLink_destination(link);
579 		if (sdest == dest && HTLink_type(link) == type)
580 		    return link;
581 
582 	    }
583 	}
584     }
585     return NULL;
586 }
587 
588 PRIVATE void
589 update_incoming_links(HTParentAnchor *anchor, HTParentAnchor *nanchor)
590 {
591     if(anchor && nanchor) {
592 	HTAnchor *anc;
593 	HTList *sources = anchor->sources;
594 	while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) {
595 	    HTParentAnchor *panchor = HTAnchor_parent(anc);
596 	    if((HTLink_find((HTAnchor *)panchor,(HTAnchor *)anchor)) &&
597 	       (!HTLink_find_type((HTAnchor *)panchor,
598 				  (HTAnchor *)nanchor,"redirection"))) {
599 		HTLink_add((HTAnchor *)panchor,(HTAnchor *)nanchor,
600 			   (HTLinkType) HTAtom_caseFor("redirection"),
601 			    METHOD_HEAD);
602 	    }
603 	}
604     }
605 }
606 
607 PRIVATE void
608 update_hyperdoc(HyperDoc *hd,HTRequest *request)
609 {
610     if(hd && request) {
611 	HTParentAnchor *anchor = hd->anchor;
612 	HTParentAnchor *nanchor = HTRequest_anchor(request);
613 	HTParentAnchor *parent = HTRequest_parent(request);
614 	HyperDoc *nhd = HTAnchor_document(nanchor);
615 
616 	char *tit = (char *) HTAnchor_title(nanchor);
617 
618 	if(nhd && tit)
619 	    StrAllocCopy(nhd->title,tit);
620 
621 	if (anchor != nanchor) {
622 	    if(nhd) { 	    /* The redirected anchor has a Hyperdoc */
623 		if(nhd != hd) {
624 		    hd->code = REDIR_CODE;
625 
626 		    HTAnchor_setDocument(anchor,(void *)nhd);
627 
628 		    if(!HTLink_find_type((HTAnchor *)parent,
629 					 (HTAnchor *)nanchor,"redirection")) {
630 			HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor,
631 				   (HTLinkType) HTAtom_caseFor("redirection"),
632 				   METHOD_HEAD);
633 		    }
634 		}
635 	    } else { /* The redirected anchor does not have a Hyperdoc */
636 		hd->anchor = nanchor;
637 		HTAnchor_setDocument(nanchor,(void *) hd);
638 
639 		if(!HTLink_find_type((HTAnchor *)parent,(HTAnchor *)nanchor,
640 				     "redirection")) {
641 		    HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor,
642 			      (HTLinkType) HTAtom_caseFor("redirection") ,
643 			       METHOD_HEAD);
644 		}
645 	    }
646 	    update_incoming_links(anchor,nanchor);
647 	}
648     }
649 }
650 
651 PRIVATE void
652 set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
653 {
654     HTList * cur = HTRequest_error(request);
655     HTError *pres;
656     Finger * finger = (Finger *) HTRequest_context(request);
657     Robot * mr = finger->robot;
658 
659     while((pres = (HTError *) HTList_nextObject(cur)) != NULL) {
660 	int code =HTErrors[HTError_index(pres)].code;
661 
662 	hd->code = code;
663 
664 	if((mr->flags & MR_REDIR) && code >= 200 && code < 300 )
665 	    update_hyperdoc(hd,request);
666     }
667 }
668 
669 #if 0
670 PRIVATE int
671 test_for_blank_spaces(char *uri)
672 {
673   char *ptr = uri;
674   for(;*ptr!='\0';ptr++)
675     if(*ptr == ' ')
676       return 1;
677   return 0;
678 }
679 #endif
680 
681 /*	Create a Command Line Object
682 **	----------------------------
683 */
684 PUBLIC Robot * Robot_new (void)
685 {
686     Robot * me;
687     if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
688 	HT_OUTOFMEM("Robot_new");
689     me->hyperdoc = HTList_new();
690     me->htext = HTList_new();
691     me->timer = DEFAULT_TIMEOUT*MILLIES;
692     me->waits = 0;
693     me->cwd = HTGetCurrentDirectoryURL();
694     me->output = OUTPUT;
695     me->cnt = 0;
696     me->ndoc = -1;
697     me->fingers = HTList_new();
698 
699    /* This is new */
700     me->queue = HTQueue_new();
701     me->cq = 0;
702     me->furl = NULL;
703 
704     return me;
705 }
706 
707 /*	Delete a Command Line Object
708 **	----------------------------
709 */
710 PRIVATE BOOL Robot_delete (Robot * mr)
711 {
712     if (mr) {
713 	HTList_delete(mr->fingers);
714 
715        	/* Calculate statistics */
716 	calculate_statistics(mr);
717 
718         if (mr->hyperdoc) {
719 	    HTList * cur = mr->hyperdoc;
720 	    HyperDoc * pres;
721 	    while ((pres = (HyperDoc *) HTList_nextObject(cur)))
722 		HyperDoc_delete(pres);
723 	    HTList_delete(mr->hyperdoc);
724 	}
725 	if (mr->htext) {
726 	    HTList * cur = mr->htext;
727 	    HText * pres;
728 	    while ((pres = (HText *) HTList_nextObject(cur)))
729 		RHText_delete(pres);
730 	    HTList_delete(mr->htext);
731 	}
732 
733 	/* Close all the log files */
734 	if (mr->flags & MR_LOGGING) {
735 	    if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n");
736 	}
737 
738 	if (mr->log) {
739 	    if (SHOW_REAL_QUIET(mr))
740 		HTPrint("\tLogged %5d entries in general log file `%s\'\n",
741 			HTLog_accessCount(mr->log), mr->logfile);
742 	    HTLog_close(mr->log);
743 	}
744 	if (mr->ref) {
745 	    if (SHOW_REAL_QUIET(mr))
746 		HTPrint("\tLogged %5d entries in referer log file `%s\'\n",
747 			HTLog_accessCount(mr->ref), mr->reffile);
748 	    HTLog_close(mr->ref);
749 	}
750 	if (mr->reject) {
751 	    if (SHOW_REAL_QUIET(mr))
752 		HTPrint("\tLogged %5d entries in rejected log file `%s\'\n",
753 			HTLog_accessCount(mr->reject), mr->rejectfile);
754 	    HTLog_close(mr->reject);
755 	}
756 	if (mr->notfound) {
757 	    if (SHOW_REAL_QUIET(mr))
758 		HTPrint("\tLogged %5d entries in not found log file `%s\'\n",
759 			HTLog_accessCount(mr->notfound), mr->notfoundfile);
760 	    HTLog_close(mr->notfound);
761 	}
762 	if (mr->conneg) {
763 	    if (SHOW_REAL_QUIET(mr))
764 		HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n",
765 			HTLog_accessCount(mr->conneg), mr->connegfile);
766 	    HTLog_close(mr->conneg);
767 	}
768 	if (mr->noalttag) {
769 	    if (SHOW_REAL_QUIET(mr))
770 		HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n",
771 			HTLog_accessCount(mr->noalttag), mr->noalttagfile);
772 	    HTLog_close(mr->noalttag);
773 	}
774 
775 	if (mr->output && mr->output != STDOUT) fclose(mr->output);
776 
777 	if (mr->flags & MR_TIME) {
778 	    time_t local = time(NULL);
779 	    if (SHOW_REAL_QUIET(mr))
780 		HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
781 	}
782 
783 	/* This is new */
784 	HT_FREE(mr->cdepth);
785 	HT_FREE(mr->furl);
786 
787 #ifdef HT_POSIX_REGEX
788 	if (mr->include) {
789 	    regfree(mr->include);
790 	    HT_FREE(mr->include);
791 	}
792 	if (mr->exclude) {
793 	    regfree(mr->exclude);
794 	    HT_FREE(mr->exclude);
795 	}
796 	if (mr->exc_robot) {
797 	    regfree(mr->exc_robot);
798 	    HT_FREE(mr->exc_robot);
799 	}
800 	if (mr->check) {
801 	    regfree(mr->check);
802 	    HT_FREE(mr->check);
803 	}
804 #endif
805 
806 #ifdef HT_MYSQL
807 	if (mr->sqllog) {
808 	    HTSQLLog_close(mr->sqllog);
809 	    mr->sqllog = NULL;
810 	}
811 #endif
812 
813 	if (mr->queue) HTQueue_delete(mr->queue);
814 	HT_FREE(mr->cwd);
815 	HT_FREE(mr->prefix);
816 	HT_FREE(mr->img_prefix);
817 	HT_FREE(mr);
818 	return YES;
819     }
820     return NO;
821 }
822 
823 /*
824 **  This function creates a new finger object and initializes it with a new request
825 */
826 PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
827 {
828     Finger * me;
829     HTRequest * request = HTRequest_new();
830     if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
831 	HT_OUTOFMEM("Finger_new");
832     me->robot = robot;
833     me->request = request;
834     me->dest = dest;
835     HTList_addObject(robot->fingers, (void *)me);
836 
837     /* Set the context for this request */
838     HTRequest_setContext (request, me);
839 
840     /* Check the various flags to customize the request */
841     if (robot->flags & MR_PREEMPTIVE)
842 	HTRequest_setPreemptive(request, YES);
843     if (robot->flags & MR_VALIDATE)
844 	HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
845     if (robot->flags & MR_END_VALIDATE)
846 	HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
847 
848     /* We wanna make sure that we are sending a Host header (default) */
849     HTRequest_addRqHd(request, HT_C_HOST);
850 
851     /* Set the method for this request */
852     HTRequest_setMethod(request, method);
853     robot->cnt++;
854     return me;
855 }
856 
857 PRIVATE int Finger_delete (Finger * me)
858 {
859     HTList_removeObject(me->robot->fingers, (void *)me);
860 
861     /* Done with one more */
862     me->robot->cnt--;
863 
864     /* See if we don't need to keep all the metadata around in the anchors */
865     if (!(me->robot->flags & MR_KEEP_META))
866 	HTAnchor_clearHeader(HTRequest_anchor(me->request));
867 
868     /*
869     **  If we are down at one request then flush the output buffer
870     */
871     if (me->request) {
872 	if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
873 	HTRequest_delete(me->request);
874 	me->request = NULL;
875     }
876 
877     /*
878     **  Delete the request and free myself
879     */
880     HT_FREE(me);
881     return YES;
882 }
883 
884 PRIVATE BOOL check_constraints(Robot * mr, char *prefix, char *uri)
885 {
886     BOOL match = YES;
887     /* Check for prefix match */
888     if (prefix) {
889 	match = HTStrMatch(prefix, uri) ? YES : NO;
890     }
891 
892 #ifdef HT_POSIX_REGEX
893     /* Check for any regular expression */
894     if (match && mr->include) {
895 	match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
896     }
897     if (match && mr->exc_robot) {
898 	match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
899     }
900     if (match && mr->exclude) {
901 	match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
902     }
903 
904 #endif
905     return match;
906 }
907 
908 /*
909 **  Cleanup and make sure we close all connections including the persistent
910 **  ones
911 */
912 PUBLIC void Cleanup (Robot * me, int status)
913 {
914     /*
915     **  First we clean up the robot itself and calculate the various
916     **  statistics. This can actually take some time as a lot of data
917     **  has to be manipulated
918     */
919     Robot_delete(me);
920 
921     /*
922     **  Then we shut down libwww
923     */
924     HTProfile_delete();
925 
926 #ifdef HT_MEMLOG
927     HTMemLog_close();
928 #endif
929 
930 #ifdef VMS
931     exit(status ? status : 1);
932 #else
933     exit(status ? status : 0);
934 #endif
935 }
936 
937 #ifdef HT_POSIX_REGEX
938 PRIVATE char * get_regerror (int errcode, regex_t * compiled)
939 {
940     size_t length = regerror (errcode, compiled, NULL, 0);
941     char * str = NULL;
942     if ((str = (char *) HT_MALLOC(length+1)) == NULL)
943 	HT_OUTOFMEM("get_regerror");
944     (void) regerror (errcode, compiled, str, length);
945     return str;
946 }
947 
948 PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
949 {
950     regex_t * regex = NULL;
951     if (regex_str && *regex_str) {
952 	int status;
953 	if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
954 	    HT_OUTOFMEM("get_regtype");
955 	if ((status = regcomp(regex, regex_str, cflags))) {
956 	    char * err_msg = get_regerror(status, regex);
957 	    if (SHOW_REAL_QUIET(mr))
958 		HTPrint("Regular expression error: %s\n", err_msg);
959 	    HT_FREE(err_msg);
960 	    Cleanup(mr, -1);
961 	}
962     }
963     return regex;
964 }
965 #endif
966 
/*
**  Print the version banner: the Webbot version, the libwww version it
**  uses, and pointers to the help, user information and feedback channels.
*/
PUBLIC void VersionInfo (void)
{
    HTPrint("\nW3C OpenSource Software");
    HTPrint("\n-----------------------\n\n");
    HTPrint("\tWebbot version %s\n", APP_VERSION);
    HTPrint("\tusing the W3C libwww library version %s.\n\n",HTLib_version());
    HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE);
    HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
    HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
    HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
    HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
}
979 
980 /*	redirection_handler
981 **	-------------------
982 **	If we are set up to handle redirections then handle it here.
983 */
984 PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,
985 				void * param, int status)
986 {
987     Finger * finger = (Finger *) HTRequest_context(request);
988     Robot * mr = finger->robot;
989     HTParentAnchor * me = HTRequest_anchor(request);
990     HTAnchor * redirection = HTResponse_redirection(response);
991     HTParentAnchor * redirection_parent = HTAnchor_parent(redirection);
992     HyperDoc * redirection_hd = HTAnchor_document(redirection_parent);
993     char * uri = NULL;
994     char * redirection_parent_addr = NULL;
995     BOOL match = YES;
996     BOOL check = NO;
997 
998     /* In case we didn't get any redirection destination */
999     if (!redirection) return HT_OK;
1000 
1001     /* Get the addresses */
1002     uri = HTAnchor_address((HTAnchor *) me);
1003     redirection_parent_addr = HTAnchor_address((HTAnchor *) redirection_parent);
1004     if (SHOW_QUIET(mr))
1005 	HTPrint("Robot....... Checking redirecting from `%s\' to `%s\'\n",
1006 		uri, redirection_parent_addr);
1007 
1008     /* Log the event */
1009 #ifdef HT_MYSQL
1010     if (mr->sqllog && redirection_parent_addr)
1011 	HTSQLLog_addLinkRelationship(mr->sqllog, redirection_parent_addr,
1012 				     uri, "redirection", NULL);
1013 #endif
1014 
1015     /* Check our constraints matcher */
1016     match = check_constraints(mr,mr->prefix, redirection_parent_addr);
1017 
1018 #ifdef HT_POSIX_REGEX
1019     /* See if we should do a HEAD or a GET on this URI */
1020     if (match && mr->check) {
1021 	check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1022     }
1023 #endif
1024 
1025     /*
1026     ** If we already have a HyperDoc for the redirected anchor
1027     ** then update it
1028     */
1029     if (match) {
1030 	if ((redirection_hd = HTAnchor_document(redirection_parent)) != NULL) {
1031 	    if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1032 	    redirection_hd->hits++;
1033 	    HT_FREE(redirection_parent_addr);
1034 	    HT_FREE(uri);
1035 	    return HT_OK;
1036 	}
1037 
1038 	/* Now call the default libwww handler for actually carrying it out */
1039 	if (mr->redir_code==0 || mr->redir_code==status) {
1040 	    HyperDoc * me_hd = HTAnchor_document(me);
1041 	    HyperDoc *nhd = HyperDoc_new(mr, redirection_parent, me_hd->depth);
1042 
1043 	    if(mr->flags & MR_BFS) {
1044 		nhd->method = METHOD_HEAD;
1045 		HTQueue_append(mr->queue, (void *) nhd);
1046 		(mr->cq)++;
1047 	    }
1048 
1049 	    if (check) {
1050 		if (SHOW_QUIET(mr)) HTPrint("Checking redirection using HEAD\n");
1051 		HTRequest_setMethod(request, METHOD_HEAD);
1052 	    }
1053 	    HT_FREE(redirection_parent_addr);
1054 	    HT_FREE(uri);
1055 	    return HTRedirectFilter(request, response, param, status);
1056 	}
1057     } else {
1058 	if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1059 #ifdef HT_MYSQL
1060 	if (mr->reject || mr->sqllog)
1061 #else
1062 	if (mr->reject)
1063 #endif
1064 	{
1065 	    if (mr->reject && redirection_parent_addr)
1066 		HTLog_addText(mr->reject, "%s --> %s\n", redirection_parent_addr, uri);
1067 	}
1068     }
1069 
1070     /* Just fall through */
1071     HT_FREE(redirection_parent_addr);
1072     HT_FREE(uri);
1073     return HT_OK;
1074 }
1075 
1076 /*	terminate_handler
1077 **	-----------------
1078 **	This function is registered to handle the result of the request.
1079 **	If no more requests are pending then terminate program
1080 */
1081 PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
1082 			       void * param, int status)
1083 {
1084     Finger * finger = (Finger *) HTRequest_context(request);
1085     Robot * mr = finger->robot;
1086     if (SHOW_QUIET(mr)) HTPrint("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1087 
1088 #ifdef HT_MYSQL
1089     if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1090 #endif
1091 
1092     /* Check if negotiated resource and whether we should log that*/
1093     if (mr->conneg) {
1094 	HTAssocList * cur = HTResponse_variant(response);
1095 	if (cur) {
1096 	    BOOL first = YES;
1097 	    HTChunk * buffer = HTChunk_new(128);
1098 	    char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1099 	    HTAssoc * pres;
1100 	    HTChunk_puts(buffer, uri);
1101 	    while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1102 		char * value = HTAssoc_value(pres);
1103 		if (first) {
1104 		    HTChunk_puts(buffer, "\t(");
1105 		    first = NO;
1106 		} else
1107 		    HTChunk_puts(buffer, ", ");
1108 
1109 		/* Output the name */
1110 		HTChunk_puts(buffer, HTAssoc_name(pres));
1111 
1112 		/* Only output the value if not empty string */
1113 		if (value && *value) {
1114 		    HTChunk_puts(buffer, "=");
1115 		    HTChunk_puts(buffer, value);
1116 		}
1117 	    }
1118 	    if (!first) HTChunk_puts(buffer, ")");
1119 	    HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1120 	    HTChunk_delete(buffer);
1121 	    HT_FREE(uri);
1122 	}
1123     }
1124 
1125     /* Count the amount of body data that we have read */
1126     if (HTRequest_method(request) == METHOD_GET) {
1127 	int length = HTAnchor_length(HTRequest_anchor(request));
1128 	if (length > 0) mr->get_bytes += length;
1129 	mr->get_docs++;
1130     } else if (HTRequest_method(request) == METHOD_HEAD) {
1131 	int length = HTAnchor_length(HTRequest_anchor(request));
1132 	if (length > 0) mr->head_bytes += length;
1133 	mr->head_docs++;
1134     } else {
1135 	mr->other_docs++;
1136     }
1137 
1138     if (!(mr->flags & MR_BFS)) {
1139 
1140 #if 0
1141         HyperDoc * hd = HTAnchor_document(finger->dest);
1142 	if (hd) set_error_state_hyperdoc(hd,request);
1143 #endif
1144 
1145 	/* Delete this thread */
1146 	Finger_delete(finger);
1147 
1148 	/* Should we stop? */
1149 	if (mr->cnt <= 0) {
1150 	    if (SHOW_QUIET(mr)) HTPrint("             Everything is finished...\n");
1151 	    Cleanup(mr, 0);			/* No way back from here */
1152 	}
1153     }
1154 
1155     if (SHOW_QUIET(mr)) HTPrint("             %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1156     return HT_OK;
1157 
1158 }
1159 
1160 PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response,
1161 				  void * param, int status)
1162 {
1163     Finger * finger = (Finger *) HTRequest_context(request);
1164     Robot * mr = finger->robot;
1165     HTParentAnchor * dest = finger->dest;
1166     HyperDoc * hd = HTAnchor_document(dest);
1167     int depth = (hd ? hd->depth : -1);
1168 
1169     if (hd) set_error_state_hyperdoc(hd,request);
1170 
1171     if(hd && (HTRequest_method(request)== METHOD_HEAD) &&
1172        (depth < mr->depth))
1173       {
1174 	hd->method = METHOD_GET;
1175 	HTQueue_append(mr->queue, (void *)hd); (mr->cq)++;
1176       }
1177 
1178     Finger_delete(finger);
1179 
1180     if(!(mr->flags & MR_PREEMPTIVE))
1181       Serving_queue(mr);
1182 
1183     return HT_OK;
1184 }
1185 
1186 PUBLIC void Serving_queue(Robot *mr)
1187 {
1188   BOOL abort = NO;
1189   Finger *nfinger;
1190 
1191   while(!abort)
1192     {
1193       if(!HTQueue_isEmpty(mr->queue))
1194 	{
1195 	  HTRequest *newreq;
1196 
1197 	  HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue);
1198 
1199 	  if(nhd)
1200 	    {
1201 	      char *uri = HTAnchor_address((HTAnchor *)nhd->anchor);
1202 	      HTQueue_dequeue(mr->queue); (mr->cq)--;
1203 
1204 	      nfinger = Finger_new(mr, nhd->anchor, nhd->method);
1205 
1206 	      newreq = nfinger->request;
1207 
1208 	      if(SHOW_QUIET(mr))  HTPrint("Request from QUEUE  %s\n",uri);
1209 	      HT_FREE(uri);
1210 	      if(SHOW_QUIET(mr)) HTPrint("%d elements in queue \n", mr->cq);
1211 
1212 	      HTRequest_setParent(newreq,get_last_parent(nhd->anchor));
1213 
1214 	      /* @@@ Should be done using a timer and not sleep! @@@ */
1215 #if 0
1216 	      if(mr->waits)
1217 		  sleep(mr->waits);
1218 #endif
1219 
1220 	      if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES)
1221 		{
1222 		  if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
1223 		  Finger_delete(nfinger);
1224 		}
1225 	    }
1226 	  else
1227 	    abort = YES;
1228 	}
1229       else
1230 	abort = YES;
1231     }
1232 
1233   if(SHOW_QUIET(mr)) HTPrint("Queue size: %d \n", mr->cq);
1234 
1235     if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE)))
1236       {
1237 	if(mr->cnt > 0)
1238 	  if(SHOW_QUIET(mr)) HTPrint("%d requests were not served\n", mr->cnt);
1239 
1240 	if (SHOW_QUIET(mr)) HTPrint("             Everything is finished...\n");
1241 	Cleanup(mr, 0);			/* No way back from here */
1242       }
1243 }
1244 
1245 /* ------------------------------------------------------------------------- */
1246 /*				HTEXT INTERFACE				     */
1247 /* ------------------------------------------------------------------------- */
1248 
/*	Robot_registerHTMLParser
**	------------------------
**	Hook the robot's HText callbacks into the HTML parser:
**	RHText_new / RHText_delete as the create and delete callbacks and
**	RHText_foundLink as the link callback. Always returns YES.
*/
PUBLIC BOOL Robot_registerHTMLParser (void)
{
    HText_registerCDCallback(RHText_new, RHText_delete);
    HText_registerLinkCallback(RHText_foundLink);
    return YES;
}
1255 
1256 PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,
1257 			    HTStream * stream)
1258 {
1259     HText * me;
1260     Finger * finger = (Finger *) HTRequest_context(request);
1261     Robot * mr = finger->robot;
1262     char * robots = NULL;
1263 
1264     if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1265 	HT_OUTOFMEM("RHText_new");
1266 
1267     /* Bind the HText object together with the Request Object */
1268     me->request = request;
1269     me->follow = YES;
1270 
1271     /* Check to see if we have any meta tags */
1272     if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) {
1273 	char * strval = NULL;
1274 	char * ptr = NULL;
1275 	char * token = NULL;
1276 	StrAllocCopy(strval, robots);
1277 	ptr = strval;
1278 	while ((token = HTNextField(&ptr)) != NULL) {
1279 	    if (!strcasecomp(token, "nofollow")) {
1280 		me->follow = NO;
1281 		break;
1282 	    }
1283 	}
1284 	HT_FREE(strval);
1285     }
1286 
1287     /* Add this HyperDoc object to our list */
1288     if (!mr->htext) mr->htext = HTList_new();
1289     HTList_addObject(mr->htext, (void *) me);
1290     return me;
1291 }
1292 
1293 PRIVATE BOOL RHText_delete (HText * me) {
1294     if (me) {
1295 	HT_FREE(me);
1296 	return YES;
1297     }
1298     return NO;
1299 }
1300 
1301 PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor)
1302 {
1303     if (text && anchor) {
1304 	Finger * finger = (Finger *) HTRequest_context(text->request);
1305 	Robot * mr = finger->robot;
1306 	HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1307 	HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1308 	char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1309 	HyperDoc * hd = HTAnchor_document(dest_parent);
1310 	HTParentAnchor * referer = HTRequest_anchor(text->request);
1311 	BOOL match = text->follow;
1312 	BOOL check = NO;
1313 
1314 	/* These are new variables */
1315 	HyperDoc * nhd = NULL;
1316 	BOOL follow = YES;
1317 
1318 	/* These three variables were moved */
1319 	/*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
1320 	HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
1321 	HyperDoc * last_doc = HTAnchor_document(last_anchor);
1322 	int depth = last_doc ? last_doc->depth+1 : 0;
1323 
1324 	if (!uri) return;
1325 	if (SHOW_QUIET(mr)) HTPrint("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");
1326 
1327         if (hd) {
1328 	    if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1329             hd->hits++;
1330 #ifdef HT_MYSQL
1331 	    if (mr->sqllog) {
1332 		char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1333 		if (ref_addr) {
1334 		    HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1335 						 "referer", NULL);
1336 		    HT_FREE(ref_addr);
1337 		}
1338 	    }
1339 #endif
1340 	    HT_FREE(uri);
1341 	    return;
1342 	}
1343 
1344 	/* Check our constraints matcher */
1345 	match = check_constraints(mr,mr->prefix, uri);
1346 
1347 #ifdef HT_POSIX_REGEX
1348 	/* See if we should do a HEAD or a GET on this URI */
1349         if (match && mr->check) {
1350             check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1351 	}
1352 #endif
1353 
1354 #if 0
1355 	/* This is already checked in HTParse.c */
1356 	if(uri && test_for_blank_spaces(uri))
1357 	  follow = NO;
1358 	else
1359 #endif
1360 	if (mr->ndoc == 0) /* Number of Documents is reached */
1361 	  follow = NO;
1362 
1363 	/* Test whether we already have a hyperdoc for this document */
1364 	if (!hd && dest_parent) {
1365 	    nhd = HyperDoc_new(mr, dest_parent, depth);
1366 	    mr->cdepth[depth]++;
1367 	}
1368 
1369 	/* Test whether we already have a hyperdoc for this document */
1370         if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
1371 	    if (mr->flags & MR_BFS) {
1372 		nhd->method = METHOD_HEAD;
1373 		HTQueue_enqueue(mr->queue, (void *) nhd);
1374 		(mr->cq)++;
1375 		if(mr->ndoc > 0) mr->ndoc--;
1376 	    } else {
1377 		Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1378 		HTRequest * newreq = newfinger->request;
1379 		HTRequest_setParent(newreq, referer);
1380 		nhd->method = METHOD_GET;
1381 
1382 		if (check || depth >= mr->depth) {
1383 		    if (SHOW_QUIET(mr)) HTPrint("loading at depth %d using HEAD\n", depth);
1384 		    HTRequest_setMethod(newreq, METHOD_HEAD);
1385 		    nhd->method = METHOD_HEAD;
1386 
1387 		} else {
1388 		    if (SHOW_QUIET(mr)) HTPrint("loading at depth %d\n", depth);
1389 		}
1390 		if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1391 		    if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
1392 		    Finger_delete(newfinger);
1393 		}
1394 	    }
1395 
1396 	} else {
1397 	    if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1398 #ifdef HT_MYSQL
1399 	    if (mr->reject || mr->sqllog) {
1400 #else
1401 	    if (mr->reject) {
1402 #endif
1403 		if (referer) {
1404 		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1405 		    if (mr->reject && ref_addr)
1406 			HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1407 #ifdef HT_MYSQL
1408 		    if (mr->sqllog && mr->sqlexternals && ref_addr)
1409 			HTSQLLog_addLinkRelationship(mr->sqllog,
1410 						     ref_addr, uri,
1411 						     "referer", NULL);
1412 #endif
1413 
1414 		    HT_FREE(ref_addr);
1415 		}
1416 	    }
1417 	}
1418 	HT_FREE(uri);
1419     }
1420 }
1421 
1422 PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,
1423 				const char *alt, const char * align, BOOL isMap)
1424 {
1425     if (text && anchor) {
1426 	Finger * finger = (Finger *) HTRequest_context(text->request);
1427 	Robot * mr = finger->robot;
1428 
1429 	if (mr->flags & MR_IMG) {
1430 	    HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1431 	    HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1432 	    char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1433 	    HyperDoc * hd = HTAnchor_document(dest_parent);
1434 	    HTParentAnchor * referer = HTRequest_anchor(text->request);
1435 	    BOOL match = YES;
1436 
1437 	    if (!uri) return;
1438 	    if (hd) {
1439 		if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1440 		hd->hits++;
1441 #ifdef HT_MYSQL
1442 		if (mr->sqllog) {
1443 		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1444 		    if (ref_addr) {
1445 			HTSQLLog_addLinkRelationship(mr->sqllog,
1446 						     ref_addr, uri,
1447 						     "image", alt);
1448 			HT_FREE(ref_addr);
1449 		    }
1450 		}
1451 #endif
1452 		HT_FREE(uri);
1453 		return;
1454 	    }
1455 
1456 	    /* Check our constraints matcher */
1457 	    match = check_constraints(mr, mr->img_prefix, uri);
1458 
1459 	    /* Test whether we already have a hyperdoc for this document */
1460 	    if (match && dest) {
1461 		Finger * newfinger = Finger_new(mr, dest_parent,
1462 						mr->flags & MR_SAVE ?
1463 						METHOD_GET : METHOD_HEAD);
1464 		HTRequest * newreq = newfinger->request;
1465 		HyperDoc_new(mr, dest_parent, 1);
1466 		HTRequest_setParent(newreq, referer);
1467 
1468 		/* Check whether we should report missing ALT tags */
1469 		if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1470 		    if (referer) {
1471 			char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1472 			if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1473 			HT_FREE(ref_addr);
1474 		    }
1475 		}
1476 
1477 		if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking Image `%s\'\n", uri);
1478 		if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1479 		    if (SHOW_QUIET(mr)) HTPrint("Robot....... Image not tested!\n");
1480 		    Finger_delete(newfinger);
1481 		}
1482 	    } else {
1483 		if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1484 #ifdef HT_MYSQL
1485 		if (mr->reject || mr->sqllog) {
1486 #else
1487 		if (mr->reject) {
1488 #endif
1489 		    if (referer) {
1490 			char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1491 			if (mr->reject && ref_addr)
1492 			    HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1493 #ifdef HT_MYSQL
1494 			if (mr->sqllog && mr->sqlexternals && ref_addr)
1495 			    HTSQLLog_addLinkRelationship(mr->sqllog,
1496 							 ref_addr, uri,
1497 							 "image", alt);
1498 #endif
1499 
1500 			HT_FREE(ref_addr);
1501 		    }
1502 		}
1503 	    }
1504 	    HT_FREE(uri);
1505 	}
1506     }
1507 }
1508 
1509 PRIVATE void RHText_foundLink (HText * text,
1510 			       int element_number, int attribute_number,
1511 			       HTChildAnchor * anchor,
1512 			       const BOOL * present, const char ** value)
1513 {
1514     if (text && anchor) {
1515 	Finger * finger = (Finger *) HTRequest_context(text->request);
1516 	Robot * mr = finger->robot;
1517 	if (SHOW_QUIET(mr))
1518 	    HTPrint("Robot....... Received element %d, attribute %d with anchor %p\n",
1519 		    element_number, attribute_number, anchor);
1520 	if ((element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) ||
1521 	    (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND) ||
1522 	    (element_number==HTML_INPUT && attribute_number==HTML_INPUT_SRC))
1523 	    RHText_foundImage(text, anchor, NULL, NULL, NO);
1524 	else
1525 	    RHText_foundAnchor(text, anchor);
1526     }
1527 }
1528 
1529 PUBLIC char * get_robots_txt(char * uri)
1530 {
1531     char *str = NULL;
1532     HTChunk * chunk;
1533     HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
1534     HTRequest *request = HTRequest_new();
1535     HTRequest_setOutputFormat(request, WWW_SOURCE);
1536     HTRequest_setPreemptive(request, YES);
1537     HTRequest_setMethod(request, METHOD_GET);
1538     chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
1539     str = HTChunk_toCString(chunk);
1540     HTRequest_delete(request);
1541     return str;
1542 }
1543 
1544 
1545