1 /*
2 
3   					Webbot - the W3C Mini Robot
4 
5 
6 !
7   Webbot - the W3C Mini Robot
8 !
9 */
10 
11 /*
12 **	(c) COPYRIGHT MIT 1995.
13 **	Please first read the full copyright statement in the file COPYRIGH.
14 */
15 
16 /*
17 
18 This program illustrates how to traverse links using the Anchor object
19 */
20 
21 #ifndef HTROBMAN_H
22 #define HTROBMAN_H
23 
24 #include "WWWLib.h"			      /* Global Library Include file */
25 #include "WWWApp.h"				        /* Application stuff */
26 #include "WWWTrans.h"
27 #include "WWWInit.h"
28 #include "WWWSQL.h"
29 
30 #ifdef HT_SSL
31 #include "WWWSSL.h"
32 #endif /* HT_SSL */
33 
34 #include "HText.h"
35 #include "HTRobot.h"			     		 /* Implemented here */
36 
/* Use a placeholder version string when the build does not supply one */
#if !defined(W3C_VERSION)
#define W3C_VERSION     "Unspecified"
#endif

/* Application identity; the version simply mirrors W3C_VERSION */
#define APP_NAME        "W3CRobot"
#define APP_VERSION     W3C_VERSION

/* Well-known locations (COMMAND_LINE is presumably the usage page - confirm) */
#define COMMAND_LINE    "http://www.w3.org/Robot/User/CommandLine"
#define ROBOTS_TXT      "/robots.txt"
45 
/* Default output and rule files */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"

/* Default names of the individual log files */
#define DEFAULT_LOG_FILE        "log-clf.txt"
#define DEFAULT_HIT_FILE        "log-hit.txt"
#define DEFAULT_REL_FILE        "log-rel.txt"
#define DEFAULT_LM_FILE         "log-lastmodified.txt"
#define DEFAULT_TITLE_FILE      "log-title.txt"
#define DEFAULT_REFERER_FILE    "log-referer.txt"
#define DEFAULT_REJECT_FILE     "log-reject.txt"
#define DEFAULT_NOTFOUND_FILE   "log-notfound.txt"
#define DEFAULT_CONNEG_FILE     "log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE   "log-alt.txt"
#define DEFAULT_FORMAT_FILE     "log-format.txt"
#define DEFAULT_CHARSET_FILE    "log-charset.txt"
#define DEFAULT_MEMLOG          "robot.mem"

/* Default prefixes are empty strings */
#define DEFAULT_PREFIX          ""
#define DEFAULT_IMG_PREFIX      ""

#define DEFAULT_DEPTH           0
#define DEFAULT_DELAY           50      /* Write delay in ms */
65 
#define DEFAULT_CACHE_SIZE      20      /* Default cache size */

/* Defaults for the SQL log connection */
#define DEFAULT_SQL_SERVER      "localhost"
#define DEFAULT_SQL_DB          "webbot"
#define DEFAULT_SQL_USER        "webbot"
#define DEFAULT_SQL_PW          ""

/* SSL defaults exist only in builds compiled with HT_SSL */
#if defined(HT_SSL)
#define DEFAULT_SSL_PROT        HTSSL_V23
#define DEFAULT_SSL_VDEPTH      2
#define DEFAULT_SSL_CFILE       ""
#define DEFAULT_SSL_KFILE       ""
#endif /* HT_SSL */
79 
/*
**  HT_MEMLOG is deliberately left undefined here: it is expensive in
**  performance. Change the "#if 0" below to turn it on.
*/
#if 0
#define HT_MEMLOG
#endif

#define MILLIES                 1000
#define DEFAULT_TIMEOUT         20      /* timeout in secs */
86 
/*
**  Option flags for the robot. Every constant occupies its own bit, so
**  values can be OR'ed together into a single MRFlags word.
*/
typedef enum _MRFlags {
    MR_IMG           = 1 << 0,
    MR_LINK          = 1 << 1,
    MR_PREEMPTIVE    = 1 << 2,
    MR_TIME          = 1 << 3,
    MR_SAVE          = 1 << 4,
    MR_QUIET         = 1 << 5,
    MR_REAL_QUIET    = 1 << 6,
    MR_VALIDATE      = 1 << 7,
    MR_END_VALIDATE  = 1 << 8,
    MR_KEEP_META     = 1 << 9,
    MR_LOGGING       = 1 << 10,
    MR_DISTRIBUTIONS = 1 << 11,
    MR_NOROBOTSTXT   = 1 << 12,
    MR_NOMETATAGS    = 1 << 13,
    MR_BFS           = 1 << 14,
    MR_REDIR         = 1 << 15
} MRFlags;
105 
106 typedef struct _Robot {
107     int			depth;			     /* How deep is our tree */
108     int                 ndoc;
109     int                *cdepth;                /* Number of nodes per level */
110     int			cnt;				/* Count of requests */
111     int                 cindex;         /* Number assigned to each document */
112 
113     HTList *		hyperdoc;	     /* List of our HyperDoc Objects */
114     HTList *		htext;			/* List of our HText Objects */
115     HTList *		fingers;
116 
117     HTList *            queue;                  /* Queue */
118     int                 cq;
119 
120     int 		timer;
121     int 		waits;
122 
123     char *		cwd;			/* Current dir URL */
124     char *		rules;
125     char *		prefix;
126     char *		img_prefix;
127 
128     char *		logfile;		/* clf log */
129     HTLog *             log;
130     char *		reffile;		/* referer log */
131     HTLog *             ref;
132     char *		rejectfile;		/* unchecked links */
133     HTLog *	        reject;
134     char *		notfoundfile;		/* links that returned 404 */
135     HTLog *	        notfound;
136     char *		connegfile;		/* links that were conneg'ed */
137     HTLog *	        conneg;
138     char *		noalttagfile;		/* images without alt tags*/
139     HTLog *	        noalttag;
140 
141 
142     char *		hitfile;		/* links sorted after hit counts */
143     char *		relfile;		/* link sorted after relationships */
144     HTLinkType		relation;		/* Specific relation to look for */
145     char *		titlefile;		/* links with titles */
146     char *		mtfile;			/* media types encountered */
147     char *		charsetfile;		/* charsets encountered */
148     char *		lmfile;			/* sortef after last modified dates */
149 
150     char *		outputfile;
151     FILE *	        output;
152 
153     char *              furl;                              /* First url */
154 
155     MRFlags		flags;
156 
157     int                 redir_code;     /* 0 means all, otherwise 301, 302, 305... */
158 
159     long		get_bytes;	/* Total number of bytes processed using GET*/
160     long                get_docs;     	/* Total number of documents using GET */
161 
162     long		head_bytes;	/* bytes processed bytes processed using HEAD */
163     long                head_docs;   	/* Total number of documents using HEAD*/
164 
165     long		other_docs;
166 
167     ms_t		time;		/* Time of run */
168 
169 #ifdef HT_POSIX_REGEX
170     regex_t *		include;
171     regex_t *		exclude;
172     regex_t *		check;
173     regex_t *		exc_robot;     /* Robots.txt exclusion */
174 #endif
175 
176 #ifdef HT_MYSQL
177     HTSQLLog *		sqllog;
178     char *		sqlserver;
179     char *		sqldb;
180     char *		sqluser;
181     char *		sqlpw;
182     char *		sqlrelative;
183     BOOL		sqlexternals;
184     int			sqlflags;
185 #endif
186 
187 #ifdef HT_SSL
188     HTSSL_PROTOCOL      sslprot;
189     int                 sslverifydepth;
190     char *              sslcertfile;
191     char *              sslkeyfile;
192 #endif
193 
194 } Robot;
195 
196 typedef struct _Finger {
197     Robot * robot;
198     HTRequest * request;
199     HTParentAnchor * dest;
200 } Finger;
201 
202 /*
203 **  The HyperDoc object is bound to the anchor and contains information about
204 **  where we are in the search for recursive searches
205 */
206 
/*
**  Special negative code values (presumably stored in HyperDoc.code -
**  confirm against the implementation). The replacement lists are
**  parenthesized so the unary minus cannot bind to a neighbouring
**  operator when the macro is expanded inside a larger expression.
*/
#define NO_CODE         (-1)
#define REDIR_CODE      (-2)
209 
210 typedef struct _HyperDoc {
211     HTParentAnchor * 	anchor;
212     int			depth;
213     int                 hits;
214     int                 code;
215     int                 index;
216     char *              title;
217     HTMethod            method;
218 } HyperDoc;
219 
220 /*
221 ** This is the HText object that is created every time we start parsing an
222 ** HTML object
223 */
224 struct _HText {
225     HTRequest *		request;
226     BOOL		follow;
227 };
228 
229 /*
230 **  A structure for calculating metadata distributions
231 */
232 typedef struct _MetaDist {
233     HTAtom *		name;
234     int			hits;
235 } MetaDist;
236 
/*
**  Regular expression helper, declared only when POSIX regex support is
**  compiled in. NOTE(review): presumably compiles regex_str with cflags
**  and returns the compiled expression - confirm in the implementation.
*/
#if defined(HT_POSIX_REGEX)
PUBLIC regex_t *get_regtype(Robot *mr, const char *regex_str, int cflags);
#endif /* HT_POSIX_REGEX */
240 
241 PUBLIC int OutputData(const char  * fmt, ...);
242 PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth);
243 PUBLIC BOOL HyperDoc_delete (HyperDoc * hd);
244 PUBLIC Robot * Robot_new (void);
245 PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method);
246 PUBLIC BOOL Robot_registerHTMLParser (void);
247 PUBLIC void Cleanup (Robot * me, int status);
248 PUBLIC void VersionInfo (void);
249 
250 PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
251 			       void * param, int status) ;
252 
253 PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response,
254 			          void * param, int status) ;
255 
256 PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,
257 			        void * param, int status) ;
258 
259 PUBLIC void Serving_queue(Robot *mr);
260 
261 PUBLIC char *get_robots_txt(char *uri);
262 
263 #endif
264 
265 /*
266 
267 
268 
269   @(#) $Id$
270 
271 */
272