1 /* 2 3 Webbot - the W3C Mini Robot 4 5 6 ! 7 Webbot - the W3C Mini Robot 8 ! 9 */ 10 11 /* 12 ** (c) COPRIGHT MIT 1995. 13 ** Please first read the full copyright statement in the file COPYRIGH. 14 */ 15 16 /* 17 18 This program illustrates how to travers links using the Anchor object 19 */ 20 21 #ifndef HTROBMAN_H 22 #define HTROBMAN_H 23 24 #include "WWWLib.h" /* Global Library Include file */ 25 #include "WWWApp.h" /* Application stuff */ 26 #include "WWWTrans.h" 27 #include "WWWInit.h" 28 #include "WWWSQL.h" 29 30 #ifdef HT_SSL 31 #include "WWWSSL.h" 32 #endif /* HT_SSL */ 33 34 #include "HText.h" 35 #include "HTRobot.h" /* Implemented here */ 36 37 #ifndef W3C_VERSION 38 #define W3C_VERSION "Unspecified" 39 #endif 40 41 #define APP_NAME "W3CRobot" 42 #define APP_VERSION W3C_VERSION 43 #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine" 44 #define ROBOTS_TXT "/robots.txt" 45 46 #define DEFAULT_OUTPUT_FILE "robot.out" 47 #define DEFAULT_RULE_FILE "robot.conf" 48 #define DEFAULT_LOG_FILE "log-clf.txt" 49 #define DEFAULT_HIT_FILE "log-hit.txt" 50 #define DEFAULT_REL_FILE "log-rel.txt" 51 #define DEFAULT_LM_FILE "log-lastmodified.txt" 52 #define DEFAULT_TITLE_FILE "log-title.txt" 53 #define DEFAULT_REFERER_FILE "log-referer.txt" 54 #define DEFAULT_REJECT_FILE "log-reject.txt" 55 #define DEFAULT_NOTFOUND_FILE "log-notfound.txt" 56 #define DEFAULT_CONNEG_FILE "log-conneg.txt" 57 #define DEFAULT_NOALTTAG_FILE "log-alt.txt" 58 #define DEFAULT_FORMAT_FILE "log-format.txt" 59 #define DEFAULT_CHARSET_FILE "log-charset.txt" 60 #define DEFAULT_MEMLOG "robot.mem" 61 #define DEFAULT_PREFIX "" 62 #define DEFAULT_IMG_PREFIX "" 63 #define DEFAULT_DEPTH 0 64 #define DEFAULT_DELAY 50 /* Write delay in ms */ 65 66 #define DEFAULT_CACHE_SIZE 20 /* Default cache size */ 67 68 #define DEFAULT_SQL_SERVER "localhost" 69 #define DEFAULT_SQL_DB "webbot" 70 #define DEFAULT_SQL_USER "webbot" 71 #define DEFAULT_SQL_PW "" 72 73 #ifdef HT_SSL 74 #define DEFAULT_SSL_PROT HTSSL_V23 75 #define DEFAULT_SSL_VDEPTH 2 76 #define DEFAULT_SSL_CFILE "" 77 #define DEFAULT_SSL_KFILE "" 78 #endif 79 80 #if 0 81 #define HT_MEMLOG /* Is expensive in performance! */ 82 #endif 83 84 #define MILLIES 1000 85 #define DEFAULT_TIMEOUT 20 /* timeout in secs */ 86 87 typedef enum _MRFlags { 88 MR_IMG = 0x1, 89 MR_LINK = 0x2, 90 MR_PREEMPTIVE = 0x4, 91 MR_TIME = 0x8, 92 MR_SAVE = 0x10, 93 MR_QUIET = 0x20, 94 MR_REAL_QUIET = 0x40, 95 MR_VALIDATE = 0x80, 96 MR_END_VALIDATE = 0x100, 97 MR_KEEP_META = 0x200, 98 MR_LOGGING = 0x400, 99 MR_DISTRIBUTIONS = 0x800, 100 MR_NOROBOTSTXT = 0x1000, 101 MR_NOMETATAGS = 0x2000, 102 MR_BFS = 0x4000, 103 MR_REDIR = 0x8000 104 } MRFlags; 105 106 typedef struct _Robot { 107 int depth; /* How deep is our tree */ 108 int ndoc; 109 int *cdepth; /* Number of nodes per level */ 110 int cnt; /* Count of requests */ 111 int cindex; /* Number assigned to each document */ 112 113 HTList * hyperdoc; /* List of our HyperDoc Objects */ 114 HTList * htext; /* List of our HText Objects */ 115 HTList * fingers; 116 117 HTList * queue; /* Queue */ 118 int cq; 119 120 int timer; 121 int waits; 122 123 char * cwd; /* Current dir URL */ 124 char * rules; 125 char * prefix; 126 char * img_prefix; 127 128 char * logfile; /* clf log */ 129 HTLog * log; 130 char * reffile; /* referer log */ 131 HTLog * ref; 132 char * rejectfile; /* unchecked links */ 133 HTLog * reject; 134 char * notfoundfile; /* links that returned 404 */ 135 HTLog * notfound; 136 char * connegfile; /* links that were conneg'ed */ 137 HTLog * conneg; 138 char * noalttagfile; /* images without alt tags*/ 139 HTLog * noalttag; 140 141 142 char * hitfile; /* links sorted after hit counts */ 143 char * relfile; /* link sorted after relationships */ 144 HTLinkType relation; /* Specific relation to look for */ 145 char * titlefile; /* links with titles */ 146 char * mtfile; /* media types encountered */ 147 char * charsetfile; /* charsets encountered */ 148 char * lmfile; /* sortef after last modified dates */ 149 150 char * outputfile; 151 FILE * output; 152 153 char * furl; /* First url */ 154 155 MRFlags flags; 156 157 int redir_code; /* 0 means all, otherwise 301, 302, 305... */ 158 159 long get_bytes; /* Total number of bytes processed using GET*/ 160 long get_docs; /* Total number of documents using GET */ 161 162 long head_bytes; /* bytes processed bytes processed using HEAD */ 163 long head_docs; /* Total number of documents using HEAD*/ 164 165 long other_docs; 166 167 ms_t time; /* Time of run */ 168 169 #ifdef HT_POSIX_REGEX 170 regex_t * include; 171 regex_t * exclude; 172 regex_t * check; 173 regex_t * exc_robot; /* Robots.txt exclusion */ 174 #endif 175 176 #ifdef HT_MYSQL 177 HTSQLLog * sqllog; 178 char * sqlserver; 179 char * sqldb; 180 char * sqluser; 181 char * sqlpw; 182 char * sqlrelative; 183 BOOL sqlexternals; 184 int sqlflags; 185 #endif 186 187 #ifdef HT_SSL 188 HTSSL_PROTOCOL sslprot; 189 int sslverifydepth; 190 char * sslcertfile; 191 char * sslkeyfile; 192 #endif 193 194 } Robot; 195 196 typedef struct _Finger { 197 Robot * robot; 198 HTRequest * request; 199 HTParentAnchor * dest; 200 } Finger; 201 202 /* 203 ** The HyperDoc object is bound to the anchor and contains information about 204 ** where we are in the search for recursive searches 205 */ 206 207 #define NO_CODE -1 208 #define REDIR_CODE -2 209 210 typedef struct _HyperDoc { 211 HTParentAnchor * anchor; 212 int depth; 213 int hits; 214 int code; 215 int index; 216 char * title; 217 HTMethod method; 218 } HyperDoc; 219 220 /* 221 ** This is the HText object that is created every time we start parsing an 222 ** HTML object 223 */ 224 struct _HText { 225 HTRequest * request; 226 BOOL follow; 227 }; 228 229 /* 230 ** A structure for calculating metadata distributions 231 */ 232 typedef struct _MetaDist { 233 HTAtom * name; 234 int hits; 235 } MetaDist; 236 237 #ifdef HT_POSIX_REGEX 238 PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags); 239 #endif 240 241 PUBLIC int OutputData(const char * fmt, ...); 242 PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth); 243 PUBLIC BOOL HyperDoc_delete (HyperDoc * hd); 244 PUBLIC Robot * Robot_new (void); 245 PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method); 246 PUBLIC BOOL Robot_registerHTMLParser (void); 247 PUBLIC void Cleanup (Robot * me, int status); 248 PUBLIC void VersionInfo (void); 249 250 PUBLIC int terminate_handler (HTRequest * request, HTResponse * response, 251 void * param, int status) ; 252 253 PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response, 254 void * param, int status) ; 255 256 PUBLIC int redirection_handler (HTRequest * request, HTResponse * response, 257 void * param, int status) ; 258 259 PUBLIC void Serving_queue(Robot *mr); 260 261 PUBLIC char *get_robots_txt(char *uri); 262 263 #endif 264 265 /* 266 267 268 269 @(#) $Id$ 270 271 */ 272