1 /***************************************************************************/ 2 /* This code is part of WWW grabber called pavuk */ 3 /* Copyright (c) 1997 - 2001 Stefan Ondrejicka */ 4 /* Distributed under GPL 2 or later */ 5 /***************************************************************************/ 6 7 #ifndef _doc_h_ 8 #define _doc_h_ 9 #include <sys/types.h> 10 #include <sys/time.h> 11 #include <time.h> 12 13 #include "url.h" 14 #include "tools.h" 15 #include "bufio.h" 16 #include "myssl_openssl.h" 17 #include "myssl_nss.h" 18 19 typedef struct 20 { 21 bool_t log; 22 char *msg; 23 } doc_msg; 24 25 typedef struct 26 { 27 /*******************************/ 28 /* Basic document infos */ 29 /*******************************/ 30 int doc_nr; /*** number of document in queue ***/ 31 url *doc_url; /*** URL of document ***/ 32 char *mime; /*** MIME header of document ***/ 33 char *type_str; /*** MIME type of document ***/ 34 bool_t is_parsable; /*** documemt dedicated for parsing ? (html,css,scripts) ***/ 35 36 ssize_t size; /*** document size readed ***/ 37 ssize_t totsz; /*** total size of document if known ***/ 38 ssize_t origsize; /*** original size of document ***/ 39 char *contents; /*** document content ***/ 40 ssize_t rest_pos; /*** restart position ***/ 41 ssize_t rest_end_pos; /*** restart end position ***/ 42 time_t dtime; /*** creation time of document ***/ 43 time_t stime; /*** time of document request ***/ 44 time_t origtime; /*** modification time of local copy of document */ 45 int errcode; /*** error code for current doc ***/ 46 47 /*******************************/ 48 /* temporary document infos */ 49 /*******************************/ 50 char *lock_fn; /*** lock file name for document ***/ 51 bufio *datasock; /*** socket for data connection ***/ 52 bufio *s_sock; /*** file where is going online saved data ***/ 53 54 /*******************************/ 55 /* downloading control flags */ 56 /*******************************/ 57 bool_t load; /*** load document into ->contents ***/ 58 bool_t doreget; /*** continue broken transfer ***/ 59 bool_t report_size; /*** show progressmeter while loading of document ***/ 60 bool_t check_limits; /*** check limits for adding URLs from this document ***/ 61 bool_t remove_lock; /*** remove lock file ***/ 62 bool_t save_online; /*** document should be saved online to file ***/ 63 bool_t is_robot; /*** redirection of robots.txt documents is handled differently ! ***/ 64 bool_t is_http_transfer; /*** is going current transfer over HTTP protocol ? ***/ 65 66 /*******************************/ 67 /* HTTP connection info */ 68 /*******************************/ 69 int request_type; /*** HTTP request method used for this document ***/ 70 unsigned short http_proxy_port;/*** used HTTP proxy port ***/ 71 char *http_proxy; /*** used HTTP proxy ***/ 72 bool_t http_proxy_10; /*** is this HTTP/1.0 proxy ***/ 73 char *etag; /*** ETag or Last-Modified for conditional partial HTTP GET method ***/ 74 char *connect_host; /*** hostname for CONNECT request ***/ 75 unsigned short connect_port; /*** portnumber for CONNECT request ***/ 76 77 /*******************************/ 78 /* informations for HTTP/1.1 */ 79 /* (chunked encoding) */ 80 /*******************************/ 81 bool_t is_http11; /*** we are talking with HTTP/1.1 server ***/ 82 ssize_t chunk_size; /*** for HTTP/1.1 chunked transfer encoding ***/ 83 bool_t is_chunked; /*** current document is encoded with chunked encoding ***/ 84 bool_t read_chunksize; /*** in next read we expect chunksize header ***/ 85 bool_t read_trailer; /*** in next read we expect trailer header ***/ 86 bool_t is_persistent; /*** is current HTTP connection persistent 87 (and should leave or not) ***/ 88 89 /*******************************/ 90 /* HTTPS SSL connection info */ 91 /* (used also by FTPS datacon) */ 92 /* need for persistent SSLID */ 93 /*******************************/ 94 #ifdef USE_SSL 95 ssl_connection ssl_data_con; 96 #endif 97 98 /*******************************/ 99 /* HTTP auth informations */ 100 /*******************************/ 101 short num_auth; /*** number of attempts to authenticate ***/ 102 short num_proxy_auth; /*** number of attempts to authenticate with proxy ***/ 103 void *auth_digest; /*** HTTP digest access authentification info ***/ 104 void *auth_proxy_digest; /*** HTTP digest access proxy authentification info ***/ 105 char *additional_headers; /*** additional headers (currently required by NTLM) ***/ 106 107 /*******************************/ 108 /* FTP connection info */ 109 /*******************************/ 110 bool_t ftp_fatal_err; /*** was FTP error fatal ? ***/ 111 short ftp_respc; /*** last FTP response code ***/ 112 bufio *ftp_control; /*** socket for FTP control connection ***/ 113 char *ftp_pasv_host; /*** info for passive data connection ***/ 114 unsigned short ftp_pasv_port; 115 bool_t ftp_data_con_finished; /*** FTP data connection was just fully established ***/ 116 117 /*******************************/ 118 /* progress meter informations */ 119 /*******************************/ 120 #ifdef HAVE_GETTIMEOFDAY 121 struct timeval start_time; /*** for progress metter ***/ 122 struct timeval hr_start_time; /*** when documant processing started ***/ 123 struct timeval redirect_time; /*** when all redirects finished ***/ 124 struct timeval dns_time; /*** when dns lookup finished ***/ 125 struct timeval connect_time; /*** when connect(2) finished ***/ 126 struct timeval first_byte_time;/*** when first byte was received ***/ 127 struct timeval end_time; /*** when download finished ***/ 128 #else 129 time_t start_time; /*** for progress metter ***/ 130 #endif 131 ssize_t current_size; /*** size for current speed ***/ 132 ssize_t adj_sz; /*** adjustment of doc size (HTTP header size ) ***/ 133 134 /*******************************/ 135 /* per thread informations */ 136 /*******************************/ 137 #ifdef HAVE_MT 138 dllist *msgbuf; /*** list of buffered messages ***/ 139 int __herrno; /*** per document h_errno value ***/ 140 int threadnr; /*** number of current thread ***/ 141 #endif 142 } doc; 143 144 extern int doc_download_init(doc *, int); 145 extern int doc_download(doc *, int, int); 146 extern int doc_store(doc *, int); 147 extern int doc_remove(url *); 148 extern int doc_lock(doc *, int); 149 extern time_t doc_etime(doc *, int); 150 extern void doc_init(doc *, url *); 151 extern void doc_cleanup(doc *); 152 extern void doc_destroy(doc *); 153 extern void doc_remove_lock(doc *); 154 extern void doc_update_parent_links(doc *); 155 #ifdef HAVE_MT 156 extern void doc_finish_processing(doc *); 157 #endif 158 159 #endif 160