1 /***************************************************************************/
2 /*    This code is part of WWW grabber called pavuk                        */
3 /*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
4 /*    Distributed under GPL 2 or later                                     */
5 /***************************************************************************/
6 
7 #ifndef _doc_h_
8 #define _doc_h_
9 #include <sys/types.h>
10 #include <sys/time.h>
11 #include <time.h>
12 
13 #include "url.h"
14 #include "tools.h"
15 #include "bufio.h"
16 #include "myssl_openssl.h"
17 #include "myssl_nss.h"
18 
19 typedef struct
20 {
21   bool_t log;
22   char *msg;
23 } doc_msg;
24 
25 typedef struct
26 {
27   /*******************************/
28   /* Basic document infos        */
29   /*******************************/
30   int doc_nr;                    /*** number of document in queue ***/
31   url *doc_url;                  /*** URL of document ***/
32   char *mime;                    /*** MIME header of document ***/
33   char *type_str;                /*** MIME type of document ***/
34   bool_t is_parsable;            /*** documemt dedicated for parsing ? (html,css,scripts) ***/
35 
36   ssize_t size;                  /*** document size readed ***/
37   ssize_t totsz;                 /*** total size of document if known ***/
38   ssize_t origsize;              /*** original size of document ***/
39   char *contents;                /*** document content ***/
40   ssize_t rest_pos;              /*** restart position ***/
41   ssize_t rest_end_pos;          /*** restart end position ***/
42   time_t dtime;                  /*** creation time of document ***/
43   time_t stime;                  /*** time of document request ***/
44   time_t origtime;               /*** modification time of local copy of document */
45   int errcode;                   /*** error code for current doc ***/
46 
47   /*******************************/
48   /* temporary document infos    */
49   /*******************************/
50   char *lock_fn;                 /*** lock file name for document ***/
51   bufio *datasock;               /*** socket for data connection ***/
52   bufio *s_sock;                 /*** file where is going online saved data ***/
53 
54   /*******************************/
55   /* downloading control flags   */
56   /*******************************/
57   bool_t load;                   /*** load document into ->contents ***/
58   bool_t doreget;                /*** continue broken transfer ***/
59   bool_t report_size;            /*** show progressmeter while loading of document ***/
60   bool_t check_limits;           /*** check limits for adding URLs from this document ***/
61   bool_t remove_lock;            /*** remove lock file ***/
62   bool_t save_online;            /*** document should be saved online to file ***/
63   bool_t is_robot;               /*** redirection of robots.txt documents is handled differently ! ***/
64   bool_t is_http_transfer;       /*** is going current transfer over HTTP protocol ? ***/
65 
66   /*******************************/
67   /* HTTP connection info        */
68   /*******************************/
69   int request_type;              /*** HTTP request method used for this document ***/
70   unsigned short http_proxy_port;/*** used HTTP proxy port ***/
71   char *http_proxy;              /*** used HTTP proxy ***/
72   bool_t http_proxy_10;          /*** is this HTTP/1.0 proxy ***/
73   char *etag;                    /*** ETag or Last-Modified for conditional partial HTTP GET method ***/
74   char *connect_host;            /*** hostname for CONNECT request ***/
75   unsigned short connect_port;   /*** portnumber for CONNECT request ***/
76 
77   /*******************************/
78   /* informations for HTTP/1.1   */
79   /* (chunked encoding)          */
80   /*******************************/
81   bool_t is_http11;              /*** we are talking with HTTP/1.1 server ***/
82   ssize_t chunk_size;            /*** for HTTP/1.1 chunked transfer encoding ***/
83   bool_t is_chunked;             /*** current document is encoded with chunked encoding ***/
84   bool_t read_chunksize;         /*** in next read we expect chunksize header ***/
85   bool_t read_trailer;           /*** in next read we expect trailer header ***/
86   bool_t is_persistent;          /*** is current HTTP connection persistent
87                                       (and should leave or not) ***/
88 
89   /*******************************/
90   /* HTTPS SSL connection info   */
91   /* (used also by FTPS datacon) */
92   /* need for persistent SSLID   */
93   /*******************************/
94 #ifdef USE_SSL
95   ssl_connection ssl_data_con;
96 #endif
97 
98   /*******************************/
99   /* HTTP auth informations      */
100   /*******************************/
101   short num_auth;                /*** number of attempts to authenticate ***/
102   short num_proxy_auth;          /*** number of attempts to authenticate with proxy ***/
103   void *auth_digest;             /*** HTTP digest access authentification info ***/
104   void *auth_proxy_digest;       /*** HTTP digest access proxy authentification info ***/
105   char *additional_headers;      /*** additional headers (currently required by NTLM) ***/
106 
107   /*******************************/
108   /* FTP connection info         */
109   /*******************************/
110   bool_t ftp_fatal_err;          /*** was FTP error fatal ? ***/
111   short ftp_respc;               /*** last FTP response code ***/
112   bufio *ftp_control;            /*** socket for FTP control connection ***/
113   char *ftp_pasv_host;           /*** info for passive data connection ***/
114   unsigned short ftp_pasv_port;
115   bool_t ftp_data_con_finished;  /*** FTP data connection was just fully established ***/
116 
117   /*******************************/
118   /* progress meter informations */
119   /*******************************/
120 #ifdef HAVE_GETTIMEOFDAY
121   struct timeval start_time;     /*** for progress metter ***/
122   struct timeval hr_start_time;  /*** when documant processing started ***/
123   struct timeval redirect_time;  /*** when all redirects finished ***/
124   struct timeval dns_time;       /*** when dns lookup finished ***/
125   struct timeval connect_time;   /*** when connect(2) finished ***/
126   struct timeval first_byte_time;/*** when first byte was received ***/
127   struct timeval end_time;       /*** when download finished ***/
128 #else
129   time_t start_time;             /*** for progress metter ***/
130 #endif
131   ssize_t current_size;          /*** size for current speed ***/
132   ssize_t adj_sz;                /*** adjustment of doc size (HTTP header size ) ***/
133 
134   /*******************************/
135   /* per thread informations     */
136   /*******************************/
137 #ifdef HAVE_MT
138   dllist *msgbuf;                /*** list of buffered messages ***/
139   int __herrno;                  /*** per document h_errno value ***/
140   int threadnr;                  /*** number of current thread ***/
141 #endif
142 } doc;
143 
144 extern int doc_download_init(doc *, int);
145 extern int doc_download(doc *, int, int);
146 extern int doc_store(doc *, int);
147 extern int doc_remove(url *);
148 extern int doc_lock(doc *, int);
149 extern time_t doc_etime(doc *, int);
150 extern void doc_init(doc *, url *);
151 extern void doc_cleanup(doc *);
152 extern void doc_destroy(doc *);
153 extern void doc_remove_lock(doc *);
154 extern void doc_update_parent_links(doc *);
155 #ifdef HAVE_MT
156 extern void doc_finish_processing(doc *);
157 #endif
158 
159 #endif
160