1 // Larbin
2 // Sebastien Ailleret
3 // 29-11-99 -> 09-03-02
4
5 #include <unistd.h>
6 #include <sys/socket.h>
7 #include <netinet/in.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <iostream>
11 #include <cstring>
12 #include <adns.h>
13 #include <netdb.h>
14 #include <sys/socket.h>
15 #include <cstring>
16 #include <signal.h>
17 #include <ctype.h>
18
19 #include "options.h"
20
21 #include "types.h"
22 #include "global.h"
23 #include "utils/text.h"
24 #include "utils/Fifo.h"
25 #include "utils/debug.h"
26 #include "fetch/site.h"
27 #include "interf/output.h"
28 #include "interf/input.h"
29
30
31 ///////////////////////////////////////////////////////////
32 // Struct global
33 ///////////////////////////////////////////////////////////
34
35 // define all the static variables
36 time_t global::now;
37 hashTable *global::seen;
38 #ifdef NO_DUP
39 hashDup *global::hDuplicate;
40 #endif // NO_DUP
41 SyncFifo<url> *global::URLsPriority;
42 SyncFifo<url> *global::URLsPriorityWait;
43 uint global::readPriorityWait=0;
44 PersistentFifo *global::URLsDisk;
45 PersistentFifo *global::URLsDiskWait;
46 uint global::readWait=0;
47 IPSite *global::IPSiteList;
48 NamedSite *global::namedSiteList;
49 Fifo<IPSite> *global::okSites;
50 Fifo<NamedSite> *global::dnsSites;
51 Connexion *global::connexions;
52 adns_state global::ads;
53 uint global::nbDnsCalls = 0;
54 ConstantSizedFifo<Connexion> *global::freeConns;
55 #ifdef THREAD_OUTPUT
56 ConstantSizedFifo<Connexion> *global::userConns;
57 #endif
58 Interval *global::inter;
59 int8_t global::depthInSite;
60 bool global::externalLinks = true;
61 time_t global::waitDuration;
62 char *global::userAgent;
63 char *global::sender;
64 char *global::headers;
65 char *global::headersRobots;
66 sockaddr_in *global::proxyAddr;
67 Vector<char> *global::domains;
68 Vector<char> global::forbExt;
69 uint global::nb_conn;
70 uint global::dnsConn;
71 unsigned short int global::httpPort;
72 unsigned short int global::inputPort;
73 struct pollfd *global::pollfds;
74 uint global::posPoll;
75 uint global::sizePoll;
76 short *global::ansPoll;
77 int global::maxFds;
78 #ifdef MAXBANDWIDTH
79 long int global::remainBand = MAXBANDWIDTH;
80 #endif // MAXBANDWIDTH
81 int global::IPUrl = 0;
82
83 /** Constructor : initialize almost everything
84 * Everything is read from the config file (larbin.conf by default)
85 */
global(int argc,char * argv[])86 global::global (int argc, char *argv[]) {
87 char *configFile = "larbin.conf";
88 #ifdef RELOAD
89 bool reload = true;
90 #else
91 bool reload = false;
92 #endif
93 now = time(NULL);
94 // verification of arguments
95 int pos = 1;
96 while (pos < argc) {
97 if (!strcmp(argv[pos], "-c") && argc > pos+1) {
98 configFile = argv[pos+1];
99 pos += 2;
100 } else if (!strcmp(argv[pos], "-scratch")) {
101 reload = false;
102 pos++;
103 } else {
104 break;
105 }
106 }
107 if (pos != argc) {
108 std::cerr << "usage : " << argv[0];
109 std::cerr << " [-c configFile] [-scratch]\n";
110 exit(1);
111 }
112
113 // Standard values
114 waitDuration = 60;
115 depthInSite = 5;
116 userAgent = "larbin";
117 sender = "larbin@unspecified.mail";
118 nb_conn = 20;
119 dnsConn = 3;
120 httpPort = 0;
121 inputPort = 0; // by default, no input available
122 proxyAddr = NULL;
123 domains = NULL;
124 // FIFOs
125 URLsDisk = new PersistentFifo(reload, fifoFile);
126 URLsDiskWait = new PersistentFifo(reload, fifoFileWait);
127 URLsPriority = new SyncFifo<url>;
128 URLsPriorityWait = new SyncFifo<url>;
129 inter = new Interval(ramUrls);
130 namedSiteList = new NamedSite[namedSiteListSize];
131 IPSiteList = new IPSite[IPSiteListSize];
132 okSites = new Fifo<IPSite>(2000);
133 dnsSites = new Fifo<NamedSite>(2000);
134 seen = new hashTable(!reload);
135 #ifdef NO_DUP
136 hDuplicate = new hashDup(dupSize, dupFile, !reload);
137 #endif // NO_DUP
138 // Read the configuration file
139 crash("Read the configuration file");
140 parseFile(configFile);
141 // Initialize everything
142 crash("Create global values");
143 // Headers
144 LarbinString strtmp;
145 strtmp.addString("\r\nUser-Agent: ");
146 strtmp.addString(userAgent);
147 strtmp.addString(" ");
148 strtmp.addString(sender);
149 #ifdef SPECIFICSEARCH
150 strtmp.addString("\r\nAccept: text/html");
151 int i=0;
152 while (contentTypes[i] != NULL) {
153 strtmp.addString(", ");
154 strtmp.addString(contentTypes[i]);
155 i++;
156 }
157 #elif !defined(IMAGES) && !defined(ANYTYPE)
158 strtmp.addString("\r\nAccept: text/html");
159 #endif // SPECIFICSEARCH
160 strtmp.addString("\r\n\r\n");
161 headers = strtmp.giveString();
162 // Headers robots.txt
163 strtmp.recycle();
164 strtmp.addString("\r\nUser-Agent: ");
165 strtmp.addString(userAgent);
166 strtmp.addString(" (");
167 strtmp.addString(sender);
168 strtmp.addString(")\r\n\r\n");
169 headersRobots = strtmp.giveString();
170 #ifdef THREAD_OUTPUT
171 userConns = new ConstantSizedFifo<Connexion>(nb_conn);
172 #endif
173 freeConns = new ConstantSizedFifo<Connexion>(nb_conn);
174 connexions = new Connexion [nb_conn];
175 for (uint i=0; i<nb_conn; i++) {
176 freeConns->put(connexions+i);
177 }
178 // init poll structures
179 sizePoll = nb_conn + maxInput;
180 pollfds = new struct pollfd[sizePoll];
181 posPoll = 0;
182 maxFds = sizePoll;
183 ansPoll = new short[maxFds];
184 // init non blocking dns calls
185 adns_initflags flags =
186 adns_initflags (adns_if_nosigpipe | adns_if_noerrprint);
187 adns_init(&ads, flags, NULL);
188 // call init functions of all modules
189 initSpecific();
190 initInput();
191 initOutput();
192 initSite();
193 // let's ignore SIGPIPE
194 static struct sigaction sn, so;
195 sigemptyset(&sn.sa_mask);
196 sn.sa_flags = SA_RESTART;
197 sn.sa_handler = SIG_IGN;
198 if (sigaction(SIGPIPE, &sn, &so)) {
199 std::cerr << "Unable to disable SIGPIPE : " << strerror(errno) << std::endl;
200 }
201 }
202
203 /** Destructor : never used because the program should never end !
204 */
~global()205 global::~global () {
206 assert(false);
207 }
208
209 /** parse configuration file */
parseFile(char * file)210 void global::parseFile (char *file) {
211 int fds = open(file, O_RDONLY);
212 if (fds < 0) {
213 std::cerr << "cannot open config file (" << file << ") : "
214 << strerror(errno) << std::endl;
215 exit(1);
216 }
217 char *tmp = readfile(fds);
218 close(fds);
219 // suppress commentary
220 bool eff = false;
221 for (int i=0; tmp[i] != 0; i++) {
222 switch (tmp[i]) {
223 case '\n': eff = false; break;
224 case '#': eff = true; // no break !!!
225 default: if (eff) tmp[i] = ' ';
226 }
227 }
228 char *posParse = tmp;
229 char *tok = nextToken(&posParse);
230 while (tok != NULL) {
231 if (!strcasecmp(tok, "UserAgent")) {
232 userAgent = newString(nextToken(&posParse));
233 } else if (!strcasecmp(tok, "From")) {
234 sender = newString(nextToken(&posParse));
235 } else if (!strcasecmp(tok, "startUrl")) {
236 tok = nextToken(&posParse);
237 url *u = new url(tok, global::depthInSite, (url *) NULL);
238 if (u->isValid()) {
239 check(u);
240 } else {
241 std::cerr << "the start url " << tok << " is invalid\n";
242 exit(1);
243 }
244 } else if (!strcasecmp(tok, "waitduration")) {
245 tok = nextToken(&posParse);
246 waitDuration = atoi(tok);
247 } else if (!strcasecmp(tok, "proxy")) {
248 // host name and dns call
249 tok = nextToken(&posParse);
250 struct hostent* hp;
251 proxyAddr = new sockaddr_in;
252 memset((char *) proxyAddr, 0, sizeof (struct sockaddr_in));
253 if ((hp = gethostbyname(tok)) == NULL) {
254 endhostent();
255 std::cerr << "Unable to find proxy ip address (" << tok << ")\n";
256 exit(1);
257 } else {
258 proxyAddr->sin_family = hp->h_addrtype;
259 memcpy ((char*) &proxyAddr->sin_addr, hp->h_addr, hp->h_length);
260 }
261 endhostent();
262 // port number
263 tok = nextToken(&posParse);
264 proxyAddr->sin_port = htons(atoi(tok));
265 } else if (!strcasecmp(tok, "pagesConnexions")) {
266 tok = nextToken(&posParse);
267 nb_conn = atoi(tok);
268 } else if (!strcasecmp(tok, "dnsConnexions")) {
269 tok = nextToken(&posParse);
270 dnsConn = atoi(tok);
271 } else if (!strcasecmp(tok, "httpPort")) {
272 tok = nextToken(&posParse);
273 httpPort = atoi(tok);
274 } else if (!strcasecmp(tok, "inputPort")) {
275 tok = nextToken(&posParse);
276 inputPort = atoi(tok);
277 } else if (!strcasecmp(tok, "depthInSite")) {
278 tok = nextToken(&posParse);
279 depthInSite = atoi(tok);
280 } else if (!strcasecmp(tok, "limitToDomain")) {
281 manageDomain(&posParse);
282 } else if (!strcasecmp(tok, "forbiddenExtensions")) {
283 manageExt(&posParse);
284 } else if (!strcasecmp(tok, "noExternalLinks")) {
285 externalLinks = false;
286 } else {
287 std::cerr << "bad configuration file : " << tok << "\n";
288 exit(1);
289 }
290 tok = nextToken(&posParse);
291 }
292 delete [] tmp;
293 }
294
295 /** read the domain limit */
manageDomain(char ** posParse)296 void global::manageDomain (char **posParse) {
297 char *tok = nextToken(posParse);
298 if (domains == NULL) {
299 domains = new Vector<char>;
300 }
301 while (tok != NULL && strcasecmp(tok, "end")) {
302 domains->addElement(newString(tok));
303 tok = nextToken(posParse);
304 }
305 if (tok == NULL) {
306 std::cerr << "Bad configuration file : no end to limitToDomain\n";
307 exit(1);
308 }
309 }
310
311 /** read the forbidden extensions */
manageExt(char ** posParse)312 void global::manageExt (char **posParse) {
313 char *tok = nextToken(posParse);
314 while (tok != NULL && strcasecmp(tok, "end")) {
315 int l = strlen(tok);
316 int i;
317 for (i=0; i<l; i++) {
318 tok[i] = tolower(tok[i]);
319 }
320 if (!matchPrivExt(tok))
321 forbExt.addElement(newString(tok));
322 tok = nextToken(posParse);
323 }
324 if (tok == NULL) {
325 std::cerr << "Bad configuration file : no end to forbiddenExtensions\n";
326 exit(1);
327 }
328 }
329
330 /** make sure the max fds has not been reached */
verifMax(int fd)331 void global::verifMax (int fd) {
332 if (fd >= maxFds) {
333 int n = 2 * maxFds;
334 if (fd >= n) {
335 n = fd + maxFds;
336 }
337 short *tmp = new short[n];
338 for (int i=0; i<maxFds; i++) {
339 tmp[i] = ansPoll[i];
340 }
341 for (int i=maxFds; i<n; i++) {
342 tmp[i] = 0;
343 }
344 delete (ansPoll);
345 maxFds = n;
346 ansPoll = tmp;
347 }
348 }
349
350 ///////////////////////////////////////////////////////////
351 // Struct Connexion
352 ///////////////////////////////////////////////////////////
353
354 /** put Connection in a coherent state
355 */
Connexion()356 Connexion::Connexion () {
357 state = emptyC;
358 parser = NULL;
359 }
360
361 /** Destructor : never used : we recycle !!!
362 */
~Connexion()363 Connexion::~Connexion () {
364 assert(false);
365 }
366
367 /** Recycle a connexion
368 */
recycle()369 void Connexion::recycle () {
370 delete parser;
371 request.recycle();
372 }
373