1 // Larbin
2 // Sebastien Ailleret
3 // 29-11-99 -> 09-03-02
4 
5 #include <unistd.h>
6 #include <sys/socket.h>
7 #include <netinet/in.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <iostream>
11 #include <cstring>
12 #include <adns.h>
13 #include <netdb.h>
14 #include <sys/socket.h>
15 #include <cstring>
16 #include <signal.h>
17 #include <ctype.h>
18 
19 #include "options.h"
20 
21 #include "types.h"
22 #include "global.h"
23 #include "utils/text.h"
24 #include "utils/Fifo.h"
25 #include "utils/debug.h"
26 #include "fetch/site.h"
27 #include "interf/output.h"
28 #include "interf/input.h"
29 
30 
31 ///////////////////////////////////////////////////////////
32 // Struct global
33 ///////////////////////////////////////////////////////////
34 
35 // define all the static variables
36 time_t global::now;
37 hashTable *global::seen;
38 #ifdef NO_DUP
39 hashDup *global::hDuplicate;
40 #endif // NO_DUP
41 SyncFifo<url> *global::URLsPriority;
42 SyncFifo<url> *global::URLsPriorityWait;
43 uint global::readPriorityWait=0;
44 PersistentFifo *global::URLsDisk;
45 PersistentFifo *global::URLsDiskWait;
46 uint global::readWait=0;
47 IPSite *global::IPSiteList;
48 NamedSite *global::namedSiteList;
49 Fifo<IPSite> *global::okSites;
50 Fifo<NamedSite> *global::dnsSites;
51 Connexion *global::connexions;
52 adns_state global::ads;
53 uint global::nbDnsCalls = 0;
54 ConstantSizedFifo<Connexion> *global::freeConns;
55 #ifdef THREAD_OUTPUT
56 ConstantSizedFifo<Connexion> *global::userConns;
57 #endif
58 Interval *global::inter;
59 int8_t global::depthInSite;
60 bool global::externalLinks = true;
61 time_t global::waitDuration;
62 char *global::userAgent;
63 char *global::sender;
64 char *global::headers;
65 char *global::headersRobots;
66 sockaddr_in *global::proxyAddr;
67 Vector<char> *global::domains;
68 Vector<char> global::forbExt;
69 uint global::nb_conn;
70 uint global::dnsConn;
71 unsigned short int global::httpPort;
72 unsigned short int global::inputPort;
73 struct pollfd *global::pollfds;
74 uint global::posPoll;
75 uint global::sizePoll;
76 short *global::ansPoll;
77 int global::maxFds;
78 #ifdef MAXBANDWIDTH
79 long int global::remainBand = MAXBANDWIDTH;
80 #endif // MAXBANDWIDTH
81 int global::IPUrl = 0;
82 
83 /** Constructor : initialize almost everything
84  * Everything is read from the config file (larbin.conf by default)
85  */
global(int argc,char * argv[])86 global::global (int argc, char *argv[]) {
87   char *configFile = "larbin.conf";
88 #ifdef RELOAD
89   bool reload = true;
90 #else
91   bool reload = false;
92 #endif
93   now = time(NULL);
94   // verification of arguments
95   int pos = 1;
96   while (pos < argc) {
97 	if (!strcmp(argv[pos], "-c") && argc > pos+1) {
98 	  configFile = argv[pos+1];
99 	  pos += 2;
100 	} else if (!strcmp(argv[pos], "-scratch")) {
101 	  reload = false;
102 	  pos++;
103 	} else {
104 	  break;
105 	}
106   }
107   if (pos != argc) {
108 	std::cerr << "usage : " << argv[0];
109 	std::cerr << " [-c configFile] [-scratch]\n";
110 	exit(1);
111   }
112 
113   // Standard values
114   waitDuration = 60;
115   depthInSite = 5;
116   userAgent = "larbin";
117   sender = "larbin@unspecified.mail";
118   nb_conn = 20;
119   dnsConn = 3;
120   httpPort = 0;
121   inputPort = 0;  // by default, no input available
122   proxyAddr = NULL;
123   domains = NULL;
124   // FIFOs
125   URLsDisk = new PersistentFifo(reload, fifoFile);
126   URLsDiskWait = new PersistentFifo(reload, fifoFileWait);
127   URLsPriority = new SyncFifo<url>;
128   URLsPriorityWait = new SyncFifo<url>;
129   inter = new Interval(ramUrls);
130   namedSiteList = new NamedSite[namedSiteListSize];
131   IPSiteList = new IPSite[IPSiteListSize];
132   okSites = new Fifo<IPSite>(2000);
133   dnsSites = new Fifo<NamedSite>(2000);
134   seen = new hashTable(!reload);
135 #ifdef NO_DUP
136   hDuplicate = new hashDup(dupSize, dupFile, !reload);
137 #endif // NO_DUP
138   // Read the configuration file
139   crash("Read the configuration file");
140   parseFile(configFile);
141   // Initialize everything
142   crash("Create global values");
143   // Headers
144   LarbinString strtmp;
145   strtmp.addString("\r\nUser-Agent: ");
146   strtmp.addString(userAgent);
147   strtmp.addString(" ");
148   strtmp.addString(sender);
149 #ifdef SPECIFICSEARCH
150   strtmp.addString("\r\nAccept: text/html");
151   int i=0;
152   while (contentTypes[i] != NULL) {
153     strtmp.addString(", ");
154     strtmp.addString(contentTypes[i]);
155     i++;
156   }
157 #elif !defined(IMAGES) && !defined(ANYTYPE)
158   strtmp.addString("\r\nAccept: text/html");
159 #endif // SPECIFICSEARCH
160   strtmp.addString("\r\n\r\n");
161   headers = strtmp.giveString();
162   // Headers robots.txt
163   strtmp.recycle();
164   strtmp.addString("\r\nUser-Agent: ");
165   strtmp.addString(userAgent);
166   strtmp.addString(" (");
167   strtmp.addString(sender);
168   strtmp.addString(")\r\n\r\n");
169   headersRobots = strtmp.giveString();
170 #ifdef THREAD_OUTPUT
171   userConns = new ConstantSizedFifo<Connexion>(nb_conn);
172 #endif
173   freeConns = new ConstantSizedFifo<Connexion>(nb_conn);
174   connexions = new Connexion [nb_conn];
175   for (uint i=0; i<nb_conn; i++) {
176 	freeConns->put(connexions+i);
177   }
178   // init poll structures
179   sizePoll = nb_conn + maxInput;
180   pollfds = new struct pollfd[sizePoll];
181   posPoll = 0;
182   maxFds = sizePoll;
183   ansPoll = new short[maxFds];
184   // init non blocking dns calls
185   adns_initflags flags =
186 	adns_initflags (adns_if_nosigpipe | adns_if_noerrprint);
187   adns_init(&ads, flags, NULL);
188   // call init functions of all modules
189   initSpecific();
190   initInput();
191   initOutput();
192   initSite();
193   // let's ignore SIGPIPE
194   static struct sigaction sn, so;
195   sigemptyset(&sn.sa_mask);
196   sn.sa_flags = SA_RESTART;
197   sn.sa_handler = SIG_IGN;
198   if (sigaction(SIGPIPE, &sn, &so)) {
199     std::cerr << "Unable to disable SIGPIPE : " << strerror(errno) << std::endl;
200   }
201 }
202 
203 /** Destructor : never used because the program should never end !
204  */
~global()205 global::~global () {
206   assert(false);
207 }
208 
209 /** parse configuration file */
parseFile(char * file)210 void global::parseFile (char *file) {
211   int fds = open(file, O_RDONLY);
212   if (fds < 0) {
213 	std::cerr << "cannot open config file (" << file << ") : "
214          << strerror(errno) << std::endl;
215 	exit(1);
216   }
217   char *tmp = readfile(fds);
218   close(fds);
219   // suppress commentary
220   bool eff = false;
221   for (int i=0; tmp[i] != 0; i++) {
222 	switch (tmp[i]) {
223 	case '\n': eff = false; break;
224 	case '#': eff = true; // no break !!!
225 	default: if (eff) tmp[i] = ' ';
226 	}
227   }
228   char *posParse = tmp;
229   char *tok = nextToken(&posParse);
230   while (tok != NULL) {
231 	if (!strcasecmp(tok, "UserAgent")) {
232 	  userAgent = newString(nextToken(&posParse));
233 	} else if (!strcasecmp(tok, "From")) {
234 	  sender = newString(nextToken(&posParse));
235 	} else if (!strcasecmp(tok, "startUrl")) {
236 	  tok = nextToken(&posParse);
237       url *u = new url(tok, global::depthInSite, (url *) NULL);
238       if (u->isValid()) {
239         check(u);
240       } else {
241         std::cerr << "the start url " << tok << " is invalid\n";
242         exit(1);
243       }
244 	} else if (!strcasecmp(tok, "waitduration")) {
245 	  tok = nextToken(&posParse);
246 	  waitDuration = atoi(tok);
247 	} else if (!strcasecmp(tok, "proxy")) {
248 	  // host name and dns call
249 	  tok = nextToken(&posParse);
250 	  struct hostent* hp;
251 	  proxyAddr = new sockaddr_in;
252 	  memset((char *) proxyAddr, 0, sizeof (struct sockaddr_in));
253 	  if ((hp = gethostbyname(tok)) == NULL) {
254 		endhostent();
255 		std::cerr << "Unable to find proxy ip address (" << tok << ")\n";
256 		exit(1);
257 	  } else {
258 		proxyAddr->sin_family = hp->h_addrtype;
259 		memcpy ((char*) &proxyAddr->sin_addr, hp->h_addr, hp->h_length);
260 	  }
261 	  endhostent();
262 	  // port number
263 	  tok = nextToken(&posParse);
264 	  proxyAddr->sin_port = htons(atoi(tok));
265 	} else if (!strcasecmp(tok, "pagesConnexions")) {
266 	  tok = nextToken(&posParse);
267 	  nb_conn = atoi(tok);
268 	} else if (!strcasecmp(tok, "dnsConnexions")) {
269 	  tok = nextToken(&posParse);
270 	  dnsConn = atoi(tok);
271 	} else if (!strcasecmp(tok, "httpPort")) {
272 	  tok = nextToken(&posParse);
273 	  httpPort = atoi(tok);
274 	} else if (!strcasecmp(tok, "inputPort")) {
275 	  tok = nextToken(&posParse);
276 	  inputPort = atoi(tok);
277 	} else if (!strcasecmp(tok, "depthInSite")) {
278 	  tok = nextToken(&posParse);
279 	  depthInSite = atoi(tok);
280 	} else if (!strcasecmp(tok, "limitToDomain")) {
281 	  manageDomain(&posParse);
282 	} else if (!strcasecmp(tok, "forbiddenExtensions")) {
283 	  manageExt(&posParse);
284 	} else if (!strcasecmp(tok, "noExternalLinks")) {
285 	  externalLinks = false;
286 	} else {
287 	  std::cerr << "bad configuration file : " << tok << "\n";
288 	  exit(1);
289 	}
290 	tok = nextToken(&posParse);
291   }
292   delete [] tmp;
293 }
294 
295 /** read the domain limit */
manageDomain(char ** posParse)296 void global::manageDomain (char **posParse) {
297   char *tok = nextToken(posParse);
298   if (domains == NULL) {
299 	domains = new Vector<char>;
300   }
301   while (tok != NULL && strcasecmp(tok, "end")) {
302 	domains->addElement(newString(tok));
303 	tok = nextToken(posParse);
304   }
305   if (tok == NULL) {
306 	std::cerr << "Bad configuration file : no end to limitToDomain\n";
307 	exit(1);
308   }
309 }
310 
311 /** read the forbidden extensions */
manageExt(char ** posParse)312 void global::manageExt (char **posParse) {
313   char *tok = nextToken(posParse);
314   while (tok != NULL && strcasecmp(tok, "end")) {
315     int l = strlen(tok);
316     int i;
317     for (i=0; i<l; i++) {
318       tok[i] = tolower(tok[i]);
319     }
320     if (!matchPrivExt(tok))
321       forbExt.addElement(newString(tok));
322 	tok = nextToken(posParse);
323   }
324   if (tok == NULL) {
325 	std::cerr << "Bad configuration file : no end to forbiddenExtensions\n";
326 	exit(1);
327   }
328 }
329 
330 /** make sure the max fds has not been reached */
verifMax(int fd)331 void global::verifMax (int fd) {
332   if (fd >= maxFds) {
333     int n = 2 * maxFds;
334     if (fd >= n) {
335       n = fd + maxFds;
336     }
337     short *tmp = new short[n];
338     for (int i=0; i<maxFds; i++) {
339       tmp[i] = ansPoll[i];
340     }
341     for (int i=maxFds; i<n; i++) {
342       tmp[i] = 0;
343     }
344     delete (ansPoll);
345     maxFds = n;
346     ansPoll = tmp;
347   }
348 }
349 
350 ///////////////////////////////////////////////////////////
351 // Struct Connexion
352 ///////////////////////////////////////////////////////////
353 
354 /** put Connection in a coherent state
355  */
Connexion()356 Connexion::Connexion () {
357   state = emptyC;
358   parser = NULL;
359 }
360 
361 /** Destructor : never used : we recycle !!!
362  */
~Connexion()363 Connexion::~Connexion () {
364   assert(false);
365 }
366 
367 /** Recycle a connexion
368  */
recycle()369 void Connexion::recycle () {
370   delete parser;
371   request.recycle();
372 }
373