1 // Larbin 2 // Sebastien Ailleret 3 // 08-02-00 -> 04-01-02 4 5 // This is the new structure of a site 6 // It includes a fifo of waiting urls 7 8 #ifndef SITE_H 9 #define SITE_H 10 11 #include <time.h> 12 #include <adns.h> 13 14 #include "types.h" 15 #include "utils/Fifo.h" 16 #include "utils/url.h" 17 18 void initSite (); 19 20 // define for the state of a connection 21 enum ConnState { 22 emptyC, 23 connectingC, 24 writeC, 25 openC 26 }; 27 28 // different state of dnsQuery 29 enum DnsState { 30 waitDns, 31 doneDns, 32 errorDns, 33 noConnDns 34 }; 35 36 /** This class is intended to make sure the sum of the 37 * sizes of the fifo included in the different sites 38 * are not too big 39 */ 40 class Interval { 41 private: 42 /** Position in the interval */ 43 uint pos; 44 /** Size of the interval */ 45 uint size; 46 public: 47 /** Constructor */ Interval(uint size)48 Interval (uint size) { this->size = size; pos = 0; } 49 /** Destructor */ ~Interval()50 ~Interval () {} 51 /** How many urls can we put 52 * answer 0 if no urls can be put 53 */ putAll()54 inline uint putAll () { int res=size-pos; pos=size; return res; } 55 /** Warn an url has been retrieved */ getOne()56 inline void getOne () { pos--; } 57 /** only for debugging, handle with care */ getPos()58 inline uint getPos () { return pos; } 59 }; 60 61 class NamedSite { 62 private: 63 /* string used for following CNAME chains (just one jump) */ 64 char *cname; 65 /** we've got a good dns answer 66 * get the robots.txt */ 67 void dnsOK (); 68 /** Cannot get the inet addr 69 * dnsState must have been set properly before the call */ 70 void dnsErr (); 71 /** Delete the old identity of the site */ 72 void newId (); 73 /** put this url in its IPSite */ 74 void transfer (url *u); 75 /** forget this url for this reason */ 76 void forgetUrl (url *u, FetchError reason); 77 public: 78 /** Constructor */ 79 NamedSite (); 80 /** Destructor : never used */ 81 ~NamedSite (); 82 /* name of the site */ 83 char name[maxSiteSize]; 84 /* port of the site */ 85 uint16_t port; 86 /* numbers of urls in ram for this site */ 87 uint16_t nburls; 88 /* fifo of urls waiting to be fetched */ 89 url *fifo[maxUrlsBySite]; 90 uint8_t inFifo; 91 uint8_t outFifo; 92 void putInFifo(url *u); 93 url *getInFifo(); 94 short fifoLength(); 95 /** Is this Site in a dnsSites */ 96 bool isInFifo; 97 /** internet addr of this server */ 98 char dnsState; 99 struct in_addr addr; 100 uint ipHash; 101 /* Date of expiration of dns call and robots.txt fetch */ 102 time_t dnsTimeout; 103 /** test if a file can be fetched thanks to the robots.txt */ 104 bool testRobots(char *file); 105 /* forbidden paths : given by robots.txt */ 106 Vector<char> forbidden; 107 /** Put an url in the fifo 108 * If there are too much, put it back in UrlsInternal 109 * Never fill totally the fifo => call at least with 1 */ 110 void putGenericUrl(url *u, int limit, bool prio); putUrl(url * u)111 inline void putUrl (url *u) { putGenericUrl(u, 15, false); } putUrlWait(url * u)112 inline void putUrlWait (url *u) { putGenericUrl(u, 10, false); } putPriorityUrl(url * u)113 inline void putPriorityUrl (url *u) { putGenericUrl(u, 5, true); } putPriorityUrlWait(url * u)114 inline void putPriorityUrlWait (url *u) { putGenericUrl(u, 1, true); } 115 /** Init a new dns query */ 116 void newQuery (); 117 /** The dns query ended with success */ 118 void dnsAns (adns_answer *ans); 119 /** we got the robots.txt, transfer what must be in IPSites */ 120 void robotsResult (FetchError res); 121 }; 122 123 class IPSite { 124 private: 125 /* date of last access : avoid rapid fire */ 126 time_t lastAccess; 127 /** Is this Site in a okSites (eg have something to fetch) */ 128 bool isInFifo; 129 /** Get an url from the fifo 130 * resize tab if too big 131 */ 132 url *getUrl (); 133 public: 134 /** Constructor */ 135 IPSite (); 136 /** Destructor : never used */ 137 ~IPSite (); 138 /** Urls waiting for being fetched */ 139 Fifo<url> tab; 140 /** Put an url in the fifo */ 141 void putUrl (url *u); 142 /** fetch the fist page in the fifo okSites 143 * expects at least one element in freeConns 144 * return expected time for next call (0 means now) 145 */ 146 int fetch (); 147 }; 148 149 #endif // SITE_H 150